Package opennlp.tools.formats.ad

Source Code of opennlp.tools.formats.ad.ADParagraphStream

/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package opennlp.tools.formats.ad;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import java.util.Stack;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import opennlp.tools.formats.ad.ADParagraphStream.ParagraphParser.Node;
import opennlp.tools.util.FilterObjectStream;
import opennlp.tools.util.ObjectStream;

/**
* Stream filter which merges text lines into paragraphs, following the Arvores
* Deitadas syntax.
* <p>
* Information about the format:<br>
* Susana Afonso.
* "Árvores deitadas: Descrição do formato e das opções de análise na Floresta Sintáctica"
* .<br>
* 12 de Fevereiro de 2006.
* http://www.linguateca.pt/documentos/Afonso2006ArvoresDeitadas.pdf
* <p>
* <b>Note:</b> Do not use this class, internal use only!
*/
public class ADParagraphStream extends
    FilterObjectStream<String, ADParagraphStream.Paragraph> {

  public static class Paragraph {

    private String text;
    private Node root;

    public String getText() {
      return text;
    }

    public void setText(String text) {
      this.text = text;
    }

    public Node getRoot() {
      return root;
    }

    public void setRoot(Node root) {
      this.root = root;
    }

  }

  /**
   * Parses a sample of AD corpus. A sentence in AD corpus is represented by a
   * Tree. In this class we declare some types to represent that tree.
   */
  public static class ParagraphParser {

    // Line that opens a tree: text without ':' or '=', a colon, then text
    // without '(' or whitespace, running to the end of the line.
    private Pattern rootPattern = Pattern.compile("^[^:=]+:[^(\\s]+$");
    // Inner node line. Group 1: run of '=' / '-' whose length is used as the
    // level; group 2: the "x:y" tag; group 3: the optional parenthesised part
    // (parentheses included); group 4: the text inside those parentheses.
    private Pattern nodePattern = Pattern
        .compile("^([=-]*)([^:=]+:[^\\(\\s]+)(\\(([^\\)]+)\\))?\\s*$");
    // Leaf line. Group 1: level prefix; group 2: the "x:y" tag; group 3: the
    // optional quoted text inside the parentheses (used as lemma); group 4: the
    // remaining text inside the parentheses (used as morphological tag);
    // group 5: everything after the closing parenthesis and whitespace (used
    // as lexeme).
    private Pattern leafPattern = Pattern
        .compile("^([=-]*)([^:=]+:[^\\(\\s]+)\\(([\"'].+[\"'])?\\s*([^\\)]+)?\\)\\s+(.+)");
    // Same layout as leafPattern, except that the two halves of the tag in
    // group 2 are joined by '=' instead of ':'.
    private Pattern bizarreLeafPattern = Pattern
        .compile("^([=-]*)([^:=]+=[^\\(\\s]+)\\(([\"'].+[\"'])?\\s*([^\\)]+)?\\)\\s+(.+)");
    // Punctuation line. Group 1: run of '=' (level); group 2: one or more
    // non-word characters.
    private Pattern punctuationPattern = Pattern.compile("^(=*)(\\W+)$");

    /**
     * Parse the paragraph
     */
    public Paragraph parse(String paragraphString) {
      BufferedReader reader = new BufferedReader(new StringReader(
          paragraphString));
      Paragraph sentence = new Paragraph();
      Node root = new Node();
      try {
        // first line is <s ...>
        String line = reader.readLine();
        if (line.startsWith("<s")) {
          // should finde the source source
          while (!line.startsWith("SOURCE")) {
            line = reader.readLine();
            if (line == null) {
              return new Paragraph();
            }
          }
        }
        line = reader.readLine();
        // we should have the plain sentence
        // we remove the first token
        int start = line.indexOf(" ");
        sentence.setText(line.substring(start + 1));
        // now we look for the root node
        line = reader.readLine();

        while (!rootPattern.matcher(line).matches()) {
          line = reader.readLine();
          if (line == null) {
            return sentence;
          }
        }
        // got the root. Add it to the stack
        Stack<Node> nodeStack = new Stack<Node>();
        // we get the complete line

        root.setSyntacticTag("ROOT");
        root.setLevel(0);
        nodeStack.add(root);
        // now we have to take care of the lastLevel. Every time it raises, we
        // will add the
        // leaf to the node at the top. If it decreases, we remove the top.
        //line = reader.readLine();
        while (line.length() != 0 && line.startsWith("</s>") == false) {
          TreeElement element = this.getElement(line);
         
          if(element != null) {
            // remove elements at same level or higher
            while (!nodeStack.isEmpty()
                && element.getLevel() > 0 && element.getLevel() <= nodeStack.peek().getLevel()) {
              nodeStack.pop();
            }
            if( element.isLeaf() ) {
              if (nodeStack.isEmpty()) {
                root.addElement(element);
              } else {
                // look for the node with the correct level
                Node peek = nodeStack.peek();
                if (element.level == 0) { // add to the root
                  nodeStack.firstElement().addElement(element);
                } else {
                  Node parent = null;
                  int index = nodeStack.size() - 1;
                  while(parent == null) {
                    if(peek.getLevel() < element.getLevel()) {
                      parent = peek;
                    } else {
                      index--;
                      if(index > -1) {
                        peek = nodeStack.get(index);
                      } else {
                        parent = nodeStack.firstElement();
                      }
                    }
                  }
                  parent.addElement(element);
                }
              }
            } else {
              if (!nodeStack.isEmpty()) {
                nodeStack.peek().addElement(element);
              }
              nodeStack.push((Node) element);
            }
          }
          line = reader.readLine();
        }

      } catch (Exception e) {
        System.err.println(paragraphString);
        e.printStackTrace();
        return sentence;
      }
      // second line should be SOURCE
      sentence.setRoot(root);
      return sentence;
    }

    /**
     * Parse a tree element from a AD line
     *
     * @param line
     *          the AD line
     * @return the tree element
     */
    public TreeElement getElement(String line) {
      // try node
      Matcher nodeMatcher = nodePattern.matcher(line);
      if (nodeMatcher.matches()) {
        int level = nodeMatcher.group(1).length();
        String syntacticTag = nodeMatcher.group(2);
        String morphologicalTag = nodeMatcher.group(3);
        Node node = new Node();
        node.setLevel(level);
        node.setSyntacticTag(syntacticTag);
        node.setMorphologicalTag(morphologicalTag);
        return node;
      }

      Matcher leafMatcher = leafPattern.matcher(line);
      if (leafMatcher.matches()) {
        int level = leafMatcher.group(1).length();
        String syntacticTag = leafMatcher.group(2);
        String lemma = leafMatcher.group(3);
        String morphologicalTag = leafMatcher.group(4);
        String lexeme = leafMatcher.group(5);
        Leaf leaf = new Leaf();
        leaf.setLevel(level);
        leaf.setSyntacticTag(syntacticTag);
        leaf.setMorphologicalTag(morphologicalTag);
        leaf.setLexeme(lexeme);
        if (lemma != null) {
          if (lemma.length() > 2) {
            lemma = lemma.substring(1, lemma.length() - 1);
          }
          leaf.setLemma(lemma);
        }

        return leaf;
      }

      Matcher punctuationMatcher = punctuationPattern.matcher(line);
      if (punctuationMatcher.matches()) {
        int level = punctuationMatcher.group(1).length();
        String lexeme = punctuationMatcher.group(2);
        Leaf leaf = new Leaf();
        leaf.setLevel(level);
        leaf.setLexeme(lexeme);
        return leaf;
      }

      // process the bizarre cases
      if(line.equals("_") || line.startsWith("<lixo") || line.startsWith("pause")) {
        return null;
      }
     
      if(line.startsWith("=")) {
        Matcher bizarreLeafMatcher = bizarreLeafPattern.matcher(line);
        if (bizarreLeafMatcher.matches()) {
          int level = bizarreLeafMatcher.group(1).length();
          String syntacticTag = bizarreLeafMatcher.group(2);
          String lemma = bizarreLeafMatcher.group(3);
          String morphologicalTag = bizarreLeafMatcher.group(4);
          String lexeme = bizarreLeafMatcher.group(5);
          Leaf leaf = new Leaf();
          leaf.setLevel(level);
          leaf.setSyntacticTag(syntacticTag);
          leaf.setMorphologicalTag(morphologicalTag);
          leaf.setLexeme(lexeme);
          if (lemma != null) {
            if (lemma.length() > 2) {
              lemma = lemma.substring(1, lemma.length() - 1);
            }
            leaf.setLemma(lemma);
          }

          return leaf;
        } else {
          int level = line.lastIndexOf("=");
          String lexeme = line.substring(level + 1);
         
           Leaf leaf = new Leaf();
           leaf.setLevel(level + 1);
           leaf.setSyntacticTag("");
           leaf.setMorphologicalTag("");
           leaf.setLexeme(lexeme);
          
           return leaf;
        }
      }
     
      System.err.println("Couldn't parse leaf: " + line);
      Leaf leaf = new Leaf();
      leaf.setLevel(0);
      leaf.setSyntacticTag("");
      leaf.setMorphologicalTag("");
      leaf.setLexeme(line);

      return leaf;
    }

    /** Represents a tree element, Node or Leaf */
    public abstract class TreeElement {

      private String syntacticTag;
      private String morphologicalTag;
      private int level;
     
      public boolean isLeaf() {return false;}

      public void setSyntacticTag(String syntacticTag) {
        this.syntacticTag = syntacticTag;
      }

      public String getSyntacticTag() {
        return syntacticTag;
      }

      public void setLevel(int level) {
        this.level = level;
      }

      public int getLevel() {
        return level;
      }

      public void setMorphologicalTag(String morphologicalTag) {
        this.morphologicalTag = morphologicalTag;
      }

      public String getMorphologicalTag() {
        return morphologicalTag;
      }
    }

    /** Represents the AD node */
    public class Node extends TreeElement {
      private List<TreeElement> elems = new ArrayList<TreeElement>();

      public void addElement(TreeElement element) {
        elems.add(element);
      };

      public TreeElement[] getElements() {
        return elems.toArray(new TreeElement[elems.size()]);
      }

      @Override
      public String toString() {
        StringBuffer sb = new StringBuffer();
        // print itself and its children
        for (int i = 0; i < this.getLevel(); i++) {
          sb.append("=");
        }
        sb.append(this.getSyntacticTag());
        if (this.getMorphologicalTag() != null) {
          sb.append(this.getMorphologicalTag());
        }
        sb.append("\n");
        for (TreeElement element : elems) {
          sb.append(element.toString());
        }
        return sb.toString();
      }
    }

    /** Represents the AD leaf */
    public class Leaf extends TreeElement {

      private String word;
      private String lemma;

      public boolean isLeaf() {return true;}
     
      public void setLexeme(String lexeme) {
        this.word = lexeme;
      }

      public String getLexeme() {
        return word;
      }

      @Override
      public String toString() {
        StringBuffer sb = new StringBuffer();
        // print itself and its children
        for (int i = 0; i < this.getLevel(); i++) {
          sb.append("=");
        }
        if (this.getSyntacticTag() != null) {
          sb.append(this.getSyntacticTag() + "(" + this.getMorphologicalTag()
              + ") ");
        }
        sb.append(this.word + "\n");
        return sb.toString();
      }

      public void setLemma(String lemma) {
        this.lemma = lemma;
      }

      public String getLemma() {
        return lemma;
      }
    }

  }
 
  /**
   * The start paragraph pattern
   */
  private static final Pattern start = Pattern.compile("<s[^>]*>");

  /**
   * The end paragraph pattern
   */
  private static final Pattern end = Pattern.compile("</s>");

  private ParagraphParser parser;

  public ADParagraphStream(ObjectStream<String> lineStream) {
    super(lineStream);
    parser = new ParagraphParser();
  }

  public Paragraph read() throws IOException {

    StringBuilder paragraph = new StringBuilder();
    boolean paragraphStarted = false;

    while (true) {
      String line = samples.read();

      if (line != null) {

        if (start.matcher(line).matches()) {
          paragraphStarted = true;
        }

        if (paragraphStarted) {
          paragraph.append(line).append('\n');
        }

        if (end.matcher(line).matches()) {
          paragraphStarted = false;
        }

        if (!paragraphStarted && paragraph.length() > 0) {
          return parser.parse(paragraph.toString());
        }

      } else {
        // handle end of file
        if (paragraphStarted) {
          if (paragraph.length() > 0) {
            return parser.parse(paragraph.toString());
          }
        } else {
          return null;
        }
      }
    }
  }
}
TOP

Related Classes of opennlp.tools.formats.ad.ADParagraphStream

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.