Package org.maltparserx.core.syntaxgraph.reader

Source Code of org.maltparserx.core.syntaxgraph.reader.NegraReader

package org.maltparserx.core.syntaxgraph.reader;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.util.Iterator;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.regex.PatternSyntaxException;

import org.maltparserx.core.exception.MaltChainedException;
import org.maltparserx.core.io.dataformat.ColumnDescription;
import org.maltparserx.core.io.dataformat.DataFormatException;
import org.maltparserx.core.io.dataformat.DataFormatInstance;
import org.maltparserx.core.syntaxgraph.MappablePhraseStructureGraph;
import org.maltparserx.core.syntaxgraph.PhraseStructure;
import org.maltparserx.core.syntaxgraph.TokenStructure;
import org.maltparserx.core.syntaxgraph.edge.Edge;
import org.maltparserx.core.syntaxgraph.node.PhraseStructureNode;

/**
*
*
* @author Johan Hall
*/
public class NegraReader implements SyntaxGraphReader {
  private enum NegraTables {
    ORIGIN, EDITOR, WORDTAG, MORPHTAG, NODETAG, EDGETAG, SECEDGETAG, SENTENCE, UNDEF
  };
  private BufferedReader reader;
  private DataFormatInstance dataFormatInstance;
  private int sentenceCount;
  private String optionString;
  private int formatVersion;
  private NegraTables currentHeaderTable;
  private int currentTerminalSize;
  private int currentNonTerminalSize;
  private SortedMap<Integer,PhraseStructureNode> nonterminals;
  private StringBuilder edgelabelSymbol;
  private StringBuilder edgelabelTableName;
  private int START_ID_OF_NONTERMINALS = 500;
  private String fileName = null;
  private URL url = null;
  private String charsetName;
  private int nIterations;
  private int cIterations;
  private boolean closeStream = true;
 
  public NegraReader() {
    currentHeaderTable = NegraTables.UNDEF;
    edgelabelSymbol = new StringBuilder();
    edgelabelTableName = new StringBuilder();
    nonterminals = new TreeMap<Integer,PhraseStructureNode>();
    nIterations = 1;
    cIterations = 1;
  }
 
  private void reopen() throws MaltChainedException {
    close();
    if (fileName != null) {
      open(fileName, charsetName);
    } else if (url != null) {
      open(url, charsetName);
    } else {
      throw new DataFormatException("The input stream cannot be reopen. ");
    }
  }
 
  public void open(String fileName, String charsetName) throws MaltChainedException {
    setFileName(fileName);
    setCharsetName(charsetName);
    try {
      open(new FileInputStream(fileName), charsetName);
    } catch (FileNotFoundException e) {
      throw new DataFormatException("The input file '"+fileName+"' cannot be found. ", e);
    }
  }
  public void open(URL url, String charsetName) throws MaltChainedException {
    setUrl(url);
    setCharsetName(charsetName);
    try {
      open(url.openStream(), charsetName);
    } catch (IOException e) {
      throw new DataFormatException("The URL '"+url.toString()+"' cannot be opened. ", e);
    }
  }
 
  public void open(InputStream is, String charsetName) throws MaltChainedException {
    try {
      if (is == System.in) {
        closeStream = false;
      }
      open(new InputStreamReader(is, charsetName));
    } catch (UnsupportedEncodingException e) {
      throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported. ", e);
    }
  }
 
  private void open(InputStreamReader isr) throws MaltChainedException {
    setReader(new BufferedReader(isr));
    setSentenceCount(0);
  }
 
  public void readProlog() throws MaltChainedException {
   
  }
 
  public boolean readSentence(TokenStructure syntaxGraph) throws MaltChainedException  {
    if (syntaxGraph == null || !(syntaxGraph instanceof PhraseStructure)) {
      return false;
    }
    syntaxGraph.clear();
    final PhraseStructure phraseStructure = (PhraseStructure)syntaxGraph;
    PhraseStructureNode parent = null;
    PhraseStructureNode child = null;
    currentHeaderTable = NegraTables.UNDEF;
    String line = null;
    syntaxGraph.clear();
    nonterminals.clear();
    try {
      while (true) {
        line = reader.readLine();
        if (line == null) {
          if (syntaxGraph.hasTokens()) {
            sentenceCount++;
            if (syntaxGraph instanceof MappablePhraseStructureGraph) {
              ((MappablePhraseStructureGraph)syntaxGraph).getMapping().updateDependenyGraph(((MappablePhraseStructureGraph)syntaxGraph), ((PhraseStructure)syntaxGraph).getPhraseStructureRoot());
            }
          }
          if (cIterations < nIterations) {
            cIterations++;
            reopen();
            return true;
          }
          return false;
        } else if (line.startsWith("#EOS")) {
          currentTerminalSize = 0;
          currentNonTerminalSize = 0;
          currentHeaderTable = NegraTables.UNDEF;
          if (syntaxGraph instanceof MappablePhraseStructureGraph) {
            ((MappablePhraseStructureGraph)syntaxGraph).getMapping().updateDependenyGraph(((MappablePhraseStructureGraph)syntaxGraph), ((PhraseStructure)syntaxGraph).getPhraseStructureRoot());
          }
          return true;
        } else if (line.startsWith("#BOS")) {
          currentHeaderTable = NegraTables.SENTENCE;
          int s = -1, e = -1;
          for (int i = 5, n = line.length(); i < n; i++) {
            if (Character.isDigit(line.charAt(i)) && s == -1) {
              s = i;
            }
            if (line.charAt(i) == ' ') {
              e = i;
              break;
            }
          }
          if (s != e && s != -1 && e != -1) {
            phraseStructure.setSentenceID(Integer.parseInt(line.substring(s,e)));
          }
          sentenceCount++;
        } else if (currentHeaderTable == NegraTables.SENTENCE) {
          if (line.length() >= 2 && line.charAt(0) == '#' && Character.isDigit(line.charAt(1))) { // Non-terminal
            Iterator<ColumnDescription> columns = dataFormatInstance.iterator();
            ColumnDescription column = null;
            currentNonTerminalSize++;
            char[] lineChars = line.toCharArray();
            int start = 0;
            int secedgecounter = 0;
            for (int i = 0, n = lineChars.length; i < n; i++) {
              if (lineChars[i] == '\t' && start == i) {
                start++;
              } else if (lineChars[i] == '\t' || i == n - 1) {
                if (columns.hasNext()) {
                  column = columns.next();
                }
                if (column.getPosition() == 0) {
                  int index = Integer.parseInt((i == n - 1)?line.substring(start+1):line.substring(start+1, i));
                  child = nonterminals.get(index);
                  if (child == null) {
                    if (index != 0) {
                      child = ((PhraseStructure)syntaxGraph).addNonTerminalNode(index-START_ID_OF_NONTERMINALS+1);
                    }
                    nonterminals.put(index,child);
                  }
                } else if (column.getPosition() == 2 && child != null) {
                  syntaxGraph.addLabel(child, "CAT", (i == n - 1)?line.substring(start):line.substring(start, i));
                } else if (column.getCategory() == ColumnDescription.PHRASE_STRUCTURE_EDGE_LABEL) {
                  edgelabelSymbol.setLength(0);
                  edgelabelSymbol.append((i == n - 1)?line.substring(start):line.substring(start, i));
                  edgelabelTableName.setLength(0);
                  edgelabelTableName.append(column.getName());
                } else if (column.getCategory() == ColumnDescription.PHRASE_STRUCTURE_NODE_LABEL && child != null) {
                  int index = Integer.parseInt((i == n - 1)?line.substring(start):line.substring(start, i));
                  parent = nonterminals.get(index);
                  if (parent == null) {
                    if (index == 0) {
                      parent = phraseStructure.getPhraseStructureRoot()
                    } else {
                      parent = phraseStructure.addNonTerminalNode(index-START_ID_OF_NONTERMINALS+1);
                    }
                    nonterminals.put(index,parent);
                  }
                  Edge e = phraseStructure.addPhraseStructureEdge(parent, child);
                  syntaxGraph.addLabel(e, edgelabelTableName.toString(), edgelabelSymbol.toString());
                } else if (column.getCategory() == ColumnDescription.SECONDARY_EDGE_LABEL && child != null) {
                  if (secedgecounter % 2 == 0) {
                    edgelabelSymbol.setLength(0);
                    edgelabelSymbol.append((i == n - 1)?line.substring(start):line.substring(start, i));
                    secedgecounter++;
                  } else {
                    int index = Integer.parseInt((i == n - 1)?line.substring(start):line.substring(start, i));
                    if (index == 0) {
                      parent = phraseStructure.getPhraseStructureRoot();
                    } else if (index < START_ID_OF_NONTERMINALS) {
                      parent = phraseStructure.getTokenNode(index);
                    } else {
                      parent = nonterminals.get(index);
                      if (parent == null) {
                        parent = phraseStructure.addNonTerminalNode(index-START_ID_OF_NONTERMINALS+1);
                        nonterminals.put(index,parent);
                      }
                    }
                    Edge e = phraseStructure.addSecondaryEdge(parent, child);
                    e.addLabel(column.getSymbolTable(), edgelabelSymbol.toString());
                    secedgecounter++;
                  }
                }
                start = i + 1;
              }
            }
          } else { // Terminal
            Iterator<ColumnDescription> columns = dataFormatInstance.iterator();
            ColumnDescription column = null;
           
            currentTerminalSize++;
            child = syntaxGraph.addTokenNode(currentTerminalSize);
            char[] lineChars = line.toCharArray();
            int start = 0;
            int secedgecounter = 0;
            for (int i = 0, n = lineChars.length; i < n; i++) {
              if (lineChars[i] == '\t' && start == i) {
                start++;
              } else if (lineChars[i] == '\t' || i == n - 1) {
                if (columns.hasNext()) {
                  column = columns.next();
                }
                if (column.getCategory() == ColumnDescription.INPUT && child != null) {
                  syntaxGraph.addLabel(child, column.getName(), (i == n - 1)?line.substring(start):line.substring(start, i));
                } else if (column.getCategory() == ColumnDescription.PHRASE_STRUCTURE_EDGE_LABEL && child != null) { // && column.getName().equals("EDGELABEL")) {
                  edgelabelSymbol.setLength(0);
                  edgelabelSymbol.append((i == n - 1)?line.substring(start):line.substring(start, i));
                  edgelabelTableName.setLength(0);
                  edgelabelTableName.append(column.getName());
                } else if (column.getCategory() == ColumnDescription.PHRASE_STRUCTURE_NODE_LABEL && child != null) {
                  int index = Integer.parseInt((i == n - 1)?line.substring(start):line.substring(start, i));
                  parent = nonterminals.get(index);
                  if (parent == null) {
                    if (index == 0) {
                      parent = phraseStructure.getPhraseStructureRoot()
                    } else {
                      parent = phraseStructure.addNonTerminalNode(index-START_ID_OF_NONTERMINALS+1);
                    }
                    nonterminals.put(index,parent);
                  }

                  Edge e = phraseStructure.addPhraseStructureEdge(parent, child);
                  syntaxGraph.addLabel(e, edgelabelTableName.toString(), edgelabelSymbol.toString());
                } else if (column.getCategory() == ColumnDescription.SECONDARY_EDGE_LABEL && child != null) {
                  if (secedgecounter % 2 == 0) {
                    edgelabelSymbol.setLength(0);
                    edgelabelSymbol.append((i == n - 1)?line.substring(start):line.substring(start, i));
                    secedgecounter++;
                  } else {
                    int index = Integer.parseInt((i == n - 1)?line.substring(start):line.substring(start, i));
                    if (index == 0) {
                      parent = phraseStructure.getPhraseStructureRoot();
                    } else if (index < START_ID_OF_NONTERMINALS) {
                      parent = phraseStructure.getTokenNode(index);
                    } else {
                      parent = nonterminals.get(index);
                      if (parent == null) {
                        parent = phraseStructure.addNonTerminalNode(index-START_ID_OF_NONTERMINALS+1);
                        nonterminals.put(index,parent);
                      }
                    }
                    Edge e = phraseStructure.addSecondaryEdge(parent, child);
                    e.addLabel(column.getSymbolTable(), edgelabelSymbol.toString());
                    secedgecounter++;
                  }
                }
                start = i + 1;
              }
            }
          }
        } else if (line.startsWith("%%")) { // comment skip
       
        } else if (line.startsWith("#FORMAT")) {
//        int index = line.indexOf(' ');
//        if (index > -1) {
//          try {
//            formatVersion = Integer.parseInt(line.substring(index+1));
//          } catch (NumberFormatException e) {
//           
//          }
//        }
        } else if (line.startsWith("#BOT")) {
//        int index = line.indexOf(' ');
//        if (index > -1) {
//          if (line.substring(index+1).equals("ORIGIN")) {
//            currentHeaderTable = NegraTables.ORIGIN;
//          } else if (line.substring(index+1).equals("EDITOR")) {
//            currentHeaderTable = NegraTables.EDITOR;
//          } else if (line.substring(index+1).equals("WORDTAG")) {
//            currentHeaderTable = NegraTables.WORDTAG;
//          } else if (line.substring(index+1).equals("MORPHTAG")) {
//            currentHeaderTable = NegraTables.MORPHTAG;
//          } else if (line.substring(index+1).equals("NODETAG")) {
//            currentHeaderTable = NegraTables.NODETAG;
//          } else if (line.substring(index+1).equals("EDGETAG")) {
//            currentHeaderTable = NegraTables.EDGETAG;
//          } else if (line.substring(index+1).equals("SECEDGETAG")) {
//            currentHeaderTable = NegraTables.SECEDGETAG;
//          } else {
//            currentHeaderTable = NegraTables.UNDEF;
//          }
//        }
        } else if (line.startsWith("#EOT")) {
          currentHeaderTable = NegraTables.UNDEF;
        }
      }
    }  catch (IOException e) {
      throw new DataFormatException("Error when reading from the input file. ", e);
    }
  }
 
  public void readEpilog() throws MaltChainedException {
   
  }
 
  public BufferedReader getReader() {
    return reader;
  }

  public void setReader(BufferedReader reader) {
    this.reader = reader;
  }

  public int getSentenceCount() {
    return sentenceCount;
  }

  public void setSentenceCount(int sentenceCount) {
    this.sentenceCount = sentenceCount;
  }
 
  public int getFormatVersion() {
    return formatVersion;
  }

  public void setFormatVersion(int formatVersion) {
    this.formatVersion = formatVersion;
  }

  public DataFormatInstance getDataFormatInstance() {
    return dataFormatInstance;
  }
 
  public void setDataFormatInstance(DataFormatInstance inputDataFormatInstance) {
    this.dataFormatInstance = inputDataFormatInstance;
  }
 
  public String getOptions() {
    return optionString;
  }
 
  public void setOptions(String optionString) throws MaltChainedException {
    this.optionString = optionString;

    String[] argv;
    try {
      argv = optionString.split("[_\\p{Blank}]");
    } catch (PatternSyntaxException e) {
      throw new DataFormatException("Could not split the penn writer option '"+optionString+"'. ", e);
    }
    for (int i=0; i < argv.length-1; i++) {
      if(argv[i].charAt(0) != '-') {
        throw new DataFormatException("The argument flag should start with the following character '-', not with "+argv[i].charAt(0));
      }
      if(++i>=argv.length) {
        throw new DataFormatException("The last argument does not have any value. ");
      }
      switch(argv[i-1].charAt(1)) {
      case 's':
        try {
          START_ID_OF_NONTERMINALS = Integer.parseInt(argv[i]);
        } catch (NumberFormatException e){
          throw new MaltChainedException("The TigerXML Reader option -s must be an integer value. ");
        }
        break;
      default:
        throw new DataFormatException("Unknown NegraReader parameter: '"+argv[i-1]+"' with value '"+argv[i]+"'. ");   
      }
    }
  }
 
  public String getFileName() {
    return fileName;
  }

  public void setFileName(String fileName) {
    this.fileName = fileName;
  }

  public URL getUrl() {
    return url;
  }

  public void setUrl(URL url) {
    this.url = url;
  }

  public String getCharsetName() {
    return charsetName;
  }

  public void setCharsetName(String charsetName) {
    this.charsetName = charsetName;
  }

  public int getNIterations() {
    return nIterations;
  }

  public void setNIterations(int iterations) {
    nIterations = iterations;
  }

  public int getIterationCounter() {
    return cIterations;
  }
 
  public void close() throws MaltChainedException {
    try {
      if (reader != null) {
        if (closeStream) {
          reader.close();
        }
        reader = null;
      }
    }   catch (IOException e) {
      throw new DataFormatException("Error when closing the input file.", e);
    }
  }
}
TOP

Related Classes of org.maltparserx.core.syntaxgraph.reader.NegraReader

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.