Package org.maltparserx.core.syntaxgraph.reader

Source Code of org.maltparserx.core.syntaxgraph.reader.TigerXMLReader

package org.maltparserx.core.syntaxgraph.reader;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.util.SortedMap;
import java.util.regex.PatternSyntaxException;

import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;

import org.maltparserx.core.exception.MaltChainedException;
import org.maltparserx.core.io.dataformat.DataFormatException;
import org.maltparserx.core.io.dataformat.DataFormatInstance;
import org.maltparserx.core.symbol.SymbolTable;
import org.maltparserx.core.syntaxgraph.MappablePhraseStructureGraph;
import org.maltparserx.core.syntaxgraph.PhraseStructure;
import org.maltparserx.core.syntaxgraph.SyntaxGraphException;
import org.maltparserx.core.syntaxgraph.TokenStructure;
import org.maltparserx.core.syntaxgraph.edge.Edge;
import org.maltparserx.core.syntaxgraph.node.NonTerminalNode;
import org.maltparserx.core.syntaxgraph.node.PhraseStructureNode;

/**
*
*
* @author Johan Hall
*/
public class TigerXMLReader implements SyntaxGraphReader {
//  private TigerXMLHeader header;
  private XMLStreamReader reader;
  private int sentenceCount;
  private DataFormatInstance dataFormatInstance;
  private StringBuffer ntid;
  private final StringBuilder graphRootID;
//  private StringBuilder elementContent;
//  private StringBuilder valueName;
//  private StringBuilder currentFeatureName;
//  private Domain domain;
//  private boolean collectChar = false;
  private String optionString;
  private String fileName = null;
  private URL url = null;
  private String charsetName;
  private int nIterations;
  private int cIterations;
  private int START_ID_OF_NONTERMINALS = 500;
  private boolean closeStream = true;
 
  public TigerXMLReader() {
    this.ntid = new StringBuffer();
//    elementContent = new StringBuilder();
//    valueName = new StringBuilder();
//    currentFeatureName = new StringBuilder();
    graphRootID = new StringBuilder();
    nIterations = 1;
    cIterations = 1;
  }
 
  private void reopen() throws MaltChainedException {
    close();
    if (fileName != null) {
      open(fileName, charsetName);
    } else if (url != null) {
      open(url, charsetName);
    } else {
      throw new DataFormatException("The input stream cannot be reopen. ");
    }
  }
 
  public void open(String fileName, String charsetName) throws MaltChainedException {
    setFileName(fileName);
    setCharsetName(charsetName);
    try {
      open(new FileInputStream(fileName), charsetName);
    }catch (FileNotFoundException e) {
      throw new DataFormatException("The input file '"+fileName+"' cannot be found. ", e);
    }
  }
  public void open(URL url, String charsetName) throws MaltChainedException {
    setUrl(url);
    setCharsetName(charsetName);
    try {
      open(url.openStream(), charsetName);
    } catch (IOException e) {
      throw new DataFormatException("The URL '"+url.toString()+"' cannot be opened. ", e);
    }
  }
 
  public void open(InputStream is, String charsetName) throws MaltChainedException {
    try {
      if (is == System.in) {
        closeStream = false;
      }
      open(new InputStreamReader(is, charsetName));
    } catch (UnsupportedEncodingException e) {
      throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported. ", e);
    }
  }
 
  private void open(InputStreamReader isr) throws MaltChainedException {
    try {
      XMLInputFactory factory = XMLInputFactory.newInstance();
      setReader(factory.createXMLStreamReader(new BufferedReader(isr)));
    } catch (XMLStreamException e) {
      throw new DataFormatException("XML input file could be opened. ", e);
    }
    setSentenceCount(0);
  }
 
  public void readProlog() throws MaltChainedException {
   
  }
 
  public boolean readSentence(TokenStructure syntaxGraph) throws MaltChainedException  {
    if (syntaxGraph == null || !(syntaxGraph instanceof PhraseStructure)) {
      return false;
    }
    syntaxGraph.clear();
    final PhraseStructure phraseStructure = (PhraseStructure)syntaxGraph;
    PhraseStructureNode parent = null;
    PhraseStructureNode child = null;
//    if (header == null) {
//      header = new TigerXMLHeader(syntaxGraph.getSymbolTables());
//    }

    try {
      while (true) {
        int event = reader.next();
        if (event == XMLStreamConstants.START_ELEMENT) {
          if (reader.getLocalName().length() == 0) {
            continue;
          }
          if (reader.getLocalName().charAt(0) == 'e') {
            // e -> edge, edgelabel
            if (reader.getLocalName().length() == 4) { //edge
              int childid = -1;
              int indexSep = reader.getAttributeValue(null, "idref").indexOf('_');
             
              try {
                if (indexSep != -1) {
                  childid = Integer.parseInt(reader.getAttributeValue(null, "idref").substring(indexSep+1));
                } else {
                  childid = Integer.parseInt(reader.getAttributeValue(null, "idref"));
                }
                if (childid == -1) {
                  throw new SyntaxGraphException("The tiger reader couldn't recognize the idref attribute '"+reader.getAttributeValue(null, "idref")+"' of the edge element. ");
                }
              } catch (NumberFormatException e) {
                throw new SyntaxGraphException("The tiger reader couldn't recognize the idref attribute '"+reader.getAttributeValue(null, "idref")+"' of the edge element. ");
              }

              if (childid < START_ID_OF_NONTERMINALS) {
                child = phraseStructure.getTokenNode(childid);
              } else {

                child = phraseStructure.getNonTerminalNode(childid-START_ID_OF_NONTERMINALS+1);
              }

              Edge e = phraseStructure.addPhraseStructureEdge(parent, child);
              SortedMap<String, SymbolTable> inputTables = dataFormatInstance.getPhraseStructureEdgeLabelSymbolTables();
              for (String name : inputTables.keySet()) {
                e.addLabel(inputTables.get(name), reader.getAttributeValue(null, name.toLowerCase()));
              }
            } else if (reader.getLocalName().equals("edgelabel")) { // edgelabel
//              domain = Domain.EL;
            }
          } else if (reader.getLocalName().charAt(0) == 'n') {
            // n -> nt, nonterminals, name
            if (reader.getLocalName().length() == 2) { // nt
              final String id = reader.getAttributeValue(null, "id");
              if (graphRootID.length() == id.length() && graphRootID.toString().equals(id)) {
                parent = phraseStructure.getPhraseStructureRoot();
              } else {
                int index = id.indexOf('_');
                if (index != -1) {
                  parent = phraseStructure.addNonTerminalNode(Integer.parseInt(id.substring(index+1))-START_ID_OF_NONTERMINALS+1);
                }
              }
              SortedMap<String, SymbolTable> inputTables = dataFormatInstance.getPhraseStructureNodeLabelSymbolTables();
              for (String name : inputTables.keySet()) {
                parent.addLabel(inputTables.get(name), reader.getAttributeValue(null, name.toLowerCase()));
              }
            } else if (reader.getLocalName().equals("name")) { // name
//              elementContent.setLength(0);
//              collectChar = true;
            }
          } else if (reader.getLocalName().charAt(0) == 't') {
            // t -> t, terminals
            if (reader.getLocalName().length() == 1) { // t
              SortedMap<String, SymbolTable> inputTables = dataFormatInstance.getInputSymbolTables();
              child = syntaxGraph.addTokenNode();
              for (String name : inputTables.keySet()) {
                child.addLabel(inputTables.get(name), reader.getAttributeValue(null, name.toLowerCase()));
              }
            }
          } else if (reader.getLocalName().charAt(0) == 's') {
            // s -> subcorpus, secedge, s, secedgelabel
            if (reader.getLocalName().length() == 1) { // s
              String id = reader.getAttributeValue(null, "id");
              boolean indexable = false;
              int index = -1;
              if (id != null && id.length() > 0) {
                for (int i = 0, n = id.length(); i < n; i++) {
                  if (Character.isDigit(id.charAt(i))) {
                    if (index == -1) {
                      index = i;
                    }
                    indexable = true;
                  }
                }
              }
              if (indexable) {
                phraseStructure.setSentenceID(Integer.parseInt(id.substring(index)));
              } else {
                phraseStructure.setSentenceID(sentenceCount+1);
              }
            }
          } else if (reader.getLocalName().charAt(0) == 'v') {
            // v -> variable, value
//            if (reader.getLocalName().equals("value")) {
//              valueName.setLength(0);
//              valueName.append(reader.getAttributeValue(null, "name"));
//              elementContent.setLength(0);
//              collectChar = true;
//            }
          } else {
//             a -> annotation, author
//             b -> body
//             c -> corpus
//             d -> date, description,
//             f -> feature, format
//             g -> graph
//             h -> head, history
//             m -> matches, match
            if (reader.getLocalName().equals("graph")) {
              graphRootID.setLength(0);
              graphRootID.append(reader.getAttributeValue(null, "root"));
            } else  if (reader.getLocalName().equals("corpus")) {
//              header.setCorpusID(reader.getAttributeValue(null, "id"));
//              header.setCorpusID(reader.getAttributeValue(null, "version"));
            } else if (reader.getLocalName().equals("feature")) {
//              if (header != null) {
//                currentFeatureName.setLength(0);
//                currentFeatureName.append(reader.getAttributeValue(null, "name"));
//                header.addFeature(reader.getAttributeValue(null, "name"), reader.getAttributeValue(null, "domain"));
//              }
//              domain = Domain.valueOf(reader.getAttributeValue(null, "domain"));
            } else if (reader.getLocalName().equals("secedgelabel")) {
//              domain = Domain.SEL;
            } else if (reader.getLocalName().equals("author")) {
//              elementContent.setLength(0);
//              collectChar = true;
            } else if (reader.getLocalName().equals("date")) {
//              elementContent.setLength(0);
//              collectChar = true;
            } else if (reader.getLocalName().equals("description")) {
//              elementContent.setLength(0);
//              collectChar = true;
            } else if (reader.getLocalName().equals("format")) {
//              elementContent.setLength(0);
//              collectChar = true;
            } else if (reader.getLocalName().equals("history")) {
//              elementContent.setLength(0);
//              collectChar = true;
            }
          }
        } else if (event == XMLStreamConstants.END_ELEMENT) {
          if (reader.getLocalName().length() == 0) {
            continue;
          }
          if (reader.getLocalName().charAt(0) == 'e') {
            // e -> edge, edgelabel
          } else if (reader.getLocalName().charAt(0) == 'n') {
            // n -> nt, nonterminals, name
            if (reader.getLocalName().equals("nt")) {
              ntid.setLength(0);
            }
            else if (reader.getLocalName().equals("nonterminals")) {
              if (phraseStructure.nTokenNode() == 1 && phraseStructure.nNonTerminals() == 0 &&((NonTerminalNode)phraseStructure.getPhraseStructureRoot()).nChildren() == 0) {
                Edge e = phraseStructure.addPhraseStructureEdge(phraseStructure.getPhraseStructureRoot(), phraseStructure.getTokenNode(1));
                SortedMap<String, SymbolTable> inputTables = dataFormatInstance.getPhraseStructureEdgeLabelSymbolTables();
                for (String name : inputTables.keySet()) {
                  e.addLabel(inputTables.get(name), "--");
                }
              }
            }
//            else if (reader.getLocalName().equals("name")) {
//              if (header != null) {
//                header.setMetaName(elementContent.toString());
//              }
//              collectChar = false;
//            }
          } else if (reader.getLocalName().charAt(0) == 't') {
            // t -> t, terminals
          } else if (reader.getLocalName().charAt(0) == 's') {
            // s -> subcorpus, secedge, s, secedgelabel
            if (reader.getLocalName().equals("s")) {
              if (syntaxGraph.hasTokens()) {
                sentenceCount++;
              }
              if (syntaxGraph instanceof MappablePhraseStructureGraph) {
                ((MappablePhraseStructureGraph)syntaxGraph).getMapping().updateDependenyGraph(((MappablePhraseStructureGraph)syntaxGraph), ((PhraseStructure)syntaxGraph).getPhraseStructureRoot());
              }
              return true;
            }
          } else if (reader.getLocalName().charAt(0) == 'v') {
            // v -> variable, value
//            if (reader.getLocalName().equals("value")) {
//              if (header != null) {
//                if (domain == Domain.T || domain == Domain.NT || domain == Domain.FREC) {
//                  header.addFeatureValue(currentFeatureName.toString(), valueName.toString(), elementContent.toString());
//                } else if (domain == Domain.EL) {
//                  header.addEdgeLabelValue(valueName.toString(), elementContent.toString());
//                } else if (domain == Domain.SEL) {
//                  header.addSecEdgeLabelValue(valueName.toString(), elementContent.toString());
//                }
//              }
//              collectChar = false;
//            }
          } else {
//             a -> annotation, author
//             b -> body
//             c -> corpus
//             d -> date, description,
//             f -> feature, format
//             g -> graph
//             h -> head, history
//             m -> matches, match
            if (reader.getLocalName().equals("body")) {
              //sentence = dataStructures.getSentence();
              //phraseTree = dataStructures.getInPhraseTree();
              //sentence.clear();
              //phraseTree.clear();
              //dataStructures.setLastProcessObject(true);
            else if (reader.getLocalName().equals("author")) {
//              if (header != null) {
//                header.setMetaAuthor(elementContent.toString());
//              }
//              collectChar = false;
            } else if (reader.getLocalName().equals("date")) {
//              if (header != null) {
//                header.setMetaInDate(elementContent.toString());
//              }
//              collectChar = false;
            } else if (reader.getLocalName().equals("description")) {
//              if (header != null) {
//                header.setMetaDescription(elementContent.toString());
//              }
//              collectChar = false;
            } else if (reader.getLocalName().equals("format")) {
//              if (header != null) {
//                header.setMetaFormat(elementContent.toString());
//              }
//              collectChar = false;
            } else if (reader.getLocalName().equals("history")) {
//              if (header != null) {
//                header.setMetaHistory(elementContent.toString());
//              }
//              collectChar = false;
            } /* else if (reader.getLocalName().equals("annotation")) {
              if (header != null) {
                System.out.println(header.toTigerXML());
              }
              collectChar = false;
            } */
          }       
        } else if (event == XMLStreamConstants.END_DOCUMENT) {
          if (syntaxGraph.hasTokens()) {
            sentenceCount++;
          }
          if (cIterations < nIterations) {
            cIterations++;
            reopen();
            return true;
          }
          return false;
        } else if (event == XMLStreamConstants.CHARACTERS) {
//          if (collectChar) {
//            char[] ch = reader.getTextCharacters();
//            final int size = reader.getTextStart()+reader.getTextLength();
//            for (int i = reader.getTextStart(); i < size; i++) {
//              elementContent.append(ch[i]);
//            }
//          }
        }
      }
    } catch (XMLStreamException e) {
      throw new DataFormatException("", e);
    }
  }
 
  public int getSentenceCount() {
    return sentenceCount;
  }

  public void setSentenceCount(int sentenceCount) {
    this.sentenceCount = sentenceCount;
  }
 
  public XMLStreamReader getReader() {
    return reader;
  }

  public void setReader(XMLStreamReader reader) {
    this.reader = reader;
  }
 
  public void readEpilog() throws MaltChainedException {
   
  }
 
  public void close() throws MaltChainedException {
    try {
      if (reader != null) {
        if (closeStream) {
          reader.close();
        }
        reader = null;
      }
    } catch (XMLStreamException e) {
      throw new DataFormatException("The XML input file could be closed. ", e);
    }
  }

  public DataFormatInstance getDataFormatInstance() {
    return dataFormatInstance;
  }
 
  public void setDataFormatInstance(DataFormatInstance inputDataFormatInstance) {
    this.dataFormatInstance = inputDataFormatInstance;
  }
 
  public String getOptions() {
    return optionString;
  }
 
  public void setOptions(String optionString) throws MaltChainedException {
    this.optionString = optionString;
    String[] argv;
    try {
      argv = optionString.split("[_\\p{Blank}]");
    } catch (PatternSyntaxException e) {
      throw new DataFormatException("Could not split the TigerXML reader option '"+optionString+"'. ", e);
    }
    for (int i=0; i < argv.length-1; i++) {
      if(argv[i].charAt(0) != '-') {
        throw new DataFormatException("The argument flag should start with the following character '-', not with "+argv[i].charAt(0));
      }
      if(++i>=argv.length) {
        throw new DataFormatException("The last argument does not have any value. ");
      }
      switch(argv[i-1].charAt(1)) {
      case 's':
        try {
          START_ID_OF_NONTERMINALS = Integer.parseInt(argv[i]);
        } catch (NumberFormatException e){
          throw new MaltChainedException("The TigerXML Reader option -s must be an integer value. ");
        }
        break;
      default:
        throw new DataFormatException("Unknown TigerXMLReader parameter: '"+argv[i-1]+"' with value '"+argv[i]+"'. ");   
      }
    }
  }
 
  public String getFileName() {
    return fileName;
  }

  public void setFileName(String fileName) {
    this.fileName = fileName;
  }

  public URL getUrl() {
    return url;
  }

  public void setUrl(URL url) {
    this.url = url;
  }

  public String getCharsetName() {
    return charsetName;
  }

  public void setCharsetName(String charsetName) {
    this.charsetName = charsetName;
  }

  public int getNIterations() {
    return nIterations;
  }

  public void setNIterations(int iterations) {
    nIterations = iterations;
  }

  public int getIterationCounter() {
    return cIterations;
  }
//  public TigerXMLHeader getHeader() {
//    return header;
//  }
// 
//  public void setHeader(TigerXMLHeader header) {
//    this.header = header;
//  }
}
TOP

Related Classes of org.maltparserx.core.syntaxgraph.reader.TigerXMLReader

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.