Source Code of edu.stanford.nlp.international.arabic.pipeline.ATBArabicDataset$ArabicRawTreeNormalizer

package edu.stanford.nlp.international.arabic.pipeline;


import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.UnsupportedEncodingException;
import java.util.Collections;
import java.util.Properties;


import edu.stanford.nlp.international.arabic.Buckwalter;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.trees.treebank.AbstractDataset;
import edu.stanford.nlp.trees.DiskTreebank;
import edu.stanford.nlp.trees.LabeledScoredTreeFactory;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeFactory;
import edu.stanford.nlp.trees.TreeVisitor;
import edu.stanford.nlp.trees.TreebankLanguagePack;
import edu.stanford.nlp.trees.BobChrisTreeNormalizer.AOverAFilter;
import edu.stanford.nlp.trees.international.arabic.*;
import java.util.function.Predicate;


/**
 * Converts raw ATB trees into a format appropriate for treebank parsing.
 *
 * @author Spence Green
 *
 */
public class ATBArabicDataset extends AbstractDataset {


  public ATBArabicDataset() {
    super();


    //Read the raw file as UTF-8 irrespective of output encoding
    treebank = new DiskTreebank(new ArabicTreeReaderFactory.ArabicRawTreeReaderFactory(true), "UTF-8");
  }


  public void build() {
    for(File path : pathsToData) {
      if(splitFilter == null) {
        treebank.loadPath(path,treeFileExtension,false);
      } else {
        treebank.loadPath(path,splitFilter);
      }
    }


    PrintWriter outfile = null;
    PrintWriter flatFile = null;
    try {
      outfile = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outFileName),"UTF-8")));
      flatFile = (makeFlatFile) ? new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(flatFileName),"UTF-8"))) : null;


      treebank.apply(new ArabicRawTreeNormalizer(outfile,flatFile));


      outputFileList.add(outFileName);


      if(makeFlatFile) {
        outputFileList.add(flatFileName);
        toStringBuffer.append(" Made flat files\n");
      }


    } catch (UnsupportedEncodingException e) {
      System.err.printf("%s: Filesystem does not support UTF-8 output\n", this.getClass().getName());
      e.printStackTrace();
    } catch (FileNotFoundException e) {
      System.err.printf("%s: Could not open %s for writing\n", this.getClass().getName(), outFileName);
    } finally {
      if(outfile != null)
        outfile.close();
      if(flatFile != null)
        flatFile.close();
    }
  }




  public boolean setOptions(Properties opts) {
    boolean ret = super.setOptions(opts);


    if(lexMapper == null) {
      lexMapper = new DefaultLexicalMapper();
      lexMapper.setup(null, lexMapOptions.split(","));
    }


    if(pathsToMappings.size() != 0) {
      if(posMapper == null) {
        posMapper = new LDCPosMapper(addDeterminer);
      }
      String[] mapOpts = posMapOptions.split(",");
      for(File path : pathsToMappings)
        posMapper.setup(path,mapOpts);
    }


    return ret;
  }




  /**
   * A {@link edu.stanford.nlp.trees.TreeVisitor} for raw ATB trees. This class performs
   * minimal pre-processing (for example, it does not prune traces). It also provides
   * a facility via <code>enableIBMArabicEscaping</code> for sub-classes to process
   * IBM Arabic parse trees.
   *
   */
  protected class ArabicRawTreeNormalizer implements TreeVisitor {
    protected final Buckwalter encodingMap;
    protected final PrintWriter outfile;
    protected final PrintWriter flatFile;
    protected final Predicate<Tree> nullFilter;
    protected final Predicate<Tree> aOverAFilter;
    protected final TreeFactory tf;
    protected final TreebankLanguagePack tlp;


    public ArabicRawTreeNormalizer(PrintWriter outFile, PrintWriter flatFile) {
      encodingMap = (encoding == Encoding.UTF8) ? new Buckwalter() : new Buckwalter(true);


      this.outfile = outFile;
      this.flatFile = flatFile;


      nullFilter = new ArabicTreeNormalizer.ArabicEmptyFilter();
      aOverAFilter = new AOverAFilter();


      tf = new LabeledScoredTreeFactory();
      tlp = new ArabicTreebankLanguagePack();
    }


    protected void processPreterminal(Tree node) {
      String rawTag = node.value();
      String posTag = (posMapper == null) ? rawTag : posMapper.map(rawTag,node.firstChild().value());
      String rawWord = node.firstChild().value();


      //Hack for LDC2008E22 idiosyncrasy in which (NOUN.VN F) is a pre-terminal/word
      //This is a bare fathatan that bears no semantic content. Replacing it with the
      //conjunction ف / f .
      if(rawWord.equals("F")) {
        posTag = posTag.equals("NOUN.VN") ? "CONJ" : "CC";
        rawWord = "f";
      }


      // Hack for annotation error in ATB
      if (rawWord.startsWith("MERGE_with_previous_token:")) {
        rawWord = rawWord.replace("MERGE_with_previous_token:", "");
      }


      // Hack for annotation error in ATB
      if (rawWord.contains("e")) {
        rawWord = rawWord.replace("e", "");
      }


      String finalWord = lexMapper.map(rawTag, rawWord);
      if(lexMapper.canChangeEncoding(rawTag, finalWord))
        finalWord = encodingMap.apply(finalWord);


      node.setValue(posTag);
      if(morphDelim == null) {
        node.firstChild().setValue(finalWord);
        if (node.firstChild().label() instanceof CoreLabel) ((CoreLabel) node.firstChild().label()).setWord(finalWord);
      } else {
        node.firstChild().setValue(finalWord + morphDelim + rawTag);
      }
    }


    //Modifies the tree in-place...should be run after
    //mapping to reduced tag set
    public Tree arabicAoverAFilter(Tree t) {
      if(t == null || t.isLeaf() || t.isPreTerminal())
        return t;


      //Specific nodes to filter out
      if(t.numChildren() == 1) {
        final Tree fc = t.firstChild();


        //A over A nodes i.e. from BobChrisTreeNormalizer
        if(t.label() != null && fc.label() != null && t.value().equals(fc.value())) {
          t.setChildren(fc.children());
        }
      }


      for(Tree kid : t.getChildrenAsList())
        arabicAoverAFilter(kid);


      return t;
    }




    public void visitTree(Tree t) {
      // Filter out XBar trees
      if(t == null || t.value().equals("X")) return;
      if(t.yield().size() > maxLen) return;


      // Strip out traces and pronoun deletion markers,
      t = t.prune(nullFilter, tf);
      t = arabicAoverAFilter(t);


      // Visit nodes with a custom visitor
      if(customTreeVisitor != null)
        customTreeVisitor.visitTree(t);


      // Process each node in the tree
      for(Tree node : t) {
        if(node.isPreTerminal()) {
          processPreterminal(node);
        }
        if(removeDashTags && !node.isLeaf())
          node.setValue(tlp.basicCategory(node.value()));
      }


      // Add a ROOT node if necessary
      if (addRoot && t.value() != null && !t.value().equals("ROOT")) {
        t = tf.newTreeNode("ROOT", Collections.singletonList(t));
      }


      // Output the trees to file
      outfile.println(t.toString());
      if(flatFile != null) {
        String flatString = (removeEscapeTokens) ?
            ATBTreeUtils.unEscape(ATBTreeUtils.flattenTree(t)) : ATBTreeUtils.flattenTree(t);
        flatFile.println(flatString);
      }
    }
  }
}
Source Code of edu.stanford.nlp.international.arabic.pipeline.ATBArabicDataset$ArabicRawTreeNormalizer

Related Classes of edu.stanford.nlp.international.arabic.pipeline.ATBArabicDataset$ArabicRawTreeNormalizer