Package edu.stanford.nlp.international.arabic.pipeline

Source Code of edu.stanford.nlp.international.arabic.pipeline.TaggedArabicDataset

package edu.stanford.nlp.international.arabic.pipeline;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.UnsupportedEncodingException;

import edu.stanford.nlp.trees.treebank.ConfigParser;
import edu.stanford.nlp.trees.LabeledScoredTreeFactory;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.international.arabic.ATBTreeUtils;

/**
* Converts ATB gold parse trees to a format appropriate for training a POS tagger (especially
* the Stanford POS tagger!).
*
* @author Spence Green
*
*/
public class TaggedArabicDataset extends ATBArabicDataset {

  private String wordTagDelim = "_";

  @Override
  public void build() {
    //Set specific options for this dataset
    if(options.containsKey(ConfigParser.paramTagDelim)) {
      wordTagDelim = options.getProperty(ConfigParser.paramTagDelim);
    }

    for(File path : pathsToData) {
      int prevSize = treebank.size();
      if(splitFilter == null) {
        treebank.loadPath(path,treeFileExtension,false);
      } else {
        treebank.loadPath(path,splitFilter);
      }
      toStringBuffer.append(String.format(" Loaded %d trees from %s\n", treebank.size() - prevSize, path.getPath()));
      prevSize = treebank.size();
    }

    PrintWriter outfile = null;
    PrintWriter flatFile = null;
    try {
      outfile = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outFileName),"UTF-8")));
      flatFile = (makeFlatFile) ? new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(flatFileName),"UTF-8"))) : null;

      ArabicTreeTaggedNormalizer tv = new ArabicTreeTaggedNormalizer(outfile,flatFile);

      treebank.apply(tv);

      outputFileList.add(outFileName);

      if(makeFlatFile) {
        outputFileList.add(flatFileName);
      }

    } catch (UnsupportedEncodingException e) {
      System.err.printf("%s: Filesystem does not support UTF-8 output%n", this.getClass().getName());
      e.printStackTrace();
    } catch (FileNotFoundException e) {
      System.err.printf("%s: Could not open %s for writing%n", this.getClass().getName(), outFileName);
    } finally {
      if(outfile != null) {
        outfile.close();
      }
      if(flatFile != null) {
        flatFile.close();
      }
    }
  }

  protected class ArabicTreeTaggedNormalizer extends ArabicRawTreeNormalizer {

    public ArabicTreeTaggedNormalizer(PrintWriter outFile, PrintWriter flatFile) {
      super(outFile,flatFile);
    }

    public void visitTree(Tree t) {
      if(t == null || t.value().equals("X")) return;

      t = t.prune(nullFilter, new LabeledScoredTreeFactory());

      for(Tree node : t) {
        if(node.isPreTerminal()) {
          processPreterminal(node);
        }
      }

      outfile.println(ATBTreeUtils.taggedStringFromTree(t, removeEscapeTokens, wordTagDelim));

      if(flatFile != null) {
        flatFile.println(ATBTreeUtils.flattenTree(t));
      }
    }
  }
}
TOP

Related Classes of edu.stanford.nlp.international.arabic.pipeline.TaggedArabicDataset

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., and is owned by Oracle Inc. Contact coftware#gmail.com.