// Source Code of edu.stanford.nlp.international.spanish.pipeline.AnCoraPOSStats

package edu.stanford.nlp.international.spanish.pipeline;


import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.stats.TwoDimensionalCounter;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeReader;
import edu.stanford.nlp.trees.international.spanish.SpanishTreeNormalizer;
import edu.stanford.nlp.trees.international.spanish.SpanishTreebankLanguagePack;
import edu.stanford.nlp.trees.international.spanish.SpanishXMLTreeReaderFactory;
import edu.stanford.nlp.util.StringUtils;


import java.io.*;
import java.util.*;


/**
 * A utility to build unigram part-of-speech tagging data from XML
 * corpus files from the AnCora corpus.
 *
 * The constructed tagger is used to tag the constituent tokens of
 * multi-word expressions, which have no tags in the AnCora corpus.
 *
 * For invocation options, run the program with no arguments.
 *
 * @author Jon Gauthier
 */
public class AnCoraPOSStats {


  private final TwoDimensionalCounter<String, String> unigramTagger;


  private List<File> fileList;
  private String outputPath;


  public AnCoraPOSStats(List<File> fileList, String outputPath) {
    this.fileList = fileList;
    this.outputPath = outputPath;


    unigramTagger = new TwoDimensionalCounter<String, String>();
  }


  public void process() throws IOException {
    SpanishXMLTreeReaderFactory trf = new SpanishXMLTreeReaderFactory();


    Tree t;
    for (File file : fileList) {
      Reader in =
        new BufferedReader(new InputStreamReader(new FileInputStream(file),
                                                 SpanishTreebankLanguagePack.STB_ENCODING));
      TreeReader tr = trf.newTreeReader(in);


      // Tree reading will implicitly perform tree normalization for us
      while ((t = tr.readTree()) != null) {
        // Update tagger with this tree
        List<CoreLabel> yield = t.taggedLabeledYield();
        for (CoreLabel leafLabel : yield) {
          if (leafLabel.tag().equals(SpanishTreeNormalizer.MW_TAG))
            continue;


          unigramTagger.incrementCount(leafLabel.word(), leafLabel.tag());
        }
      }
    }
  }


  public TwoDimensionalCounter<String, String> getUnigramTagger() {
    return unigramTagger;
  }


  private static final String usage =
    String.format("Usage: java %s -o <output_path> file(s)%n%n", AnCoraPOSStats.class.getName());


  private static final Map<String, Integer> argOptionDefs = new HashMap<String, Integer>();
  static {
    argOptionDefs.put("o", 1);
  }


  public static void main(String[] args) throws IOException {
    if (args.length < 1) {
      System.err.println(usage);
      System.exit(1);
    }


    Properties options = StringUtils.argsToProperties(args, argOptionDefs);


    String outputPath = options.getProperty("o");
    if (outputPath == null)
      throw new IllegalArgumentException("-o argument (output path for built tagger) is required");


    String[] remainingArgs = options.getProperty("").split(" ");
    List<File> fileList = new ArrayList<File>();
    for (String arg : remainingArgs)
      fileList.add(new File(arg));


    AnCoraPOSStats stats = new AnCoraPOSStats(fileList, outputPath);
    stats.process();


    ObjectOutputStream oos = new ObjectOutputStream(new FileOutputStream(outputPath));
    TwoDimensionalCounter<String, String> tagger = stats.getUnigramTagger();
    oos.writeObject(tagger);


    System.out.printf("Wrote tagger to %s%n", outputPath);
  }


}
// Source Code of edu.stanford.nlp.international.spanish.pipeline.AnCoraPOSStats

// Related Classes of edu.stanford.nlp.international.spanish.pipeline.AnCoraPOSStats