Package edu.stanford.nlp.tagger.util

Source Code of edu.stanford.nlp.tagger.util.ConvertTreesToTags

package edu.stanford.nlp.tagger.util;

import java.io.BufferedWriter;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;

import edu.stanford.nlp.ling.Sentence;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.tagger.io.TaggedFileRecord;

/**
* A short utility program that dumps out trees from multiple files
* into one file of tagged text.  Useful for combining many parse tree
* training files into one tagger training file, since the tagger
* doesn't have convenient ways of reading in an entire directory.
* <br>
* There are a few command line arguments available:
* <table>
* <tr>
* <td> -output &lt;filename&gt; </td>
* <td> File to output the data to </td>
* </tr>
* <tr>
* <td> -tagSeparator &lt;separator&gt; </td>
* <td> Separator to use between word and tag </td>
* </tr>
* <tr>
* <td> -treeRange &lt;range&gt; </td>
* <td> If tree files have numbers, they will be filtered out if not
*      in this range.  Can be null. </td>
* </tr>
* <tr>
* <td> -inputEncoding &lt;encoding&gt; </td>
* <td> Encoding to use when reading tree files </td>
* </tr>
* <tr>
* <td> -outputEncoding &lt;encoding&gt; </td>
* <td> Encoding to use when writing tags </td>
* </tr>
* <tr>
* <td> -treeFilter &lt;classname&gt; </td>
* <td> A Filter&lt;Tree&gt; to load by reflection which eliminates
*      trees from the data read </td>
* </tr>
* <tr>
* <td> -noTags </td>
* <td> If present, will only output the words, no tags at all </td>
* </tr>
* <tr>
* <td> -noSpaces </td>
* <td> If present, words will be concatenated together </td>
* </tr>
* </table>
*
* All other arguments will be treated as filenames to read.
*
* @author John Bauer
*/
public class ConvertTreesToTags {
  private ConvertTreesToTags() {}; // main method only

  public static void main(String[] args) throws IOException {
    String outputFilename = "";
    String tagSeparator = "";
    String treeRange = "";
    String inputEncoding = "UTF-8";
    String outputEncoding = "UTF-8";
    String treeFilter = "";
    boolean noTags = false;
    boolean noSpaces = false;
    List<String> inputFilenames = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
      if ((args[i].equalsIgnoreCase("-output") ||
           args[i].equalsIgnoreCase("--output")) &&
          (i + 1 < args.length)) {
        outputFilename = args[i + 1];
        i++;
      } else if ((args[i].equalsIgnoreCase("-tagSeparator") ||
                  args[i].equalsIgnoreCase("--tagSeparator")) &&
                 (i + 1 < args.length)) {
        tagSeparator = args[i + 1];
        i++;
      } else if ((args[i].equalsIgnoreCase("-treeRange") ||
                  args[i].equalsIgnoreCase("--treeRange")) &&
                 (i + 1 < args.length)) {
        treeRange = args[i + 1];
        i++;
      } else if ((args[i].equalsIgnoreCase("-inputEncoding") ||
                  args[i].equalsIgnoreCase("--inputEncoding")) &&
                 (i + 1 < args.length)) {
        inputEncoding = args[i + 1];
        i++;
      } else if ((args[i].equalsIgnoreCase("-outputEncoding") ||
                  args[i].equalsIgnoreCase("--outputEncoding")) &&
                 (i + 1 < args.length)) {
        outputEncoding = args[i + 1];
        i++;
      } else if ((args[i].equalsIgnoreCase("-treeFilter") ||
                  args[i].equalsIgnoreCase("--treeFilter")) &&
                 (i + 1< args.length)) {
        treeFilter = args[i + 1];
        i++;
      } else if (args[i].equalsIgnoreCase("-noTags") ||
                 args[i].equalsIgnoreCase("--noTags")) {
        noTags = true;
      } else if (args[i].equalsIgnoreCase("-noSpaces") ||
                 args[i].equalsIgnoreCase("--noSpaces")) {
        noSpaces = true;
      } else {
        inputFilenames.add(args[i]);
      }
    }
    if (outputFilename.equals("")) {
      System.err.println("Must specify an output filename, -output");
      System.exit(2);
    }
    if (inputFilenames.size() == 0) {
      System.err.println("Must specify one or more input filenames");
      System.exit(2);
    }

    FileOutputStream fos = new FileOutputStream(outputFilename);
    OutputStreamWriter osw = new OutputStreamWriter(fos, outputEncoding);
    BufferedWriter bout = new BufferedWriter(osw);
    Properties props = new Properties();
    for (String filename : inputFilenames) {
      String description = (TaggedFileRecord.FORMAT + "=" +
                            TaggedFileRecord.Format.TREES + "," + filename);
      if (!treeRange.equals("")) {
        description = (TaggedFileRecord.TREE_RANGE + "=" + treeRange +
                       "," + description);
      }
      if (!treeFilter.equals("")) {
        description = (TaggedFileRecord.TREE_FILTER + "=" + treeFilter +
                       "," + description);
      }
      description = (TaggedFileRecord.ENCODING + "=" + inputEncoding +
                     "," + description);
      TaggedFileRecord record =
        TaggedFileRecord.createRecord(props, description);
      for (List<TaggedWord> sentence : record.reader()) {
        String output = Sentence.listToString(sentence, noTags, tagSeparator);
        if (noSpaces) {
          output = output.replaceAll(" ", "");
        }
        bout.write(output);
        bout.newLine();
      }
    }
    bout.flush();
    bout.close();
    osw.close();
    fos.close();
  }
}
TOP

Related Classes of edu.stanford.nlp.tagger.util.ConvertTreesToTags

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.