Package edu.stanford.nlp.tagger.io

Source Code of edu.stanford.nlp.tagger.io.TaggedFileRecord

package edu.stanford.nlp.tagger.io;

import java.util.ArrayList;
import java.util.List;
import java.util.Properties;

import edu.stanford.nlp.io.NumberRangesFileFilter;
import edu.stanford.nlp.tagger.maxent.TaggerConfig;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeNormalizer;
import edu.stanford.nlp.trees.TreeReaderFactory;
import edu.stanford.nlp.trees.TreeTransformer;
import java.util.function.Predicate;
import edu.stanford.nlp.util.ReflectionLoading;

/** Parses and specifies all the details for how to read some POS tagging data.
*  The options for this class are documented in MaxentTagger.
*
@author John Bauer
*/
public class TaggedFileRecord {

  public enum Format {
    TEXT,  // represents a tokenized file separated by text
    TSV,   // represents a tsv file such as a conll file
    TREES // represents a file in PTB format
  }

  final String file;
  final Format format;
  final String encoding;
  final String tagSeparator;
  final TreeTransformer treeTransformer;
  final TreeNormalizer treeNormalizer;
  final NumberRangesFileFilter treeRange;
  final Predicate<Tree> treeFilter;
  final Integer wordColumn;
  final Integer tagColumn;
  final TreeReaderFactory trf;

  private TaggedFileRecord(String file, Format format,
                           String encoding, String tagSeparator,
                           TreeTransformer treeTransformer,
                           TreeNormalizer treeNormalizer,
                           TreeReaderFactory trf,
                           NumberRangesFileFilter treeRange,
                           Predicate<Tree> treeFilter,
                           Integer wordColumn, Integer tagColumn) {
    this.file = file;
    this.format = format;
    this.encoding = encoding;
    this.tagSeparator = tagSeparator;
    this.treeTransformer = treeTransformer;
    this.treeNormalizer = treeNormalizer;
    this.treeRange = treeRange;
    this.treeFilter = treeFilter;
    this.wordColumn = wordColumn;
    this.tagColumn = tagColumn;
    this.trf = trf;
  }

  public static final String FORMAT = "format";
  public static final String ENCODING = "encoding";
  public static final String TAG_SEPARATOR = "tagSeparator";
  public static final String TREE_TRANSFORMER = "treeTransformer";
  public static final String TREE_NORMALIZER = "treeNormalizer";
  public static final String TREE_RANGE = "treeRange";
  public static final String TREE_FILTER = "treeFilter";
  public static final String WORD_COLUMN = "wordColumn";
  public static final String TAG_COLUMN = "tagColumn";
  public static final String TREE_READER = "trf";

  public String toString() {
    StringBuilder s = new StringBuilder();
    s.append(FORMAT + "=" + format);
    s.append("," + ENCODING + "=" + encoding);
    s.append("," + TAG_SEPARATOR + "=" + tagSeparator);
    if (treeTransformer != null) {
      s.append("," + TREE_TRANSFORMER + "=" +
               treeTransformer.getClass().getName());
    }
    if (trf != null) {
      s.append("," + TREE_READER + "=" +
               trf.getClass().getName());
    }
    if (treeNormalizer != null) {
      s.append("," + TREE_NORMALIZER + "=" +
               treeNormalizer.getClass().getName());
    }
    if (treeRange != null) {
      s.append("," + TREE_RANGE + "=" +
               treeRange.toString().replaceAll(",", ":"));
    }
    if (treeRange != null) {
      s.append("," + TREE_FILTER + "=" + treeFilter.getClass().toString());
    }
    if (wordColumn != null) {
      s.append("," + WORD_COLUMN + "=" + wordColumn);
    }
    if (tagColumn != null) {
      s.append("," + TAG_COLUMN + "=" + tagColumn);
    }
    return s.toString();
  }

  public String filename() { return file; }

  public TaggedFileReader reader() {
    switch(format) {
    case TEXT:
      return new TextTaggedFileReader(this);
    case TREES:
      return new TreeTaggedFileReader(this);
    case TSV:
      return new TSVTaggedFileReader(this);
    default:
      throw new IllegalArgumentException("Unknown format " + format);
    }
  }

  public static List<TaggedFileRecord> createRecords(Properties config,
                                                     String description) {
    String[] pieces = description.split(";");
    List<TaggedFileRecord> records = new ArrayList<TaggedFileRecord>();
    for (String piece : pieces) {
      records.add(createRecord(config, piece));
    }
    return records;
  }

  public static TaggedFileRecord createRecord(Properties config,
                                              String description) {
    String[] pieces = description.split(",");
    if (pieces.length == 1) {
      return new TaggedFileRecord(description, Format.TEXT,
                                  getEncoding(config),
                                  getTagSeparator(config),
                                  null, null, null, null, null, null, null);
    }

    String[] args = new String[pieces.length - 1];
    System.arraycopy(pieces, 0, args, 0, pieces.length - 1);
    String file = pieces[pieces.length - 1];
    Format format = Format.TEXT;
    String encoding = getEncoding(config);
    String tagSeparator = getTagSeparator(config);
    TreeTransformer treeTransformer = null;
    TreeNormalizer treeNormalizer = null;
    TreeReaderFactory trf = null;
    NumberRangesFileFilter treeRange = null;
    Predicate<Tree> treeFilter = null;
    Integer wordColumn = null, tagColumn = null;

    for (String arg : args) {
      String[] argPieces = arg.split("=", 2);
      if (argPieces.length != 2) {
        throw new IllegalArgumentException("TaggedFileRecord argument " + arg +
                                           " has an unexpected number of =s");
      }
      if (argPieces[0].equalsIgnoreCase(FORMAT)) {
        format = Format.valueOf(argPieces[1]);
      } else if (argPieces[0].equalsIgnoreCase(ENCODING)) {
        encoding = argPieces[1];
      } else if (argPieces[0].equalsIgnoreCase(TAG_SEPARATOR)) {
        tagSeparator = argPieces[1];
      } else if (argPieces[0].equalsIgnoreCase(TREE_TRANSFORMER)) {
        treeTransformer = ReflectionLoading.loadByReflection(argPieces[1]);
      } else if (argPieces[0].equalsIgnoreCase(TREE_NORMALIZER)) {
        treeNormalizer = ReflectionLoading.loadByReflection(argPieces[1]);
      } else if (argPieces[0].equalsIgnoreCase(TREE_READER)) {
        trf = ReflectionLoading.loadByReflection(argPieces[1]);
      } else if (argPieces[0].equalsIgnoreCase(TREE_RANGE)) {
        String range = argPieces[1].replaceAll(":", ",");
        treeRange = new NumberRangesFileFilter(range, true);
      } else if (argPieces[0].equalsIgnoreCase(TREE_FILTER)) {
        treeFilter = ReflectionLoading.loadByReflection(argPieces[1]);
      } else if (argPieces[0].equalsIgnoreCase(WORD_COLUMN)) {
        wordColumn = Integer.valueOf(argPieces[1]);
      } else if (argPieces[0].equalsIgnoreCase(TAG_COLUMN)) {
        tagColumn = Integer.valueOf(argPieces[1]);
      } else {
        throw new IllegalArgumentException("TaggedFileRecord argument " +
                                           argPieces[0] + " is unknown");
      }
    }
    return new TaggedFileRecord(file, format, encoding, tagSeparator,
                                treeTransformer, treeNormalizer, trf, treeRange,
                                treeFilter, wordColumn, tagColumn);
  }

  public static String getEncoding(Properties config) {
    String encoding = config.getProperty(TaggerConfig.ENCODING_PROPERTY);
    if (encoding == null)
      return TaggerConfig.ENCODING;
    return encoding;
  }

  public static String getTagSeparator(Properties config) {
    String tagSeparator =
      config.getProperty(TaggerConfig.TAG_SEPARATOR_PROPERTY);
    if (tagSeparator == null)
      return TaggerConfig.TAG_SEPARATOR;
    return tagSeparator;
  }

}
TOP

Related Classes of edu.stanford.nlp.tagger.io.TaggedFileRecord

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.