package edu.stanford.nlp.pipeline;
import java.io.Reader;
import java.io.StringReader;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.Properties;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.process.TokenizerFactory;
import edu.stanford.nlp.process.CoreLabelTokenFactory;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.process.Tokenizer;
import edu.stanford.nlp.process.WhitespaceTokenizer;
import edu.stanford.nlp.international.spanish.process.SpanishTokenizer;
import edu.stanford.nlp.international.french.process.FrenchTokenizer;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.PropertiesUtils;
/**
 * This class will tokenize the input (by default with a PTB-style
 * tokenizer). It assumes that the original String is under the
 * CoreAnnotations.TextAnnotation field and it will add the tokenizer
 * output ({@code List<CoreLabel>}) under
 * CoreAnnotations.TokensAnnotation.
 *
 * @author Jenny Finkel
 * @author Christopher Manning
 * @author Ishita Prasad
 */
public class TokenizerAnnotator implements Annotator {

  /**
   * Enum to identify the different TokenizerTypes. To add a new
   * TokenizerType, add it to the list with a default options string
   * and add a clause in getTokenizerType to identify it.
   */
  public enum TokenizerType {
    Unspecified(null, null, "invertible,ptb3Escaping=true"),
    Spanish("es", "SpanishTokenizer", "invertible,ptb3Escaping=true,splitAll=true"),
    English("en", "PTBTokenizer", "invertible,ptb3Escaping=true"),
    German("de", null, "invertible,ptb3Escaping=true"),
    French("fr", "FrenchTokenizer", ""),
    Whitespace(null, "WhitespaceTokenizer", "");

    private final String abbreviation;   // two-letter language code (e.g. "en"), or null
    private final String className;      // simple tokenizer class name matched against tokenize.class, or null
    private final String defaultOptions; // options applied when tokenize.options is not set

    private TokenizerType(String abbreviation, String className, String defaultOptions) {
      this.abbreviation = abbreviation;
      this.className = className;
      this.defaultOptions = defaultOptions;
    }

    /** Returns the option string used when the tokenize.options property is absent. */
    public String getDefaultOptions() {
      return defaultOptions;
    }

    /** Maps upper-cased language abbreviations ("ES") and enum names ("SPANISH") to types. */
    private static final Map<String, TokenizerType> nameToTokenizerMap = initializeNameMap();

    private static Map<String, TokenizerType> initializeNameMap() {
      Map<String, TokenizerType> map = Generics.newHashMap();
      for (TokenizerType type : TokenizerType.values()) {
        if (type.abbreviation != null) {
          map.put(type.abbreviation.toUpperCase(), type);
        }
        map.put(type.toString().toUpperCase(), type);
      }
      return Collections.unmodifiableMap(map);
    }

    /** Maps upper-cased tokenizer class names ("PTBTOKENIZER") to types. */
    private static final Map<String, TokenizerType> classToTokenizerMap = initializeClassMap();

    private static Map<String, TokenizerType> initializeClassMap() {
      Map<String, TokenizerType> map = Generics.newHashMap();
      for (TokenizerType type : TokenizerType.values()) {
        if (type.className != null) {
          map.put(type.className.toUpperCase(), type);
        }
      }
      return Collections.unmodifiableMap(map);
    }

    /**
     * Get the TokenizerType based on what's in the properties.
     * Precedence: tokenize.whitespace, then tokenize.class, then tokenize.language.
     *
     * @param props the pipeline properties
     * @return the matching TokenizerType, or Unspecified if no relevant property is set
     * @throws IllegalArgumentException if tokenize.class or tokenize.language names an unknown tokenizer
     */
    public static TokenizerType getTokenizerType(Properties props) {
      String tokClass = props.getProperty("tokenize.class", null);
      boolean whitespace = Boolean.parseBoolean(props.getProperty("tokenize.whitespace", "false"));
      String language = props.getProperty("tokenize.language", null);

      if (whitespace) {
        return Whitespace;
      }

      if (tokClass != null) {
        TokenizerType type = classToTokenizerMap.get(tokClass.toUpperCase());
        if (type == null) {
          throw new IllegalArgumentException("TokenizerAnnotator: unknown tokenize.class property " + tokClass);
        }
        return type;
      }

      if (language != null) {
        TokenizerType type = nameToTokenizerMap.get(language.toUpperCase());
        if (type == null) {
          throw new IllegalArgumentException("TokenizerAnnotator: unknown tokenize.language property " + language);
        }
        return type;
      }

      return Unspecified;
    }
  } // end enum TokenizerType

  /** Property that makes end-of-line significant for the whitespace tokenizer. */
  public static final String EOL_PROPERTY = "tokenize.keepeol";

  private final boolean VERBOSE;
  private final TokenizerFactory<CoreLabel> factory;

  // CONSTRUCTORS

  /** Verbose, English-tokenizing annotator. */
  public TokenizerAnnotator() {
    this(true);
  }

  public TokenizerAnnotator(boolean verbose) {
    this(verbose, TokenizerType.English);
  }

  public TokenizerAnnotator(String lang) {
    this(true, lang, null);
  }

  public TokenizerAnnotator(boolean verbose, TokenizerType lang) {
    this(verbose, lang.toString());
  }

  public TokenizerAnnotator(boolean verbose, String lang) {
    this(verbose, lang, null);
  }

  /**
   * @param verbose whether to print progress to stderr
   * @param lang language name or abbreviation, or null for the default tokenizer
   * @param options extra tokenizer options, or null
   */
  public TokenizerAnnotator(boolean verbose, String lang, String options) {
    VERBOSE = verbose;
    Properties props = new Properties();
    if (lang != null) {
      props.setProperty("tokenize.language", lang);
    }
    TokenizerType type = TokenizerType.getTokenizerType(props);
    factory = initFactory(type, props, options);
  }

  public TokenizerAnnotator(boolean verbose, Properties props) {
    this(verbose, props, null);
  }

  /**
   * @param verbose whether to print progress to stderr
   * @param props pipeline properties (may be null, treated as empty)
   * @param options extra tokenizer options, or null
   */
  public TokenizerAnnotator(boolean verbose, Properties props, String options) {
    VERBOSE = verbose;
    if (props == null) {
      props = new Properties();
    }
    TokenizerType type = TokenizerType.getTokenizerType(props);
    factory = initFactory(type, props, options);
  }

  /**
   * initFactory returns the right type of TokenizerFactory based on the options in the properties file
   * and the type. When adding a new Tokenizer, modify TokenizerType.getTokenizerType() to retrieve
   * your tokenizer from the properties file, and then add a case to the switch structure here to
   * instantiate the new Tokenizer type.
   *
   * @param type the TokenizerType
   * @param props the properties file
   * @param extraOptions extra things that should be passed into the tokenizer constructor
   */
  private TokenizerFactory<CoreLabel> initFactory(TokenizerType type, Properties props, String extraOptions) throws IllegalArgumentException {
    TokenizerFactory<CoreLabel> factory;
    String options = props.getProperty("tokenize.options", null);

    // set it to the equivalent of both extraOptions and options
    // TODO: maybe we should always have getDefaultOptions() and
    // expect the user to turn off default options. That would
    // require all options to have negated options, but
    // currently there are some which don't have that
    if (options == null) {
      options = type.getDefaultOptions();
    }
    if (extraOptions != null) {
      if (extraOptions.endsWith(",")) {
        options = extraOptions + options;
      } else {
        options = extraOptions + "," + options;
      }
    }

    switch (type) {
    case Spanish:
      factory = SpanishTokenizer.factory(new CoreLabelTokenFactory(), options);
      break;

    case French:
      factory = FrenchTokenizer.factory(new CoreLabelTokenFactory(), options);
      break;

    case Whitespace:
      // end-of-line is significant if either the keepeol or the newline-splitter property asks for it
      boolean eolIsSignificant = Boolean.parseBoolean(props.getProperty(EOL_PROPERTY, "false"));
      eolIsSignificant = eolIsSignificant || Boolean.parseBoolean(props.getProperty(StanfordCoreNLP.NEWLINE_SPLITTER_PROPERTY, "false"));
      factory = new WhitespaceTokenizer.WhitespaceTokenizerFactory<CoreLabel>(new CoreLabelTokenFactory(), eolIsSignificant);
      break;

    case English:
    case German:
      factory = PTBTokenizer.factory(new CoreLabelTokenFactory(), options);
      break;

    case Unspecified:
      System.err.println("TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.");
      factory = PTBTokenizer.factory(new CoreLabelTokenFactory(), options);
      break;

    default:
      throw new IllegalArgumentException("No valid tokenizer type provided.\n" +
          "Use -tokenize.language, -tokenize.class, or -tokenize.whitespace \n" +
          "to specify a tokenizer.");
    }
    return factory;
  }

  /**
   * Returns a thread-safe tokenizer
   */
  public Tokenizer<CoreLabel> getTokenizer(Reader r) {
    return factory.getTokenizer(r);
  }

  /**
   * Does the actual work of splitting TextAnnotation into CoreLabels,
   * which are then attached to the TokensAnnotation.
   */
  @Override
  public void annotate(Annotation annotation) {
    if (VERBOSE) {
      System.err.print("Tokenizing ... ");
    }
    if (annotation.has(CoreAnnotations.TextAnnotation.class)) {
      String text = annotation.get(CoreAnnotations.TextAnnotation.class);
      Reader r = new StringReader(text);
      // don't wrap in BufferedReader. It gives you nothing for in-memory String unless you need the readLine() method!
      List<CoreLabel> tokens = getTokenizer(r).tokenize();
      annotation.set(CoreAnnotations.TokensAnnotation.class, tokens);
      if (VERBOSE) {
        System.err.println("done.");
        System.err.println("Tokens: " + annotation.get(CoreAnnotations.TokensAnnotation.class));
      }
    } else {
      throw new RuntimeException("Tokenizer unable to find text in annotation: " + annotation);
    }
  }

  @Override
  public Set<Requirement> requires() {
    return Collections.emptySet();
  }

  @Override
  public Set<Requirement> requirementsSatisfied() {
    return Collections.singleton(TOKENIZE_REQUIREMENT);
  }

}