// Source Code of edu.stanford.nlp.pipeline.TrueCaseAnnotator

package edu.stanford.nlp.pipeline;


import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.StringTokenizer;


import edu.stanford.nlp.ie.crf.CRFBiasedClassifier;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.objectbank.ObjectBank;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.Generics;


public class TrueCaseAnnotator implements Annotator {


  @SuppressWarnings("unchecked")
  private CRFBiasedClassifier trueCaser;
  
  private Map<String,String> mixedCaseMap = Generics.newHashMap();
  
  private boolean VERBOSE = true;
  
  public static final String DEFAULT_MODEL_BIAS = "INIT_UPPER:-0.7,UPPER:-0.7,O:0";
  
  public TrueCaseAnnotator() {
    this(true);
  }


  public TrueCaseAnnotator(boolean verbose) {
    this(System.getProperty("truecase.model", DefaultPaths.DEFAULT_TRUECASE_MODEL), 
        System.getProperty("truecase.bias", DEFAULT_MODEL_BIAS),
        System.getProperty("truecase.mixedcasefile", DefaultPaths.DEFAULT_TRUECASE_DISAMBIGUATION_LIST),
        verbose);
  }


  @SuppressWarnings("unchecked")
  public TrueCaseAnnotator(String modelLoc, 
      String classBias,
      String mixedCaseFileName,
      boolean verbose){
    this.VERBOSE = verbose;
    
    Properties props = new Properties();
    props.setProperty("loadClassifier", modelLoc);
    props.setProperty("mixedCaseMapFile", mixedCaseFileName);
    props.setProperty("classBias", classBias);
    trueCaser = new CRFBiasedClassifier(props);
    
    if (modelLoc != null) {
      trueCaser.loadClassifierNoExceptions(modelLoc, props);
    } else {
      throw new RuntimeException("Model location not specified for true-case classifier!");
    }
    
    if(classBias != null) {
      StringTokenizer biases = new java.util.StringTokenizer(classBias,",");
      while (biases.hasMoreTokens()) {
        StringTokenizer bias = new java.util.StringTokenizer(biases.nextToken(),":");
        String cname = bias.nextToken();
        double w = Double.parseDouble(bias.nextToken());
        trueCaser.setBiasWeight(cname,w);
        if(VERBOSE) System.err.println("Setting bias for class "+cname+" to "+w);
      }
    }
    
    // Load map containing mixed-case words:
    mixedCaseMap = loadMixedCaseMap(mixedCaseFileName);
  }


  @SuppressWarnings("unchecked")
  public void annotate(Annotation annotation) {
    if (VERBOSE) {
      System.err.print("Adding true-case annotation...");
    }
    
    if (annotation.containsKey(CoreAnnotations.SentencesAnnotation.class)) {
      // classify tokens for each sentence 
      for (CoreMap sentence: annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
        List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
        List<CoreLabel> output = this.trueCaser.classifySentence(tokens);
        for (int i = 0; i < tokens.size(); ++i) {
          
          // add the named entity tag to each token
          String neTag = output.get(i).get(CoreAnnotations.AnswerAnnotation.class);
          tokens.get(i).set(CoreAnnotations.TrueCaseAnnotation.class, neTag);
          setTrueCaseText(tokens.get(i));
        }
      }
    } else {
      throw new RuntimeException("unable to find sentences in: " + annotation);
    }
  }
  
  private void setTrueCaseText(CoreLabel l) {
    String trueCase = l.getString(CoreAnnotations.TrueCaseAnnotation.class);
    String text = l.word();
    String trueCaseText = text;


    switch (trueCase) {
      case "UPPER":
        trueCaseText = text.toUpperCase();
        break;
      case "LOWER":
        trueCaseText = text.toLowerCase();
        break;
      case "INIT_UPPER":
        trueCaseText = text.substring(0, 1).toUpperCase() + text.substring(1);
        break;
      case "O":
        // The model predicted mixed case, so lookup the map:
        if (mixedCaseMap.containsKey(text))
          trueCaseText = mixedCaseMap.get(text);
        break;
    }
    
    l.set(CoreAnnotations.TrueCaseTextAnnotation.class, trueCaseText);
  }
  
  public static Map<String,String> loadMixedCaseMap(String mapFile) {
    Map<String,String> map = Generics.newHashMap();
    try {
      InputStream is = IOUtils.getInputStreamFromURLOrClasspathOrFileSystem(mapFile);
      BufferedReader br = new BufferedReader(new InputStreamReader(is));
      for(String line : ObjectBank.getLineIterator(br)) {
        line = line.trim();
        String[] els = line.split("\\s+");
        if(els.length != 2) 
          throw new RuntimeException("Wrong format: "+mapFile);
        map.put(els[0],els[1]);
      }
      br.close();
      is.close();
    } catch(IOException e){
      throw new RuntimeException(e);
    }
    return map;
  }


  @Override
  public Set<Requirement> requires() {
    return TOKENIZE_SSPLIT_POS_LEMMA;
  }


  @Override
  public Set<Requirement> requirementsSatisfied() {
    return Collections.singleton(TRUECASE_REQUIREMENT);
  }
}
// Source Code of edu.stanford.nlp.pipeline.TrueCaseAnnotator

// Related Classes of edu.stanford.nlp.pipeline.TrueCaseAnnotator