Source Code of edu.stanford.nlp.sequences.TrueCasingForNISTDocumentReaderAndWriter$LineToTrueCasesParser

package edu.stanford.nlp.sequences;


import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.util.StringUtils;
import java.util.function.Function;
import edu.stanford.nlp.objectbank.LineIterator;
import edu.stanford.nlp.objectbank.IteratorFromReaderFactory;


import java.io.PrintWriter;
import java.io.Reader;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;


/**
 * adapt from Jenny's TrueCasingDocumentReaderAndWriter.java
 * @author Pi-Chuan Chang
 */
public class TrueCasingForNISTDocumentReaderAndWriter implements DocumentReaderAndWriter<CoreLabel> {


  public static final String THREE_CLASSES_PROPERTY = "3class";
  public static final boolean THREE_CLASSES = Boolean.parseBoolean(System.getProperty(THREE_CLASSES_PROPERTY, "false"));


  /**
   * 
   */
  private static final long serialVersionUID = -3000389291781534479L;
  private IteratorFromReaderFactory<List<CoreLabel>> factory;
  private Boolean verboseForTrueCasing = false;
  private static Pattern alphabet = Pattern.compile("[A-Za-z]+");


  /**
   * for test only
   **/
  public static void main(String[] args) throws IOException{
    Reader reader = new BufferedReader(new FileReader(args[0]));
    TrueCasingForNISTDocumentReaderAndWriter raw = new TrueCasingForNISTDocumentReaderAndWriter();
    raw.init(null);
    Iterator<List<CoreLabel>> it = raw.getIterator(reader);
    while(it.hasNext()) {
      List<CoreLabel> l = it.next();
      for (CoreLabel cl : l) {
        System.out.println(cl);
      }
      System.out.println("========================================");
    }
  }


  public void init(SeqClassifierFlags flags) {
    verboseForTrueCasing = flags.verboseForTrueCasing;
    factory = LineIterator.getFactory(new LineToTrueCasesParser()); // todo
  }


  public static Set knownWords = null;


  public static boolean known(String s) {
    return knownWords.contains(s.toLowerCase());
  }


  public Iterator<List<CoreLabel>> getIterator(Reader r) {
    return factory.getIterator(r);
  }


  public void printAnswers(List<CoreLabel> doc, PrintWriter out) {
    List<String> sentence = new ArrayList<String>();
    int wrong = 0;
    
    for (CoreLabel wi : doc) {
      StringBuilder sb = new StringBuilder();
      if (! wi.get(CoreAnnotations.AnswerAnnotation.class).equals(wi.get(CoreAnnotations.GoldAnswerAnnotation.class))) {
        wrong++;
      }
      if (!THREE_CLASSES && wi.get(CoreAnnotations.AnswerAnnotation.class).equals("UPPER")) {
        sb.append(wi.word().toUpperCase());
      } else if (wi.get(CoreAnnotations.AnswerAnnotation.class).equals("LOWER")) {
        sb.append(wi.word().toLowerCase());
      } else if (wi.get(CoreAnnotations.AnswerAnnotation.class).equals("INIT_UPPER")) {
        sb.append(wi.word().substring(0,1).toUpperCase())
          .append(wi.word().substring(1));
      } else if (wi.get(CoreAnnotations.AnswerAnnotation.class).equals("O")) {
        // in this case, if it cotains a-z at all, then append "MIX" at the end
        sb.append(wi.word());
        Matcher alphaMatcher = alphabet.matcher(wi.word());
        if (alphaMatcher.matches()) {
          sb.append("/MIX");
        }
      }


      if (verboseForTrueCasing) {
        sb.append("/GOLD-")
          .append(wi.get(CoreAnnotations.GoldAnswerAnnotation.class))
          .append("/GUESS-")
          .append(wi.get(CoreAnnotations.AnswerAnnotation.class));
      }
      sentence.add(sb.toString());
    }
    out.print(StringUtils.join(sentence, " "));
    System.err.printf("> wrong = %d ; total = %d\n", wrong, doc.size());
    out.println();
  }


  public static class LineToTrueCasesParser implements Function<String,List<CoreLabel>> {
    private static Pattern allLower = Pattern.compile("[^A-Z]*?[a-z]+[^A-Z]*?");
    private static Pattern allUpper = Pattern.compile("[^a-z]*?[A-Z]+[^a-z]*?");
    private static Pattern startUpper = Pattern.compile("[A-Z].*");
    
    public List<CoreLabel> apply(String line) {
      List<CoreLabel> doc = new ArrayList<CoreLabel>();
      int pos = 0;
      
      //line = line.replaceAll(" +"," ");
      //System.err.println("pichuan: processing line = "+line);
      
      String[] toks = line.split(" ");
      for (String word : toks) {
        CoreLabel wi = new CoreLabel();
        Matcher lowerMatcher = allLower.matcher(word);
        
        if (lowerMatcher.matches()) {
          wi.set(CoreAnnotations.AnswerAnnotation.class, "LOWER");
          wi.set(CoreAnnotations.GoldAnswerAnnotation.class, "LOWER");
        } else {
          Matcher upperMatcher = allUpper.matcher(word);
          if (!THREE_CLASSES && upperMatcher.matches()) {
            wi.set(CoreAnnotations.AnswerAnnotation.class, "UPPER");
            wi.set(CoreAnnotations.GoldAnswerAnnotation.class, "UPPER");
          } else {
            Matcher startUpperMatcher = startUpper.matcher(word);


            boolean isINIT_UPPER; // = false;
            if (word.length() > 1) {
              String w2 = word.substring(1);
              String lcw2 = w2.toLowerCase();
              isINIT_UPPER = w2.equals(lcw2);
            } else {
              isINIT_UPPER = false;
            }


            if (startUpperMatcher.matches() && isINIT_UPPER) {
              wi.set(CoreAnnotations.AnswerAnnotation.class, "INIT_UPPER");
              wi.set(CoreAnnotations.GoldAnswerAnnotation.class, "INIT_UPPER");
            } else {
              wi.set(CoreAnnotations.AnswerAnnotation.class, "O");
              wi.set(CoreAnnotations.GoldAnswerAnnotation.class, "O");
            }
          }
        }
        
        wi.setWord(word.toLowerCase());
        wi.set(CoreAnnotations.PositionAnnotation.class, pos + "");
        doc.add(wi);
        pos++;
      }
      return doc;
    }
  }
}
Source Code of edu.stanford.nlp.sequences.TrueCasingForNISTDocumentReaderAndWriter$LineToTrueCasesParser

Related Classes of edu.stanford.nlp.sequences.TrueCasingForNISTDocumentReaderAndWriter$LineToTrueCasesParser