Package edu.stanford.nlp.pipeline

Source Code of edu.stanford.nlp.pipeline.MorphaAnnotator

package edu.stanford.nlp.pipeline;

import edu.stanford.nlp.ling.CoreAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.process.Morphology;
import edu.stanford.nlp.util.CoreMap;

import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Set;


/**
* This class will add the lemmas of all the words to the Annotation.
* It assumes that the Annotation already contains the tokenized words as
* a {@code List<CoreLabel>} for a list of sentences under the
* {@code SentencesAnnotation.class} key.
* The Annotator adds lemma information to each CoreLabel,
* in the LemmaAnnotation.class.
*
* @author Jenny Finkel
*/
public class MorphaAnnotator implements Annotator{

  private boolean VERBOSE = false;


  private static final String[] prep = {"abroad", "across", "after", "ahead", "along", "aside", "away", "around", "back", "down", "forward", "in", "off", "on", "over", "out", "round", "together", "through", "up"};
  private static final List<String> particles = Arrays.asList(prep);

  public MorphaAnnotator() {
    this(true);
  }

  public MorphaAnnotator(boolean verbose) {
    VERBOSE = verbose;
  }

  public void annotate(Annotation annotation) {
    if (VERBOSE) {
      System.err.print("Finding lemmas ...");
    }
    Morphology morphology = new Morphology();
    if (annotation.has(CoreAnnotations.SentencesAnnotation.class)) {
      for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
        List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
        //System.err.println("Lemmatizing sentence: " + tokens);
        for (CoreLabel token : tokens) {
          String text = token.get(CoreAnnotations.TextAnnotation.class);
          String posTag = token.get(CoreAnnotations.PartOfSpeechAnnotation.class);
          addLemma(morphology, CoreAnnotations.LemmaAnnotation.class, token, text, posTag);
        }
      }
    } else {
      throw new RuntimeException("Unable to find words/tokens in: " +
                                 annotation);
    }
  }


  private static void addLemma(Morphology morpha,
                        Class<? extends CoreAnnotation<String>> ann,
                        CoreMap map, String word, String tag) {
    if (tag.length() > 0) {
      String phrasalVerb = phrasalVerb(morpha, word, tag);
      if (phrasalVerb == null) {
        map.set(ann, morpha.lemma(word, tag));
      } else {
        map.set(ann, phrasalVerb);
      }
    } else {
      map.set(ann, morpha.stem(word));
    }
  }


  /** If a token is a phrasal verb with an underscore between a verb and a
   *  particle, return the phrasal verb lemmatized. If not, return null
   */
  private static String phrasalVerb(Morphology morpha, String word, String tag) {

    // must be a verb and contain an underscore
    assert(word != null);
    assert(tag != null);
    if(!tag.startsWith("VB"|| !word.contains("_")) return null;

    // check whether the last part is a particle
    String[] verb = word.split("_");
    if(verb.length != 2) return null;
    String particle = verb[1];
    if(particles.contains(particle)) {
      String base = verb[0];
      String lemma = morpha.lemma(base, tag);
      return lemma + '_' + particle;
    }

    return null;
  }


  @Override
  public Set<Requirement> requires() {
    return TOKENIZE_SSPLIT_POS;
  }

  @Override
  public Set<Requirement> requirementsSatisfied() {
    return Collections.singleton(LEMMA_REQUIREMENT);
  }
}
TOP

Related Classes of edu.stanford.nlp.pipeline.MorphaAnnotator

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.