Package edu.stanford.nlp.pipeline

Source Code of edu.stanford.nlp.pipeline.MentionsAnnotator

package edu.stanford.nlp.pipeline;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.time.TimeAnnotations;
import edu.stanford.nlp.time.Timex;
import edu.stanford.nlp.util.ArraySet;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.Pair;

import java.util.Collections;
import java.util.List;
import java.util.function.Function;
import java.util.Properties;
import java.util.Set;

/**
* Annotator that marks entity mentions in a document.
* Entity mentions are
* - Named entities (identified by NER)
* - Quantifiable entities
*   - Times (identified by TimeAnnotator)
*   - Measurements (identified by ???)
*
* Each sentence is annotated with a list of the mentions
*  (MentionsAnnotation as a list of CoreMap)
*
* @author Angel Chang
*/
public class MentionsAnnotator implements Annotator {
  // Currently relies on NER annotations being okay
  // - Replace with calling NER classifiers and timeAnnotator directly
  LabeledChunkIdentifier chunkIdentifier;

  public MentionsAnnotator() {
    chunkIdentifier = new LabeledChunkIdentifier();
  }

  // note: used in annotate.properties
  @SuppressWarnings("UnusedDeclaration")
  public MentionsAnnotator(String name, Properties props) {
    this();
  }

  private static boolean checkStrings(String s1, String s2) {
    if (s1 == null || s2 == null) {
      return s1 == s2;
    } else {
      return s1.equals(s2);
    }
  }

  private static boolean checkNumbers(Number n1, Number n2) {
    if (n1 == null || n2 == null) {
      return n1 == n2;
    } else {
      return n1.equals(n2);
    }
  }

  private static Function<Pair<CoreLabel,CoreLabel>, Boolean> IS_TOKENS_COMPATIBLE = new Function<Pair<CoreLabel, CoreLabel>, Boolean>() {
    @Override
    public Boolean apply(Pair<CoreLabel, CoreLabel> in) {
      // First argument is the current token
      CoreLabel cur = in.first;
      // Second argument the previous token
      CoreLabel prev = in.second;
      if (cur == null || prev == null) {
        return false;
      }

      // Get NormalizedNamedEntityTag and say two entities are incompatible if they are different
      String v1 = cur.get(CoreAnnotations.NormalizedNamedEntityTagAnnotation.class);
      String v2 = prev.get(CoreAnnotations.NormalizedNamedEntityTagAnnotation.class);
      boolean compatible = checkStrings(v1,v2);
      if (!compatible) return compatible;

      // This duplicates logic in the QuantifiableEntityNormalizer (but maybe we will get rid of that class)
      String nerTag = cur.get(CoreAnnotations.NamedEntityTagAnnotation.class);
      if ("NUMBER".equals(nerTag) || "ORDINAL".equals(nerTag)) {
        // Get NumericCompositeValueAnnotation and say two entities are incompatible if they are different
        Number n1 = cur.get(CoreAnnotations.NumericCompositeValueAnnotation.class);
        Number n2 = prev.get(CoreAnnotations.NumericCompositeValueAnnotation.class);
        compatible = checkNumbers(n1,n2);
        if (!compatible) return compatible;
      }

      // Check timex...
      if ("TIME".equals(nerTag) || "SET".equals(nerTag) || "DATE".equals(nerTag) || "DURATION".equals(nerTag)) {
        Timex timex1 = cur.get(TimeAnnotations.TimexAnnotation.class);
        Timex timex2 = prev.get(TimeAnnotations.TimexAnnotation.class);
        String tid1 = (timex1 != null)? timex1.tid():null;
        String tid2 = (timex2 != null)? timex2.tid():null;
        compatible = checkStrings(tid1,tid2);
        if (!compatible) return compatible;
      }

      return compatible;
    }
  };

  @Override
  public void annotate(Annotation annotation) {

    List<CoreLabel> tokens = annotation.get(CoreAnnotations.TokensAnnotation.class);
    Integer annoTokenBegin = annotation.get(CoreAnnotations.TokenBeginAnnotation.class);
    if (annoTokenBegin == null) { annoTokenBegin = 0; }
    List<CoreMap> chunks = chunkIdentifier.getAnnotatedChunks(tokens, annoTokenBegin,
            CoreAnnotations.TextAnnotation.class, CoreAnnotations.NamedEntityTagAnnotation.class, IS_TOKENS_COMPATIBLE);
    annotation.set(CoreAnnotations.MentionsAnnotation.class, chunks);

    // By now entity mentions have been annotated and TextAnnotation and NamedEntityAnnotation marked
    // Some additional annotations
    List<CoreMap> mentions = annotation.get(CoreAnnotations.MentionsAnnotation.class);
    if (mentions != null) {
      for (CoreMap mention: mentions) {
        List<CoreLabel> mentionTokens = mention.get(CoreAnnotations.TokensAnnotation.class);
        String name = (String) CoreMapAttributeAggregator.FIRST_NON_NIL.aggregate(
                CoreAnnotations.NormalizedNamedEntityTagAnnotation.class, mentionTokens);
        if (name == null) {
          name = mention.get(CoreAnnotations.TextAnnotation.class);
        } else {
          mention.set(CoreAnnotations.NormalizedNamedEntityTagAnnotation.class, name);
        }
        //mention.set(CoreAnnotations.EntityNameAnnotation.class, name);
        String type = mention.get(CoreAnnotations.NamedEntityTagAnnotation.class);
        mention.set(CoreAnnotations.EntityTypeAnnotation.class, type);

        // Take first non nil as timex for the mention
        Timex timex = (Timex) CoreMapAttributeAggregator.FIRST_NON_NIL.aggregate(
            TimeAnnotations.TimexAnnotation.class, mentionTokens);
        if (timex != null) {
          mention.set(TimeAnnotations.TimexAnnotation.class, timex);
        }
      }
    }
  }


  @Override
  public Set<Requirement> requires() {
    return new ArraySet<Requirement>(TOKENIZE_REQUIREMENT, NER_REQUIREMENT);
  }

  @Override
  public Set<Requirement> requirementsSatisfied() {
    // TODO: figure out what this produces
    return Collections.emptySet();
  }
}
TOP

Related Classes of edu.stanford.nlp.pipeline.MentionsAnnotator

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.