package edu.stanford.nlp.ling.tokensregex;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.tokensregex.types.Value;
import edu.stanford.nlp.pipeline.ChunkAnnotationUtils;
import edu.stanford.nlp.pipeline.CoreMapAttributeAggregator;
import edu.stanford.nlp.util.Comparators;
import edu.stanford.nlp.util.CoreMap;
import java.util.function.Function;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.Interval;
import edu.stanford.nlp.util.IntervalTree;
import java.util.*;
/**
* Matched Expression represents a chunk of text that was matched from an original segment of text.
*
* @author Angel Chang
*/
public class MatchedExpression {
/** Text representing the matched expression */
protected String text;
/**
* Character offsets (relative to original text).
* TODO: Fix up
* If matched using regular text patterns,
* the character offsets are with respect to the annotation (usually sentence)
* from which the text was matched against
* If matched using tokens, the character offsets are with respect to the overall document
*/
protected Interval<Integer> charOffsets;
/** Token offsets (relative to original text tokenization) */
protected Interval<Integer> tokenOffsets;
/** Chunk offsets (relative to chunking on top of original text) */
protected Interval<Integer> chunkOffsets;
/** Annotation built for the matched chunk; assigned by the extractAnnotation methods */
protected CoreMap annotation;
// TODO: Should we keep some context from the source so we can perform more complex evaluation?
/** Function indicating how to extract a value from the annotation built from this expression */
protected SingleAnnotationExtractor extractFunc;
/** Value extracted from the annotation; assigned by SingleAnnotationExtractor.annotate */
public Value value;
//protected Map<String,String> attributes;
// Used to disambiguate matched expressions
/** Priority of this expression; EXPR_PRIORITY_COMPARATOR sorts higher priorities first */
double priority;
/** Weight of this expression; this is what EXPR_WEIGHT_SCORER returns */
double weight;
/** Position in the input list; assigned by removeNested and removeOverlapping */
int order;
/**
 * Function that takes a CoreMap and applies an extraction function to it to get a value.
 * Also holds the settings describing how the final annotation is constructed: which
 * annotation keys receive the result on the matched annotation and on its tokens.
 */
public static class SingleAnnotationExtractor implements Function<CoreMap,Value> {
  public String name;
  /** Priority/order in which this rule should be applied with respect to others. */
  public double priority;
  /** Weight given to the rule (how likely is this rule to fire). */
  public double weight;
  // public Class annotationField;  // Annotation field to apply rule over: text or tokens or numerizedtokens
  /** Annotation key under which the tokens are looked up (tokens or numerizedtokens). */
  public Class tokensAnnotationField = CoreAnnotations.TokensAnnotation.class;
  /** Annotation keys that are set on every token of the matched annotation. */
  public List<Class> tokensResultAnnotationField;
  /** Annotation keys that are set on the matched annotation itself. */
  public List<Class> resultAnnotationField;
  /** Annotation key under which child/nested annotations are stored. */
  public Class resultNestedAnnotationField;
  public boolean includeNested = false;
  /** Produces the value of a matched expression from its annotation. */
  public Function<CoreMap, Value> valueExtractor;
  /** Optional function producing the object(s) written to the result annotation keys. */
  public Function<MatchedExpression,?> resultAnnotationExtractor;
  public Map<Class, CoreMapAttributeAggregator> tokensAggregators;

  /**
   * Applies {@link #valueExtractor} to the given CoreMap.
   *
   * @param in CoreMap to extract the value from
   * @return whatever the value extractor returns
   */
  @Override
  public Value apply(CoreMap in) {
    return this.valueExtractor.apply(in);
  }

  /**
   * Stores {@code obj} on {@code cm} under the given keys.
   * When there is more than one key and {@code obj} is a list, the i-th element is stored
   * under the i-th key (up to the shorter of the two), with {@link Value} elements unwrapped.
   * Otherwise {@code obj} itself is stored, unchanged, under every key.
   *
   * @throws RuntimeException if a key that would be used is null
   */
  private static void setAnnotations(CoreMap cm, List<Class> annotationKeys, Object obj) {
    if (annotationKeys.size() <= 1 || !(obj instanceof List)) {
      // Single object (or single key): every key gets the same object
      for (Class annotationKey : annotationKeys) {
        if (annotationKey == null) {
          throw new RuntimeException("Invalid null annotation key");
        }
        cm.set(annotationKey, obj);
      }
      return;
    }
    // Several keys and a list of objects: pair them up positionally
    List values = (List) obj;
    int count = Math.min(values.size(), annotationKeys.size());
    for (int idx = 0; idx < count; idx++) {
      Object item = values.get(idx);
      Class annotationKey = annotationKeys.get(idx);
      if (annotationKey == null) {
        throw new RuntimeException("Invalid null annotation key");
      }
      if (item instanceof Value) {
        cm.set(annotationKey, ((Value) item).get());
      } else {
        cm.set(annotationKey, item);
      }
    }
  }

  /**
   * Attaches the nested annotations (when {@link #resultNestedAnnotationField} is set)
   * and then annotates the matched expression.
   *
   * @param matchedExpression expression whose annotation is updated
   * @param nested child annotations to store under the nested annotation key
   */
  public void annotate(MatchedExpression matchedExpression, List<? extends CoreMap> nested) {
    if (resultNestedAnnotationField != null) {
      matchedExpression.annotation.set(resultNestedAnnotationField, nested);
    }
    // NOTE: for now value must be extracted after nested annotation is in place...
    annotate(matchedExpression);
  }

  /**
   * Extracts the value of the matched expression from its annotation, then writes the
   * result to the configured keys on the annotation and on each of its tokens.
   *
   * @param matchedExpression expression to evaluate and annotate
   */
  public void annotate(MatchedExpression matchedExpression) {
    matchedExpression.value = valueExtractor.apply(matchedExpression.annotation);
    annotateSelf(matchedExpression);
    annotateTokens(matchedExpression);
  }

  /** Writes the result to the {@link #resultAnnotationField} keys of the matched annotation. */
  private void annotateSelf(MatchedExpression matchedExpression) {
    if (resultAnnotationField == null) {
      return;
    }
    if (resultAnnotationExtractor == null) {
      // TODO: Should default result be the matchedExpression, value, object???
      //matchedExpression.annotation.set(resultAnnotationField, matchedExpression);
      setAnnotations(matchedExpression.annotation, resultAnnotationField, unwrappedValue(matchedExpression));
    } else {
      Object extracted = resultAnnotationExtractor.apply(matchedExpression);
      setAnnotations(matchedExpression.annotation, resultAnnotationField, extracted);
    }
  }

  /** Writes the result to the {@link #tokensResultAnnotationField} keys of every token. */
  private void annotateTokens(MatchedExpression matchedExpression) {
    if (tokensResultAnnotationField == null) {
      return;
    }
    List<? extends CoreMap> tokens = (List<? extends CoreMap>) matchedExpression.annotation.get(tokensAnnotationField);
    if (resultAnnotationExtractor == null) {
      // TODO: Should default result be the matchedExpression, value, object???
      //matchedExpression.annotation.set(resultAnnotationField, matchedExpression);
      for (CoreMap token : tokens) {
        setAnnotations(token, tokensResultAnnotationField, unwrappedValue(matchedExpression));
      }
    } else {
      Object extracted = resultAnnotationExtractor.apply(matchedExpression);
      for (CoreMap token : tokens) {
        setAnnotations(token, tokensResultAnnotationField, extracted);
      }
    }
  }

  /** Returns the object wrapped by the expression's value, or null when it has no value. */
  private static Object unwrappedValue(MatchedExpression matchedExpression) {
    Value current = matchedExpression.getValue();
    return (current != null) ? current.get() : null;
  }

  /**
   * Creates a matched expression bound to this extractor, using this extractor's
   * priority and weight.
   *
   * @param charOffsets character offsets of the match
   * @param tokenOffsets token offsets of the match
   * @return the new matched expression
   */
  public MatchedExpression createMatchedExpression(Interval<Integer> charOffsets, Interval<Integer> tokenOffsets) {
    return new MatchedExpression(charOffsets, tokenOffsets, this, priority, weight);
  }
}
/**
 * Copy constructor. Creates a shallow copy of {@code me}: every field of the new
 * expression refers to the same object (annotation, extractor, value, intervals)
 * as the corresponding field of {@code me}.
 *
 * @param me expression to copy
 */
public MatchedExpression(MatchedExpression me) {
  this.annotation = me.annotation;
  this.extractFunc = me.extractFunc;
  this.text = me.text;
  this.value = me.value;
  //this.attributes = me.attributes;
  this.priority = me.priority;
  this.weight = me.weight;
  this.order = me.order;
  this.charOffsets = me.charOffsets;
  this.tokenOffsets = me.tokenOffsets;
  // Copy the chunk offsets themselves rather than the token offsets: the two can
  // differ once extractAnnotation has recomputed tokenOffsets from the annotation.
  this.chunkOffsets = me.chunkOffsets;
}
/**
 * Creates a matched expression for the given offsets.
 * The chunk offsets are initialized to the token offsets.
 *
 * @param charOffsets character offsets of the match
 * @param tokenOffsets token offsets of the match
 * @param extractFunc extractor used to compute the value and annotate the match
 * @param priority priority of this expression
 * @param weight weight of this expression
 */
public MatchedExpression(Interval<Integer> charOffsets, Interval<Integer> tokenOffsets,
                         SingleAnnotationExtractor extractFunc, double priority, double weight) {
  this.extractFunc = extractFunc;
  this.priority = priority;
  this.weight = weight;
  this.charOffsets = charOffsets;
  this.tokenOffsets = tokenOffsets;
  // Until an annotation is extracted, chunks are taken to coincide with tokens
  this.chunkOffsets = tokenOffsets;
}
/**
 * Extracts the annotation for this expression from the given source annotation,
 * aggregating token attributes with the extractor's {@code tokensAggregators}.
 *
 * @param env environment (not used by this method)
 * @param sourceAnnotation annotation the expression was matched against
 * @return result of the underlying extraction
 */
public boolean extractAnnotation(Env env, CoreMap sourceAnnotation) {
  final Map<Class, CoreMapAttributeAggregator> aggregators = extractFunc.tokensAggregators;
  return extractAnnotation(sourceAnnotation, aggregators);
}
/**
* Builds the annotation for this expression from a source annotation, updates the
* offsets and text of this expression from it, and then runs the extractor on it.
* The tokens are read from the source under the extractor's tokensAnnotationField.
*
* @param sourceAnnotation annotation the expression was matched against
* @param aggregators aggregators passed to ChunkAnnotationUtils.getMergedChunk
* @return always true
*/
private boolean extractAnnotation(CoreMap sourceAnnotation,
Map<Class, CoreMapAttributeAggregator> aggregators)
{
Class tokensAnnotationKey = extractFunc.tokensAnnotationField;
if (chunkOffsets != null) {
// Chunk offsets are known: merge the source tokens in that range into one annotation
annotation = ChunkAnnotationUtils.getMergedChunk((List<? extends CoreMap>) sourceAnnotation.get(tokensAnnotationKey),
chunkOffsets.getBegin(), chunkOffsets.getEnd(), aggregators );
if (sourceAnnotation.containsKey(CoreAnnotations.TextAnnotation.class)) {
// presumably fills in the chunk text from the source text - confirm in ChunkAnnotationUtils
ChunkAnnotationUtils.annotateChunkText(annotation, sourceAnnotation);
}
if (tokenOffsets != null) {
// Fall back to our own token offsets only where the merged chunk has none
if (annotation.get(CoreAnnotations.TokenBeginAnnotation.class) == null) {
annotation.set(CoreAnnotations.TokenBeginAnnotation.class, tokenOffsets.getBegin());
}
if (annotation.get(CoreAnnotations.TokenEndAnnotation.class) == null) {
annotation.set(CoreAnnotations.TokenEndAnnotation.class, tokenOffsets.getEnd());
}
}
// Re-derive character and token offsets from the annotation that was just built
// NOTE(review): charOffsets is created without Interval.INTERVAL_OPEN_END here, unlike
// in the List-based extractAnnotation overload - confirm this is intended
charOffsets = Interval.toInterval(annotation.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class), annotation.get(CoreAnnotations.CharacterOffsetEndAnnotation.class));
tokenOffsets = Interval.toInterval(annotation.get(CoreAnnotations.TokenBeginAnnotation.class),
annotation.get(CoreAnnotations.TokenEndAnnotation.class), Interval.INTERVAL_OPEN_END);
} else {
// No chunk offsets: derive them from the character offsets, shifted by the source's
// own begin offset (0 when the source has none)
Integer baseCharOffset = sourceAnnotation.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
if (baseCharOffset == null) {
baseCharOffset = 0;
}
chunkOffsets = ChunkAnnotationUtils.getChunkOffsetsUsingCharOffsets((List<? extends CoreMap>) sourceAnnotation.get(tokensAnnotationKey),
charOffsets.getBegin() + baseCharOffset, charOffsets.getEnd() + baseCharOffset);
// annotation2 is only used as the source of the tokens list copied below
CoreMap annotation2 = ChunkAnnotationUtils.getMergedChunk((List<? extends CoreMap>) sourceAnnotation.get(tokensAnnotationKey),
chunkOffsets.getBegin(), chunkOffsets.getEnd(), aggregators );
// The annotation itself is built from the unshifted character offsets
annotation = ChunkAnnotationUtils.getAnnotatedChunkUsingCharOffsets(sourceAnnotation, charOffsets.getBegin(), charOffsets.getEnd());
tokenOffsets = Interval.toInterval(annotation.get(CoreAnnotations.TokenBeginAnnotation.class),
annotation.get(CoreAnnotations.TokenEndAnnotation.class), Interval.INTERVAL_OPEN_END);
annotation.set(tokensAnnotationKey, annotation2.get(tokensAnnotationKey));
}
text = annotation.get(CoreAnnotations.TextAnnotation.class);
// The annotation's tokens are passed as the nested annotations of the match
extractFunc.annotate(this, (List<? extends CoreMap>) annotation.get(tokensAnnotationKey));
return true;
}
/**
 * Extracts the annotation for this expression from a list of CoreMaps, using the
 * default aggregators of {@link CoreMapAttributeAggregator} (the extractor's own
 * {@code tokensAggregators} are not consulted here).
 *
 * @param env environment (not used by this method)
 * @param source list of CoreMaps that the chunk offsets of this expression index into
 * @return result of the underlying extraction
 */
public boolean extractAnnotation(Env env, List<? extends CoreMap> source) {
  final Map<Class, CoreMapAttributeAggregator> defaultAggregators =
      CoreMapAttributeAggregator.getDefaultAggregators();
  return extractAnnotation(source, defaultAggregators);
}
/**
 * Merges the elements of {@code source} selected by the chunk offsets into a single
 * annotation, refreshes the character offsets, token offsets and text of this
 * expression from that annotation, and runs the extractor with the selected
 * elements as nested annotations.
 *
 * @param source list of CoreMaps that the chunk offsets index into
 * @param chunkAggregators aggregators passed to ChunkAnnotationUtils.getMergedChunk
 * @return always true
 */
protected boolean extractAnnotation(List<? extends CoreMap> source, Map<Class, CoreMapAttributeAggregator> chunkAggregators) {
  final Interval<Integer> span = this.chunkOffsets;
  this.annotation = ChunkAnnotationUtils.getMergedChunk(source, span.getBegin(), span.getEnd(), chunkAggregators);

  final CoreMap merged = this.annotation;
  this.charOffsets = Interval.toInterval(
      merged.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class),
      merged.get(CoreAnnotations.CharacterOffsetEndAnnotation.class),
      Interval.INTERVAL_OPEN_END);
  this.tokenOffsets = Interval.toInterval(
      merged.get(CoreAnnotations.TokenBeginAnnotation.class),
      merged.get(CoreAnnotations.TokenEndAnnotation.class),
      Interval.INTERVAL_OPEN_END);
  this.text = merged.get(CoreAnnotations.TextAnnotation.class);

  // The merged elements themselves become the nested annotations of the match
  List<? extends CoreMap> nested = source.subList(span.getBegin(), span.getEnd());
  this.extractFunc.annotate(this, nested);
  return true;
}
/** @return the character offsets of this expression */
public Interval<Integer> getCharOffsets() {
  return this.charOffsets;
}

/** @return the token offsets of this expression */
public Interval<Integer> getTokenOffsets() {
  return this.tokenOffsets;
}

/** @return the chunk offsets of this expression */
public Interval<Integer> getChunkOffsets() {
  return this.chunkOffsets;
}

/*  public Map<String, String> getAttributes() {
  return attributes;
}*/

/** @return the priority of this expression */
public double getPriority() {
  return this.priority;
}

/** @return the order assigned to this expression */
public int getOrder() {
  return this.order;
}

/** @return the includeNested flag of the extractor behind this expression */
public boolean isIncludeNested() {
  return this.extractFunc.includeNested;
}

/**
 * Sets the includeNested flag. The flag is stored on the extractor, so the change is
 * visible to every expression that uses the same extractor instance.
 *
 * @param includeNested new value of the flag
 */
public void setIncludeNested(boolean includeNested) {
  this.extractFunc.includeNested = includeNested;
}

/** @return the text of the matched expression */
public String getText() {
  return this.text;
}

/** @return the annotation built for this expression */
public CoreMap getAnnotation() {
  return this.annotation;
}

/** @return the value extracted for this expression */
public Value getValue() {
  return this.value;
}

/** @return the text of the matched expression */
@Override
public String toString() {
  return this.text;
}
/**
 * Builds a new list in which the elements covered by each matched expression are
 * replaced by that expression's annotation.
 * The expressions are sorted in place with EXPR_TOKEN_OFFSET_COMPARATOR and then applied
 * using their chunk offsets as indices into {@code list}; an expression that starts
 * before the end of the previously applied one is skipped.
 *
 * @param list original list of CoreMaps
 * @param matchedExprs matched expressions to merge in (sorted as a side effect); may be null
 * @return {@code list} itself when {@code matchedExprs} is null, otherwise the merged list
 */
public static List<? extends CoreMap> replaceMerged(List<? extends CoreMap> list,
                                                    List<? extends MatchedExpression> matchedExprs) {
  if (matchedExprs == null) {
    return list;
  }
  Collections.sort(matchedExprs, EXPR_TOKEN_OFFSET_COMPARATOR);
  List<CoreMap> result = new ArrayList<>(list.size());  // Approximate size
  int consumed = 0;  // index of the first element of list not yet copied or replaced
  for (MatchedExpression expr : matchedExprs) {
    int from = expr.chunkOffsets.first();
    int to = expr.chunkOffsets.second();
    if (from < consumed) {
      continue;  // overlaps an expression that was already applied
    }
    result.addAll(list.subList(consumed, from));
    result.add(expr.getAnnotation());
    consumed = to;
  }
  // Add rest of elements
  if (consumed < list.size()) {
    result.addAll(list.subList(consumed, list.size()));
  }
  return result;
}
/**
 * Builds a new list in which the elements covered by each matched expression are
 * replaced by that expression's annotation, locating the elements through token offsets.
 * Each list element is indexed by its token begin/end annotations when it has both,
 * and by its list position otherwise. Expressions (sorted in place with
 * EXPR_TOKEN_OFFSET_COMPARATOR) whose token offsets do not line up with element
 * boundaries, or that start before the end of the previously applied one, are skipped.
 *
 * @param list original list of CoreMaps
 * @param matchedExprs matched expressions to merge in (sorted as a side effect); may be null
 * @return {@code list} itself when {@code matchedExprs} is null, otherwise the merged list
 */
public static List<? extends CoreMap> replaceMergedUsingTokenOffsets(List<? extends CoreMap> list,
                                                                     List<? extends MatchedExpression> matchedExprs) {
  if (matchedExprs == null) {
    return list;
  }
  // token begin offset -> index of the element starting there
  Map<Integer, Integer> beginToIndex = new HashMap<>();//Generics.newHashMap();
  // token end offset -> index just past the element ending there
  Map<Integer, Integer> endToIndex = new HashMap<>();//Generics.newHashMap();
  for (int pos = 0; pos < list.size(); pos++) {
    CoreMap element = list.get(pos);
    if (!(element.has(CoreAnnotations.TokenBeginAnnotation.class) && element.has(CoreAnnotations.TokenEndAnnotation.class))) {
      beginToIndex.put(pos, pos);
      endToIndex.put(pos + 1, pos + 1);
    } else {
      beginToIndex.put(element.get(CoreAnnotations.TokenBeginAnnotation.class), pos);
      endToIndex.put(element.get(CoreAnnotations.TokenEndAnnotation.class), pos + 1);
    }
  }
  Collections.sort(matchedExprs, EXPR_TOKEN_OFFSET_COMPARATOR);
  List<CoreMap> result = new ArrayList<>(list.size());  // Approximate size
  int consumed = 0;  // index of the first element of list not yet copied or replaced
  for (MatchedExpression expr : matchedExprs) {
    int tokenStart = expr.tokenOffsets.first();
    int tokenEnd = expr.tokenOffsets.second();
    Integer from = beginToIndex.get(tokenStart);
    Integer to = endToIndex.get(tokenEnd);
    if (from == null || to == null || from < consumed) {
      continue;  // not aligned with element boundaries, or overlapping an applied expression
    }
    result.addAll(list.subList(consumed, from));
    result.add(expr.getAnnotation());
    consumed = to;
  }
  // Add rest of elements
  if (consumed < list.size()) {
    result.addAll(list.subList(consumed, list.size()));
  }
  return result;
}
/**
 * Returns a new list holding only the chunks that have a value which wraps a
 * non-null object; the input list is left untouched.
 *
 * @param chunks matched expressions to filter
 * @return new list with the chunks that carry a non-null value, in input order
 */
public static <T extends MatchedExpression> List<T> removeNullValues(List<T> chunks) {
  List<T> kept = new ArrayList<>(chunks.size());
  for (T candidate : chunks) {
    Value candidateValue = candidate.value;
    if (candidateValue != null && candidateValue.get() != null) {
      kept.add(candidate);
    }
  }
  return kept;
}
/**
 * Selects non-nested chunks via IntervalTree.getNonNested, using the token offsets
 * as intervals and EXPR_LENGTH_PRIORITY_COMPARATOR to rank the chunks.
 * Each chunk's order is first set to its position in the list. Lists with fewer
 * than two chunks are returned as is, without touching the order.
 *
 * @param chunks matched expressions to filter
 * @return the input list itself when it has at most one chunk, otherwise the selection
 */
public static <T extends MatchedExpression> List<T> removeNested(List<T> chunks) {
  if (chunks.size() <= 1) {
    return chunks;
  }
  int position = 0;
  for (T chunk : chunks) {
    chunk.order = position;
    position++;
  }
  return IntervalTree.getNonNested(chunks, EXPR_TO_TOKEN_OFFSETS_INTERVAL_FUNC, EXPR_LENGTH_PRIORITY_COMPARATOR);
}
/**
 * Selects non-overlapping chunks via IntervalTree.getNonOverlapping, using the token
 * offsets as intervals and EXPR_PRIORITY_LENGTH_COMPARATOR to rank the chunks.
 * Each chunk's order is first set to its position in the list. Lists with fewer
 * than two chunks are returned as is, without touching the order.
 *
 * @param chunks matched expressions to filter
 * @return the input list itself when it has at most one chunk, otherwise the selection
 */
public static <T extends MatchedExpression> List<T> removeOverlapping(List<T> chunks) {
  if (chunks.size() <= 1) {
    return chunks;
  }
  int position = 0;
  for (T chunk : chunks) {
    chunk.order = position;
    position++;
  }
  return IntervalTree.getNonOverlapping(chunks, EXPR_TO_TOKEN_OFFSETS_INTERVAL_FUNC, EXPR_PRIORITY_LENGTH_COMPARATOR);
}
/** Maps a CoreMap to the interval formed by its token begin and token end annotations. */
@SuppressWarnings("unused")
public static final Function<CoreMap, Interval<Integer>> COREMAP_TO_TOKEN_OFFSETS_INTERVAL_FUNC =
    in -> {
      Integer tokenBegin = in.get(CoreAnnotations.TokenBeginAnnotation.class);
      Integer tokenEnd = in.get(CoreAnnotations.TokenEndAnnotation.class);
      return Interval.toInterval(tokenBegin, tokenEnd);
    };
/** Maps a CoreMap to the interval formed by its character begin and end offset annotations. */
public static final Function<CoreMap, Interval<Integer>> COREMAP_TO_CHAR_OFFSETS_INTERVAL_FUNC =
    in -> {
      Integer charBegin = in.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
      Integer charEnd = in.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
      return Interval.toInterval(charBegin, charEnd);
    };
/** Maps a matched expression to its token offsets. */
public static final Function<MatchedExpression, Interval<Integer>> EXPR_TO_TOKEN_OFFSETS_INTERVAL_FUNC =
    in -> in.tokenOffsets;
/** Orders matched expressions by priority, higher priority first. */
public static final Comparator<MatchedExpression> EXPR_PRIORITY_COMPARATOR =
    (e1, e2) -> {
      final double p1 = e1.getPriority();
      final double p2 = e2.getPriority();
      if (p1 > p2) {
        return -1;
      }
      if (p1 == p2) {
        return 0;
      }
      return 1;
    };
/** Orders matched expressions by their order field, smaller order first. */
public static final Comparator<MatchedExpression> EXPR_ORDER_COMPARATOR =
    (e1, e2) -> {
      final int o1 = e1.getOrder();
      final int o2 = e2.getOrder();
      if (o1 < o2) {
        return -1;
      }
      if (o1 == o2) {
        return 0;
      }
      return 1;
    };
// Compares two matched expressions.
// Use to order matched expressions by:
//   whether it has value or not (has value first), then length in tokens (longest first)
// Returns -1 if e1 has value, but e2 doesn't (1 if e2 has value, but e1 doesn't)
// Otherwise, both e1 and e2 have a value or both have none:
//   Returns -1 if e1 is longer than e2, 1 if e2 is longer, 0 if they are the same length
public static final Comparator<MatchedExpression> EXPR_LENGTH_COMPARATOR =
    (e1, e2) -> {
      final boolean firstHasValue = e1.getValue() != null;
      final boolean secondHasValue = e2.getValue() != null;
      if (firstHasValue != secondHasValue) {
        return firstHasValue ? -1 : 1;
      }
      final int firstLength = e1.tokenOffsets.getEnd() - e1.tokenOffsets.getBegin();
      final int secondLength = e2.tokenOffsets.getEnd() - e2.tokenOffsets.getBegin();
      if (firstLength > secondLength) {
        return -1;
      }
      if (firstLength == secondLength) {
        return 0;
      }
      return 1;
    };
/** Orders matched expressions by comparing their token offset intervals. */
public static final Comparator<MatchedExpression> EXPR_TOKEN_OFFSET_COMPARATOR =
    (e1, e2) -> e1.tokenOffsets.compareTo(e2.tokenOffsets);
/**
 * Orders matched expressions so that a nested expression comes before the one it is
 * nested in: returns 1 when the relation of e1's token offsets to e2's is CONTAIN,
 * -1 when it is INSIDE, and otherwise falls back to comparing the token offsets.
 */
public static final Comparator<MatchedExpression> EXPR_TOKEN_OFFSETS_NESTED_FIRST_COMPARATOR =
    (e1, e2) -> {
      final Interval<Integer> first = e1.tokenOffsets;
      final Interval<Integer> second = e2.tokenOffsets;
      final Interval.RelType relation = first.getRelation(second);
      if (relation.equals(Interval.RelType.CONTAIN)) {
        return 1;
      }
      if (relation.equals(Interval.RelType.INSIDE)) {
        return -1;
      }
      return first.compareTo(second);
    };
// Compares two matched expressions.
// Use to order matched expressions by chaining, in this order:
// priority (EXPR_PRIORITY_COMPARATOR: higher priority first),
// value and length (EXPR_LENGTH_COMPARATOR: has value first, then longest first),
// original order (EXPR_ORDER_COMPARATOR: smaller order first),
// and then token offsets (EXPR_TOKEN_OFFSET_COMPARATOR)
// NOTE(review): assumes Comparators.chain consults the comparators left to right and
// moves on to the next one only on a tie - confirm
// Used by removeOverlapping to rank the chunks.
public static final Comparator<MatchedExpression> EXPR_PRIORITY_LENGTH_COMPARATOR =
Comparators.chain(EXPR_PRIORITY_COMPARATOR, EXPR_LENGTH_COMPARATOR,
EXPR_ORDER_COMPARATOR, EXPR_TOKEN_OFFSET_COMPARATOR);
// Compares two matched expressions.
// Same chain as EXPR_PRIORITY_LENGTH_COMPARATOR, but with the first two comparators swapped:
// value and length (EXPR_LENGTH_COMPARATOR), priority (EXPR_PRIORITY_COMPARATOR),
// original order (EXPR_ORDER_COMPARATOR), and then token offsets (EXPR_TOKEN_OFFSET_COMPARATOR)
// Used by removeNested to rank the chunks.
public static final Comparator<MatchedExpression> EXPR_LENGTH_PRIORITY_COMPARATOR =
Comparators.chain(EXPR_LENGTH_COMPARATOR, EXPR_PRIORITY_COMPARATOR,
EXPR_ORDER_COMPARATOR, EXPR_TOKEN_OFFSET_COMPARATOR);
/** Scores a matched expression by its weight. */
public static final Function<MatchedExpression, Double> EXPR_WEIGHT_SCORER =
    in -> in.weight;
}