public static <E extends CoreMap> void fixupNerBeforeNormalization(List<E> list)
{
// Goes through tokens and tries to fix up NER annotations
String prevNerTag = BACKGROUND_SYMBOL;
String prevNumericType = null;
Timex prevTimex = null;
for (int i = 0, sz = list.size(); i < sz; i++) {
E wi = list.get(i);
Timex timex = wi.get(TimeAnnotations.TimexAnnotation.class);
String numericType = wi.get(CoreAnnotations.NumericCompositeTypeAnnotation.class);
String curWord = (wi.get(CoreAnnotations.TextAnnotation.class) != null ? wi.get(CoreAnnotations.TextAnnotation.class) : "");
String currNerTag = wi.get(CoreAnnotations.NamedEntityTagAnnotation.class);
if (DEBUG) { System.err.println("fixupNerBeforeNormalization: wi is " + wi); }
// Attempts repairs to NER tags only if not marked by SUTime already
if (timex == null && numericType == null) {
// repairs commas in between dates... String constant first in equals() in case key has null value....
if ((i+1) < sz && ",".equals(wi.get(CoreAnnotations.TextAnnotation.class)) && "DATE".equals(prevNerTag)) {
if (prevTimex == null && prevNumericType == null) {
E nextToken = list.get(i+1);
String nextNER = nextToken.get(CoreAnnotations.NamedEntityTagAnnotation.class);
if (nextNER != null && nextNER.equals("DATE")) {
wi.set(CoreAnnotations.NamedEntityTagAnnotation.class, "DATE");
}
}
}
//repairs mistagged multipliers after a numeric quantity
if (!curWord.equals("") && (moneyMultipliers.containsKey(curWord) ||
(getOneSubstitutionMatch(curWord, moneyMultipliers.keySet()) != null)) &&
prevNerTag != null && (prevNerTag.equals("MONEY") || prevNerTag.equals("NUMBER"))) {
wi.set(CoreAnnotations.NamedEntityTagAnnotation.class, prevNerTag);
}
//repairs four digit ranges (2002-2004) that have not been tagged as years - maybe bad? (empirically useful)
if (curWord.contains("-")) {
String[] sides = curWord.split("-");
if (sides.length == 2) {
try {
int first = Integer.parseInt(sides[0]);
int second = Integer.parseInt(sides[1]);
//they're both integers, see if they're both between 1000-3000 (likely years)
if (1000 <= first && first <= 3000 && 1000 <= second && second <= 3000) {
wi.set(CoreAnnotations.NamedEntityTagAnnotation.class, "DATE");
String dateStr = new ISODateInstance(new ISODateInstance(sides[0]), new ISODateInstance(sides[1])).getDateString();
if (DEBUG) {
System.err.println("#5: Changing normalized NER from " +
wi.get(CoreAnnotations.NormalizedNamedEntityTagAnnotation.class) + " to " + dateStr + " at index " + i);
}
wi.set(CoreAnnotations.NormalizedNamedEntityTagAnnotation.class, dateStr);
continue;
}
} catch (Exception e) {
// they weren't numbers.
}
}
}
// Marks time units as DURATION if they are preceded by a NUMBER tag. e.g. "two years" or "5 minutes"
if ( timeUnitWords.contains(curWord) &&
(currNerTag == null || !"DURATION".equals(currNerTag) ) &&
("NUMBER".equals(prevNerTag))) {
wi.set(CoreAnnotations.NamedEntityTagAnnotation.class, "DURATION");
for (int j = i-1; j > 0; j--) {
E prev = list.get(j);
if ("NUMBER".equals(prev.get(CoreAnnotations.NamedEntityTagAnnotation.class))) {
prev.set(CoreAnnotations.NamedEntityTagAnnotation.class, "DURATION");
}
}
}
} else {
// Fixup SUTime marking of twenty-second
if ("DURATION".equals(currNerTag) && ordinalsToValues.containsKey(curWord)
&& curWord.endsWith("second") && timex.text().equals(curWord)) {
wi.set(CoreAnnotations.NamedEntityTagAnnotation.class, "ORDINAL");
}
}
prevNerTag = currNerTag;