package edu.stanford.nlp.ie.regexp;
import edu.stanford.nlp.ie.AbstractSequenceClassifier;
import edu.stanford.nlp.ling.CoreAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.sequences.DocumentReaderAndWriter;
import edu.stanford.nlp.sequences.PlainTextDocumentReaderAndWriter;
import edu.stanford.nlp.time.TimeAnnotations;
import edu.stanford.nlp.time.TimeExpressionExtractor;
import edu.stanford.nlp.time.TimeExpressionExtractorFactory;
import edu.stanford.nlp.time.Timex;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.PaddedList;
import edu.stanford.nlp.util.StringUtils;
import java.io.ObjectInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Properties;
import java.util.regex.Pattern;
/**
* A set of deterministic rules for marking certain entities, to add
* categories and to correct for failures of statistical NER taggers.
* This is an extremely simple and ungeneralized implementation of
* AbstractSequenceClassifier that was written for PASCAL RTE.
* It could profitably be extended and generalized.
* It marks a NUMBER category based on part-of-speech tags in a
* deterministic manner.
* It marks an ORDINAL category based on word form in a deterministic manner.
* It tags as MONEY currency signs and things tagged CD after a currency sign.
* It marks a number before a month name as a DATE.
* It marks as a DATE a word of the form xx/xx/xxxx
* (where x is a digit from a suitable range).
* It marks as a TIME a word of the form x(x):xx (where x is a digit).
* It marks everything else tagged "CD" as a NUMBER, and instances
* of "and" appearing between CD tags in contexts suggestive of a number.
* It requires text to be POS-tagged (the rules read each token's part of speech
* through tag() and PartOfSpeechAnnotation).
* Effectively these rules assume that
* this classifier will be used as a secondary classifier by
* code such as ClassifierCombiner: it will mark most CD as NUMBER, and it
* is assumed that something else with higher priority is marking ones that
* are PERCENT, ADDRESS, etc.
*
* @author Christopher Manning
* @author Mihai (integrated with NumberNormalizer, SUTime)
*/
public class NumberSequenceClassifier extends AbstractSequenceClassifier<CoreLabel> {
/** Compile-time switch for the diagnostic traces this class prints; off by default. */
private static final boolean DEBUG = false;
/** When true, classification is routed to classifyWithSUTime; otherwise classifyOld is used. */
private final boolean useSUTime;
/** Default value for {@link #useSUTime}, taken from TimeExpressionExtractorFactory.DEFAULT_EXTRACTOR_PRESENT. */
public static final boolean USE_SUTIME_DEFAULT = TimeExpressionExtractorFactory.DEFAULT_EXTRACTOR_PRESENT;
/** Property key "ner.useSUTime"; it is not read in this class (presumably consumed by callers -- verify). */
public static final String USE_SUTIME_PROPERTY = "ner.useSUTime";
/** Time expression extractor; created by the constructor when useSUTime is true, null otherwise. */
private final TimeExpressionExtractor timexExtractor;
public NumberSequenceClassifier() {
this(new Properties(), USE_SUTIME_DEFAULT, new Properties());
if (! CURRENCY_WORD_PATTERN.matcher("pounds").matches()) {
System.err.println("NumberSequence: Currency pattern broken");
}
}
/**
 * Creates a classifier with empty classifier properties and empty SUTime properties.
 *
 * @param useSUTime Whether dates/times are recognized with SUTime (true) or
 *                  with the older built-in rules (false)
 */
public NumberSequenceClassifier(boolean useSUTime) {
  this(new Properties(), useSUTime, new Properties());
}
/**
 * Creates a classifier, building a "sutime" time expression extractor only when requested.
 *
 * @param props Properties handed to the superclass constructor
 * @param useSUTime Whether to classify with SUTime; when false no extractor is created
 * @param sutimeProps Properties handed to the extractor factory (only used when useSUTime is true)
 */
public NumberSequenceClassifier(Properties props,
                                boolean useSUTime, Properties sutimeProps) {
  super(props);
  this.useSUTime = useSUTime;
  this.timexExtractor = useSUTime
      ? TimeExpressionExtractorFactory.createExtractor("sutime", sutimeProps)
      : null;
}
/**
 * Classifies a list of tokens without any document- or sentence-level context.
 * This simply delegates to
 * {@link #classifyWithGlobalInformation(List, CoreMap, CoreMap)} with a null
 * document and a null sentence.
 *
 * @param document The tokens to label
 * @return The list produced by the delegate, with the answer annotation set
 *         on its elements
 */
@Override
public List<CoreLabel> classify(List<CoreLabel> document) {
  final CoreMap noDocument = null;
  final CoreMap noSentence = null;
  return classifyWithGlobalInformation(document, noDocument, noSentence);
}
/**
 * Classifies the tokens with either the SUTime-based pipeline or the older
 * rule set, depending on how this classifier was constructed.
 *
 * @param tokens The tokens to label
 * @param document Document-level annotations; only used by the SUTime path, may be null
 * @param sentence The sentence the tokens belong to; only used by the SUTime path, may be null
 * @return The labelled tokens
 */
@Override
public List<CoreLabel> classifyWithGlobalInformation(List<CoreLabel> tokens, final CoreMap document, final CoreMap sentence) {
  return useSUTime
      ? classifyWithSUTime(tokens, document, sentence)
      : classifyOld(tokens);
}
/**
 * Lets the time expression extractor finalize the given document.
 * Does nothing when this classifier was built without SUTime.
 *
 * @param document The document passed on to the extractor's finalize call
 */
public void finalizeClassification(final CoreMap document) {
  if (!useSUTime) {
    return;
  }
  timexExtractor.finalize(document);
}
// todo [cdm, 2013]: Where does this call NumberNormalizer? Is it the call buried in SUTime's TimeExpressionExtractorImpl?
/**
 * Modular classification using NumberNormalizer for numbers, SUTime for date/time.
 * Note: this is slower than classifyOld because it runs multiple passes
 * over the tokens (one for numbers and dates, and others for money and ordinals).
 * However, the slowdown is not substantial since the passes are fast.
 * <p>
 * Passes, in order: default every unlabelled token to the background symbol;
 * label the spans of the time expressions returned by SUTime; label the
 * numerized spans that carry a composite type; label remaining CD-tagged
 * tokens as NUMBER; then run the money/percent and ordinal recognizers.
 * A token is only relabelled by the first three labelling passes while it
 * still carries the background symbol.
 *
 * @param tokenSequence The tokens to label; they are modified in place
 * @param document Document-level annotations handed to SUTime; may be null
 * @param sentence The sentence the tokens come from; when null, a sentence is
 *                 rebuilt from the tokens themselves
 * @return tokenSequence, with answer annotations set
 */
private List<CoreLabel> classifyWithSUTime(List<CoreLabel> tokenSequence, final CoreMap document, final CoreMap sentence) {
  final String background = flags.backgroundSymbol;

  //
  // set everything to "O" by default (answers that are already present are kept)
  //
  for (CoreLabel token : tokenSequence) {
    if (token.get(CoreAnnotations.AnswerAnnotation.class) == null) {
      token.set(CoreAnnotations.AnswerAnnotation.class, background);
    }
  }

  //
  // run SUTime
  // note: SUTime requires TextAnnotation to be set at document/sent level and
  // that the Character*Offset annotations be aligned with the token words.
  //
  CoreMap timeSentence = (sentence != null ?
      alignSentence(sentence) :
      buildSentenceFromTokens(tokenSequence));
  List<CoreMap> timeExpressions = runSUTime(timeSentence, document);
  List<CoreMap> numbers = timeSentence.get(CoreAnnotations.NumerizedTokensAnnotation.class);

  //
  // Spans are reported relative to the sentence's TokenBeginAnnotation, while
  // tokenSequence is indexed from 0. The shift is the same for every span, so
  // it is computed once here instead of once per expression.
  //
  int offset = 0;
  if (sentence != null) {
    Integer sentenceTokenBegin = sentence.get(CoreAnnotations.TokenBeginAnnotation.class);
    if (sentenceTokenBegin != null) {
      offset = sentenceTokenBegin;
    }
  }

  //
  // store DATE and TIME
  //
  if (timeExpressions != null) {
    for (CoreMap timeExpression : timeExpressions) {
      // todo [cdm 2013]: We should also store these in the Sentence, but we've just got the list of tokens here
      int start = timeExpression.get(CoreAnnotations.TokenBeginAnnotation.class);
      int end = timeExpression.get(CoreAnnotations.TokenEndAnnotation.class);
      Timex timex = timeExpression.get(TimeAnnotations.TimexAnnotation.class);
      if (timex == null) {
        continue;
      }
      if (DEBUG) {
        System.err.println("FOUND DATE/TIME \"" + timeExpression +
            "\" with offsets " + start + " " + end +
            " and value " + timex);
        System.err.println("The above CoreMap has the following fields:");
      }
      String label = timex.timexType();
      for (int i = start; i < end; i++) {
        CoreLabel token = tokenSequence.get(i - offset);
        if (token.get(CoreAnnotations.AnswerAnnotation.class).equals(background)) {
          token.set(CoreAnnotations.AnswerAnnotation.class, label);
          token.set(TimeAnnotations.TimexAnnotation.class, timex);
        }
      }
    }
  }

  //
  // store the numbers found by SUTime as NUMBER if they are not part of anything else
  //
  if (numbers != null) {
    for (CoreMap number : numbers) {
      if (!number.containsKey(CoreAnnotations.NumericCompositeValueAnnotation.class)) {
        continue;
      }
      int start = number.get(CoreAnnotations.TokenBeginAnnotation.class);
      int end = number.get(CoreAnnotations.TokenEndAnnotation.class);
      String type = number.get(CoreAnnotations.NumericCompositeTypeAnnotation.class);
      Number value = number.get(CoreAnnotations.NumericCompositeValueAnnotation.class);
      if (type == null) {
        continue;
      }
      if (DEBUG) System.err.println("FOUND NUMBER \"" + number + "\" with offsets " + start + " " + end + " and value " + value + " and type " + type);
      for (int i = start; i < end; i++) {
        CoreLabel token = tokenSequence.get(i - offset);
        if (token.get(CoreAnnotations.AnswerAnnotation.class).equals(background)) {
          token.set(CoreAnnotations.AnswerAnnotation.class, type);
          if (value != null) {
            token.set(CoreAnnotations.NumericCompositeValueAnnotation.class, value);
          }
        }
      }
    }
  }

  // everything tagged as CD is also a number
  // NumberNormalizer probably catches these but let's be safe
  for (CoreLabel token : tokenSequence) {
    // constant-first comparison: a token without a POS tag is simply not a CD
    if ("CD".equals(token.tag()) &&
        token.get(CoreAnnotations.AnswerAnnotation.class).equals(background)) {
      token.set(CoreAnnotations.AnswerAnnotation.class, "NUMBER");
    }
  }

  // extract money and percents
  moneyAndPercentRecognizer(tokenSequence);

  // ordinals
  // NumberNormalizer probably catches these but let's be safe
  ordinalRecognizer(tokenSequence);

  return tokenSequence;
}
/**
 * Returns a sentence that SUTime can run on.
 * If the sentence already has a TextAnnotation it is returned unchanged.
 * Otherwise a new sentence is built from the sentence's tokens and character
 * offsets, and the token begin/end annotations are copied over from the
 * original.
 *
 * @param sentence The sentence to align
 * @return The original sentence, or a rebuilt sentence when the text was missing
 */
public static CoreMap alignSentence(CoreMap sentence) {
  if (sentence.get(CoreAnnotations.TextAnnotation.class) != null) {
    // original text is preserved; no need to align anything
    return sentence;
  }

  final List<CoreLabel> sentenceTokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
  final Integer charBegin = sentence.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
  final Integer charEnd = sentence.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
  final CoreMap rebuilt = buildSentenceFromTokens(sentenceTokens, charBegin, charEnd);

  // the rebuilt sentence starts with default token offsets; restore the original ones
  rebuilt.set(CoreAnnotations.TokenBeginAnnotation.class,
      sentence.get(CoreAnnotations.TokenBeginAnnotation.class));
  rebuilt.set(CoreAnnotations.TokenEndAnnotation.class,
      sentence.get(CoreAnnotations.TokenEndAnnotation.class));
  return rebuilt;
}
/**
 * Builds a sentence from tokens alone, with no sentence-level character
 * offsets supplied by the caller.
 *
 * @param tokens The tokens of the sentence
 * @return The sentence built by the three-argument overload
 */
private static CoreMap buildSentenceFromTokens(List<CoreLabel> tokens) {
  final Integer noCharacterOffsetStart = null;
  final Integer noCharacterOffsetEnd = null;
  return buildSentenceFromTokens(tokens, noCharacterOffsetStart, noCharacterOffsetEnd);
}
/**
 * Builds a sentence (text, tokens, character offsets and token offsets) from a list of tokens.
 * <p>
 * The sentence text is recovered as follows:
 * a) build it from the OriginalTextAnnotation of each token;
 * b) if that is not possible, build it from the TextAnnotation of each token,
 *    in which case the token character offsets are re-aligned to that text.
 * <p>
 * Sentence character offsets come from the caller when both are given and no
 * re-alignment took place; otherwise they are taken from the first and last
 * token. An empty token list yields an empty sentence spanning [0, 0).
 * Token offsets are always set to 0 and the number of tokens.
 *
 * @param tokens The tokens of the sentence
 * @param characterOffsetStart Character offset of the sentence start, or null if unknown
 * @param characterOffsetEnd Character offset of the sentence end, or null if unknown
 * @return The newly built sentence
 * @throws RuntimeException if neither annotation is available to build the text from
 */
private static CoreMap buildSentenceFromTokens(
    List<CoreLabel> tokens,
    Integer characterOffsetStart,
    Integer characterOffsetEnd) {
  boolean adjustCharacterOffsets = false;
  // try to recover the text from the original tokens
  String text = buildText(tokens, CoreAnnotations.OriginalTextAnnotation.class);
  if (text == null) {
    text = buildText(tokens, CoreAnnotations.TextAnnotation.class);
    // character offset will point to the original tokens
    // so we need to align them to the text built from normalized tokens
    adjustCharacterOffsets = true;
    if (text == null) {
      throw new RuntimeException("ERROR: to use SUTime, sentences must have TextAnnotation set, or the individual tokens must have OriginalTextAnnotation or TextAnnotation set!");
    }
  }

  // make sure token character offsets are aligned with text
  List<CoreLabel> tokenSequence = copyTokens(tokens, adjustCharacterOffsets, false);

  Annotation newSentence = new Annotation(text);
  newSentence.set(CoreAnnotations.TokensAnnotation.class, tokenSequence);
  if (! adjustCharacterOffsets &&
      characterOffsetStart != null &&
      characterOffsetEnd != null) {
    newSentence.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, characterOffsetStart);
    newSentence.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, characterOffsetEnd);
  } else if (tokenSequence.isEmpty()) {
    // no first/last token to read offsets from; the text is empty, so the span is empty too
    newSentence.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, 0);
    newSentence.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, 0);
  } else {
    int tokenCharStart = tokenSequence.get(0).get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
    int tokenCharEnd = tokenSequence.get(tokenSequence.size() - 1).get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
    newSentence.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, tokenCharStart);
    newSentence.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, tokenCharEnd);
  }

  // some default token offsets
  newSentence.set(CoreAnnotations.TokenBeginAnnotation.class, 0);
  newSentence.set(CoreAnnotations.TokenEndAnnotation.class, tokenSequence.size());
  return newSentence;
}
/**
 * Concatenates the given text annotation of every token into one string.
 * Consecutive tokens are separated by as many spaces as the gap between the
 * previous token's end offset and the current token's begin offset; when
 * either offset is unavailable a single space is used, and a zero or negative
 * gap adds no space.
 *
 * @param tokens The tokens to concatenate
 * @param textAnnotation The annotation holding the text of each token
 * @return The concatenated text, or null as soon as one token lacks the annotation
 */
private static String buildText(List<CoreLabel> tokens, Class<? extends CoreAnnotation<String>> textAnnotation) {
  StringBuilder os = new StringBuilder();
  CoreLabel prev = null;
  for (CoreLabel crt : tokens) {
    if (prev != null) {
      int spaces = 1;
      // both offsets are needed for the gap; tokens may come without them
      Integer crtBegin = crt.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
      Integer prevEnd = prev.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
      if (crtBegin != null && prevEnd != null) {
        spaces = crtBegin - prevEnd;
      }
      for (; spaces > 0; spaces--) {
        os.append(' ');
      }
    }
    String word = crt.get(textAnnotation);
    if (word == null) {
      // this annotation does not exist; bail out
      return null;
    }
    os.append(word);
    prev = crt;
  }
  return os.toString();
}
/**
 * Runs the time expression extractor on one sentence and returns whatever it finds.
 *
 * @param sentence The sentence to analyse
 * @param document Contains document-level annotations such as DocDateAnnotation
 * @return The time expressions reported by the extractor; this may be null,
 *         since the caller checks for that
 */
private List<CoreMap> runSUTime(CoreMap sentence, final CoreMap document) {
  final List<CoreMap> temporals = timexExtractor.extractTimeExpressionCoreMaps(sentence, document);
  if (DEBUG && temporals != null) {
    System.out.println("FOUND TEMPORALS: " + temporals);
  }
  return temporals;
}
/**
 * Relabels numbers adjacent to currency or percent markers as MONEY or PERCENT.
 * Currency symbols (e.g., $) are accepted both before and after a number;
 * currency words (e.g., "dollars") and percent markers only after one.
 * A currency marker after a number is ignored when a weight word is found
 * shortly to its left.
 *
 * @param tokenSequence The tokens to relabel in place; answer annotations must already be set
 */
private void moneyAndPercentRecognizer(List<CoreLabel> tokenSequence) {
  final int size = tokenSequence.size();
  for (int i = 0; i < size; i++) {
    final CoreLabel crt = tokenSequence.get(i);
    final CoreLabel next = (i + 1 < size) ? tokenSequence.get(i + 1) : null;
    final CoreLabel prev = (i > 0) ? tokenSequence.get(i - 1) : null;
    final String word = crt.word();
    final boolean currencySymbol = CURRENCY_SYMBOL_PATTERN.matcher(word).matches();

    // $5
    if (currencySymbol && next != null && hasNumericReading(next)) {
      crt.set(CoreAnnotations.AnswerAnnotation.class, "MONEY");
      final int stoppedAt = changeLeftToRight(tokenSequence, i + 1,
          next.get(CoreAnnotations.AnswerAnnotation.class),
          next.tag(), "MONEY");
      // resume right after the relabelled run (the loop increment adds the 1 back)
      i = stoppedAt - 1;
      continue;
    }

    // the remaining rules all look at the preceding token
    if (prev == null) {
      continue;
    }

    // 5$, 5 dollars
    if ((CURRENCY_WORD_PATTERN.matcher(word).matches() || currencySymbol) &&
        hasNumericReading(prev) &&
        ! leftScanFindsWeightWord(tokenSequence, i)) {
      crt.set(CoreAnnotations.AnswerAnnotation.class, "MONEY");
      changeRightToLeft(tokenSequence, i - 1,
          prev.get(CoreAnnotations.AnswerAnnotation.class),
          prev.tag(), "MONEY");
    }
    // 5%, 5 percent
    else if ((PERCENT_WORD_PATTERN.matcher(word).matches() ||
              PERCENT_SYMBOL_PATTERN.matcher(word).matches()) &&
             hasNumericReading(prev)) {
      crt.set(CoreAnnotations.AnswerAnnotation.class, "PERCENT");
      changeRightToLeft(tokenSequence, i - 1,
          prev.get(CoreAnnotations.AnswerAnnotation.class),
          prev.tag(), "PERCENT");
    }
  }
}

/** Whether the token is already labelled NUMBER or is tagged CD. */
private static boolean hasNumericReading(CoreLabel token) {
  return token.get(CoreAnnotations.AnswerAnnotation.class).equals("NUMBER") ||
      token.tag().equals("CD");
}
/**
 * Labels ordinal numbers as ORDINAL.
 * Only tokens that are still background or labelled NUMBER are considered,
 * and only when their word matches {@link #ORDINAL_PATTERN}.
 *
 * @param tokenSequence The tokens to relabel in place; answer annotations must already be set
 */
private void ordinalRecognizer(List<CoreLabel> tokenSequence) {
  for (CoreLabel token : tokenSequence) {
    final String answer = token.get(CoreAnnotations.AnswerAnnotation.class);
    final boolean relabelable = answer.equals(flags.backgroundSymbol) || answer.equals("NUMBER");
    if (relabelable && ORDINAL_PATTERN.matcher(token.word()).matches()) {
      token.set(CoreAnnotations.AnswerAnnotation.class, "ORDINAL");
    }
  }
}
/**
 * Relabels a run of similar tokens, moving rightwards from start.
 * When oldTag is a real label, the run continues while tokens carry that
 * label; when oldTag is the background symbol, the run continues while
 * tokens carry the POS tag posTag.
 *
 * @param tokens The tokens to relabel in place
 * @param start Index of the first token of the run
 * @param oldTag The answer label that identifies the run
 * @param posTag The POS tag that identifies the run when oldTag is the background symbol
 * @param newTag The label to assign to every token of the run
 * @return Index of the first token that was not relabelled (may equal tokens.size())
 */
private int changeLeftToRight(List<CoreLabel> tokens,
                              int start,
                              String oldTag,
                              String posTag,
                              String newTag) {
  int pos = start;
  for (; pos < tokens.size(); pos++) {
    final CoreLabel crt = tokens.get(pos);
    // no NER label to follow means we follow the POS tag instead
    final boolean scanByPosTag = oldTag.equals(flags.backgroundSymbol);
    final boolean partOfRun = scanByPosTag
        ? crt.tag().equals(posTag)
        : crt.get(CoreAnnotations.AnswerAnnotation.class).equals(oldTag);
    if (!partOfRun) {
      break;
    }
    crt.set(CoreAnnotations.AnswerAnnotation.class, newTag);
  }
  return pos;
}
/**
 * Relabels a run of similar tokens, moving leftwards from start.
 * When oldTag is a real label, the run continues while tokens carry that
 * label; when oldTag is the background symbol, the run continues while
 * tokens carry the POS tag posTag.
 *
 * @param tokens The tokens to relabel in place
 * @param start Index of the last (rightmost) token of the run
 * @param oldTag The answer label that identifies the run
 * @param posTag The POS tag that identifies the run when oldTag is the background symbol
 * @param newTag The label to assign to every token of the run
 * @return Index of the first token to the left that was not relabelled (may be -1)
 */
private int changeRightToLeft(List<CoreLabel> tokens,
                              int start,
                              String oldTag,
                              String posTag,
                              String newTag) {
  int pos = start;
  for (; pos >= 0; pos--) {
    final CoreLabel crt = tokens.get(pos);
    // no NER label to follow means we follow the POS tag instead
    final boolean scanByPosTag = oldTag.equals(flags.backgroundSymbol);
    final boolean partOfRun = scanByPosTag
        ? crt.tag().equals(posTag)
        : crt.get(CoreAnnotations.AnswerAnnotation.class).equals(oldTag);
    if (!partOfRun) {
      break;
    }
    crt.set(CoreAnnotations.AnswerAnnotation.class, newTag);
  }
  return pos;
}
/**
 * Copies tokens, optionally re-aligning their character offsets with the token text.
 * The list is copied only when offsets are adjusted or a copy is forced;
 * otherwise the original list itself is returned.
 * When adjusting, the length of a token is that of its OriginalTextAnnotation
 * if that key is present, and that of word() otherwise.
 *
 * @param srcList The tokens to copy
 * @param adjustCharacterOffsets If true, offsets are rewritten so that each token is exactly as long as its text
 * @param forceCopy If true, a new list is produced even when no adjustment is needed
 * @return srcList itself, or a new list of copied tokens
 */
private static List<CoreLabel> copyTokens(List<CoreLabel> srcList,
                                          boolean adjustCharacterOffsets,
                                          boolean forceCopy) {
  // no need to adjust anything; use the original list
  if (!(adjustCharacterOffsets || forceCopy)) {
    return srcList;
  }

  final List<CoreLabel> copies = new ArrayList<CoreLabel>();
  int shift = 0;           // accumulated length difference of the tokens copied so far
  int syntheticBegin = 0;  // running position for tokens that carry no offsets
  for (CoreLabel token : srcList) {
    if (!adjustCharacterOffsets) {
      copies.add(copyCoreLabel(token, null, null));
      continue;
    }

    final int length = token.containsKey(CoreAnnotations.OriginalTextAnnotation.class)
        ? token.get(CoreAnnotations.OriginalTextAnnotation.class).length()
        : token.word().length();
    final boolean hasOffsets =
        token.containsKey(CoreAnnotations.CharacterOffsetBeginAnnotation.class) &&
        token.containsKey(CoreAnnotations.CharacterOffsetEndAnnotation.class);

    if (hasOffsets) {
      // We try to preserve the old character offsets but they just don't work well for normalized token text
      final int oldBegin = token.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
      final int oldEnd = token.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
      final int newBegin = oldBegin + shift;
      final int newEnd = newBegin + length;
      copies.add(copyCoreLabel(token, newBegin, newEnd));
      shift += length - (oldEnd - oldBegin);
    } else {
      // offsets are not set: lay the tokens out one after another
      final int newBegin = syntheticBegin;
      final int newEnd = newBegin + length;
      syntheticBegin = newEnd + 1; // allow for one space character
      copies.add(copyCoreLabel(token, newBegin, newEnd));
    }
  }
  return copies;
}
/**
 * Copies from src to dst the annotations generated by SUTime and NumberNormalizer:
 * the numeric composite value, the numeric composite type and the timex.
 * An annotation is copied only when its key is present on src.
 *
 * @param src The token to read from
 * @param dst The token to write to
 */
public static void transferAnnotations(CoreLabel src, CoreLabel dst) {
  // annotations potentially set by NumberNormalizer
  final boolean hasValue = src.containsKey(CoreAnnotations.NumericCompositeValueAnnotation.class);
  if (hasValue) {
    dst.set(CoreAnnotations.NumericCompositeValueAnnotation.class,
        src.get(CoreAnnotations.NumericCompositeValueAnnotation.class));
  }
  final boolean hasType = src.containsKey(CoreAnnotations.NumericCompositeTypeAnnotation.class);
  if (hasType) {
    dst.set(CoreAnnotations.NumericCompositeTypeAnnotation.class,
        src.get(CoreAnnotations.NumericCompositeTypeAnnotation.class));
  }

  // annotations set by SUTime
  final boolean hasTimex = src.containsKey(TimeAnnotations.TimexAnnotation.class);
  if (hasTimex) {
    dst.set(TimeAnnotations.TimexAnnotation.class,
        src.get(TimeAnnotations.TimexAnnotation.class));
  }
}
/**
 * Creates a copy of srcTokens, deciding on the fly whether character offsets need adjusting.
 * Offsets are adjusted when there is no sentence, the sentence has no
 * TextAnnotation, there are no tokens, or the first token has no
 * OriginalTextAnnotation. A new list is always produced.
 *
 * @param srcTokens The tokens to copy
 * @param srcSentence The sentence the tokens belong to; may be null
 * @return A new list holding copies of the tokens
 */
public static List<CoreLabel> copyTokens(List<CoreLabel> srcTokens, CoreMap srcSentence) {
  final boolean adjustCharacterOffsets =
      srcSentence == null
      || srcSentence.get(CoreAnnotations.TextAnnotation.class) == null
      || srcTokens.isEmpty()
      || srcTokens.get(0).get(CoreAnnotations.OriginalTextAnnotation.class) == null;
  return copyTokens(srcTokens, adjustCharacterOffsets, true);
}
/**
 * Creates a new CoreLabel holding only the fields required for numeric entity
 * extraction: word, POS tag, original text (if present), character offsets,
 * and the annotations handled by {@link #transferAnnotations}.
 *
 * @param src Source CoreLabel to copy.
 * @param startOffset Begin offset for the copy, or null to reuse the one stored on src
 * @param endOffset End offset for the copy, or null to reuse the one stored on src
 * @return The new CoreLabel
 */
private static CoreLabel copyCoreLabel(CoreLabel src, Integer startOffset, Integer endOffset) {
  final CoreLabel copy = new CoreLabel();
  copy.setWord(src.word());
  copy.setTag(src.tag());

  final boolean hasOriginalText = src.containsKey(CoreAnnotations.OriginalTextAnnotation.class);
  if (hasOriginalText) {
    copy.set(CoreAnnotations.OriginalTextAnnotation.class,
        src.get(CoreAnnotations.OriginalTextAnnotation.class));
  }

  // an explicit offset wins; otherwise whatever src holds is carried over
  final Integer begin = (startOffset != null)
      ? startOffset
      : src.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
  copy.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, begin);

  final Integer end = (endOffset != null)
      ? endOffset
      : src.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
  copy.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, end);

  transferAnnotations(src, copy);
  return copy;
}
/**
 * Full and abbreviated English month names; the trailing period of an
 * abbreviation is optional for every month, including "Dec".
 */
private static final Pattern MONTH_PATTERN = Pattern.compile("January|Jan\\.?|February|Feb\\.?|March|Mar\\.?|April|Apr\\.?|May|June|Jun\\.?|July|Jul\\.?|August|Aug\\.?|September|Sept?\\.?|October|Oct\\.?|November|Nov\\.?|December|Dec\\.?");
/** A four-digit year starting with 1-3, or a two-digit year with an optional leading apostrophe. */
private static final Pattern YEAR_PATTERN = Pattern.compile("[1-3][0-9]{3}|'?[0-9]{2}");
/** A day of the month, 1-31, optionally followed by st, nd or rd. */
private static final Pattern DAY_PATTERN = Pattern.compile("(?:[1-9]|[12][0-9]|3[01])(?:st|nd|rd)?");
/** Dates of the form xx/xx/xxxx; each slash may be preceded by a backslash. */
private static final Pattern DATE_PATTERN = Pattern.compile("(?:[1-9]|[0-3][0-9])\\\\?/(?:[1-9]|[0-3][0-9])\\\\?/[1-3][0-9]{3}");
/** Dates of the form yyyy-mm-dd or yyyy/mm/dd. */
private static final Pattern DATE_PATTERN2 = Pattern.compile("[12][0-9]{3}[-/](?:0?[1-9]|1[0-2])[-/][0-3][0-9]");
/** Times of the form x(x):xx. */
private static final Pattern TIME_PATTERN = Pattern.compile("[0-2]?[0-9]:[0-5][0-9]");
/** Times of the form xx:xx:xx. */
private static final Pattern TIME_PATTERN2 = Pattern.compile("[0-2][0-9]:[0-5][0-9]:[0-5][0-9]");
/** "am" / "pm", with optional periods, in any case. */
private static final Pattern AM_PM = Pattern.compile("(a\\.?m\\.?)|(p\\.?m\\.?)", Pattern.CASE_INSENSITIVE);
/** Currency unit words (dollar(s), cent(s), euro(s), pound(s), penny, pence, yen, yuan, won), in any case. */
public static final Pattern CURRENCY_WORD_PATTERN = Pattern.compile("(?:dollar|cent|euro|pound)s?|penny|pence|yen|yuan|won", Pattern.CASE_INSENSITIVE);
/** Currency signs, including "#" and the prefixed dollar forms US$, HK$ and A$. */
public static final Pattern CURRENCY_SYMBOL_PATTERN = Pattern.compile("\\$|£|\u00A3|\u00A5|#|\u20AC|US\\$|HK\\$|A\\$", Pattern.CASE_INSENSITIVE);
/** Ordinals written with digits (1st, 22nd, 100th, ...) or as words (first ... millionth), in any case. */
public static final Pattern ORDINAL_PATTERN = Pattern.compile("(?i)[2-9]?1st|[2-9]?2nd|[2-9]?3rd|1[0-9]th|[2-9]?[04-9]th|100+th|zeroth|first|second|third|fourth|fifth|sixth|seventh|eighth|ninth|tenth|eleventh|twelfth|thirteenth|fourteenth|fifteenth|sixteenth|seventeenth|eighteenth|nineteenth|twentieth|twenty-first|twenty-second|twenty-third|twenty-fourth|twenty-fifth|twenty-sixth|twenty-seventh|twenty-eighth|twenty-ninth|thirtieth|thirty-first|fortieth|fiftieth|sixtieth|seventieth|eightieth|ninetieth|hundredth|thousandth|millionth");
/** Four digits with a leading zero, e.g. 0730. */
public static final Pattern ARMY_TIME_MORNING = Pattern.compile("0([0-9])([0-9]){2}");
/** Lower-case words naming a part of the day. */
public static final Pattern GENERIC_TIME_WORDS = Pattern.compile("(morning|evening|night|noon|midnight|teatime|lunchtime|dinnertime|suppertime|afternoon|midday|dusk|dawn|sunup|sundown|daybreak|day)");
/** The word "percent", in any case. */
public static final Pattern PERCENT_WORD_PATTERN = Pattern.compile("percent", Pattern.CASE_INSENSITIVE);
/** The percent sign. */
public static final Pattern PERCENT_SYMBOL_PATTERN = Pattern.compile("%");
/**
 * Rule-based classification that does not use SUTime.
 * The tokens are visited left to right. Each token's answer is first reset to
 * the background symbol and then possibly replaced by MONEY, TIME, DATE,
 * NUMBER or ORDINAL, by the first matching branch of one else-if chain.
 * Several rules read the answer already assigned to the preceding token(s),
 * so the order of the visit and of the branches matters.
 * Neighbours are read through a PaddedList built with the field
 * <code>pad</code>, which is declared outside this class (presumably it is
 * returned for positions outside the document -- verify against PaddedList).
 *
 * @param document The POS-tagged tokens to label; they are modified in place
 * @return The same list that was passed in
 */
private List<CoreLabel> classifyOld(List<CoreLabel> document) {
// if (DEBUG) { System.err.println("NumberSequenceClassifier tagging"); }
PaddedList<CoreLabel> pl = new PaddedList<CoreLabel>(document, pad);
for (int i = 0, sz = pl.size(); i < sz; i++) {
CoreLabel me = pl.get(i);
CoreLabel prev = pl.get(i - 1);
CoreLabel next = pl.get(i + 1);
CoreLabel next2 = pl.get(i + 2);
//if (DEBUG) { System.err.println("Tagging:" + me.word()); }
// any answer already present on the token is discarded
me.set(CoreAnnotations.AnswerAnnotation.class, flags.backgroundSymbol);
// Rule: a currency sign next to a CD token is MONEY
if (CURRENCY_SYMBOL_PATTERN.matcher(me.word()).matches() &&
(prev.getString(CoreAnnotations.PartOfSpeechAnnotation.class).equals("CD") ||
next.getString(CoreAnnotations.PartOfSpeechAnnotation.class).equals("CD"))) {
// dollar, pound, pound, yen,
// Penn Treebank ancient # as pound, euro,
if (DEBUG) {
System.err.println("Found currency sign:" + me.word());
}
me.set(CoreAnnotations.AnswerAnnotation.class, "MONEY");
} else if (me.getString(CoreAnnotations.PartOfSpeechAnnotation.class).equals("CD")) {
// Rules for CD tokens: TIME, DATE, MONEY and year checks, in that order; NUMBER is the fallback
if (DEBUG) {
System.err.println("Tagging CD:" + me.word());
}
if (TIME_PATTERN.matcher(me.word()).matches()) {
me.set(CoreAnnotations.AnswerAnnotation.class, "TIME");
} else if (TIME_PATTERN2.matcher(me.word()).matches()) {
me.set(CoreAnnotations.AnswerAnnotation.class, "TIME");
} else if (DATE_PATTERN.matcher(me.word()).matches()) {
me.set(CoreAnnotations.AnswerAnnotation.class, "DATE");
} else if (DATE_PATTERN2.matcher(me.word()).matches()) {
me.set(CoreAnnotations.AnswerAnnotation.class, "DATE");
} else if (next.get(CoreAnnotations.TextAnnotation.class) != null &&
me.get(CoreAnnotations.TextAnnotation.class) != null &&
DAY_PATTERN.matcher(me.get(CoreAnnotations.TextAnnotation.class)).matches() &&
MONTH_PATTERN.matcher(next.get(CoreAnnotations.TextAnnotation.class)).matches()) {
// deterministically make DATE for British-style number before month
me.set(CoreAnnotations.AnswerAnnotation.class, "DATE");
} else if (prev.get(CoreAnnotations.TextAnnotation.class) != null &&
MONTH_PATTERN.matcher(prev.get(CoreAnnotations.TextAnnotation.class)).matches() &&
me.get(CoreAnnotations.TextAnnotation.class) != null &&
DAY_PATTERN.matcher(me.get(CoreAnnotations.TextAnnotation.class)).matches()) {
// deterministically make DATE for number after month
me.set(CoreAnnotations.AnswerAnnotation.class, "DATE");
} else if (rightScanFindsMoneyWord(pl, i) && ! leftScanFindsWeightWord(pl, i)) {
// the run of CD tokens starting here is followed by a currency word, and no weight word is close by on the left
me.set(CoreAnnotations.AnswerAnnotation.class, "MONEY");
} else if(ARMY_TIME_MORNING.matcher(me.word()).matches()) {
me.set(CoreAnnotations.AnswerAnnotation.class, "TIME");
} else
// a year directly after a DATE token that is a month name or is itself preceded by a DATE
if (YEAR_PATTERN.matcher(me.word()).matches() &&
prev.getString(CoreAnnotations.AnswerAnnotation.class).equals("DATE") &&
(MONTH_PATTERN.matcher(prev.word()).matches() ||
pl.get(i - 2).get(CoreAnnotations.AnswerAnnotation.class).equals("DATE")))
{
me.set(CoreAnnotations.AnswerAnnotation.class, "DATE");
} else {
if (DEBUG) {
System.err.println("Found number:" + me.word());
}
// a number following MONEY continues the amount; anything else is a plain NUMBER
if (prev.getString(CoreAnnotations.AnswerAnnotation.class).equals("MONEY")) {
me.set(CoreAnnotations.AnswerAnnotation.class, "MONEY");
} else {
me.set(CoreAnnotations.AnswerAnnotation.class, "NUMBER");
}
}
} else if(AM_PM.matcher(me.word()).matches() &&
prev.get(CoreAnnotations.AnswerAnnotation.class).equals("TIME")){
// Rule: am/pm after a TIME token extends the TIME
me.set(CoreAnnotations.AnswerAnnotation.class, "TIME");
} else if (me.getString(CoreAnnotations.PartOfSpeechAnnotation.class) != null &&
me.getString(CoreAnnotations.PartOfSpeechAnnotation.class).equals(",") &&
prev.getString(CoreAnnotations.AnswerAnnotation.class).equals("DATE") &&
next.word() != null && YEAR_PATTERN.matcher(next.word()).matches()) {
// Rule: a comma between a DATE token and a year is part of the DATE
me.set(CoreAnnotations.AnswerAnnotation.class, "DATE");
} else if (me.getString(CoreAnnotations.PartOfSpeechAnnotation.class).equals("NNP") &&
MONTH_PATTERN.matcher(me.word()).matches()) {
// Rule: a month name is a DATE when it follows a DATE token or precedes a CD token
if (prev.getString(CoreAnnotations.AnswerAnnotation.class).equals("DATE") ||
next.getString(CoreAnnotations.PartOfSpeechAnnotation.class).equals("CD")) {
me.set(CoreAnnotations.AnswerAnnotation.class, "DATE");
}
} else if (me.getString(CoreAnnotations.PartOfSpeechAnnotation.class) != null &&
me.getString(CoreAnnotations.PartOfSpeechAnnotation.class).equals("CC")) {
// Rule: "and" between two CD tokens is a NUMBER when the left one is hundred/thousand/million/billion/trillion
if (prev.tag() != null && prev.tag().equals("CD") &&
next.tag() != null && next.tag().equals("CD") &&
me.get(CoreAnnotations.TextAnnotation.class) != null &&
me.get(CoreAnnotations.TextAnnotation.class).equalsIgnoreCase("and")) {
if (DEBUG) {
System.err.println("Found number and:" + me.word());
}
String wd = prev.word();
if (wd.equalsIgnoreCase("hundred") ||
wd.equalsIgnoreCase("thousand") ||
wd.equalsIgnoreCase("million") ||
wd.equalsIgnoreCase("billion") ||
wd.equalsIgnoreCase("trillion"))
{
me.set(CoreAnnotations.AnswerAnnotation.class, "NUMBER");
}
}
} else if (me.getString(CoreAnnotations.PartOfSpeechAnnotation.class) != null &&
(me.getString(CoreAnnotations.PartOfSpeechAnnotation.class).equals("NN") ||
me.getString(CoreAnnotations.PartOfSpeechAnnotation.class).equals("NNS"))) {
// Rules for common nouns: currency words, the abbreviations "m"/"b", ordinals before a month, generic time words
if (CURRENCY_WORD_PATTERN.matcher(me.word()).matches()) {
if (prev.getString(CoreAnnotations.PartOfSpeechAnnotation.class).equals("CD") &&
prev.getString(CoreAnnotations.AnswerAnnotation.class).equals("MONEY")) {
me.set(CoreAnnotations.AnswerAnnotation.class, "MONEY");
}
} else if (me.word().equals("m") || me.word().equals("b")) {
// could be metres, but it's probably million or billion in our
// applications
if (prev.getString(CoreAnnotations.AnswerAnnotation.class).equals("MONEY")) {
me.set(CoreAnnotations.AnswerAnnotation.class, "MONEY");
} else {
me.set(CoreAnnotations.AnswerAnnotation.class, "NUMBER");
}
} else if (ORDINAL_PATTERN.matcher(me.word()).matches()) {
if ((next.word() != null && MONTH_PATTERN.matcher(next.word()).matches()) ||
(next.word() != null && next.word().equalsIgnoreCase("of") &&
next2.word() != null && MONTH_PATTERN.matcher(next2.word()).matches())) {
me.set(CoreAnnotations.AnswerAnnotation.class, "DATE");
}
} else if(GENERIC_TIME_WORDS.matcher(me.word()).matches()){
me.set(CoreAnnotations.AnswerAnnotation.class, "TIME");
}
} else if (me.getString(CoreAnnotations.PartOfSpeechAnnotation.class).equals("JJ")) {
// Rules for adjectives: DATE when followed by a month (or "of" + month), otherwise ORDINAL if the word is one
if ((next.word() != null && MONTH_PATTERN.matcher(next.word()).matches()) ||
next.word() != null && next.word().equalsIgnoreCase("of") &&
next2.word() != null && MONTH_PATTERN.matcher(next2.word()).matches()) {
me.set(CoreAnnotations.AnswerAnnotation.class, "DATE");
} else if (ORDINAL_PATTERN.matcher(me.word()).matches()) {
// don't do other tags: don't want 'second' as noun, or 'first' as adverb
// introducing reasons
me.set(CoreAnnotations.AnswerAnnotation.class, "ORDINAL");
}
} else if (me.getString(CoreAnnotations.PartOfSpeechAnnotation.class).equals("IN") &&
me.word().equalsIgnoreCase("of")) {
// Rule: "of" between an ordinal and a month name is part of the DATE
if (prev.get(CoreAnnotations.TextAnnotation.class) != null &&
ORDINAL_PATTERN.matcher(prev.get(CoreAnnotations.TextAnnotation.class)).matches() &&
next.get(CoreAnnotations.TextAnnotation.class) != null &&
MONTH_PATTERN.matcher(next.get(CoreAnnotations.TextAnnotation.class)).matches()) {
me.set(CoreAnnotations.AnswerAnnotation.class, "DATE");
}
}
}
return document;
}
/**
 * Looks at up to 3 tokens to the left of position i for a word starting with
 * "weigh", which indicates weight rather than money.
 *
 * @param pl The list of CoreLabel
 * @param i The position to scan left from (the token at i itself is not examined)
 * @return whether a weight word is found
 */
private static boolean leftScanFindsWeightWord(List<CoreLabel> pl, int i) {
  if (DEBUG) {
    System.err.println("leftScan from: " + pl.get(i).word());
  }
  final int leftmost = Math.max(0, i - 3);
  for (int j = i - 1; j >= leftmost; j--) {
    final CoreLabel candidate = pl.get(j);
    if (!candidate.word().startsWith("weigh")) {
      continue;
    }
    if (DEBUG) {
      System.err.println("leftScan found weight: " + candidate.word());
    }
    return true;
  }
  return false;
}
/**
 * Skips over the CD-tagged tokens starting at position i and tests whether the
 * token right after them is a money word like cents or pounds, i.e. a word
 * tagged NN or NNS that matches {@link #CURRENCY_WORD_PATTERN}.
 *
 * @param pl The list of CoreLabel
 * @param i The position to scan right from
 * @return Whether a money word is found; false when the CD tokens run to the end of the list
 */
private static boolean rightScanFindsMoneyWord(List<CoreLabel> pl, int i) {
  if (DEBUG) {
    System.err.println("rightScan from: " + pl.get(i).word());
  }
  final int size = pl.size();
  int j;
  for (j = i; j < size; j++) {
    if (!pl.get(j).getString(CoreAnnotations.PartOfSpeechAnnotation.class).equals("CD")) {
      break;
    }
  }
  if (j >= size) {
    return false;
  }
  final CoreLabel follower = pl.get(j);
  final String tag = follower.getString(CoreAnnotations.PartOfSpeechAnnotation.class);
  final String word = follower.word();
  final boolean moneyWord =
      (tag.equals("NN") || tag.equals("NNS")) && CURRENCY_WORD_PATTERN.matcher(word).matches();
  if (DEBUG) {
    System.err.println("rightScan testing: " + word + '/' + tag + "; answer is: " + Boolean.toString(moneyWord));
  }
  return moneyWord;
}
// Implement other methods of AbstractSequenceClassifier interface

/**
 * Does nothing: this classifier is a fixed set of rules and has nothing to learn.
 *
 * @param docs Ignored
 * @param readerAndWriter Ignored
 */
@Override
public void train(Collection<List<CoreLabel>> docs,
                  DocumentReaderAndWriter<CoreLabel> readerAndWriter) {
  // intentionally empty
}
/**
 * Does nothing: the rules are deterministic, so no probabilities are printed.
 *
 * @param document Ignored
 */
@Override
public void printProbsDocument(List<CoreLabel> document) {
  // intentionally empty
}
/**
 * Reports a serialization on stderr without writing anything: the progress
 * message and "done." are printed, but no file is created at serializePath.
 *
 * @param serializePath Path that is only echoed in the message
 */
@Override
public void serializeClassifier(String serializePath) {
  final String progressMessage = "Serializing classifier to " + serializePath + "...";
  System.err.print(progressMessage);
  System.err.println("done.");
}
/**
 * Does nothing: the stream is not read and no state is restored.
 *
 * @param in Ignored
 * @param props Ignored
 */
@Override
public void loadClassifier(ObjectInputStream in, Properties props) throws IOException, ClassCastException, ClassNotFoundException {
  // intentionally empty
}
/**
 * Command-line entry point. The arguments are turned into properties, which
 * are used both as classifier properties and as SUTime properties; SUTime is
 * always enabled here. Depending on the flags that are set, the classifier is
 * loaded or trained, serialized, run on a test file, and run on a plain text file.
 *
 * @param args Command-line arguments, converted with StringUtils.argsToProperties
 * @throws Exception If any of the requested steps fails
 */
public static void main(String[] args) throws Exception {
  final Properties props = StringUtils.argsToProperties(args);
  final NumberSequenceClassifier classifier =
      new NumberSequenceClassifier(props, true, props);

  // all flag values are read up front, before a classifier may be loaded
  final String trainFile = classifier.flags.trainFile;
  final String testFile = classifier.flags.testFile;
  final String textFile = classifier.flags.textFile;
  final String loadPath = classifier.flags.loadClassifier;
  final String serializeTo = classifier.flags.serializeTo;

  // loading takes precedence over training
  if (loadPath != null) {
    classifier.loadClassifierNoExceptions(loadPath);
    classifier.flags.setProperties(props);
  } else if (trainFile != null) {
    classifier.train(trainFile);
  }

  if (serializeTo != null) {
    classifier.serializeClassifier(serializeTo);
  }

  if (testFile != null) {
    classifier.classifyAndWriteAnswers(testFile, classifier.makeReaderAndWriter(), true);
  }

  if (textFile != null) {
    final DocumentReaderAndWriter<CoreLabel> plainTextReaderAndWriter =
        new PlainTextDocumentReaderAndWriter<CoreLabel>();
    classifier.classifyAndWriteAnswers(textFile, plainTextReaderAndWriter, false);
  }
} // end main
}