}
}
}
// TODO: Should we allow "," in written out numbers?
// TODO: Handle "-" that is not with token?
TokenSequenceMatcher matcher = numberPattern.getMatcher(tokens);
List<CoreMap> numbers = new ArrayList<CoreMap>();
while (matcher.find()) {
@SuppressWarnings("unused")
List<CoreMap> matchedTokens = matcher.groupNodes();
int numStart = matcher.start();
int possibleNumEnd = -1;
int lastUnitPos = -1;
int possibleNumStart = -1;
Number possibleNumEndUnit = null;
Number lastUnit = null;
// Check if we need to split matched chunk up more
for (int i = matcher.start(); i < matcher.end(); i++) {
CoreLabel token = tokens.get(i);
CoreLabel prev = (i > matcher.start())? tokens.get(i - 1): null;
Number num = token.get(CoreAnnotations.NumericValueAnnotation.class);
Number prevNum = (prev != null)? prev.get(CoreAnnotations.NumericValueAnnotation.class):null;
String w = token.word();
w = w.trim().toLowerCase();
switch (w) {
case ",":
if (lastUnit != null && lastUnitPos == i - 1) {
// OKAY, this may be one big number
possibleNumEnd = i;
possibleNumEndUnit = lastUnit;
} else {
// Not one big number
if (numStart < i) {
numbers.add(ChunkAnnotationUtils.getAnnotatedChunk(annotation, numStart, i));
numStart = i + 1;
possibleNumEnd = -1;
possibleNumEndUnit = null;
lastUnit = null;
lastUnitPos = -1;
}
}
if (numStart == i) {
numStart = i + 1;
}
break;
case "and":
// Check whether the token before "and" was a unit
String prevWord = prev.word();
if (lastUnitPos == i - 1 || (lastUnitPos == i - 2 && ",".equals(prevWord))) {
// Okay
} else {
// Two separate numbers
if (numStart < possibleNumEnd) {
numbers.add(ChunkAnnotationUtils.getAnnotatedChunk(annotation, numStart, possibleNumEnd));
if (possibleNumStart >= possibleNumEnd) {
numStart = possibleNumStart;
} else {
numStart = i + 1;
}
} else if (numStart < i) {
numbers.add(ChunkAnnotationUtils.getAnnotatedChunk(annotation, numStart, i));
numStart = i + 1;
}
if (lastUnitPos < numStart) {
lastUnit = null;
lastUnitPos = -1;
}
possibleNumEnd = -1;
possibleNumEndUnit = null;
}
break;
default:
// NUMBER or ORDINAL
String numType = token.get(CoreAnnotations.NumericTypeAnnotation.class);
if ("UNIT".equals(numType)) {
// Compare this unit with previous
if (lastUnit == null || lastUnit.longValue() > num.longValue()) {
// lastUnit larger than this unit
// maybe four thousand two hundred?
// OKAY, probably one big number
} else {
if (numStart < possibleNumEnd) {
// Units are increasing - check if this unit is >= unit before "," (if so, need to split into chunks)
// Not one big number ( had a comma )
if (num.longValue() >= possibleNumEndUnit.longValue()) {
numbers.add(ChunkAnnotationUtils.getAnnotatedChunk(annotation, numStart, possibleNumEnd));
if (possibleNumStart >= possibleNumEnd) {
numStart = possibleNumStart;
} else {
numStart = i;
}
possibleNumEnd = -1;
possibleNumEndUnit = null;
}
} else {
// unit is increasing - can be okay, maybe five hundred thousand?
// what about four hundred five thousand
// unit might also be the same, as in thousand thousand,
// which we convert to million
}
}
lastUnit = num;
lastUnitPos = i;
} else {
// Normal number
if (num == null) {
logger.warning("NO NUMBER: " + token.word());
continue;
}
if (prevNum != null) {
if (num.doubleValue() > 0) {
if (num.doubleValue() < 10) {
// This number is a digit
// Treat following as two separate numbers
// \d+ [0-9]
// [one to nine] [0-9]
if (NumberNormalizer.numPattern.matcher(prev.word()).matches() ||
prevNum.longValue() < 10 || prevNum.longValue() % 10 != 0) {
// two separate numbers
if (numStart < i) {
numbers.add(ChunkAnnotationUtils.getAnnotatedChunk(annotation, numStart, i));
}
numStart = i;
possibleNumEnd = -1;
possibleNumEndUnit = null;
lastUnit = null;
lastUnitPos = -1;
}
} else {
String prevNumType = prev.get(CoreAnnotations.NumericTypeAnnotation.class);
if ("UNIT".equals(prevNumType)) {
// OKAY
} else if (!ordinalUnitPattern.matcher(w).matches()) {
// Start of new number
if (numStart < i) {
numbers.add(ChunkAnnotationUtils.getAnnotatedChunk(annotation, numStart, i));
}
numStart = i;
possibleNumEnd = -1;
possibleNumEndUnit = null;
lastUnit = null;
lastUnitPos = -1;
}
}
}
}
if ("ORDINAL".equals(numType)) {
if (possibleNumEnd >= 0) {
if (numStart < possibleNumEnd) {
numbers.add(ChunkAnnotationUtils.getAnnotatedChunk(annotation, numStart, possibleNumEnd));
}
if (possibleNumStart > possibleNumEnd) {
numbers.add(ChunkAnnotationUtils.getAnnotatedChunk(annotation, possibleNumStart, i + 1));
} else {
numbers.add(ChunkAnnotationUtils.getAnnotatedChunk(annotation, possibleNumEnd + 1, i + 1));
}
} else {
if (numStart < i + 1) {
numbers.add(ChunkAnnotationUtils.getAnnotatedChunk(annotation, numStart, i + 1));
}
}
numStart = i + 1;
possibleNumEnd = -1;
possibleNumEndUnit = null;
lastUnit = null;
lastUnitPos = -1;
}
if (possibleNumStart < possibleNumEnd) {
possibleNumStart = i;
}
}
break;
}
}
if (numStart < matcher.end()) {
numbers.add(ChunkAnnotationUtils.getAnnotatedChunk(annotation, numStart, matcher.end()));
}
}
for (CoreMap n:numbers) {
String exp = n.get(CoreAnnotations.TextAnnotation.class);
if (exp.trim().equals("")) { continue; }