// Per-token annotations harvested from matching tag attributes (only allocated if
// token annotation patterns were configured), plus saved attribute values keyed by
// annotation class (kept on stacks so nested tags can be handled)
CoreMap tokenAnnotations = (tokenAnnotationPatterns != null && !tokenAnnotationPatterns.isEmpty()) ? new ArrayCoreMap() : null;
Map<Class, Stack<Pair<String, String>>> savedTokenAnnotations = new ArrayMap<Class, Stack<Pair<String, String>>>();
// Local variables for annotating sections
XMLUtils.XMLTag sectionStartTag = null;
CoreLabel sectionStartToken = null;
CoreMap sectionAnnotations = null;
Map<Class, List<CoreLabel>> savedTokensForSection = new HashMap<Class, List<CoreLabel>>();
boolean markSingleSentence = false;
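// Walk the token stream: tokens whose text parses as an XML tag are consumed and
// used to update the running context (enclosing tags, utterances, speakers,
// sections, forced sentence boundaries), while real text tokens are kept and
// decorated with that context. E.g., for "<post> Hello there </post>", the tag
// tokens would be dropped and "Hello" and "there" kept, subject to the configured
// tag matchers.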
for (CoreLabel token : tokens) {
String word = token.word().trim();
XMLUtils.XMLTag tag = XMLUtils.parseTag(word);
// If it's not a tag, we do manipulations such as unescaping
if (tag == null) {
// TODO: put this into the lexer instead of here
token.setWord(XMLUtils.unescapeStringForXML(token.word()));
// TODO: was there another annotation that also represents the word?
if (matchDepth > 0 ||
xmlTagMatcher == null ||
xmlTagMatcher.matcher("").matches()) {
newTokens.add(token);
if (inUtterance) {
token.set(CoreAnnotations.UtteranceAnnotation.class, utteranceIndex);
if (currentSpeaker != null) token.set(CoreAnnotations.SpeakerAnnotation.class, currentSpeaker);
}
if (markSingleSentence) {
token.set(CoreAnnotations.ForcedSentenceUntilEndAnnotation.class, true);
markSingleSentence = false;
}
if (tokenAnnotations != null) {
ChunkAnnotationUtils.copyUnsetAnnotations(tokenAnnotations, token);
}
}
// if we removed any text, and the tokens are "invertible" and
// therefore keep track of their before/after text, append
// what we removed to the appropriate tokens
if (removedText.length() > 0) {
boolean added = false;
String before = token.get(CoreAnnotations.BeforeAnnotation.class);
if (before != null) {
token.set(CoreAnnotations.BeforeAnnotation.class, removedText + before);
added = true;
}
if (added && newTokens.size() > 1) {
CoreLabel previous = newTokens.get(newTokens.size() - 2);
String after = previous.get(CoreAnnotations.AfterAnnotation.class);
if (after != null)
previous.set(CoreAnnotations.AfterAnnotation.class, after + removedText);
else
previous.set(CoreAnnotations.AfterAnnotation.class, removedText.toString());
}
removedText = new StringBuilder();
}
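// Cache the current XML context (the list of enclosing tag names) so that all
// tokens sharing the same context can share one immutable list.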
if (currentTagSet == null) {
// We wrap the list in an unmodifiable list because we reuse
// the same list object many times. We don't want to
// let someone modify one list and screw up all the others.
currentTagSet =
Collections.unmodifiableList(new ArrayList<String>(enclosingTags));
}
token.set(CoreAnnotations.XmlContextAnnotation.class, currentTagSet);
// is this token part of the doc date sequence?
if (dateTagMatcher != null &&
currentTagSet.size() > 0 &&
dateTagMatcher.matcher(currentTagSet.get(currentTagSet.size() - 1)).matches()) {
docDateTokens.add(token);
}
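// is this token part of the doc id sequence?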
if (docIdTagMatcher != null &&
currentTagSet.size() > 0 &&
docIdTagMatcher.matcher(currentTagSet.get(currentTagSet.size() - 1)).matches()) {
docIdTokens.add(token);
}
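// is this token part of the doc type sequence?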
if (docTypeTagMatcher != null &&
currentTagSet.size() > 0 &&
docTypeTagMatcher.matcher(currentTagSet.get(currentTagSet.size() - 1)).matches()) {
docTypeTokens.add(token);
}
if (inSpeakerTag) {
speakerTokens.add(token);
}
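// If we are inside a section, remember the first real token of the section and
// accumulate tokens for any section annotations being collected, skipping tokens
// that the sentence splitter is configured to discard.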
if (sectionStartTag != null) {
boolean okay = true;
if (ssplitDiscardTokensMatcher != null) {
okay = !ssplitDiscardTokensMatcher.matcher(token.word()).matches();
}
if (okay) {
if (sectionStartToken == null) {
sectionStartToken = token;
}
// Add this token to each saved list of section tokens
for (List<CoreLabel> saved : savedTokensForSection.values()) {
saved.add(token);
}
}
}
continue;
}
// At this point, we know we have a tag.
// We are removing this token and its associated text,
// so keep track of what we remove.
String currentRemoval = token.get(CoreAnnotations.BeforeAnnotation.class);
if (currentRemoval != null)
removedText.append(currentRemoval);
currentRemoval = token.get(CoreAnnotations.OriginalTextAnnotation.class);
if (currentRemoval != null)
removedText.append(currentRemoval);
if (token == tokens.get(tokens.size() - 1)) {
currentRemoval = token.get(CoreAnnotations.AfterAnnotation.class);
if (currentRemoval != null)
removedText.append(currentRemoval);
}
// Process tag
// Check if we want to annotate anything using the tag's attributes
if (!toAnnotate.isEmpty() && tag.attributes != null) {
Set<Class> foundAnnotations = annotateWithTag(annotation, annotation, tag, docAnnotationPatterns, null, toAnnotate, null);
toAnnotate.removeAll(foundAnnotations);
}
// Check if the tag matches a section
if (sectionTagMatcher != null && sectionTagMatcher.matcher(tag.name).matches()) {
if (tag.isEndTag) {
annotateWithTag(annotation, sectionAnnotations, tag, sectionAnnotationPatterns, savedTokensForSection, null, null);
if (sectionStartToken != null) {
sectionStartToken.set(CoreAnnotations.SectionStartAnnotation.class, sectionAnnotations);
}
// Mark previous token as forcing sentence and section end
if (newTokens.size() > 0) {
CoreLabel previous = newTokens.get(newTokens.size() - 1);
previous.set(CoreAnnotations.ForcedSentenceEndAnnotation.class, true);
previous.set(CoreAnnotations.SectionEndAnnotation.class, sectionStartTag.name);
}
savedTokensForSection.clear();
sectionStartTag = null;
sectionStartToken = null;
sectionAnnotations = null;
} else if (!tag.isSingleTag) {
// Prepare to mark first token with section information
sectionStartTag = tag;
sectionAnnotations = new ArrayCoreMap();
sectionAnnotations.set(CoreAnnotations.SectionAnnotation.class, sectionStartTag.name);
}
}
if (sectionStartTag != null) {
// store away annotations for section
annotateWithTag(annotation, sectionAnnotations, tag, sectionAnnotationPatterns, savedTokensForSection, null, null);
}
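// Remember annotations from this tag's attributes so that later text tokens can
// pick them up (via copyUnsetAnnotations earlier in the loop)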
if (tokenAnnotations != null) {
annotateWithTag(annotation, tokenAnnotations, tag, tokenAnnotationPatterns, null, null, savedTokenAnnotations);
}
// If the tag matches the sentence-ending tags and we already have
// some tokens, mark the last token as a place where we want to
// end the sentence.
if (sentenceEndingTagMatcher != null &&
sentenceEndingTagMatcher.matcher(tag.name).matches() &&
newTokens.size() > 0) {
CoreLabel previous = newTokens.get(newTokens.size() - 1);
previous.set(CoreAnnotations.ForcedSentenceEndAnnotation.class, true);
}
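// Utterance turn tags start or end an utterance: bump the utterance index on an
// opening tag, and clear the current speaker when the turn ends.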
if (utteranceTurnTagMatcher != null && utteranceTurnTagMatcher.matcher(tag.name).matches()) {
if (newTokens.size() > 0) {
// An utterance turn also forces a sentence end
CoreLabel previous = newTokens.get(newTokens.size() - 1);
previous.set(CoreAnnotations.ForcedSentenceEndAnnotation.class, true);
}
inUtterance = !(tag.isEndTag || tag.isSingleTag);
if (inUtterance) {
utteranceIndex++;
}
if (!inUtterance) {
currentSpeaker = null;
}
}
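// Speaker tags: the tokens between the open and close tags were collected into
// speakerTokens above; on the closing tag, record their text as the current
// speaker and annotate each of those tokens with it.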
if (speakerTagMatcher != null && speakerTagMatcher.matcher(tag.name).matches()) {
if (newTokens.size() > 0) {
// The speaker tag is not really part of the sentence
CoreLabel previous = newTokens.get(newTokens.size() - 1);
previous.set(CoreAnnotations.ForcedSentenceEndAnnotation.class, true);
}
inSpeakerTag = !(tag.isEndTag || tag.isSingleTag);
if (tag.isEndTag) {
currentSpeaker = tokensToString(annotation, speakerTokens);
MultiTokenTag.Tag mentionTag = new MultiTokenTag.Tag(currentSpeaker, "Speaker", speakerTokens.size());
int i = 0;
for (CoreLabel t : speakerTokens) {
t.set(CoreAnnotations.SpeakerAnnotation.class, currentSpeaker);
t.set(CoreAnnotations.MentionTokenAnnotation.class, new MultiTokenTag(mentionTag, i));
i++;
}
} else {
currentSpeaker = null;
}
speakerTokens.clear();
}
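// Tags matching the single sentence pattern force everything between the open
// and close tags to be treated as one sentence.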
if (singleSentenceTagMatcher != null && singleSentenceTagMatcher.matcher(tag.name).matches()) {
if (tag.isEndTag) {
// Mark previous token as forcing sentence end
if (newTokens.size() > 0) {
CoreLabel previous = newTokens.get(newTokens.size() - 1);
previous.set(CoreAnnotations.ForcedSentenceEndAnnotation.class, true);
}
markSingleSentence = false;
} else if (!tag.isSingleTag) {
// Force the rest of the tokens into a single sentence until a ForcedSentenceEnd is seen
markSingleSentence = true;
}
}
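// The remainder of the loop maintains the stack of enclosing tag names and the
// match depth; there is nothing to track if no xml tag pattern was given, and
// self-closing tags do not affect nesting.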
if (xmlTagMatcher == null)
continue;
if (tag.isSingleTag) {
continue;
}
// at this point, we can't reuse the "currentTagSet" list
// any more, since the current tag set has changed
currentTagSet = null;
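// End tags pop the enclosing tag stack (tolerating mismatches only if flawed XML
// is allowed); open tags push onto it. matchDepth tracks how many of the
// currently open tags match the configured xml tag pattern.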
if (tag.isEndTag) {
while (true) {
if (enclosingTags.isEmpty()) {
throw new IllegalArgumentException("Got a close tag " + tag.name +
" which does not match" +
" any open tag");
}
String lastTag = enclosingTags.pop();
if (xmlTagMatcher.matcher(lastTag).matches()){
--matchDepth;
}
if (lastTag.equals(tag.name))
break;
if (!allowFlawedXml)
throw new IllegalArgumentException("Mismatched tags... " +
tag.name + " closed a " +
lastTag + " tag.");
}
if (matchDepth < 0) {
// this should be impossible, since we already assert that
// the tags match up correctly
throw new AssertionError("Programming error? We think there " +
"have been more close tags than open tags");
}
} else {
// open tag, since all other cases are exhausted
enclosingTags.push(tag.name);
if (xmlTagMatcher.matcher(tag.name).matches())
matchDepth++;
}
}
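// After processing all tokens, any tags still on the stack were never closed.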
if (enclosingTags.size() > 0 && !allowFlawedXml) {
throw new IllegalArgumentException("Unclosed tags, starting with " +
enclosingTags.pop());
}
// If we ended with a string of xml tokens, that text needs to be
// appended to the "AfterAnnotation" of one of the tokens...
// Note that we clear removedText when we see a real token, so
// if removedText is not empty, that must be because we just
// dropped an xml tag. Therefore we ignore that old After
// annotation, since that text was already absorbed in the Before
// annotation of the xml tag we threw away
if (newTokens.size() > 0 && removedText.length() > 0) {
CoreLabel lastToken = newTokens.get(newTokens.size() - 1);
// sometimes AfterAnnotation seems to be null even when we are
// collecting before & after annotations, but OriginalTextAnnotation
// is only non-null if we are invertible. Hopefully.
if (lastToken.get(CoreAnnotations.OriginalTextAnnotation.class) != null) {
lastToken.set(CoreAnnotations.AfterAnnotation.class, removedText.toString());
}
}
// Populate docid, docdate, doctype
if (annotation != null) {