// Build one sentence CoreMap per <SENT> child of textElem. Each sentence's
// tokens come from the pre-terminals of its parse tree; every word is appended
// to `text` followed by a single space, and `offset` advances by the word
// length plus that one separator.
// NOTE(review): token/sentence offsets only coincide with positions in `text`
// if `text` is empty when this code starts -- it is declared outside this
// view, so confirm against the enclosing method.
int offset = 0;
List<CoreMap> sentences = new ArrayList<CoreMap>();
Elements sentenceElements = textElem.getChildElements("SENT");
for (int crtsent = 0; crtsent < sentenceElements.size(); crtsent++) {
  Element sentElem = sentenceElements.get(crtsent);
  CoreMap sentence = new ArrayCoreMap();
  sentence.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, offset);

  // The value of the element's first child node is parsed into a Tree.
  Tree tree = Tree.valueOf(sentElem.getChild(0).getValue()); // XXX ms: is this the same as sentElem.getText() in JDOM?

  List<CoreLabel> tokens = new ArrayList<CoreLabel>();
  for (Tree preTerminal : preTerminals(tree)) {
    // The pre-terminal's own label is used as the POS tag of its children.
    String posTag = preTerminal.value();
    for (Tree wordTree : preTerminal.children()) {
      String word = wordTree.value();
      CoreLabel token = new CoreLabel();
      // The text annotation is set once; the original set it twice with the
      // same value, which had no additional effect.
      token.set(CoreAnnotations.TextAnnotation.class, word);
      token.set(CoreAnnotations.PartOfSpeechAnnotation.class, posTag);
      token.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, offset);
      offset += word.length();
      token.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, offset);
      text.append(word);
      text.append(' ');
      offset += 1; // account for the separator just appended
      tokens.add(token);
    }
  }

  // Only when this sentence appended at least one word is the last character
  // of `text` the trailing space written above; turn it into a line break.
  // Guarding on the tokens (rather than on the pre-terminal count) ensures we
  // never rewrite a character that belongs to an earlier sentence.
  boolean hasTokens = !tokens.isEmpty();
  if (hasTokens) {
    text.setCharAt(text.length() - 1, '\n');
  }

  // With tokens, the sentence ends just before the trailing separator
  // (offset - 1). Without tokens nothing was appended, so the sentence is an
  // empty span ending where it began instead of one character before it.
  int sentenceEnd = hasTokens ? offset - 1 : offset;
  sentence.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, sentenceEnd);
  sentence.set(CoreAnnotations.TokensAnnotation.class, tokens);
  sentence.set(TreeCoreAnnotations.TreeAnnotation.class, tree);
  sentences.add(sentence);
}
String docID = docElem.getAttributeValue("id");
Matcher matcher = datePattern.matcher(docID);