}
private Annotation createDoc(String docId, List<IN> tokens, List<IntPair> sentenceBoundaries, boolean includeText) {
try {
String docText = includeText ? join(tokens, CoreAnnotations.TextAnnotation.class, " ") : null;
Annotation doc = new Annotation(docText);
doc.set(CoreAnnotations.DocIDAnnotation.class, docId);
Class tokensClass = Class.forName(tokensAnnotationClassName);
doc.set(tokensClass, tokens);
boolean setTokenCharOffsets = includeText;
if (setTokenCharOffsets) {
int i = 0;
for (IN token : tokens) {
String tokenText = token.get(CoreAnnotations.TextAnnotation.class);
token.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, i);
i += tokenText.length();
token.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, i);
/*
* if (i > docText.length()) { System.err.println("index " + i +
* " larger than docText length " + docText.length());
* System.err.println("Token: " + tokenText);
* System.err.println("DocText: " + docText); }
*/
assert (i <= docText.length());
i++; // Skip space
}
}
if (sentenceBoundaries != null) {
List<CoreMap> sentences = new ArrayList<CoreMap>(sentenceBoundaries.size());
for (IntPair p : sentenceBoundaries) {
// copy the sentence's tokens and read its character span from the first and last token
List<IN> sentenceTokens = new ArrayList<IN>(tokens.subList(p.getSource(), p.getTarget() + 1));
Integer begin = sentenceTokens.get(0).get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
int last = sentenceTokens.size() - 1;
Integer end = sentenceTokens.get(last).get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
String sentenceText = includeText ? join(sentenceTokens, CoreAnnotations.TextAnnotation.class, " ") : null;
// create a sentence annotation with text and token offsets
Annotation sentence = new Annotation(sentenceText);
sentence.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, begin);
sentence.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, end);
sentence.set(tokensClass, sentenceTokens);
sentence.set(CoreAnnotations.TokenBeginAnnotation.class, p.getSource());
sentence.set(CoreAnnotations.TokenEndAnnotation.class, p.getTarget() + 1);
int sentenceIndex = sentences.size();
sentence.set(CoreAnnotations.SentenceIndexAnnotation.class, sentenceIndex);
// add the sentence to the list
sentences.add(sentence);
}
// add the sentence annotations to the document