public void initialize(String text) {
taggedTokens = new ArrayList<TaggedToken>();
//Load the POS model:
Tagger posTagger = lingPipeFactory.getPoSTaggerInstance();
//1.) Tokenization
long start = System.currentTimeMillis();
List<String> tokenList = new ArrayList<String>();
List<String> whiteList = new ArrayList<String>();
Tokenizer tokenizer = lingPipeFactory.getTokenizerFactoryInstance().tokenizer(text.toCharArray(),
0, text.length());
tokenizer.tokenize(tokenList, whiteList);
LOG.trace("Tokenization took " + (System.currentTimeMillis() - start) + "ms.");
//2.) Sentence detection
start = System.currentTimeMillis();
String[] tokens = new String[tokenList.size()];
String[] whites = new String[whiteList.size()];
tokenList.toArray(tokens);
whiteList.toArray(whites);
SentenceModel sentenceModel = lingPipeFactory.getSentenceModelInstance();
sentenceBoundaries = sentenceModel.boundaryIndices(tokens, whites);
LOG.trace("Sentence segmentation took " + (System.currentTimeMillis() - start) + "ms.");
//3.) Part-of-Speech tagging
start = System.currentTimeMillis();
int sentStartToken = 0;
int sentEndToken;
int textOffset = whites[0].length();
/**
* Tag every sentence that ends with sentence-final punctuation (i < sentenceBoundaries.length).
* If tokens remain after the last detected boundary (text without final punctuation),
* treat the rest of the text as a single sentence.
*/
for (int i = 0; (i < sentenceBoundaries.length || sentStartToken < tokens.length); ++i) {
if (i < sentenceBoundaries.length) {
//We are between two sentence-final punctuation tokens.
sentEndToken = sentenceBoundaries[i];
} else {
//We are beyond the last sentence-final punctuation: Tag the rest of the text.
sentEndToken = tokens.length - 1;
}
Tagging<String> tags = posTagger.tag(tokenList.subList(sentStartToken, sentEndToken + 1));
for (int j = 0; j < tags.size(); j++) {
TaggedToken taggedToken = new TaggedToken(tags.token(j), whiteList.get(sentStartToken + j + 1), tags.tag(j), textOffset, null);
taggedTokens.add(taggedToken);
textOffset += tokens[sentStartToken + j].length() + whites[sentStartToken + j + 1].length();
}