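* @return the non-empty tokens collected from the article's text, without duplicates
* @throws IOException declared by the signature; I/O errors raised while tokenizing, ending, or closing the tokenizer are logged and not rethrown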
*/
static Set<String> getTokens(Article article) throws IOException {
Set<String> tokenList = new HashSet<String>();
WikipediaTokenizer tok = new WikipediaTokenizer(new StringReader(article.getText()));
TermAttribute term = tok.addAttribute(TermAttribute.class);
try {
while (tok.incrementToken()) {
String token = term.term();
if (!StringUtils.isEmpty(token))
tokenList.add(token);
}
} catch (IOException e) {
log.error("Error tokenizing text", e);
} finally {
try {
tok.end();
} catch (IOException e) {
log.error("Error calling end()", e);
} finally {
try {
tok.close();
} catch (IOException e) {
log.error("Error closing tokenizer", e);
}
}
}