document = StringEscapeUtils.unescapeHtml(WikipediaDatasetCreatorMapper.CLOSE_TEXT_TAG_PATTERN.matcher(
WikipediaDatasetCreatorMapper.OPEN_TEXT_TAG_PATTERN.matcher(document).replaceFirst("")).replaceAll(""));
TokenStream stream = analyzer.tokenStream(catMatch, new StringReader(document));
TermAttribute termAtt = stream.addAttribute(TermAttribute.class);
while (stream.incrementToken()) {
contents.append(termAtt.termBuffer(), 0, termAtt.termLength()).append(' ');
}
context.write(
new Text(WikipediaDatasetCreatorMapper.SPACE_NON_ALPHA_PATTERN.matcher(catMatch).replaceAll("_")),
new Text(contents.toString()));
}