// Only documents whose category match is something other than the literal "Unknown" are processed.
if (!catMatch.equals("Unknown")) {
// Clean the raw text in three steps, innermost call first:
//   1. remove the first match of OPEN_TEXT_TAG_PATTERN,
//   2. remove every match of CLOSE_TEXT_TAG_PATTERN,
//   3. unescape HTML entities in what remains.
document = StringEscapeUtils.unescapeHtml(WikipediaDatasetCreatorMapper.CLOSE_TEXT_TAG_PATTERN.matcher(
WikipediaDatasetCreatorMapper.OPEN_TEXT_TAG_PATTERN.matcher(document).replaceFirst(""))
.replaceAll(""));
// Tokenize the cleaned text; catMatch is passed as the first argument of tokenStream.
TokenStream stream = analyzer.tokenStream(catMatch, new StringReader(document));
// Attribute through which the current token's characters are read inside the loop below.
TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class);
// Append each token to contents, followed by a single space (so a trailing space remains after the last token).
// NOTE(review): contents is declared outside this span -- presumably a fresh, empty buffer per record; confirm.
// NOTE(review): no stream.close() appears next to this loop -- confirm the stream is released elsewhere.
while (stream.incrementToken()) {
// Copy only the first termLength() chars of the term buffer.
contents.append(termAtt.termBuffer(), 0, termAtt.termLength()).append(' ');
}
// Emit one record: the key is catMatch with every SPACE_NON_ALPHA_PATTERN match replaced by "_",
// the value is the space-separated token text accumulated above.
output.collect(new Text(WikipediaDatasetCreatorMapper.SPACE_NON_ALPHA_PATTERN.matcher(catMatch)
.replaceAll("_")), new Text(contents.toString()));