LOGGER.info("Initialising the TREC document factory");
// The parser is an SGML BulletParser with TREC vocabulary
this.parser = new BulletParser(TRECParsingFactory.INSTANCE);
ComposedCallbackBuilder composedBuilder = new ComposedCallbackBuilder();
composedBuilder.add(this.textExtractor = new StructuredTextExtractor());
this.textExtractor.ignore(
TRECParsingFactory.ELEMENT_DOCNO,
TRECParsingFactory.ELEMENT_FILEID,
TRECParsingFactory.ELEMENT_FIRST,
TRECParsingFactory.ELEMENT_SECOND
);
parser.setCallback(composedBuilder.compose());
this.wordReader = new FastBufferedReader();
text = new char[DEFAULT_BUFFER_SIZE];
}