@Override
public void endDocument() throws SAXException {
super.endDocument();
TextDocument td = toTextDocument();
try {
extractor.process(td);
} catch (BoilerpipeProcessingException e) {
throw new SAXException(e);
}
Attributes emptyAttrs = new AttributesImpl();
// At this point we have all the information we need to either emit N paragraphs
// of plain text (if not including markup), or we have to replay our recorded elements
// and only emit character runs that passed the boilerpipe filters.
if (includeMarkup) {
BitSet validCharacterRuns = new BitSet();
for (TextBlock block : td.getTextBlocks()) {
if (block.isContent()) {
BitSet bs = block.getContainedTextElements();
if (bs != null) {
validCharacterRuns.or(bs);
}
}
}
// Now have bits set for all valid character runs. Replay our recorded elements,
// but only emit character runs flagged as valid.
int curCharsIndex = headerCharOffset;
for (RecordedElement element : elements) {
switch (element.getElementType()) {
case START:
delegate.startElement(element.getUri(), element.getLocalName(), element.getQName(), element.getAttrs());
// Fall through
case CONTINUE:
// Now emit characters that are valid. Note that boilerpipe pre-increments the character index, so
// we have to follow suit.
for (char[] chars : element.getCharacters()) {
curCharsIndex++;
if (validCharacterRuns.get(curCharsIndex)) {
delegate.characters(chars, 0, chars.length);
}
}
break;
case END:
delegate.endElement(element.getUri(), element.getLocalName(), element.getQName());
break;
default:
throw new RuntimeException("Unhandled element type: " + element.getElementType());
}
}
} else {
for (TextBlock block : td.getTextBlocks()) {
if (block.isContent()) {
delegate.startElement(XHTMLContentHandler.XHTML, "p", "p", emptyAttrs);
char[] chars = block.getText().toCharArray();
delegate.characters(chars, 0, chars.length);
delegate.endElement(XHTMLContentHandler.XHTML, "p", "p");