@Override
public void extractFrom( final Binary binary,
final TextExtractor.Output output,
final Context context ) throws Exception {
final DefaultParser parser = initialize();
final Integer writeLimit = this.writeLimit;
processStream(binary, new BinaryOperation<Object>() {
@Override
public Object execute( InputStream stream ) throws Exception {
Metadata metadata = prepareMetadata(binary, context);
//TODO author=Horia Chiorean date=1/30/13 description=Tika 1.2's TXTParser seems to have a bug: it always adds 1 ignorable whitespace character to the actual characters being parsed, hence the "writeLimit + 1" passed to the handler below.
//See https://issues.apache.org/jira/browse/TIKA-1069
ContentHandler textHandler = writeLimit == null ? new BodyContentHandler() : new BodyContentHandler(writeLimit + 1);
try {
LOGGER.debug("Using TikaTextExtractor to extract text");
// Parse the input stream ...
parser.parse(stream, textHandler, metadata, new ParseContext());
} catch (SAXException sae) {
LOGGER.warn(TikaI18n.parseExceptionWhileExtractingText, sae.getMessage());
} catch (NoClassDefFoundError ncdfe) {
LOGGER.warn(TikaI18n.warnNoClassDefFound, ncdfe.getMessage());
} catch (Throwable e) {