Blob blob = (Blob) value;
InputStream is = null;
// TODO make write limit configurable
WriteOutContentHandler woh = new WriteOutContentHandler(500 * 1000); // 500K limit (Tika default: 100K)
BodyContentHandler ch = new BodyContentHandler(woh);
try {
is = repository.getTable(table).getInputStream(record, fieldType.getName(), indexes);
Metadata metadata = new Metadata();
metadata.add(Metadata.CONTENT_TYPE, blob.getMediaType());
if (blob.getName() != null) {
metadata.add(Metadata.RESOURCE_NAME_KEY, blob.getName());
}
ParseContext parseContext = new ParseContext();
tikaParser.parse(is, ch, metadata, parseContext);
} catch (Throwable t) {
if (woh.isWriteLimitReached(t)) {
// ok, we'll just use the partial result
if (log.isInfoEnabled()) {
log.info("Blob extraction: write limit reached. Field '" + fieldType.getName() + "', record '"
+ record.getId() + "'.");
}