// NOTE(review): this is the interior of a larger extraction/indexing method —
// the method signature precedes this chunk and the finally block continues past it.
this.sourceFile = sourceFile;
numChunks = 0; //unknown until indexing is done
boolean success = false;
Reader reader = null;
// Stream over the file's content in the case database; closed in the finally block below.
final InputStream stream = new ReadContentInputStream(sourceFile);
try {
Metadata meta = new Metadata();
//Parse the file in a task
Tika tika = new Tika(); //new tika instance for every file, to workaround tika memory issues
ParseRequestTask parseTask = new ParseRequestTask(tika, stream, meta, sourceFile);
// Run the Tika parse on a separate executor so it can be bounded by a timeout
// (Tika can hang on malformed input); timeout scales with file size.
final Future<?> future = tikaParseExecutor.submit(parseTask);
try {
future.get(Ingester.getTimeout(sourceFile.getSize()), TimeUnit.SECONDS);
} catch (TimeoutException te) {
// Parse took too long — log to both the Tika log and the module log, then abort this file.
final String msg = NbBundle.getMessage(this.getClass(),
"AbstractFileTikaTextExtract.index.tikaParseTimeout.text",
sourceFile.getId(), sourceFile.getName());
KeywordSearch.getTikaLogger().log(Level.WARNING, msg, te);
logger.log(Level.WARNING, msg);
throw new IngesterException(msg);
} catch (Exception ex) {
// Any other parse failure (ExecutionException, InterruptedException, ...) is treated the same way.
final String msg = NbBundle.getMessage(this.getClass(),
"AbstractFileTikaTextExtract.index.exception.tikaParse.msg",
sourceFile.getId(), sourceFile.getName());
KeywordSearch.getTikaLogger().log(Level.WARNING, msg, ex);
logger.log(Level.WARNING, msg);
throw new IngesterException(msg);
}
// get the reader with the results
reader = parseTask.getReader();
if (reader == null) {
//likely due to exception in parse()
logger.log(Level.WARNING, "No reader available from Tika parse"); //NON-NLS
return false;
}
// break the results into chunks and index
success = true;
long readSize;
long totalRead = 0; // chars accumulated in textChunkBuf for the current chunk
boolean eof = false;
//we read max 1024 chars at time, this seems to max what this Reader would return
// Chunking loop: each iteration fills textChunkBuf up to roughly MAX_EXTR_TEXT_CHARS,
// extends to a whitespace boundary so words are not split across chunks, then indexes the chunk.
while (!eof) {
// Initial read for this chunk; Reader.read returns -1 at end of stream.
readSize = reader.read(textChunkBuf, 0, SINGLE_READ_CHARS);
if (readSize == -1) {
eof = true;
}
else {
totalRead += readSize;
}
//consume more bytes to fill entire chunk (leave EXTRA_CHARS to end the word)
while (!eof && (totalRead < MAX_EXTR_TEXT_CHARS - SINGLE_READ_CHARS - EXTRA_CHARS)
&& (readSize = reader.read(textChunkBuf, (int) totalRead, SINGLE_READ_CHARS)) != -1) {
totalRead += readSize;
}
// NOTE(review): Reader.read may legally return 0; a stream of 0-length reads would
// not terminate these loops — presumably Tika's reader never does that, but unverified here.
if (readSize == -1) {
//this is the last chunk
eof = true;
} else {
//try to read char-by-char until whitespace to not break words
// NOTE(review): indexing textChunkBuf[totalRead - 1] assumes totalRead >= 1 here,
// i.e. at least one char was read into this chunk — TODO confirm for empty streams.
while ((totalRead < MAX_EXTR_TEXT_CHARS - 1)
&& !Character.isWhitespace(textChunkBuf[(int) totalRead - 1])
&& (readSize = reader.read(textChunkBuf, (int) totalRead, 1)) != -1) {
totalRead += readSize;
}
if (readSize == -1) {
//this is the last chunk
eof = true;
}
}
//logger.log(Level.INFO, "TOTAL READ SIZE: " + totalRead + " file: " + sourceFile.getName());
//encode to bytes to index as byte stream
String extracted;
//add BOM and trim the 0 bytes
//set initial size to chars read + bom + metadata (roughly) - try to prevent from resizing
StringBuilder sb = new StringBuilder((int) totalRead + 1000);
//inject BOM here (saves byte buffer realloc later), will be converted to specific encoding BOM
//sb.append(UTF16BOM); disabled prepending of BOM
if (totalRead < MAX_EXTR_TEXT_CHARS) {
// Only append the chars actually read; the rest of the buffer is stale data from prior chunks.
sb.append(textChunkBuf, 0, (int) totalRead);
} else {
// NOTE(review): this branch appends the whole buffer — it relies on totalRead equaling
// the buffer length exactly when the chunk is full; TODO confirm buffer is sized MAX_EXTR_TEXT_CHARS.
sb.append(textChunkBuf);
}
//reset for next chunk
totalRead = 0;
//append meta data if last chunk
if (eof) {
//sort meta data keys
List<String> sortedKeyList = Arrays.asList(meta.names());
Collections.sort(sortedKeyList);
sb.append("\n\n------------------------------METADATA------------------------------\n\n"); //NON-NLS
for (String key : sortedKeyList) {
String value = meta.get(key);
sb.append(key).append(": ").append(value).append("\n");
}
}
extracted = sb.toString();
//converts BOM automatically to charSet encoding
byte[] encodedBytes = extracted.getBytes(OUTPUT_CHARSET);
// Chunk IDs are 1-based: numChunks is incremented only after a successful index.
AbstractFileChunk chunk = new AbstractFileChunk(this, this.numChunks + 1);
try {
chunk.index(ingester, encodedBytes, encodedBytes.length, OUTPUT_CHARSET);
++this.numChunks;
} catch (Ingester.IngesterException ingEx) {
success = false;
logger.log(Level.WARNING, "Ingester had a problem with extracted strings from file '" //NON-NLS
+ sourceFile.getName() + "' (id: " + sourceFile.getId() + ").", ingEx); //NON-NLS
throw ingEx; //need to rethrow/return to signal error and move on
}
}
} catch (IOException ex) {
// Read failure mid-extraction: record it and fall through — partial chunks may already be indexed.
final String msg = "Exception: Unable to read Tika content stream from " + sourceFile.getId() + ": " + sourceFile.getName(); //NON-NLS
KeywordSearch.getTikaLogger().log(Level.WARNING, msg, ex);
logger.log(Level.WARNING, msg);
success = false;
} catch (Exception ex) {
// Catch-all boundary for unexpected runtime errors from Tika/reader code.
final String msg = "Exception: Unexpected error, can't read Tika content stream from " + sourceFile.getId() + ": " + sourceFile.getName(); //NON-NLS
KeywordSearch.getTikaLogger().log(Level.WARNING, msg, ex);
logger.log(Level.WARNING, msg);
success = false;
} finally {
// Best-effort cleanup: close the content stream, then the reader (continues past this chunk).
try {
stream.close();
} catch (IOException ex) {
logger.log(Level.WARNING, "Unable to close Tika content stream from " + sourceFile.getId(), ex); //NON-NLS
}
try {
if (reader != null) {