*
* @throws IOException If there is an error parsing the document.
*/
private void addContent(Document document, InputStream is, String documentLocation) throws IOException
{
    // Loads the PDF from "is", extracts its text and document information, and adds
    // them as fields to "document". "documentLocation" is only used in the error
    // message raised when the document cannot be opened with the default password.
    PDDocument pdfDocument = null;
    try
    {
        if (useNonSeqParser)
        {
            pdfDocument = PDDocument.loadNonSeq(is, "");
        }
        else
        {
            pdfDocument = PDDocument.load(is);
            if (pdfDocument.isEncrypted())
            {
                // Just try using the default (empty) password and move on
                StandardDecryptionMaterial sdm = new StandardDecryptionMaterial("");
                pdfDocument.openProtection(sdm);
            }
        }

        // Lazily create the shared text stripper on first use.
        if (stripper == null)
        {
            stripper = new PDFTextStripper();
        }

        // Collect the extracted text in memory; it is used twice below
        // (once for the "contents" field and once for the summary).
        StringWriter writer = new StringWriter();
        stripper.writeText(pdfDocument, writer);
        String contents = writer.toString();

        // Add the extracted text as a Reader-valued text field so it will
        // get tokenized and indexed.
        StringReader reader = new StringReader(contents);
        addTextField(document, "contents", reader);

        PDDocumentInformation info = pdfDocument.getDocumentInformation();
        if (info != null)
        {
            addTextField(document, "Author", info.getAuthor());
            try
            {
                addTextField(document, "CreationDate", info.getCreationDate());
            }
            catch (IOException ignored)
            {
                // ignore, bad date but continue with indexing
            }
            addTextField(document, "Creator", info.getCreator());
            addTextField(document, "Keywords", info.getKeywords());
            try
            {
                addTextField(document, "ModificationDate", info.getModificationDate());
            }
            catch (IOException ignored)
            {
                // ignore, bad date but continue with indexing
            }
            addTextField(document, "Producer", info.getProducer());
            addTextField(document, "Subject", info.getSubject());
            addTextField(document, "Title", info.getTitle());
            addTextField(document, "Trapped", info.getTrapped());
        }

        // The summary is at most the first 500 characters of the extracted text.
        int summarySize = Math.min(contents.length(), 500);
        String summary = contents.substring(0, summarySize);
        // Add the summary through the unindexed-field helper so that it can be
        // returned with hit documents for display.
        addUnindexedField(document, "summary", summary);
    }
    catch (InvalidPasswordException e)
    {
        // they didn't supply a password and the default of "" was wrong.
        IOException encrypted = new IOException(
                "Error: The document(" + documentLocation + ") is encrypted and will not be indexed.");
        // Keep the original exception as the cause so its stack trace is not lost.
        // initCause is used instead of the (String, Throwable) constructor, which
        // only exists from Java 6 onwards.
        encrypted.initCause(e);
        throw encrypted;
    }
    finally
    {
        // Always release the PDF, whether or not indexing succeeded.
        if (pdfDocument != null)
        {
            pdfDocument.close();
        }
    }
}