byte[] raw = content.getContent();
String contentLength = content.getMetadata().get(Response.CONTENT_LENGTH);
if (contentLength != null
&& raw.length != Integer.parseInt(contentLength)) {
return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED,
"Content truncated at "+raw.length
+" bytes. Parser can't handle incomplete pdf file.").getEmptyParse(getConf());
}
PDFParser parser = new PDFParser(new ByteArrayInputStream(raw));
parser.parse();
pdf = parser.getPDDocument();
if (pdf.isEncrypted()) {
DocumentEncryption decryptor = new DocumentEncryption(pdf);
// Just try the default (empty) password and move on
decryptor.decryptDocument("");
}
// collect text
PDFTextStripper stripper = new PDFTextStripper();
text = stripper.getText(pdf);
// collect title
PDDocumentInformation info = pdf.getDocumentInformation();
title = info.getTitle();
// Additional useful metadata; currently not used, but please keep it for future use.
metadata.add(Metadata.PAGE_COUNT, String.valueOf(pdf.getPageCount()));
metadata.add(Metadata.AUTHOR, info.getAuthor());
metadata.add(Metadata.SUBJECT, info.getSubject());
metadata.add(Metadata.KEYWORDS, info.getKeywords());
metadata.add(Metadata.CREATOR, info.getCreator());
metadata.add(Metadata.PUBLISHER, info.getProducer());
//TODO: Figure out why the two commented-out date lines below fail with
//java.io.IOException: Error converting date:1-Jan-3 18:15PM
//metadata.put(DATE, dcDateFormatter.format(info.getCreationDate().getTime()));
//metadata.put(LAST_MODIFIED, dcDateFormatter.format(info.getModificationDate().getTime()));
} catch (CryptographyException e) {
return new ParseStatus(ParseStatus.FAILED,
"Error decrypting document. " + e).getEmptyParse(getConf());
} catch (InvalidPasswordException e) {
return new ParseStatus(ParseStatus.FAILED,
"Can't decrypt document - invalid password. " + e).getEmptyParse(getConf());
} catch (Exception e) { // run time exception
if (LOG.isWarnEnabled()) {
LOG.warn("General exception in PDF parser: "+e.getMessage());
e.printStackTrace(LogUtil.getWarnStream(LOG));
}
return new ParseStatus(ParseStatus.FAILED,
"Can't be handled as pdf document. " + e).getEmptyParse(getConf());
} finally {
try {
if (pdf != null)
pdf.close();