public Parse getParse(Content content) {
// check that contentType is one we can handle
String contentType = content.getContentType();
if (contentType != null && !contentType.startsWith("application/pdf"))
return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_INVALID_FORMAT,
"Content-Type not application/pdf: " + contentType).getEmptyParse();
// in memory representation of pdf file
PDDocument pdf = null;
String text = null;
String title = null;
try {
byte[] raw = content.getContent();
String contentLength = content.get("Content-Length");
if (contentLength != null
&& raw.length != Integer.parseInt(contentLength)) {
return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED,
"Content truncated at "+raw.length
+" bytes. Parser can't handle incomplete pdf file.").getEmptyParse();
}
PDFParser parser = new PDFParser(
new ByteArrayInputStream(raw));
parser.parse();
pdf = parser.getPDDocument();
if (pdf.isEncrypted()) {
DocumentEncryption decryptor = new DocumentEncryption(pdf);
//Just try using the default password and move on
decryptor.decryptDocument("");
}
// collect text
PDFTextStripper stripper = new PDFTextStripper();
text = stripper.getText(pdf);
// collect title
PDDocumentInformation info = pdf.getDocumentInformation();
title = info.getTitle();
// Additional document metadata that is available but currently unused; please keep these for future use.
// pdf.getPageCount();
// info.getAuthor()
// info.getSubject()
// info.getKeywords()
// info.getCreator()
// info.getProducer()
// info.getTrapped()
// formatDate(info.getCreationDate())
// formatDate(info.getModificationDate())
} catch (CryptographyException e) {
return new ParseStatus(ParseStatus.FAILED,
"Error decrypting document. " + e).getEmptyParse();
} catch (InvalidPasswordException e) {
return new ParseStatus(ParseStatus.FAILED,
"Can't decrypt document - invalid password. " + e).getEmptyParse();
} catch (Exception e) { // run time exception
return new ParseStatus(ParseStatus.FAILED,
"Can't be handled as pdf document. " + e).getEmptyParse();
} finally {
try {
if (pdf != null)
pdf.close();