public Parse getParse(Content content) throws ParseException {
// check that contentType is one we can handle
String contentType = content.getContentType();
if (contentType != null && !contentType.startsWith("application/pdf"))
throw new ParseException(
"Content-Type not application/pdf: "+contentType);
// in-memory representation of the pdf file
PDDocument pdf = null;
String text = null;
String title = null;
try {
byte[] raw = content.getContent();
String contentLength = content.get("Content-Length");
if (contentLength != null
&& raw.length != Integer.parseInt(contentLength)) {
throw new ParseException("Content truncated at "+raw.length
+" bytes. Parser can't handle incomplete pdf file.");
}
PDFParser parser = new PDFParser(
new ByteArrayInputStream(raw));
parser.parse();
pdf = parser.getPDDocument();
if (pdf.isEncrypted()) {
DocumentEncryption decryptor = new DocumentEncryption(pdf);
//Just try using the default password and move on
decryptor.decryptDocument("");
}
// collect text
PDFTextStripper stripper = new PDFTextStripper();
text = stripper.getText(pdf);
// collect title
PDDocumentInformation info = pdf.getDocumentInformation();
title = info.getTitle();
// More useful info, currently unused. Please keep these lines for future use.
// pdf.getPageCount();
// info.getAuthor()
// info.getSubject()
// info.getKeywords()
// info.getCreator()
// info.getProducer()
// info.getTrapped()
// formatDate(info.getCreationDate())
// formatDate(info.getModificationDate())
} catch (ParseException e) {
throw e;
} catch (CryptographyException e) {
throw new ParseException("Error decrypting document. "+e);
} catch (InvalidPasswordException e) {
throw new ParseException("Can't decrypt document. "+e);
} catch (Exception e) { // run time exception
throw new ParseException("Can't be handled as pdf document. "+e);
} finally {
try {
if (pdf != null)
pdf.close();
} catch (IOException e) {