/**
 * Extracts text, annotation contents, title and meta data from a PDF document.
 * <p>
 * The raw content is parsed into a PDF document. If the document is encrypted, it
 * is opened with an empty password; preparation fails when the resulting
 * permissions do not allow content extraction. The extracted page text (followed by
 * the contents of all annotations, if there are any) is stored as cleaned content.
 * The page count, author, subject and keywords are stored as cleaned meta data, and
 * the document title is stored as title when it is present.
 * <p>
 * The content stream and the PDF document are always closed before this method
 * returns. Failures while closing are logged at debug level and otherwise ignored.
 *
 * @param rawDocument the document whose content is read as a PDF
 * @throws RegainException if the document cannot be read or decrypted, or if the
 *         document is encrypted and content extraction is not permitted
 */
public void prepare(RawDocument rawDocument) throws RegainException {
  String url = rawDocument.getUrl();
  InputStream stream = null;
  PDDocument pdfDocument = null;
  try {
    // Create an InputStream that reads the content.
    stream = rawDocument.getContentAsStream();

    // Parse the content.
    PDFParser parser = new PDFParser(stream);
    parser.parse();
    pdfDocument = parser.getPDDocument();

    // Decrypt the PDF document, trying the empty password.
    if (pdfDocument.isEncrypted()) {
      mLog.debug("Document is encrypted: " + url);
      StandardDecryptionMaterial sdm = new StandardDecryptionMaterial("");
      pdfDocument.openProtection(sdm);
      AccessPermission ap = pdfDocument.getCurrentAccessPermission();
      if (!ap.canExtractContent()) {
        throw new RegainException("Document is encrypted and can't be opened: " + url);
      }
    }

    // Extract the text of all pages with a utility class.
    PDFTextStripper stripper = new PDFTextStripper();
    stripper.setSuppressDuplicateOverlappingText(false);
    stripper.setSortByPosition(true);
    stripper.setStartPage(1);
    stripper.setEndPage(Integer.MAX_VALUE);
    // Plain literal substitution, so String.replace is used instead of a regex.
    setCleanedContent(stripper.getText(pdfDocument).replace("visiblespace", " "));

    // Append the annotation contents, if there are any.
    String annotationText = extractAnnotationText(pdfDocument);
    if (annotationText.length() > 0) {
      setCleanedContent(getCleanedContent() + " Annotations " + annotationText);
    }

    // Get the meta data.
    PDDocumentInformation info = pdfDocument.getDocumentInformation();
    String metaData = buildMetaDataText(pdfDocument, info);
    if (info.getTitle() != null) {
      setTitle(info.getTitle());
    }
    setCleanedMetaData(metaData);

    if (mLog.isDebugEnabled()) {
      mLog.debug("Extracted meta data ::" + getCleanedMetaData()
          + ":: from " + url);
    }
  } catch (CryptographyException exc) {
    throw new RegainException("Error decrypting document: " + url, exc);
  } catch (BadSecurityHandlerException exc) {
    // They didn't supply a password and the default of "" was wrong.
    throw new RegainException("Document is encrypted: " + url, exc);
  } catch (IOException exc) {
    throw new RegainException("Error reading document: " + url, exc);
  } finally {
    // Cleanup is best effort: a failing close() must not mask an exception thrown
    // above, so it is only logged and never rethrown.
    if (stream != null) {
      try {
        stream.close();
      } catch (Exception exc) {
        mLog.debug("Could not close content stream of " + url + ": " + exc);
      }
    }
    if (pdfDocument != null) {
      try {
        pdfDocument.close();
      } catch (Exception exc) {
        mLog.debug("Could not close PDF document " + url + ": " + exc);
      }
    }
  }
}

/**
 * Collects the non-empty contents of all annotations on all pages.
 *
 * @param pdfDocument the parsed PDF document
 * @return the annotation contents, each followed by a single space; empty if no
 *         annotation has contents
 * @throws IOException if the annotations of a page cannot be read
 */
@SuppressWarnings("unchecked")
private String extractAnnotationText(PDDocument pdfDocument) throws IOException {
  StringBuilder annotsResult = new StringBuilder();
  int pageNum = 0;
  for (Object pageObject : pdfDocument.getDocumentCatalog().getAllPages()) {
    pageNum++;
    PDPage page = (PDPage) pageObject;
    List<PDAnnotation> annotations = page.getAnnotations();
    if (annotations.isEmpty()) {
      continue;
    }
    mLog.debug("Total annotations = " + annotations.size());
    mLog.debug("\nProcess Page " + pageNum + "...");
    for (PDAnnotation annotation : annotations) {
      String contents = annotation.getContents();
      if (contents != null && contents.length() > 0) {
        annotsResult.append(contents);
        annotsResult.append(" ");
        mLog.debug("Text from annotation: " + contents);
      }
    }
  }
  return annotsResult.toString();
}

/**
 * Builds the meta data text: "p.&lt;page count&gt; " followed by the author,
 * subject and keywords, each followed by a single space. Fields that are
 * {@code null} are skipped.
 *
 * @param pdfDocument the parsed PDF document, used for the page count
 * @param info the document information holding author, subject and keywords
 * @return the assembled meta data text
 */
private String buildMetaDataText(PDDocument pdfDocument, PDDocumentInformation info) {
  StringBuilder metaData = new StringBuilder();
  metaData.append("p.");
  metaData.append(Integer.toString(pdfDocument.getNumberOfPages()));
  metaData.append(" ");

  String author = info.getAuthor();
  if (author != null) {
    metaData.append(author);
    metaData.append(" ");
  }
  String subject = info.getSubject();
  if (subject != null) {
    metaData.append(subject);
    metaData.append(" ");
  }
  String keywords = info.getKeywords();
  if (keywords != null) {
    metaData.append(keywords);
    metaData.append(" ");
  }
  return metaData.toString();
}