if (info.getLevel() >= 0) {
// Extracts a PDF document: reads author/title/subject metadata, strips the
// text, turns it into generic keyword tuples and stores the resulting
// document through PluginManager. Every resource opened here (source stream,
// text buffer, writer, PDDocument) is released in the finally block below.
InputStream in = UtilExtract.getStream(info.getUri());
ByteArrayOutputStream bout = null;
Writer writer = null;
PDDocument document = null;
try {
    PDFTextStripper stripper = new PDFTextStripper();
    stripper.setLineSeparator("\n");
    //load the document
    document = PDDocument.load(in);
    String author = "";
    String title = "";
    String summary = "";
    //get the additional data
    try {
        PDDocumentInformation pdfinfo = document.getDocumentInformation();
        if (!Util.isEmpty(pdfinfo.getAuthor())) {
            author = pdfinfo.getAuthor();
        }
        if (!Util.isEmpty(pdfinfo.getTitle())) {
            title = pdfinfo.getTitle();
        }
        if (!Util.isEmpty(pdfinfo.getSubject())) {
            summary = pdfinfo.getSubject();
        }
    } catch (Exception eR) {
        // metadata is optional: log and continue with the empty defaults
        String message = MessageUtil.getMessage("extractor.pdf.metadatamissing",
                new Object[] { info.getUri() });
        logger.info(message);
    }
    //set the buffer; a fixed charset keeps the encode/decode round trip
    //lossless regardless of the platform default encoding
    bout = new ByteArrayOutputStream();
    writer = new OutputStreamWriter(bout, "UTF-8");
    //strip the document to the buffer
    stripper.writeText(document, writer);
    writer.flush();
    //construct the patterns (to not ignore and replace)
    Pattern notIgnorePattern = Pattern.compile(getNotIgnoreChars());
    Pattern replacePattern = Pattern.compile(getReplaceChars());
    NodeStruct node = new NodeStruct();
    try {
        String strippedText = bout.toString("UTF-8");
        final int chunkSize = 1024;
        int textLength = strippedText.length();
        int chunkStart = 0;
        while (chunkStart < textLength) {
            // Take roughly chunkSize characters, then extend the chunk to the
            // next whitespace so that neither a word nor a surrogate pair is
            // cut in two at the chunk boundary.
            int chunkEnd = Math.min(chunkStart + chunkSize, textLength);
            while (chunkEnd < textLength
                    && !Character.isWhitespace(strippedText.charAt(chunkEnd))) {
                chunkEnd++;
            }
            String chars = strippedText.substring(chunkStart, chunkEnd);
            //generate the list of the words for the chunk
            LinkedList listWords = UtilExtract.getValueList(chars,
                    getMinLengthWord(), notIgnorePattern,
                    replacePattern);
            // iterate instead of get(j): indexed access on a LinkedList is linear
            for (java.util.Iterator wordIt = listWords.iterator(); wordIt.hasNext();) {
                node.addTuple(TupleStruct.KEYWORD_GENERIC,
                        (String) wordIt.next());
            }
            chunkStart = chunkEnd;
        }
        logger.debug("Title is " + title + "Path is :" +
                info.getUri() + "author" + author + " Summary:" +
                summary);
        //set the summary field according to the default settings
        if (summary.length() > getMaxLengthSummary()) {
            summary = summary.substring(0, getMaxLengthSummary());
        }
        DocumStruct doc = new DocumStruct();
        doc.setTitle(title);
        doc.setPath(info.getUri());
        doc.setDescription(summary);
        doc.setContent(node);
        doc.setCategoryName(info.getCategoryName());
        doc.setCategoryLocation(info.getCategoryLocation());
        //set the pdf -author
        doc.setAuthor(author);
        //store and reindex document
        PluginManager.storeAndAddDocument(doc);
    } catch (IOException e) {
        logger.debug("Exception in reading the document text" +
                e.getMessage(), e);
        throw new RpException("extractor.pdf.textdatamissing",
                new Object[] { info.getUri() });
    }
} catch (IOException e) {
    logger.debug("Exception in reading the document text" +
            e.getMessage(), e);
    throw new RpException("app.extract.error",
            new Object[] { info.getUri() });
} finally {
    // Close each resource in its own try so that one failing close() cannot
    // prevent the remaining resources from being released. Failures are
    // logged only: cleanup is best-effort and must not mask the real outcome.
    try {
        if (writer != null) {
            writer.close();
        }
    } catch (Exception e) {
        logger.debug("Could not close the text writer", e);
    }
    try {
        if (bout != null) {
            bout.close();
        }
    } catch (Exception e) {
        logger.debug("Could not close the text buffer", e);
    }
    try {
        if (document != null) {
            document.close();
        }
    } catch (Exception e) {
        logger.debug("Could not close the PDF document", e);
    }
    try {
        if (in != null) {
            in.close();
        }
    } catch (Exception e) {
        logger.debug("Could not close the source stream", e);
    }
}
} else {