output = new OutputStreamWriter(
new FileOutputStream( outputFile ) );
}
}
PDFTextStripper stripper = null;
if(toHTML)
{
stripper = new PDFText2HTML(encoding);
}
else
{
stripper = new PDFTextStripper(encoding);
}
stripper.setForceParsing( force );
stripper.setSortByPosition( sort );
stripper.setShouldSeparateByBeads( separateBeads );
stripper.setStartPage( startPage );
stripper.setEndPage( endPage );
startTime = startProcessing("Starting text extraction");
if (debug)
{
System.err.println("Writing to "+outputFile);
}
// Extract text for main document:
stripper.writeText( document, output );
// ... also for any embedded PDFs:
PDDocumentCatalog catalog = document.getDocumentCatalog();
PDDocumentNameDictionary names = catalog.getNames();
if (names != null)
{
PDEmbeddedFilesNameTreeNode embeddedFiles = names.getEmbeddedFiles();
if (embeddedFiles != null)
{
Map<String,COSObjectable> embeddedFileNames = embeddedFiles.getNames();
if (embeddedFileNames != null) {
for (Map.Entry<String,COSObjectable> ent : embeddedFileNames.entrySet())
{
if (debug)
{
System.err.println("Processing embedded file " + ent.getKey() + ":");
}
PDComplexFileSpecification spec = (PDComplexFileSpecification) ent.getValue();
PDEmbeddedFile file = spec.getEmbeddedFile();
if (file.getSubtype().equals("application/pdf"))
{
if (debug)
{
System.err.println(" is PDF (size=" + file.getSize() + ")");
}
InputStream fis = file.createInputStream();
PDDocument subDoc = null;
try
{
subDoc = PDDocument.load(fis);
}
finally
{
fis.close();
}
try
{
stripper.writeText( subDoc, output );
}
finally
{
subDoc.close();
}