// Package org.dspace.app.mediafilter
//
// Source Code of org.dspace.app.mediafilter.PDFFilter

/**
* The contents of this file are subject to the license and copyright
* detailed in the LICENSE and NOTICE files at the root of the source
* tree and available online at
*
* http://www.dspace.org/license/
*/
package org.dspace.app.mediafilter;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;

import org.apache.log4j.Logger;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
import org.dspace.core.ConfigurationManager;

/*
*
* to do: helpful error messages - can't find mediafilter.cfg - can't
* instantiate filter - bitstream format doesn't exist
*
*/
public class PDFFilter extends MediaFilter
{

    private static Logger log = Logger.getLogger(PDFFilter.class);

    public String getFilteredName(String oldFilename)
    {
        return oldFilename + ".txt";
    }

    /**
     * @return String bundle name
     *
     */
    public String getBundleName()
    {
        return "TEXT";
    }

    /**
     * @return String bitstreamformat
     */
    public String getFormatString()
    {
        return "Text";
    }

    /**
     * @return String description
     */
    public String getDescription()
    {
        return "Extracted text";
    }

    /**
     * @param source
     *            source input stream
     *
     * @return InputStream the resulting input stream
     */
    public InputStream getDestinationStream(InputStream source)
            throws Exception
    {
        try
        {
            boolean useTemporaryFile = ConfigurationManager.getBooleanProperty("pdffilter.largepdfs", false);

            // get input stream from bitstream
            // pass to filter, get string back
            PDFTextStripper pts = new PDFTextStripper();
            PDDocument pdfDoc = null;
            Writer writer = null;
            File tempTextFile = null;
            ByteArrayOutputStream byteStream = null;

            if (useTemporaryFile)
            {
                tempTextFile = File.createTempFile("dspacepdfextract" + source.hashCode(), ".txt");
                tempTextFile.deleteOnExit();
                writer = new OutputStreamWriter(new FileOutputStream(tempTextFile));
            }
            else
            {
                byteStream = new ByteArrayOutputStream();
                writer = new OutputStreamWriter(byteStream);
            }
           
            try
            {
                pdfDoc = PDDocument.load(source);
                pts.writeText(pdfDoc, writer);
            }
            finally
            {
                try
                {
                    if (pdfDoc != null)
                    {
                        pdfDoc.close();
                    }
                }
                catch(Exception e)
                {
                   log.error("Error closing PDF file: " + e.getMessage(), e);
                }

                try
                {
                    writer.close();
                }
                catch(Exception e)
                {
                   log.error("Error closing temporary extract file: " + e.getMessage(), e);
                }
            }

            if (useTemporaryFile)
            {
                return new FileInputStream(tempTextFile);
            }
            else
            {
                byte[] bytes = byteStream.toByteArray();
                return new ByteArrayInputStream(bytes);
            }
        }
        catch (OutOfMemoryError oome)
        {
            log.error("Error parsing PDF document " + oome.getMessage(), oome);
            if (!ConfigurationManager.getBooleanProperty("pdffilter.skiponmemoryexception", false))
            {
                throw oome;
            }
        }

        return null;
    }
}
// TOP
//
// Related Classes of org.dspace.app.mediafilter.PDFFilter
//
// TOP
// Copyright © 2018 www.massapi.com. All rights reserved.
// All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.