Source Code of org.exoplatform.services.document.impl.PDFDocumentReader

/*
 * Copyright (C) 2009 eXo Platform SAS.
 *
 * This is free software; you can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * This software is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this software; if not, write to the Free
 * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
 * 02110-1301 USA, or see the FSF site: http://www.fsf.org.
 */
package org.exoplatform.services.document.impl;


import com.lowagie.text.pdf.PdfDate;
import com.lowagie.text.pdf.PdfReader;


import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
import org.exoplatform.commons.utils.ISO8601;
import org.exoplatform.services.document.DCMetaData;
import org.exoplatform.services.document.DocumentReadException;
import org.exoplatform.services.log.ExoLogger;
import org.exoplatform.services.log.Log;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;


import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.security.AccessController;
import java.security.PrivilegedActionException;
import java.security.PrivilegedExceptionAction;
import java.text.ParseException;
import java.util.Calendar;
import java.util.HashMap;
import java.util.Properties;


import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;


/**
 * Created by The eXo Platform SAS A parser of Adobe PDF files.
 * 
 * @author Phung Hai Nam
 * @author Gennady Azarenkov
 * @version Oct 19, 2005
 */
public class PDFDocumentReader extends BaseDocumentReader
{


   protected static Log log = ExoLogger.getLogger("exo.core.component.document.PDFDocumentReader");


   /**
    * Get the application/pdf mime type.
    * 
    * @return The application/pdf mime type.
    */
   public String[] getMimeTypes()
   {
      return new String[]{"application/pdf"};
   }


   /**
    * Returns only a text from pdf file content.
    * 
    * @param is an input stream with .pdf file content.
    * @return The string only with text from file content.
    */
   public String getContentAsText(final InputStream is) throws IOException, DocumentReadException
   {


      try
      {
         return (String)AccessController.doPrivileged(new PrivilegedExceptionAction<Object>()
         {
            public Object run() throws Exception
            {
               if (is == null)
               {
                  throw new NullPointerException("InputStream is null.");
               }
               PDDocument pdDocument = null;
               StringWriter sw = new StringWriter();
               try
               {
                  if (is.available() == 0)
                     return "";


                  try
                  {
                     pdDocument = PDDocument.load(is);
                  }
                  catch (IOException e)
                  {
                     throw new DocumentReadException("Can not load PDF document.", e);
                  }


                  PDFTextStripper stripper = new PDFTextStripper();
                  stripper.setStartPage(1);
                  stripper.setEndPage(Integer.MAX_VALUE);
                  stripper.writeText(pdDocument, sw);
               }
               finally
               {
                  if (pdDocument != null)
                     try
                     {
                        pdDocument.close();
                     }
                     catch (IOException e)
                     {
                     }
                  if (is != null)
                     try
                     {
                        is.close();
                     }
                     catch (IOException e)
                     {
                     }
               }
               return sw.toString();
            }
         });


      }
      catch (PrivilegedActionException pae)
      {
         Throwable cause = pae.getCause();
         if (cause instanceof IOException)
         {
            throw (IOException)cause;
         }
         else if (cause instanceof RuntimeException)
         {
            throw (RuntimeException)cause;
         }
         else
         {
            throw new RuntimeException(cause);
         }
      }


   }


   public String getContentAsText(InputStream is, String encoding) throws IOException, DocumentReadException
   {
      // Ignore encoding
      return getContentAsText(is);
   }


   /*
    * (non-Javadoc)
    * 
    * @see org.exoplatform.services.document.DocumentReader#getProperties(java.io.
    *      InputStream)
    */
   public Properties getProperties(InputStream is) throws IOException, DocumentReadException
   {


      Properties props = null;


      PdfReader reader = new PdfReader(is, "".getBytes());


      // Read the file metadata
      byte[] metadata = reader.getMetadata();


      if (metadata != null)
      {
         // there is XMP metadata try exctract it
         props = getPropertiesFromMetadata(metadata);
      }


      if (props == null)
      {
         // it's old pdf document version
         props = getPropertiesFromInfo(reader.getInfo());
      }
      reader.close();
      if (is != null)
         try
         {
            is.close();
         }
         catch (IOException e)
         {
         }
      return props;
   }


   /**
    * Extract properties from XMP xml.
    * 
    * @param metadata XML as byte array
    * @return extracted properties
    * @throws DocumentReadException
    * @throws Exception if extracting fails
    */
   protected Properties getPropertiesFromMetadata(byte[] metadata) throws IOException, DocumentReadException
   {


      Properties props = null;


      // parse xml


      Document doc;
      try
      {
         DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
         DocumentBuilder docBuilder = dbf.newDocumentBuilder();
         doc = docBuilder.parse(new ByteArrayInputStream(metadata));
      }
      catch (SAXException e)
      {
         throw new DocumentReadException(e.getMessage(), e);
      }
      catch (ParserConfigurationException e)
      {
         throw new DocumentReadException(e.getMessage(), e);
      }


      // Check is there PDF/A-1 XMP
      String version = "";
      NodeList list = doc.getElementsByTagName("pdfaid:conformance");
      if (list != null && list.item(0) != null)
      {
         version += list.item(0).getTextContent() + "-";
      }


      list = doc.getElementsByTagName("pdfaid:part");
      if (list != null && list.item(0) != null)
      {
         version += list.item(0).getTextContent();
      }


      // PDF/A-1a or PDF/A-1b
      if (version.equalsIgnoreCase("A-1"))
      {
         props = getPropsFromPDFAMetadata(doc);
      }


      return props;
   }


   /**
    * Extracts properties from PDF Info hash set.
    * 
    * @param Pdf Info hash set
    * @return Extracted properties
    * @throws Exception if extracting fails
    */
   @SuppressWarnings("unchecked")
   protected Properties getPropertiesFromInfo(HashMap info) throws IOException
   {
      Properties props = new Properties();


      String title = (String)info.get("Title");
      if (title != null)
      {
         props.put(DCMetaData.TITLE, title);
      }


      String author = (String)info.get("Author");
      if (author != null)
      {
         props.put(DCMetaData.CREATOR, author);
      }


      String subject = (String)info.get("Subject");
      if (subject != null)
      {
         props.put(DCMetaData.SUBJECT, subject);
      }


      String creationDate = (String)info.get("CreationDate");
      if (creationDate != null)
      {
         props.put(DCMetaData.DATE, PdfDate.decode(creationDate));
      }


      String modDate = (String)info.get("ModDate");
      if (modDate != null)
      {
         props.put(DCMetaData.DATE, PdfDate.decode(modDate));
      }


      return props;
   }


   private Properties getPropsFromPDFAMetadata(Document doc) throws IOException, DocumentReadException
   {
      Properties props = new Properties();
      // get properties
      NodeList list = doc.getElementsByTagName("rdf:li");
      if (list != null && list.getLength() > 0)
      {
         for (int i = 0; i < list.getLength(); i++)
         {


            Node n = list.item(i);
            // dc:title - TITLE
            if (n.getParentNode().getParentNode().getNodeName().equals("dc:title"))
            {
               String title = n.getLastChild().getTextContent();
               props.put(DCMetaData.TITLE, title);
            }


            // dc:creator - CREATOR
            if (n.getParentNode().getParentNode().getNodeName().equals("dc:creator"))
            {
               String author = n.getLastChild().getTextContent();
               props.put(DCMetaData.CREATOR, author);
            }


            // DC:description - SUBJECT
            if (n.getParentNode().getParentNode().getNodeName().equals("dc:description"))
            {
               String description = n.getLastChild().getTextContent();
               props.put(DCMetaData.SUBJECT, description);
               // props.put(DCMetaData.DESCRIPTION, description);
            }
         }
      }


      try
      {
         // xmp:CreateDate - DATE
         list = doc.getElementsByTagName("xmp:CreateDate");
         if (list != null && list.item(0) != null)
         {
            Node creationDateNode = list.item(0).getLastChild();
            if (creationDateNode != null)
            {
               String creationDate = creationDateNode.getTextContent();
               Calendar c = ISO8601.parseEx(creationDate);
               props.put(DCMetaData.DATE, c);
            }
         }


         // xmp:ModifyDate - DATE
         list = doc.getElementsByTagName("xmp:ModifyDate");
         if (list != null && list.item(0) != null)
         {
            Node modifyDateNode = list.item(0).getLastChild();
            if (modifyDateNode != null)
            {
               String modifyDate = modifyDateNode.getTextContent();
               Calendar c = ISO8601.parseEx(modifyDate);
               props.put(DCMetaData.DATE, c);
            }
         }
      }
      catch (ParseException e)
      {
         throw new DocumentReadException(e.getMessage(), e);
      }
      return props;
   }


}
Source Code of org.exoplatform.services.document.impl.PDFDocumentReader

Related Classes of org.exoplatform.services.document.impl.PDFDocumentReader