Examples of PDFParser


Examples of org.pdfbox.pdfparser.PDFParser

   * @return Reader a reader that is fed to an indexer.
   */
  protected Reader getReader(InputStream docStream)
  {
   
    PDFParser parser = null; PDDocument document = null; PDFTextStripper stripper = null;
    CharArrayWriter writer = null;
    try{
      parser = new PDFParser(docStream);
      parser.parse();
      document = parser.getPDDocument();
      writer = new CharArrayWriter();
      stripper = new PDFTextStripper();
      stripper.setLineSeparator("\n");
      stripper.writeText(document, writer);
      document.close();
      writer.close();
      parser.getDocument().close();
      return new CharArrayReader(writer.toCharArray());
    }catch (Exception e){
        //logger.warn("WARNING: Problem converting PDF: ",e);
      try{
        document.close();       
      }catch(Exception e1){
        //logger.warn("WARNING: Problem converting PDF: ",e1);
      }
      try{
        writer.close();
      }catch(Exception e2){
        //logger.warn("WARNING: Problem converting PDF: ",e2);
      }
      try{
        parser.getDocument().close();
      }catch(Exception e3){
        //logger.warn("WARNING: Problem converting PDF: ",e3); 
      }
      parser = null; document = null; writer = null; stripper = null;
      EOD=true;
View Full Code Here

Examples of org.pdfbox.pdfparser.PDFParser

      // FileOutputStream fout = new FileOutputStream("/home/nutchwax/lixo/"+System.currentTimeMillis()+".pdf");
      // fout.write(raw);
      // fout.close();
      // TODO MC

      PDFParser parser = new PDFParser(new ByteArrayInputStream(raw));
      parser.parse();

      pdf = parser.getPDDocument();

      if (pdf.isEncrypted()) {
        DocumentEncryption decryptor = new DocumentEncryption(pdf);
        //Just try using the default password and move on
        decryptor.decryptDocument("");
View Full Code Here

Examples of org.pdfbox.pdfparser.PDFParser

        InternalValue[] values = data.getValues();
        if (values.length > 0) {
            BLOBFileValue blob = (BLOBFileValue) values[0].internalValue();
               
            try {
                PDFParser parser = new PDFParser(blob.getStream());
                parser.parse();
   
                PDDocument document = parser.getPDDocument();
   
                CharArrayWriter writer = new CharArrayWriter();
   
                PDFTextStripper stripper = new PDFTextStripper();
                stripper.setLineSeparator("\n");
View Full Code Here

Examples of org.pdfbox.pdfparser.PDFParser

          return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED,
                  "Content truncated at "+raw.length
            +" bytes. Parser can't handle incomplete pdf file.").getEmptyParse(getConf());
      }

      PDFParser parser = new PDFParser(new ByteArrayInputStream(raw));
      parser.parse();

      pdf = parser.getPDDocument();

      if (pdf.isEncrypted()) {
        DocumentEncryption decryptor = new DocumentEncryption(pdf);
        //Just try using the default password and move on
        decryptor.decryptDocument("");
View Full Code Here

Examples of org.pdfbox.pdfparser.PDFParser

 
  public static final Log log = LogFactory.getLog(PDFIndexer.class);

  public IndexDocument getIndexedDocument(File2Index fileData) throws SolrException {
    try {
      PDFParser parser = new PDFParser(new ByteArrayInputStream(fileData.data));
      parser.parse();
      COSDocument cosDoc = parser.getDocument();

      PDFTextStripper stripper = new PDFTextStripper();
      String docText = stripper.getText(new PDDocument(cosDoc));
      cosDoc.close();
View Full Code Here

Examples of org.pdfbox.pdfparser.PDFParser

                validateForLocalUrl(url);
                resourceURL = new URL(url);
                is = resourceURL.openStream();
            }

            PDFParser parser = new PDFParser(is);
            parser.parse();
            COSDocument cosDoc = parser.getDocument();

            PDFTextStripper stripper = new PDFTextStripper();
            String docText = stripper.getText(new PDDocument(cosDoc));
            cosDoc.close();
            Document document = new Document();
View Full Code Here

Examples of org.pdfbox.pdfparser.PDFParser

        try {
            // get file as stream
            input = new FileInputStream(filename);

            // init PDFParser with stream
            PDFParser parser = new PDFParser(input);
            parser.setTempDirectory(new File(System.getProperty("java.io.tmpdir")));

            // parse
            parser.parse();

            // return Document
            return parser.getPDDocument();
        }
        finally {
            IOUtils.closeQuietly(input);
        }
    }
View Full Code Here

Examples of org.pdfbox.pdfparser.PDFParser

          return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED,
                  "Content truncated at "+raw.length
            +" bytes. Parser can't handle incomplete pdf file.").getEmptyParse();
      }

      PDFParser parser = new PDFParser(
        new ByteArrayInputStream(raw));
      parser.parse();

      pdf = parser.getPDDocument();

      if (pdf.isEncrypted()) {
        DocumentEncryption decryptor = new DocumentEncryption(pdf);
        //Just try using the default password and move on
        decryptor.decryptDocument("");
View Full Code Here

Examples of org.pdfbox.pdfparser.PDFParser

            && raw.length != Integer.parseInt(contentLength)) {
          throw new ParseException("Content truncated at "+raw.length
            +" bytes. Parser can't handle incomplete pdf file.");
      }

      PDFParser parser = new PDFParser(
        new ByteArrayInputStream(raw));
      parser.parse();

      pdf = parser.getPDDocument();

      if (pdf.isEncrypted()) {
        DocumentEncryption decryptor = new DocumentEncryption(pdf);
        //Just try using the default password and move on
        decryptor.decryptDocument("");
View Full Code Here

Examples of org.pdfbox.pdfparser.PDFParser

    public Reader extract(InputStream contentthrows ExtractorException
    {
        try
        {
            PDFParser parser = new PDFParser( content );
            parser.parse();

            PDDocument document = parser.getPDDocument();

            CharArrayWriter writer = new CharArrayWriter();

            PDFTextStripper stripper = new PDFTextStripper();
            stripper.setLineSeparator("\n");
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.