Examples of net.nutch.parse.ParseException

net.nutch.parse.ParseException

    rtfParser.setDelegate(delegate);


    try {
      rtfParser.parse();
    } catch (com.etranslate.tm.processing.rtf.ParseException e) {
      throw new ParseException("Exception parsing RTF document", e);
    }


    Properties metadata = new Properties();
    metadata.putAll(content.getMetadata());
    metadata.putAll(delegate.getMetaData());

View Full Code Here

  public Parse getParse(Content content) throws ParseException {


    // check that contentType is one we can handle
    String contentType = content.getContentType();
    if (contentType != null && !contentType.startsWith("application/msword"))
      throw new ParseException(
        "Content-Type not application/msword: "+contentType);


    String text = null;
    String title = null;
    Properties properties = null;


    try {


      byte[] raw = content.getContent();


      String contentLength = content.get("Content-Length");
      if (contentLength != null
            && raw.length != Integer.parseInt(contentLength)) {
          throw new ParseException("Content truncated at "+raw.length
            +" bytes. Parser can't handle incomplete msword file.");
      }


      WordExtractor extractor = new WordExtractor();


      // collect text
      text = extractor.extractText(new ByteArrayInputStream(raw));


      // collect meta info
      properties = extractor.extractProperties(new ByteArrayInputStream(raw));


      extractor = null;


    } catch (ParseException e) {
      throw e;
    } catch (FastSavedException e) {
      throw new ParseException(e);
    } catch (PasswordProtectedException e) {
      throw new ParseException(e);
    } catch (Exception e) { // run time exception
      throw new ParseException("Can't be handled as msword document. "+e);
    } finally {
      // nothing so far
    }


    // collect meta data

View Full Code Here

  public Parse getParse(Content content) throws ParseException {


    // check that contentType is one we can handle
    String contentType = content.getContentType();
    if (contentType != null && !contentType.startsWith("application/pdf"))
      throw new ParseException(
        "Content-Type not application/pdf: "+contentType);


    // in memory representation of pdf file
    PDDocument pdf = null;


    String text = null;
    String title = null;


    try {


      byte[] raw = content.getContent();


      String contentLength = content.get("Content-Length");
      if (contentLength != null
            && raw.length != Integer.parseInt(contentLength)) {
          throw new ParseException("Content truncated at "+raw.length
            +" bytes. Parser can't handle incomplete pdf file.");
      }


      PDFParser parser = new PDFParser(
        new ByteArrayInputStream(raw));
      parser.parse();


      pdf = parser.getPDDocument();


      if (pdf.isEncrypted()) {
        DocumentEncryption decryptor = new DocumentEncryption(pdf);
        //Just try using the default password and move on
        decryptor.decryptDocument("");
      }


      // collect text
      PDFTextStripper stripper = new PDFTextStripper();
      text = stripper.getText(pdf);


      // collect title
      PDDocumentInformation info = pdf.getDocumentInformation();
      title = info.getTitle();
      // more useful info, currently not used. please keep them for future use.
      // pdf.getPageCount();
      // info.getAuthor()
      // info.getSubject()
      // info.getKeywords()
      // info.getCreator()
      // info.getProducer()
      // info.getTrapped()
      // formatDate(info.getCreationDate())
      // formatDate(info.getModificationDate())


    } catch (ParseException e) {
      throw e;
    } catch (CryptographyException e) {
      throw new ParseException("Error decrypting document. "+e);
    } catch (InvalidPasswordException e) {
      throw new ParseException("Can't decrypt document. "+e);
    } catch (Exception e) { // run time exception
      throw new ParseException("Can't be handled as pdf document. "+e);
    } finally {
      try {
        if (pdf != null)
          pdf.close();
        } catch (IOException e) {

View Full Code Here


    String contentType = content.getContentType();


    String[] params = (String[]) TYPE_PARAMS_MAP.get(contentType);
    if (params == null)
      throw new ParseException(
        "No external command defined for contentType: " + contentType);


    String command = params[0];
    int timeout = Integer.parseInt(params[1]);


    if (LOG.isLoggable(Level.FINE))
      LOG.fine("Use "+command+ " with timeout="+timeout+"secs");


    String text = null;
    String title = null;


    try {


      byte[] raw = content.getContent();


      String contentLength =
        (String)content.getMetadata().get("Content-Length");
      if (contentLength != null
            && raw.length != Integer.parseInt(contentLength)) {
          throw new ParseException("Content truncated at "+raw.length
            +" bytes. Parser can't handle incomplete "+contentType+" file.");
      }


      ByteArrayOutputStream os = new ByteArrayOutputStream(BUFFER_SIZE);
      ByteArrayOutputStream es = new ByteArrayOutputStream(BUFFER_SIZE/4);


      CommandRunner cr = new CommandRunner();


      cr.setCommand(command+ " " +contentType);
      cr.setInputStream(new ByteArrayInputStream(raw));
      cr.setStdOutputStream(os);
      cr.setStdErrorStream(es);


      cr.setTimeout(timeout);


      cr.evaluate();


      if (cr.getExitValue() != 0)
        throw new ParseException("External command "+command
          +" failed with error: "+es.toString());


      text = os.toString();


    } catch (ParseException e) {
      throw e;
    } catch (Exception e) { // run time exception
      throw new ParseException("ExtParser failed. "+e);
    }


    if (text == null)
      text = "";

View Full Code Here

TOP

Related Classes of net.nutch.parse.ParseException

net.nutch.parse.ext.ExtParser

net.nutch.parse.msword.MSWordParser

net.nutch.parse.pdf.PdfParser

net.nutch.parse.rtf.RTFParseFactory

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.