Examples of org.apache.nutch.parse.ParseStatus

org.apache.nutch.parse.ParseStatus
@author Andrzej Bialecki <ab@getopt.org>


    String contentType = content.getContentType();


    String[] params = (String[]) TYPE_PARAMS_MAP.get(contentType);
    if (params == null)
      return new ParseStatus(ParseStatus.FAILED,
                      "No external command defined for contentType: " + contentType).getEmptyParseResult(content.getUrl(), getConf());


    String command = params[0];
    int timeout = Integer.parseInt(params[1]);


    if (LOG.isTraceEnabled()) {
      LOG.trace("Use "+command+ " with timeout="+timeout+"secs");
    }


    String text = null;
    String title = null;


    try {


      byte[] raw = content.getContent();


      String contentLength = content.getMetadata().get(Response.CONTENT_LENGTH);
      if (contentLength != null
            && raw.length != Integer.parseInt(contentLength)) {
          return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED,
                "Content truncated at " + raw.length
            +" bytes. Parser can't handle incomplete "
            + contentType + " file.").getEmptyParseResult(content.getUrl(), getConf());
      }


      ByteArrayOutputStream os = new ByteArrayOutputStream(BUFFER_SIZE);
      ByteArrayOutputStream es = new ByteArrayOutputStream(BUFFER_SIZE/4);


      CommandRunner cr = new CommandRunner();


      cr.setCommand(command+ " " +contentType);
      cr.setInputStream(new ByteArrayInputStream(raw));
      cr.setStdOutputStream(os);
      cr.setStdErrorStream(es);


      cr.setTimeout(timeout);


      cr.evaluate();


      if (cr.getExitValue() != 0)
        return new ParseStatus(ParseStatus.FAILED,
                        "External command " + command
                        + " failed with error: " + es.toString()).getEmptyParseResult(content.getUrl(), getConf());


      text = os.toString();


    } catch (Exception e) { // run time exception
      return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
    }


    if (text == null)
      text = "";

View Full Code Here

      byte[] raw = content.getContent();


      String contentLength = content.getMetadata().get(Response.CONTENT_LENGTH);
      if (contentLength != null
            && raw.length != Integer.parseInt(contentLength)) {
          return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED,
                  "Content truncated at "+raw.length
            +" bytes. Parser can't handle incomplete pdf file.").getEmptyParseResult(content.getUrl(), getConf());
      }


      PDFParser parser = new PDFParser(new ByteArrayInputStream(raw));
      parser.parse();


      pdf = parser.getPDDocument();


      if (pdf.isEncrypted()) {
        //Just try using the default password and move on
        pdf.openProtection(new StandardDecryptionMaterial(""));
      }


      // collect text
      PDFTextStripper stripper = new PDFTextStripper();
      text = stripper.getText(pdf);


      // collect title
      PDDocumentInformation info = pdf.getDocumentInformation();
      title = info.getTitle();
      // more useful info, currently not used. please keep them for future use.
      metadata.add(Metadata.PAGE_COUNT, String.valueOf(pdf.getNumberOfPages()));
      metadata.add(Metadata.AUTHOR, info.getAuthor());
      metadata.add(Metadata.SUBJECT, info.getSubject());
      metadata.add(Metadata.KEYWORDS, info.getKeywords());
      metadata.add(Metadata.CREATOR, info.getCreator());
      metadata.add(Metadata.PUBLISHER, info.getProducer());
      
      //TODO: Figure out why we get a java.io.IOException: Error converting date:1-Jan-3 18:15PM
      //error here
      
      //metadata.put(DATE, dcDateFormatter.format(info.getCreationDate().getTime()));
      //metadata.put(LAST_MODIFIED, dcDateFormatter.format(info.getModificationDate().getTime()));


    } catch (CryptographyException e) {
      return new ParseStatus(ParseStatus.FAILED,
              "Error decrypting document. " + e).getEmptyParseResult(content.getUrl(), getConf());
    } catch (BadSecurityHandlerException e) {
      return new ParseStatus(ParseStatus.FAILED,
              "Error decrypting document. " + e).getEmptyParseResult(content.getUrl(), getConf());
    } catch (Exception e) { // run time exception
        if (LOG.isWarnEnabled()) {
          LOG.warn("General exception in PDF parser: "+e.getMessage());
          e.printStackTrace(LogUtil.getWarnStream(LOG));        
        }
      return new ParseStatus(ParseStatus.FAILED,
              "Can't be handled as pdf document. " + e).getEmptyParseResult(content.getUrl(), getConf());
    } finally {
      try {
        if (pdf != null)
          pdf.close();

View Full Code Here


            //Make a content object
            Content content = new Content(url,url, docBody.toString().getBytes(), mimetype, metaData, conf);


            Parse parse = null;
            ParseStatus parseStatus;
            try {
              parse = pu.parse(content);
              parseStatus = parse.getData().getStatus();
            } 
            catch (final Exception e) {
              parseStatus = new ParseStatus(e);
              LOG.error("error: unknown "+parseStatus.toString());
              if(!parseStatus.isSuccess()) {
                LOG.error("parse failure");
              }
            }
            catch (StackOverflowError soe){
              parseStatus = new ParseStatus(soe);
              LOG.error("error: StackOverflowError "+parseStatus.toString());
              if(!parseStatus.isSuccess()) {
                LOG.error("parse failure");
              }
            }


            if(parseStatus.isSuccess()) {
              CrawlDatum datum = new CrawlDatum();
              datum.setStatus(CrawlDatum.STATUS_FETCH_SUCCESS);
              datum.setFetchTime(fetchDate.getTime());
          
              // Score at this stage is 1.0f.

View Full Code Here

      byte[] raw = content.getContent();


      String contentLength = content.getMetadata().get(Response.CONTENT_LENGTH);
      if (contentLength != null
            && raw.length != Integer.parseInt(contentLength)) {
          return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED,
                  "Content truncated at "+raw.length
            +" bytes. Parser can't handle incomplete pdf file.").getEmptyParse(getConf());
      }


      // TODO MC - store pdf files to analyze
      // FileOutputStream fout = new FileOutputStream("/home/nutchwax/lixo/"+System.currentTimeMillis()+".pdf"); 
      // fout.write(raw);
      // fout.close();
      // TODO MC


      PDFParser parser = new PDFParser(new ByteArrayInputStream(raw));
      parser.parse();


      pdf = parser.getPDDocument();


      if (pdf.isEncrypted()) {
        DocumentEncryption decryptor = new DocumentEncryption(pdf);
        //Just try using the default password and move on
        decryptor.decryptDocument("");
      }


      // collect text
      PDFTextStripper stripper = new PDFTextStripper();
      text = stripper.getText(pdf);


      // collect title
      PDDocumentInformation info = pdf.getDocumentInformation();
      title = info.getTitle();
      // more useful info, currently not used. please keep them for future use.
      metadata.add(Metadata.PAGE_COUNT, String.valueOf(pdf.getPageCount()));
      metadata.add(Metadata.AUTHOR, info.getAuthor());
      metadata.add(Metadata.SUBJECT, info.getSubject());
      metadata.add(Metadata.KEYWORDS, info.getKeywords());
      metadata.add(Metadata.CREATOR, info.getCreator());
      metadata.add(Metadata.PUBLISHER, info.getProducer());
      
      //TODO: Figure out why we get a java.io.IOException: Error converting date:1-Jan-3 18:15PM
      //error here
      
      //metadata.put(DATE, dcDateFormatter.format(info.getCreationDate().getTime()));
      //metadata.put(LAST_MODIFIED, dcDateFormatter.format(info.getModificationDate().getTime()));


    } catch (CryptographyException e) {
      return new ParseStatus(ParseStatus.FAILED,
              "Error decrypting document. " + e).getEmptyParse(getConf());
    } catch (InvalidPasswordException e) {
      return new ParseStatus(ParseStatus.FAILED,
              "Can't decrypt document - invalid password. " + e).getEmptyParse(getConf());
    } catch (Exception e) { // run time exception
        if (LOG.isWarnEnabled()) {
          LOG.warn("General exception in PDF parser: "+e.getMessage());
          e.printStackTrace(LogUtil.getWarnStream(LOG));        
        }
      return new ParseStatus(ParseStatus.FAILED,
              "Can't be handled as pdf document. " + e).getEmptyParse(getConf());
    } finally {
      try {
        if (pdf != null)
          pdf.close();

View Full Code Here


    String[] params = (String[]) TYPE_PARAMS_MAP.get(contentType);
    
    if (params == null)
    {
      return new ParseStatus(ParseStatus.FAILED,
        "No external command defined for contentType: "
        + contentType).getEmptyParse(getConf());
    }
    
    String command = params[0];
    int timeout = Integer.parseInt(params[1]);


    if (LOG.isDebugEnabled())
    {
      LOG.debug("Use " + command + " with timeout=" + timeout + "secs");
    }


    String text = null;
    String title = null;


    try
    {
      byte[] raw = content.getContent();
      String contentLength = content.getMetadata().
        get("contentLength");
        
      if (contentLength != null &&
        raw.length != Integer.parseInt(contentLength))
      {
        return new ParseStatus(ParseStatus.FAILED,
          ParseStatus.FAILED_TRUNCATED,
          "Content truncated at " + raw.length +
          " bytes (Original was " + contentLength +
          ". Parser can't handle incomplete " + contentType +
          " file.").getEmptyParse(getConf());
      }


      ByteArrayOutputStream os = new ByteArrayOutputStream(BUFFER_SIZE);
      ByteArrayOutputStream es = new ByteArrayOutputStream(BUFFER_SIZE / 4);


      CommandRunner cr = new CommandRunner();


      cr.setCommand(command + " " + System.getProperty("java.io.tmpdir")
        + " " + contentType);
      cr.setInputStream(new ByteArrayInputStream(raw));
      cr.setStdOutputStream(os);
      cr.setStdErrorStream(es);


      cr.setTimeout(timeout);


      cr.evaluate();


      if (cr.getExitValue() != 0)
      {
        return new ParseStatus(ParseStatus.FAILED, "External command "
          + command + " failed with error: " + es.toString()
          + ", contentLength " + contentLength + ", raw length "
          + Integer.toString(raw.length)).getEmptyParse(getConf());
      }
      
      text = os.toString();


      CharSequence cs =
        text.subSequence(0, Math.min(512, text.length()));
      Matcher m = TITLE.matcher(cs);
      
      if (m.find())
      {
        if (m.group(1) != null)
        {
          title = m.group(1).trim();
        }
      }
      else
      {
        if (LOG.isDebugEnabled())
        {
          LOG.debug("PDFInfo: " + cs.toString());
        }
      }


    }
    catch (Exception e) // run time exception
    {
      return new ParseStatus(e).getEmptyParse(getConf());
    }
    
    if (title == null)
    {
      title = "";

View Full Code Here

    try {
      byte[] raw = content.getContent();
      String contentLength = content.getMetadata().get(Metadata.CONTENT_LENGTH);
      if ((contentLength != null) &&
          (raw.length != Integer.parseInt(contentLength))) {
        return new ParseStatus(ParseStatus.FAILED,
                               ParseStatus.FAILED_TRUNCATED,
                               "Content truncated at " + raw.length +" bytes. " +
                               "Parser can't handle incomplete file.")
                               .getEmptyParse(this.conf);
      }
      extractor.extract(new ByteArrayInputStream(raw));
      text = extractor.getText();
      properties = extractor.getProperties();
      outlinks = OutlinkExtractor.getOutlinks(text, content.getUrl(), getConf());
      
    } catch (Exception e) {
      return new ParseStatus(ParseStatus.FAILED,
                             "Can't be handled as Microsoft document. " + e)
                             .getEmptyParse(this.conf);
    }
    
    // collect meta data

View Full Code Here

  tout.setUrl(url);
    tout.setContent(content);
    tout.setParseUtil(parseUtil);          
    tout.wakeupAndWait();        
  
  ParseStatus parseStatus=tout.getParseStatus();
  Parse parse=tout.getParse();     
  reporter.setStatusIfElapse("parsed " + url);
     
  if (!parseStatus.isSuccess()) {
      final String status = formatToOneLine(parseStatus.toString());
      LOG.warn("Error parsing: " + mimetype + " " + url + ": " + status);
      parse = null;
    }
    else {
      // Was it a slow parse?

View Full Code Here


    URL base;
    try {
      base = new URL(content.getBaseUrl());
    } catch (MalformedURLException e) {
      return new ParseStatus(e).getEmptyParseResult(content.getUrl(),
          getConf());
    }


    // get the right parser using the mime type as a clue
    Parser parser = tikaConfig.getParser(MediaType.parse(mimeType));
    byte[] raw = content.getContent();


    if (parser == null) {
      String message = "Can't retrieve Tika parser for mime-type "
          + mimeType;
      LOG.error(message);
      return new ParseStatus(ParseStatus.FAILED, message)
          .getEmptyParseResult(content.getUrl(), getConf());
    }


    LOG.debug("Using Tika parser " + parser.getClass().getName()
        + " for mime-type " + mimeType);


    Metadata tikamd = new Metadata();


    HTMLDocumentImpl doc = new HTMLDocumentImpl();
    doc.setErrorChecking(false);
    DocumentFragment root = doc.createDocumentFragment();
    DOMBuilder domhandler = new DOMBuilder(doc, root);
    ParseContext context = new ParseContext();
    tikamd.set(Metadata.CONTENT_TYPE, mimeType);
    try {
      parser.parse(new ByteArrayInputStream(raw), domhandler, tikamd,context);
    } catch (Exception e) {
      LOG.error("Error parsing "+content.getUrl(),e);
      return new ParseStatus(ParseStatus.FAILED, e.getMessage())
          .getEmptyParseResult(content.getUrl(), getConf());
    }


    HTMLMetaTags metaTags = new HTMLMetaTags();
    String text = "";
    String title = "";
    Outlink[] outlinks = new Outlink[0];
    org.apache.nutch.metadata.Metadata nutchMetadata = new org.apache.nutch.metadata.Metadata();


    // we have converted the sax events generated by Tika into a DOM object
    // so we can now use the usual HTML resources from Nutch
    // get meta directives
    HTMLMetaProcessor.getMetaTags(metaTags, root, base);
    if (LOG.isTraceEnabled()) {
      LOG.trace("Meta tags for " + base + ": " + metaTags.toString());
    }


    // check meta directives
    if (!metaTags.getNoIndex()) { // okay to index
      StringBuffer sb = new StringBuffer();
      if (LOG.isTraceEnabled()) {
        LOG.trace("Getting text...");
      }
      utils.getText(sb, root); // extract text
      text = sb.toString();
      sb.setLength(0);
      if (LOG.isTraceEnabled()) {
        LOG.trace("Getting title...");
      }
      utils.getTitle(sb, root); // extract title
      title = sb.toString().trim();
    }


    if (!metaTags.getNoFollow()) { // okay to follow links
      ArrayList<Outlink> l = new ArrayList<Outlink>(); // extract outlinks
      URL baseTag = utils.getBase(root);
      if (LOG.isTraceEnabled()) {
        LOG.trace("Getting links...");
      }
      utils.getOutlinks(baseTag != null ? baseTag : base, l, root);
      outlinks = l.toArray(new Outlink[l.size()]);
      if (LOG.isTraceEnabled()) {
        LOG.trace("found " + outlinks.length + " outlinks in "
            + content.getUrl());
      }
    }


    // populate Nutch metadata with Tika metadata
    String[] TikaMDNames = tikamd.names();
    for (String tikaMDName : TikaMDNames) {
      if (tikaMDName.equalsIgnoreCase(Metadata.TITLE))
        continue;
      // TODO what if multivalued?
      nutchMetadata.add(tikaMDName, tikamd.get(tikaMDName));
    }


    // no outlinks? try OutlinkExtractor e.g works for mime types where no
    // explicit markup for anchors


    if (outlinks.length == 0) {
      outlinks = OutlinkExtractor.getOutlinks(text, getConf());
    }


    ParseStatus status = new ParseStatus(ParseStatus.SUCCESS);
    if (metaTags.getRefresh()) {
      status.setMinorCode(ParseStatus.SUCCESS_REDIRECT);
      status.setArgs(new String[] { metaTags.getRefreshHref().toString(),
          Integer.toString(metaTags.getRefreshTime()) });
    }
    ParseData parseData = new ParseData(status, title, outlinks, content
        .getMetadata(), nutchMetadata);
    ParseResult parseResult = ParseResult.createParseResult(content

View Full Code Here

      feed = feedInput.build(input);
    } catch (Exception e) {
      // return empty parse
      LOG.warn("Parse failed: url: " + content.getUrl() + ", exception: "
          + StringUtils.stringifyException(e));
      return new ParseStatus(e)
          .getEmptyParseResult(content.getUrl(), getConf());
    }


    String feedLink = feed.getLink();
    try {
      feedLink = normalizers.normalize(feedLink, URLNormalizers.SCOPE_OUTLINK);
      if (feedLink != null)
        feedLink = filters.filter(feedLink);
    } catch (Exception e) {
      feedLink = null;
    }


    List<?> entries = feed.getEntries();
    for(Object entry: entries) {
      addToMap(parseResult, feed, feedLink, (SyndEntry)entry, content);
    }


    String feedDesc = stripTags(feed.getDescriptionEx());
    String feedTitle = stripTags(feed.getTitleEx());


    parseResult.put(content.getUrl(), new ParseText(feedDesc), new ParseData(
        new ParseStatus(ParseStatus.SUCCESS), feedTitle, new Outlink[0],
        content.getMetadata()));


    return parseResult;
  }

View Full Code Here

      final int len = Integer.parseInt(contentLen);
      if (LOG.isDebugEnabled()) { LOG.debug("ziplen: " + len); }
      final byte[] contentInBytes = content.getContent();


      if (contentLen != null && contentInBytes.length != len) {
        return new ParseStatus(ParseStatus.FAILED,
            ParseStatus.FAILED_TRUNCATED, "Content truncated at "
                + contentInBytes.length
                + " bytes. Parser can't handle incomplete zip file.")
            .getEmptyParseResult(content.getUrl(), getConf());
      }


      ZipTextExtractor extractor = new ZipTextExtractor(getConf());


      // extract text
      resultText = extractor.extractText(new ByteArrayInputStream(
          contentInBytes), content.getUrl(), outLinksList);


    } catch (Exception e) {
      return new ParseStatus(ParseStatus.FAILED,
          "Can't be handled as Zip document. " + e).getEmptyParseResult(content.getUrl(), getConf());
    }


    if (resultText == null) {
      resultText = "";

View Full Code Here

0 1 2 3 4 5 6 7 8

TOP

Related Classes of org.apache.nutch.parse.ParseStatus

com.atlantbh.nutch.filter.xpath.XPathHtmlParserFilter

org.apache.nutch.indexer.more.TestMoreIndexingFilter

org.apache.nutch.indexer.TestIndexingFilters

org.apache.nutch.metadata.Metadata

org.apache.nutch.parse.ext.ExtParser

org.apache.nutch.parse.ext.WaxExtParser

org.apache.nutch.parse.feed.FeedParser

org.apache.nutch.parse.js.JSParseFilter

org.apache.nutch.parse.mp3.MP3Parser

org.apache.nutch.parse.ms.MSBaseParser

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact: coftware#gmail.com.