Package org.apache.nutch.parse

Examples of org.apache.nutch.parse.ParseStatus


    String contentType = content.getContentType();

    String[] params = (String[]) TYPE_PARAMS_MAP.get(contentType);
    if (params == null)
      return new ParseStatus(ParseStatus.FAILED,
                      "No external command defined for contentType: " + contentType).getEmptyParseResult(content.getUrl(), getConf());

    String command = params[0];
    int timeout = Integer.parseInt(params[1]);

    if (LOG.isTraceEnabled()) {
      LOG.trace("Use "+command+ " with timeout="+timeout+"secs");
    }

    String text = null;
    String title = null;

    try {

      byte[] raw = content.getContent();

      String contentLength = content.getMetadata().get(Response.CONTENT_LENGTH);
      if (contentLength != null
            && raw.length != Integer.parseInt(contentLength)) {
          return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED,
                "Content truncated at " + raw.length
            +" bytes. Parser can't handle incomplete "
            + contentType + " file.").getEmptyParseResult(content.getUrl(), getConf());
      }

      ByteArrayOutputStream os = new ByteArrayOutputStream(BUFFER_SIZE);
      ByteArrayOutputStream es = new ByteArrayOutputStream(BUFFER_SIZE/4);

      CommandRunner cr = new CommandRunner();

      cr.setCommand(command+ " " +contentType);
      cr.setInputStream(new ByteArrayInputStream(raw));
      cr.setStdOutputStream(os);
      cr.setStdErrorStream(es);

      cr.setTimeout(timeout);

      cr.evaluate();

      if (cr.getExitValue() != 0)
        return new ParseStatus(ParseStatus.FAILED,
                        "External command " + command
                        + " failed with error: " + es.toString()).getEmptyParseResult(content.getUrl(), getConf());

      text = os.toString();

    } catch (Exception e) { // run time exception
      return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
    }

    if (text == null)
      text = "";
View Full Code Here


      byte[] raw = content.getContent();

      String contentLength = content.getMetadata().get(Response.CONTENT_LENGTH);
      if (contentLength != null
            && raw.length != Integer.parseInt(contentLength)) {
          return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED,
                  "Content truncated at "+raw.length
            +" bytes. Parser can't handle incomplete pdf file.").getEmptyParseResult(content.getUrl(), getConf());
      }

      PDFParser parser = new PDFParser(new ByteArrayInputStream(raw));
      parser.parse();

      pdf = parser.getPDDocument();

      if (pdf.isEncrypted()) {
        //Just try using the default password and move on
        pdf.openProtection(new StandardDecryptionMaterial(""));
      }

      // collect text
      PDFTextStripper stripper = new PDFTextStripper();
      text = stripper.getText(pdf);

      // collect title
      PDDocumentInformation info = pdf.getDocumentInformation();
      title = info.getTitle();
      // more useful info, currently not used. please keep them for future use.
      metadata.add(Metadata.PAGE_COUNT, String.valueOf(pdf.getNumberOfPages()));
      metadata.add(Metadata.AUTHOR, info.getAuthor());
      metadata.add(Metadata.SUBJECT, info.getSubject());
      metadata.add(Metadata.KEYWORDS, info.getKeywords());
      metadata.add(Metadata.CREATOR, info.getCreator());
      metadata.add(Metadata.PUBLISHER, info.getProducer());
     
      //TODO: Figure out why we get a java.io.IOException: Error converting date:1-Jan-3 18:15PM
      //error here
     
      //metadata.put(DATE, dcDateFormatter.format(info.getCreationDate().getTime()));
      //metadata.put(LAST_MODIFIED, dcDateFormatter.format(info.getModificationDate().getTime()));

    } catch (CryptographyException e) {
      return new ParseStatus(ParseStatus.FAILED,
              "Error decrypting document. " + e).getEmptyParseResult(content.getUrl(), getConf());
    } catch (BadSecurityHandlerException e) {
      return new ParseStatus(ParseStatus.FAILED,
              "Error decrypting document. " + e).getEmptyParseResult(content.getUrl(), getConf());
    } catch (Exception e) { // run time exception
        if (LOG.isWarnEnabled()) {
          LOG.warn("General exception in PDF parser: "+e.getMessage());
          e.printStackTrace(LogUtil.getWarnStream(LOG));       
        }
      return new ParseStatus(ParseStatus.FAILED,
              "Can't be handled as pdf document. " + e).getEmptyParseResult(content.getUrl(), getConf());
    } finally {
      try {
        if (pdf != null)
          pdf.close();
View Full Code Here

            //Make a content object
            Content content = new Content(url,url, docBody.toString().getBytes(), mimetype, metaData, conf);

            Parse parse = null;
            ParseStatus parseStatus;
            try {
              parse = pu.parse(content);
              parseStatus = parse.getData().getStatus();
            }
            catch (final Exception e) {
              parseStatus = new ParseStatus(e);
              LOG.error("error: unknown "+parseStatus.toString());
              if(!parseStatus.isSuccess()) {
                LOG.error("parse failure");
              }
            }
            catch (StackOverflowError soe){
              parseStatus = new ParseStatus(soe);
              LOG.error("error: StackOverflowError "+parseStatus.toString());
              if(!parseStatus.isSuccess()) {
                LOG.error("parse failure");
              }
            }

            if(parseStatus.isSuccess()) {
              CrawlDatum datum = new CrawlDatum();
              datum.setStatus(CrawlDatum.STATUS_FETCH_SUCCESS);
              datum.setFetchTime(fetchDate.getTime());
         
              // Score at this stage is 1.0f.
View Full Code Here

      byte[] raw = content.getContent();

      String contentLength = content.getMetadata().get(Response.CONTENT_LENGTH);
      if (contentLength != null
            && raw.length != Integer.parseInt(contentLength)) {
          return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED,
                  "Content truncated at "+raw.length
            +" bytes. Parser can't handle incomplete pdf file.").getEmptyParse(getConf());
      }

      // TODO MC - store pdf files to analyze
      // FileOutputStream fout = new FileOutputStream("/home/nutchwax/lixo/"+System.currentTimeMillis()+".pdf");
      // fout.write(raw);
      // fout.close();
      // TODO MC

      PDFParser parser = new PDFParser(new ByteArrayInputStream(raw));
      parser.parse();

      pdf = parser.getPDDocument();

      if (pdf.isEncrypted()) {
        DocumentEncryption decryptor = new DocumentEncryption(pdf);
        //Just try using the default password and move on
        decryptor.decryptDocument("");
      }

      // collect text
      PDFTextStripper stripper = new PDFTextStripper();
      text = stripper.getText(pdf);

      // collect title
      PDDocumentInformation info = pdf.getDocumentInformation();
      title = info.getTitle();
      // more useful info, currently not used. please keep them for future use.
      metadata.add(Metadata.PAGE_COUNT, String.valueOf(pdf.getPageCount()));
      metadata.add(Metadata.AUTHOR, info.getAuthor());
      metadata.add(Metadata.SUBJECT, info.getSubject());
      metadata.add(Metadata.KEYWORDS, info.getKeywords());
      metadata.add(Metadata.CREATOR, info.getCreator());
      metadata.add(Metadata.PUBLISHER, info.getProducer());
     
      //TODO: Figure out why we get a java.io.IOException: Error converting date:1-Jan-3 18:15PM
      //error here
     
      //metadata.put(DATE, dcDateFormatter.format(info.getCreationDate().getTime()));
      //metadata.put(LAST_MODIFIED, dcDateFormatter.format(info.getModificationDate().getTime()));

    } catch (CryptographyException e) {
      return new ParseStatus(ParseStatus.FAILED,
              "Error decrypting document. " + e).getEmptyParse(getConf());
    } catch (InvalidPasswordException e) {
      return new ParseStatus(ParseStatus.FAILED,
              "Can't decrypt document - invalid password. " + e).getEmptyParse(getConf());
    } catch (Exception e) { // run time exception
        if (LOG.isWarnEnabled()) {
          LOG.warn("General exception in PDF parser: "+e.getMessage());
          e.printStackTrace(LogUtil.getWarnStream(LOG));       
        }
      return new ParseStatus(ParseStatus.FAILED,
              "Can't be handled as pdf document. " + e).getEmptyParse(getConf());
    } finally {
      try {
        if (pdf != null)
          pdf.close();
View Full Code Here

    String[] params = (String[]) TYPE_PARAMS_MAP.get(contentType);
   
    if (params == null)
    {
      return new ParseStatus(ParseStatus.FAILED,
        "No external command defined for contentType: "
        + contentType).getEmptyParse(getConf());
    }
   
    String command = params[0];
    int timeout = Integer.parseInt(params[1]);

    if (LOG.isDebugEnabled())
    {
      LOG.debug("Use " + command + " with timeout=" + timeout + "secs");
    }

    String text = null;
    String title = null;

    try
    {
      byte[] raw = content.getContent();
      String contentLength = content.getMetadata().
        get("contentLength");
       
      if (contentLength != null &&
        raw.length != Integer.parseInt(contentLength))
      {
        return new ParseStatus(ParseStatus.FAILED,
          ParseStatus.FAILED_TRUNCATED,
          "Content truncated at " + raw.length +
          " bytes (Original was " + contentLength +
          ". Parser can't handle incomplete " + contentType +
          " file.").getEmptyParse(getConf());
      }

      ByteArrayOutputStream os = new ByteArrayOutputStream(BUFFER_SIZE);
      ByteArrayOutputStream es = new ByteArrayOutputStream(BUFFER_SIZE / 4);

      CommandRunner cr = new CommandRunner();

      cr.setCommand(command + " " + System.getProperty("java.io.tmpdir")
        + " " + contentType);
      cr.setInputStream(new ByteArrayInputStream(raw));
      cr.setStdOutputStream(os);
      cr.setStdErrorStream(es);

      cr.setTimeout(timeout);

      cr.evaluate();

      if (cr.getExitValue() != 0)
      {
        return new ParseStatus(ParseStatus.FAILED, "External command "
          + command + " failed with error: " + es.toString()
          + ", contentLength " + contentLength + ", raw length "
          + Integer.toString(raw.length)).getEmptyParse(getConf());
      }
     
      text = os.toString();

      CharSequence cs =
        text.subSequence(0, Math.min(512, text.length()));
      Matcher m = TITLE.matcher(cs);
     
      if (m.find())
      {
        if (m.group(1) != null)
        {
          title = m.group(1).trim();
        }
      }
      else
      {
        if (LOG.isDebugEnabled())
        {
          LOG.debug("PDFInfo: " + cs.toString());
        }
      }

    }
    catch (Exception e) // run time exception
    {
      return new ParseStatus(e).getEmptyParse(getConf());
    }
   
    if (title == null)
    {
      title = "";
View Full Code Here

    try {
      byte[] raw = content.getContent();
      String contentLength = content.getMetadata().get(Metadata.CONTENT_LENGTH);
      if ((contentLength != null) &&
          (raw.length != Integer.parseInt(contentLength))) {
        return new ParseStatus(ParseStatus.FAILED,
                               ParseStatus.FAILED_TRUNCATED,
                               "Content truncated at " + raw.length +" bytes. " +
                               "Parser can't handle incomplete file.")
                               .getEmptyParse(this.conf);
      }
      extractor.extract(new ByteArrayInputStream(raw));
      text = extractor.getText();
      properties = extractor.getProperties();
      outlinks = OutlinkExtractor.getOutlinks(text, content.getUrl(), getConf());
     
    } catch (Exception e) {
      return new ParseStatus(ParseStatus.FAILED,
                             "Can't be handled as Microsoft document. " + e)
                             .getEmptyParse(this.conf);
    }
   
    // collect meta data
View Full Code Here

  tout.setUrl(url);
    tout.setContent(content);
    tout.setParseUtil(parseUtil);         
    tout.wakeupAndWait();       
 
  ParseStatus parseStatus=tout.getParseStatus();
  Parse parse=tout.getParse();    
  reporter.setStatusIfElapse("parsed " + url);
    
  if (!parseStatus.isSuccess()) {
      final String status = formatToOneLine(parseStatus.toString());
      LOG.warn("Error parsing: " + mimetype + " " + url + ": " + status);
      parse = null;
    }
    else {
      // Was it a slow parse?
View Full Code Here

    URL base;
    try {
      base = new URL(content.getBaseUrl());
    } catch (MalformedURLException e) {
      return new ParseStatus(e).getEmptyParseResult(content.getUrl(),
          getConf());
    }

    // get the right parser using the mime type as a clue
    Parser parser = tikaConfig.getParser(MediaType.parse(mimeType));
    byte[] raw = content.getContent();

    if (parser == null) {
      String message = "Can't retrieve Tika parser for mime-type "
          + mimeType;
      LOG.error(message);
      return new ParseStatus(ParseStatus.FAILED, message)
          .getEmptyParseResult(content.getUrl(), getConf());
    }

    LOG.debug("Using Tika parser " + parser.getClass().getName()
        + " for mime-type " + mimeType);

    Metadata tikamd = new Metadata();

    HTMLDocumentImpl doc = new HTMLDocumentImpl();
    doc.setErrorChecking(false);
    DocumentFragment root = doc.createDocumentFragment();
    DOMBuilder domhandler = new DOMBuilder(doc, root);
    ParseContext context = new ParseContext();
    tikamd.set(Metadata.CONTENT_TYPE, mimeType);
    try {
      parser.parse(new ByteArrayInputStream(raw), domhandler, tikamd,context);
    } catch (Exception e) {
      LOG.error("Error parsing "+content.getUrl(),e);
      return new ParseStatus(ParseStatus.FAILED, e.getMessage())
          .getEmptyParseResult(content.getUrl(), getConf());
    }

    HTMLMetaTags metaTags = new HTMLMetaTags();
    String text = "";
    String title = "";
    Outlink[] outlinks = new Outlink[0];
    org.apache.nutch.metadata.Metadata nutchMetadata = new org.apache.nutch.metadata.Metadata();

    // we have converted the sax events generated by Tika into a DOM object
    // so we can now use the usual HTML resources from Nutch
    // get meta directives
    HTMLMetaProcessor.getMetaTags(metaTags, root, base);
    if (LOG.isTraceEnabled()) {
      LOG.trace("Meta tags for " + base + ": " + metaTags.toString());
    }

    // check meta directives
    if (!metaTags.getNoIndex()) { // okay to index
      StringBuffer sb = new StringBuffer();
      if (LOG.isTraceEnabled()) {
        LOG.trace("Getting text...");
      }
      utils.getText(sb, root); // extract text
      text = sb.toString();
      sb.setLength(0);
      if (LOG.isTraceEnabled()) {
        LOG.trace("Getting title...");
      }
      utils.getTitle(sb, root); // extract title
      title = sb.toString().trim();
    }

    if (!metaTags.getNoFollow()) { // okay to follow links
      ArrayList<Outlink> l = new ArrayList<Outlink>(); // extract outlinks
      URL baseTag = utils.getBase(root);
      if (LOG.isTraceEnabled()) {
        LOG.trace("Getting links...");
      }
      utils.getOutlinks(baseTag != null ? baseTag : base, l, root);
      outlinks = l.toArray(new Outlink[l.size()]);
      if (LOG.isTraceEnabled()) {
        LOG.trace("found " + outlinks.length + " outlinks in "
            + content.getUrl());
      }
    }

    // populate Nutch metadata with Tika metadata
    String[] TikaMDNames = tikamd.names();
    for (String tikaMDName : TikaMDNames) {
      if (tikaMDName.equalsIgnoreCase(Metadata.TITLE))
        continue;
      // TODO what if multivalued?
      nutchMetadata.add(tikaMDName, tikamd.get(tikaMDName));
    }

    // no outlinks? try OutlinkExtractor e.g works for mime types where no
    // explicit markup for anchors

    if (outlinks.length == 0) {
      outlinks = OutlinkExtractor.getOutlinks(text, getConf());
    }

    ParseStatus status = new ParseStatus(ParseStatus.SUCCESS);
    if (metaTags.getRefresh()) {
      status.setMinorCode(ParseStatus.SUCCESS_REDIRECT);
      status.setArgs(new String[] { metaTags.getRefreshHref().toString(),
          Integer.toString(metaTags.getRefreshTime()) });
    }
    ParseData parseData = new ParseData(status, title, outlinks, content
        .getMetadata(), nutchMetadata);
    ParseResult parseResult = ParseResult.createParseResult(content
View Full Code Here

      feed = feedInput.build(input);
    } catch (Exception e) {
      // return empty parse
      LOG.warn("Parse failed: url: " + content.getUrl() + ", exception: "
          + StringUtils.stringifyException(e));
      return new ParseStatus(e)
          .getEmptyParseResult(content.getUrl(), getConf());
    }

    String feedLink = feed.getLink();
    try {
      feedLink = normalizers.normalize(feedLink, URLNormalizers.SCOPE_OUTLINK);
      if (feedLink != null)
        feedLink = filters.filter(feedLink);
    } catch (Exception e) {
      feedLink = null;
    }

    List<?> entries = feed.getEntries();
    for(Object entry: entries) {
      addToMap(parseResult, feed, feedLink, (SyndEntry)entry, content);
    }

    String feedDesc = stripTags(feed.getDescriptionEx());
    String feedTitle = stripTags(feed.getTitleEx());

    parseResult.put(content.getUrl(), new ParseText(feedDesc), new ParseData(
        new ParseStatus(ParseStatus.SUCCESS), feedTitle, new Outlink[0],
        content.getMetadata()));

    return parseResult;
  }
View Full Code Here

      final int len = Integer.parseInt(contentLen);
      if (LOG.isDebugEnabled()) { LOG.debug("ziplen: " + len); }
      final byte[] contentInBytes = content.getContent();

      if (contentLen != null && contentInBytes.length != len) {
        return new ParseStatus(ParseStatus.FAILED,
            ParseStatus.FAILED_TRUNCATED, "Content truncated at "
                + contentInBytes.length
                + " bytes. Parser can't handle incomplete zip file.")
            .getEmptyParseResult(content.getUrl(), getConf());
      }

      ZipTextExtractor extractor = new ZipTextExtractor(getConf());

      // extract text
      resultText = extractor.extractText(new ByteArrayInputStream(
          contentInBytes), content.getUrl(), outLinksList);

    } catch (Exception e) {
      return new ParseStatus(ParseStatus.FAILED,
          "Can't be handled as Zip document. " + e).getEmptyParseResult(content.getUrl(), getConf());
    }

    if (resultText == null) {
      resultText = "";
View Full Code Here

TOP

Related Classes of org.apache.nutch.parse.ParseStatus

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.