Package org.apache.nutch.storage

Examples of org.apache.nutch.storage.ParseStatus


    return new Parse("", "", new Outlink[0], status);
  }

  public static Parse getEmptyParse(int minorCode, String message, Configuration conf) {
    ParseStatus status = new ParseStatus();
    status.setMajorCode(ParseStatusCodes.FAILED);
    status.setMinorCode(minorCode);
    status.addToArgs(new Utf8(message));

    return new Parse("", "", new Outlink[0], status);
  }
View Full Code Here


    // Rebuild the Parse only when `outlinks` is non-empty; otherwise the original is returned.
    if (outlinks.size() > 0) {
      Outlink[] old = parse.getOutlinks();
      String title = parse.getTitle();
      List<Outlink> list = Arrays.asList(old);
      // The outlinks already present on the parse are added to `outlinks`.
      outlinks.addAll(list);
      ParseStatus status = parse.getParseStatus();
      String text = parse.getText();
      Outlink[] newlinks = outlinks.toArray(new Outlink[outlinks.size()]);
      // Text, title and status are carried over from the old parse; only the outlink array is new.
      return new Parse(text, title, newlinks, status);
    }
    // `outlinks` was empty: hand back the parse that was passed in.
    return parse;
View Full Code Here

    if (outlinks.length == 0) {
      outlinks = OutlinkExtractor.getOutlinks(text, getConf());
    }

    ParseStatus status = ParseStatusUtils.STATUS_SUCCESS;
    if (metaTags.getRefresh()) {
      status.setMinorCode(ParseStatusCodes.SUCCESS_REDIRECT);
      status.addToArgs(new Utf8(metaTags.getRefreshHref().toString()));
      status.addToArgs(new Utf8(Integer.toString(metaTags.getRefreshTime())));
    }

    Parse parse = new Parse(text, title, outlinks, status);
    parse = htmlParseFilters.filter(url, page, parse, metaTags, root);
View Full Code Here

      if (LOG.isTraceEnabled()) {
        LOG.trace("found "+outlinks.length+" outlinks in "+ url);
      }
    }

    // Start from a new ParseStatus marked SUCCESS, so the redirect details added below are
    // recorded on an instance created for this parse.
    ParseStatus status = new ParseStatus();
    status.setMajorCode(ParseStatusCodes.SUCCESS);
    if (metaTags.getRefresh()) {
      // Refresh flagged in the meta tags: mark the status as a redirect and store the refresh
      // href and refresh time, both as strings, in the status arguments.
      status.setMinorCode(ParseStatusCodes.SUCCESS_REDIRECT);
      status.addToArgs(new Utf8(metaTags.getRefreshHref().toString()));
      status.addToArgs(new Utf8(Integer.toString(metaTags.getRefreshTime())));
    }

    Parse parse = new Parse(text, title, outlinks, status);
    parse = htmlParseFilters.filter(url, page, parse, metaTags, root);
View Full Code Here

          res.put(f, simpleMeta);
        } else if ("protocolStatus".equals(f)) {
          ProtocolStatus ps = page.getProtocolStatus();
          res.put(f, ProtocolStatusUtils.toString(ps));
        } else if ("parseStatus".equals(f)) {
          ParseStatus ps = page.getParseStatus();
          res.put(f, ParseStatusUtils.toString(ps));
        } else if ("signature".equals(f)) {
          ByteBuffer bb = page.getSignature();
          res.put(f, StringUtil.toHexString(bb.array()));
        } else if ("content".equals(f)) {
View Full Code Here

        return;
      }
     

      // Run the parser on the page, then read back the parse status it left there.
      parseUtil.process(key, page);
      ParseStatus pstatus = page.getParseStatus();
      // The status can be null after processing; count only when one is present.
      if (pstatus != null) {
        // Increment the "ParserStatus" counter whose name is looked up from the major code.
        context.getCounter("ParserStatus",
            ParseStatusCodes.majorCodes[pstatus.getMajorCode()]).increment(1);
      }

      context.write(key, page);
    }   
View Full Code Here

    if (outlinks.length == 0) {
      outlinks = OutlinkExtractor.getOutlinks(text, getConf());
    }

    ParseStatus status = ParseStatusUtils.STATUS_SUCCESS;
    if (metaTags.getRefresh()) {
      status.setMinorCode(ParseStatusCodes.SUCCESS_REDIRECT);
      status.addToArgs(new Utf8(metaTags.getRefreshHref().toString()));
      status.addToArgs(new Utf8(Integer.toString(metaTags.getRefreshTime())));
    }

    Parse parse = new Parse(text, title, outlinks, status);
    parse = htmlParseFilters.filter(url, page, parse, metaTags, root);
View Full Code Here

TOP

Related Classes of org.apache.nutch.storage.ParseStatus

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.