Package org.apache.nutch.crawl

Examples of org.apache.nutch.crawl.URLWebPage


      if (skipTruncated && isTruncated(unreverseKey, page)) {
        return;
      }
     

      URLWebPage redirectedPage = parseUtil.process(key, page);
      ParseStatus pstatus = page.getParseStatus();
      if (pstatus != null) {
        context.getCounter("ParserStatus",
            ParseStatusCodes.majorCodes[pstatus.getMajorCode()]).increment(1);
      }

      if (redirectedPage != null) {
        context.write(TableUtil.reverseUrl(redirectedPage.getUrl()),
                      redirectedPage.getDatum());
      }
      context.write(key, page);
    }   
View Full Code Here


      Mark.FETCH_MARK.putMark(fit.page, Mark.GENERATE_MARK.checkMark(fit.page));
      String key = TableUtil.reverseUrl(fit.url);

      if (parse) {
        if (!skipTruncated || (skipTruncated && !ParserJob.isTruncated(fit.url, fit.page))) {
          URLWebPage redirectedPage = parseUtil.process(key, fit.page);
          if (redirectedPage != null) {
            context.write(TableUtil.reverseUrl(redirectedPage.getUrl()),
                redirectedPage.getDatum());
          }
        }
      }
      context.write(key, fit.page);
    }
View Full Code Here

   * @param key
   * @param page
   * @return newly-discovered webpage (via a meta-redirect)
   */
  public URLWebPage process(String key, WebPage page) {
    URLWebPage redirectedPage = null;
    String url = TableUtil.unreverseUrl(key);
    byte status = (byte) page.getStatus();
    if (status != CrawlStatus.STATUS_FETCHED) {
      if (LOG.isDebugEnabled()) {
        LOG.debug("Skipping " + url + " as status is: " + CrawlStatus.getName(status));
      }
      return redirectedPage;
    }

    Parse parse;
    try {
      parse = parse(url, page);
    } catch (ParserNotFound e) {
      // do not print stacktrace for the fact that some types are not mapped.
      LOG.warn("No suitable parser found: " + e.getMessage());
      return redirectedPage;
    } catch (final Exception e) {
      LOG.warn("Error parsing: " + url + ": " + StringUtils.stringifyException(e));
      return redirectedPage;
    }

    if (parse == null) {
      return redirectedPage;
    }

    final byte[] signature = sig.calculate(page);

    org.apache.nutch.storage.ParseStatus pstatus = parse.getParseStatus();
    page.setParseStatus(pstatus);
    if (ParseStatusUtils.isSuccess(pstatus)) {
      if (pstatus.getMinorCode() == ParseStatusCodes.SUCCESS_REDIRECT) {
        String newUrl = ParseStatusUtils.getMessage(pstatus);
        int refreshTime = Integer.parseInt(ParseStatusUtils.getArg(pstatus, 1));
        try {
          newUrl = normalizers.normalize(newUrl, URLNormalizers.SCOPE_FETCHER);
          newUrl = filters.filter(newUrl);
        } catch (URLFilterException e) {
          return redirectedPage; // TODO: is this correct
        } catch (MalformedURLException e) {
          return redirectedPage;
        }
        if (newUrl == null || newUrl.equals(url)) {
          String reprUrl = URLUtil.chooseRepr(url, newUrl,
              refreshTime < FetcherJob.PERM_REFRESH_TIME);
          WebPage newWebPage = new WebPage();
          if (reprUrl == null) {
            LOG.warn("reprUrl==null for " + url);
            return redirectedPage;
          } else {
            page.setReprUrl(new Utf8(reprUrl));
          }
          page.putToMetadata(FetcherJob.REDIRECT_DISCOVERED, TableUtil.YES_VAL);
          redirectedPage = new URLWebPage(reprUrl, newWebPage);
        }
      } else {
        page.setText(new Utf8(parse.getText()));
        page.setTitle(new Utf8(parse.getTitle()));
        ByteBuffer prevSig = page.getSignature();
View Full Code Here

        continue;

      if (requiredMark != null && requiredMark.checkMark(page) == null)
        continue;

      l.add(new URLWebPage(TableUtil.unreverseUrl(url), (WebPage)page.clone()));
    }

    return l;
  }
View Full Code Here

          continue;

        if (requiredMark != null && requiredMark.checkMark(page) == null)
          continue;

        l.add(new URLWebPage(TableUtil.unreverseUrl(url), (WebPage)page.clone()));
      } catch (Exception e) {
        e.printStackTrace();
      }
    }
    return l;
View Full Code Here

TOP

Related Classes of org.apache.nutch.crawl.URLWebPage

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.