Package edu.uci.ics.crawler4j.fetcher

Examples of edu.uci.ics.crawler4j.fetcher.PageFetchResult


  }

  private Page download(String url) {
    WebURL curURL = new WebURL();
    curURL.setURL(url);
    PageFetchResult fetchResult = null;
    try {
      fetchResult = pageFetcher.fetchHeader(curURL);
      if (fetchResult.getStatusCode() == HttpStatus.SC_OK) {
        try {
          Page page = new Page(curURL);
          fetchResult.fetchContent(page);
          if (parser.parse(page, curURL.getURL())) {
            return page;
          }
        } catch (Exception e) {
          e.printStackTrace();
        }
      }
    } finally {
      fetchResult.discardContentIfNotConsumed();
    }
    return null;
  }
View Full Code Here


  private void processPage(WebURL curURL) {
    if (curURL == null) {
      return;
    }
    PageFetchResult fetchResult = null;
    try {
      fetchResult = pageFetcher.fetchHeader(curURL);
      int statusCode = fetchResult.getStatusCode();
      handlePageStatusCode(curURL, statusCode, CustomFetchStatus.getStatusDescription(statusCode));
      if (statusCode != HttpStatus.SC_OK) {
        if (statusCode == HttpStatus.SC_MOVED_PERMANENTLY || statusCode == HttpStatus.SC_MOVED_TEMPORARILY) {
          if (myController.getConfig().isFollowRedirects()) {
            String movedToUrl = fetchResult.getMovedToUrl();
            if (movedToUrl == null) {
              return;
            }
            int newDocId = docIdServer.getDocId(movedToUrl);
            if (newDocId > 0) {
              // Redirect page is already seen
              return;
            } else {
              WebURL webURL = new WebURL();
              webURL.setURL(movedToUrl);
              webURL.setParentDocid(curURL.getParentDocid());
              webURL.setParentUrl(curURL.getParentUrl());
              webURL.setDepth(curURL.getDepth());
              webURL.setDocid(-1);
              if (shouldVisit(webURL) && robotstxtServer.allows(webURL)) {
                webURL.setDocid(docIdServer.getNewDocID(movedToUrl));
                frontier.schedule(webURL);
              }
            }
          }
        } else if (fetchResult.getStatusCode() == CustomFetchStatus.PageTooBig) {
          logger.info("Skipping a page which was bigger than max allowed size: " + curURL.getURL());
        }
        return;
      }

      if (!curURL.getURL().equals(fetchResult.getFetchedUrl())) {
        if (docIdServer.isSeenBefore(fetchResult.getFetchedUrl())) {
          // Redirect page is already seen
          return;
        }
        curURL.setURL(fetchResult.getFetchedUrl());
        curURL.setDocid(docIdServer.getNewDocID(fetchResult.getFetchedUrl()));
      }

      Page page = new Page(curURL);
      int docid = curURL.getDocid();
      if (fetchResult.fetchContent(page) && parser.parse(page, curURL.getURL())) {
        ParseData parseData = page.getParseData();
        if (parseData instanceof HtmlParseData) {
          HtmlParseData htmlParseData = (HtmlParseData) parseData;

          List<WebURL> toSchedule = new ArrayList<WebURL>();
          int maxCrawlDepth = myController.getConfig().getMaxDepthOfCrawling();
          for (WebURL webURL : htmlParseData.getOutgoingUrls()) {
            webURL.setParentDocid(docid);
            webURL.setParentUrl(curURL.getURL());
            int newdocid = docIdServer.getDocId(webURL.getURL());
            if (newdocid > 0) {
              // This is not the first time that this Url is
              // visited. So, we set the depth to a negative
              // number.
              webURL.setDepth((short) -1);
              webURL.setDocid(newdocid);
            } else {
              webURL.setDocid(-1);
              webURL.setDepth((short) (curURL.getDepth() + 1));
              if (maxCrawlDepth == -1 || curURL.getDepth() < maxCrawlDepth) {
                if (shouldVisit(webURL) && robotstxtServer.allows(webURL)) {
                  webURL.setDocid(docIdServer.getNewDocID(webURL.getURL()));
                  toSchedule.add(webURL);
                }
              }
            }
          }
          frontier.scheduleAll(toSchedule);
        }
        visit(page);
      }
    } catch (Exception e) {
      e.printStackTrace();
      logger.error(e.getMessage() + ", while processing: " + curURL.getURL());
    } finally {
      if (fetchResult != null) {
        fetchResult.discardContentIfNotConsumed();
      }
    }
  }
View Full Code Here

  private HostDirectives fetchDirectives(String host) {
    WebURL robotsTxtUrl = new WebURL();
    robotsTxtUrl.setURL("http://" + host + "/robots.txt");
    HostDirectives directives = null;
    PageFetchResult fetchResult = null;
    try {
      fetchResult = pageFetcher.fetchHeader(robotsTxtUrl);
      if (fetchResult.getStatusCode() == HttpStatus.SC_OK) {
        Page page = new Page(robotsTxtUrl);
        fetchResult.fetchContent(page);
        if (Util.hasPlainTextContent(page.getContentType())) {
          try {
            String content;
            if (page.getContentCharset() == null) {
              content = new String(page.getContentData());
            } else {
              content = new String(page.getContentData(), page.getContentCharset());
            }
            directives = RobotstxtParser.parse(content, config.getUserAgentName());
          } catch (Exception e) {
            e.printStackTrace();
          }
        }
      }
    } finally {
      fetchResult.discardContentIfNotConsumed();
    }
    if (directives == null) {
      // We still need to have this object to keep track of the time we
      // fetched it
      directives = new HostDirectives();
View Full Code Here

    WebURL robotsTxtUrl = new WebURL();
    String host = getHost(url);
    String port = (url.getPort() == url.getDefaultPort() || url.getPort() == -1) ? "" : ":" + url.getPort();
    robotsTxtUrl.setURL("http://" + host + port + "/robots.txt");
    HostDirectives directives = null;
    PageFetchResult fetchResult = null;
    try {
      fetchResult = pageFetcher.fetchHeader(robotsTxtUrl);
      if (fetchResult.getStatusCode() == HttpStatus.SC_OK) {
        Page page = new Page(robotsTxtUrl);
        fetchResult.fetchContent(page);
        if (Util.hasPlainTextContent(page.getContentType())) {
          try {
            String content;
            if (page.getContentCharset() == null) {
              content = new String(page.getContentData());
            } else {
              content = new String(page.getContentData(), page.getContentCharset());
            }
            directives = RobotstxtParser.parse(content, config.getUserAgentName());
          } catch (Exception e) {
            e.printStackTrace();
          }
        }
      }
    } finally {
      if (fetchResult != null) {
        fetchResult.discardContentIfNotConsumed();
      }
    }
    if (directives == null) {
      // We still need to have this object to keep track of the time we
      // fetched it
View Full Code Here

  private void processPage(WebURL curURL) {
    if (curURL == null) {
      return;
    }
    PageFetchResult fetchResult = null;
    try {
      fetchResult = pageFetcher.fetchHeader(curURL);
      int statusCode = fetchResult.getStatusCode();
      handlePageStatusCode(curURL, statusCode, CustomFetchStatus.getStatusDescription(statusCode));
      if (statusCode != HttpStatus.SC_OK) {
        if (statusCode == HttpStatus.SC_MOVED_PERMANENTLY || statusCode == HttpStatus.SC_MOVED_TEMPORARILY) {
          if (myController.getConfig().isFollowRedirects()) {
            String movedToUrl = fetchResult.getMovedToUrl();
            if (movedToUrl == null) {
              return;
            }
            int newDocId = docIdServer.getDocId(movedToUrl);
            if (newDocId > 0) {
              // Redirect page is already seen
              return;
            }

            WebURL webURL = new WebURL();
            webURL.setURL(movedToUrl);
            webURL.setParentDocid(curURL.getParentDocid());
            webURL.setParentUrl(curURL.getParentUrl());
            webURL.setDepth(curURL.getDepth());
            webURL.setDocid(-1);
            webURL.setAnchor(curURL.getAnchor());
            if (shouldVisit(webURL) && robotstxtServer.allows(webURL)) {
              webURL.setDocid(docIdServer.getNewDocID(movedToUrl));
              frontier.schedule(webURL);
            }
          }
        } else if (fetchResult.getStatusCode() == CustomFetchStatus.PageTooBig) {
          logger.info("Skipping a page which was bigger than max allowed size: " + curURL.getURL());
        }
        return;
      }

      if (!curURL.getURL().equals(fetchResult.getFetchedUrl())) {
        if (docIdServer.isSeenBefore(fetchResult.getFetchedUrl())) {
          // Redirect page is already seen
          return;
        }
        curURL.setURL(fetchResult.getFetchedUrl());
        curURL.setDocid(docIdServer.getNewDocID(fetchResult.getFetchedUrl()));
      }

      Page page = new Page(curURL);
      int docid = curURL.getDocid();

      if (!fetchResult.fetchContent(page)) {
        onContentFetchError(curURL);
        return;
      }

      if (!parser.parse(page, curURL.getURL())) {
        onParseError(curURL);
        return;
      }

      ParseData parseData = page.getParseData();
      if (parseData instanceof HtmlParseData) {
        HtmlParseData htmlParseData = (HtmlParseData) parseData;

        List<WebURL> toSchedule = new ArrayList<>();
        int maxCrawlDepth = myController.getConfig().getMaxDepthOfCrawling();
        for (WebURL webURL : htmlParseData.getOutgoingUrls()) {
          webURL.setParentDocid(docid);
          webURL.setParentUrl(curURL.getURL());
          int newdocid = docIdServer.getDocId(webURL.getURL());
          if (newdocid > 0) {
            // This is not the first time that this Url is
            // visited. So, we set the depth to a negative
            // number.
            webURL.setDepth((short) -1);
            webURL.setDocid(newdocid);
          } else {
            webURL.setDocid(-1);
            webURL.setDepth((short) (curURL.getDepth() + 1));
            if (maxCrawlDepth == -1 || curURL.getDepth() < maxCrawlDepth) {
              if (shouldVisit(webURL) && robotstxtServer.allows(webURL)) {
                webURL.setDocid(docIdServer.getNewDocID(webURL.getURL()));
                toSchedule.add(webURL);
              }
            }
          }
        }
        frontier.scheduleAll(toSchedule);
      }
      try {
        visit(page);
      } catch (Exception e) {
        logger.error("Exception while running the visit method. Message: '" + e.getMessage() + "' at " + e.getStackTrace()[0]);
      }

    } catch (Exception e) {
      logger.error(e.getMessage() + ", while processing: " + curURL.getURL());
    } finally {
      if (fetchResult != null) {
        fetchResult.discardContentIfNotConsumed();
      }
    }
  }
View Full Code Here

  private HostDirectives fetchDirectives(String host) {
    WebURL robotsTxtUrl = new WebURL();
    robotsTxtUrl.setURL("http://" + host + "/robots.txt");
    HostDirectives directives = null;
    PageFetchResult fetchResult = null;
    try {
      fetchResult = pageFetcher.fetchHeader(robotsTxtUrl);
      if (fetchResult.getStatusCode() == HttpStatus.SC_OK) {
        Page page = new Page(robotsTxtUrl);
        fetchResult.fetchContent(page);
        if (Util.hasPlainTextContent(page.getContentType())) {
          try {
            String content;
            if (page.getContentCharset() == null) {
              content = new String(page.getContentData());
            } else {
              content = new String(page.getContentData(), page.getContentCharset());
            }
            directives = RobotstxtParser.parse(content, config.getUserAgentName());
          } catch (Exception e) {
            logger.error(e);
          }
        }
      }
    } finally {
      fetchResult.discardContentIfNotConsumed();
    }
    if (directives == null) {
      // We still need to have this object to keep track of the time we
      // fetched it
      directives = new HostDirectives();
View Full Code Here

  private void processPage(WebURL curURL) {
    if (curURL == null) {
      return;
    }
    PageFetchResult fetchResult = null;
    try {
      fetchResult = pageFetcher.fetchHeader(curURL);
      int statusCode = fetchResult.getStatusCode();
      handlePageStatusCode(curURL, statusCode, CustomFetchStatus.getStatusDescription(statusCode));
      if (statusCode != HttpStatus.SC_OK) {
        if (statusCode == HttpStatus.SC_MOVED_PERMANENTLY || statusCode == HttpStatus.SC_MOVED_TEMPORARILY) {
          if (myController.getConfig().isFollowRedirects()) {
            String movedToUrl = fetchResult.getMovedToUrl();
            if (movedToUrl == null) {
              return;
            }
            int newDocId = docIdServer.getDocId(movedToUrl);
            if (newDocId > 0) {
              // Redirect page is already seen
              return;
            } else {
              WebURL webURL = new WebURL();
              webURL.setURL(movedToUrl);
              webURL.setParentDocid(curURL.getParentDocid());
              webURL.setParentUrl(curURL.getParentUrl());
              webURL.setDepth(curURL.getDepth());
              webURL.setDocid(-1);
                            // TODO This only proceeds if the tracked Crawl-delay comes back as 0 (instantly ok to fetch)
                            // Need to augment scheduling logic to delay fetches into a time-based priority queue, or some such
                            if (shouldVisit(webURL) && robotstxtServer.allowedIn(webURL) == 0L) {
                webURL.setDocid(docIdServer.getNewDocID(movedToUrl));
                frontier.schedule(webURL);
              }
            }
          }
        } else if (fetchResult.getStatusCode() == CustomFetchStatus.PageTooBig) {
          logger.info("Skipping a page which was bigger than max allowed size: " + curURL.getURL());
        }
        return;
      }

      if (!curURL.getURL().equals(fetchResult.getFetchedUrl())) {
        if (docIdServer.isSeenBefore(fetchResult.getFetchedUrl())) {
          // Redirect page is already seen
          return;
        }
        curURL.setURL(fetchResult.getFetchedUrl());
        curURL.setDocid(docIdServer.getNewDocID(fetchResult.getFetchedUrl()));
      }

      Page page = new Page(curURL);
      int docid = curURL.getDocid();
      if (fetchResult.fetchContent(page) && parser.parse(page, curURL.getURL())) {
        ParseData parseData = page.getParseData();
        if (parseData instanceof HtmlParseData) {
          HtmlParseData htmlParseData = (HtmlParseData) parseData;

          List<WebURL> toSchedule = new ArrayList<WebURL>();
          int maxCrawlDepth = myController.getConfig().getMaxDepthOfCrawling();
          for (WebURL webURL : htmlParseData.getOutgoingUrls()) {
            webURL.setParentDocid(docid);
            webURL.setParentUrl(curURL.getURL());
            int newdocid = docIdServer.getDocId(webURL.getURL());
            if (newdocid > 0) {
              // This is not the first time that this Url is
              // visited. So, we set the depth to a negative
              // number.
              webURL.setDepth((short) -1);
              webURL.setDocid(newdocid);
            } else {
              webURL.setDocid(-1);
              webURL.setDepth((short) (curURL.getDepth() + 1));
              if (maxCrawlDepth == -1 || curURL.getDepth() < maxCrawlDepth) {
                                // TODO This only proceeds if the tracked Crawl-delay comes back as 0 (instantly ok to fetch)
                                // Need to augment scheduling logic to delay fetches into a time-based priority queue, or some such
                if (shouldVisit(webURL) && robotstxtServer.allowedIn(webURL) == 0L) {
                  webURL.setDocid(docIdServer.getNewDocID(webURL.getURL()));
                  toSchedule.add(webURL);
                }
              }
            }
          }
          frontier.scheduleAll(toSchedule);
        }
        visit(page);
      }
    } catch (Exception e) {
      e.printStackTrace();
      logger.error(e.getMessage() + ", while processing: " + curURL.getURL());
    } finally {
      if (fetchResult != null) {
        fetchResult.discardContentIfNotConsumed();
      }
    }
  }
View Full Code Here

TOP

Related Classes of edu.uci.ics.crawler4j.fetcher.PageFetchResult

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., and is owned by Oracle Inc. Contact coftware#gmail.com.