Package edu.uci.ics.crawler4j.url

Examples of edu.uci.ics.crawler4j.url.WebURL


            int newDocId = docIdServer.getDocId(movedToUrl);
            if (newDocId > 0) {
              // Redirect page is already seen
              return;
            } else {
              WebURL webURL = new WebURL();
              webURL.setURL(movedToUrl);
              webURL.setParentDocid(curURL.getParentDocid());
              webURL.setParentUrl(curURL.getParentUrl());
              webURL.setDepth(curURL.getDepth());
              webURL.setDocid(-1);
              // TODO: this only proceeds when the tracked Crawl-delay is 0 (immediately
              // OK to fetch); non-zero delays should instead be deferred, e.g. via a
              // time-based priority queue (see the sketch after this snippet).
              // allowedIn is assumed to return the delay in ms, or null when disallowed.
              Long crawlDelay = robotstxtServer.allowedIn(webURL);
              if (shouldVisit(webURL) && crawlDelay != null && crawlDelay == 0L) {
                webURL.setDocid(docIdServer.getNewDocID(movedToUrl));
                frontier.schedule(webURL);
              }
            }
          }
        } else if (fetchResult.getStatusCode() == CustomFetchStatus.PageTooBig) {
          logger.info("Skipping a page which was bigger than max allowed size: " + curURL.getURL());
        }
        return;
      }

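      // The fetcher may have followed redirects; if the final URL differs from
      // the one requested, switch to it (unless that URL was already seen).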
      if (!curURL.getURL().equals(fetchResult.getFetchedUrl())) {
        if (docIdServer.isSeenBefore(fetchResult.getFetchedUrl())) {
          // Redirect page is already seen
          return;
        }
        curURL.setURL(fetchResult.getFetchedUrl());
        curURL.setDocid(docIdServer.getNewDocID(fetchResult.getFetchedUrl()));
      }

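      // Download the content and parse it; only HTML pages yield the
      // outgoing links that are scheduled below.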
      Page page = new Page(curURL);
      int docid = curURL.getDocid();
      if (fetchResult.fetchContent(page) && parser.parse(page, curURL.getURL())) {
        ParseData parseData = page.getParseData();
        if (parseData instanceof HtmlParseData) {
          HtmlParseData htmlParseData = (HtmlParseData) parseData;

          List<WebURL> toSchedule = new ArrayList<WebURL>();
          int maxCrawlDepth = myController.getConfig().getMaxDepthOfCrawling();
          for (WebURL webURL : htmlParseData.getOutgoingUrls()) {
            webURL.setParentDocid(docid);
            webURL.setParentUrl(curURL.getURL());
            int newdocid = docIdServer.getDocId(webURL.getURL());
            if (newdocid > 0) {
              // This URL has been seen before; mark it with a negative
              // depth and reuse its existing docid.
              webURL.setDepth((short) -1);
              webURL.setDocid(newdocid);
            } else {
              webURL.setDocid(-1);
              webURL.setDepth((short) (curURL.getDepth() + 1));
              if (maxCrawlDepth == -1 || curURL.getDepth() < maxCrawlDepth) {
                // TODO: same Crawl-delay limitation as above; non-zero delays should be
                // deferred via the time-based priority queue sketched after this snippet.
                Long crawlDelay = robotstxtServer.allowedIn(webURL);
                if (shouldVisit(webURL) && crawlDelay != null && crawlDelay == 0L) {
                  webURL.setDocid(docIdServer.getNewDocID(webURL.getURL()));
                  toSchedule.add(webURL);
                }
              }
            }
          }
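The TODO above asks for a scheduler that honors non-zero Crawl-delay values instead of dropping those URLs. Below is a minimal sketch of that idea using java.util.concurrent.DelayQueue; DelayedFetch and DelayedScheduler are hypothetical names, and the handoff back to the frontier is an assumption, not crawler4j API.

import java.util.concurrent.DelayQueue;
import java.util.concurrent.Delayed;
import java.util.concurrent.TimeUnit;

import edu.uci.ics.crawler4j.url.WebURL;

// Hypothetical wrapper: a WebURL that becomes eligible for fetching
// once its host's robots.txt Crawl-delay has elapsed.
class DelayedFetch implements Delayed {
  final WebURL url;
  final long readyAtMillis;

  DelayedFetch(WebURL url, long crawlDelayMillis) {
    this.url = url;
    this.readyAtMillis = System.currentTimeMillis() + crawlDelayMillis;
  }

  public long getDelay(TimeUnit unit) {
    return unit.convert(readyAtMillis - System.currentTimeMillis(), TimeUnit.MILLISECONDS);
  }

  public int compareTo(Delayed other) {
    return Long.compare(getDelay(TimeUnit.MILLISECONDS), other.getDelay(TimeUnit.MILLISECONDS));
  }
}

// Sketch of the scheduling side: instead of skipping URLs whose
// Crawl-delay is non-zero, park them until they become ready.
class DelayedScheduler {
  private final DelayQueue<DelayedFetch> queue = new DelayQueue<DelayedFetch>();

  void schedule(WebURL url, long crawlDelayMillis) {
    queue.put(new DelayedFetch(url, crawlDelayMillis));
  }

  // Blocks until the head of the queue is ready to fetch.
  WebURL takeReady() throws InterruptedException {
    return queue.take().url;
  }
}

With something like this in place, the branch above could call scheduler.schedule(webURL, crawlDelay) for positive delays instead of silently dropping the URL.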


      } catch (Exception e) {
        logger.error("Could not add seed: " + e.getMessage());
      }
    }

    WebURL webUrl = new WebURL();
    webUrl.setURL(canonicalUrl);
    webUrl.setDocid(docId);
    webUrl.setDepth((short) 0);
    // allowedIn returns null when robots.txt disallows the URL (inferred contract)
    if (robotstxtServer.allowedIn(webUrl) == null) {
      logger.info("Robots.txt does not allow this seed: " + pageUrl);
    } else {
      frontier.schedule(webUrl);
    }
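The snippet above is the tail of an addSeed-style method. For reference, a typical launcher for stock crawler4j looks like the following (per the project README; MyCrawler stands in for a WebCrawler subclass, and this fork's modified RobotstxtServer may differ):

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;

public class CrawlLauncher {
  public static void main(String[] args) throws Exception {
    CrawlConfig config = new CrawlConfig();
    config.setCrawlStorageFolder("/tmp/crawler4j-root"); // placeholder path
    config.setMaxDepthOfCrawling(3); // the depth limit checked in the loop above

    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);

    // addSeed runs the docid and robots.txt checks shown in the snippet above
    controller.addSeed("http://www.ics.uci.edu/");
    controller.start(MyCrawler.class, 7); // MyCrawler: hypothetical WebCrawler subclass
  }
}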
