Package edu.uci.ics.crawler4j.url

Examples of edu.uci.ics.crawler4j.url.WebURL


            int newDocId = docIdServer.getDocId(movedToUrl);
            if (newDocId > 0) {
              // Redirect page is already seen
              return;
            } else {
              WebURL webURL = new WebURL();
              webURL.setURL(movedToUrl);
              webURL.setParentDocid(curURL.getParentDocid());
              webURL.setParentUrl(curURL.getParentUrl());
              webURL.setDepth(curURL.getDepth());
              webURL.setDocid(-1);
              if (shouldVisit(webURL) && robotstxtServer.allows(webURL)) {
                webURL.setDocid(docIdServer.getNewDocID(movedToUrl));
                frontier.schedule(webURL);
              }
            }
          }
        } else if (fetchResult.getStatusCode() == CustomFetchStatus.PageTooBig) {
          logger.info("Skipping a page which was bigger than max allowed size: " + curURL.getURL());
        }
        return;
      }

      if (!curURL.getURL().equals(fetchResult.getFetchedUrl())) {
        if (docIdServer.isSeenBefore(fetchResult.getFetchedUrl())) {
          // Redirect page is already seen
          return;
        }
        curURL.setURL(fetchResult.getFetchedUrl());
        curURL.setDocid(docIdServer.getNewDocID(fetchResult.getFetchedUrl()));
      }

      Page page = new Page(curURL);
      int docid = curURL.getDocid();
      if (fetchResult.fetchContent(page) && parser.parse(page, curURL.getURL())) {
        ParseData parseData = page.getParseData();
        if (parseData instanceof HtmlParseData) {
          HtmlParseData htmlParseData = (HtmlParseData) parseData;

          List<WebURL> toSchedule = new ArrayList<WebURL>();
          int maxCrawlDepth = myController.getConfig().getMaxDepthOfCrawling();
          for (WebURL webURL : htmlParseData.getOutgoingUrls()) {
            webURL.setParentDocid(docid);
            webURL.setParentUrl(curURL.getURL());
            int newdocid = docIdServer.getDocId(webURL.getURL());
            if (newdocid > 0) {
              // This is not the first time that this Url is
              // visited. So, we set the depth to a negative
              // number.
              webURL.setDepth((short) -1);
              webURL.setDocid(newdocid);
            } else {
              webURL.setDocid(-1);
              webURL.setDepth((short) (curURL.getDepth() + 1));
              if (maxCrawlDepth == -1 || curURL.getDepth() < maxCrawlDepth) {
                if (shouldVisit(webURL) && robotstxtServer.allows(webURL)) {
                  webURL.setDocid(docIdServer.getNewDocID(webURL.getURL()));
                  toSchedule.add(webURL);
                }
              }
            }
          }
View Full Code Here


        hrefWithoutProtocol = href.substring(7);
      }
      if (!hrefWithoutProtocol.contains("javascript:") && !hrefWithoutProtocol.contains("@")) {
        String url = URLCanonicalizer.getCanonicalURL(href, contextURL);
        if (url != null) {
          WebURL webURL = new WebURL();
          webURL.setURL(url);
          webURL.setAnchor(urlAnchorPair.getAnchor());
          outgoingUrls.add(webURL);
          urlCount++;
          if (urlCount > config.getMaxOutgoingLinksToFollow()) {
            break;
          }
View Full Code Here

      } catch (Exception e) {
        logger.error("Could not add seed: " + e.getMessage());
      }
    }

    WebURL webUrl = new WebURL();
    webUrl.setURL(canonicalUrl);
    webUrl.setDocid(docId);
    webUrl.setDepth((short) 0);
    if (!robotstxtServer.allows(webUrl)) {
      logger.info("Robots.txt does not allow this seed: " + pageUrl);
    } else {
      frontier.schedule(webUrl);
    }
View Full Code Here

    }
    return true;
  }

  private HostDirectives fetchDirectives(String host) {
    WebURL robotsTxtUrl = new WebURL();
    robotsTxtUrl.setURL("http://" + host + "/robots.txt");
    HostDirectives directives = null;
    PageFetchResult fetchResult = null;
    try {
      fetchResult = pageFetcher.fetchHeader(robotsTxtUrl);
      if (fetchResult.getStatusCode() == HttpStatus.SC_OK) {
View Full Code Here

    }
    return true;
  }

  private HostDirectives fetchDirectives(URL url) {
    WebURL robotsTxtUrl = new WebURL();
    String host = getHost(url);
    String port = (url.getPort() == url.getDefaultPort() || url.getPort() == -1) ? "" : ":" + url.getPort();
    robotsTxtUrl.setURL("http://" + host + port + "/robots.txt");
    HostDirectives directives = null;
    PageFetchResult fetchResult = null;
    try {
      fetchResult = pageFetcher.fetchHeader(robotsTxtUrl);
      if (fetchResult.getStatusCode() == HttpStatus.SC_OK) {
View Full Code Here

*/
public class WebURLTupleBinding extends TupleBinding<WebURL> {

  @Override
  public WebURL entryToObject(TupleInput input) {
    WebURL webURL = new WebURL();
    webURL.setURL(input.readString());
    webURL.setDocid(input.readInt());
    webURL.setParentDocid(input.readInt());
    webURL.setParentUrl(input.readString());
    webURL.setDepth(input.readShort());
    webURL.setPriority(input.readByte());
    webURL.setAnchor(input.readString());
    return webURL;
  }
View Full Code Here

            if (newDocId > 0) {
              // Redirect page is already seen
              return;
            }

            WebURL webURL = new WebURL();
            webURL.setURL(movedToUrl);
            webURL.setParentDocid(curURL.getParentDocid());
            webURL.setParentUrl(curURL.getParentUrl());
            webURL.setDepth(curURL.getDepth());
            webURL.setDocid(-1);
            webURL.setAnchor(curURL.getAnchor());
            if (shouldVisit(webURL) && robotstxtServer.allows(webURL)) {
              webURL.setDocid(docIdServer.getNewDocID(movedToUrl));
              frontier.schedule(webURL);
            }
          }
        } else if (fetchResult.getStatusCode() == CustomFetchStatus.PageTooBig) {
          logger.info("Skipping a page which was bigger than max allowed size: " + curURL.getURL());
        }
        return;
      }

      if (!curURL.getURL().equals(fetchResult.getFetchedUrl())) {
        if (docIdServer.isSeenBefore(fetchResult.getFetchedUrl())) {
          // Redirect page is already seen
          return;
        }
        curURL.setURL(fetchResult.getFetchedUrl());
        curURL.setDocid(docIdServer.getNewDocID(fetchResult.getFetchedUrl()));
      }

      Page page = new Page(curURL);
      int docid = curURL.getDocid();

      if (!fetchResult.fetchContent(page)) {
        onContentFetchError(curURL);
        return;
      }

      if (!parser.parse(page, curURL.getURL())) {
        onParseError(curURL);
        return;
      }

      ParseData parseData = page.getParseData();
      if (parseData instanceof HtmlParseData) {
        HtmlParseData htmlParseData = (HtmlParseData) parseData;

        List<WebURL> toSchedule = new ArrayList<>();
        int maxCrawlDepth = myController.getConfig().getMaxDepthOfCrawling();
        for (WebURL webURL : htmlParseData.getOutgoingUrls()) {
          webURL.setParentDocid(docid);
          webURL.setParentUrl(curURL.getURL());
          int newdocid = docIdServer.getDocId(webURL.getURL());
          if (newdocid > 0) {
            // This is not the first time that this Url is
            // visited. So, we set the depth to a negative
            // number.
            webURL.setDepth((short) -1);
            webURL.setDocid(newdocid);
          } else {
            webURL.setDocid(-1);
            webURL.setDepth((short) (curURL.getDepth() + 1));
            if (maxCrawlDepth == -1 || curURL.getDepth() < maxCrawlDepth) {
              if (shouldVisit(webURL) && robotstxtServer.allows(webURL)) {
                webURL.setDocid(docIdServer.getNewDocID(webURL.getURL()));
                toSchedule.add(webURL);
              }
            }
          }
        }
View Full Code Here

      if (!hrefWithoutProtocol.contains("javascript:")
          && !hrefWithoutProtocol.contains("mailto:")
          && !hrefWithoutProtocol.contains("@")) {
        String url = URLCanonicalizer.getCanonicalURL(href, contextURL);
        if (url != null) {
          WebURL webURL = new WebURL();
          webURL.setURL(url);
          webURL.setAnchor(urlAnchorPair.getAnchor());
          outgoingUrls.add(webURL);
          urlCount++;
          if (urlCount > config.getMaxOutgoingLinksToFollow()) {
            break;
          }
View Full Code Here

      } catch (Exception e) {
        logger.error("Could not add seed: " + e.getMessage());
      }
    }

    WebURL webUrl = new WebURL();
    webUrl.setURL(canonicalUrl);
    webUrl.setDocid(docId);
    webUrl.setDepth((short) 0);
    if (!robotstxtServer.allows(webUrl)) {
      logger.info("Robots.txt does not allow this seed: " + pageUrl);
    } else {
      frontier.schedule(webUrl);
    }
View Full Code Here

        }
        return 0L;
    }

  private HostDirectives fetchDirectives(String host) {
    WebURL robotsTxtUrl = new WebURL();
    robotsTxtUrl.setURL("http://" + host + "/robots.txt");
    HostDirectives directives = null;
    PageFetchResult fetchResult = null;
    try {
      fetchResult = pageFetcher.fetchHeader(robotsTxtUrl);
      if (fetchResult.getStatusCode() == HttpStatus.SC_OK) {
View Full Code Here

TOP

Related Classes of edu.uci.ics.crawler4j.url.WebURL

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.