Examples of WebURL

  • edu.uci.ics.crawler4j.url.WebURL
    @author Yasser Ganjisaffar
  • org.apache.manifoldcf.crawler.connectors.rss.WebURL
    Replacement class for java.net.URI, which is broken in many ways.
  • org.apache.manifoldcf.crawler.connectors.webcrawler.WebURL
    Replacement class for java.net.URI, which is broken in many ways.
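
  The crawler4j examples below all revolve around the WebURL bean: callers populate it with
  setters (setURL, setDocid, setDepth, setParentDocid, setParentUrl, setAnchor, setPriority) and
  later read the same fields back. A hedged sketch of the read side, as it might appear inside a
  WebCrawler.visit(Page) callback, assuming Page exposes its WebURL via getWebURL() and that the
  getters mirror the setters used in the snippets:

      // Hedged sketch: inspecting the WebURL attached to a fetched page.
      // Getters are assumed to mirror the setters used in the snippets below.
      public void visit(Page page) {
        WebURL url = page.getWebURL();
        System.out.println("URL:       " + url.getURL());
        System.out.println("Doc id:    " + url.getDocid());
        System.out.println("Parent id: " + url.getParentDocid());
        System.out.println("Parent:    " + url.getParentUrl());
        System.out.println("Depth:     " + url.getDepth());
        System.out.println("Anchor:    " + url.getAnchor());
      }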

  • Examples of edu.uci.ics.crawler4j.url.WebURL

            hrefWithoutProtocol = href.substring(7); // strips the leading 7-character protocol prefix ("http://")
          }
          if (!hrefWithoutProtocol.contains("javascript:") && !hrefWithoutProtocol.contains("@")) {
            String url = URLCanonicalizer.getCanonicalURL(href, contextURL);
            if (url != null) {
              WebURL webURL = new WebURL();
              webURL.setURL(url);
              webURL.setAnchor(urlAnchorPair.getAnchor());
              outgoingUrls.add(webURL);
              urlCount++;
              if (urlCount > config.getMaxOutgoingLinksToFollow()) {
                break;
              }
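
    For context, the two-argument URLCanonicalizer.getCanonicalURL call above resolves a possibly
    relative href against the page it was found on and normalizes it before a WebURL is built. A
    hedged usage sketch with illustrative values (the exact normalization rules are those of
    crawler4j's URLCanonicalizer):

        String contextURL = "http://www.example.com/dir/page.html";   // page the link was found on
        String canonical = URLCanonicalizer.getCanonicalURL("../other.html", contextURL);
        if (canonical != null) {   // null is returned for hrefs that cannot be canonicalized, as checked above
          WebURL link = new WebURL();
          link.setURL(canonical);
          link.setAnchor("example anchor text");
        }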

    Examples of edu.uci.ics.crawler4j.url.WebURL

          } catch (Exception e) {
            logger.error("Could not add seed: " + e.getMessage());
          }
        }

        WebURL webUrl = new WebURL();
        webUrl.setURL(canonicalUrl);
        webUrl.setDocid(docId);
        webUrl.setDepth((short) 0);
        if (!robotstxtServer.allows(webUrl)) {
          logger.info("Robots.txt does not allow this seed: " + pageUrl);
        } else {
          frontier.schedule(webUrl);
        }
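
    The excerpt above appears to come from seed registration inside crawler4j's CrawlController.
    A typical caller reaches it through addSeed, roughly as follows; the storage path and crawler
    class name are illustrative, and the setup shown is the usual crawler4j configuration rather
    than code taken from this source:

        public static void main(String[] args) throws Exception {
          CrawlConfig config = new CrawlConfig();
          config.setCrawlStorageFolder("/tmp/crawler4j");              // illustrative storage path
          PageFetcher pageFetcher = new PageFetcher(config);
          RobotstxtServer robotstxtServer = new RobotstxtServer(new RobotstxtConfig(), pageFetcher);
          CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
          controller.addSeed("http://www.example.com/");               // runs the seed logic shown above
          controller.start(MyCrawler.class, 8);                        // MyCrawler extends WebCrawler
        }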

    Examples of edu.uci.ics.crawler4j.url.WebURL

        }
        return true;
      }

      private HostDirectives fetchDirectives(String host) {
        WebURL robotsTxtUrl = new WebURL();
        robotsTxtUrl.setURL("http://" + host + "/robots.txt");
        HostDirectives directives = null;
        PageFetchResult fetchResult = null;
        try {
          fetchResult = pageFetcher.fetchHeader(robotsTxtUrl);
          if (fetchResult.getStatusCode() == HttpStatus.SC_OK) {

    Examples of edu.uci.ics.crawler4j.url.WebURL

        }
        return true;
      }

      private HostDirectives fetchDirectives(URL url) {
        WebURL robotsTxtUrl = new WebURL();
        String host = getHost(url);
        String port = (url.getPort() == url.getDefaultPort() || url.getPort() == -1) ? "" : ":" + url.getPort();
        robotsTxtUrl.setURL("http://" + host + port + "/robots.txt");
        HostDirectives directives = null;
        PageFetchResult fetchResult = null;
        try {
          fetchResult = pageFetcher.fetchHeader(robotsTxtUrl);
          if (fetchResult.getStatusCode() == HttpStatus.SC_OK) {

    Examples of edu.uci.ics.crawler4j.url.WebURL

    */
    public class WebURLTupleBinding extends TupleBinding<WebURL> {

      @Override
      public WebURL entryToObject(TupleInput input) {
        WebURL webURL = new WebURL();
        webURL.setURL(input.readString());
        webURL.setDocid(input.readInt());
        webURL.setParentDocid(input.readInt());
        webURL.setParentUrl(input.readString());
        webURL.setDepth(input.readShort());
        webURL.setPriority(input.readByte());
        webURL.setAnchor(input.readString());
        return webURL;
      }
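
    The binding excerpt above shows only the read side. A write-side counterpart, sketched here on
    the assumption that objectToEntry mirrors the field order read in entryToObject and that WebURL
    exposes matching getters, would look roughly like this:

        // Hypothetical write side of WebURLTupleBinding; field order assumed to mirror entryToObject.
        @Override
        public void objectToEntry(WebURL webURL, TupleOutput output) {
          output.writeString(webURL.getURL());
          output.writeInt(webURL.getDocid());
          output.writeInt(webURL.getParentDocid());
          output.writeString(webURL.getParentUrl());
          output.writeShort(webURL.getDepth());
          output.writeByte(webURL.getPriority());
          output.writeString(webURL.getAnchor());
        }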

    Examples of edu.uci.ics.crawler4j.url.WebURL

                if (newDocId > 0) {
                  // Redirect page is already seen
                  return;
                }

                WebURL webURL = new WebURL();
                webURL.setURL(movedToUrl);
                webURL.setParentDocid(curURL.getParentDocid());
                webURL.setParentUrl(curURL.getParentUrl());
                webURL.setDepth(curURL.getDepth());
                webURL.setDocid(-1);
                webURL.setAnchor(curURL.getAnchor());
                if (shouldVisit(webURL) && robotstxtServer.allows(webURL)) {
                  webURL.setDocid(docIdServer.getNewDocID(movedToUrl));
                  frontier.schedule(webURL);
                }
              }
            } else if (fetchResult.getStatusCode() == CustomFetchStatus.PageTooBig) {
              logger.info("Skipping a page which was bigger than max allowed size: " + curURL.getURL());
            }
            return;
          }

          if (!curURL.getURL().equals(fetchResult.getFetchedUrl())) {
            if (docIdServer.isSeenBefore(fetchResult.getFetchedUrl())) {
              // Redirect page is already seen
              return;
            }
            curURL.setURL(fetchResult.getFetchedUrl());
            curURL.setDocid(docIdServer.getNewDocID(fetchResult.getFetchedUrl()));
          }

          Page page = new Page(curURL);
          int docid = curURL.getDocid();

          if (!fetchResult.fetchContent(page)) {
            onContentFetchError(curURL);
            return;
          }

          if (!parser.parse(page, curURL.getURL())) {
            onParseError(curURL);
            return;
          }

          ParseData parseData = page.getParseData();
          if (parseData instanceof HtmlParseData) {
            HtmlParseData htmlParseData = (HtmlParseData) parseData;

            List<WebURL> toSchedule = new ArrayList<>();
            int maxCrawlDepth = myController.getConfig().getMaxDepthOfCrawling();
            for (WebURL webURL : htmlParseData.getOutgoingUrls()) {
              webURL.setParentDocid(docid);
              webURL.setParentUrl(curURL.getURL());
              int newdocid = docIdServer.getDocId(webURL.getURL());
              if (newdocid > 0) {
                // This is not the first time that this Url is
                // visited. So, we set the depth to a negative
                // number.
                webURL.setDepth((short) -1);
                webURL.setDocid(newdocid);
              } else {
                webURL.setDocid(-1);
                webURL.setDepth((short) (curURL.getDepth() + 1));
                if (maxCrawlDepth == -1 || curURL.getDepth() < maxCrawlDepth) {
                  if (shouldVisit(webURL) && robotstxtServer.allows(webURL)) {
                    webURL.setDocid(docIdServer.getNewDocID(webURL.getURL()));
                    toSchedule.add(webURL);
                  }
                }
              }
            }
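
    The processing loop above defers to shouldVisit(webURL) before assigning a document id and
    scheduling the link. That filter is normally supplied by a WebCrawler subclass; a minimal
    hedged sketch (the MyCrawler name, domain and extension pattern are illustrative, not taken
    from this source):

        import java.util.regex.Pattern;

        public class MyCrawler extends WebCrawler {
          private static final Pattern BINARY =
              Pattern.compile(".*\\.(css|js|gif|jpe?g|png|pdf|zip)$");

          @Override
          public boolean shouldVisit(WebURL url) {
            String href = url.getURL().toLowerCase();
            // Stay on one site and skip obvious non-HTML resources.
            return href.startsWith("http://www.example.com/") && !BINARY.matcher(href).matches();
          }

          @Override
          public void visit(Page page) {
            // Handle the fetched page, e.g. inspect page.getParseData() as in the excerpt above.
          }
        }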

    Examples of edu.uci.ics.crawler4j.url.WebURL

          if (!hrefWithoutProtocol.contains("javascript:")
              && !hrefWithoutProtocol.contains("mailto:")
              && !hrefWithoutProtocol.contains("@")) {
            String url = URLCanonicalizer.getCanonicalURL(href, contextURL);
            if (url != null) {
              WebURL webURL = new WebURL();
              webURL.setURL(url);
              webURL.setAnchor(urlAnchorPair.getAnchor());
              outgoingUrls.add(webURL);
              urlCount++;
              if (urlCount > config.getMaxOutgoingLinksToFollow()) {
                break;
              }

    Examples of edu.uci.ics.crawler4j.url.WebURL

            }
            return 0L;
        }

      private HostDirectives fetchDirectives(String host) {
        WebURL robotsTxtUrl = new WebURL();
        robotsTxtUrl.setURL("http://" + host + "/robots.txt");
        HostDirectives directives = null;
        PageFetchResult fetchResult = null;
        try {
          fetchResult = pageFetcher.fetchHeader(robotsTxtUrl);
          if (fetchResult.getStatusCode() == HttpStatus.SC_OK) {

    Examples of edu.uci.ics.crawler4j.url.WebURL

                int newDocId = docIdServer.getDocId(movedToUrl);
                if (newDocId > 0) {
                  // Redirect page is already seen
                  return;
                } else {
                  WebURL webURL = new WebURL();
                  webURL.setURL(movedToUrl);
                  webURL.setParentDocid(curURL.getParentDocid());
                  webURL.setParentUrl(curURL.getParentUrl());
                  webURL.setDepth(curURL.getDepth());
                  webURL.setDocid(-1);
                  // TODO This only proceeds if the tracked Crawl-delay comes back as 0 (instantly ok to fetch)
                  // Need to augment scheduling logic to delay fetches into a time-based priority queue, or some such
                  if (shouldVisit(webURL) && robotstxtServer.allowedIn(webURL) == 0L) {
                    webURL.setDocid(docIdServer.getNewDocID(movedToUrl));
                    frontier.schedule(webURL);
                  }
                }
              }
            } else if (fetchResult.getStatusCode() == CustomFetchStatus.PageTooBig) {
              logger.info("Skipping a page which was bigger than max allowed size: " + curURL.getURL());
            }
            return;
          }

          if (!curURL.getURL().equals(fetchResult.getFetchedUrl())) {
            if (docIdServer.isSeenBefore(fetchResult.getFetchedUrl())) {
              // Redirect page is already seen
              return;
            }
            curURL.setURL(fetchResult.getFetchedUrl());
            curURL.setDocid(docIdServer.getNewDocID(fetchResult.getFetchedUrl()));
          }

          Page page = new Page(curURL);
          int docid = curURL.getDocid();
          if (fetchResult.fetchContent(page) && parser.parse(page, curURL.getURL())) {
            ParseData parseData = page.getParseData();
            if (parseData instanceof HtmlParseData) {
              HtmlParseData htmlParseData = (HtmlParseData) parseData;

              List<WebURL> toSchedule = new ArrayList<WebURL>();
              int maxCrawlDepth = myController.getConfig().getMaxDepthOfCrawling();
              for (WebURL webURL : htmlParseData.getOutgoingUrls()) {
                webURL.setParentDocid(docid);
                webURL.setParentUrl(curURL.getURL());
                int newdocid = docIdServer.getDocId(webURL.getURL());
                if (newdocid > 0) {
                  // This is not the first time that this Url is
                  // visited. So, we set the depth to a negative
                  // number.
                  webURL.setDepth((short) -1);
                  webURL.setDocid(newdocid);
                } else {
                  webURL.setDocid(-1);
                  webURL.setDepth((short) (curURL.getDepth() + 1));
                  if (maxCrawlDepth == -1 || curURL.getDepth() < maxCrawlDepth) {
                    // TODO This only proceeds if the tracked Crawl-delay comes back as 0 (instantly ok to fetch)
                    // Need to augment scheduling logic to delay fetches into a time-based priority queue, or some such
                    if (shouldVisit(webURL) && robotstxtServer.allowedIn(webURL) == 0L) {
                      webURL.setDocid(docIdServer.getNewDocID(webURL.getURL()));
                      toSchedule.add(webURL);
                    }
                  }
                }
              }
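
    The TODO comments above note that a URL is currently dropped unless allowedIn() reports a
    crawl-delay of 0, and suggest parking delayed URLs in a time-based priority queue instead.
    One possible sketch of that idea, not taken from this source, assuming allowedIn() returns the
    remaining delay in milliseconds (the DelayedUrl wrapper and delayedUrls queue are hypothetical):

        import java.util.concurrent.DelayQueue;
        import java.util.concurrent.Delayed;
        import java.util.concurrent.TimeUnit;

        // Hypothetical wrapper that makes a WebURL eligible for fetching only after its crawl-delay.
        class DelayedUrl implements Delayed {
          final WebURL url;
          final long readyAtMillis;

          DelayedUrl(WebURL url, long delayMillis) {
            this.url = url;
            this.readyAtMillis = System.currentTimeMillis() + delayMillis;
          }

          @Override
          public long getDelay(TimeUnit unit) {
            return unit.convert(readyAtMillis - System.currentTimeMillis(), TimeUnit.MILLISECONDS);
          }

          @Override
          public int compareTo(Delayed other) {
            return Long.compare(getDelay(TimeUnit.MILLISECONDS), other.getDelay(TimeUnit.MILLISECONDS));
          }
        }

        // Where the excerpt drops a URL whose delay has not elapsed, it could be parked instead:
        DelayQueue<DelayedUrl> delayedUrls = new DelayQueue<>();
        long wait = robotstxtServer.allowedIn(webURL);
        if (wait == 0L) {
          frontier.schedule(webURL);
        } else if (wait > 0L) {
          delayedUrls.put(new DelayedUrl(webURL, wait));
        }

        // A worker thread then schedules each entry once its delay expires
        // (take() blocks until the head entry is due and throws InterruptedException):
        frontier.schedule(delayedUrls.take().url);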