Examples of WebURL

  • edu.uci.ics.crawler4j.url.WebURL
    @author Yasser Ganjisaffar
  • org.apache.manifoldcf.crawler.connectors.rss.WebURL
    Replacement class for java.net.URI, which is broken in many ways.
  • org.apache.manifoldcf.crawler.connectors.webcrawler.WebURL
    Replacement class for java.net.URI, which is broken in many ways.
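
  The crawler4j examples below all revolve around the WebURL bean: callers populate it with
  setters (setURL, setDocid, setDepth, setParentDocid, setParentUrl, setAnchor, setPriority) and
  later read the same fields back. A hedged sketch of the read side, as it might appear inside a
  WebCrawler.visit(Page) callback, assuming Page exposes its WebURL via getWebURL() and that the
  getters mirror the setters used in the snippets:

      // Hedged sketch: inspecting the WebURL attached to a fetched page.
      // Getters are assumed to mirror the setters used in the snippets below.
      public void visit(Page page) {
        WebURL url = page.getWebURL();
        System.out.println("URL:       " + url.getURL());
        System.out.println("Doc id:    " + url.getDocid());
        System.out.println("Parent id: " + url.getParentDocid());
        System.out.println("Parent:    " + url.getParentUrl());
        System.out.println("Depth:     " + url.getDepth());
        System.out.println("Anchor:    " + url.getAnchor());
      }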

  • Examples of edu.uci.ics.crawler4j.url.WebURL

            hrefWithoutProtocol = href.substring(7); // strips the leading 7-character protocol prefix ("http://")
          }
          if (!hrefWithoutProtocol.contains("javascript:") && !hrefWithoutProtocol.contains("@")) {
            String url = URLCanonicalizer.getCanonicalURL(href, contextURL);
            if (url != null) {
              WebURL webURL = new WebURL();
              webURL.setURL(url);
              webURL.setAnchor(urlAnchorPair.getAnchor());
              outgoingUrls.add(webURL);
              urlCount++;
              if (urlCount > config.getMaxOutgoingLinksToFollow()) {
                break;
              }
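
    For context, the two-argument URLCanonicalizer.getCanonicalURL call above resolves a possibly
    relative href against the page it was found on and normalizes it before a WebURL is built. A
    hedged usage sketch with illustrative values (the exact normalization rules are those of
    crawler4j's URLCanonicalizer):

        String contextURL = "http://www.example.com/dir/page.html";   // page the link was found on
        String canonical = URLCanonicalizer.getCanonicalURL("../other.html", contextURL);
        if (canonical != null) {   // null is returned for hrefs that cannot be canonicalized, as checked above
          WebURL link = new WebURL();
          link.setURL(canonical);
          link.setAnchor("example anchor text");
        }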

    Examples of edu.uci.ics.crawler4j.url.WebURL

          } catch (Exception e) {
            logger.error("Could not add seed: " + e.getMessage());
          }
        }

        WebURL webUrl = new WebURL();
        webUrl.setURL(canonicalUrl);
        webUrl.setDocid(docId);
        webUrl.setDepth((short) 0);
        if (!robotstxtServer.allows(webUrl)) {
          logger.info("Robots.txt does not allow this seed: " + pageUrl);
        } else {
          frontier.schedule(webUrl);
        }
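
    The excerpt above appears to come from seed registration inside crawler4j's CrawlController.
    A typical caller reaches it through addSeed, roughly as follows; the storage path and crawler
    class name are illustrative, and the setup shown is the usual crawler4j configuration rather
    than code taken from this source:

        public static void main(String[] args) throws Exception {
          CrawlConfig config = new CrawlConfig();
          config.setCrawlStorageFolder("/tmp/crawler4j");              // illustrative storage path
          PageFetcher pageFetcher = new PageFetcher(config);
          RobotstxtServer robotstxtServer = new RobotstxtServer(new RobotstxtConfig(), pageFetcher);
          CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
          controller.addSeed("http://www.example.com/");               // runs the seed logic shown above
          controller.start(MyCrawler.class, 8);                        // MyCrawler extends WebCrawler
        }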

    Examples of edu.uci.ics.crawler4j.url.WebURL

        }
        return true;
      }

      private HostDirectives fetchDirectives(String host) {
        WebURL robotsTxtUrl = new WebURL();
        robotsTxtUrl.setURL("http://" + host + "/robots.txt");
        HostDirectives directives = null;
        PageFetchResult fetchResult = null;
        try {
          fetchResult = pageFetcher.fetchHeader(robotsTxtUrl);
          if (fetchResult.getStatusCode() == HttpStatus.SC_OK) {

    Examples of edu.uci.ics.crawler4j.url.WebURL

        }
        return true;
      }

      private HostDirectives fetchDirectives(URL url) {
        WebURL robotsTxtUrl = new WebURL();
        String host = getHost(url);
        String port = (url.getPort() == url.getDefaultPort() || url.getPort() == -1) ? "" : ":" + url.getPort();
        robotsTxtUrl.setURL("http://" + host + port + "/robots.txt");
        HostDirectives directives = null;
        PageFetchResult fetchResult = null;
        try {
          fetchResult = pageFetcher.fetchHeader(robotsTxtUrl);
          if (fetchResult.getStatusCode() == HttpStatus.SC_OK) {

    Examples of edu.uci.ics.crawler4j.url.WebURL

    */
    public class WebURLTupleBinding extends TupleBinding<WebURL> {

      @Override
      public WebURL entryToObject(TupleInput input) {
        WebURL webURL = new WebURL();
        webURL.setURL(input.readString());
        webURL.setDocid(input.readInt());
        webURL.setParentDocid(input.readInt());
        webURL.setParentUrl(input.readString());
        webURL.setDepth(input.readShort());
        webURL.setPriority(input.readByte());
        webURL.setAnchor(input.readString());
        return webURL;
      }
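
    The binding excerpt above shows only the read side. A write-side counterpart, sketched here on
    the assumption that objectToEntry mirrors the field order read in entryToObject and that WebURL
    exposes matching getters, would look roughly like this:

        // Hypothetical write side of WebURLTupleBinding; field order assumed to mirror entryToObject.
        @Override
        public void objectToEntry(WebURL webURL, TupleOutput output) {
          output.writeString(webURL.getURL());
          output.writeInt(webURL.getDocid());
          output.writeInt(webURL.getParentDocid());
          output.writeString(webURL.getParentUrl());
          output.writeShort(webURL.getDepth());
          output.writeByte(webURL.getPriority());
          output.writeString(webURL.getAnchor());
        }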

    Examples of edu.uci.ics.crawler4j.url.WebURL

                if (newDocId > 0) {
                  // Redirect page is already seen
                  return;
                }

                WebURL webURL = new WebURL();
                webURL.setURL(movedToUrl);
                webURL.setParentDocid(curURL.getParentDocid());
                webURL.setParentUrl(curURL.getParentUrl());
                webURL.setDepth(curURL.getDepth());
                webURL.setDocid(-1);
                webURL.setAnchor(curURL.getAnchor());
                if (shouldVisit(webURL) && robotstxtServer.allows(webURL)) {
                  webURL.setDocid(docIdServer.getNewDocID(movedToUrl));
                  frontier.schedule(webURL);
                }
              }
            } else if (fetchResult.getStatusCode() == CustomFetchStatus.PageTooBig) {
              logger.info("Skipping a page which was bigger than max allowed size: " + curURL.getURL());
            }
            return;
          }

          if (!curURL.getURL().equals(fetchResult.getFetchedUrl())) {
            if (docIdServer.isSeenBefore(fetchResult.getFetchedUrl())) {
              // Redirect page is already seen
              return;
            }
            curURL.setURL(fetchResult.getFetchedUrl());
            curURL.setDocid(docIdServer.getNewDocID(fetchResult.getFetchedUrl()));
          }

          Page page = new Page(curURL);
          int docid = curURL.getDocid();

          if (!fetchResult.fetchContent(page)) {
            onContentFetchError(curURL);
            return;
          }

          if (!parser.parse(page, curURL.getURL())) {
            onParseError(curURL);
            return;
          }

          ParseData parseData = page.getParseData();
          if (parseData instanceof HtmlParseData) {
            HtmlParseData htmlParseData = (HtmlParseData) parseData;

            List<WebURL> toSchedule = new ArrayList<>();
            int maxCrawlDepth = myController.getConfig().getMaxDepthOfCrawling();
            for (WebURL webURL : htmlParseData.getOutgoingUrls()) {
              webURL.setParentDocid(docid);
              webURL.setParentUrl(curURL.getURL());
              int newdocid = docIdServer.getDocId(webURL.getURL());
              if (newdocid > 0) {
                // This is not the first time that this Url is
                // visited. So, we set the depth to a negative
                // number.
                webURL.setDepth((short) -1);
                webURL.setDocid(newdocid);
              } else {
                webURL.setDocid(-1);
                webURL.setDepth((short) (curURL.getDepth() + 1));
                if (maxCrawlDepth == -1 || curURL.getDepth() < maxCrawlDepth) {
                  if (shouldVisit(webURL) && robotstxtServer.allows(webURL)) {
                    webURL.setDocid(docIdServer.getNewDocID(webURL.getURL()));
                    toSchedule.add(webURL);
                  }
                }
              }
            }
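
    The processing loop above defers to shouldVisit(webURL) before assigning a document id and
    scheduling the link. That filter is normally supplied by a WebCrawler subclass; a minimal
    hedged sketch (the MyCrawler name, domain and extension pattern are illustrative, not taken
    from this source):

        import java.util.regex.Pattern;

        public class MyCrawler extends WebCrawler {
          private static final Pattern BINARY =
              Pattern.compile(".*\\.(css|js|gif|jpe?g|png|pdf|zip)$");

          @Override
          public boolean shouldVisit(WebURL url) {
            String href = url.getURL().toLowerCase();
            // Stay on one site and skip obvious non-HTML resources.
            return href.startsWith("http://www.example.com/") && !BINARY.matcher(href).matches();
          }

          @Override
          public void visit(Page page) {
            // Handle the fetched page, e.g. inspect page.getParseData() as in the excerpt above.
          }
        }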

    Examples of edu.uci.ics.crawler4j.url.WebURL

          if (!hrefWithoutProtocol.contains("javascript:")
              && !hrefWithoutProtocol.contains("mailto:")
              && !hrefWithoutProtocol.contains("@")) {
            String url = URLCanonicalizer.getCanonicalURL(href, contextURL);
            if (url != null) {
              WebURL webURL = new WebURL();
              webURL.setURL(url);
              webURL.setAnchor(urlAnchorPair.getAnchor());
              outgoingUrls.add(webURL);
              urlCount++;
              if (urlCount > config.getMaxOutgoingLinksToFollow()) {
                break;
              }

    Examples of edu.uci.ics.crawler4j.url.WebURL

            }
            return 0L;
        }

      private HostDirectives fetchDirectives(String host) {
        WebURL robotsTxtUrl = new WebURL();
        robotsTxtUrl.setURL("http://" + host + "/robots.txt");
        HostDirectives directives = null;
        PageFetchResult fetchResult = null;
        try {
          fetchResult = pageFetcher.fetchHeader(robotsTxtUrl);
          if (fetchResult.getStatusCode() == HttpStatus.SC_OK) {

    Examples of edu.uci.ics.crawler4j.url.WebURL

                int newDocId = docIdServer.getDocId(movedToUrl);
                if (newDocId > 0) {
                  // Redirect page is already seen
                  return;
                } else {
                  WebURL webURL = new WebURL();
                  webURL.setURL(movedToUrl);
                  webURL.setParentDocid(curURL.getParentDocid());
                  webURL.setParentUrl(curURL.getParentUrl());
                  webURL.setDepth(curURL.getDepth());
                  webURL.setDocid(-1);
                  // TODO This only proceeds if the tracked Crawl-delay comes back as 0 (instantly ok to fetch)
                  // Need to augment scheduling logic to delay fetches into a time-based priority queue, or some such
                  if (shouldVisit(webURL) && robotstxtServer.allowedIn(webURL) == 0L) {
                    webURL.setDocid(docIdServer.getNewDocID(movedToUrl));
                    frontier.schedule(webURL);
                  }
                }
              }
            } else if (fetchResult.getStatusCode() == CustomFetchStatus.PageTooBig) {
              logger.info("Skipping a page which was bigger than max allowed size: " + curURL.getURL());
            }
            return;
          }

          if (!curURL.getURL().equals(fetchResult.getFetchedUrl())) {
            if (docIdServer.isSeenBefore(fetchResult.getFetchedUrl())) {
              // Redirect page is already seen
              return;
            }
            curURL.setURL(fetchResult.getFetchedUrl());
            curURL.setDocid(docIdServer.getNewDocID(fetchResult.getFetchedUrl()));
          }

          Page page = new Page(curURL);
          int docid = curURL.getDocid();
          if (fetchResult.fetchContent(page) && parser.parse(page, curURL.getURL())) {
            ParseData parseData = page.getParseData();
            if (parseData instanceof HtmlParseData) {
              HtmlParseData htmlParseData = (HtmlParseData) parseData;

              List<WebURL> toSchedule = new ArrayList<WebURL>();
              int maxCrawlDepth = myController.getConfig().getMaxDepthOfCrawling();
              for (WebURL webURL : htmlParseData.getOutgoingUrls()) {
                webURL.setParentDocid(docid);
                webURL.setParentUrl(curURL.getURL());
                int newdocid = docIdServer.getDocId(webURL.getURL());
                if (newdocid > 0) {
                  // This is not the first time that this Url is
                  // visited. So, we set the depth to a negative
                  // number.
                  webURL.setDepth((short) -1);
                  webURL.setDocid(newdocid);
                } else {
                  webURL.setDocid(-1);
                  webURL.setDepth((short) (curURL.getDepth() + 1));
                  if (maxCrawlDepth == -1 || curURL.getDepth() < maxCrawlDepth) {
                    // TODO This only proceeds if the tracked Crawl-delay comes back as 0 (instantly ok to fetch)
                    // Need to augment scheduling logic to delay fetches into a time-based priority queue, or some such
                    if (shouldVisit(webURL) && robotstxtServer.allowedIn(webURL) == 0L) {
                      webURL.setDocid(docIdServer.getNewDocID(webURL.getURL()));
                      toSchedule.add(webURL);
                    }
                  }
                }
              }
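
    The TODO comments above note that a URL is currently dropped unless allowedIn() reports a
    crawl-delay of 0, and suggest parking delayed URLs in a time-based priority queue instead.
    One possible sketch of that idea, not taken from this source, assuming allowedIn() returns the
    remaining delay in milliseconds (the DelayedUrl wrapper and delayedUrls queue are hypothetical):

        import java.util.concurrent.DelayQueue;
        import java.util.concurrent.Delayed;
        import java.util.concurrent.TimeUnit;

        // Hypothetical wrapper that makes a WebURL eligible for fetching only after its crawl-delay.
        class DelayedUrl implements Delayed {
          final WebURL url;
          final long readyAtMillis;

          DelayedUrl(WebURL url, long delayMillis) {
            this.url = url;
            this.readyAtMillis = System.currentTimeMillis() + delayMillis;
          }

          @Override
          public long getDelay(TimeUnit unit) {
            return unit.convert(readyAtMillis - System.currentTimeMillis(), TimeUnit.MILLISECONDS);
          }

          @Override
          public int compareTo(Delayed other) {
            return Long.compare(getDelay(TimeUnit.MILLISECONDS), other.getDelay(TimeUnit.MILLISECONDS));
          }
        }

        // Where the excerpt drops a URL whose delay has not elapsed, it could be parked instead:
        DelayQueue<DelayedUrl> delayedUrls = new DelayQueue<>();
        long wait = robotstxtServer.allowedIn(webURL);
        if (wait == 0L) {
          frontier.schedule(webURL);
        } else if (wait > 0L) {
          delayedUrls.put(new DelayedUrl(webURL, wait));
        }

        // A worker thread then schedules each entry once its delay expires
        // (take() blocks until the head entry is due and throws InterruptedException):
        frontier.schedule(delayedUrls.take().url);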