Examples of WebURL

  • edu.uci.ics.crawler4j.url.WebURL
    @author Yasser Ganjisaffar
  • org.apache.manifoldcf.crawler.connectors.rss.WebURL
    Replacement class for java.net.URI, which is broken in many ways.
  • org.apache.manifoldcf.crawler.connectors.webcrawler.WebURL
    Replacement class for java.net.URI, which is broken in many ways.
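
A minimal sketch of how crawler4j's WebURL is typically populated, based only on the setters that appear in the excerpts below (the URL and ID values are illustrative):

      import edu.uci.ics.crawler4j.url.WebURL;

      public class WebURLSketch {
        public static void main(String[] args) {
          // WebURL is a mutable holder for a URL plus crawl metadata.
          WebURL seed = new WebURL();
          seed.setURL("http://example.com/"); // illustrative URL
          seed.setDocid(1);                   // illustrative document ID
          seed.setDepth((short) 0);           // seeds start at depth 0
          System.out.println(seed.getURL());
        }
      }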

    Examples of edu.uci.ics.crawler4j.url.WebURL

              // Re-key the page under its redirect target: report already-seen
              // targets, otherwise assign the URL a fresh document ID.
              int newdocid = DocIDServer.getDocID(uri);
              if (newdocid != -1) {
                if (newdocid > 0) {
                  return PageFetchStatus.RedirectedPageIsSeen;
                }
                WebURL webURL = new WebURL();
                webURL.setURL(uri);
                webURL.setDocid(DocIDServer.getNewDocID(uri));
                page.setWebURL(webURL);
              }

    Examples of edu.uci.ics.crawler4j.url.WebURL

      // Downloads a page and extracts its title and text.
      private HTMLParser htmlParser = new HTMLParser();

      public Page download(String url) {
        WebURL curURL = new WebURL();
        curURL.setURL(url);
        Page page = new Page(curURL);
        int statusCode = PageFetcher.fetch(page, true);
        if (statusCode == PageFetchStatus.OK) {
          try {
            htmlParser.parse(page.getHTML(), curURL.getURL());
            page.setText(htmlParser.getText());
            page.setTitle(htmlParser.getTitle());
            return page;
          } catch (Exception e) {
            e.printStackTrace();
          }
        }
        return null;
      }
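A usage sketch for the helper above; the enclosing class is not shown in the excerpt, so the downloader variable and the getTitle() accessor (the counterpart of the setTitle() call above) are assumptions:

      // Hypothetical caller of the download(String) helper shown above.
      Page page = downloader.download("http://example.com/"); // illustrative URL
      if (page != null) {
        System.out.println(page.getTitle()); // assumed getter for setTitle()
      }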

    Examples of edu.uci.ics.crawler4j.url.WebURL

      public static void setActive(boolean active) {
        RobotstxtServer.active = active;
      }

      // Fetches http://<host>/robots.txt and parses it into per-host directives.
      private static HostDirectives fetchDirectives(String host) {
        WebURL robotsTxt = new WebURL();
        robotsTxt.setURL("http://" + host + "/robots.txt");
        Page page = new Page(robotsTxt);
        int statusCode = PageFetcher.fetch(page, true);
        HostDirectives directives = null;
        if (statusCode == PageFetchStatus.OK) {
          directives = RobotstxtParser.parse(page.getHTML(), USER_AGENT_NAME);
        }
        return directives;
      }
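A sketch of how the fetched directives are consumed: elsewhere in crawler4j (see the seed-scheduling excerpt below), callers gate a URL on RobotstxtServer.allows(...) before handing it to the frontier:

      WebURL candidate = new WebURL();
      candidate.setURL("http://example.com/page.html"); // illustrative URL
      if (RobotstxtServer.allows(candidate)) {
        Frontier.schedule(candidate); // only crawl what robots.txt permits
      }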

    Examples of edu.uci.ics.crawler4j.url.WebURL

        // Schedule a seed URL: skip it if already seen or disallowed by robots.txt.
        if (docid > 0) {
          // This URL has already been seen.
          return;
        }

        WebURL webUrl = new WebURL();
        webUrl.setURL(canonicalUrl);
        docid = DocIDServer.getNewDocID(canonicalUrl);
        webUrl.setDocid(docid);
        webUrl.setDepth((short) 0);
        if (!RobotstxtServer.allows(webUrl)) {
          logger.info("Robots.txt does not allow this seed: " + pageUrl);
        } else {
          Frontier.schedule(webUrl);
        }

    Examples of edu.uci.ics.crawler4j.url.WebURL

        // Read up to `max` serialized WebURL records from the Berkeley DB queue.
        cursor = urlsDB.openCursor(txn, null);
        result = cursor.getFirst(key, value, null);

        while (matches < max && result == OperationStatus.SUCCESS) {
          if (value.getData().length > 0) {
            WebURL curi = (WebURL) webURLBinding.entryToObject(value);
            results.add(curi);
            matches++;
          }
          result = cursor.getNext(key, value, null);
        }
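The excerpt stops before cleanup; a sketch of the usual Berkeley DB JE cursor pattern, reusing the variables from the excerpt (cursors must be closed before their transaction ends):

      Cursor cursor = null;
      try {
        cursor = urlsDB.openCursor(txn, null);
        // ... iterate with getFirst/getNext as above ...
      } finally {
        if (cursor != null) {
          cursor.close(); // release the cursor even if iteration throws
        }
      }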

    Examples of edu.uci.ics.crawler4j.url.WebURL

      // Enqueue a batch of URLs, respecting the maxPagesToFetch budget.
      public static void scheduleAll(List<WebURL> urls) {
        synchronized (mutex) {
          Iterator<WebURL> it = urls.iterator();
          while (it.hasNext()) {
            WebURL url = it.next();
            if (maxPagesToFetch < 0 || scheduledPages < maxPagesToFetch) {
              try {
                workQueues.put(url);
                scheduledPages++;
              } catch (DatabaseException e) {
                e.printStackTrace();
              }
            }
          }
        }
      }

    Examples of edu.uci.ics.crawler4j.url.WebURL

    // Deserializes a WebURL from its stored Berkeley DB tuple form.
    public final class WebURLTupleBinding extends TupleBinding<WebURL> {

      @Override
      public WebURL entryToObject(TupleInput input) {
        WebURL webURL = new WebURL();
        webURL.setURL(input.readString());
        webURL.setDocid(input.readInt());
        webURL.setParentDocid(input.readInt());
        webURL.setDepth(input.readShort());
        return webURL;
      }
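TupleBinding is abstract over both directions, so this class also needs the matching serializer; a sketch, writing the fields in exactly the order entryToObject reads them (the getters mirror the setters used above):

      @Override
      public void objectToEntry(WebURL webURL, TupleOutput output) {
        // Field order must mirror entryToObject above.
        output.writeString(webURL.getURL());
        output.writeInt(webURL.getDocid());
        output.writeInt(webURL.getParentDocid());
        output.writeShort(webURL.getDepth());
      }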

    Examples of edu.uci.ics.crawler4j.url.WebURL

        parser = new Parser(config);
        pageFetcher = new PageFetcher(config);
      }

      // Fetches a single page and parses it; returns null on any failure.
      private Page download(String url) {
        WebURL curURL = new WebURL();
        curURL.setURL(url);
        PageFetchResult fetchResult = null;
        try {
          fetchResult = pageFetcher.fetchHeader(curURL);
          if (fetchResult.getStatusCode() == HttpStatus.SC_OK) {
            try {
              Page page = new Page(curURL);
              fetchResult.fetchContent(page);
              if (parser.parse(page, curURL.getURL())) {
                return page;
              }
            } catch (Exception e) {
              e.printStackTrace();
            }
          }
        } finally {
          // Assumed cleanup, following crawler4j's fetch pattern: release the
          // connection if the response body was never consumed.
          if (fetchResult != null) {
            fetchResult.discardContentIfNotConsumed();
          }
        }
        return null;
      }

    Examples of edu.uci.ics.crawler4j.url.WebURL

    // Fuller binding: also restores the parent URL and the crawl priority.
    public class WebURLTupleBinding extends TupleBinding<WebURL> {

      @Override
      public WebURL entryToObject(TupleInput input) {
        WebURL webURL = new WebURL();
        webURL.setURL(input.readString());
        webURL.setDocid(input.readInt());
        webURL.setParentDocid(input.readInt());
        webURL.setParentUrl(input.readString());
        webURL.setDepth(input.readShort());
        webURL.setPriority(input.readByte());
        return webURL;
      }
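A usage sketch for either binding: TupleBinding's public DatabaseEntry-based entry points (part of the Berkeley DB JE binding API) convert between WebURL objects and the records stored in the work queue:

      WebURLTupleBinding binding = new WebURLTupleBinding();
      DatabaseEntry entry = new DatabaseEntry();
      binding.objectToEntry(url, entry);                  // serialize for storage
      WebURL roundTripped = binding.entryToObject(entry); // deserialize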

    Examples of edu.uci.ics.crawler4j.url.WebURL

                // A redirect was fetched; decide whether to follow the target.
                int newDocId = docIdServer.getDocId(movedToUrl);
                if (newDocId > 0) {
                  // Redirect page is already seen
                  return;
                } else {
                  WebURL webURL = new WebURL();
                  webURL.setURL(movedToUrl);
                  webURL.setParentDocid(curURL.getParentDocid());
                  webURL.setParentUrl(curURL.getParentUrl());
                  webURL.setDepth(curURL.getDepth());
                  webURL.setDocid(-1);
                  if (shouldVisit(webURL) && robotstxtServer.allows(webURL)) {
                    webURL.setDocid(docIdServer.getNewDocID(movedToUrl));
                    frontier.schedule(webURL);
                  }
                }
              }
            } else if (fetchResult.getStatusCode() == CustomFetchStatus.PageTooBig) {
              logger.info("Skipping a page which was bigger than max allowed size: " + curURL.getURL());
            }
            return;
          }

          if (!curURL.getURL().equals(fetchResult.getFetchedUrl())) {
            if (docIdServer.isSeenBefore(fetchResult.getFetchedUrl())) {
              // Redirect page is already seen
              return;
            }
            curURL.setURL(fetchResult.getFetchedUrl());
            curURL.setDocid(docIdServer.getNewDocID(fetchResult.getFetchedUrl()));
          }

          Page page = new Page(curURL);
          int docid = curURL.getDocid();
          if (fetchResult.fetchContent(page) && parser.parse(page, curURL.getURL())) {
            ParseData parseData = page.getParseData();
            if (parseData instanceof HtmlParseData) {
              HtmlParseData htmlParseData = (HtmlParseData) parseData;

              List<WebURL> toSchedule = new ArrayList<WebURL>();
              int maxCrawlDepth = myController.getConfig().getMaxDepthOfCrawling();
              for (WebURL webURL : htmlParseData.getOutgoingUrls()) {
                webURL.setParentDocid(docid);
                webURL.setParentUrl(curURL.getURL());
                int newdocid = docIdServer.getDocId(webURL.getURL());
                if (newdocid > 0) {
              // This URL has been seen before; mark it with a negative depth.
                  webURL.setDepth((short) -1);
                  webURL.setDocid(newdocid);
                } else {
                  webURL.setDocid(-1);
                  webURL.setDepth((short) (curURL.getDepth() + 1));
                  if (maxCrawlDepth == -1 || curURL.getDepth() < maxCrawlDepth) {
                    if (shouldVisit(webURL) && robotstxtServer.allows(webURL)) {
                      webURL.setDocid(docIdServer.getNewDocID(webURL.getURL()));
                      toSchedule.add(webURL);
                    }
                  }
                }
              }
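The excerpt ends just before the collected links are handed off; in this version of the crawler the batch would be scheduled through the frontier, along these lines:

              // Hand the newly discovered, robots-approved links to the frontier.
              frontier.scheduleAll(toSchedule);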