Examples of WebURL

  • edu.uci.ics.crawler4j.url.WebURL
    @author Yasser Ganjisaffar
  • org.apache.manifoldcf.crawler.connectors.rss.WebURL
    Replacement class for java.net.URI, which is broken in many ways.
  • org.apache.manifoldcf.crawler.connectors.webcrawler.WebURL
    Replacement class for java.net.URI, which is broken in many ways.
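
A minimal sketch of how crawler4j's WebURL is typically populated, based only on the setters that appear in the excerpts below (the URL and ID values are illustrative):

      import edu.uci.ics.crawler4j.url.WebURL;

      public class WebURLSketch {
        public static void main(String[] args) {
          // WebURL is a mutable holder for a URL plus crawl metadata.
          WebURL seed = new WebURL();
          seed.setURL("http://example.com/"); // illustrative URL
          seed.setDocid(1);                   // illustrative document ID
          seed.setDepth((short) 0);           // seeds start at depth 0
          System.out.println(seed.getURL());
        }
      }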

    Examples of edu.uci.ics.crawler4j.url.WebURL

              // Re-key the page under its redirect target: report already-seen
              // targets, otherwise assign the URL a fresh document ID.
              int newdocid = DocIDServer.getDocID(uri);
              if (newdocid != -1) {
                if (newdocid > 0) {
                  return PageFetchStatus.RedirectedPageIsSeen;
                }
                WebURL webURL = new WebURL();
                webURL.setURL(uri);
                webURL.setDocid(DocIDServer.getNewDocID(uri));
                page.setWebURL(webURL);
              }

    Examples of edu.uci.ics.crawler4j.url.WebURL

      // Downloads a page and extracts its title and text.
      private HTMLParser htmlParser = new HTMLParser();

      public Page download(String url) {
        WebURL curURL = new WebURL();
        curURL.setURL(url);
        Page page = new Page(curURL);
        int statusCode = PageFetcher.fetch(page, true);
        if (statusCode == PageFetchStatus.OK) {
          try {
            htmlParser.parse(page.getHTML(), curURL.getURL());
            page.setText(htmlParser.getText());
            page.setTitle(htmlParser.getTitle());
            return page;
          } catch (Exception e) {
            e.printStackTrace();
          }
        }
        return null;
      }
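A usage sketch for the helper above; the enclosing class is not shown in the excerpt, so the downloader variable and the getTitle() accessor (the counterpart of the setTitle() call above) are assumptions:

      // Hypothetical caller of the download(String) helper shown above.
      Page page = downloader.download("http://example.com/"); // illustrative URL
      if (page != null) {
        System.out.println(page.getTitle()); // assumed getter for setTitle()
      }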

    Examples of edu.uci.ics.crawler4j.url.WebURL

      public static void setActive(boolean active) {
        RobotstxtServer.active = active;
      }

      // Fetches http://<host>/robots.txt and parses it into per-host directives.
      private static HostDirectives fetchDirectives(String host) {
        WebURL robotsTxt = new WebURL();
        robotsTxt.setURL("http://" + host + "/robots.txt");
        Page page = new Page(robotsTxt);
        int statusCode = PageFetcher.fetch(page, true);
        HostDirectives directives = null;
        if (statusCode == PageFetchStatus.OK) {
          directives = RobotstxtParser.parse(page.getHTML(), USER_AGENT_NAME);
        }
        return directives;
      }
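A sketch of how the fetched directives are consumed: elsewhere in crawler4j (see the seed-scheduling excerpt below), callers gate a URL on RobotstxtServer.allows(...) before handing it to the frontier:

      WebURL candidate = new WebURL();
      candidate.setURL("http://example.com/page.html"); // illustrative URL
      if (RobotstxtServer.allows(candidate)) {
        Frontier.schedule(candidate); // only crawl what robots.txt permits
      }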

    Examples of edu.uci.ics.crawler4j.url.WebURL

        // Schedule a seed URL: skip it if already seen or disallowed by robots.txt.
        if (docid > 0) {
          // This URL has already been seen.
          return;
        }

        WebURL webUrl = new WebURL();
        webUrl.setURL(canonicalUrl);
        docid = DocIDServer.getNewDocID(canonicalUrl);
        webUrl.setDocid(docid);
        webUrl.setDepth((short) 0);
        if (!RobotstxtServer.allows(webUrl)) {
          logger.info("Robots.txt does not allow this seed: " + pageUrl);
        } else {
          Frontier.schedule(webUrl);
        }

    Examples of edu.uci.ics.crawler4j.url.WebURL

        // Read up to `max` serialized WebURL records from the Berkeley DB queue.
        cursor = urlsDB.openCursor(txn, null);
        result = cursor.getFirst(key, value, null);

        while (matches < max && result == OperationStatus.SUCCESS) {
          if (value.getData().length > 0) {
            WebURL curi = (WebURL) webURLBinding.entryToObject(value);
            results.add(curi);
            matches++;
          }
          result = cursor.getNext(key, value, null);
        }
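The excerpt stops before cleanup; a sketch of the usual Berkeley DB JE cursor pattern, reusing the variables from the excerpt (cursors must be closed before their transaction ends):

      Cursor cursor = null;
      try {
        cursor = urlsDB.openCursor(txn, null);
        // ... iterate with getFirst/getNext as above ...
      } finally {
        if (cursor != null) {
          cursor.close(); // release the cursor even if iteration throws
        }
      }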

    Examples of edu.uci.ics.crawler4j.url.WebURL

      // Enqueue a batch of URLs, respecting the maxPagesToFetch budget.
      public static void scheduleAll(List<WebURL> urls) {
        synchronized (mutex) {
          Iterator<WebURL> it = urls.iterator();
          while (it.hasNext()) {
            WebURL url = it.next();
            if (maxPagesToFetch < 0 || scheduledPages < maxPagesToFetch) {
              try {
                workQueues.put(url);
                scheduledPages++;
              } catch (DatabaseException e) {
                e.printStackTrace();
              }
            }
          }
        }
      }

    Examples of edu.uci.ics.crawler4j.url.WebURL

    // Deserializes a WebURL from its stored Berkeley DB tuple form.
    public final class WebURLTupleBinding extends TupleBinding<WebURL> {

      @Override
      public WebURL entryToObject(TupleInput input) {
        WebURL webURL = new WebURL();
        webURL.setURL(input.readString());
        webURL.setDocid(input.readInt());
        webURL.setParentDocid(input.readInt());
        webURL.setDepth(input.readShort());
        return webURL;
      }
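TupleBinding is abstract over both directions, so this class also needs the matching serializer; a sketch, writing the fields in exactly the order entryToObject reads them (the getters mirror the setters used above):

      @Override
      public void objectToEntry(WebURL webURL, TupleOutput output) {
        // Field order must mirror entryToObject above.
        output.writeString(webURL.getURL());
        output.writeInt(webURL.getDocid());
        output.writeInt(webURL.getParentDocid());
        output.writeShort(webURL.getDepth());
      }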

    Examples of edu.uci.ics.crawler4j.url.WebURL

        parser = new Parser(config);
        pageFetcher = new PageFetcher(config);
      }

      // Fetches a single page and parses it; returns null on any failure.
      private Page download(String url) {
        WebURL curURL = new WebURL();
        curURL.setURL(url);
        PageFetchResult fetchResult = null;
        try {
          fetchResult = pageFetcher.fetchHeader(curURL);
          if (fetchResult.getStatusCode() == HttpStatus.SC_OK) {
            try {
              Page page = new Page(curURL);
              fetchResult.fetchContent(page);
              if (parser.parse(page, curURL.getURL())) {
                return page;
              }
            } catch (Exception e) {
              e.printStackTrace();
            }
          }
        } finally {
          // Assumed cleanup, following crawler4j's fetch pattern: release the
          // connection if the response body was never consumed.
          if (fetchResult != null) {
            fetchResult.discardContentIfNotConsumed();
          }
        }
        return null;
      }

    Examples of edu.uci.ics.crawler4j.url.WebURL

    // Fuller binding: also restores the parent URL and the crawl priority.
    public class WebURLTupleBinding extends TupleBinding<WebURL> {

      @Override
      public WebURL entryToObject(TupleInput input) {
        WebURL webURL = new WebURL();
        webURL.setURL(input.readString());
        webURL.setDocid(input.readInt());
        webURL.setParentDocid(input.readInt());
        webURL.setParentUrl(input.readString());
        webURL.setDepth(input.readShort());
        webURL.setPriority(input.readByte());
        return webURL;
      }
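A usage sketch for either binding: TupleBinding's public DatabaseEntry-based entry points (part of the Berkeley DB JE binding API) convert between WebURL objects and the records stored in the work queue:

      WebURLTupleBinding binding = new WebURLTupleBinding();
      DatabaseEntry entry = new DatabaseEntry();
      binding.objectToEntry(url, entry);                  // serialize for storage
      WebURL roundTripped = binding.entryToObject(entry); // deserialize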

    Examples of edu.uci.ics.crawler4j.url.WebURL

                // A redirect was fetched; decide whether to follow the target.
                int newDocId = docIdServer.getDocId(movedToUrl);
                if (newDocId > 0) {
                  // Redirect page is already seen
                  return;
                } else {
                  WebURL webURL = new WebURL();
                  webURL.setURL(movedToUrl);
                  webURL.setParentDocid(curURL.getParentDocid());
                  webURL.setParentUrl(curURL.getParentUrl());
                  webURL.setDepth(curURL.getDepth());
                  webURL.setDocid(-1);
                  if (shouldVisit(webURL) && robotstxtServer.allows(webURL)) {
                    webURL.setDocid(docIdServer.getNewDocID(movedToUrl));
                    frontier.schedule(webURL);
                  }
                }
              }
            } else if (fetchResult.getStatusCode() == CustomFetchStatus.PageTooBig) {
              logger.info("Skipping a page which was bigger than max allowed size: " + curURL.getURL());
            }
            return;
          }

          if (!curURL.getURL().equals(fetchResult.getFetchedUrl())) {
            if (docIdServer.isSeenBefore(fetchResult.getFetchedUrl())) {
              // Redirect page is already seen
              return;
            }
            curURL.setURL(fetchResult.getFetchedUrl());
            curURL.setDocid(docIdServer.getNewDocID(fetchResult.getFetchedUrl()));
          }

          Page page = new Page(curURL);
          int docid = curURL.getDocid();
          if (fetchResult.fetchContent(page) && parser.parse(page, curURL.getURL())) {
            ParseData parseData = page.getParseData();
            if (parseData instanceof HtmlParseData) {
              HtmlParseData htmlParseData = (HtmlParseData) parseData;

              List<WebURL> toSchedule = new ArrayList<WebURL>();
              int maxCrawlDepth = myController.getConfig().getMaxDepthOfCrawling();
              for (WebURL webURL : htmlParseData.getOutgoingUrls()) {
                webURL.setParentDocid(docid);
                webURL.setParentUrl(curURL.getURL());
                int newdocid = docIdServer.getDocId(webURL.getURL());
                if (newdocid > 0) {
              // This URL has been seen before; mark it with a negative depth.
                  webURL.setDepth((short) -1);
                  webURL.setDocid(newdocid);
                } else {
                  webURL.setDocid(-1);
                  webURL.setDepth((short) (curURL.getDepth() + 1));
                  if (maxCrawlDepth == -1 || curURL.getDepth() < maxCrawlDepth) {
                    if (shouldVisit(webURL) && robotstxtServer.allows(webURL)) {
                      webURL.setDocid(docIdServer.getNewDocID(webURL.getURL()));
                      toSchedule.add(webURL);
                    }
                  }
                }
              }
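The excerpt ends just before the collected links are handed off; in this version of the crawler the batch would be scheduled through the frontier, along these lines:

              // Hand the newly discovered, robots-approved links to the frontier.
              frontier.scheduleAll(toSchedule);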