Package edu.uci.ics.crawler4j.parser

Examples of edu.uci.ics.crawler4j.parser.HtmlParseData


                final String pageURL = page.getWebURL().getURL();
                System.err.println( format("Processing page: [%s]", pageURL) );

                final ParseData parseData = page.getParseData();
                if (parseData instanceof HtmlParseData) {
                    final HtmlParseData htmlParseData = (HtmlParseData) parseData;
                    try {
                        synchronized (roverLock) {
                            Crawler.super.performExtraction(
                                    new StringDocumentSource(
                                            htmlParseData.getHtml(),
                                            pageURL

                                    )
                            );
                        }
View Full Code Here


  public void visit(Page page) {
    System.out.println("Visited: " + page.getWebURL().getURL());
    myCrawlStat.incProcessedPages();

    if (page.getParseData() instanceof HtmlParseData) {
      HtmlParseData parseData = (HtmlParseData) page.getParseData();
      List<WebURL> links = parseData.getOutgoingUrls();
      myCrawlStat.incTotalLinks(links.size());
      try {
        myCrawlStat.incTotalTextSize(parseData.getText().getBytes("UTF-8").length);
      } catch (UnsupportedEncodingException ignored) {
      }
    }
    // We dump this crawler statistics after processing every 50 pages
    if (myCrawlStat.getTotalProcessedPages() % 50 == 0) {
View Full Code Here

    System.out.println("Docid: " + docid);
    System.out.println("URL: " + url);
    System.out.println("Docid of parent page: " + parentDocid);

    if (page.getParseData() instanceof HtmlParseData) {
      HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
      String text = htmlParseData.getText();
      String html = htmlParseData.getHtml();
      List<WebURL> links = htmlParseData.getOutgoingUrls();

      System.out.println("Text length: " + text.length());
      System.out.println("Html length: " + html.length());
      System.out.println("Number of outgoing links: " + links.size());
    }
View Full Code Here

    System.out.println("Docid: " + docid);
    System.out.println("URL: " + url);
    System.out.println("Docid of parent page: " + parentDocid);

    if (page.getParseData() instanceof HtmlParseData) {
      HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
      String text = htmlParseData.getText();
      String html = htmlParseData.getHtml();
      List<WebURL> links = htmlParseData.getOutgoingUrls();

      System.out.println("Text length: " + text.length());
      System.out.println("Html length: " + html.length());
      System.out.println("Number of outgoing links: " + links.size());
    }
View Full Code Here

    Page page = download(url);
    if (page != null) {
      ParseData parseData = page.getParseData();
      if (parseData != null) {
        if (parseData instanceof HtmlParseData) {
          HtmlParseData htmlParseData = (HtmlParseData) parseData;
          System.out.println("Title: " + htmlParseData.getTitle());
          System.out.println("Text length: " + htmlParseData.getText().length());
          System.out.println("Html length: " + htmlParseData.getHtml().length());
        }
      } else {
        System.out.println("Couldn't parse the content of the page.");
      }
    } else {
View Full Code Here

      Page page = new Page(curURL);
      int docid = curURL.getDocid();
      if (fetchResult.fetchContent(page) && parser.parse(page, curURL.getURL())) {
        ParseData parseData = page.getParseData();
        if (parseData instanceof HtmlParseData) {
          HtmlParseData htmlParseData = (HtmlParseData) parseData;

          List<WebURL> toSchedule = new ArrayList<WebURL>();
          int maxCrawlDepth = myController.getConfig().getMaxDepthOfCrawling();
          for (WebURL webURL : htmlParseData.getOutgoingUrls()) {
            webURL.setParentDocid(docid);
            webURL.setParentUrl(curURL.getURL());
            int newdocid = docIdServer.getDocId(webURL.getURL());
            if (newdocid > 0) {
              // This is not the first time that this Url is
View Full Code Here

    System.out.println("Sub-domain: '" + subDomain + "'");
    System.out.println("Path: '" + path + "'");
    System.out.println("Parent page: " + parentUrl);

    if (page.getParseData() instanceof HtmlParseData) {
      HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
      String text = htmlParseData.getText();
      String html = htmlParseData.getHtml();
      List<WebURL> links = htmlParseData.getOutgoingUrls();

      System.out.println("Text length: " + text.length());
      System.out.println("Html length: " + html.length());
      System.out.println("Number of outgoing links: " + links.size());
    }
View Full Code Here

        return;
      }

      ParseData parseData = page.getParseData();
      if (parseData instanceof HtmlParseData) {
        HtmlParseData htmlParseData = (HtmlParseData) parseData;

        List<WebURL> toSchedule = new ArrayList<>();
        int maxCrawlDepth = myController.getConfig().getMaxDepthOfCrawling();
        for (WebURL webURL : htmlParseData.getOutgoingUrls()) {
          webURL.setParentDocid(docid);
          webURL.setParentUrl(curURL.getURL());
          int newdocid = docIdServer.getDocId(webURL.getURL());
          if (newdocid > 0) {
            // This is not the first time that this Url is
View Full Code Here

      Page page = new Page(curURL);
      int docid = curURL.getDocid();
      if (fetchResult.fetchContent(page) && parser.parse(page, curURL.getURL())) {
        ParseData parseData = page.getParseData();
        if (parseData instanceof HtmlParseData) {
          HtmlParseData htmlParseData = (HtmlParseData) parseData;

          List<WebURL> toSchedule = new ArrayList<WebURL>();
          int maxCrawlDepth = myController.getConfig().getMaxDepthOfCrawling();
          for (WebURL webURL : htmlParseData.getOutgoingUrls()) {
            webURL.setParentDocid(docid);
            webURL.setParentUrl(curURL.getURL());
            int newdocid = docIdServer.getDocId(webURL.getURL());
            if (newdocid > 0) {
              // This is not the first time that this Url is
View Full Code Here

TOP

Related Classes of edu.uci.ics.crawler4j.parser.HtmlParseData

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.