Package: edu.uci.ics.crawler4j.parser

Examples of edu.uci.ics.crawler4j.parser.ParseData


            @Override
            public void visitedPage(Page page) {
                final String pageURL = page.getWebURL().getURL();
                System.err.println( format("Processing page: [%s]", pageURL) );

                final ParseData parseData = page.getParseData();
                if (parseData instanceof HtmlParseData) {
                    final HtmlParseData htmlParseData = (HtmlParseData) parseData;
                    try {
                        synchronized (roverLock) {
                            Crawler.super.performExtraction(
View Full Code Here


  public void processUrl(String url) {
    System.out.println("Processing: " + url);
    Page page = download(url);
    if (page != null) {
      ParseData parseData = page.getParseData();
      if (parseData != null) {
        if (parseData instanceof HtmlParseData) {
          HtmlParseData htmlParseData = (HtmlParseData) parseData;
          System.out.println("Title: " + htmlParseData.getTitle());
          System.out.println("Text length: " + htmlParseData.getText().length());
View Full Code Here

      }

      Page page = new Page(curURL);
      int docid = curURL.getDocid();
      if (fetchResult.fetchContent(page) && parser.parse(page, curURL.getURL())) {
        ParseData parseData = page.getParseData();
        if (parseData instanceof HtmlParseData) {
          HtmlParseData htmlParseData = (HtmlParseData) parseData;

          List<WebURL> toSchedule = new ArrayList<WebURL>();
          int maxCrawlDepth = myController.getConfig().getMaxDepthOfCrawling();
View Full Code Here

      if (!parser.parse(page, curURL.getURL())) {
        onParseError(curURL);
        return;
      }

      ParseData parseData = page.getParseData();
      if (parseData instanceof HtmlParseData) {
        HtmlParseData htmlParseData = (HtmlParseData) parseData;

        List<WebURL> toSchedule = new ArrayList<>();
        int maxCrawlDepth = myController.getConfig().getMaxDepthOfCrawling();
View Full Code Here

      }

      Page page = new Page(curURL);
      int docid = curURL.getDocid();
      if (fetchResult.fetchContent(page) && parser.parse(page, curURL.getURL())) {
        ParseData parseData = page.getParseData();
        if (parseData instanceof HtmlParseData) {
          HtmlParseData htmlParseData = (HtmlParseData) parseData;

          List<WebURL> toSchedule = new ArrayList<WebURL>();
          int maxCrawlDepth = myController.getConfig().getMaxDepthOfCrawling();
View Full Code Here

TOP

Related Classes of edu.uci.ics.crawler4j.parser.ParseData

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by Oracle Inc. Contact coftware#gmail.com.