Examples of SiteCrawler


Examples of org.apache.any23.plugin.crawler.SiteCrawler

                        format( "Storage folder %s can not be created, please verify you have enough permissions",
                                                         storageFolder ) );
            }
        }

        final SiteCrawler siteCrawler = new SiteCrawler( storageFolder );
        siteCrawler.setNumOfCrawlers( numCrawlers );
        siteCrawler.setMaxPages( maxPages );
        siteCrawler.setMaxDepth( maxDepth );
        siteCrawler.setPolitenessDelay(politenessDelay);

        siteCrawler.addListener(new CrawlerListener() {
            @Override
            public void visitedPage(Page page) {
                final String pageURL = page.getWebURL().getURL();
                System.err.println( format("Processing page: [%s]", pageURL) );

                final ParseData parseData = page.getParseData();
                if (parseData instanceof HtmlParseData) {
                    final HtmlParseData htmlParseData = (HtmlParseData) parseData;
                    try {
                        synchronized (roverLock) {
                            Crawler.super.performExtraction(
                                    new StringDocumentSource(
                                            htmlParseData.getHtml(),
                                            pageURL

                                    )
                            );
                        }
                    } catch (Exception e) {
                        System.err.println(format("Error while processing page [%s], error: %s .",
                                                  pageURL, e.getMessage())
                        );
                    }
                }
            }
        });

        Runtime.getRuntime().addShutdownHook( new Thread() {
            /**
             * Shutdown-hook body: prints the result of
             * {@code Crawler.super.printReports()} to standard error. Any exception
             * raised while doing so has its stack trace written to standard error
             * instead of being propagated.
             */
            @Override
            public void run() {
                try {
                    System.err.println( Crawler.super.printReports() );
                    // The site crawler is deliberately not stopped here; see the TODO below.
                    // siteCrawler.stop(); // TODO: cause shutdown hanging.
                } catch (Exception e) {
                    e.printStackTrace(System.err);
                }
            }
        });
        siteCrawler.start(seed, pageFilter, true);
    }
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.