package com.manning.hip.testdata;

import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.url.WebURL;

import java.util.regex.Pattern;
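
/**
 * Generates test data by crawling the Apache Hadoop ecosystem project
 * sites with crawler4j. Each fetched page produces one tab-separated line
 * on standard out (the URL followed by the words on the page), and each
 * outgoing link produces one line on standard err, so the two streams can
 * be captured separately, e.g. (with crawler4j and its dependencies on
 * the classpath):
 *
 *   java com.manning.hip.testdata.Crawler > pages.txt 2> links.txt
 *
 * Note: this is written against an older crawler4j release, where the
 * CrawlController is configured directly through setters; newer releases
 * configure a CrawlConfig object instead.
 */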
public class Crawler {

  public static void main(String[] args) throws Exception {
    // Folder where crawler4j stores its intermediate crawl state.
    String rootFolder = "/tmp";
    int numberOfCrawlers = 1;

    CrawlController controller = new CrawlController(rootFolder);

    // Seed the crawl with the main Apache Hadoop ecosystem project sites.
    controller.addSeed("http://hadoop.apache.org/");
    controller.addSeed("http://hadoop.apache.org/common/");
    controller.addSeed("http://hadoop.apache.org/hdfs/");
    controller.addSeed("http://hadoop.apache.org/mapreduce/");
    controller.addSeed("http://avro.apache.org/");
    controller.addSeed("http://hbase.apache.org/");
    controller.addSeed("http://hive.apache.org/");
    controller.addSeed("http://pig.apache.org/");
    controller.addSeed("http://zookeeper.apache.org/");

    // Be polite: wait at least 1,000 ms between consecutive requests.
    controller.setPolitenessDelay(1000);
    // Don't follow links more than two hops away from a seed.
    controller.setMaximumCrawlDepth(2);
    // Cap on the total number of pages fetched; left at 1 here,
    // presumably for quick test runs, so raise it for a fuller crawl.
    controller.setMaximumPagesToFetch(1);

    controller.start(MyCrawler.class, numberOfCrawlers);
  }
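
  /**
   * The crawler implementation. crawler4j creates an instance per crawler
   * thread; {@link #shouldVisit} filters which discovered URLs to follow,
   * and {@link #visit} is invoked for every page that is fetched.
   */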
  public static class MyCrawler extends WebCrawler {
    // URLs ending in these extensions (stylesheets, scripts, media,
    // archives) are skipped rather than crawled.
    private static final Pattern FILTERS = Pattern.compile(
        ".*(\\.(css|js|bmp|gif|jpe?g"
            + "|png|tiff?|mid|mp2|mp3|mp4" + "|wav|avi|mov|mpeg|ram|m4v|pdf"
            + "|rm|smil|wmv|swf|wma|zip|rar|gz))$");

    // Follow a URL only if it isn't filtered out and stays on apache.org.
    @Override
    public boolean shouldVisit(WebURL url) {
      String href = url.getURL().toLowerCase();
      return !FILTERS.matcher(href).matches()
          && href.contains("apache.org");
    }
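
    /**
     * Writes the page text to standard out and the page's outgoing links
     * to standard err, one tab-separated line each, so downstream jobs can
     * consume page content and the link graph separately.
     */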
    @Override
    public void visit(Page page) {
      String url = page.getWebURL().getURL();

      // Standard out gets a single line per URL: the URL followed by all
      // the words found on the page. Collapsing every run of non-letter
      // characters into a single space keeps each page on one line. The
      // null check is defensive, for pages whose text wasn't extracted.
      String text = page.getText();
      if (text != null) {
        System.out.println(url + "\t" + text.replaceAll("[^a-zA-Z]+", " "));
      }

      // Standard err gets one line for each outgoing link from the page
      // being crawled. Again, the null check is defensive, for pages that
      // yielded no parseable links.
      if (page.getURLs() != null) {
        for (WebURL link : page.getURLs()) {
          System.err.println(url + "\t" + link.getURL());
        }
      }
    }
  }
}