import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.HashMap;
import java.util.Map;
import websphinx.CrawlEvent;
import websphinx.CrawlListener;
import websphinx.Crawler;
import websphinx.Link;
import websphinx.LinkEvent;
import websphinx.LinkListener;
import websphinx.Page;
public class GenericCrawler extends Crawler {
    // URLs collected for later vectorization. Note: nothing in this class
    // populates the map; the <String, String> type parameters are an assumption.
    private final Map<String, String> urlsToVectorize = new HashMap<String, String>();
    /** the MIME content type of the crawled documents */
    public String contentType = "";
    /** the encoding of the crawled documents */
    public String contentEncoding = "";
    /** the language the documents are written in (English, German, ...) */
    public String contentLanguage = "";
    public GenericCrawler() {
        super();
    }
    /** Note: these values are stored but not otherwise used by this class. */
    public GenericCrawler(String encoding, String language, String type) {
        super();
        contentEncoding = encoding;
        contentLanguage = language;
        contentType = type;
    }
    public void visit(Page page) {
        System.out.println(page.getURL());
        // Dump the page content to disk, named by its last-modified timestamp.
        // Note: pages sharing a timestamp overwrite each other, and the
        // hardcoded Windows directory must already exist.
        try (FileWriter out = new FileWriter(new File("d:/crawler/dump"
                + File.separator + page.getLastModified() + ".html"))) {
            out.write(page.getContent());
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    /** Visit every link encountered; host or URL-pattern filtering would go here. */
    public boolean shouldVisit(Link l) {
        return true;
    }
    /** @return the map of URLs collected for vectorization */
    public final Map<String, String> getURLS() {
        return urlsToVectorize;
    }
    public static void main(String[] args) {
        GenericCrawler myCrawler = new GenericCrawler();
        try {
            //myCrawler.setDepthFirst(true);
            myCrawler.setRoot(new Link("http://www.bankersonline.com/"));
            myCrawler.setMaxDepth(2);
            //myCrawler.addLinkListener(new MyLinkListener());
            //myCrawler.sendCrawlEvent(1);
            //myCrawler.addCrawlListener(new MyCrawlListener());
            myCrawler.run();
        } catch (MalformedURLException e) {
            e.printStackTrace();
        }
    }
}
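/*
 * A minimal usage sketch, added for illustration (not part of the original
 * program): it wires the two listeners defined below into a GenericCrawler
 * and runs it. The constructor arguments, root URL, and depth are assumptions
 * reused from the code above.
 */
class GenericCrawlerDemo {
    public static void main(String[] args) {
        GenericCrawler crawler = new GenericCrawler("UTF-8", "english", "text/html");
        try {
            crawler.setRoot(new Link("http://www.bankersonline.com/"));
            crawler.setMaxDepth(2);
            crawler.addLinkListener(new MyLinkListener());   // logs and dumps each crawled page
            crawler.addCrawlListener(new MyCrawlListener()); // reports crawl lifecycle events
            crawler.run();
        } catch (MalformedURLException e) {
            e.printStackTrace();
        }
    }
}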
class MyLinkListener implements LinkListener {
    /** running counter used to generate unique dump file names */
    static int count = 0;
    public void crawled(LinkEvent event) {
        System.out.println(event.getLink().toURL());
        //System.out.println(event.getLink().getParentURL());
        // Skip anything under yahoo.com.
        if (event.getLink().toURL().contains("yahoo.com")) return;
        Page page = event.getLink().getPage();
        if (page != null) {
            count++;
            // Dump the page content under a sequential file name; the
            // hardcoded Windows directory must already exist.
            try (FileWriter out = new FileWriter(new File("d:/crawler/dump"
                    + File.separator + count + ".html"))) {
                out.write(page.getContent());
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
class MyCrawlListener implements CrawlListener {
    public void cleared(CrawlEvent arg0) {
        // nothing to do when the crawler's state is cleared
    }
    public void paused(CrawlEvent arg0) {
        System.out.println("paused " + arg0.toString());
    }
    public void started(CrawlEvent arg0) {
        System.out.println("started " + arg0.toString());
        System.out.println(arg0.getCrawler().getPagesVisited());
    }
    public void stopped(CrawlEvent arg0) {
        System.out.println("stopped " + arg0.toString());
        System.out.println(arg0.getCrawler().getPagesVisited());
    }
    public void timedOut(CrawlEvent arg0) {
        System.out.println("timed out " + arg0.toString());
    }
}