Package

Source Code of MyLinkListener

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.HashMap;
import java.util.Map;

import websphinx.CrawlEvent;
import websphinx.CrawlListener;
import websphinx.Crawler;
import websphinx.Link;
import websphinx.LinkEvent;
import websphinx.LinkListener;
import websphinx.Page;

public class GenericCrawler extends Crawler {

  private final Map urlsToVectorize = new HashMap();

  /** the MIME content type of the crawled documents */
  public String contentType = "";

  /** the encoding of the crawled document */
  public String contentEncoding = "";

  /** the language the documents are written in (english, german, ...) */
  public String contentLanguage = "";

  public GenericCrawler() {
    super();
  }

  public GenericCrawler(String encoding, String language, String type) {
    super();
    contentEncoding = encoding;
    contentLanguage = language;
    contentType = type;
  }
 
    public void visit (Page page) {
      System.out.println(page.getURL());
    FileWriter out;
    try {
      out = new FileWriter(new File("d:/crawler/dump"
          + File.separator + page.getLastModified() + ".html"));
      out.write(page.getContent());
      out.close();

    } catch (IOException e) {
      // TODO Auto-generated catch block
      e.printStackTrace();
    } finally {

    }

    }

    public boolean shouldVisit (Link l) {
         return true;
    }

    public final Map getURLS() {

    return urlsToVectorize;
  }

  public static void main(String[] args) {
    GenericCrawler myCrwaler = new GenericCrawler();
    try {
      //myCrwaler.setDepthFirst(true);
      myCrwaler
          .setRoot(new Link(
              "http://www.bankersonline.com/"));
      myCrwaler.setMaxDepth(2);
      //myCrwaler.addLinkListener(new MyLinkListener());
      //myCrwaler.sendCrawlEvent(1);
      // myCrwaler.addCrawlListener(new MyCrawlLister());
      myCrwaler.run();

    } catch (MalformedURLException e) {
      // TODO Auto-generated catch block
      e.printStackTrace();
    }

  }
}

class MyLinkListener implements LinkListener {
  static int count = 0;

  public void crawled(LinkEvent event) {
    // TODO Auto-generated method stub
    System.out.println(event.getLink().toURL());
    //System.out.println(event.getLink().getParentURL());
    Page page = event.getLink().getPage();
    if (event.getLink().toURL().indexOf("yahoo.com")>=0) return;
    if (page != null) {
      count++;
      FileWriter out;
      try {
        out = new FileWriter(new File("d:/crawler/dump"
            + File.separator + count + ".html"));
        out.write(page.getContent());
        out.close();

      } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
      } finally {

      }
    }
  }

}

class MyCrawlLister implements CrawlListener {

  public void cleared(CrawlEvent arg0) {
    // TODO Auto-generated method stub

  }

  public void paused(CrawlEvent arg0) {
    // TODO Auto-generated method stub
    System.out.println("paused" + arg0.toString());
  }

  public void started(CrawlEvent arg0) {
    // TODO Auto-generated method stub
    System.out.println("Started" + arg0.toString());
    System.out.println(arg0.getCrawler().getPagesVisited());
  }

  public void stopped(CrawlEvent arg0) {
    // TODO Auto-generated method stub
    System.out.println("stopped" + arg0.toString());
    System.out.println(arg0.getCrawler().getPagesVisited());

  }

  public void timedOut(CrawlEvent arg0) {
    // TODO Auto-generated method stub
    System.out.println("Timedout" + arg0.toString());
  }

}
TOP

Related Classes of MyLinkListener

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.