// Source code of crawler.movie.MyCrawlLister1

package crawler.movie;


import java.io.BufferedOutputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.HashMap;
import java.util.Map;


import websphinx.CrawlEvent;
import websphinx.CrawlListener;
import websphinx.Crawler;
import websphinx.Link;
import websphinx.LinkEvent;
import websphinx.LinkListener;
import websphinx.Page;


/**
 * An abstract class that must be overridden by all specialized crawlers that
 * are used to construct a crawled input list.
 * 
 * @author Michael Wurst
 * @version $Id$
 * 
 */
public class MovieCrawler extends Crawler {


  private final Map urlsToVectorize = new HashMap();


  /** the MIME content type of the crawled documents */
  public String contentType = "";


  /** the encoding of the crawled document */
  public String contentEncoding = "";


  /** the language the documents are written in (english, german, ...) */
  public String contentLanguage = "";


  public MovieCrawler() {
    super();
  }


  public MovieCrawler(String encoding, String language, String type) {
    super();
    contentEncoding = encoding;
    contentLanguage = language;
    contentType = type;
  }


  public boolean shouldVisit(Link l) {
    return true;
  }


  public final Map getURLS() {


    return urlsToVectorize;
  }


  public static void main(String[] args) {
    MovieCrawler myCrwaler = new MovieCrawler();
    int num = 1;
    try {
      //myCrwaler.setDepthFirst(true);
      for (num = 10; num < 525; num++) {
        Long l = new Long(num); 
        String ls = l.toString(num);
        if (ls.length() == 1)
          ls = "00" + ls;
        if (ls.length() == 2)
          ls = "0" + ls;
        myCrwaler.addRoot(new Link(
            "http://gallys.nastydollars.com/sl/" + ls
                + "/?id=panda22"));
        //http://www.naughtyofficegallery.com/fhg/sco/104/gall.php?10941


        //myCrwaler.addRoot(new Link("http://da.thebrazzers.com/galleries/" + ls
        //        + "/?t=14&nats=MjE1OTozOjE"));


        




      }
      
      myCrwaler.setMaxDepth(1);
      myCrwaler.addLinkListener(new MyLinkListener1());
      // myCrwaler.sendCrawlEvent(1);
      // myCrwaler.addCrawlListener(new MyCrawlLister());
      myCrwaler.run();


    } catch (MalformedURLException e) {
      // TODO Auto-generated catch block
      e.printStackTrace();
    }


  }
}


class MyLinkListener1 implements LinkListener {
  int count = 0;


  public void crawled(LinkEvent event) {
    // TODO Auto-generated method stub


    //System.out.println(event.getLink().toURL());
    
    String fullpath = event.getLink().getURL().getPath();
    String name = fullpath.replace('/', '_');


    if ((event.getLink().toString().indexOf(".mpg") >= 0 || event.getLink().toString().indexOf(".wmv") >= 0) 
                        && event.getLink().toString().indexOf(".htm")<0) {
      BufferedOutputStream out = null;
      InputStream in = null;


      System.out.println("found.." + event.getLink().toURL() + "name= "
          + name);
      try {
        File f = new File("d:/crawler/movie/dump" + File.separator
            + name);
        if (f!=null && f.exists())
          return;
        else {
          f.createNewFile();
        }
        count++;
        URL url = event.getLink().getURL();
        URLConnection conn = url.openConnection();
        //conn.setConnectTimeout(30000);
        in = conn.getInputStream();
        out = new BufferedOutputStream(new FileOutputStream(new File(
            "d:/crawler/movie/dump" + File.separator + name)));


        byte[] buffer = new byte[1024];
        int numRead;
        long numWritten = 0;
        while ((numRead = in.read(buffer)) != -1) {
          out.write(buffer, 0, numRead);
          numWritten += numRead;
        }
        System.out.println("file downloaded: " + "\t" + numWritten);
      } catch (Exception exception) {
        exception.printStackTrace();
      } finally {
        try {
          if (in != null) {
            in.close();
          }
          if (out != null) {
            out.close();
          }


        } catch (IOException e) {
          // TODO Auto-generated catch block
          e.printStackTrace();
        } finally {


        }
      }
    }


  }
}


class MyCrawlLister1 implements CrawlListener {


  public void cleared(CrawlEvent arg0) {
    // TODO Auto-generated method stub


  }


  public void paused(CrawlEvent arg0) {
    // TODO Auto-generated method stub
    System.out.println("paused" + arg0.toString());
  }


  public void started(CrawlEvent arg0) {
    // TODO Auto-generated method stub
    System.out.println("Started" + arg0.toString());
    System.out.println(arg0.getCrawler().getPagesVisited());
  }


  public void stopped(CrawlEvent arg0) {
    // TODO Auto-generated method stub
    System.out.println("stopped" + arg0.toString());
    System.out.println(arg0.getCrawler().getPagesVisited());


  }


  public void timedOut(CrawlEvent arg0) {
    // TODO Auto-generated method stub
    System.out.println("Timedout" + arg0.toString());
  }


}
// Source code of crawler.movie.MyCrawlLister1
//
// Related classes of crawler.movie.MyCrawlLister1