Package crawler.movie

Source Code of crawler.movie.MyCrawlLister1

package crawler.movie;

import java.io.BufferedOutputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.HashMap;
import java.util.Map;

import websphinx.CrawlEvent;
import websphinx.CrawlListener;
import websphinx.Crawler;
import websphinx.Link;
import websphinx.LinkEvent;
import websphinx.LinkListener;
import websphinx.Page;

/**
* An abstract class that must be overridden by all specialized crawlers that
* are used to construct a crawled input list.
*
* @author Michael Wurst
* @version $Id$
*
*/
public class MovieCrawler extends Crawler {

  private final Map urlsToVectorize = new HashMap();

  /** the MIME content type of the crawled documents */
  public String contentType = "";

  /** the encoding of the crawled document */
  public String contentEncoding = "";

  /** the language the documents are written in (english, german, ...) */
  public String contentLanguage = "";

  public MovieCrawler() {
    super();
  }

  public MovieCrawler(String encoding, String language, String type) {
    super();
    contentEncoding = encoding;
    contentLanguage = language;
    contentType = type;
  }

  public boolean shouldVisit(Link l) {
    return true;
  }

  public final Map getURLS() {

    return urlsToVectorize;
  }

  public static void main(String[] args) {
    MovieCrawler myCrwaler = new MovieCrawler();
    int num = 1;
    try {
      //myCrwaler.setDepthFirst(true);
      for (num = 10; num < 525; num++) {
        Long l = new Long(num);
        String ls = l.toString(num);
        if (ls.length() == 1)
          ls = "00" + ls;
        if (ls.length() == 2)
          ls = "0" + ls;
        myCrwaler.addRoot(new Link(
            "http://gallys.nastydollars.com/sl/" + ls
                + "/?id=panda22"));
        //http://www.naughtyofficegallery.com/fhg/sco/104/gall.php?10941

        //myCrwaler.addRoot(new Link("http://da.thebrazzers.com/galleries/" + ls
        //        + "/?t=14&nats=MjE1OTozOjE"));

       


      }
     
      myCrwaler.setMaxDepth(1);
      myCrwaler.addLinkListener(new MyLinkListener1());
      // myCrwaler.sendCrawlEvent(1);
      // myCrwaler.addCrawlListener(new MyCrawlLister());
      myCrwaler.run();

    } catch (MalformedURLException e) {
      // TODO Auto-generated catch block
      e.printStackTrace();
    }

  }
}

class MyLinkListener1 implements LinkListener {
  int count = 0;

  public void crawled(LinkEvent event) {
    // TODO Auto-generated method stub

    //System.out.println(event.getLink().toURL());
   
    String fullpath = event.getLink().getURL().getPath();
    String name = fullpath.replace('/', '_');

    if ((event.getLink().toString().indexOf(".mpg") >= 0 || event.getLink().toString().indexOf(".wmv") >= 0)
                        && event.getLink().toString().indexOf(".htm")<0) {
      BufferedOutputStream out = null;
      InputStream in = null;

      System.out.println("found.." + event.getLink().toURL() + "name= "
          + name);
      try {
        File f = new File("d:/crawler/movie/dump" + File.separator
            + name);
        if (f!=null && f.exists())
          return;
        else {
          f.createNewFile();
        }
        count++;
        URL url = event.getLink().getURL();
        URLConnection conn = url.openConnection();
        //conn.setConnectTimeout(30000);
        in = conn.getInputStream();
        out = new BufferedOutputStream(new FileOutputStream(new File(
            "d:/crawler/movie/dump" + File.separator + name)));

        byte[] buffer = new byte[1024];
        int numRead;
        long numWritten = 0;
        while ((numRead = in.read(buffer)) != -1) {
          out.write(buffer, 0, numRead);
          numWritten += numRead;
        }
        System.out.println("file downloaded: " + "\t" + numWritten);
      } catch (Exception exception) {
        exception.printStackTrace();
      } finally {
        try {
          if (in != null) {
            in.close();
          }
          if (out != null) {
            out.close();
          }

        } catch (IOException e) {
          // TODO Auto-generated catch block
          e.printStackTrace();
        } finally {

        }
      }
    }

  }
}

class MyCrawlLister1 implements CrawlListener {

  public void cleared(CrawlEvent arg0) {
    // TODO Auto-generated method stub

  }

  public void paused(CrawlEvent arg0) {
    // TODO Auto-generated method stub
    System.out.println("paused" + arg0.toString());
  }

  public void started(CrawlEvent arg0) {
    // TODO Auto-generated method stub
    System.out.println("Started" + arg0.toString());
    System.out.println(arg0.getCrawler().getPagesVisited());
  }

  public void stopped(CrawlEvent arg0) {
    // TODO Auto-generated method stub
    System.out.println("stopped" + arg0.toString());
    System.out.println(arg0.getCrawler().getPagesVisited());

  }

  public void timedOut(CrawlEvent arg0) {
    // TODO Auto-generated method stub
    System.out.println("Timedout" + arg0.toString());
  }

}
TOP

Related Classes of crawler.movie.MyCrawlLister1

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., and is owned by ORACLE Inc. Contact coftware#gmail.com.