// Source code of MovieCrawler

/*
 WVTool - Word Vector Tool
 Copyright (C) 2001-2007


 Michael Wurst       


 web:   http://wvtool.sourceforge.net


 This program is free software; you can redistribute it and/or
 modify it under the terms of the GNU General Public License as 
 published by the Free Software Foundation; either version 2 of the
 License, or (at your option) any later version. 


 This program is distributed in the hope that it will be useful, but
 WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 General Public License for more details.


 You should have received a copy of the GNU General Public License
 along with this program; if not, write to the Free Software
 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
 USA.
 */


import java.io.BufferedOutputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.HashMap;
import java.util.Map;


import websphinx.CrawlEvent;
import websphinx.CrawlListener;
import websphinx.Crawler;
import websphinx.Link;
import websphinx.LinkEvent;
import websphinx.LinkListener;
import websphinx.Page;


/**
 * An abstract class that must be overridden by all specialized crawlers that
 * are used to construct a crawled input list.
 * 
 * @author Michael Wurst
 * @version $Id$
 * 
 */
public class MovieCrawler extends Crawler {


  private final Map urlsToVectorize = new HashMap();


  /** the MIME content type of the crawled documents */
  public String contentType = "";


  /** the encoding of the crawled document */
  public String contentEncoding = "";


  /** the language the documents are written in (english, german, ...) */
  public String contentLanguage = "";


  public MovieCrawler() {
    super();
  }


  public MovieCrawler(String encoding, String language, String type) {
    super();
    contentEncoding = encoding;
    contentLanguage = language;
    contentType = type;
  }


  public boolean shouldVisit(Link l) {
    return true;
  }


  public final Map getURLS() {


    return urlsToVectorize;
  }


  public static void main(String[] args) {
    MovieCrawler myCrwaler = new MovieCrawler();
    int num = 1;
    try {
      //myCrwaler.setDepthFirst(true);
      for (num = 100; num < 200; num++) {
        Long l = new Long(num);
        String ls = l.toString(num);
        if (ls.length() == 1)
          ls = "00" + ls;
        if (ls.length() == 2)
          ls = "0" + ls;
        myCrwaler.addRoot(new Link(
            "http://www.latinadulterypage.com/fhg/nem/" + ls
                + "/gall.php?10941"));
      }
      myCrwaler.setMaxDepth(1);
      myCrwaler.addLinkListener(new MyLinkListener1());
      // myCrwaler.sendCrawlEvent(1);
      // myCrwaler.addCrawlListener(new MyCrawlLister());
      myCrwaler.run();


    } catch (MalformedURLException e) {
      // TODO Auto-generated catch block
      e.printStackTrace();
    }


  }
}


class MyLinkListener1 implements LinkListener {
  int count = 0;


  public void crawled(LinkEvent event) {
    // TODO Auto-generated method stub


    //System.out.println(event.getLink().toURL());
    
    String fullpath = event.getLink().getURL().getPath();
    String name = fullpath.replace('/', '_');


    if (event.getLink().toString().indexOf("wmv") >= 0) {
      BufferedOutputStream out = null;
      InputStream in = null;


      System.out.println("found.." + event.getLink().toURL() + "name= "
          + name);
      try {
        File f = new File("d:/crawler/movie/dump" + File.separator
            + name);
        if (f!=null && f.exists())
          return;
        else {
          f.createNewFile();
        }
        count++;
        URL url = event.getLink().getURL();
        URLConnection conn = url.openConnection();
        conn.setConnectTimeout(30000);
        in = conn.getInputStream();
        out = new BufferedOutputStream(new FileOutputStream(new File(
            "d:/crawler/movie/dump" + File.separator + name)));


        byte[] buffer = new byte[1024];
        int numRead;
        long numWritten = 0;
        while ((numRead = in.read(buffer)) != -1) {
          out.write(buffer, 0, numRead);
          numWritten += numRead;
        }
        System.out.println("file downloaded: " + "\t" + numWritten);
      } catch (Exception exception) {
        exception.printStackTrace();
      } finally {
        try {
          if (in != null) {
            in.close();
          }
          if (out != null) {
            out.close();
          }


        } catch (IOException e) {
          // TODO Auto-generated catch block
          e.printStackTrace();
        } finally {


        }
      }
    }


  }
}


class MyCrawlLister1 implements CrawlListener {


  public void cleared(CrawlEvent arg0) {
    // TODO Auto-generated method stub


  }


  public void paused(CrawlEvent arg0) {
    // TODO Auto-generated method stub
    System.out.println("paused" + arg0.toString());
  }


  public void started(CrawlEvent arg0) {
    // TODO Auto-generated method stub
    System.out.println("Started" + arg0.toString());
    System.out.println(arg0.getCrawler().getPagesVisited());
  }


  public void stopped(CrawlEvent arg0) {
    // TODO Auto-generated method stub
    System.out.println("stopped" + arg0.toString());
    System.out.println(arg0.getCrawler().getPagesVisited());


  }


  public void timedOut(CrawlEvent arg0) {
    // TODO Auto-generated method stub
    System.out.println("Timedout" + arg0.toString());
  }


}
// Source code of MovieCrawler

// Related classes of MovieCrawler