Package

Source Code of MovieCrawler

/*
WVTool - Word Vector Tool
Copyright (C) 2001-2007

Michael Wurst      

web:   http://wvtool.sourceforge.net

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of the
License, or (at your option) any later version.

This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
USA.
*/

import java.io.BufferedOutputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.HashMap;
import java.util.Map;

import websphinx.CrawlEvent;
import websphinx.CrawlListener;
import websphinx.Crawler;
import websphinx.Link;
import websphinx.LinkEvent;
import websphinx.LinkListener;
import websphinx.Page;

/**
* An abstract class that must be overridden by all specialized crawlers that
* are used to construct a crawled input list.
*
* @author Michael Wurst
* @version $Id$
*
*/
public class MovieCrawler extends Crawler {

  private final Map urlsToVectorize = new HashMap();

  /** the MIME content type of the crawled documents */
  public String contentType = "";

  /** the encoding of the crawled document */
  public String contentEncoding = "";

  /** the language the documents are written in (english, german, ...) */
  public String contentLanguage = "";

  public MovieCrawler() {
    super();
  }

  public MovieCrawler(String encoding, String language, String type) {
    super();
    contentEncoding = encoding;
    contentLanguage = language;
    contentType = type;
  }

  public boolean shouldVisit(Link l) {
    return true;
  }

  public final Map getURLS() {

    return urlsToVectorize;
  }

  public static void main(String[] args) {
    MovieCrawler myCrwaler = new MovieCrawler();
    int num = 1;
    try {
      //myCrwaler.setDepthFirst(true);
      for (num = 100; num < 200; num++) {
        Long l = new Long(num);
        String ls = l.toString(num);
        if (ls.length() == 1)
          ls = "00" + ls;
        if (ls.length() == 2)
          ls = "0" + ls;
        myCrwaler.addRoot(new Link(
            "http://www.latinadulterypage.com/fhg/nem/" + ls
                + "/gall.php?10941"));
      }
      myCrwaler.setMaxDepth(1);
      myCrwaler.addLinkListener(new MyLinkListener1());
      // myCrwaler.sendCrawlEvent(1);
      // myCrwaler.addCrawlListener(new MyCrawlLister());
      myCrwaler.run();

    } catch (MalformedURLException e) {
      // TODO Auto-generated catch block
      e.printStackTrace();
    }

  }
}

class MyLinkListener1 implements LinkListener {
  int count = 0;

  public void crawled(LinkEvent event) {
    // TODO Auto-generated method stub

    //System.out.println(event.getLink().toURL());
   
    String fullpath = event.getLink().getURL().getPath();
    String name = fullpath.replace('/', '_');

    if (event.getLink().toString().indexOf("wmv") >= 0) {
      BufferedOutputStream out = null;
      InputStream in = null;

      System.out.println("found.." + event.getLink().toURL() + "name= "
          + name);
      try {
        File f = new File("d:/crawler/movie/dump" + File.separator
            + name);
        if (f!=null && f.exists())
          return;
        else {
          f.createNewFile();
        }
        count++;
        URL url = event.getLink().getURL();
        URLConnection conn = url.openConnection();
        conn.setConnectTimeout(30000);
        in = conn.getInputStream();
        out = new BufferedOutputStream(new FileOutputStream(new File(
            "d:/crawler/movie/dump" + File.separator + name)));

        byte[] buffer = new byte[1024];
        int numRead;
        long numWritten = 0;
        while ((numRead = in.read(buffer)) != -1) {
          out.write(buffer, 0, numRead);
          numWritten += numRead;
        }
        System.out.println("file downloaded: " + "\t" + numWritten);
      } catch (Exception exception) {
        exception.printStackTrace();
      } finally {
        try {
          if (in != null) {
            in.close();
          }
          if (out != null) {
            out.close();
          }

        } catch (IOException e) {
          // TODO Auto-generated catch block
          e.printStackTrace();
        } finally {

        }
      }
    }

  }
}

class MyCrawlLister1 implements CrawlListener {

  public void cleared(CrawlEvent arg0) {
    // TODO Auto-generated method stub

  }

  public void paused(CrawlEvent arg0) {
    // TODO Auto-generated method stub
    System.out.println("paused" + arg0.toString());
  }

  public void started(CrawlEvent arg0) {
    // TODO Auto-generated method stub
    System.out.println("Started" + arg0.toString());
    System.out.println(arg0.getCrawler().getPagesVisited());
  }

  public void stopped(CrawlEvent arg0) {
    // TODO Auto-generated method stub
    System.out.println("stopped" + arg0.toString());
    System.out.println(arg0.getCrawler().getPagesVisited());

  }

  public void timedOut(CrawlEvent arg0) {
    // TODO Auto-generated method stub
    System.out.println("Timedout" + arg0.toString());
  }

}
TOP

Related Classes of MovieCrawler

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.