/*
WVTool - Word Vector Tool
Copyright (C) 2001-2007
Michael Wurst
web: http://wvtool.sourceforge.net
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
USA.
*/
import java.io.BufferedOutputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.HashMap;
import java.util.Map;
import websphinx.CrawlEvent;
import websphinx.CrawlListener;
import websphinx.Crawler;
import websphinx.Link;
import websphinx.LinkEvent;
import websphinx.LinkListener;
import websphinx.Page;
/**
 * A concrete {@link Crawler} that accepts every link it is asked about and
 * carries some descriptive metadata (content type, encoding, language) about
 * the documents being crawled.
 *
 * <p>When started through {@link #main(String[])} it seeds itself with a fixed
 * range of gallery pages, limits the crawl depth to 1 and registers a
 * {@code MyLinkListener1} that reacts to the crawled links.
 *
 * @author Michael Wurst
 * @version $Id$
 */
public class MovieCrawler extends Crawler {

    /** Part of every root URL that precedes the three-digit gallery number. */
    private static final String ROOT_PREFIX = "http://www.latinadulterypage.com/fhg/nem/";

    /** Part of every root URL that follows the three-digit gallery number. */
    private static final String ROOT_SUFFIX = "/gall.php?10941";

    /** First gallery number used as a crawl root (inclusive). */
    private static final int FIRST_GALLERY = 100;

    /** Gallery number at which seeding stops (exclusive). */
    private static final int END_GALLERY = 200;

    /**
     * Map exposed through {@link #getURLS()}. It is kept as a raw type because
     * nothing in this file adds entries, so the key/value types are unknown here.
     */
    private final Map urlsToVectorize = new HashMap();

    /** the MIME content type of the crawled documents */
    public String contentType = "";

    /** the encoding of the crawled document */
    public String contentEncoding = "";

    /** the language the documents are written in (english, german, ...) */
    public String contentLanguage = "";

    /** Creates a crawler whose content type, encoding and language are empty strings. */
    public MovieCrawler() {
        super();
    }

    /**
     * Creates a crawler with the given document metadata.
     *
     * @param encoding value stored in {@link #contentEncoding}
     * @param language value stored in {@link #contentLanguage}
     * @param type value stored in {@link #contentType}
     */
    public MovieCrawler(String encoding, String language, String type) {
        super();
        contentEncoding = encoding;
        contentLanguage = language;
        contentType = type;
    }

    /**
     * Accepts every link unconditionally.
     *
     * @param l the link under consideration (not inspected)
     * @return always {@code true}
     */
    public boolean shouldVisit(Link l) {
        return true;
    }

    /**
     * Returns the internal URL map itself (not a copy), so changes made by the
     * caller are visible to this crawler.
     *
     * @return the live map held by this crawler
     */
    public final Map getURLS() {
        return urlsToVectorize;
    }

    /**
     * Left-pads a non-negative number with zeros to at least three digits.
     *
     * @param number the value to format
     * @return the decimal representation, zero-padded to three characters
     */
    private static String zeroPad3(int number) {
        // Integer.toString replaces the deprecated "new Long(n)" boxing and the
        // static-method-through-instance call of the previous version.
        String digits = Integer.toString(number);
        while (digits.length() < 3) {
            digits = "0" + digits;
        }
        return digits;
    }

    /**
     * Seeds a crawler with the gallery pages {@code 100} to {@code 199},
     * restricts the crawl depth to 1, attaches a {@code MyLinkListener1} and
     * runs the crawl on the calling thread.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        MovieCrawler crawler = new MovieCrawler();
        try {
            for (int num = FIRST_GALLERY; num < END_GALLERY; num++) {
                crawler.addRoot(new Link(ROOT_PREFIX + zeroPad3(num) + ROOT_SUFFIX));
            }
            crawler.setMaxDepth(1);
            crawler.addLinkListener(new MyLinkListener1());
            crawler.run();
        } catch (MalformedURLException e) {
            // A root URL could not be parsed; nothing is crawled in that case.
            e.printStackTrace();
        }
    }
}
/**
 * Link listener that downloads every crawled link whose textual form contains
 * {@code "wmv"} into a fixed dump directory.
 *
 * <p>The target file name is the URL path with path separators replaced by
 * underscores. A file that already exists is never downloaded again.
 */
class MyLinkListener1 implements LinkListener {

    /** Directory that receives the downloaded files. */
    private static final String DUMP_DIR = "d:/crawler/movie/dump";

    /** Maximum time to wait for the connection to be established, in ms. */
    private static final int CONNECT_TIMEOUT_MS = 30000;

    /** Size of the copy buffer in bytes. */
    private static final int BUFFER_SIZE = 8192;

    /** Number of downloads that have been started by this listener. */
    int count = 0;

    /**
     * Handles a link event: if the link mentions {@code "wmv"} and no file for
     * it exists yet, the linked resource is downloaded.
     *
     * <p>Failures are reported on standard error and never propagate to the
     * caller. A failed download removes its partial file, so a later event for
     * the same link may try again.
     *
     * @param event the link event delivered by the crawler
     */
    public void crawled(LinkEvent event) {
        if (event.getLink().toString().indexOf("wmv") < 0) {
            return;
        }

        URL url = event.getLink().getURL();
        // The name comes from a remote URL: neutralise both separator styles so
        // it can never address anything outside the dump directory.
        String name = url.getPath().replace('/', '_').replace('\\', '_');
        System.out.println("found.." + event.getLink().toURL() + "name= "
                + name);
        if (name.length() == 0) {
            return; // no path component, nothing sensible to name the file
        }

        File target = new File(DUMP_DIR + File.separator + name);
        boolean claimed = false;
        boolean complete = false;
        try {
            File dir = target.getParentFile();
            if (dir != null && !dir.exists()) {
                dir.mkdirs();
            }
            // createNewFile checks for existence and creates the file in one
            // atomic step; false means the file was already there.
            if (!target.createNewFile()) {
                return;
            }
            claimed = true;
            count++;

            long numWritten = download(url, target);
            complete = true;
            System.out.println("file downloaded: " + "\t" + numWritten);
        } catch (Exception exception) {
            // Listener boundary: report the problem but keep the crawl going.
            exception.printStackTrace();
        } finally {
            // Do not leave an empty or truncated file behind; it would look
            // like a finished download and block every later attempt.
            if (claimed && !complete && !target.delete()) {
                System.err.println("could not remove incomplete file " + target);
            }
        }
    }

    /**
     * Copies the content behind {@code url} into {@code target}.
     *
     * @param url the resource to fetch
     * @param target the file to overwrite with the fetched bytes
     * @return the number of bytes written
     * @throws IOException if connecting, reading or writing fails
     */
    private long download(URL url, File target) throws IOException {
        InputStream in = null;
        BufferedOutputStream out = null;
        try {
            URLConnection conn = url.openConnection();
            conn.setConnectTimeout(CONNECT_TIMEOUT_MS);
            in = conn.getInputStream();
            out = new BufferedOutputStream(new FileOutputStream(target));

            byte[] buffer = new byte[BUFFER_SIZE];
            long total = 0;
            int numRead;
            while ((numRead = in.read(buffer)) != -1) {
                out.write(buffer, 0, numRead);
                total += numRead;
            }
            // Flush here so a failing final write surfaces as an exception
            // instead of being hidden in the best-effort close below.
            out.flush();
            return total;
        } finally {
            // Close each stream in its own try block so that a failure on the
            // first cannot leak the second.
            try {
                if (in != null) {
                    in.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
            try {
                if (out != null) {
                    out.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
/**
 * Crawl listener that writes a short trace of crawl state changes to standard
 * output. Clearing the crawler is ignored; every other notification prints a
 * label followed by the event's string form, and start/stop additionally print
 * the crawler's visited-page counter.
 */
class MyCrawlLister1 implements CrawlListener {

    /** Prints {@code label} immediately followed by the event's string form. */
    private static void announce(String label, CrawlEvent event) {
        String description = event.toString();
        System.out.println(label + description);
    }

    /** Prints the number of pages the event's crawler reports as visited. */
    private static void printPagesVisited(CrawlEvent event) {
        System.out.println(event.getCrawler().getPagesVisited());
    }

    /** Intentionally does nothing. */
    public void cleared(CrawlEvent arg0) {
    }

    /** Reports that the crawl was paused. */
    public void paused(CrawlEvent arg0) {
        announce("paused", arg0);
    }

    /** Reports that the crawl started, then the visited-page counter. */
    public void started(CrawlEvent arg0) {
        announce("Started", arg0);
        printPagesVisited(arg0);
    }

    /** Reports that the crawl stopped, then the visited-page counter. */
    public void stopped(CrawlEvent arg0) {
        announce("stopped", arg0);
        printPagesVisited(arg0);
    }

    /** Reports that the crawl timed out. */
    public void timedOut(CrawlEvent arg0) {
        announce("Timedout", arg0);
    }
}