/*
* Copyright (C) 2010 Ex-Crawler Project. All Rights Reserved.
* http://ex-crawler.sourceforge.net
*
* This is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This software is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this software; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
* USA.
*/
package de.excrawler.server;
import java.util.logging.*;
import java.sql.*;
import java.io.*;
import java.net.*;
import java.util.regex.Pattern;
import java.util.Date;
/**
 * Downloads websites and saves them to temporary files.
 *
 * <p>A worker is created with a {@code |}-separated list of table suffixes. It walks
 * through the matching {@code crawllist_<suffix>} tables in round-robin order, fetches up
 * to {@link #limit} rows whose {@code status} is 0 or 5, downloads each row's {@code url}
 * into {@code CrawlerConfig.TMPDIR/tmp_<suffix>_<id>} and writes the outcome back to the
 * row. The loop ends once {@link #WORKING} is 0 at the end of a pass, or when the thread
 * is interrupted while pausing after a failed pass.
 *
 * @author Yves Hoppe
 */
public class DownWebWorker extends Thread {

    /** Splits the {@code |}-separated suffix list; compiled once instead of on every run. */
    private static final Pattern LETTER_SPLITTER = Pattern.compile("[|]");

    /** Pause after a failed pass, so a persistent failure does not turn into a busy loop. */
    private static final long RETRY_DELAY_MS = 5000L;

    /** {@code |}-separated list of crawl list table suffixes this worker handles. */
    public String letter;

    /** Maximum number of rows fetched from a crawl list table per pass. */
    public int limit;

    /** Human readable description of the most recent download; written by every worker. */
    public static volatile String status = "Download web worker is doing nothing";

    /** Workers leave their loop when this is 0; volatile so the change is seen across threads. */
    public static volatile int WORKING = 1;

    Logger logger = Logger.getLogger(Main.class.getName());

    /** Connection used by the most recent pass; it is closed again when the pass ends. */
    Connection connection = null;

    /**
     * Creates a worker thread named {@code downweb-<chr>}.
     *
     * @param chr {@code |}-separated list of crawl list table suffixes
     * @param lim maximum number of rows to fetch per pass
     */
    DownWebWorker(String chr, int lim) {
        super("downweb-" + chr);
        this.letter = chr;
        this.limit = lim;
    }

    /**
     * Cycles through the configured crawl list tables until {@link #WORKING} is 0.
     *
     * <p>A failed pass is logged and followed by a short pause, after which the worker
     * moves on to the next table. The previous code tried to call {@code start()} on the
     * already running thread, which always throws {@link IllegalThreadStateException}.
     */
    @Override
    public void run() {
        String[] letters = (letter == null) ? new String[0] : LETTER_SPLITTER.split(letter);
        if (letters.length == 0) {
            // Nothing to index into; without this guard the loop below would fail immediately.
            logger.log(Level.SEVERE, "DownWebWorker " + Thread.currentThread().getName()
                    + " has no crawl list to work on");
            return;
        }

        int current = 0;
        while (true) {
            boolean failed = false;
            try {
                processCrawlList(letters[current]);
            } catch (Exception e) {
                logger.log(Level.SEVERE, "DownWebWorker " + Thread.currentThread().getName() + " hangs", e);
                failed = true;
            }

            if (WORKING == 0) {
                break;
            }

            if (failed) {
                try {
                    Thread.sleep(RETRY_DELAY_MS);
                } catch (InterruptedException ie) {
                    // Keep the interrupt flag set for whoever inspects this thread and stop.
                    Thread.currentThread().interrupt();
                    break;
                }
            }

            // Round robin over the configured suffixes.
            if (current < letters.length - 1) {
                current++;
            } else {
                current = 0;
            }
        }
    } // End run

    /**
     * Runs one pass over a single crawl list table: selects the pending rows, downloads
     * each of them and writes the result back through the updatable result set.
     *
     * @param currentLetter suffix of the {@code crawllist_} table to process
     * @throws Exception if the connection cannot be obtained or a database call fails;
     *         declared broadly because the failure types of
     *         {@code DatabaseTools.getDBConnection()} are not visible here
     */
    private void processCrawlList(String currentLetter) throws Exception {
        Connection conn = null;
        PreparedStatement statement = null;
        ResultSet entry = null;
        try {
            conn = DatabaseTools.getDBConnection();
            connection = conn;

            // The table name cannot be bound as a statement parameter, so it is concatenated.
            // NOTE(review): this relies on the suffix coming from a trusted source - confirm at the caller.
            String sql = "Select * FROM crawllist_" + currentLetter + " WHERE status = 0 or status = 5 ORDER by priority ASC LIMIT 0, ?";
            statement = conn.prepareStatement(sql, ResultSet.TYPE_SCROLL_SENSITIVE, ResultSet.CONCUR_UPDATABLE);
            statement.setInt(1, limit);
            entry = statement.executeQuery();

            while (entry.next()) {
                String address = entry.getString("url");
                status = "DownloadWebWorker downloads: " + address;

                File target = new File(CrawlerConfig.TMPDIR + File.separatorChar + "tmp_" + currentLetter + "_" + entry.getInt("id"));

                if (download(address, target)) {
                    entry.updateInt("status", 1);
                    entry.updateRow();
                } else {
                    // Failed download: set status 7, raise the priority value by 3 and stamp the row.
                    entry.updateInt("status", 7);
                    entry.updateInt("priority", entry.getInt("priority") + 3);
                    entry.updateTimestamp("date", new java.sql.Timestamp(System.currentTimeMillis()));
                    entry.updateRow();
                }
            }
        } finally {
            // Release the JDBC resources on every path, not only after a clean pass.
            closeQuietly(entry);
            closeQuietly(statement);
            closeQuietly(conn);
        }
    }

    /**
     * Copies the non-empty lines of the document at {@code address} into {@code target}.
     *
     * <p>Any failure is logged at INFO level and reported through the return value, so a
     * single unreachable or malformed URL never aborts the surrounding pass.
     *
     * @param address URL to download
     * @param target temporary file that receives the content
     * @return {@code true} if the document was read and written completely
     */
    private boolean download(String address, File target) {
        InputStream is = null;
        BufferedReader reader = null;
        BufferedWriter writer = null;
        try {
            URL aURL = new URL(address);
            target.createNewFile();
            is = aURL.openStream();
            reader = new BufferedReader(new InputStreamReader(is));
            writer = new BufferedWriter(new FileWriter(target), 1000000);

            String line;
            while ((line = reader.readLine()) != null) {
                if (!line.isEmpty()) {
                    writer.write(line);
                    writer.newLine();
                }
            }

            // Flush and close inside the try so a failed final write counts as a failed download.
            writer.flush();
            writer.close();
            return true;
        } catch (Exception e) {
            logger.log(Level.INFO, "Error at Download Worker " + Thread.currentThread().getName(), e); // too many errors out there, so just info
            return false;
        } finally {
            closeQuietly(writer);
            closeQuietly(reader);
            closeQuietly(is);
        }
    }

    /** Closes a stream, reader or writer; {@code null} and close failures are ignored. */
    private static void closeQuietly(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException ignored) {
            // Best effort clean-up: there is nothing useful to do if close itself fails.
        }
    }

    /** Closes a result set; {@code null} and close failures are ignored. */
    private static void closeQuietly(ResultSet resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (SQLException ignored) {
            // Best effort clean-up: there is nothing useful to do if close itself fails.
        }
    }

    /** Closes a statement; {@code null} and close failures are ignored. */
    private static void closeQuietly(Statement resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (SQLException ignored) {
            // Best effort clean-up: there is nothing useful to do if close itself fails.
        }
    }

    /** Closes a connection; {@code null} and close failures are ignored. */
    private static void closeQuietly(Connection resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (SQLException ignored) {
            // Best effort clean-up: there is nothing useful to do if close itself fails.
        }
    }
}