/*
* Copyright (C) 2010 Yves Hoppe. All Rights Reserved.
* http://ex-crawler.sourceforge.net
*
* This is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This software is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this software; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
* USA.
*/
package de.excrawler.server;
import java.sql.*;
import java.util.logging.Logger;
import java.util.logging.Level;
import java.util.regex.Pattern;
/**
 * Static database helpers for rows of the {@code hosts} table.
 *
 * <p>{@link #saveHost} inserts a host or refreshes the bookkeeping columns of
 * an existing one; {@link #getHostPriority} derives a crawl priority from the
 * stored host row. Every call obtains its own connection from
 * {@code DatabaseTools.getDBConnection()} and closes it before returning.
 *
 * @author Yves Hoppe
 */
public class DbHost extends Thread {

    /** Shared logger; the logger name is taken from {@code Main}, as before. */
    private static final Logger LOGGER = Logger.getLogger(Main.class.getName());

    /**
     * Inserts {@code host} into the {@code hosts} table if no row with that
     * host name exists, otherwise updates the existing row: {@code lastcrawl}
     * is set to the current time, {@code crawlcount} is incremented and the
     * stored {@code ip} is replaced when it differs from the given one.
     *
     * @param host        host name, used as the lookup key
     * @param protocol    protocol stored for a newly inserted host
     * @param ip          ip stored for a new host; for an existing host it
     *                    replaces a non-null stored ip that differs from it
     * @param subdomain   subdomain stored for a newly inserted host
     * @param port        port stored for a newly inserted host
     * @param countrycode country code stored for a newly inserted host
     * @param encoding    encoding stored for a newly inserted host
     * @param trusted     trust level stored for a newly inserted host
     * @param status      currently ignored: new hosts are always inserted with
     *                    status 0 and the id lookup relies on that value
     * @param wired       currently unused
     * @return the {@code id} column of the host row, or 0 if none could be read
     * @throws Exception if obtaining the connection or any SQL operation fails
     */
    public static int saveHost(String host, String protocol, String ip, String subdomain,
            int port, String countrycode, String encoding, int trusted, int status,
            int wired) throws Exception {
        Connection connection = null;
        PreparedStatement lookup = null;
        ResultSet entry = null;
        try {
            connection = DatabaseTools.getDBConnection();
            // Scrollable + updatable: the rows are counted first and the
            // existing row is then updated in place through the result set.
            lookup = connection.prepareStatement("Select * FROM hosts WHERE host = ?",
                    ResultSet.TYPE_SCROLL_SENSITIVE, ResultSet.CONCUR_UPDATABLE);
            lookup.setString(1, host);
            entry = lookup.executeQuery();

            entry.last();
            int rows = entry.getRow(); // 0 when the result set is empty
            entry.beforeFirst();
            LOGGER.info("Host count: " + rows);

            if (rows == 0) {
                insertHost(connection, host, protocol, ip, subdomain, port,
                        countrycode, encoding, trusted);
                return findNewHostId(connection, host);
            }
            return refreshExistingHost(entry, ip);
        } finally {
            closeQuietly(entry);
            closeQuietly(lookup);
            closeQuietly(connection);
        }
    } // End saveHost

    /** Inserts a new host row with status 0 and first/last crawl set to now. */
    private static void insertHost(Connection connection, String host, String protocol,
            String ip, String subdomain, int port, String countrycode, String encoding,
            int trusted) throws SQLException {
        LOGGER.info("Creating new Host entry");
        String sql = "INSERT INTO hosts ( host, protocol, ip, subdomain, port, countrycode, encoding, firstcrawl, lastcrawl, trusted, status) "
                + "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)";
        PreparedStatement insert = null;
        try {
            insert = connection.prepareStatement(sql);
            java.sql.Timestamp now = new java.sql.Timestamp(System.currentTimeMillis());
            insert.setString(1, host);
            insert.setString(2, protocol);
            insert.setString(3, ip);
            insert.setString(4, subdomain);
            insert.setInt(5, port);
            insert.setString(6, countrycode);
            insert.setString(7, encoding);
            insert.setTimestamp(8, now);
            insert.setTimestamp(9, now);
            insert.setInt(10, trusted);
            insert.setInt(11, 0);
            insert.execute();
            LOGGER.info("Statement" + insert);
        } finally {
            closeQuietly(insert);
        }
    }

    /** Reads back the id of the status-0 row for {@code host}; 0 if none is found. */
    private static int findNewHostId(Connection connection, String host) throws SQLException {
        PreparedStatement select = null;
        ResultSet result = null;
        try {
            select = connection.prepareStatement("Select * FROM hosts WHERE host = ? AND status = 0");
            select.setString(1, host);
            result = select.executeQuery();
            int id = 0;
            while (result.next()) {
                id = result.getInt("id"); // the last matching row wins
            }
            return id;
        } finally {
            closeQuietly(result);
            closeQuietly(select);
        }
    }

    /**
     * Updates lastcrawl, crawlcount and (if changed) ip on the first row of
     * {@code entry}, which must be positioned before its first row, and
     * returns the id of the last row in the result set.
     */
    private static int refreshExistingHost(ResultSet entry, String ip) throws SQLException {
        int id = 0;
        while (entry.next()) {
            LOGGER.info("Host is already in index");
            id = entry.getInt("id");
        }

        entry.first();
        // Read the current values before touching the row.
        String storedIp = entry.getString("ip");
        int crawlcount = entry.getInt("crawlcount") + 1;

        entry.updateTimestamp("lastcrawl", new java.sql.Timestamp(System.currentTimeMillis()));
        // Only rewrite the ip when it really changed. The previous check was
        // inverted and fired when the stored ip was identical. A null new ip
        // never overwrites a stored one.
        if (storedIp != null && ip != null && !storedIp.equals(ip)) {
            entry.updateString("ip", ip);
            LOGGER.info("Host has a new ip adress: " + ip);
        }
        entry.updateInt("crawlcount", crawlcount);
        LOGGER.info("Updated number of crawl to: " + crawlcount);
        entry.updateRow();
        return id;
    }

    /**
     * Guesses a basic priority for a host, used by getLinksPriority. Lower
     * values are the better rating.
     *
     * <p>Starts at 20. A host without a status-0 row in {@code hosts} gets 10.
     * Otherwise the value is adjusted by the stored crawl count, trust level,
     * subdomain, protocol, port and number of websites. Any exception is
     * logged and the priority computed up to that point is returned.
     *
     * @param newhost host name to rate
     * @return the computed priority
     * @throws Exception declared for compatibility with existing callers;
     *                   failures inside this method are logged, not rethrown
     */
    public static int getHostPriority(String newhost) throws Exception {
        int priority = 20;
        Connection connection = null;
        PreparedStatement statement = null;
        ResultSet entry = null;
        try {
            connection = DatabaseTools.getDBConnection();
            // Read-only lookup, so a default forward-only cursor is enough.
            statement = connection.prepareStatement("Select * FROM hosts WHERE host = ? AND status = 0");
            statement.setString(1, newhost);
            entry = statement.executeQuery();

            if (!entry.next()) {
                priority -= 10; // New hosts get default rating 10
            } else {
                priority += crawlCountAdjustment(entry.getInt("crawlcount"));
                priority += trustAdjustment(entry.getInt("trusted"));

                String subdomain = entry.getString("subdomain");
                if (subdomain == null || subdomain.equalsIgnoreCase("www")) {
                    priority -= 2;
                } else {
                    priority += 1; // subdomains rating isn't so high
                }

                // The country code is not rated here; per the original note
                // it is already used at getLinks() @ analyzeWebsiteCore.

                // Constant-first equals: a NULL protocol column counts as
                // "not http" instead of aborting the rating with an NPE.
                if (!"http".equals(entry.getString("protocol"))) {
                    priority += 1; // A bit worse priority for non http hosts like https
                }

                int port = entry.getInt("port");
                if (port != 80) {
                    priority += 2; // A bit worse priority for non port 80 websites
                }
                if (port > 1024) {
                    priority += 2; // Worse again for ports that need no root to bind
                }

                if (entry.getInt("websites") > 3) {
                    priority += 2; // A bit worse priority because of too many subdomains
                }
            }
        } catch (Exception e) {
            LOGGER.log(Level.WARNING, "Error at getting host rating for getLinks() " + newhost, e);
        } finally {
            closeQuietly(entry);
            closeQuietly(statement);
            closeQuietly(connection);
        }
        return priority;
    } // End getHostPriority

    /**
     * Priority delta for a crawl count. Negative counts give no adjustment;
     * everything above 5000 is penalised (5001 used to fall through unrated).
     */
    private static int crawlCountAdjustment(int sitecount) {
        if (sitecount < 0) {
            return 0;
        } else if (sitecount <= 100) {
            return -8; // Site is not often in the index
        } else if (sitecount <= 500) {
            return -6;
        } else if (sitecount <= 1000) {
            return -5;
        } else if (sitecount <= 2000) {
            return -4;
        } else if (sitecount <= 5000) {
            return -2;
        }
        return 2;
    }

    /** Priority delta for the stored trust level; levels outside 3..9 are penalised. */
    private static int trustAdjustment(int trusted) {
        switch (trusted) {
            case 7:
            case 8:
            case 9:
                return -3;
            case 5:
            case 6:
                return -2;
            case 3:
            case 4:
                return -1;
            default:
                return 1;
        }
    }

    /** Closes a result set if non-null; a failure is logged, never thrown. */
    private static void closeQuietly(ResultSet resultSet) {
        if (resultSet == null) {
            return;
        }
        try {
            resultSet.close();
        } catch (Exception e) {
            LOGGER.log(Level.WARNING, "Could not close result set", e);
        }
    }

    /** Closes a statement if non-null; a failure is logged, never thrown. */
    private static void closeQuietly(Statement statement) {
        if (statement == null) {
            return;
        }
        try {
            statement.close();
        } catch (Exception e) {
            LOGGER.log(Level.WARNING, "Could not close statement", e);
        }
    }

    /** Closes a connection if non-null; a failure is logged, never thrown. */
    private static void closeQuietly(Connection connection) {
        if (connection == null) {
            return;
        }
        try {
            connection.close();
        } catch (Exception e) {
            LOGGER.log(Level.WARNING, "Could not close connection", e);
        }
    }
} // End DbHost