Package de.excrawler.server

Source Code of de.excrawler.server.DbHost

/*
*  Copyright (C) 2010 Yves Hoppe.  All Rights Reserved.
*  http://ex-crawler.sourceforge.net
*
*  This is free software; you can redistribute it and/or modify
*  it under the terms of the GNU General Public License as published by
*  the Free Software Foundation; either version 2 of the License, or
*  (at your option) any later version.
*
*  This software is distributed in the hope that it will be useful,
*  but WITHOUT ANY WARRANTY; without even the implied warranty of
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
*  GNU General Public License for more details.
*
*  You should have received a copy of the GNU General Public License
*  along with this software; if not, write to the Free Software
*  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307,
*  USA.
*/
package de.excrawler.server;

import java.sql.*;
import java.util.logging.Logger;
import java.util.logging.Level;
import java.util.regex.Pattern;
/**
*
* @author Yves Hoppe
*/
public class DbHost extends Thread {


public static int saveHost(String host, String protocol, String ip, String subdomain,
        int port, String countrycode, String encoding, int trusted, int status,
        int wired) throws Exception {
   
    Logger logger = Logger.getLogger(Main.class.getName());
    Connection connection = null;
    try {
   
    int id = 0;
    connection = DatabaseTools.getDBConnection();

    String sql = "Select * FROM hosts WHERE host = ?";

    PreparedStatement statement = connection.prepareStatement(sql, ResultSet.TYPE_SCROLL_SENSITIVE, ResultSet.CONCUR_UPDATABLE);
    statement.setString(1, host);

    ResultSet entry = statement.executeQuery();

    entry.last();
    int rows = entry.getRow();
    entry.beforeFirst();

    logger.info("Host count: " + rows);

    if(rows == 0)
    {
        logger.info("Creating new Host entry");
        sql = "INSERT INTO hosts ( host, protocol, ip, subdomain, port, countrycode, encoding, firstcrawl, lastcrawl, trusted, status) " +
                "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)";

         statement = connection.prepareStatement(sql);

         statement.setString(1, host);
         statement.setString(2, protocol);
         statement.setString(3, ip);
         statement.setString(4, subdomain);
         statement.setInt(5, port);
         statement.setString(6, countrycode);
         statement.setString(7, encoding);
         statement.setTimestamp(8, new java.sql.Timestamp(System.currentTimeMillis()));
         statement.setTimestamp(9, new java.sql.Timestamp(System.currentTimeMillis()));
         statement.setInt(10, trusted);
         statement.setInt(11, 0);

         statement.execute();

         logger.info("Statement" + statement);

         // Getting new id
        
         sql = "Select * FROM hosts WHERE host = ? AND status = 0";
         statement = connection.prepareStatement(sql);
         statement.setString(1, host);
         entry = statement.executeQuery();

         while(entry.next())
         {
             id = entry.getInt("id");
         }      
    } else {
        while (entry.next())
        {
            logger.info("Host is already in index");
            id = entry.getInt("id");
        }
       
        entry.first();
        entry.updateTimestamp("lastcrawl", new java.sql.Timestamp(System.currentTimeMillis()));

        if (entry.getString("ip") != null)
        {
            if(entry.getString("ip").equals(ip))
            {
            entry.updateString("ip", ip);
            logger.info("Host has a new ip adress: " + ip);
            }
        }

        int crawlcount = entry.getInt("crawlcount");
        crawlcount++;
        entry.updateInt("crawlcount", crawlcount);
        logger.info("Updated number of crawl to: " + crawlcount);
        entry.updateRow();   
    }
    entry.close();
    statement.close();
    connection.close();
    return id;
    } finally {
       try {
         if (connection != null)
         connection.close();
        } catch (Exception e) {
        logger.log(Level.WARNING, "Something went wrong saving host " + host, e);
    }
    }
    }// End Save Host


    /* Guessing Basic Host Priority for getLinksPriority */

    public static int getHostPriority(String newhost) throws Exception{
        int priority = 20;
        Logger logger = Logger.getLogger(Main.class.getName());
        Connection connection = null;

        try {
        int id = 0;
       
        connection = DatabaseTools.getDBConnection();

        String sql = "Select * FROM hosts WHERE host = ? AND status = 0";

        PreparedStatement statement = connection.prepareStatement(sql, ResultSet.TYPE_SCROLL_SENSITIVE, ResultSet.CONCUR_UPDATABLE);
        statement.setString(1, newhost);

        ResultSet entry = statement.executeQuery();

        entry.last();
        int rows = entry.getRow();
        entry.beforeFirst();

        if (rows == 0)
        {
            priority -= 10; // New hosts get default rating 10
        } else {
            entry.next();

            int sitecount = entry.getInt("crawlcount");

            if(sitecount >= 0 && sitecount <= 100)
                 priority -= 8; // Site is not often in the index
           
            if (sitecount >= 101 && sitecount <= 500)
                priority -= 6;
           
            if (sitecount >= 501 && sitecount <= 1000)
                priority -= 5;
           
            if (sitecount >= 1001 && sitecount <= 2000)
                priority -= 4;

            if (sitecount >= 2001 && sitecount <= 5000)
                priority -= 2;

            if (sitecount > 5001)
                priority += 2;

            int trusted = entry.getInt("trusted");
            switch(trusted)
            {
                case 7:
                case 8:
                case 9:
                    priority -= 3;
                    break;

                case 6:
                case 5:
                    priority -= 2;
                    break;

                case 4:
                case 3:
                    priority -= 1;
                    break;

                default:
                    priority += 1;
                    break;
            }

            String subdomain = entry.getString("subdomain");

            if(subdomain == null || subdomain.equalsIgnoreCase("www")) // subdomains rating isn't so high ;)
            {
                priority -= 2;
            } else {
                priority += 1;
            }

//            String countrycode = entry.getString("countrycode");

//            Already used at getLinks() @ analyzeWebsiteCore
//
//            /* Getting Higher Priority if host country code setting is activated (crawler.conf) */
//
//            if (CrawlerConfig.CRAWLER_USECOUNTRYCODE == 1)
//            {
//                 String pattern = "[,]";
//                 Pattern splitter = Pattern.compile(pattern);
//                 String[] result = splitter.split(CrawlerConfig.CRAWLER_COUNTRYCODES);
//
//                 for (int i = 0; i < result.length; i++)
//                 {
//                     if (countrycode.equalsIgnoreCase(result[i]))
//                        priority -= 7;
//                 }
//             }

            String protocol = entry.getString("protocol");

            if (!protocol.equals("http"))
            {
                priority += 1; // A bit worse priority for non http hosts like https, but just a bit ;)
            }

            int port = entry.getInt("port");

            if (port != 80)
                priority += 2; // A bit worse priority for non port 80 websites

            if (port > 1024)
                priority += 2; // A bit more worse for non < 1024 port websites (not needed root to start server.. maybe (!) hacked sites, so get it worse :))

            int sitesathost = entry.getInt("websites");

            if (sitesathost > 3)
                priority += 2; // A bit worse priority because of too many subdomains ;)

        } // End Else

        entry.close();
        statement.close();
        connection.close();
        } catch (Exception e) {
            logger.log(Level.WARNING, "Error at getting host rating for getLinks() " + newhost, e);
        } finally {
        if (connection != null)
            connection.close();
        }
        return priority;
    } // End getHostRating

} // End DbHost
TOP

Related Classes of de.excrawler.server.DbHost

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.