/*
* Copyright (C) 2010 Ex-Crawler Project. All Rights Reserved.
* http://ex-crawler.sourceforge.net
*
* This is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This software is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this software; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
* USA.
*/
package de.excrawler.server;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
//import org.jsoup.parser.*;
/**
* Basic, small and fast regexp functions
* and helpers
* @author Yves Hoppe (2010-03-13)
*/
public class BasicRegexp extends Thread {
    // NOTE(review): every member is static, so extending Thread looks
    // unnecessary; the superclass is kept so existing callers that rely on
    // the type hierarchy keep compiling.

    /** Splits a host name into its dot-separated labels. */
    private static final Pattern DOT = Pattern.compile("[.]");

    /** Splits the comma-separated country code list from the configuration. */
    private static final Pattern COMMA = Pattern.compile("[,]");

    /**
     * Gets the first letter of the host (without subdomain)
     * for the database.
     * The letter is returned as-is; its case is not changed.
     *
     * @param host host name, e.g. {@code www.example.com}
     * @return first character of the host once the subdomain reported by
     *         {@link #getHostSubdomain(String)} has been stripped
     * @throws IllegalArgumentException if host is empty
     */
    public static String getHostFirstLetter(String host) {
        return firstLetterWithoutSubdomain(host);
    }

    /**
     * Gets the subdomain of the host.
     * The host is split on dots; when that yields three or more labels the
     * first label is reported as the subdomain. Only the label count is
     * inspected, so multi-part suffixes are not recognised:
     * {@code example.co.uk} reports {@code example}.
     *
     * @param host host name
     * @return the first label, or null when the host has fewer than three labels
     */
    public static String getHostSubdomain(String host) {
        String[] labels = DOT.split(host);
        return labels.length >= 3 ? labels[0] : null;
    }

    /**
     * Gets the country code from the host, i.e. the last dot-separated label.
     *
     * @param host (String)
     * @return countrycode (String); an empty string when the host contains
     *         no label at all (for example a host consisting only of dots)
     */
    public static String getHostCountryCode(String host) {
        String[] labels = DOT.split(host);
        if (labels.length == 0) {
            // split() drops trailing empty strings, so a host made only of
            // dots produces an empty array; indexing it would throw.
            return "";
        }
        return labels[labels.length - 1];
    }

    /**
     * Gets the country codes set in the config file.
     * The entries are the raw comma-separated pieces of
     * {@code CrawlerConfig.CRAWLER_COUNTRYCODES}; they are not trimmed.
     *
     * @return configured country codes
     */
    public static String[] getConfigCountryCodes() {
        return COMMA.split(CrawlerConfig.CRAWLER_COUNTRYCODES);
    }

    /**
     * Detects whether the given host matches the country code settings.
     * The comparison ignores case.
     *
     * @param host host name
     * @return true if the last label of the host equals one of the configured codes
     */
    public static boolean matchConfigCountryCode(String host) {
        String countrycode = getHostCountryCode(host);
        for (String code : getConfigCountryCodes()) {
            if (countrycode.equalsIgnoreCase(code)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Shortens a string to at most the given number of characters.
     *
     * @param chars maximum length of the result
     * @param str string to shorten, may be null
     * @return str itself when it already fits, its first {@code chars}
     *         characters otherwise, or null when str is null
     */
    public static String shortenString(int chars, String str) {
        if (str == null) {
            return null;
        }
        if (str.length() <= chars) {
            return str; // already short enough
        }
        return str.substring(0, chars);
    }

    /**
     * Splits a string around matches of the given regular expression.
     *
     * @param letters string to split
     * @param pattern regular expression used as the delimiter
     * @return the pieces of {@code letters} between delimiter matches
     */
    public static String[] getLetters(String letters, String pattern) {
        return Pattern.compile(pattern).split(letters);
    }

    /**
     * Gets the lower-cased first letter of the host (without subdomain)
     * of a web address.
     *
     * @param address absolute URL, e.g. {@code http://www.example.com/page}
     * @return lower-cased first character of the URL's host once the
     *         subdomain has been stripped
     * @throws IllegalArgumentException if address is not a valid URL or
     *         its host is empty
     */
    public static String getWebFirstLetter(String address) {
        String host;
        try {
            host = new URL(address).getHost();
        } catch (MalformedURLException e) {
            // Previously the stack trace was printed and the method then
            // failed with an unrelated NullPointerException on the null host.
            throw new IllegalArgumentException("Not a valid URL: " + address, e);
        }
        // Locale.ROOT keeps the result independent of the default locale
        // (a Turkish locale would lower-case "I" to a dotless i).
        return firstLetterWithoutSubdomain(host).toLowerCase(Locale.ROOT);
    }

    /**
     * Strips the leading subdomain label, if any, and returns the first
     * character of what remains.
     */
    private static String firstLetterWithoutSubdomain(String host) {
        String sub = getHostSubdomain(host);
        String remainder = host;
        if (sub != null) {
            // The subdomain is the text before the first dot, so skip it and
            // the dot literally. The former replaceAll(sub + ".", "") treated
            // the subdomain as a regex and removed every match in the host,
            // e.g. "m.mail.com" became "il.com".
            remainder = host.substring(sub.length() + 1);
        }
        if (remainder.length() == 0) {
            throw new IllegalArgumentException("Host is empty");
        }
        return String.valueOf(remainder.charAt(0));
    }
}