Package de.excrawler.server

Source Code of de.excrawler.server.BasicRegexp

/*
*  Copyright (C) 2010 Ex-Crawler Project.  All Rights Reserved.
*  http://ex-crawler.sourceforge.net
*
*  This is free software; you can redistribute it and/or modify
*  it under the terms of the GNU General Public License as published by
*  the Free Software Foundation; either version 2 of the License, or
*  (at your option) any later version.
*
*  This software is distributed in the hope that it will be useful,
*  but WITHOUT ANY WARRANTY; without even the implied warranty of
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
*  GNU General Public License for more details.
*
*  You should have received a copy of the GNU General Public License
*  along with this software; if not, write to the Free Software
*  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307,
*  USA.
*/

package de.excrawler.server;

import java.net.MalformedURLException;
import java.net.URL;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

//import org.jsoup.parser.*;

/**
* Basic, small and fast regexp functions
* and helpers
* @author Yves Hoppe (2010-03-13)
*/

public class BasicRegexp extends Thread {

    /**
     * Gets the first letter of the Host (without subdomain)
     * for the database
     * @param host
     * @return First char of the host
     */

    public static String getHostFirstLetter(String host){
        String firstLetter = null;

        String sub = null;
        sub = getHostSubdomain(host);

        if (sub != null)
        {
            sub = sub.concat(".");
            host = host.replaceAll(sub, "");
        }
       
        firstLetter = String.valueOf(host.charAt(0));
       
        return firstLetter;
    }

    /**
     * Gets the subdomain of the host
     * @param host
     * @return subdomain (String)
     */

    public static String getHostSubdomain(String host) {
        String subdomain = null;

        String pattern = "[.]";
        Pattern splitter = Pattern.compile(pattern);
        String[] result = splitter.split(host);

        if (result.length == 2)
            subdomain = null;
        if (result.length == 3)
            subdomain = result[0];
        if (result.length >= 4)
        {
            subdomain = result[0];
            //System.out.println("Long sub: " + subdomain);
        }

        return subdomain;
    }

    /**
     * Gets the Countrycode from the host
     * @param host (String)
     * @return countrycode (String)
     */

    public static String getHostCountryCode(String host) {
        String countrycode = null;

        String pattern = "[.]";
        Pattern splitter = Pattern.compile(pattern);
        String[] result = splitter.split(host);

        countrycode = result[result.length - 1];

        return countrycode;
    }

    /**
     * Gets the countrycodes set in the config file
     * @return crawler.config Countrycodes as Object
     */


    public static String[] getConfigCountryCodes() {
        String pattern = "[,]";
        Pattern splitter = Pattern.compile(pattern);
        String[] result = splitter.split(CrawlerConfig.CRAWLER_COUNTRYCODES);
       
        return result;
    }

    /**
     * Detects of the given host matches the
     * Country Code Settings
     * @param host
     * @return boolean
     */

    public static boolean matchConfigCountryCode(String host){
        String countrycode = getHostCountryCode(host);
        String[] codes = getConfigCountryCodes();

        for (int i=0; i < codes.length; i++)
        {
            if (countrycode.equalsIgnoreCase(codes[i]))
                return true;
        }
      
    return false;
    }

    /**
     * Shortens strings to the given number
     * @param chars
     * @param str
     * @return shortened string
     */

    public static String shortenString(int chars, String str){
        String shortened = null;

        if(str != null)
        {
        if (str.length() <= chars)
            shortened = str; // Nothing to do all matchs
        else
            shortened = str.substring(0, chars);
        }
        return shortened;
    }

    public static String[] getLetters(String letters, String pattern)
    {
        Pattern splitter = Pattern.compile(pattern);
        String[] result = splitter.split(letters);

        return result;
    }

    public static String getWebFirstLetter(String address){
        String firstLetter = null;
        String host = null;

        try {
        URL aURL = new URL(address);
        host = aURL.getHost();
        } catch (Exception e) {
            e.printStackTrace();
        }

        String sub = null;
        sub = getHostSubdomain(host);

        if (sub != null)
        {
            sub = sub.concat(".");
            host = host.replaceAll(sub, "");
        }

        firstLetter = String.valueOf(host.charAt(0));

        return firstLetter.toLowerCase();
    }

}
TOP

Related Classes of de.excrawler.server.BasicRegexp

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.