Package com.crawl.control

Source Code of com.crawl.control.Crawler

package com.crawl.control;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Collection;

import org.apache.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.crawl.model.CraigslistAreasEnum;
import com.crawl.model.CraigslistCategoryEnum;
import com.crawl.model.CrawlResultPackage;

/**
 * SINGLETON: The main crawler class that extracts the information from the Craigslist web pages.
 * @author Team Kappa
 *
 */
public class Crawler {
    static Logger logger = Logger.getLogger(Crawler.class);
   
    private static Crawler crawler;
   
    //private String url;
    private String matchPattern;
   
    /**
     * Private constructor (SINGLETON)
     */
    private Crawler(){}
   
    /**
     * SINGLETON getInstance method.
     * @return the single Crawler instance
     */
    public static Crawler getInstance(){
        if (Crawler.crawler==null){
            Crawler.crawler=new Crawler();
        }
       
        return Crawler.crawler;
    }
   
    /**
     * The main extraction method: calls the Craigslist page and extracts all needed information from there.
     *
     * @param inputSearchUrl the complete search URL built by createUrl(...), also used as the cache key
     * @param inputIntOffers how many offers to fetch; 100 offers = 1 Craigslist page
     * @return the crawl results, either freshly crawled or taken from the cache
     */
    public synchronized Collection<CrawlResultPackage> crawlWebPages(
            String inputSearchUrl,
            int inputIntOffers){
       
        Collection<CrawlResultPackage> aCurrentPageResults=null;
        int myIntPage=0;
       
        // Have we already searched for this URL?
        aCurrentPageResults=CraigslistCache.getInstance().getResultFromCache(inputSearchUrl);
       
        // If not, then do the search
        if (aCurrentPageResults==null){
            Collection<CrawlResultPackage> aReturnColl=new ArrayList<CrawlResultPackage>();
           
            do {
                logger.debug("Page="+myIntPage);
           
                aCurrentPageResults=this.crawlWebPage(inputSearchUrl, myIntPage);
                aReturnColl.addAll(aCurrentPageResults);
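                // Craigslist's "s=" URL parameter (appended in crawlWebPage) is a result offset,
                // so advancing by 100 moves to the next page of results.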
           
                myIntPage=myIntPage+100;
            } while (aCurrentPageResults.size()!=0 && myIntPage < inputIntOffers);
           
            // And add it to the cache object
            CraigslistCache.getInstance().addResultToCache(inputSearchUrl, aReturnColl);           
           
            return aReturnColl;
        } else {
            return aCurrentPageResults;
        }
    }
   
    /**
     * Builds the Craigslist search URL. This is an important step because the URL is needed before
     * the search and also serves as the key for the cache object.
     * (Specific to the San Francisco Bay Area.)
     *
     * @param inCraigslistCategoryEnum the category to search in
     * @param inCraigslistAreasEnum the Bay Area sub-area
     * @param inSearchItem the search term, e.g. "iPhone"
     * @return the assembled search URL
     */
    public String createUrl(CraigslistCategoryEnum inCraigslistCategoryEnum,
            CraigslistAreasEnum inCraigslistAreasEnum,
            String inSearchItem) {
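        // Example of an assembled URL (assumed illustration; the exact host and category depend on
        // the enum codes passed in -- compare the example URLs noted in crawlWebPage below):
        // http://sfbay.craigslist.org/search/sya?query=iPhone&maxAsk=100000&sort=pricedsc&srchType=A&s=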
        String aTempUrl=(
                "http://"+CraigslistAreasEnum.URL_CONST_AREA_SF_BAY_AREA.getCode()+
                ".craigslist.org/search/"+inCraigslistCategoryEnum.getCode()+
                inCraigslistAreasEnum.getCode()+
                "?query="+inSearchItem+
                "&maxAsk=100000&sort=pricedsc&srchType=A&s=").trim();       
       
        return aTempUrl;
    }
   
    /**
     * Builds the Craigslist search URL. This is an important step because the URL is needed before
     * the search and also serves as the key for the cache object.
     * (For any worldwide location/area.)
     *
     * @param inCraigslistCategoryCode the Craigslist category code
     * @param inCraigslistAreasURL the base URL of the Craigslist area, e.g. "http://sfbay.craigslist.org"
     * @param inSearchItem the search term
     * @return the assembled search URL
     */
    public String createUrl(String inCraigslistCategoryCode,
            String inCraigslistAreasURL,
            String inSearchItem) {
        try {
            inSearchItem = URLEncoder.encode(inSearchItem,"UTF-8");
        }
        catch(UnsupportedEncodingException e) {
            // UTF-8 is always supported by the JVM, so this should never occur.
            logger.error("UTF-8 encoding is not supported", e);
        }
        String aTempUrl=(
                inCraigslistAreasURL+
                "/search/"+inCraigslistCategoryCode+
                "?query="+inSearchItem+
                "&sort=priceasc&srchType=T&s=").trim();       
       
        return aTempUrl;
    }
   
    /**
     * The crawl function for one page of Craigslist results.
     *
     * @param inputSearchUrl the search URL without the page offset
     * @param page the result offset to append (0, 100, 200, ...)
     * @return the results extracted from this page
     */
    private synchronized Collection<CrawlResultPackage> crawlWebPage(String inputSearchUrl, int page) {
        Collection<CrawlResultPackage> aReturnColl=new ArrayList<CrawlResultPackage>();

        try {
            // OLD http://sfbay.craigslist.org/search/sya?query=&srchType=T&minAsk=1&maxAsk=100000&sort=pricedsc
            // OLD http://sfbay.craigslist.org/search/sya?sort=pricedsc&hasPic=1&srchType=A
           
            // http://sfbay.craigslist.org/search/sya?query=&srchType=T&minAsk=1&maxAsk=100000&sort=pricedsc&s=0
            // http://sfbay.craigslist.org/search/sya?
            //  maxAsk=1000000
            //  &sort=pricedsc
            //  &srchType=A
            logger.info("URL = "+inputSearchUrl+" +page="+page);
            //url = new URL(inputSearchUrl+page);
            inputSearchUrl = inputSearchUrl + page;
           
            // Get items list
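            // Assumed page structure: each result is a <p class="row"> element, with the price in
            // <span class="price">, the title link in <span class="pl"> <a>, and the location in <small>.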
            Document document = Jsoup.connect(inputSearchUrl).get();
            Elements rows = document.select("p.row");
            for(Element row : rows) {
                CrawlResultPackage myTempCrawlResultPackage= new CrawlResultPackage();
                myTempCrawlResultPackage.setLine(row.html());
                // get price
                myTempCrawlResultPackage.setPriceOfItem(getPriceFromString(row.select("span.price").text()));
                // Get item and url
                myTempCrawlResultPackage.setItem(row.select("span.pl a").text());
                myTempCrawlResultPackage.setUrl(row.select("span.pl a").attr("abs:href"));
                // Get location
                myTempCrawlResultPackage.setLocations(getLocationsFromString(row.select("small").text()));
                aReturnColl.add(myTempCrawlResultPackage);
            }
        } catch (MalformedURLException mue) {
            mue.printStackTrace();
        } catch (IOException ioe) {
            ioe.printStackTrace();
        }
       
        logger.debug("Size="+aReturnColl.size());
        return aReturnColl;
    }
   
    /**
     * Extracts the price as an int value from Craigslist strings like
     * <a href="http://sfbay.craigslist.org/sby/sys/3256407578.html">$1895 - Apple
     * MacBook Pro MC725LL/A 17-Inch</a>.
     *
     * @param input
     *            the price text of a listing; the method collects the first
     *            contiguous run of digits and converts it into an int value
     * @return the price as an int, 0 if no digits were found, 1 on error
     */
    private int getPriceFromString(String input) {
        try {
            String stringNumber = "";

            // Collect the first contiguous run of digits (the price figure)
            boolean isNumberScanned = false;
            for (int j = 0; j < input.length(); j++) {
                if (Character.isDigit(input.charAt(j))) {
                    char aChar = input.charAt(j);
                    stringNumber = stringNumber + "" + aChar;
                    isNumberScanned = true;
                }
                else if (isNumberScanned) {
                    break;
                }
            }

            logger.debug("stringNumber=|"+stringNumber+"|");
           
            if (stringNumber==null || stringNumber.trim().length()==0){
                return 0;
            } else {
                // Transform the String to an int value and return it
                try{
                    return Integer.parseInt(stringNumber);
                } catch (NumberFormatException e){
                    logger.fatal("The price string is too large for a 32-bit int. This should not happen, because nothing on Craigslist is worth 4 billion dollars!");
                    return 1;
                }
            }
        } catch (Exception e) {
            logger.fatal(e);
            e.printStackTrace();
            return 1;
        }
    }
   
    /**
     * Extracts the locations from the listing text.
     *
     * @param input the raw location text of a listing, e.g. " (dublin / pleasanton)"
     * @return a collection of extracted location names (empty if none were found)
     */
    private Collection<String> getLocationsFromString(String input) {
        try {
            logger.debug("getLocationsFromString start");
            logger.debug("a) input=|"+input+"|");
           
            Collection<String> aCollLoc=new ArrayList<String>();
            String aWorkString=null;
            if(input == null) return aCollLoc;
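            // The raw text typically ends with the locations in parentheses, e.g. "(dublin / pleasanton)";
            // scan from the right for '(' and take everything up to the closing ')'.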

            completeLoop: // Break out mark

            // Search for the first '(' character.
            // Beginning from the right side
            for (int i = input.length()-1; i >= 0 ; i--) {
                if (input.charAt(i) == '(') {
                    // Get everything after the ( character
                    aWorkString=input.substring(i+1);
                   
                    // Cut off the rest including and after the ')' sign
                    for (int j=0;j<aWorkString.length();j++){
                        if (aWorkString.charAt(j) == ')') {
                            aWorkString=aWorkString.substring(0, j);                           
                        }
                    }
   
                    logger.debug("aWorkString=|"+aWorkString+"|");
                   
                    // Are there multiple locations separated by '/'?
                    if (aWorkString.contains("/")){
                        Collection<String> aStringCollLocations=new ArrayList<String>();
                        aCollLoc.addAll(this.getLocationsFromCleanStringRecursion(aWorkString, aStringCollLocations, 10));
                        break completeLoop;
                    } else {
                        aCollLoc.add(aWorkString);
                        break completeLoop;
                    }
                }
            }

            logger.debug("END) aCollLoc=|"+aCollLoc.toString()+"|");
            return aCollLoc;
        } catch (Exception e) {
            logger.fatal(e.toString());
            e.printStackTrace();
            return new ArrayList<String>();
        }
    }  
   
    /**
     * Splits up location strings written in the style "dublin / pleasanton / livermore".
     *
     * @param inputCleanString the location string without parentheses
     * @param inputCollection the collection the locations are added to
     * @param inMaxRecursion the maximum recursion depth (guards against runaway input)
     * @return the collection containing all extracted locations
     */
    private Collection<String> getLocationsFromCleanStringRecursion(String inputCleanString, Collection<String> inputCollection, int inMaxRecursion) {
        try {  
            logger.debug("getLocationsFromCleanStringRecursion inMaxRecursion="+inMaxRecursion);
            // input string looks like: dublin / pleasanton / livermore
            logger.debug("1) inputCleanString=|"+inputCleanString+"|");
                       
            if (inputCleanString.contains("/")){
                for (int i=0;i<inputCleanString.length();i++){
                    if (inputCleanString.charAt(i) == '/'){
                        String aTempStringNewAddItem=inputCleanString.substring(0, i).trim();
                        String aTempStringNewRecurItem=inputCleanString.substring(i+1).trim();
                       
                        logger.debug("2) aTempStringNewAddItem=|"+aTempStringNewAddItem+"|\n   aTempStringNewRecurItem=|"+aTempStringNewRecurItem+"|");
                       
                        // Break out condition
                        if (inMaxRecursion<=0){
                            logger.debug("3) Max recursion depth reached, inputCleanString=|"+inputCleanString+"|");
                            inputCollection.add(inputCleanString);
                            logger.debug("30) inputCollection=|"+inputCollection.toString()+"|");
                            return inputCollection;
                        } else {
                            logger.debug("35) ELSE branch, aTempStringNewAddItem=|"+aTempStringNewAddItem+"|");
                            inputCollection.add(aTempStringNewAddItem);
                            logger.debug("38) inputCollection=|"+inputCollection.toString()+"|");
                            this.getLocationsFromCleanStringRecursion(aTempStringNewRecurItem, inputCollection, (inMaxRecursion-1));
                            return inputCollection;
                        }
                    }
                }
            } else {
                inputCollection.add(inputCleanString.trim());
            }
       
            logger.debug("40) inputCollection=|"+inputCollection.toString()+"|");
            return inputCollection;
        } catch (Exception e) {
            logger.fatal(e);
            inputCollection.add(inputCleanString);
            e.printStackTrace();
            return inputCollection;
        }
    }

    /**
     * Getter for the match pattern.
     * @return the match pattern
     */
    public String getMatchPattern() {
        return matchPattern;
    }
}
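
Example Usage of com.crawl.control.Crawler

The following is a minimal usage sketch, not part of the original source: CrawlerUsageExample is a hypothetical class name, and the category code "sya" and area URL "http://sfbay.craigslist.org" are taken from the example URLs in the comments above.

package com.crawl.control;

import java.util.Collection;

import com.crawl.model.CrawlResultPackage;

public class CrawlerUsageExample {
    public static void main(String[] args) {
        // Obtain the singleton crawler
        Crawler crawler = Crawler.getInstance();

        // Build the search URL; it is also used as the key for the cache object
        String searchUrl = crawler.createUrl("sya", "http://sfbay.craigslist.org", "iPhone");

        // Crawl up to 200 offers (two Craigslist result pages of 100 offers each)
        Collection<CrawlResultPackage> results = crawler.crawlWebPages(searchUrl, 200);

        System.out.println("Found " + results.size() + " offers");
    }
}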