/*
* Copyright (C) 2010 Ex-Crawler Project. All Rights Reserved.
* http://ex-crawler.sourceforge.net
*
* This is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This software is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this software; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
* USA.
*/
package de.excrawler.server;
import java.io.*;
import java.net.*;
import java.util.zip.CheckedInputStream;
import java.util.zip.CRC32;
import java.util.Map;
import java.util.StringTokenizer;
import java.util.Arrays;
import java.util.regex.Pattern;
import java.util.HashSet;
import java.util.HashMap;
import java.util.logging.*;
import java.util.regex.Matcher;
import org.jsoup.nodes.Document;
import org.jsoup.Jsoup;
import org.jsoup.select.Elements;
/**
* Analyzes the core of a website: parses the saved file and extracts links,
* images, e-mail addresses and meta information.
* Todo: AnalyzeWebsiteCore needs fixing, splitting into separate classes and clean up
* - complete rewrite into static methods
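*
* <p>Hypothetical usage sketch (the surrounding crawler normally drives this class;
* the file path and URLs below are made up):
* <pre>
* AnalyzeWebsiteCore core = new AnalyzeWebsiteCore("/tmp/site.html");
* core.setProtocol("http");
* core.setHost("example.org");
* core.setAddress("http://example.org/index.html");
* core.initializeFile();
* if (core.analyzeFile()) {
*     core.getLinks();
*     core.getImages();
* }
* </pre>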
* @author Yves Hoppe
*/
public class AnalyzeWebsiteCore extends Thread {
String firstLetter = null;
String filename = null;
String LinkList = "LINKS";
String ImageList = "IMAGE";
String host = null;
String protocol = null;
String address = null;
long fileSize;
long checksum;
int charcount;
String contentwohtml = null; // For stripped content
String nohtml = null;
String html = null;
// More info about document
String doctype = null;
String basehref = null;
String xmlns = null;
// Parse Metatags
String keywords = null;
String author = null;
String description = null;
String alternate = null;
String robots = null;
String generator = null;
// String contenttype;
String browsercompatible = null;
// Document JSoup
Document doc = null;
Logger logger = Logger.getLogger(Main.class.getName());
StringBuffer contents = new StringBuffer();
/**
* Initializes the analyzer; the filename is also used as the thread name (super(fn)).
* @param fn path to the saved website file
*/
public AnalyzeWebsiteCore(String fn)
{
super(fn);
this.filename = fn;
}
private Map<String, Integer> map; // used only by analyzeWordCount(); should be moved there
/**
* Todo: Combine all initializing functions into one method or into the constructor
* Sets the address
* @param adr
*/
void setAddress(String adr)
{
address = adr;
}
/**
* Sets the protocol
* @param pro
*/
void setProtocol(String pro)
{
protocol = pro;
}
/**
* Sets the host
* @param ho
*/
void setHost(String ho)
{
host = ho;
}
/**
* Sets the first letter
* @param fl
*/
void setFirstLetter(String fl)
{
firstLetter = fl;
}
/**
* Basic method to get some file information about the saved website,
* like the crc32 checksum and the file size
* Todo: Needs updating and should be merged together with analyzeFile
* to save resources
* @throws Exception
*/
void initializeFile() throws Exception
{
// FILE SIZE
CheckedInputStream cis = null;
File site = new File(filename);
fileSize = site.length();
// File Checksum
try {
cis = new CheckedInputStream( new FileInputStream(filename), new CRC32());
byte[] buf = new byte[128];
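// Reading the file through the CheckedInputStream updates the CRC32 as a side effect;
// the buffer contents themselves are discarded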
while(cis.read(buf) >= 0) {
}
checksum = cis.getChecksum().getValue();
cis.close();
} catch (Exception e) {
logger.log(Level.WARNING, "Something went wrong at initialize file (crc32)", e);
}
}
/**
* Basic initializing method that reads the website into a StringBuffer
* and hands it to Jsoup
* @return boolean true if opening succeeded, false if something fails
* @throws Exception
*/
boolean analyzeFile() throws Exception
{
logger.finer("Analyzing file: " + filename);
File file = new File(filename);
if (!FileSecurity.basicCheckWebsite(file))
{
logger.warning("Basic security check failed at file: " + filename);
return false;
}
BufferedReader reader = null;
try {
reader = new BufferedReader(new FileReader(file));
String text = null;
while ((text = reader.readLine()) != null)
{
contents.append(text); // .append(System.getProperty("line.separator"));
}
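// Hand the collected markup to jsoup; the resulting Document is used later by
// getLinks(), getTitle(), parseMetatags() and getBasehref()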
if (contents.length() > 0)
doc = Jsoup.parse(contents.toString());
reader.close();
} catch (Exception e) {
logger.log(Level.WARNING, "Error analyzing file", e);
return false;
} finally
{
try
{
if (reader != null)
reader.close();
} catch (Exception e) {}
}
return true;
}
/**
* Tries to get all links out of a website, processes them
* and submits them to a db saving method (crawllist)
* USES jsoup
* @throws Exception
*/
void getLinks() throws Exception
{
try {
int status = 0;
if (doc == null)
return;
Elements links = doc.select("a");
if (links == null)
return;
int index = 0;
int returncode = 0;
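// Walk every <a> element; relative links are rebuilt against protocol and host,
// rated by getLinkRating() and handed to DbWebsite.addURLCrawler()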
for (int i = 0; i < links.size(); i++)
{
String link = links.get(i).attr("href").toLowerCase();
String linkalt = links.get(i).attr("alt");
String linktitle = links.get(i).attr("title");
String linktext = links.get(i).text();
String linkinnerhtml = links.get(i).html();
String newLink2 = null;
if (link != null && !link.contains("javascript:"))
{
try {
new URL(link);
} catch (MalformedURLException e) {
newLink2 = protocol;
if (link.contains("http://") || link.contains("https://") || link.contains("ftp://")){
logger.info("Strange Link: " + link);
InitCrawler.wired++;
continue;
}
if (link.startsWith("/")){
newLink2 = newLink2.concat("://").concat(host).concat(link);
} else {
newLink2 = newLink2.concat("://").concat(host).concat("/").concat(link);
}
}
if(newLink2 != null)
{
try {
new URL(newLink2);
} catch (MalformedURLException e) {
continue;
}
link = newLink2;
}
int websiterating = getLinkRating(link, linktext, linkalt, linktitle);
//System.out.println("rating: " + websiterating + " | site: " + link );
if (websiterating <= 30) // Get really (!) bad links out
{
String fLetter = BasicRegexp.getWebFirstLetter(link);
returncode = DbWebsite.addURLCrawler(fLetter, link, linktext, host, "website crawler", websiterating, status);
if (returncode == 0)
{
logger.warning("Encountered a problem at saving new URL: " + link);
} else if (returncode == 1) {
logger.fine("Added URL " + link + " to crawling list");
} else if (returncode == 2) {
logger.finer("URL " + link + " already exists in index");
}
}
} // END if !javascript void
}// END FOR
} catch (Exception e) {
logger.log(Level.INFO, "Encountered a problem at retrieving new links", e);
}
}// End get_Links
/**
* Basic function to guess priority for links on the website called by getLinks()
* Needs a better algorithm and should be moved into its own class
* @param link
* @param linktext
* @param linkalt
* @param linktitle
* @return link rating (lower is better; getLinks() only keeps links rated 30 or below)
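* <p>Hypothetical walk-through (assuming host priority 0, country-code matching
* disabled and no social-media match): "http://example.org/news.html" with link text,
* alt and title set and no query string scores 5 - 2 (no query) - 2 (short path)
* - 1 (alt and title present) = 0.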
*/
public static int getLinkRating(String link, String linktext, String linkalt, String linktitle) {
int rating = 5; // Start at a low rating
try {
link = link.toLowerCase();
URL aURL = new URL(link);
String newhost = aURL.getHost();
String newprotocol = aURL.getProtocol();
int newport = aURL.getPort();
String newquery = aURL.getQuery();
String newfilename = aURL.getFile();
int filenamelength = newfilename.length();
String newref = aURL.getRef();
// Getting host priority
int hostrating = DbHost.getHostPriority(newhost) / 2; // dividing for better accuracy
rating += hostrating;
// Basic checking for bad links
if (link.contains("mailto:"))
rating += 100;
/* Getting Higher Priority if host country code setting is activated (crawler.conf) */
if (CrawlerConfig.CRAWLER_USECOUNTRYCODE == 1)
{
if (BasicRegexp.matchConfigCountryCode(newhost))
rating -= 10;
}
/* Checking Protocol, ports and some other basic informations */
if (!newprotocol.equalsIgnoreCase("http") && !newprotocol.equalsIgnoreCase("https"))
rating += 2; // A bit worse rating for non http/https protocols
if (newport != 80 && newport != -1 && newport != 443)
rating += 2; // A bit worse rating for non port 80 or 443 websites
if (newport > 1024)
rating += 2; // A bit worse for ports above 1024 (no root needed to bind them, possibly a compromised server)
if (newquery == null)
rating -= 2; // A bit better rating for non query links
if (newquery != null)
{
if (!(newquery.contains("page") || newquery.contains("p=") || newquery.contains("site=") || newquery.contains("s=") || newquery.contains("id=")
|| newquery.contains("thread=") || newquery.contains("show=") || newquery.contains("itemid")))
rating += 1; // Small penalty for queries that do not look like pagination or content ids
if (newquery.contains("filter") || newquery.contains("sid=") || newquery.contains("session") || newquery.contains("action=")
|| newquery.contains("tag=") || newquery.contains("keyword") || newquery.contains("userid=") || newquery.contains("user_id=")
|| newquery.contains("search=") || newquery.contains("comment=") || newquery.contains("sort=") || newquery.contains("count=")
|| newquery.contains("feature=") || newquery.contains("label=") || newquery.contains("widget") || newquery.contains("query="))
rating += 5; // Things which could be endless
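// Count the query parameters by splitting on '&'; e.g. "p=2&sort=date" yields two items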
String pattern = "[&]";
Pattern splitter = Pattern.compile(pattern);
String[] result = splitter.split(newquery);
if(result.length == 2)
rating += 2; // Worse if there is more than one query item
if(result.length >= 3)
rating += 4; // Worse if there are more than two query items
}
if (filenamelength == 0)
rating -= 2; // Plain domain root without a path ("new" domains)
if (filenamelength > 0 && filenamelength <= 20)
rating -= 2; // Bit better rating for short urls
if (filenamelength >= 21 && filenamelength <= 40)
rating -= 1;
if (filenamelength >= 60 && filenamelength <= 80)
rating += 1;
if (filenamelength >= 81)
rating += 3; // Bit worse rating for long urls
if (newfilename != null)
{
String pattern = "[.]";
Pattern splitter = Pattern.compile(pattern);
String[] result = splitter.split(newfilename);
if (result.length > 0)
{
String filetype = result[result.length - 1];
if (filetype.equalsIgnoreCase("pdf"))
rating += 4; // A bit worse rating for pdf files
if (filetype.equals("zip") || filetype.equals("rar") || filetype.equals("dmg") || filetype.equals("exe") || filetype.equals("jpg")
|| filetype.equals("png") || filetype.equals("gif") || filetype.equals("mov") || filetype.equals("avi") || filetype.equals("mp3")
|| filetype.equals("wav") || filetype.equals("ogg") || filetype.equals("bmp") || filetype.endsWith("tiff") || filetype.equals("psd")
|| filetype.equals("m4a"))
rating += 100; // Simply sort out binary and media files
}
}
if (link.contains("/2009/") || link.contains("/2008/") || link.contains("/2007/") || link.contains("/2006/") || link.contains("/2005/"))
rating += 3; // Hmm "old" archives rate worse
if (link.contains("comments/") || link.contains("tag/") || link.contains("user/") || link.contains("label/"))
rating += 4;
if (BasicWebLinkFilter.isSocialmediaFilter(link))
rating += 6; // Hmm social media or short urls are nice, but not for a crawler - Todo: needs better listing @ getLinkRating()
if (link.contains("facebook.com") || link.contains("myspace.com") || link.contains("youtube.com"))
rating += 4; // Hmm they are too big, hard to get useful content, so a bit worse
/* Link Text Rating */
if(linktext != null)
{
if(linktext.length() <= 5)
rating += 1; // Short links without much text carry little information
} else {
rating += 2;
}
/* Link alt and title Rating */
if (linkalt != null && linktitle != null)
{
rating -= 1;
} else {
rating += 1; // Bit worse rating for empty alt and title tag
}
/* Ref checking */
if (newref != null)
{
rating += 5; // Fragment links (#anchor) could be endless, should be filtered before inserting
}
} catch (MalformedURLException e) {
e.printStackTrace();
return 40;
} catch (Exception e) {
e.printStackTrace();
return 40;
}
return rating;
}
/**
* Tries to extract all images out of a site and sends them
* to the database for storing
* Todo: Needs updating to jsoup including priority rating, image text, image alt
* @throws Exception
*/
void getImages() throws Exception {
try {
int status = 0;
String lcContent = contents.toString().toLowerCase();
String newImage = null;
int index = 0;
int returncode = 0;
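// Manual scan: jump to each "img src" in the lowercased markup, skip to the '='
// and read the value that follows; jsoup is not used here yet (see Todo above)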
while((index = lcContent.indexOf("<", index)) != -1)
{
if ((index = lcContent.indexOf("img src", index)) == -1)
break;
if((index = lcContent.indexOf("=", index)) == -1)
break;
index++;
newImage = null; // reset for each candidate image link
String remaining = contents.toString().substring(index);
StringTokenizer st = new StringTokenizer(remaining, "\t\n\r\">#");
String strLink = st.nextToken();
// Link validation
if (!strLink.contains("javascript:void(0);"))
{
try {
new URL(strLink);
} catch (MalformedURLException e) {
newImage = protocol;
if (strLink.startsWith("/")){
newImage = newImage.concat("://").concat(host).concat(strLink);
} else {
newImage = newImage.concat("://").concat(host).concat("/").concat(strLink);
}
}
if(newImage == null)
{
if (strLink.contains("http://") || strLink.contains("https://") || strLink.contains("ftp://"))
{
ImageList = ImageList.concat("|" + strLink);
}
} else {
try {
new URL(newImage);
} catch (MalformedURLException e) {
continue;
}
ImageList = ImageList.concat("|" + newImage);
}
} // ENDIF
} // ENDWHILE
String pattern = "[|]";
Pattern splitter = Pattern.compile(pattern);
String[] result = splitter.split(ImageList);
HashSet<String> set = new HashSet<String>(Arrays.asList(result));
set.remove("IMAGE");
set.remove(address);
String[] result2 = set.toArray(new String[set.size()]);
for(int i = 0; i < result2.length; i++)
{
returncode = DbWebsite.addImageCrawler(result2[i], "Link on image (System)", status);
if (returncode == 0)
{
logger.warning("Something went wrong saving new Image: " + result2[i]);
} else if (returncode == 1) {
logger.fine("Added Image " + result2[i] + " to crawling list");
} else if (returncode == 2) {
logger.finer("Image " + result2[i] + " already exists in index");
}
}
set.clear();
} catch (Exception e) {
logger.log(Level.INFO, "Error at getImages: ", e);
}// End Get Images
}
/**
* Gets all valid e-mail addresses out of a site.
* Yes, this could be abused to collect addresses for spam, but ex-crawler's goal
* is to get as much knowledge out of a website as possible. So what to do about this? I'm really not sure.
* Todo: Detect names and other text the address is wrapped in
* Todo: Write an algorithm that detects obfuscated addresses, like info [at] web (dot) de
* @param siteId site id for the database
*/
void getEmails(int siteId) throws Exception
{
try {
String mailName = "";
String lcContent = contents.toString().toLowerCase().trim();
String newEmail = null;
int returncode = 0;
Pattern p = Pattern.compile("([A-Za-z0-9]+@[A-Za-z0-9]+\\.[A-Za-z]{2,4}\\b)");
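// Matches simple addresses such as user1@example.com; dots or dashes in the local
// part and obfuscated forms like "info [at] web (dot) de" are not covered yet (see Todo above)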
Matcher m = p.matcher(lcContent);
while(m.find()){
if (!(m.group().contains("jpg") || m.group().contains("pdf") || m.group().contains("png") || m.group().contains("gif")))
{
returncode = DbWebsite.addDBEmail(m.group(), mailName, siteId);
logger.info("New mail address found: " + m.group() + " returncode: " + returncode);
}
}
} catch (Exception e) {
logger.log(Level.WARNING, "Failed to get E-Mails", e);
}
}
/**
* Not yet fully implemented function to build a word list for a site with counters
* Needs updating and belongs in a deeper analyzing class
* Todo: Move analyzeWordCount() into the deep crawling and update it
* For now it just prints the map to the console for testing
*/
void analyzeWordCount() throws Exception
{
map = new HashMap<String, Integer>();
String regex = "[-.,;:!=(){} ]+";
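// Splits on punctuation and spaces, e.g. "Hello, world! Hello." yields {hello=2, world=1}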
for (String w : contents.toString().split(regex)) {
String key = w.toLowerCase();
map.put(key, map.containsKey(key) ? map.get(key) + 1 : 1);
}
System.out.println("Map: " + map);
}
/**
* Basic function to strip all html from a website (does not work very well yet)
* Needs updating and a better algorithm; a regex pattern is not a good choice here
* Todo: Move getOnlyText() and update the algorithm
* @return the trimmed, text-only content of the website
*/
String getOnlyText() throws Exception {
try {
nohtml = contents.toString();
Pattern pattern = Pattern.compile("\\<.*?\\>");
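// "\\<.*?\\>" removes everything from a '<' to the next '>', so "<p>Hi</p>" becomes "Hi";
// it breaks on '>' characters inside attribute values or inline scripts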
Matcher m = pattern.matcher(nohtml);
nohtml = m.replaceAll("");
nohtml = nohtml.trim();
nohtml = nohtml.replaceAll("\\b\\s{2,}\\b", " ");
nohtml = nohtml.replaceAll("\\s\\s", "");
nohtml = nohtml.replaceAll("<!--.*?-->", ""); // strip leftover html comments
//nohtml = nohtml.replaceAll("\\s+", " ");
return nohtml;
} catch (Exception e ) {
e.printStackTrace();
return nohtml;
}
}
/**
* Basic function to strip all text and keep just the html markup (does not work very well yet)
* Needs updating and a better algorithm; a regex pattern is not a good choice here
* Todo: Move onlyHTML() and update algorithm
*/
void onlyHTML() throws Exception {
try {
html = contents.toString().replaceAll(">[^<]+<", "><"); // drop the text between tags, keep the markup
html = html.replaceAll("<!--.*?-->", "");
} catch (Exception e ) {
e.printStackTrace();
}
}
/**
* Gets the (html) title out of a website
* @return the html title, or an empty string if it cannot be read
* @throws Exception
*/
String getTitle() throws Exception {
try {
String title = doc.title();
logger.info("Title: " + title);
return title;
} catch (Exception e) {
logger.log(Level.INFO, "Error at getTitle", e);
return "";
}
}
/**
* Parses the meta tags and stores them in the corresponding class fields
* Needs some optimization
* @throws Exception
*/
void parseMetatags() throws Exception {
try {
Elements meta;
String comments = "";
meta = doc.getElementsByTag("META");
Object metatags[] = meta.toArray();
for (int i = 0; i < metatags.length; i++)
{
String metatag = metatags[i].toString().toLowerCase().trim();
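// Each branch below strips the surrounding markup, e.g.
// <meta name="keywords" content="java, crawler" /> is reduced to: java, crawler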
if(metatag.contains("keywords"))
{
keywords = metatag;
keywords = keywords.replaceAll("\\<.?meta.?name=\"keywords\".*content.?=.?\"", "");
keywords = keywords.replaceAll("\".?/>", "");
logger.info("Keywords: " + keywords);
}
if(metatag.contains("description"))
{
description = metatag;
description = description.replaceAll("\\<.?meta.?name=\"description\".*content.?=.?\"", "");
description = description.replaceAll("\".?/>", "");
logger.info("Description: " + description);
}
if(metatag.contains("author"))
{
author = metatag;
author = author.replaceAll("\\<.?meta.?name=\"author\".*content.?=.?\"", "");
author = author.replaceAll("\".?/>", "");
logger.info("Author: " + author);
}
if(metatag.contains("robots"))
{
robots = metatag;
robots = robots.replaceAll("\\<.?meta.?name=\"robots\".*content.?=.?\"", "");
robots = robots.replaceAll("\".?/>", "");
logger.info("Robots: " + robots);
}
if(metatag.contains("generator"))
{
generator = metatag;
generator = generator.replaceAll("\\<.?meta.?name=\"generator\".*content.?=.?\"", "");
generator = generator.replaceAll("\".?/>", "");
logger.info("Generator: " + generator);
}
} //End For
} catch (Exception e) { // End Try
logger.log(Level.INFO, "Something went wrong @ parseMetaTags", e);
}
} // End Metatags
/**
* Tries to get the Basehref from the website html
* @return basehref
*/
String getBasehref() throws Exception {
try
{
basehref = doc.getElementsByTag("BASE").first().attr("href");
logger.info("Basehref: " + basehref);
} catch (Exception e) {
logger.info("Site has no basehref");
}
return basehref;
}
/**
* Tries to get the doctype of the website html
* @return the detected doctype, or "unknown"
* @throws Exception
*/
String getDoctype() throws Exception {
try
{
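// The doctype declaration sits at the very top of the document, so the first
// 130 characters are enough to identify it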
doctype = contents.substring(0, Math.min(130, contents.length())).toLowerCase().trim();
if (doctype.contains("xhtml 1.0"))
{
doctype = "XHTML 1.0";
}
else if (doctype.contains("xhtml 1.1"))
{
doctype = "XHTML 1.1";
}
else if (doctype.contains("xhtml 2.0"))
{
doctype = "XHTML 2.0";
}
else if (doctype.contains("html 4.01"))
{
doctype = "HTML 4.01";
}
else if (doctype.contains("html 5"))
{
doctype = "HTML 5";
}
else if (doctype.contains("mathml 2.0"))
{
doctype = "MathML 2.0";
}
else if (doctype.contains("mathml 1.01"))
{
doctype = "MathML 1.01";
}
else if (doctype.contains("svg 1.0"))
{
doctype = "SVG 1.0";
}
else if (doctype.contains("svg 1.1"))
{
doctype = "SVG 1.1";
}
// Old doctypes
else if (doctype.contains("html 2.0"))
{
doctype = "HTML 2.0";
}
else if (doctype.contains("html 3.2"))
{
doctype = "HTML 3.2";
}
else if (doctype.contains("xhtml basic 1.0"))
{
doctype = "XHTML Basic 1.0";
}
else {
doctype = "unknown";
}
} catch (Exception e) {
logger.log(Level.INFO,"Something went wrong with doctype from site " + address, e);
doctype ="unknown";
}
logger.info("Doctype: " + doctype);
return doctype;
} // End Getdoctype
/**
* initializeFile() must have been called first, otherwise 0
* @return filesize of the site
*/
long getFilesize()
{
return fileSize;
}
/**
* initializeFile() must have been called first, otherwise 0
* @return checksum of the site
*/
long getChecksum()
{
return checksum;
}
/**
* parseMetatags() must have been called first, otherwise null
* @return meta-keywords of the site
*/
String getKeywords()
{
return keywords;
}
/**
* parseMetatags() must have been called first, otherwise null
* @return meta-description of the site
*/
String getDescription()
{
return description;
}
/**
* parseMetatags() must have been called first, otherwise null
* @return meta-generator of the site
*/
String getGenerator()
{
generator = BasicRegexp.shortenString(255, generator);
return generator;
}
/**
* parseMetatags() must have been called first, otherwise null
* @return robots tag of the site
*/
String getRobots()
{
robots = BasicRegexp.shortenString(255, robots);
return robots;
}
/**
* parseMetatags() must have been called first, otherwise null
* @return meta author of the site
*/
String getAuthor()
{
author = BasicRegexp.shortenString(255, author);
return author;
}
} // End Class