/*
* Copyright (C) 2010 Ex-Crawler Project. All Rights Reserved.
* http://ex-crawler.sourceforge.net
*
* This is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This software is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this software; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
* USA.
*/
package de.excrawler.server;
import java.io.*;
import java.net.*;
import java.util.zip.CheckedInputStream;
import java.util.zip.CRC32;
import java.util.Map;
import java.util.StringTokenizer;
import java.util.Arrays;
import java.util.regex.Pattern;
import java.util.HashSet;
import java.util.HashMap;
import java.util.logging.*;
import java.util.regex.Matcher;
import org.jsoup.nodes.Document;
import org.jsoup.Jsoup;
import org.jsoup.select.Elements;
/**
* Analyzes the core of a website: parses the saved file and extracts links,
* images, e-mail addresses and meta information.
* Todo: AnalyzeWebsiteCore needs fixing, splitting into separate classes and clean up
* - complete rewrite into static methods
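*
* <p>Hypothetical usage sketch (the surrounding crawler normally drives this class;
* the file path and URLs below are made up):
* <pre>
* AnalyzeWebsiteCore core = new AnalyzeWebsiteCore("/tmp/site.html");
* core.setProtocol("http");
* core.setHost("example.org");
* core.setAddress("http://example.org/index.html");
* core.initializeFile();
* if (core.analyzeFile()) {
*     core.getLinks();
*     core.getImages();
* }
* </pre>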
* @author Yves Hoppe
*/
public class AnalyzeWebsiteCore extends Thread {
String firstLetter = null;
String filename = null;
String LinkList = "LINKS";
String ImageList = "IMAGE";
String host = null;
String protocol = null;
String address = null;
long fileSize;
long checksum;
int charcount;
String contentwohtml = null; // For stripped content
String nohtml = null;
String html = null;
// More info about document
String doctype = null;
String basehref = null;
String xmlns = null;
// Parse Metatags
String keywords = null;
String author = null;
String description = null;
String alternate = null;
String robots = null;
String generator = null;
// String contenttype;
String browsercompatible = null;
// Document JSoup
Document doc = null;
Logger logger = Logger.getLogger(Main.class.getName());
StringBuffer contents = new StringBuffer();
/**
* Initializes the analyzer; the filename is also used as the thread name (super(fn)).
* @param fn path to the saved website file
*/
public AnalyzeWebsiteCore(String fn)
{
super(fn);
this.filename = fn;
}
private Map<String, Integer> map; // used only by analyzeWordCount(); should be moved there
/**
* Todo: Combine all initializing functions into one method or into the constructor
* Sets the address
* @param adr
*/
void setAddress(String adr)
{
address = adr;
}
/**
* Sets the protocol
* @param pro
*/
void setProtocol(String pro)
{
protocol = pro;
}
/**
* Sets the host
* @param ho
*/
void setHost(String ho)
{
host = ho;
}
/**
* Sets the first letter
* @param fl
*/
void setFirstLetter(String fl)
{
firstLetter = fl;
}
/**
* Basic method to get some file information about the saved website,
* like the crc32 checksum and the file size
* Todo: Needs updating and should be merged together with analyzeFile
* to save resources
* @throws Exception
*/
void initializeFile() throws Exception
{
// FILE SIZE
CheckedInputStream cis = null;
File site = new File(filename);
fileSize = site.length();
// File Checksum
try {
cis = new CheckedInputStream( new FileInputStream(filename), new CRC32());
byte[] buf = new byte[128];
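// Reading the file through the CheckedInputStream updates the CRC32 as a side effect;
// the buffer contents themselves are discarded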
while(cis.read(buf) >= 0) {
}
checksum = cis.getChecksum().getValue();
cis.close();
} catch (Exception e) {
logger.log(Level.WARNING, "Something went wrong at initialize file (crc32)", e);
}
}
/**
* Basic initializing method that reads the website into a StringBuffer
* and hands it to Jsoup
* @return boolean true if opening succeeded, false if something fails
* @throws Exception
*/
boolean analyzeFile() throws Exception
{
logger.finer("Analyzing file: " + filename);
File file = new File(filename);
if (!FileSecurity.basicCheckWebsite(file))
{
logger.warning("Basic security check failed at file: " + filename);
return false;
}
BufferedReader reader = null;
try {
reader = new BufferedReader(new FileReader(file));
String text = null;
while ((text = reader.readLine()) != null)
{
contents.append(text); // .append(System.getProperty("line.separator"));
}
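// Hand the collected markup to jsoup; the resulting Document is used later by
// getLinks(), getTitle(), parseMetatags() and getBasehref()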
if (contents.length() > 0)
doc = Jsoup.parse(contents.toString());
reader.close();
} catch (Exception e) {
logger.log(Level.WARNING, "Error analyzing file", e);
return false;
} finally
{
try
{
if (reader != null)
reader.close();
} catch (Exception e) {}
}
return true;
}
/**
* Tries to get all links out of a website, processes them
* and submits them to a db saving method (crawllist)
* USES jsoup
* @throws Exception
*/
void getLinks() throws Exception
{
try {
int status = 0;
if (doc == null)
return;
Elements links = doc.select("a");
if (links == null)
return;
int index = 0;
int returncode = 0;
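// Walk every <a> element; relative links are rebuilt against protocol and host,
// rated by getLinkRating() and handed to DbWebsite.addURLCrawler()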
for (int i = 0; i < links.size(); i++)
{
String link = links.get(i).attr("href").toLowerCase();
String linkalt = links.get(i).attr("alt");
String linktitle = links.get(i).attr("title");
String linktext = links.get(i).text();
String linkinnerhtml = links.get(i).html();
String newLink2 = null;
if (link != null && !link.contains("javascript:"))
{
try {
new URL(link);
} catch (MalformedURLException e) {
newLink2 = protocol;
if (link.contains("http://") || link.contains("https://") || link.contains("ftp://")){
logger.info("Strange Link: " + link);
InitCrawler.wired++;
continue;
}
if (link.startsWith("/")){
newLink2 = newLink2.concat("://").concat(host).concat(link);
} else {
newLink2 = newLink2.concat("://").concat(host).concat("/").concat(link);
}
}
if(newLink2 != null)
{
try {
new URL(newLink2);
} catch (MalformedURLException e) {
continue;
}
link = newLink2;
}
int websiterating = getLinkRating(link, linktext, linkalt, linktitle);
//System.out.println("rating: " + websiterating + " | site: " + link );
if (websiterating <= 30) // Get really (!) bad links out
{
String fLetter = BasicRegexp.getWebFirstLetter(link);
returncode = DbWebsite.addURLCrawler(fLetter, link, linktext, host, "website crawler", websiterating, status);
if (returncode == 0)
{
logger.warning("Encountered a problem at saving new URL: " + link);
} else if (returncode == 1) {
logger.fine("Added URL " + link + " to crawling list");
} else if (returncode == 2) {
logger.finer("URL " + link + " already exists in index");
}
}
} // END if !javascript void
}// END FOR
} catch (Exception e) {
logger.log(Level.INFO, "Encountered a problem at retrieving new links", e);
}
}// End get_Links
/**
* Basic function to guess priority for links on the website called by getLinks()
* Needs a better algorithm and should be moved into its own class
* @param link
* @param linktext
* @param linkalt
* @param linktitle
* @return link rating (lower is better; getLinks() only keeps links rated 30 or below)
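* <p>Hypothetical walk-through (assuming host priority 0, country-code matching
* disabled and no social-media match): "http://example.org/news.html" with link text,
* alt and title set and no query string scores 5 - 2 (no query) - 2 (short path)
* - 1 (alt and title present) = 0.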
*/
public static int getLinkRating(String link, String linktext, String linkalt, String linktitle) {
int rating = 5; // Start at a low rating
try {
link = link.toLowerCase();
URL aURL = new URL(link);
String newhost = aURL.getHost();
String newprotocol = aURL.getProtocol();
int newport = aURL.getPort();
String newquery = aURL.getQuery();
String newfilename = aURL.getFile();
int filenamelength = newfilename.length();
String newref = aURL.getRef();
// Getting host priority
int hostrating = DbHost.getHostPriority(newhost) / 2; // dividing for better accuracy
rating += hostrating;
// Basic checking for bad links
if (link.contains("mailto:"))
rating += 100;
/* Getting Higher Priority if host country code setting is activated (crawler.conf) */
if (CrawlerConfig.CRAWLER_USECOUNTRYCODE == 1)
{
if (BasicRegexp.matchConfigCountryCode(newhost))
rating -= 10;
}
/* Checking Protocol, ports and some other basic informations */
if (!newprotocol.equalsIgnoreCase("http") && !newprotocol.equalsIgnoreCase("https"))
rating += 2; // A bit worse rating for non http/https protocols
if (newport != 80 && newport != -1 && newport != 443)
rating += 2; // A bit worse rating for non port 80 or 443 websites
if (newport > 1024)
rating += 2; // A bit worse for ports above 1024 (no root needed to bind them, possibly a compromised server)
if (newquery == null)
rating -= 2; // A bit better rating for non query links
if (newquery != null)
{
if (!(newquery.contains("page") || newquery.contains("p=") || newquery.contains("site=") || newquery.contains("s=") || newquery.contains("id=")
|| newquery.contains("thread=") || newquery.contains("show=") || newquery.contains("itemid")))
rating += 1; // Small penalty for queries that do not look like pagination or content ids
if (newquery.contains("filter") || newquery.contains("sid=") || newquery.contains("session") || newquery.contains("action=")
|| newquery.contains("tag=") || newquery.contains("keyword") || newquery.contains("userid=") || newquery.contains("user_id=")
|| newquery.contains("search=") || newquery.contains("comment=") || newquery.contains("sort=") || newquery.contains("count=")
|| newquery.contains("feature=") || newquery.contains("label=") || newquery.contains("widget") || newquery.contains("query="))
rating += 5; // Things which could be endless
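// Count the query parameters by splitting on '&'; e.g. "p=2&sort=date" yields two items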
String pattern = "[&]";
Pattern splitter = Pattern.compile(pattern);
String[] result = splitter.split(newquery);
if(result.length == 2)
rating += 2; // Worse if there is more than one query item
if(result.length >= 3)
rating += 4; // Worse if there are more than two query items
}
if (filenamelength == 0)
rating -= 2; // Plain domain root without a path ("new" domains)
if (filenamelength > 0 && filenamelength <= 20)
rating -= 2; // Bit better rating for short urls
if (filenamelength >= 21 && filenamelength <= 40)
rating -= 1;
if (filenamelength >= 60 && filenamelength <= 80)
rating += 1;
if (filenamelength >= 81)
rating += 3; // Bit worse rating for long urls
if (newfilename != null)
{
String pattern = "[.]";
Pattern splitter = Pattern.compile(pattern);
String[] result = splitter.split(newfilename);
if (result.length > 0)
{
String filetype = result[result.length - 1];
if (filetype.equalsIgnoreCase("pdf"))
rating += 4; // A bit worse rating for pdf files
if (filetype.equals("zip") || filetype.equals("rar") || filetype.equals("dmg") || filetype.equals("exe") || filetype.equals("jpg")
|| filetype.equals("png") || filetype.equals("gif") || filetype.equals("mov") || filetype.equals("avi") || filetype.equals("mp3")
|| filetype.equals("wav") || filetype.equals("ogg") || filetype.equals("bmp") || filetype.endsWith("tiff") || filetype.equals("psd")
|| filetype.equals("m4a"))
rating += 100; // Simply sort out binary and media files
}
}
if (link.contains("/2009/") || link.contains("/2008/") || link.contains("/2007/") || link.contains("/2006/") || link.contains("/2005/"))
rating += 3; // Hmm "old" archives rate worse
if (link.contains("comments/") || link.contains("tag/") || link.contains("user/") || link.contains("label/"))
rating += 4;
if (BasicWebLinkFilter.isSocialmediaFilter(link))
rating += 6; // Hmm social media or short urls are nice, but not for a crawler - Todo: needs better listing @ getLinkRating()
if (link.contains("facebook.com") || link.contains("myspace.com") || link.contains("youtube.com"))
rating += 4; // Hmm they are too big, hard to get useful content, so a bit worse
/* Link Text Rating */
if(linktext != null)
{
if(linktext.length() <= 5)
rating += 1; // Short links without much text carry little information
} else {
rating += 2;
}
/* Link alt and title Rating */
if (linkalt != null && linktitle != null)
{
rating -= 1;
} else {
rating += 1; // Bit worse rating for empty alt and title tag
}
/* Ref checking */
if (newref != null)
{
rating += 5; // Fragment links (#anchor) could be endless, should be filtered before inserting
}
} catch (MalformedURLException e) {
e.printStackTrace();
return 40;
} catch (Exception e) {
e.printStackTrace();
return 40;
}
return rating;
}
/**
* Tries to extract all images out of a site and sends them
* to the database for storing
* Todo: Needs updating to jsoup including priority rating, image text, image alt
* @throws Exception
*/
void getImages() throws Exception {
try {
int status = 0;
String lcContent = contents.toString().toLowerCase();
String newImage = null;
int index = 0;
int returncode = 0;
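// Manual scan: jump to each "img src" in the lowercased markup, skip to the '='
// and read the value that follows; jsoup is not used here yet (see Todo above)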
while((index = lcContent.indexOf("<", index)) != -1)
{
if ((index = lcContent.indexOf("img src", index)) == -1)
break;
if((index = lcContent.indexOf("=", index)) == -1)
break;
index++;
newImage = null; // reset for each candidate image link
String remaining = contents.toString().substring(index);
StringTokenizer st = new StringTokenizer(remaining, "\t\n\r\">#");
String strLink = st.nextToken();
// Link validation
if (!strLink.contains("javascript:void(0);"))
{
try {
new URL(strLink);
} catch (MalformedURLException e) {
newImage = protocol;
if (strLink.startsWith("/")){
newImage = newImage.concat("://").concat(host).concat(strLink);
} else {
newImage = newImage.concat("://").concat(host).concat("/").concat(strLink);
}
}
if(newImage == null)
{
if (strLink.contains("http://") || strLink.contains("https://") || strLink.contains("ftp://"))
{
ImageList = ImageList.concat("|" + strLink);
}
} else {
try {
new URL(newImage);
} catch (MalformedURLException e) {
continue;
}
ImageList = ImageList.concat("|" + newImage);
}
} // ENDIF
} // ENDWHILE
String pattern = "[|]";
Pattern splitter = Pattern.compile(pattern);
String[] result = splitter.split(ImageList);
HashSet<String> set = new HashSet<String>(Arrays.asList(result));
set.remove("IMAGE");
set.remove(address);
String[] result2 = set.toArray(new String[set.size()]);
for(int i = 0; i < result2.length; i++)
{
returncode = DbWebsite.addImageCrawler(result2[i], "Link on image (System)", status);
if (returncode == 0)
{
logger.warning("Something went wrong saving new Image: " + result2[i]);
} else if (returncode == 1) {
logger.fine("Added Image " + result2[i] + " to crawling list");
} else if (returncode == 2) {
logger.finer("Image " + result2[i] + " already exists in index");
}
}
set.clear();
} catch (Exception e) {
logger.log(Level.INFO, "Error at getImages: ", e);
}// End Get Images
}
/**
* Gets all valid e-mail addresses out of a site.
* Yes, this could be abused to collect addresses for spam, but ex-crawler's goal
* is to get as much knowledge out of a website as possible. So what to do about this? I'm really not sure.
* Todo: Detect names and other text the address is wrapped in
* Todo: Write an algorithm that detects obfuscated addresses, like info [at] web (dot) de
* @param siteId site id for the database
*/
void getEmails(int siteId) throws Exception
{
try {
String mailName = "";
String lcContent = contents.toString().toLowerCase().trim();
String newEmail = null;
int returncode = 0;
Pattern p = Pattern.compile("([A-Za-z0-9]+@[A-Za-z0-9]+\\.[A-Za-z]{2,4}\\b)");
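// Matches simple addresses such as user1@example.com; dots or dashes in the local
// part and obfuscated forms like "info [at] web (dot) de" are not covered yet (see Todo above)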
Matcher m = p.matcher(lcContent);
while(m.find()){
if (!(m.group().contains("jpg") || m.group().contains("pdf") || m.group().contains("png") || m.group().contains("gif")))
{
returncode = DbWebsite.addDBEmail(m.group(), mailName, siteId);
logger.info("New mail address found: " + m.group() + " returncode: " + returncode);
}
}
} catch (Exception e) {
logger.log(Level.WARNING, "Failed to get E-Mails", e);
}
}
/**
* Not yet fully implemented function to build a word list for a site with counters
* Needs updating and belongs in a deeper analyzing class
* Todo: Move analyzeWordCount() into the deep crawling and update it
* For now it just prints the map to the console for testing
*/
void analyzeWordCount() throws Exception
{
map = new HashMap<String, Integer>();
String regex = "[-.,;:!=(){} ]+";
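// Splits on punctuation and spaces, e.g. "Hello, world! Hello." yields {hello=2, world=1}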
for (String w : contents.toString().split(regex)) {
String key = w.toLowerCase();
map.put(key, map.containsKey(key) ? map.get(key) + 1 : 1);
}
System.out.println("Map: " + map);
}
/**
* Basic function to strip all html from a website (does not work very well yet)
* Needs updating and a better algorithm; a regex pattern is not a good choice here
* Todo: Move getOnlyText() and update the algorithm
* @return the trimmed, text-only content of the website
*/
String getOnlyText() throws Exception {
try {
nohtml = contents.toString();
Pattern pattern = Pattern.compile("\\<.*?\\>");
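// "\\<.*?\\>" removes everything from a '<' to the next '>', so "<p>Hi</p>" becomes "Hi";
// it breaks on '>' characters inside attribute values or inline scripts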
Matcher m = pattern.matcher(nohtml);
nohtml = m.replaceAll("");
nohtml = nohtml.trim();
nohtml = nohtml.replaceAll("\\b\\s{2,}\\b", " ");
nohtml = nohtml.replaceAll("\\s\\s", "");
nohtml = nohtml.replaceAll("<!--.*?-->", ""); // strip leftover html comments
//nohtml = nohtml.replaceAll("\\s+", " ");
return nohtml;
} catch (Exception e ) {
e.printStackTrace();
return nohtml;
}
}
/**
* Basic function to strip all text and keep just the html markup (does not work very well yet)
* Needs updating and a better algorithm; a regex pattern is not a good choice here
* Todo: Move onlyHTML() and update algorithm
*/
void onlyHTML() throws Exception {
try {
html = contents.toString().replaceAll(">[^<]+<", "><"); // drop the text between tags, keep the markup
html = html.replaceAll("<!--.*?-->", "");
} catch (Exception e ) {
e.printStackTrace();
}
}
/**
* Gets the (html) title out of a website
* @return the html title, or an empty string if it cannot be read
* @throws Exception
*/
String getTitle() throws Exception {
try {
String title = doc.title();
logger.info("Title: " + title);
return title;
} catch (Exception e) {
logger.log(Level.INFO, "Error at getTitle", e);
return "";
}
}
/**
* Parses the meta tags and stores them in the corresponding class fields
* Needs some optimization
* @throws Exception
*/
void parseMetatags() throws Exception {
try {
Elements meta;
String comments = "";
meta = doc.getElementsByTag("META");
Object metatags[] = meta.toArray();
for (int i = 0; i < metatags.length; i++)
{
String metatag = metatags[i].toString().toLowerCase().trim();
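// Each branch below strips the surrounding markup, e.g.
// <meta name="keywords" content="java, crawler" /> is reduced to: java, crawler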
if(metatag.contains("keywords"))
{
keywords = metatag;
keywords = keywords.replaceAll("\\<.?meta.?name=\"keywords\".*content.?=.?\"", "");
keywords = keywords.replaceAll("\".?/>", "");
logger.info("Keywords: " + keywords);
}
if(metatag.contains("description"))
{
description = metatag;
description = description.replaceAll("\\<.?meta.?name=\"description\".*content.?=.?\"", "");
description = description.replaceAll("\".?/>", "");
logger.info("Description: " + description);
}
if(metatag.contains("author"))
{
author = metatag;
author = author.replaceAll("\\<.?meta.?name=\"author\".*content.?=.?\"", "");
author = author.replaceAll("\".?/>", "");
logger.info("Author: " + author);
}
if(metatag.contains("robots"))
{
robots = metatag;
robots = robots.replaceAll("\\<.?meta.?name=\"robots\".*content.?=.?\"", "");
robots = robots.replaceAll("\".?/>", "");
logger.info("Robots: " + robots);
}
if(metatag.contains("generator"))
{
generator = metatag;
generator = generator.replaceAll("\\<.?meta.?name=\"generator\".*content.?=.?\"", "");
generator = generator.replaceAll("\".?/>", "");
logger.info("Generator: " + generator);
}
} //End For
} catch (Exception e) { // End Try
logger.log(Level.INFO, "Something went wrong @ parseMetaTags", e);
}
} // End Metatags
/**
* Tries to get the Basehref from the website html
* @return basehref
*/
String getBasehref() throws Exception {
try
{
basehref = doc.getElementsByTag("BASE").first().attr("href");
logger.info("Basehref: " + basehref);
} catch (Exception e) {
logger.info("Site has no basehref");
}
return basehref;
}
/**
* Tries to get the doctype of the website html
* @return the detected doctype, or "unknown"
* @throws Exception
*/
String getDoctype() throws Exception {
try
{
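// The doctype declaration sits at the very top of the document, so the first
// 130 characters are enough to identify it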
doctype = contents.substring(0, Math.min(130, contents.length())).toLowerCase().trim();
if (doctype.contains("xhtml 1.0"))
{
doctype = "XHTML 1.0";
}
else if (doctype.contains("xhtml 1.1"))
{
doctype = "XHTML 1.1";
}
else if (doctype.contains("xhtml 2.0"))
{
doctype = "XHTML 2.0";
}
else if (doctype.contains("html 4.01"))
{
doctype = "HTML 4.01";
}
else if (doctype.contains("html 5"))
{
doctype = "HTML 5";
}
else if (doctype.contains("mathml 2.0"))
{
doctype = "MathML 2.0";
}
else if (doctype.contains("mathml 1.01"))
{
doctype = "MathML 1.01";
}
else if (doctype.contains("svg 1.0"))
{
doctype = "SVG 1.0";
}
else if (doctype.contains("svg 1.1"))
{
doctype = "SVG 1.1";
}
// Old doctypes
else if (doctype.contains("html 2.0"))
{
doctype = "HTML 2.0";
}
else if (doctype.contains("html 3.2"))
{
doctype = "HTML 3.2";
}
else if (doctype.contains("xhtml basic 1.0"))
{
doctype = "XHTML Basic 1.0";
}
else {
doctype = "unknown";
}
} catch (Exception e) {
logger.log(Level.INFO,"Something went wrong with doctype from site " + address, e);
doctype ="unknown";
}
logger.info("Doctype: " + doctype);
return doctype;
} // End Getdoctype
/**
* initializeFile() must have been called first, otherwise 0
* @return filesize of the site
*/
long getFilesize()
{
return fileSize;
}
/**
* initializeFile() must have been called first, otherwise 0
* @return checksum of the site
*/
long getChecksum()
{
return checksum;
}
/**
* parseMetatags() must have been called first, otherwise null
* @return meta-keywords of the site
*/
String getKeywords()
{
return keywords;
}
/**
* parseMetatags() must have been called first, otherwise null
* @return meta-description of the site
*/
String getDescription()
{
return description;
}
/**
* parseMetatags() must have been called first, otherwise null
* @return meta-generator of the site
*/
String getGenerator()
{
generator = BasicRegexp.shortenString(255, generator);
return generator;
}
/**
* parseMetatags() must have been called first, otherwise null
* @return robots tag of the site
*/
String getRobots()
{
robots = BasicRegexp.shortenString(255, robots);
return robots;
}
/**
* parseMetatags() must have been called first, otherwise null
* @return meta author of the site
*/
String getAuthor()
{
author = BasicRegexp.shortenString(255, author);
return author;
}
} // End Class