package placeholder;
import java.io.PrintStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import placeholder.check.CheckServices;
import placeholder.http.HttpRequest;
import placeholder.website.WebsiteResult;
import placeholder.website.queue.WebsiteQueue;
import placeholder.website.queue.WebsiteResultQueue;
/**
* Represents the crawler.
*
* @author Heiko Bornholdt
*/
public class Crawler extends Thread {

    /**
     * Maximum number of same-host redirects followed for a single website.
     * Without a bound, a redirect cycle on the crawled host would keep this
     * thread requesting forever.
     */
    private static final int MAX_REDIRECTS = 20;

    /**
     * Regex for one host-name label: one or more characters that are ASCII
     * letters, ASCII digits, the hyphen, or any non-ASCII character. It is
     * written as a negated class listing every other ASCII character.
     */
    private static final String HOST_LABEL = "[^\\\u0000-\\\u002C\\\u002E\\\u002F\\\u003A-\\\u0040\\\u005B-\\\u0060\\\u007B-\\\u007F]+";

    /**
     * Finds website addresses in a response body. Group 1 is either an
     * <code>http(s)://</code> prefix followed by a host name, or a bare host
     * name starting with <code>www.</code> and ending in a 2-4 letter suffix;
     * both forms may carry a port.
     */
    private static final Pattern URL_PATTERN = Pattern.compile(
        "(" +
            "(?:" + // hostname
                "https?://" + HOST_LABEL + "(?:\\." + HOST_LABEL + ")*" +
                "|" +
                "www\\.(?:" + HOST_LABEL + "\\.)+" +
                "(?:[a-z]{2,4}(?=\\b))" +
            ")" +
            "(?::\\d+)?" + // port
        ")",
        Pattern.CASE_INSENSITIVE);

    /** Matches a host that is a dotted-quad ip address. */
    private static final Pattern IP_ADDRESS_PATTERN = Pattern.compile("\\d{1,3}(\\.\\d{1,3}){3}");

    /**
     * Top-level domains whose hosts are cut down to two labels; hosts under
     * any other top-level domain keep three labels. Presumably the listed
     * ones are those where names are registered directly below the top level.
     */
    private static final Pattern DIRECT_TLD_PATTERN = Pattern.compile(
        "at|be|biz|ca|cc|ch|com|cz|de|dk|fr|edu|es|eu|in|int|it|info|li|mobi|name|net|nl|org|ru|se|tv|us|ws");

    protected final CrawlerPrintStream out;
    protected final WebsiteQueue queue;
    protected final WebsiteResultQueue resultQueue;

    /**
     * Creates a new Crawler with <code>out</code> as PrintStream for debug.
     *
     * @param out         stream that receives the progress output; it is
     *                    wrapped in a CrawlerPrintStream bound to this crawler
     * @param queue       source of the websites to crawl
     * @param resultQueue sink for the result of every crawled website
     */
    public Crawler(PrintStream out, WebsiteQueue queue, WebsiteResultQueue resultQueue) {
        this.out = new CrawlerPrintStream(out, this);
        this.queue = queue;
        this.resultQueue = resultQueue;
    }

    /**
     * Takes websites from the WebsiteQueue and crawls them one by one.
     * <p>
     * For each website an HTTP request is sent, following at most
     * {@link #MAX_REDIRECTS} redirects that stay on the same host. A result
     * with the final response code (or -1 if the request failed) is added to
     * the result queue. On a 200 response the body is searched for other
     * websites and for services; otherwise, if a redirect location is known,
     * its host is recorded as a found website.
     * <p>
     * When the queue returns <code>null</code> the result queue is flushed
     * and the queue is asked once more; the loop ends when that also yields
     * nothing. Any exception that escapes the loop prints its stack trace and
     * terminates the whole JVM with exit code 1.
     */
    @Override
    public void run() {
        String website;

        try {
            while ((website = this.queue.get()) != null
                    || this.resultQueue.flush() && (website = this.queue.get()) != null) {
                this.out.titledPrintln("Website: " + website);

                // start request
                int responseCode = 0;
                String redirectLocation = null;
                String responseContent = null;
                WebsiteResult result;

                try {
                    this.out.titledPrint("\tStarting request...");

                    String target = "http://" + website;
                    int redirects = 0;
                    do {
                        HttpRequest request = new HttpRequest(target);
                        request.send();

                        responseCode = request.getConnection().getResponseCode();
                        redirectLocation = request.getConnection().getHeaderField("Location");
                        responseContent = request.getResponseBody();

                        // follow the redirect only while it stays on the crawled host
                        target = sameHostRedirect(website, redirectLocation);
                        if (target != null && ++redirects > MAX_REDIRECTS) {
                            // redirect cycle or overly long chain: keep the last response
                            target = null;
                        }
                    } while (target != null);

                    this.out.println("finished!");
                } catch (Exception e) {
                    // a failed request is reported as response code -1, not as a crawler error
                    responseCode = -1;
                    responseContent = "";
                    this.out.println("failed: " + e);
                } finally {
                    // NOTE(review): the result is queued before websites/services are
                    // added below; confirm the result queue tolerates later mutation.
                    result = new WebsiteResult(website, responseCode);
                    this.resultQueue.add(result);
                }

                if (responseCode == 200) {
                    // look for other website addresses
                    this.out.titledPrint("\tLooking for other websites...");
                    Matcher matcher = URL_PATTERN.matcher(responseContent);
                    while (matcher.find()) {
                        // an unusable match is skipped; the scan continues with the next one
                        String host = extractHost(matcher.group(1));
                        if (host != null) {
                            result.addWebsite(host);
                        }
                    }
                    this.out.println(result.getWebsites().length + " found.");

                    // look for services
                    this.out.titledPrint("\tLooking for services...");
                    for (String service : CheckServices.check(responseContent)) {
                        result.addService(service);
                    }
                    this.out.println(result.getServices().length + " found.");
                } else if (redirectLocation != null) {
                    try {
                        URL url = new URL(redirectLocation);
                        result.addWebsite(url.getHost());
                        this.out.titledPrintln("\tFound redirect to website: " + url.getHost());
                    } catch (MalformedURLException e) {
                        // ignore bad urls
                    }
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
            System.exit(1);
        }

        this.out.titledPrintln("No more websites in queue.");
    }

    /**
     * Returns <code>location</code> if it is a well-formed URL whose host
     * equals <code>website</code>, otherwise <code>null</code>.
     */
    private static String sameHostRedirect(String website, String location) {
        if (location == null) {
            return null;
        }
        try {
            return website.equals(new URL(location).getHost()) ? location : null;
        } catch (MalformedURLException e) {
            return null;
        }
    }

    /**
     * Turns one match of {@link #URL_PATTERN} into the host to record.
     *
     * @param match the matched text (scheme and port included, if present)
     * @return the host with sub domains removed, or <code>null</code> if the
     *         match has no dot, cannot be parsed as a URL, or is an ip address
     */
    private static String extractHost(String match) {
        if (match.indexOf('.') == -1) {
            return null;
        }

        // bare "www." matches carry no scheme, which URL cannot parse
        String spec = match.regionMatches(true, 0, "www.", 0, 4) ? "http://" + match : match;

        String host;
        try {
            host = new URL(spec).getHost();
        } catch (MalformedURLException e) {
            // ignore bad urls
            return null;
        }

        // ignore ip addresses
        if (IP_ADDRESS_PATTERN.matcher(host).matches()) {
            return null;
        }
        return stripSubdomains(host);
    }

    /**
     * Removes sub domains from <code>host</code>.
     * <p>
     * Hosts under a top-level domain listed in {@link #DIRECT_TLD_PATTERN}
     * are reduced to their last two labels, all others to their last three.
     * A host that is exactly one label longer than that and starts with
     * <code>www</code> is returned unchanged, as is any shorter host.
     */
    private static String stripSubdomains(String host) {
        // no limit on the split, so the last element is always the real top-level label
        String[] parts = host.split("\\.", -1);
        String topLevel = parts[parts.length - 1];
        int maxParts = DIRECT_TLD_PATTERN.matcher(topLevel).matches() ? 3 : 4;

        boolean hasSubdomain = parts.length > maxParts
                || (parts.length == maxParts && !parts[0].equals("www"));
        if (!hasSubdomain) {
            return host;
        }

        // keep only the trailing (maxParts - 1) labels
        StringBuilder builder = new StringBuilder();
        for (int i = parts.length - maxParts + 1; i < parts.length; i++) {
            if (builder.length() > 0) {
                builder.append(".");
            }
            builder.append(parts[i]);
        }
        return builder.toString();
    }
}