Package placeholder

Source Code of placeholder.Crawler

package placeholder;

import java.io.PrintStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import placeholder.check.CheckServices;
import placeholder.http.HttpRequest;
import placeholder.website.WebsiteResult;
import placeholder.website.queue.WebsiteQueue;
import placeholder.website.queue.WebsiteResultQueue;

/**
* Represents the crawler.
*
* @author Heiko Bornholdt
*/
public class Crawler extends Thread {
  protected final CrawlerPrintStream out;
  protected final WebsiteQueue queue;
  protected final WebsiteResultQueue resultQueue;

  /**
   * Creates a new Crawler with <code>out</code> as PrintStream for debug.
   */
  public Crawler(PrintStream out, WebsiteQueue queue, WebsiteResultQueue resultQueue) {
    this.out = new CrawlerPrintStream(out, this);
    this.queue = queue;
    this.resultQueue = resultQueue;
  }

  /**
   * Receives a webseite from WebsiteQueue and crawl.
   */
    public void run() {     
    String website;
    String illegalChars = "[^\\\u0000-\\\u002C\\\u002E\\\u002F\\\u003A-\\\u0040\\\u005B-\\\u0060\\\u007B-\\\u007F]+";
    String urlPattern = "(" +
          "(?:" + // hostname
            "https?://" + illegalChars + "(?:\\." + illegalChars + ")*" +
            "|" +
            "www\\.(?:" + illegalChars + "\\.)+" +
            "(?:[a-z]{2,4}(?=\\b))" +
          ")" +
          "(?::\\d+)?" + // port
        ")";
    Pattern pattern = Pattern.compile(urlPattern, Pattern.CASE_INSENSITIVE);
    try {     
      while ((website = this.queue.get()) != null || this.resultQueue.flush() && (website = this.queue.get()) != null) {
        this.out.titledPrintln("Website: " + website);

        // start request
        int responseCode = 0;
        String redirectLocation = null;
        String responseContent = null;
        WebsiteResult result;
        try {
          this.out.titledPrint("\tStarting request...");
         
          String website2 = "http://" + website;
          do {
            HttpRequest request = new HttpRequest(website2);
            request.send();
            responseCode = request.getConnection().getResponseCode();
            redirectLocation = request.getConnection().getHeaderField("Location");
            responseContent = request.getResponseBody();
           
            // follow redirect?
            try {
              URL url = new URL(redirectLocation);
              if (website.equals(url.getHost())) {
                website2 = redirectLocation;
              }
              else {
                website2 = null;
              }
            } catch (MalformedURLException e) {
              website2 = null;
            }
          } while (website2 != null);
          this.out.println("finished!");
        } catch(Exception e) {
          responseCode = -1;
          responseContent = "";
        } finally {
          result = new WebsiteResult(website, responseCode);
          this.resultQueue.add(result);
        }
       
        if (responseCode == 200) {
          // look for other websites adresses
          this.out.titledPrint("\tLooking for other websites...");
          Matcher matcher = pattern.matcher(responseContent);
          while (matcher.find() && matcher.group(1).indexOf(".") != -1) {           
            try {
              String host = new URL(matcher.group(1)).getHost();
             
              // ignore ip addresses
              if (host.matches("\\d{1,3}(\\.\\d{1,3}){3}")) {
                continue;
              }
             
              // ignore sub domains
              String[] parts = host.split("\\.", 5);
              String topLevel = parts[parts.length - 1];
              int maxParts = (topLevel.matches("at|be|biz|ca|cc|ch|com|cz|de|dk|fr|edu|es|eu|in|int|it|info|li|mobi|name|net|nl|org|ru|se|tv|us|ws") ? 3 : 4);

              if (parts.length > maxParts || parts.length == maxParts && !parts[0].equals("www")) {
                // remove lowed namespace
                StringBuilder builder = new StringBuilder();
                for (int i = parts.length - maxParts + 1; i < parts.length; i++) {
                  if (builder.length() > 0) builder.append(".");
                  builder.append(parts[i]);
                }
                host = builder.toString();
              }
             
              result.addWebsite(host);
            } catch (MalformedURLException e) {
              // ignore bad urls
              continue;
            }
          }
          this.out.println(result.getWebsites().length + " found.");
 
          // look for services
          this.out.titledPrint("\tLooking for services...");
          for (String service: CheckServices.check(responseContent)) {
            result.addService(service);
          }
          this.out.println(result.getServices().length + " found.");
        }
        else if (redirectLocation != null) {
          try {
            URL url = new URL(redirectLocation);
            result.addWebsite(url.getHost());
            this.out.titledPrintln("\tFound redirect to website: " + url.getHost());
          } catch (MalformedURLException e) {
            // ignore bad urls
            continue;
          }
        }
      }
    } catch (Exception e) {
      e.printStackTrace();
      System.exit(1);
    }

    this.out.titledPrintln("No more websites in queue.");
  }
}
TOP

Related Classes of placeholder.Crawler

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.