Package de.jungblut.crawl.extraction

Source Code of de.jungblut.crawl.extraction.OutlinkExtractor

package de.jungblut.crawl.extraction;

import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.nio.ByteBuffer;
import java.nio.channels.Channels;
import java.nio.channels.ReadableByteChannel;
import java.util.HashSet;
import java.util.Iterator;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.util.SimpleNodeIterator;
import org.mozilla.universalchardet.UniversalDetector;

import de.jungblut.crawl.FetchResult;

/**
* Outlink extractor, parses a page just for its outlinks.
*
* @author thomas.jungblut
*
*/
public final class OutlinkExtractor implements Extractor<FetchResult> {

  private static final int BUFFER_SIZE = 1024 * 1024;
  private static final String USER_AGENT_KEY = "User-Agent";
  private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.97 Safari/537.11";
  private static final NodeFilter LINK_FILTER = new NodeClassFilter(
      LinkTag.class);

  private static final Pattern IGNORE_SUFFIX_PATTERN = Pattern
      .compile(".*(\\.(css|js|bmp|gif|jpe?g|png|tiff?|mid|mp2|mp3|mp4|wav|avi|mov|mpeg|ram|m4v|pdf|iso|rm|smil|wmv|swf|wma|zip|rar|gz))$");

  private static final Pattern BASE_URL_PATTERN = Pattern
      .compile("(http[s]*://[a-z0-9.-]+)");
  private static final Pattern GENERAL_URL_PATTERN = Pattern
      .compile("\\bhttps?://[-a-zA-Z0-9+&#/%?=~_|!:,.;]*[-a-zA-Z0-9+&#/%=~_|]");

  @Override
  public FetchResult extract(String realUrl) {
    if (realUrl == null || !realUrl.startsWith("http")
        || realUrl.length() > 500)
      return null;
    try {

      InputStream connection = getConnection(realUrl);
      String html = consumeStream(connection);

      final HashSet<String> set = extractOutlinks(html, realUrl);

      return new FetchResult(realUrl, set);
    } catch (ParserException pEx) {
      // ignore parser exceptions, they contain mostly garbage
    } catch (RuntimeException rEx) {
      rEx.printStackTrace();
    } catch (Exception e) {
      System.err.println(e.toString().replace("\n", "; ") + " >>> URL was: \""
          + realUrl + "\"");
    }
    return null;
  }

  /**
   * @return an opened stream.
   */
  public static InputStream getConnection(String realUrl) throws IOException {
    URL url = new URL(realUrl);
    // maybe we need to write our http connection. DNS cache and
    // inputstream reader...
    /*
     * get the IP of the DNS, put it into the cache. Then open an
     * socketinputstream with an 16k long buffer and let the reader read it...
     */
    URLConnection con = url.openConnection();
    // con.setConnectTimeout(50);
    con.addRequestProperty(USER_AGENT_KEY, USER_AGENT);
    return con.getInputStream();
  }

  /**
   * Filters outlinks from a parsed page that NOT matches the given matcher.
   */
  public static HashSet<String> filter(HashSet<String> set, Pattern matcher) {
    if (matcher != null) {
      Iterator<String> iterator = set.iterator();
      while (iterator.hasNext()) {
        if (!matcher.matcher(iterator.next()).matches()) {
          iterator.remove();
        }
      }
    }
    return set;
  }

  /**
   * Extracts outlinks of the given HTML doc in string.
   *
   * @param html the html to extract the outlinkts from.
   * @param url the url where we found the current document.
   * @return a set of outlinks.
   */
  public static HashSet<String> extractOutlinks(String html, String url)
      throws ParserException {

    final String baseUrl = extractBaseUrl(url);
    if (baseUrl == null)
      return null;

    final HashSet<String> set = new HashSet<>();
    Parser parser = new Parser(html);
    NodeList matches = parser.extractAllNodesThatMatch(LINK_FILTER);
    SimpleNodeIterator it = matches.elements();
    while (it.hasMoreNodes()) {
      LinkTag node = (LinkTag) it.nextNode();
      String link = node.getLink().trim();
      // remove the anchor if present
      if (link.contains("#")) {
        link = link.substring(0, link.lastIndexOf('#'));
      }
      if (link != null && !link.isEmpty()) {
        if (isValid(link)) {
          set.add(link);
          continue;
        }
        // sometimes people adress with "//"
        if (link.startsWith("//")) {
          link = "http:" + link;
          if (isValid(link)) {
            set.add(link);
            continue;
          }
        }
        // retry by expanding relative links
        if (link.charAt(0) == '/') {
          link = baseUrl + link;
          if (isValid(link)) {
            set.add(link);
            continue;
          }
        }

        // this is a relative url for the current directory
        if (url.endsWith("/")) {
          link = url + link;
        } else {
          link = url.substring(0, url.lastIndexOf('/') + 1) + link;
        }
        if (isValid(link)) {
          set.add(link);
          continue;
        }
      }
    }
    return set;
  }

  /**
   * Consumes a given {@link InputStream} and returns a string consisting of the
   * html code of the site.
   */
  public static String consumeStream(InputStream stream) throws IOException {
    try {
      UniversalDetector detector = new UniversalDetector(null);
      ReadableByteChannel bc = Channels.newChannel(stream);
      ByteBuffer buffer = ByteBuffer.allocate(BUFFER_SIZE);
      int read = 0;
      while ((read = bc.read(buffer)) != -1) {
        detector.handleData(buffer.array(), buffer.position() - read, read);
        buffer = resizeBuffer(buffer);
      }
      detector.dataEnd();
      // copy the result back to a byte array
      String encoding = detector.getDetectedCharset();
      return new String(buffer.array(), 0, buffer.position(),
          encoding == null ? "UTF-8" : encoding);
    } finally {
      if (stream != null) {
        stream.close();
      }
    }
  }

  private static ByteBuffer resizeBuffer(ByteBuffer buffer) {
    ByteBuffer result = buffer;
    // double the size if we have only 10% capacity left
    if (buffer.remaining() < (int) (buffer.capacity() * 0.1f)) {
      result = ByteBuffer.allocate(buffer.capacity() * 2);
      buffer.flip();
      result.put(buffer);
    }
    return result;
  }

  /**
   * Extracts a base url from the given url (to make relative outlinks to
   * absolute ones).
   *
   * @return a base url or null if none was found.
   */
  public static String extractBaseUrl(String url) {
    Matcher matcher = BASE_URL_PATTERN.matcher(url);
    if (matcher.find()) {
      return matcher.group();
    }
    return null;
  }

  /**
   * Checks if the site does not end with unparsable suffixes likes PDF and if
   * its a valid url by extracting a base url at at index 0.
   */
  public static boolean isValid(final String s) {
    Matcher baseMatcher = BASE_URL_PATTERN.matcher(s);
    return baseMatcher.find() && baseMatcher.start() == 0
        && !IGNORE_SUFFIX_PATTERN.matcher(s).matches()
        && GENERAL_URL_PATTERN.matcher(s).matches();
  }

}
TOP

Related Classes of de.jungblut.crawl.extraction.OutlinkExtractor

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact: coftware#gmail.com.