package de.jungblut.crawl.extraction;
import static de.jungblut.crawl.extraction.OutlinkExtractor.consumeStream;
import static de.jungblut.crawl.extraction.OutlinkExtractor.extractOutlinks;
import static de.jungblut.crawl.extraction.OutlinkExtractor.getConnection;
import java.io.InputStream;
import java.util.HashSet;
import org.apache.commons.lang.StringEscapeUtils;
import org.htmlparser.util.ParserException;
import de.jungblut.crawl.FetchResult;
import de.jungblut.crawl.extraction.HtmlExtrator.HtmlFetchResult;
/**
* Extractor for raw html.
*
* @author thomas.jungblut
*
*/
public final class HtmlExtrator implements Extractor<HtmlFetchResult> {

  // URLs longer than this are rejected outright as likely garbage.
  private static final int MAX_URL_LENGTH = 500;
  // Error messages are truncated to keep the log lines readable.
  private static final int MAX_ERROR_MESSAGE_LENGTH = 150;

  /**
   * Fetches the given site and returns its raw (HTML-unescaped) content
   * together with the set of outlinks found in it.
   *
   * @param site the absolute URL to fetch; must be non-null, start with
   *          "http" and be at most {@value #MAX_URL_LENGTH} characters long.
   * @return the fetch result, or null when the URL is invalid or any error
   *         occurred while downloading or parsing.
   */
  @Override
  public final HtmlFetchResult extract(String site) {
    if (site == null || !site.startsWith("http")
        || site.length() > MAX_URL_LENGTH) {
      return null;
    }
    // try-with-resources guarantees the stream is closed even on failure
    try (InputStream connection = getConnection(site)) {
      String html = consumeStream(connection);
      html = StringEscapeUtils.unescapeHtml(html);
      final HashSet<String> outlinkSet = extractOutlinks(html, site);
      return new HtmlFetchResult(site, outlinkSet, html);
    } catch (ParserException pEx) {
      // ignore parser exceptions, they contain mostly garbage
    } catch (Exception e) {
      // getMessage() may be null (e.g. a bare NullPointerException); fall
      // back to the exception class name so the log line stays informative.
      String errMsg = e.getMessage() != null ? e.getMessage() : e.getClass()
          .getName();
      if (errMsg.length() > MAX_ERROR_MESSAGE_LENGTH) {
        errMsg = errMsg.substring(0, MAX_ERROR_MESSAGE_LENGTH);
      }
      System.err.println(errMsg.replace("\n", "") + " >>> URL was: \"" + site
          + "\"");
    }
    return null;
  }

  /**
   * Article content fetch result that additionally carries the raw HTML of
   * the fetched page.
   */
  public static class HtmlFetchResult extends FetchResult {

    // raw page content; null when constructed without HTML
    private final String html;

    /**
     * Creates a result without page content (html will be null).
     */
    public HtmlFetchResult(String url, HashSet<String> outlinks) {
      this(url, outlinks, null);
    }

    /**
     * Creates a result carrying the raw HTML of the fetched page.
     */
    public HtmlFetchResult(String url, HashSet<String> outlinks, String html) {
      super(url, outlinks);
      this.html = html;
    }

    /**
     * @return the raw HTML, or null when none was captured.
     */
    public String getHtml() {
      return html;
    }

    @Override
    public String toString() {
      return html;
    }
  }
}