Package de.jungblut.crawl.extraction

Source Code of de.jungblut.crawl.extraction.HtmlExtrator$HtmlFetchResult

package de.jungblut.crawl.extraction;

import static de.jungblut.crawl.extraction.OutlinkExtractor.consumeStream;
import static de.jungblut.crawl.extraction.OutlinkExtractor.extractOutlinks;
import static de.jungblut.crawl.extraction.OutlinkExtractor.getConnection;

import java.io.IOException;
import java.io.InputStream;
import java.util.HashSet;

import org.apache.commons.lang.StringEscapeUtils;
import org.htmlparser.util.ParserException;

import de.jungblut.crawl.FetchResult;
import de.jungblut.crawl.extraction.HtmlExtrator.HtmlFetchResult;

/**
* Extractor for raw html.
*
* @author thomas.jungblut
*
*/
public final class HtmlExtrator implements Extractor<HtmlFetchResult> {

  @Override
  public final HtmlFetchResult extract(String site) {

    if (site == null || !site.startsWith("http") || site.length() > 500)
      return null;

    try {
      InputStream connection = getConnection(site);
      String html = consumeStream(connection);
      html = StringEscapeUtils.unescapeHtml(html);
      final HashSet<String> outlinkSet = extractOutlinks(html, site);
      return new HtmlFetchResult(site, outlinkSet, html);
    } catch (ParserException pEx) {
      // ignore parser exceptions, they contain mostly garbage
    } catch (Exception e) {
      String errMsg = e.getMessage().length() > 150 ? e.getMessage().substring(
          0, 150) : e.getMessage();
      System.err.println(errMsg.replace("\n", "") + " >>> URL was: \"" + site
          + "\"");
    }

    return null;
  }

  /**
   * Article content fetch result.
   */
  public static class HtmlFetchResult extends FetchResult {

    private final String html;

    public HtmlFetchResult(String url, HashSet<String> outlinks) {
      super(url, outlinks);
      html = null;
    }

    public HtmlFetchResult(String url, HashSet<String> outlinks, String html) {
      super(url, outlinks);
      this.html = html;
    }

    public String getHtml() {
      return html;
    }

    @Override
    public String toString() {
      return html;
    }

  }

}
TOP

Related Classes of de.jungblut.crawl.extraction.HtmlExtrator$HtmlFetchResult

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.