Package de.jungblut.crawl.extraction

Source Code of de.jungblut.crawl.extraction.ArticleContentExtrator$ContentFetchResult

package de.jungblut.crawl.extraction;

import static de.jungblut.crawl.extraction.OutlinkExtractor.consumeStream;
import static de.jungblut.crawl.extraction.OutlinkExtractor.extractOutlinks;
import static de.jungblut.crawl.extraction.OutlinkExtractor.getConnection;

import java.io.IOException;
import java.io.InputStream;
import java.util.HashSet;
import java.util.concurrent.ExecutionException;

import org.apache.commons.lang.StringEscapeUtils;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.tags.TitleTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.util.SimpleNodeIterator;

import de.jungblut.crawl.ConsoleResultWriter;
import de.jungblut.crawl.Crawler;
import de.jungblut.crawl.FetchResult;
import de.jungblut.crawl.SequentialCrawler;
import de.jungblut.crawl.extraction.ArticleContentExtrator.ContentFetchResult;
import de.l3s.boilerpipe.BoilerpipeExtractor;
import de.l3s.boilerpipe.extractors.ArticleExtractor;

/**
* Extractor for news articles. Uses Boilerpipes {@link ArticleExtractor} to
* extract the largest block of text and the article title.
*
* @author thomas.jungblut
*
*/
public final class ArticleContentExtrator implements
    Extractor<ContentFetchResult> {

  private final BoilerpipeExtractor extractor = ArticleExtractor.getInstance();

  private static final NodeFilter TITLE_FILTER = new NodeClassFilter(
      TitleTag.class);

  @Override
  public ContentFetchResult extract(String site) {

    if (site == null || !site.startsWith("http") || site.length() > 500)
      return null;

    try {
      InputStream connection = getConnection(site);
      String html = consumeStream(connection);
      html = StringEscapeUtils.unescapeHtml(html);
      final HashSet<String> outlinkSet = extractOutlinks(html, site);
      String title = extractTitle(html);

      String extractedLargestText = extractor.getText(html);
      return new ContentFetchResult(site, outlinkSet, title,
          extractedLargestText);
    } catch (ParserException pEx) {
      // ignore parser exceptions, they contain mostly garbage
    } catch (RuntimeException rEx) {
      rEx.printStackTrace();
    } catch (Exception e) {
      System.err.println(e.toString().replace("\n", "; ") + " >>> URL was: \""
          + site + "\"");
    }

    return null;
  }

  /**
   * Extracts the title from the given HTML.
   *
   * @return never null, just an empty string if not parsable.
   */
  public static String extractTitle(String html) throws ParserException {
    String title = "";
    Parser parser = new Parser(html);
    NodeList matches = parser.extractAllNodesThatMatch(TITLE_FILTER);
    SimpleNodeIterator it = matches.elements();
    while (it.hasMoreNodes()) {
      TitleTag node = (TitleTag) it.nextNode();
      title = node.getTitle().trim();
    }
    return title;
  }

  /**
   * Article content fetch result.
   */
  public static class ContentFetchResult extends FetchResult {

    private final String title;
    private final String text;

    public ContentFetchResult(String url, HashSet<String> outlinks) {
      super(url, outlinks);
      title = null;
      text = null;
    }

    public ContentFetchResult(String url, HashSet<String> outlinks,
        String title, String text) {
      super(url, outlinks);
      this.title = title;
      this.text = text;
    }

    public String getTitle() {
      return title;
    }

    public String getText() {
      return text;
    }

    @Override
    public String toString() {
      return title + "\n\n" + text;
    }

  }

  public static void main(String[] args) throws IOException,
      InterruptedException, ExecutionException {
    String start = "http://www.spiegel.de/wissenschaft/natur/erbgut-entziffert-austern-haben-viele-anti-stress-gene-a-856902.html";

    Crawler<ContentFetchResult> crawler = new SequentialCrawler<>(1,
        new ArticleContentExtrator(),
        new ConsoleResultWriter<ContentFetchResult>());

    crawler.process(start);

  }
}
TOP

Related Classes of de.jungblut.crawl.extraction.ArticleContentExtrator$ContentFetchResult

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.