Package com.ontometrics.scraper.legacy

Source Code of com.ontometrics.scraper.legacy.Scraper

package com.ontometrics.scraper.legacy;

import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.text.MessageFormat;
import java.util.ArrayList;
import java.util.List;

import net.htmlparser.jericho.Element;
import net.htmlparser.jericho.HTMLElementName;
import net.htmlparser.jericho.Source;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.ontometrics.scraper.Iterator;
import com.ontometrics.scraper.Record;
import com.ontometrics.scraper.ScrapedRecord;
import com.ontometrics.scraper.extraction.Extractor;
import com.ontometrics.scraper.extraction.Field;
import com.ontometrics.scraper.util.ScraperUtil;

/**
* Provides a mechanism for extracting items from pages or feeds.
* <p>
* Uses a fluent builder pattern that borders on being a DSL. The idea is that
* a {@link #url} is provided, then the operations to be performed are framed
* through a set of manipulator methods. Every scrape requires a url call and
* then a terminal call such as {@link #getResult()} to perform the scraping.
* <p>
* Internally, the manipulators are currently triggered by basic conditions. A
* more sophisticated architecture will be needed as more operations pile up
* (perhaps something like the Chain of Responsibility pattern).
*
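* A minimal usage sketch (the URL is illustrative; checked exceptions
* omitted):
*
* <pre>
* {@code
* String pageText = new Scraper().url("http://example.com/page").asText().getResult();
* }
* </pre>
*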
* @author Rob
*/
public class Scraper {

  private static final Logger log = LoggerFactory.getLogger(Scraper.class);

  public static final String sessionIdKeyword = "$SESSION_ID$";

  /**
   * Remote resource we will be extracting content from.
   */
  private URL url;

  /**
   * Does the work of actually extracting the desired content. If there is an
   * {@link #iterator}, the scraper iterates through the {@link #url}s and
   * collects the results.
   */
  private Extractor extractor;

  /**
   * Usually Builders have a single product. We support two kinds of products:
   * a single string that represents a scrape and a set of extracted elements.
   */
  private List<String> results;

  /**
   * Scraper will call this to get the URL of the next page.
   */
  private Iterator iterator;

  /**
   * This variable dictates how many total pages will be iterated.
   */
  private int pages = 0;

  private List<Field> extractedFields;

  private List<Record> records;

  /**
   * Used for resolving relative links. At the moment, this is just the
   * protocol and host name.
   */
  private URL baseUrl;

  /**
   * Name of the session id parameter. After {@link #url(URL)} runs, this holds
   * the extracted session id value itself, which is then substituted for
   * {@link #sessionIdKeyword} in iterated URLs.
   */
  private String sessionIDName;

  public Scraper() {
    this.extractor = new Extractor();
  }

  /**
   * Provides a means of just extracting links.
   *
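   * A usage sketch ({@link #url(String)} must be called first; the URL is
   * illustrative):
   *
   * <pre>
   * {@code
   * List<URL> links = new Scraper().url("http://example.com").getLinks();
   * }
   * </pre>
   *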
   * @return a list of the links that were valid {@link URL}s.
   * @throws IOException if the page at {@link #url} cannot be read
   */
  public List<URL> getLinks() throws IOException {
    List<URL> links = new ArrayList<URL>();

    Source source = new Source(url);
    source.fullSequentialParse();
    List<Element> linkElements = source.getAllElements(HTMLElementName.A);
    for (Element linkElement : linkElements) {
      String href = linkElement.getAttributeValue("href");
      if (href == null) {
        continue;
      }
      // An <a> element can contain other tags, so extract just its text:
      String label = linkElement.getContent().getTextExtractor().toString();
      log.debug(MessageFormat.format("{0} <{1}>", label, href));
      URL currentUrl;
      try {
        currentUrl = new URL(href);
        links.add(currentUrl);
      } catch (MalformedURLException e) {
        log.error("malformed url", e);
      }
    }
    return links;
  }

  /**
   * Asks that just the text of the page be extracted.
   *
   * @return this scraper, for chaining
   */
  public Scraper asText() {
    extractor.asText();
    return this;
  }

  public Scraper usesSessionId(String sessionIdName) {
    this.sessionIDName = sessionIdName;
    return this;
  }

  // ---- Builder-style Interface

  /**
   * Same effect as {@link #usesSessionId(String)}.
   */
  public Scraper sessionIDName(String sessionIDName) {
    this.sessionIDName = sessionIDName;
    return this;
  }

  public Scraper url(String url) throws MalformedURLException {
    return url(new URL(url));
  }

  /**
   * Sets the url to scrape.
   * <p>
   * If {@link #sessionIDName} is set, the session id is extracted from the
   * page source based on that keyword. A base url is saved in {@link #baseUrl}
   * to handle relative links.
   *
   * @param url
   *            the remote resource to scrape
   * @return this scraper, for chaining
   * @throws MalformedURLException
   *             if the base url cannot be built
   */
  public Scraper url(URL url) throws MalformedURLException {
    this.url = url;
    this.extractedFields = null; // discard fields from any prior scrape
    extractor.url(url);
    if (this.sessionIDName != null) {
      try {
        // note: this overwrites the keyword name with the extracted value
        this.sessionIDName = ScraperUtil.extractSessionId(this.url, sessionIDName);
      } catch (IOException e) {
        log.error(MessageFormat.format("io error scraping url: {0}", url), e);
      }
    }
    // preserve the original protocol rather than assuming http
    baseUrl = new URL(url.getProtocol() + "://" + url.getHost());
    return this;
  }

  public Extractor extractor() {
    return this.extractor;
  }

  public Scraper iterator(Iterator iterator) {
    log.debug("setting iterator: {}", iterator);
    this.iterator = iterator;
    return this;
  }

  public Scraper pages(int pages) {
    this.pages = pages;
    return this;
  }

  public Scraper extract(String extraction) {
    // currently a no-op placeholder
    return this;
  }

  public String getResult() throws IOException {
    return extractor.execute();
  }

  public List<String> getResults() {
    return this.results;
  }

  public List<Field> getFields() {
    if (this.extractedFields == null) {
      this.extractedFields = new ArrayList<Field>();
      try {
        List<Field> fields = extractor.getFields();
        this.extractedFields.addAll(fields);
      } catch (IOException e) {
        log.error("IO getting fields", e);
      }
    }
    return extractedFields;
  }

  public List<Record> getRecords() {
    return this.records;
  }

  public Scraper listing(List<Field> fields) throws IOException {
    // this method gets the list of fields and stores them as links
    pages -= 1; // the first page has already been parsed by the time we get results
    extract(fields);
    log.debug("extracted links: {}", fields);
    return this;
  }

  public Scraper detail(Scraper detailScraper) {
    this.records = new ArrayList<Record>();
    // loop through the links extracted in listing, scrape each detail page,
    // and collect the resulting fields into records
    for (Field link : this.extractedFields) {
      try {
        String builtUrl = link.getValue();
        if (isRelativeUrl(builtUrl)) {
          builtUrl = convertToAbsoluteUrl(builtUrl);
        }
        log.debug("Using link = {}", builtUrl);
        List<Field> fields = new ArrayList<Field>(detailScraper.url(new URL(builtUrl)).getFields());
        log.debug("returned fields = {}", fields);
        records.add(new ScrapedRecord(fields));
      } catch (MalformedURLException e) {
        log.info("Bad URL in looping detail page for listing links: {}", e.toString());
      }
    }
    return this;
  }
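
  /*
   * A sketch of the listing/detail flow (the URL and linkFields are
   * hypothetical): collect links on a listing page, then scrape each linked
   * detail page into a Record.
   *
   *   Scraper scraper = new Scraper();
   *   scraper.url("http://example.com/listing")
   *       .listing(linkFields)      // fields holding the detail-page links
   *       .detail(new Scraper());   // scrapes each link into a Record
   *   List<Record> records = scraper.getRecords();
   */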

  private boolean isRelativeUrl(String url) {
    // treat anything without an explicit http(s) scheme as relative
    return url != null && !url.startsWith("http://") && !url.startsWith("https://");
  }

  private String convertToAbsoluteUrl(String link) {
    // e.g. baseUrl "http://example.com" + link "/detail" -> "http://example.com/detail"
    return baseUrl.toString().concat(link);
  }

  public void setExtractor(Extractor extractor) {
    this.extractor = extractor;
  }

  public Scraper extractStrings(List<String> results) throws IOException {
    log.debug("extractor returned these results: {}", results);
    this.results = results;
    if (iterator != null) {
      for (int i = 0; i < pages; i++) {
        URL nextUrl = iterator.next();

        if (nextUrl.toString().contains(sessionIdKeyword)) {
          // substitute the session id extracted in url(URL) for the keyword
          String urlString = nextUrl.toString().replace(sessionIdKeyword, sessionIDName);
          nextUrl = new URL(urlString);
        }
        log.debug("next url = {}", nextUrl);
        extractor.url(nextUrl);
        results.addAll(extractor.getResults());
      }
    }
    return this;
  }

  /**
   * Supports injecting a session id into the URL: if the
   * {@link #sessionIdKeyword} is found in an iterated URL, it is replaced
   * with the session id extracted in {@link #url(URL)}.
   *
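   * A sketch of the flow (the URL, keyword name, and iterator are
   * hypothetical):
   *
   * <pre>
   * {@code
   * // pageIterator.next() returns e.g. http://example.com/list?page=2&sid=$SESSION_ID$
   * scraper.usesSessionId("jsessionid")
   *     .url("http://example.com/list") // extracts the actual session id
   *     .iterator(pageIterator)
   *     .pages(3)
   *     .extract(new ArrayList<Field>());
   * }
   * </pre>
   *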
   * @param results
   *            the collection that extracted fields are accumulated into
   * @return this scraper, for chaining
   * @throws IOException
   *             if an iterated page cannot be read
   */
  public Scraper extract(List<Field> results) throws IOException {

    this.extractedFields = results;

    if (iterator != null) {
      for (int i = 0; i < pages; i++) {
        URL nextUrl = iterator.next();

        if (nextUrl.toString().contains(sessionIdKeyword)) {
          String urlString = nextUrl.toString().replace(sessionIdKeyword, sessionIDName);
          nextUrl = new URL(urlString);
        }
        log.debug("next url = {}", nextUrl);
        extractor.url(nextUrl);
        this.extractedFields.addAll(extractor.getFields());
      }
    }
    return this;
  }

}