// Source Code of com.ontometrics.scraper.extraction.BaseExtractor

package com.ontometrics.scraper.extraction;


import java.net.URL;
import java.util.Deque;
import java.util.LinkedList;


import net.htmlparser.jericho.Source;


import org.slf4j.Logger;
import org.slf4j.LoggerFactory;


/**
 * The one thing all Extractors will have in common is the need to marshal some html to perform their specialized
 * extractions on. This class provides a means of getting that source by calling a chain of {@link Manipulator}s that
 * are held in the {@link HtmlExtractor}.
 * <p>
 * The derived classes don't need to know any of the details, they simply string together the manipulators they want on
 * the HtmlExtractor, then ask the base for the source.
 * <p>
 * Note: because we are doing a fluent interface implementation of the Builder Pattern here, we want to allow for syntax
 * like the following:
 * <p>
 * <code>
 * List<Link> links = linkExtractor.source(htmlExtractor.url(PagedListingTable.getUrl()).table(3)).getLinks();
 * </code>
 * <p>
 * In order to do that, subclasses need to override the source method and just call super, but return this (reference to
 * their class, not the base class).
 * 
 * @author Rob
 * 
 */
public abstract class BaseExtractor {
  @SuppressWarnings("unused")
  private static final Logger log = LoggerFactory.getLogger(BaseExtractor.class);


    private Source extractedSource = null;


  /**
   * Does the work of actually honing in on the source we are interested in.
   */
  private Deque<HtmlExtractor> htmlExtractors = new LinkedList<HtmlExtractor>();


  /**
   * Provides access to the source from the {@link #htmlExtractors} which derived classes will then perform their
   * extractions on.
   * 
   * @param htmlExtractor
   *            the extractor to handle the appropriation of the html
   * @return this so chaining can be done, though, subclasses should override so they can have one chain of calls. See
   *         the note above.
   */
  public BaseExtractor source(HtmlExtractor htmlExtractor) {
    this.htmlExtractors.add(htmlExtractor);
    return this;
  }


    public BaseExtractor source(Source startingSource){
        this.extractedSource = startingSource;
        return this;
    }


  /**
   * Provided as a convenience method for use in cases where might want to extract things from multiple sections of
   * the page.
   * 
   * @param htmlExtractor
   *            the chain of {@link Manipulator}s that will be used to get interested html element
   * @return this for chaining, note: overload this in subclasses so calls can be chained.
   */
  public BaseExtractor section(HtmlExtractor htmlExtractor) {
    source(htmlExtractor);
    return this;
  }


  public BaseExtractor url(URL url) {
    for (HtmlExtractor extractor : htmlExtractors) {
      extractor.url(url);
    }
    return this;
  }


    public BaseExtractor clearCachedSource() {
        this.extractedSource = null;
        for (HtmlExtractor htmlExtractor : htmlExtractors) {
            htmlExtractor.clearCachedSource();
        }
        return this;
    }


  protected HtmlExtractor getCurrentHtmlExtractor() {
    return this.htmlExtractors.getLast();
  }


  /**
   * Provides a means of kicking off the {@link Manipulation} chain and harvesting the results.
   * 
   * @return the manipulated source gotten from the {@link #htmlExtractors}
   */
  public Source getSource() {
        if (extractedSource==null){
            StringBuffer accumulatedSource = new StringBuffer();
            for (HtmlExtractor extractor : htmlExtractors) {
                accumulatedSource.append(extractor.getSource().toString());
            }
            Source combinedSource = new Source(accumulatedSource);
            combinedSource.fullSequentialParse();
            extractedSource = combinedSource;
        }
        return extractedSource;
  }


}
// Source Code of com.ontometrics.scraper.extraction.BaseExtractor

// Related Classes of com.ontometrics.scraper.extraction.BaseExtractor