Package com.ontometrics.scraper.extraction

Source Code of com.ontometrics.scraper.extraction.BaseExtractor

package com.ontometrics.scraper.extraction;

import java.net.URL;
import java.util.Deque;
import java.util.LinkedList;

import net.htmlparser.jericho.Source;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
* The one thing all Extractors will have in common is the need to marshal some html to perform their specialized
* extractions on. This class provides a means of getting that source by calling a chain of {@link Manipulator}s that
* are held in the {@link HtmlExtractor}.
* <p>
* The derived classes don't need to know any of the details, they simply string together the manipulators they want on
* the HtmlExtractor, then ask the base for the source.
* <p>
* Note: because we are doing a fluent interface implementation of the Builder Pattern here, we want to allow for syntax
* like the following:
* <p>
* <pre>{@code
* List<Link> links = linkExtractor.source(htmlExtractor.url(PagedListingTable.getUrl()).table(3)).getLinks();
* }</pre>
* <p>
* In order to do that, subclasses need to override the source method and just call super, but return this (reference to
* their class, not the base class).
*
* @author Rob
*
*/
public abstract class BaseExtractor {
  @SuppressWarnings("unused")
  private static final Logger log = LoggerFactory.getLogger(BaseExtractor.class);

    private Source extractedSource = null;

  /**
   * Does the work of actually honing in on the source we are interested in.
   */
  private Deque<HtmlExtractor> htmlExtractors = new LinkedList<HtmlExtractor>();

  /**
   * Provides access to the source from the {@link #htmlExtractors} which derived classes will then perform their
   * extractions on.
   *
   * @param htmlExtractor
   *            the extractor to handle the appropriation of the html
   * @return this so chaining can be done, though, subclasses should override so they can have one chain of calls. See
   *         the note above.
   */
  public BaseExtractor source(HtmlExtractor htmlExtractor) {
    this.htmlExtractors.add(htmlExtractor);
    return this;
  }

    public BaseExtractor source(Source startingSource){
        this.extractedSource = startingSource;
        return this;
    }

  /**
   * Provided as a convenience method for use in cases where might want to extract things from multiple sections of
   * the page.
   *
   * @param htmlExtractor
   *            the chain of {@link Manipulator}s that will be used to get interested html element
   * @return this for chaining, note: overload this in subclasses so calls can be chained.
   */
  public BaseExtractor section(HtmlExtractor htmlExtractor) {
    source(htmlExtractor);
    return this;
  }

  public BaseExtractor url(URL url) {
    for (HtmlExtractor extractor : htmlExtractors) {
      extractor.url(url);
    }
    return this;
  }

    public BaseExtractor clearCachedSource() {
        this.extractedSource = null;
        for (HtmlExtractor htmlExtractor : htmlExtractors) {
            htmlExtractor.clearCachedSource();
        }
        return this;
    }

  protected HtmlExtractor getCurrentHtmlExtractor() {
    return this.htmlExtractors.getLast();
  }

  /**
   * Provides a means of kicking off the {@link Manipulation} chain and harvesting the results.
   *
   * @return the manipulated source gotten from the {@link #htmlExtractors}
   */
  public Source getSource() {
        if (extractedSource==null){
            StringBuffer accumulatedSource = new StringBuffer();
            for (HtmlExtractor extractor : htmlExtractors) {
                accumulatedSource.append(extractor.getSource().toString());
            }
            Source combinedSource = new Source(accumulatedSource);
            combinedSource.fullSequentialParse();
            extractedSource = combinedSource;
        }
        return extractedSource;
  }

}
TOP

Related Classes of com.ontometrics.scraper.extraction.BaseExtractor

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.