package com.ontometrics.scraper.extraction;
import com.ontometrics.scraper.TagOccurrence;
import com.ontometrics.scraper.util.IOUtils;
import net.htmlparser.jericho.HTMLElementName;
import net.htmlparser.jericho.Source;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.Deque;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.Map;
* Provides a means of collecting {@link Manipulator}s and performing
* progressive harvesting of html from an original source. This is done through
* an implementation of the Chain of Responsibility Pattern: the manipulators
* are held in a LinkedList and when new ones are added, they are bolted on to
* the end, then, when the source is requested, the first {@link Manipulator} is
* invoked, setting off the chain of operations. Then the resulting source is
* extracted.
* The extractor fetches content from a {@link URL} using a {@link UrlContentProvider}. The default implementation is
* {@link UrlConnectionContentProvider}, but it can be replaced; see {@link #urlContentProvider(UrlContentProvider)}.
* @author Rob
public class HtmlExtractor extends BaseExtractor {
// Class-wide SLF4J logger; performManipulations() uses it to report IOExceptions.
private static final Logger log = LoggerFactory.getLogger(HtmlExtractor.class);
* This holds the source that we are manipulating.
// Working source: assigned by source(Source), performManipulations() and
// fetchSourceFromUrl(); reset to null by clearCachedSource().
private Source source;
// Optional extractor supplied through from(SourceExtractor). getSourceExtractor()
// substitutes a SimpleSourceExtractor while this is still null.
private SourceExtractor sourceExtractor;
* Typical starting point for beginning the process of getting html to
* manipulate.
// Assigned by url(URL); handed to urlContentProvider.getContent(...) in fetchSourceFromUrl().
private URL url;
* The chain of collaborators who will do the work of transforming the html
* source.
// Deque backed by a LinkedList. performManipulations() reads the resulting
// source from the last element (getLast()).
private Deque<Manipulator> manipulators = new LinkedList<Manipulator>();
* Provider of the content for {@link #url}.
// Defaults to a UrlConnectionContentProvider produced by its Builder with no
// options set; replaceable through urlContentProvider(UrlContentProvider).
private UrlContentProvider urlContentProvider = new UrlConnectionContentProvider.Builder().build();
public HtmlExtractor from(SourceExtractor sourceExtractor) {
// Fluent setter: stores the given extractor in the sourceExtractor field
// (replacing any previous value) and returns this instance for chaining.
this.sourceExtractor = sourceExtractor;
return this;
public static HtmlExtractor html() {
// Static factory: every call returns a new HtmlExtractor created with the
// no-argument constructor, so instances are never shared between callers.
return new HtmlExtractor();
public HtmlExtractor source(Source startingSource) {
// Fluent setter: makes startingSource the current working source and returns
// this instance for chaining.
// NOTE(review): performManipulations() tests this field for null, so a source
// set here presumably bypasses fetching - confirm against the full method.
this.source = startingSource;
return this;
* The idea here is that the various static methods that are used to present
* the syntax of the DSL will ultimately enqueue a corresponding command by
* calling this method, so for instance, the method:
* <p>
* <code>
* public HtmlExtractor table(int occurrence)
* </code>
* <p>
* will be turned into a request to take the html that was passed in, parse it,
* get the nth occurrence of a table tag, then pass it on to the next Manipulator
* in the chain.
* @param manipulator
* the command to be enqueued at this point in the progressive
* operation of extracting the html source
public void addManipulator(Manipulator manipulator) {
// Called by the DSL methods of this class (table, after, divWithID, ...) to
// register a manipulator.
// NOTE(review): only the hasManipulators() check is visible in this view; what
// happens inside and after the branch (presumably linking the new manipulator
// to the current last one and appending it) must be confirmed against the
// complete file.
if (hasManipulators()) {
public HtmlExtractor clean() {
// Creates a CleanManipulator and returns this instance for chaining.
// NOTE(review): the statements that use `cleaner` (inside and after the
// hasManipulators() branch) are not visible in this view - confirm how it is
// attached to the chain.
Manipulator cleaner = new CleanManipulator();
if (hasManipulators()) {
return this;
* Call this when it's time to actually perform the operations on the
* source.
public void performManipulations() {
// Runs the manipulation chain. Visible behavior: the null check on the cached
// source, the hasManipulators() check, reading the result from the last
// manipulator, and logging of IOExceptions.
// NOTE(review): several statements of this body are missing from this view, so
// the exact nesting of the two checks and what populates the source before the
// chain runs cannot be confirmed here.
try {
// A null source means nothing has been supplied or cached yet.
if (this.source == null) {
if (hasManipulators()) {
// The last manipulator in the deque provides the resulting source.
source = manipulators.getLast().getSource();
} catch (IOException e) {
// The exception is logged and not rethrown: callers see no failure here.
log.error("IO Error while performing manipulations", e);
* Fetches source from {@link #url} using {@link #urlContentProvider}
* @throws IOException if i/o operation(s) fails
private void fetchSourceFromUrl() throws IOException {
// Asks urlContentProvider for the content stream of `url` and parses it into
// the `source` field.
// Declared outside the try so the finally block can reference the stream.
InputStream is = null;
try {
// The stream is captured in `is` while being passed to the Source constructor.
source = new Source(is = urlContentProvider.getContent(url));
// NOTE(review): the finally body is not visible in this view; presumably it
// closes `is` - confirm.
} finally {
private SourceExtractor getSourceExtractor() {
// Lazy default: when no extractor was supplied through from(SourceExtractor),
// a SimpleSourceExtractor is created on first access and kept in the field.
if (this.sourceExtractor == null) {
this.sourceExtractor = new SimpleSourceExtractor();
return this.sourceExtractor;
* (non-Javadoc)
* @see com.ontometrics.scraper.extraction.BaseExtractor#getSource()
public Source getSource() {
// Returns the current value of the `source` field.
// NOTE(review): as visible here the field is returned as-is, so it may be null
// if nothing has populated it yet - confirm against the complete file.
return source;
public HtmlExtractor clearCachedSource() {
// Drops the cached source by setting the field to null, which is the state
// performManipulations() checks for. Returns this instance for chaining.
this.source = null;
return this;
* Provides means of extracting a specific table.
* @param occurrence
* this would refer to the index in the list of all table tags
* found in the passed html
* @return the table tag and all its contents
public HtmlExtractor table(int occurrence) {
// Registers an ElementManipulator built from a TagOccurrence for the TABLE tag
// and returns this instance for chaining.
// NOTE(review): the builder chain is cut off after .tag(...) in this view;
// presumably it goes on to apply `occurrence` before build() - confirm.
addManipulator(new ElementManipulator(new TagOccurrence.Builder().tag(HTMLElementName.TABLE)
return this;
* Provides means of extracting a specific table.
* @param matching
* string used to pick the table (presumably matched against the
* table's content; see {@link #matching(String)})
* @return the table tag and all its contents
public HtmlExtractor table(String matching) {
// Registers an ElementManipulator built from a TagOccurrence for the TABLE tag
// and returns this instance for chaining.
// NOTE(review): the builder chain is cut off after .tag(...) in this view;
// presumably it goes on to apply `matching` before build() - confirm.
addManipulator(new ElementManipulator(new TagOccurrence.Builder().tag(HTMLElementName.TABLE)
return this;
* Provides means of getting the html after a tag.
* @param tag
* the element to look for
* @param occurrence
* which one
* @return all the html after (and including) the element
public HtmlExtractor after(String tag, int occurrence) {
// Registers a SplicingExtractor configured with SpliceOperation.After and a
// TagOccurrence for `tag`, then returns this instance for chaining.
// NOTE(review): the builder chain is cut off after .tag(tag) in this view;
// presumably it goes on to apply `occurrence` before build() - confirm.
addManipulator(new SplicingExtractor(SpliceOperation.After, new TagOccurrence.Builder().tag(tag)
return this;
* Provides a simple means of adding matching to the prior operation. For
* example, if you want to find a table that contains a given string, you
* would do:
* <p>
* <code>
* table().matching(targetString)
* </code>
* <p>
* How the matching is done is going to be based on the manipulator.
* @param matcher
* just a simple string to use for matching, or could be a regex
* expression
* @return the current HtmlExtractor for call chaining
public HtmlExtractor matching(String matcher) {
// Returns this instance for chaining.
// NOTE(review): no statement using `matcher` is visible in this view;
// presumably it is passed to the most recently added manipulator - confirm
// against the complete file.
return this;
* Usually the starting point: provides the path to a file that would be the
* original source that is then progressively transformed by any additional
* {@link Manipulator}s.
* @param url
* a valid URL pointing to a page that has html in it
* @return this, for chaining
public HtmlExtractor url(URL url) {
// Fluent setter: stores the URL that fetchSourceFromUrl() later passes to the
// content provider, and returns this instance for chaining.
this.url = url;
return this;
* Provides a means of extracting a table.
* @return this for method chaining
* @see #matching(String)
public HtmlExtractor table() {
// Registers an ElementManipulator for a TagOccurrence that specifies only the
// TABLE tag (no occurrence, id or class is set on the builder), then returns
// this instance for chaining.
addManipulator(new ElementManipulator(new TagOccurrence.Builder().tag(HTMLElementName.TABLE).build()));
return this;
private boolean hasManipulators() {
// True when at least one manipulator has been added to the deque.
return this.manipulators.size() > 0;
public HtmlExtractor tableWithID(String id) {
// Registers an ElementManipulator built from a TagOccurrence for the TABLE tag
// and returns this instance for chaining.
// NOTE(review): the builder chain is cut off after .tag(...) in this view;
// presumably it goes on to select by `id` before build() - confirm.
addManipulator(new ElementManipulator(new TagOccurrence.Builder().tag(HTMLElementName.TABLE)
return this;
public HtmlExtractor divWithID(String id) {
// Registers an ElementManipulator built from a TagOccurrence for the DIV tag
// and returns this instance for chaining.
// NOTE(review): the builder chain is cut off after .tag(...) in this view;
// presumably it goes on to select by `id` before build() - confirm.
addManipulator(new ElementManipulator(new TagOccurrence.Builder().tag(HTMLElementName.DIV)
return this;
public HtmlExtractor divWithOccurrence(int occurrence) {
// Registers an ElementManipulator built from a TagOccurrence for the DIV tag
// and returns this instance for chaining.
// NOTE(review): the builder chain is cut off after .tag(...) in this view;
// presumably it goes on to apply `occurrence` before build() - confirm.
addManipulator(new ElementManipulator(new TagOccurrence.Builder().tag(HTMLElementName.DIV)
return this;
public HtmlExtractor divWithClassAndOccurrence(String className, int occurrence) {
// Registers an ElementManipulator built from a TagOccurrence for the DIV tag
// and returns this instance for chaining.
// NOTE(review): the builder chain is cut off after .tag(...) in this view;
// presumably it goes on to apply `className` and `occurrence` before build() -
// confirm.
addManipulator(new ElementManipulator(new TagOccurrence.Builder().tag(HTMLElementName.DIV)
return this;
public HtmlExtractor spanWithID(String id) {
// Registers an ElementManipulator built from a TagOccurrence for the SPAN tag
// and returns this instance for chaining.
// NOTE(review): the builder chain is cut off after .tag(...) in this view;
// presumably it goes on to select by `id` before build() - confirm.
addManipulator(new ElementManipulator(new TagOccurrence.Builder().tag(HTMLElementName.SPAN)
return this;
public HtmlExtractor add(Manipulator manipulator) {
// Fluent counterpart of addManipulator(Manipulator): returns this instance for
// chaining.
// NOTE(review): no statement using `manipulator` is visible in this view;
// presumably it is forwarded to addManipulator(manipulator) - confirm.
return this;
public HtmlExtractor ofClass(String className, int occurrence) {
// Registers an ElementManipulator whose TagOccurrence is configured starting
// with elementIdentifierType(...), then returns this instance for chaining.
// NOTE(review): the argument of elementIdentifierType and the rest of the
// builder chain are cut off in this view; presumably they select elements by
// CSS class `className` at index `occurrence` - confirm.
addManipulator(new ElementManipulator(new TagOccurrence.Builder().elementIdentifierType(
return this;
public HtmlExtractor ofClass(String className) {
// Convenience overload: delegates to ofClass(className, 0), i.e. occurrence 0,
// and returns this instance for chaining.
ofClass(className, 0);
return this;
* Updates {@link UrlContentProvider} for getting content by {@link #url}
* @param urlContentProvider url content provider
* @return this, for chaining
public HtmlExtractor urlContentProvider(UrlContentProvider urlContentProvider) {
// Fluent setter: replaces the provider that fetchSourceFromUrl() uses to read
// the content of `url`. No null check is performed on the argument.
this.urlContentProvider = urlContentProvider;
return this;
* Use {@link #urlContentProvider} instead
* @throws IllegalStateException if {@link #urlContentProvider} is not an instance of {@link UrlConnectionContentProvider}
public HtmlExtractor addRequestProperty(String key, String value) {
// Forwards a request property to the current provider, which only works when
// that provider is a UrlConnectionContentProvider.
// Local copy of the field (it shadows the field name for the rest of the method).
UrlContentProvider urlContentProvider = this.urlContentProvider;
if (urlContentProvider instanceof UrlConnectionContentProvider) {
// Only UrlConnectionContentProvider exposes setRequestProperty, hence the cast.
((UrlConnectionContentProvider)urlContentProvider).setRequestProperty(key, value);
} else {
// Any other provider type (including null, which fails instanceof) is rejected.
throw new IllegalStateException("Current UrlContentProvider is not an instance of UrlConnectionContentProvider");
return this;
* Use {@link com.ontometrics.scraper.extraction.UrlConnectionContentProvider#getRequestProperties()} instead
* @return request properties
* @throws IllegalStateException if {@link #urlContentProvider} is not an instance of {@link UrlConnectionContentProvider}
public Map<String, String> getHttpRequestProperties() {
// Returns the request properties held by the current provider, which only
// works when that provider is a UrlConnectionContentProvider.
// Local copy of the field (it shadows the field name for the rest of the method).
UrlContentProvider urlContentProvider = this.urlContentProvider;
if (urlContentProvider instanceof UrlConnectionContentProvider) {
// The map is returned exactly as the provider hands it out; whether it is a
// copy or a live view is decided by UrlConnectionContentProvider.
return ((UrlConnectionContentProvider)urlContentProvider).getRequestProperties();
} else {
// Any other provider type (including null, which fails instanceof) is rejected.
throw new IllegalStateException("Current UrlContentProvider is not an instance of UrlConnectionContentProvider");
public HtmlExtractor attribute(Object attributeName) {
// Returns this instance for chaining.
// NOTE(review): no statement using `attributeName` is visible in this view, so
// as shown the call has no effect beyond chaining - confirm against the
// complete file whether this is a stub.
return this;