// Source code of com.ontometrics.scraper.extraction.HtmlExtractor

package com.ontometrics.scraper.extraction;


import com.ontometrics.scraper.TagOccurrence;
import com.ontometrics.scraper.util.IOUtils;
import net.htmlparser.jericho.HTMLElementName;
import net.htmlparser.jericho.Source;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;


import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.Deque;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.Map;


/**
 * Provides a means of collecting {@link Manipulator}s and performing
 * progressive harvesting of html from an original source. This is done through
 * an implementation of the Chain of Responsibility Pattern: the manipulators
 * are held in a LinkedList and when new ones are added, they are bolted on to
 * the end, then, when the source is requested, the first {@link Manipulator} is
 * invoked, setting off the chain of operations. Then the resulting source is
 * extracted.
 *
 * Extractor is fetching content from {@link URL} using {@link UrlContentProvider}. Default implementation is
 * {@link UrlConnectionContentProvider}, but it can be changed, see {@link #urlContentProvider(UrlContentProvider)}
 * 
 * @author Rob
 * 
 */
public class HtmlExtractor extends BaseExtractor {


  private static final Logger log = LoggerFactory.getLogger(HtmlExtractor.class);


  /**
   * This holds the source that we are manipulating.
   */
  private Source source;


  private SourceExtractor sourceExtractor;


  /**
   * Typical starting point for beginning the process of getting html to
   * manipulate.
   */
  private URL url;


  /**
   * The chain of collaborators who will do the work of transforming the html
   * source.
   */
  private Deque<Manipulator> manipulators = new LinkedList<Manipulator>();


    /**
     * Provider of content {@link java.io.InputStream} for url
     */
    private UrlContentProvider urlContentProvider = new UrlConnectionContentProvider.Builder().build();


  public HtmlExtractor from(SourceExtractor sourceExtractor) {
    this.sourceExtractor = sourceExtractor;
    return this;
  }


  public static HtmlExtractor html() {
    return new HtmlExtractor();
  }


    @Override
    public HtmlExtractor source(Source startingSource) {
        this.source = startingSource;
        super.source(startingSource);
        return this;
    }


    /**
   * The idea here is that the various static methods that are used to present
   * the syntax of the DSL will ultimately enqueue a corresponding command by
   * calling this method, so for instance, the method:
   * <p>
   * <code>
   * public HtmlExtractor table(int occurrence)
   * </code>
   * <p>
   * will be turned into a request to take the html that was passed in, parse it,
   * get the nth occurrence of a table tag, then pass it on to the next Manipulator
   * in the chain.
   * 
   * @param manipulator
   *            the command to be enqueued at this point in the progressive
   *            operation of extracting the html source
   */
  public void addManipulator(Manipulator manipulator) {
    if (hasManipulators()) {
      manipulators.getLast().setSuccessor(manipulator);
    }
    manipulators.add(manipulator);
  }


  public HtmlExtractor clean() {
    Manipulator cleaner = new CleanManipulator();
    if (hasManipulators()) {
      cleaner.setSuccessor(manipulators.getFirst());
    }
    manipulators.addFirst(cleaner);
    return this;
  }


  /**
   * Call this when it's time to actually perform the operations on the
   * source.
   */
  public void performManipulations() {
    try {
            if (this.source == null) {
                fetchSourceFromUrl();
            }


      if (hasManipulators()) {
        manipulators.getFirst().execute(source);
        source = manipulators.getLast().getSource();
      }
    } catch (IOException e) {
      log.error("IO Error while performing manipulations", e);
    }
  }


    /**
     * Fetches source from {@link #url} using {@link #urlContentProvider}
     * @throws IOException if i/o operation(s) fails
     */
    private void fetchSourceFromUrl() throws IOException {
        InputStream is = null;
        try {
            source = new Source(is = urlContentProvider.getContent(url));
        } finally {
            IOUtils.closeQuietly(is);
        }
    }


    private SourceExtractor getSourceExtractor() {
    if (this.sourceExtractor == null) {
      this.sourceExtractor = new SimpleSourceExtractor();
    }
    return this.sourceExtractor;
  }


  /*
   * (non-Javadoc)
   * 
   * @see com.ontometrics.scraper.extraction.BaseExtractor#getSource()
   */
  public Source getSource() {
    performManipulations();
    return source;
  }


    @Override
    public HtmlExtractor clearCachedSource() {
        super.clearCachedSource();
        this.source = null;
        return this;
    }


    /**
   * Provides means of extracting a specific table.
   * 
   * @param occurrence
   *            this would refer to the index in the list of all table tags
   *            found in the passed html
   * @return the table tag and all its contents
   */
  public HtmlExtractor table(int occurrence) {
    addManipulator(new ElementManipulator(new TagOccurrence.Builder().tag(HTMLElementName.TABLE)
        .occurrence(occurrence)
        .build()));
    return this;
  }


  /**
   * Provides means of extracting a specific table.
   * 
   * @param matching
   *            this would refer to the index in the list of all table tags
   *            found in the passed html
   * @return the table tag and all its contents
   */
  public HtmlExtractor table(String matching) {
    addManipulator(new ElementManipulator(new TagOccurrence.Builder().tag(HTMLElementName.TABLE)
        .matching(matching)
        .build()));
    return this;
  }


  /**
   * Provides means of getting the html after a tag.
   * 
   * @param tag
   *            the element to look for
   * @param occurrence
   *            which one
   * @return all the html after (and including) the element
   */
  public HtmlExtractor after(String tag, int occurrence) {
    addManipulator(new SplicingExtractor(SpliceOperation.After, new TagOccurrence.Builder().tag(tag)
        .occurrence(occurrence)
        .build()));
    return this;
  }


  /**
   * Provides a simple means of adding matching to the prior operation. For
   * example, if you want to find a table that contains a given string, you
   * would do:
   * <p>
   * <code>
   * table().matching(targetString)
   * </code>
   * <p>
   * How the matching is done is going to be based on the manipulator.
   * 
   * @param matcher
   *            just a simple string to use for matching, or could be a regex
   *            expression
   * @return the current HtmlExtractor for call chaining
   */
  public HtmlExtractor matching(String matcher) {
    manipulators.getLast().setMatcher(matcher);
    return this;
  }


  /**
   * Usually the starting point: provides the path to a file that would be the
   * original source that is then progressively transformed by any additional
   * {@link Manipulator}s.
   * 
   * @param url
   *            valid url point to a page that has html in it
   * @return this, for chaining
   */
  public HtmlExtractor url(URL url) {
    this.url = url;
    return this;
  }


  /**
   * Provides a means of extracting a table.
   * 
   * @return this for method chaining
   * @see #matching(String)
   */
  public HtmlExtractor table() {
    addManipulator(new ElementManipulator(new TagOccurrence.Builder().tag(HTMLElementName.TABLE).build()));
    return this;
  }


  private boolean hasManipulators() {
    return this.manipulators.size() > 0;
  }


  public HtmlExtractor tableWithID(String id) {
    addManipulator(new ElementManipulator(new TagOccurrence.Builder().tag(HTMLElementName.TABLE)
        .elementIdentifierType(ElementIdentifierType.ID)
        .identifier(id)
        .build()));
    return this;
  }


  public HtmlExtractor divWithID(String id) {
    addManipulator(new ElementManipulator(new TagOccurrence.Builder().tag(HTMLElementName.DIV)
        .elementIdentifierType(ElementIdentifierType.ID)
        .identifier(id)
        .build()));
    return this;
  }
  
  public HtmlExtractor divWithOccurrence(int occurrence) {
    addManipulator(new ElementManipulator(new TagOccurrence.Builder().tag(HTMLElementName.DIV)
        .occurrence(occurrence)
        .build()));
    return this;
  }




  public HtmlExtractor divWithClassAndOccurrence(String className, int occurrence) {
    addManipulator(new ElementManipulator(new TagOccurrence.Builder().tag(HTMLElementName.DIV)
        .elementIdentifierType(ElementIdentifierType.cssClass)
        .occurrence(occurrence)
        .identifier(className)
        .build()));
    return this;
  }


  public HtmlExtractor spanWithID(String id) {
    addManipulator(new ElementManipulator(new TagOccurrence.Builder().tag(HTMLElementName.SPAN)
        .elementIdentifierType(ElementIdentifierType.ID)
        .identifier(id)
        .build()));
    return this;
  }


  public HtmlExtractor add(Manipulator manipulator) {
    addManipulator(manipulator);
    return this;
  }


  public HtmlExtractor ofClass(String className, int occurrence) {
    addManipulator(new ElementManipulator(new TagOccurrence.Builder().elementIdentifierType(
        ElementIdentifierType.cssClass)
        .identifier(className)
        .ofClass(className)
        .occurrence(occurrence)
        .build()));
    
    return this;
  }


  public HtmlExtractor ofClass(String className) {
    ofClass(className, 0);
    return this;
  }


    /**
     * Updates {@link UrlContentProvider} for getting content by {@link #url}
     * @param urlContentProvider url content provider
     * @return this, for chaining
     */
    public HtmlExtractor urlContentProvider(UrlContentProvider urlContentProvider) {
        this.urlContentProvider = urlContentProvider;
        return this;
    }


    @Deprecated
    /**
     * Use {@link #urlContentProvider} instead
     * @throws IllegalStateException if {@link #urlContentProvider} is not an instance of {@link UrlConnectionContentProvider}
     */
  public HtmlExtractor addRequestProperty(String key, String value) {
        UrlContentProvider urlContentProvider = this.urlContentProvider;
        if (urlContentProvider instanceof UrlConnectionContentProvider) {
            ((UrlConnectionContentProvider)urlContentProvider).setRequestProperty(key, value);
        } else {
            throw new IllegalStateException("Current UrlContentProvider is not an instance of UrlConnectionContentProvider");
        }
    return this;
  }


    @Deprecated
    /**
     * Use {@link com.ontometrics.scraper.extraction.UrlConnectionContentProvider#getRequestProperties()} instead
     *
     * @return request properties
     *
     * @throws IllegalStateException if {@link #urlContentProvider} is not an instance of {@link UrlConnectionContentProvider}
     */
  public Map<String, String> getHttpRequestProperties() {
        UrlContentProvider urlContentProvider = this.urlContentProvider;
        if (urlContentProvider instanceof UrlConnectionContentProvider) {
            return ((UrlConnectionContentProvider)urlContentProvider).getRequestProperties();
        } else {
            throw new IllegalStateException("Current UrlContentProvider is not an instance of UrlConnectionContentProvider");
        }
  }


  public HtmlExtractor attribute(Object attributeName) {
    return this;
  }
}
// Source code of com.ontometrics.scraper.extraction.HtmlExtractor

// Related classes of com.ontometrics.scraper.extraction.HtmlExtractor