Package com.ontometrics.scraper.extraction

Source Code of com.ontometrics.scraper.extraction.LinkExtractor$LinkProcessor

package com.ontometrics.scraper.extraction;

import java.net.URL;
import java.util.ArrayList;
import java.util.List;

import net.htmlparser.jericho.Element;
import net.htmlparser.jericho.HTMLElementName;

import net.htmlparser.jericho.Source;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
* Provides a simple means of pulling links out of a clump of html.
*
* @author Rob
*/
public class LinkExtractor extends BaseExtractor {

  private static final Logger log = LoggerFactory.getLogger(LinkExtractor.class);

  private String matcher;
 
  private String styleClass;

    private URL baseUrl;

    private LinkProcessor linkProcessor;

    private boolean associateHtmlSourceWithLink;

  public LinkExtractor ofClass(String styleClass) {
    this.styleClass = styleClass;
    return this;
  }
  /**
   * This is the product we are building here.
   *
   * @return list of links found in the source (after manipulation)
   */
  public List<Link> getLinks() {
    return extractLinks();
  }

  /*
   * (non-Javadoc)
   *
   * @see com.ontometrics.scraper.extraction.BaseExtractor#source(com.ontometrics .scraper.extraction.HtmlExtractor)
   * Note: override here so we can do one chain of calls..
   */
  @Override
  public LinkExtractor source(HtmlExtractor htmlExtractor) {
    super.source(htmlExtractor);
    return this;
  }

    @Override
    public BaseExtractor source(Source startingSource) {
        super.source(startingSource);
        return this;
    }

    /*
         * (non-Javadoc)
         *
         * @see com.ontometrics.scraper.extraction.BaseExtractor#section(com.ontometrics .scraper.extraction.HtmlExtractor)
         * Note: override here so we can do one chain of calls..
         */
  @Override
  public LinkExtractor section(HtmlExtractor htmlExtractor) {
    super.section(htmlExtractor);
    return this;
  }

  public LinkExtractor matching(String matchingExpression) {
    this.matcher = matchingExpression;
    return this;
  }

    public LinkExtractor linkProcessor(LinkProcessor linkProcessor) {
        this.linkProcessor = linkProcessor;
        return this;
    }

    public LinkExtractor associateHtmlSourceWithLink() {
        this.associateHtmlSourceWithLink = true;
        return this;
    }

    private List<Link> extractLinks() {
    List<Link> links = new ArrayList<Link>();
        Source source = getSource();
    List<Element> as = source.getAllElements(HTMLElementName.A);
    for (Element linkElement : as) {
      if(styleClass != null && !styleClass.isEmpty()) {
        String classValue = linkElement.getAttributeValue("class");
        if(classValue == null || !classValue.contains(styleClass))
          continue;
      }
      String text = linkElement.getTextExtractor().toString();
      String href = linkElement.getAttributeValue("href");
      String name = linkElement.getAttributeValue("name");
      if (href != null || name != null) {
        Link.Builder linkBuilder = new Link.Builder().label(text).href(href).name(name).baseUrl(baseUrl);
                if (associateHtmlSourceWithLink) {
                    linkBuilder.source(source);
                }
                Link link = linkBuilder.build();
        log.debug("constructed link: {} from {} must match: {}", new Object[] { link, linkElement, matcher });
        if (matcher == null || (link.getHref() != null && link.getHref().contains(matcher))) {
                    if (linkProcessor != null) {
                        Link processedLink = linkProcessor.processAddedLink(link, linkElement, getSource());
                        if (processedLink != null) {
                            links.add(processedLink);
                        }
                    } else {
              links.add(link);
                    }
        }
      }
    }
    return links;
  }

    public void baseUrl(URL baseUrl) {
        this.baseUrl = baseUrl;
    }


    /**
     * Add ability to inject link processor which may either transform the link or filter it out if it does not match
     * certain criteria
     */
    public static interface LinkProcessor {
        /**
         *
         * @param link link which was found by LinkExtractor
         * @param linkElement instance of {@link net.htmlparser.jericho.Element} from which link was created
         * @param source parent document in which "link" was found - {@link Source}
         * @return instance of Link to be added to the list of extracted links or null if this link should not be added
         */
        Link processAddedLink(Link link, Element linkElement, Source source);
    }
}
TOP

Related Classes of com.ontometrics.scraper.extraction.LinkExtractor$LinkProcessor

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.