package com.ontometrics.scraper.extraction;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.URL;
import java.text.MessageFormat;
import java.util.ArrayList;
import java.util.List;
import net.htmlparser.jericho.Element;
import net.htmlparser.jericho.HTMLElementName;
import net.htmlparser.jericho.Source;
import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.ontometrics.scraper.OutputFormats;
import com.ontometrics.scraper.PairedTags;
import com.ontometrics.scraper.TagOccurrence;
import com.ontometrics.scraper.util.ScraperUtil;
/**
* Does the work of iteratively extracting portions of the page.
* <p>
* Note that once the requested elements are specified, this class is held in
* the scraper and can be successively invoked for jobs that involve iteration
* (e.g. paging).
*
* @author Rob
*/
public class Extractor {
private static final Logger log = LoggerFactory.getLogger(Extractor.class);
/**
* Raw content source
*/
private URL url;
/**
* The type of output we should return upon extraction.
*/
private OutputFormats outputFormat = OutputFormats.Html;
/**
* Provides means of extracting just an element with a given id.
*/
private String idToGet;
/**
* Provides means of extracting element(s) with a given class.
*/
private String classToGet;
/**
* Which occurrence of the tag should we extract? (Note that this is
* 0-indexed, so the first occurrence is 0.)
*/
private int occurrence = 0;
/**
* We collect the desired tags, along with any associated occurrence. For
* example, in the DSL we might want to support calls like
* table(3).row(2).cell(3) to get something out of the 3rd table on the
* page, its 2nd row and 3rd cell. (See the usage sketch following
* {@link #table(String)}.)
*/
private List<TagOccurrence> tagsToGet;
/**
* Provides a means of defining paired tags that contain fields.
* <p>
* For example, if you have a page where the labels are all wrapped with h4
* and the fields with dd, you would add a pair like this:
* <p>
* <code>
* scraper.extractor().pair(HTMLElementName.H4, HTMLElementName.DD).getFields()
* </code>
* <p>
* Note: The extractor will extract nothing if the pairs are not contiguous
* and matching.
*/
private List<PairedTags> fieldPairs;
/**
* The name of a parameter to be extracted from links, if any. Eventually,
* we might want to make link extraction its own sub-DSL.
*/
private String parameter;
private String matchingPattern;
private TagOccurrence afterTagOccurrence;
private List<FieldToGet> fieldsToGet = new ArrayList<FieldToGet>();
private boolean useDefaultFieldExtractor = true;
public Extractor() {
this.tagsToGet = new ArrayList<TagOccurrence>();
this.fieldPairs = new ArrayList<PairedTags>();
}
public Extractor table(int occurrence) {
addTagToGet(HTMLElementName.TABLE, occurrence);
return this;
}
/**
* Provides a means of matching a table based on a string it contains.
*
* @param match
* the string to match, case sensitive
* @return this extractor; on execution, the whole matching table tag with
* all its contents is extracted
*/
public Extractor table(String match) {
addTagToGet(HTMLElementName.TABLE, match);
return this;
}
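/*
* A minimal usage sketch for the table selectors; the URL and the matched
* string below are hypothetical, purely for illustration:
*
*   URL page = new URL("http://example.com/listing.html");
*   String thirdTable = new Extractor().url(page).table(3).execute();
*   String awardTable = new Extractor().url(page).table("Award Amount").execute();
*/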
public Extractor url(URL url) {
this.url = url;
return this;
}
public Extractor id(String id) {
this.idToGet = id;
return this;
}
public Extractor ofClass(String className) {
this.classToGet = className;
return this;
}
public Extractor ofClass(String className, int occurrence) {
this.classToGet = className;
this.occurrence = occurrence;
return this;
}
public Extractor links() {
addTagToGet(HTMLElementName.A);
return this;
}
public Extractor parameter(String parameterName) {
this.parameter = parameterName;
return this;
}
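/*
* Sketch of pulling a query parameter out of each matching link; the URL
* and the parameter name are hypothetical:
*
*   List<String> ids = new Extractor()
*     .url(new URL("http://example.com/opportunities.html"))
*     .links()
*     .matching("oppId")
*     .parameter("oppId")
*     .getResults();
*/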
/**
* Call this method to have the scrape performed and the extractions
* returned in the desired format {@link #outputFormat}.
*
* @return the items from the {@link #url} that were prescribed by the
* various manipulators
* @throws IOException
*/
public String execute() throws IOException {
String result = "";
Source source = new Source(url);
source.fullSequentialParse();
// log.debug("parsed source: {}", source.toString());
if (idToGet != null) {
result = source.getElementById(idToGet).getTextExtractor().toString();
} else if (classToGet != null) {
List<Element> classElements = source.getAllElementsByClass(classToGet);
if (occurrence > 0) {
result = classElements.get(occurrence).getTextExtractor().toString();
} else {
for (Element element : classElements) {
result += element.getTextExtractor().toString();
}
log.debug("extracted: {} from class: {}", result, classToGet);
}
} else {
if (outputFormat == OutputFormats.Text) {
result = source.getTextExtractor().toString();
} else if (outputFormat == OutputFormats.Html) {
result = source.toString();
}
for (TagOccurrence toGet : tagsToGet) {
if (toGet.getMatching() != null) {
result = extractTagMatching(source.toString(), toGet);
} else {
result = extractTagText(source.toString(), toGet);
}
}
}
return result;
}
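/*
* Examples of the whole-page extractions handled by execute(); the URL, id
* and class names are hypothetical:
*
*   URL page = new URL("http://example.com/detail.html");
*   String pageText = new Extractor().url(page).asText().execute();
*   String mainDiv = new Extractor().url(page).id("main").execute();
*   String summaries = new Extractor().url(page).ofClass("summary").execute();
*/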
/**
* Prunes the raw URL content down based on the manipulators that have been
* used to specify subelements, then extracts links and potentially
* parameters from them.
*
* @return a list of the extracted values, e.g. link hrefs or the parameter
* values taken from them
* @throws IOException
*/
public List<String> getResults() throws IOException {
log.debug("extracting results from url: {}", url);
List<String> results = new ArrayList<String>();
Source source = new Source(url);
source.fullSequentialParse();
String content = source.toString();
List<Element> currentElements = null;
for (TagOccurrence toGet : tagsToGet) {
log.debug("toGet = {}", toGet);
if (toGet.getOccurrence() > 0) {
content = extractTagText(content, toGet);
} else if (toGet.getMatching() != null) {
content = extractTagMatching(source.toString(), toGet);
} else if (toGet.getTag().equals("a")) {
source = new Source(content);
currentElements = source.getAllElements(HTMLElementName.A);
for (Element element : currentElements) {
String href = element.getAttributeValue("href");
if (href != null) {
if (matchingPattern == null || href.contains(matchingPattern)) {
results.add(href);
}
}
}
}
}
if (this.parameter != null) {
List<String> foundIds = new ArrayList<String>();
for (String foundLink : results) {
String found = ScraperUtil.extractParameter(foundLink, parameter);
if (found != null) {
foundIds.add(found);
}
}
results = foundIds;
}
return results;
}
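/*
* Sketch of scoping link extraction to a particular table before the hrefs
* are collected (the URL is hypothetical):
*
*   List<String> links = new Extractor()
*     .url(new URL("http://example.com/results.html"))
*     .table(3)
*     .links()
*     .getResults();
*/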
private Source pruneFrom(Source source, TagOccurrence afterTagOccurrence) {
String sourceHtml = source.toString();
String endAfterTag = "</" + afterTagOccurrence.getTag() + ">";
for (int i = 0; i < afterTagOccurrence.getOccurrence(); i++) {
// advance past this closing tag entirely, not just past its '<'
sourceHtml = sourceHtml.substring(sourceHtml.indexOf(endAfterTag) + endAfterTag.length());
}
String afterSource = sourceHtml;
Source newSource = new Source(afterSource);
newSource.fullSequentialParse();
return newSource;
}
public Extractor asText() {
this.outputFormat = OutputFormats.Text;
return this;
}
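/**
* Extracts label/value fields from the page, applying the default table and
* definition list extraction (unless disabled), any class- or tag-based
* manipulators, the optional {@link #after(String, int)} pruning, the
* individually requested fields, and the configured label/field tag pairs.
*
* @return all fields that were found
* @throws IOException
*/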
public List<Field> getFields() throws IOException {
List<Field> extractedFields = new ArrayList<Field>();
// fill in from the already extracted HTML..
if (isUsingDefaultFieldExtractor()) {
extractedFields.addAll(defaultFieldExtractions());
}
if (this.classToGet != null) {
Source source = new Source(url);
source.fullSequentialParse();
List<Element> elements = source.getAllElementsByClass(classToGet);
String text = elements.get(0).toString();
String[] fields = text.split("<br\\s*/?>");
log.debug("found {} candidate fields", fields.length);
for (String field : fields) {
Source fieldSource = new Source(field);
field = fieldSource.getTextExtractor().toString();
String[] fieldParts = field.split(":", 2);
if (fieldParts.length < 2) {
continue; // not a label: value pair
}
log.debug("{} : {}", fieldParts[0], fieldParts[1]);
extractedFields.add(new ScrapedField(fieldParts[0], fieldParts[1]));
}
}
Source source = new Source(url);
for (TagOccurrence tagOccurrence : this.tagsToGet) {
// log.debug("extracting fields using tag: {}", tagOccurrence);
source.fullSequentialParse();
if (!(tagOccurrence.getTag().contains(HTMLElementName.TABLE) || tagOccurrence.getTag().contains(
HTMLElementName.A))) {
throw new IllegalStateException(MessageFormat.format(
"Asked to extract tag: {0}, only know how to extract fields from tables.",
tagOccurrence.getTag()));
} else {
if (isAttemptingToMatchSpecificTable(tagOccurrence)) {
source = new Source(extractTagText(source.toString(), tagOccurrence));
extractedFields.addAll(extractFieldsFromTable(source.toString()));
} else if (tagOccurrence.getTag().equals(HTMLElementName.TABLE)) {
extractedFields.addAll(extractFieldsFromTable(source.toString()));
} else {
extractedFields = extractLinksFromList(source.toString());
}
}
}
source = new Source(url);
source.fullSequentialParse();
if (this.afterTagOccurrence != null) {
source = pruneFrom(source, afterTagOccurrence);
}
for (FieldToGet fieldToGet : fieldsToGet) {
String value = "";
if (fieldToGet.getSearchType() == FieldSearchType.Tag) {
value = source.getAllElements(fieldToGet.getLabel()).get(0).getTextExtractor().toString();
}
extractedFields.add(new ScrapedField(fieldToGet.getFieldname(), value));
}
for (PairedTags tagPair : this.fieldPairs) {
List<Element> labels = source.getAllElements(tagPair.getLabelTag());
List<Element> fields = source.getAllElements(tagPair.getFieldTag());
removeInvalidFields(fields);
int fieldCount = Math.min(labels.size(), fields.size());
for (int i = 0; i < fieldCount; i++) {
String label = labels.get(i).getTextExtractor().toString().trim();
String field = "";
if (fieldHasMultipleValues(fields.get(i).toString())) {
field = delimitFieldValues(fields.get(i).toString());
} else {
field = getValueFieldText(fields.get(i));
}
log.debug("outputting pair: {} : {}", label, field);
extractedFields.add(new ScrapedField(label, field));
}
}
return extractedFields;
}
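/*
* Sketch of field extraction with paired label/value tags, along the lines
* of the example in the fieldPairs documentation above (the URL is
* hypothetical):
*
*   List<Field> fields = new Extractor()
*     .url(new URL("http://example.com/detail.html"))
*     .setUseDefaultFieldExtractor(false)
*     .pair(HTMLElementName.H4, HTMLElementName.DD)
*     .getFields();
*/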
private boolean isAttemptingToMatchSpecificTable(TagOccurrence tagOccurrence) {
return tagOccurrence.getOccurrence() > 0 || !StringUtils.isEmpty(tagOccurrence.getMatching());
}
private List<Field> extractLinksFromList(String html) {
log.debug("extracting links...");
List<Field> fields = new ArrayList<Field>();
Source source = new Source(html);
source.fullSequentialParse();
List<Element> links = source.getAllElements(HTMLElementName.A);
for (Element a : links) {
String label = a.getTextExtractor().toString();
String href = a.getAttributeValue("href");
if (href != null && (matchingPattern == null || href.contains(matchingPattern))) {
fields.add(new Link(label, href));
}
}
return fields;
}
public boolean isUsingDefaultFieldExtractor() {
return this.useDefaultFieldExtractor;
}
public Extractor setUseDefaultFieldExtractor(boolean useDefaultExtractor) {
this.useDefaultFieldExtractor = useDefaultExtractor;
return this;
}
private boolean fieldHasMultipleValues(String fieldValue) {
Source source = new Source(fieldValue);
source.fullSequentialParse();
return source.getAllElements(HTMLElementName.BR).size() > 1;
}
private String delimitFieldValues(String source) {
Source result = new Source(source.replace("<br>", ";").replace("<br/>", ";"));
return getValueFieldText(result.getFirstElement());
}
private void removeInvalidFields(List<Element> fields) {
java.util.Iterator<Element> iterator = fields.iterator();
while (iterator.hasNext()) {
Element field = iterator.next();
if (!isAField(field.toString())) {
log.debug("pruning invalid field: {}", field);
iterator.remove();
}
}
}
private boolean isAField(String extract) {
return !extract.contains(HTMLElementName.TABLE);
}
public Extractor matching(String pattern) {
this.matchingPattern = pattern;
return this;
}
private String extractTagText(String html, TagOccurrence tagOccurrence) {
String result = null;
if (StringUtils.isBlank(tagOccurrence.getMatching())) {
result = ScraperUtil.extract(html, tagOccurrence.getTag(), tagOccurrence.getOccurrence());
} else {
result = extractTagMatching(html, tagOccurrence);
}
return result;
}
private String extractTagMatching(String html, TagOccurrence toGet) {
log.debug("looking for {} in tags: {}", toGet.getMatching(), toGet.getTag());
String found = null;
Source source = new Source(html);
source.fullSequentialParse();
List<Element> elements = source.getAllElements(toGet.getTag());
for (Element element : elements) {
String elementText = element.getTextExtractor().toString();
if (elementText.contains(toGet.getMatching())) {
found = element.toString();
}
}
return found;
}
private void addTagToGet(String tag, int occurrence, String match) {
tagsToGet.add(new TagOccurrence.Builder().tag(tag).occurrence(occurrence).matching(match).build());
}
private void addTagToGet(String tag) {
addTagToGet(tag, 0);
}
private void addTagToGet(String tag, String match) {
addTagToGet(tag, 0, match);
}
private void addTagToGet(String tag, int occurrence) {
addTagToGet(tag, occurrence, null);
}
public Extractor pair(String labelTag, String fieldTag) {
this.fieldPairs.add(new PairedTags(labelTag, fieldTag));
return this;
}
public Extractor after(String tag, int occurrence) {
this.afterTagOccurrence = new TagOccurrence.Builder().tag(tag).occurrence(occurrence).build();
return this;
}
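/*
* after() prunes everything up to and including the nth closing tag before
* the field/pair extraction runs, e.g. to skip two layout tables at the top
* of a page (hypothetical usage):
*
*   List<Field> fields = new Extractor()
*     .url(new URL("http://example.com/detail.html"))
*     .setUseDefaultFieldExtractor(false)
*     .after(HTMLElementName.TABLE, 2)
*     .pair(HTMLElementName.H4, HTMLElementName.DD)
*     .getFields();
*/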
public Extractor field(String fieldName, String tag) {
this.fieldsToGet.add(new FieldToGet(fieldName, tag));
return this;
}
public Extractor getFieldFromLabel(String string) {
// TODO: not yet implemented; this is currently a no-op in the DSL.
return this;
}
/**
* We will refactor this away when we have specialized extractors that are
* chained.
*
* @return all fields that were found
* @throws IOException
*/
private List<Field> defaultFieldExtractions() throws IOException {
List<Field> extractedFields = new ArrayList<Field>();
Source source = null;
try {
source = new Source(this.url);
} catch (FileNotFoundException e) {
log.info("Error while sourcing URL = {}, error description = {}", this.url, e.toString());
return new ArrayList<Field>();
}
source.fullSequentialParse();
List<Element> tables = source.getAllElements(HTMLElementName.TABLE);
for (Element table : tables) {
extractedFields.addAll(extractFieldsFromTable(table.toString()));
}
List<Element> dls = source.getAllElements(HTMLElementName.DL);
for (Element dl : dls) {
extractedFields.addAll(extractFieldsFromDL(dl.toString()));
}
return extractedFields;
}
private List<Field> extractFieldsFromTable(String html) {
// log.debug("extracting fields from table: {}", html);
List<Field> extractedFields = new ArrayList<Field>();
Source source = new Source(html);
source.fullSequentialParse();
List<Element> cells = source.getAllElements(HTMLElementName.TD);
int rows = source.getAllElements(HTMLElementName.TR).size();
log.debug("found {} cells in {} rows", cells.size(), rows);
if (cells.size() == (rows * 2)) {
Field lastField = null;
log.debug("cells.size: {}", cells.size());
for (int i = 0; i < cells.size(); i++) {
Element labelElement = cells.get(i);
Element valueElement = cells.get(++i);
String label = labelElement.getTextExtractor().toString().trim().replaceAll(":$", "");
String value = getValueFieldText(valueElement);
log.debug("found field: {}={}", label, value);
if (StringUtils.isEmpty(label) && lastField != null) {
lastField.addValue(value);
} else {
lastField = new ScrapedField(label, value);
extractedFields.add(lastField);
}
}
}
return extractedFields;
}
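/*
* extractFieldsFromTable only fires when every row is a label/value pair
* (exactly two cells per row). A hypothetical input such as:
*
*   <table>
*     <tr><td>Name:</td><td>Alice</td></tr>
*     <tr><td>Email:</td><td><a href="mailto:alice@example.com">e-mail</a></td></tr>
*   </table>
*
* would yield the fields Name=Alice and Email=mailto:alice@example.com
* (mailto links are captured via their href; see getValueFieldText below).
*/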
private String getValueFieldText(Element valueElement) {
String result = "";
List<Element> subElements = valueElement.getAllElements(HTMLElementName.A);
if (subElements.size() > 0) {
log.debug("found a tag inside field!");
for (Element element : subElements) {
String href = element.getAttributeValue("href");
if (href != null && href.contains("mailto")) {
log.debug("found mailto");
result = href;
} else {
result = element.getTextExtractor().toString();
}
}
} else {
result = valueElement.getTextExtractor().toString();
}
return result;
}
private List<Field> extractFieldsFromDL(String html) {
List<Field> extractedFields = new ArrayList<Field>();
Source source = new Source(html);
source.fullSequentialParse();
List<Element> labels = source.getAllElements(HTMLElementName.DT);
List<Element> values = source.getAllElements(HTMLElementName.DD);
int cellCount = Math.min(labels.size(), values.size());
for (int i = 0; i < cellCount; i++) {
String label = labels.get(i).getTextExtractor().toString().trim().replaceAll(":$", "");
Element valueElement = values.get(i);
log.debug("looking at value element: {}", valueElement);
String value = getValueFieldText(valueElement);
extractedFields.add(new ScrapedField(label, value));
}
return extractedFields;
}
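/*
* extractFieldsFromDL pairs each dt label with the dd at the same index, so
* a hypothetical definition list such as:
*
*   <dl>
*     <dt>Status:</dt><dd>Open</dd>
*     <dt>Category:</dt><dd>Research</dd>
*   </dl>
*
* yields the fields Status=Open and Category=Research.
*/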
}