Package com.ontometrics.scraper.extraction

Source Code of com.ontometrics.scraper.extraction.DefaultFieldExtractor

package com.ontometrics.scraper.extraction;

import java.net.URL;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

import com.ontometrics.scraper.TagAttributeFieldExtractor;
import net.htmlparser.jericho.Element;
import net.htmlparser.jericho.HTMLElementName;
import net.htmlparser.jericho.Segment;
import net.htmlparser.jericho.Source;
import net.htmlparser.jericho.Tag;

import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.ontometrics.scraper.PairedTags;
import com.ontometrics.scraper.TagOccurrence;
import com.ontometrics.scraper.util.ScraperUtil;

/**
* Provides a means of pulling fields out of a page.
* <p>
* This extractor will work on tables and on DLs. The tables have to have 2
* columns (TDs). This will take the first as the label and the second as the
* value. Same is done with the DLs: the DT is read as the label and the DD as
* the value.
* <p>
* One way to use this tool is to just start with this default extractor and
* then start removing the elements that are not helping.
*
* @author Rob
*/
public class DefaultFieldExtractor extends BaseExtractor implements FieldExtractor<DefaultFieldExtractor> {

  private static final Logger log = LoggerFactory.getLogger(DefaultFieldExtractor.class);

  private List<DesignatedField> fieldsToGet = new ArrayList<DesignatedField>();

  private List<PairedTags> pairedTagsToGet = new ArrayList<PairedTags>();

    private List<TagAttributeFieldExtractor> tagAttributesToGet = new ArrayList<TagAttributeFieldExtractor>();

    @Override
    public Source getSource() {
        return super.getSource();
    }

    @Override
  public DefaultFieldExtractor url(URL url) {
    super.url(url);
    return this;
  }

  /*
   * (non-Javadoc)
   *
   * @see com.ontometrics.scraper.extraction.FieldExtractor#getFields()
   */
  @Override
  public List<Field> getFields() {
    List<Field> extractedFields = new ArrayList<Field>();
    extractedFields.addAll(extractFieldsFromULs());
    extractedFields.addAll(extractFieldsFromTables());
    extractedFields.addAll(extractFieldsFromDLs());
    extractedFields.addAll(extractDesignatedFields());
    extractedFields.addAll(extractFieldsFromPairTags());
        extractedFields.addAll(extractFieldsFromTagAttributes());
    return extractedFields;
  }

  /**
   * The pair function will delimit field values that have new lines (<BR>
   * s) with semicolons.
   *
   * @return
   */
  private List<Field> extractFieldsFromPairTags() {
    List<Field> extractedFields = new ArrayList<Field>();
    for (PairedTags pairedTags : this.pairedTagsToGet) {
      List<Element> labels = getSource().getAllElements(pairedTags.getLabelTag());
      List<Element> fields = getSource().getAllElements(pairedTags.getFieldTag());

      removeInvalidFields(fields);

      int fieldCount = Math.min(labels.size(), fields.size());
      for (int i = 0; i < fieldCount; i++) {
        String label = labels.get(i).getTextExtractor().toString().trim();
        String field = "";
        if (fieldHasMultipleValues(fields.get(i).toString())) {
          field = delimitFieldValues(fields.get(i).toString());
        } else {
          field = getValueFieldText(fields.get(i));
        }
        log.debug("outputting pair: {} : {}", label, field);
        extractedFields.add(new ScrapedField(label, field));
      }

    }
    return extractedFields;
  }

    /**
     * Extracts the fields from the tag attributes
     * @return list of extracted fields
     */
    private List<Field> extractFieldsFromTagAttributes() {
        if (!tagAttributesToGet.isEmpty()) {
            List<Field> extractedFields = new ArrayList<Field>();
            Source source = getSource();
            for (TagAttributeFieldExtractor extractor : this.tagAttributesToGet) {
                List<Element> elements = source.getAllElements(extractor.getTagName());
                if (elements != null && !elements.isEmpty()) {
                    for (Element element : elements) {
                        Field field = extractor.extractField(element);
                        if (field != null) {
                            extractedFields.add(field);
                        }
                    }
                }
            }
            return extractedFields;
        }
        return Collections.emptyList();
    }

  private List<Field> extractDesignatedFields() {
    List<Field> extractedFields = new ArrayList<Field>();

    for (DesignatedField designatedField : this.fieldsToGet) {
            log.debug("designated field: {}", designatedField);
            log.debug("tag to get value from: {}", designatedField.getTagToGetValueFrom());
      Element elementWithValue = ScraperUtil.extract(getSource(), designatedField.getTagToGetValueFrom());
            log.debug("element with value: {}", elementWithValue);
      String value = getValueFieldText(elementWithValue);
      log.debug("looking for field: {}, value: {}", designatedField.getLabel(), value);
      extractedFields.add(new ScrapedField(designatedField.getLabel(), value));
    }

    return extractedFields;
  }

  /*
   * (non-Javadoc)
   *
   * @see
   * com.ontometrics.scraper.extraction.BaseExtractor#source(com.ontometrics
   * .scraper.extraction.HtmlExtractor)
   */
  public DefaultFieldExtractor source(HtmlExtractor htmlExtractor) {
    super.source(htmlExtractor);
    return this;
  }

    public DefaultFieldExtractor source(Source source){
        super.source(source);
        return this;
    }

  @Override
  public DefaultFieldExtractor section(HtmlExtractor htmlExtractor) {
    super.section(htmlExtractor);
    return this;
  }

  public DefaultFieldExtractor add(PairedTags pairedTags) {
    this.pairedTagsToGet.add(pairedTags);
    return this;
  }

    public DefaultFieldExtractor add(TagAttributeFieldExtractor tagAttributeFieldExtractor) {
        this.tagAttributesToGet.add(tagAttributeFieldExtractor);
        return this;
    }

    private List<Field> extractFieldsFromDLs() {
    List<Field> extractedFields = new ArrayList<Field>();
    List<Element> dls = getSource().getAllElements(HTMLElementName.DL);
    for (Element dt : dls) {
      extractedFields.addAll(extractFieldsFromDL(dt.toString()));
    }
    return extractedFields;
  }

  private List<Field> extractFieldsFromTables() {
    List<Field> extractedFields = new ArrayList<Field>();
    List<Element> tables = getSource().getAllElements(HTMLElementName.TABLE);
    log.debug("found {} tables to try and find fields in {}", tables.size(), getSource().toString());

    for (Element table : tables) {
      log.debug("this table = {}", table.getContent());
      extractedFields.addAll(extractFieldsFromTable(table.toString()));
    }
    return extractedFields;
  }

  private List<Field> extractFieldsFromULs() {
    List<Field> extractedFields = new ArrayList<Field>();
    List<Element> lists = getSource().getAllElements(HTMLElementName.UL);
    log.debug("found {} ULs to try and find fields in", lists.size());
    for (Element list : lists) {
      extractedFields.addAll(extractFieldsFromUL(list.toString()));
    }
    return extractedFields;
  }

  private List<Field> extractFieldsFromTable(String html) {
    log.debug("extracting fields from table: {}", html);
    List<Field> extractedFields = new ArrayList<Field>();
    Source source = new Source(html);
    source.fullSequentialParse();
    int cellCount = source.getAllElements(HTMLElementName.TD).size();
    int rowCount = source.getAllElements(HTMLElementName.TR).size();
    log.debug("found {} cells in {} rows", cellCount, rowCount);
    if (cellCount == (rowCount * 2)) {
      Field lastField = null;
      log.debug("cells.size: {}", cellCount);
      List<Element> cells = source.getAllElements(HTMLElementName.TD);
      for (int i = 0; i < cellCount; i++) {
        Element labelElement = cells.get(i);
        Element valueElement = cells.get(++i);
        String label = labelElement.getTextExtractor().toString().trim().replaceAll(":$", "");
        String value = getValueFieldText(valueElement);
        log.debug("found field: {}={}", label, value);
        if (StringUtils.isEmpty(label) && lastField != null) {
          lastField.addValue(value);
        } else {
          lastField = new ScrapedField(label, value);
          extractedFields.add(lastField);
        }
      }
    } else {
      List<String> headers = new ArrayList<String>();
      List<Element> rows = source.getAllElements(HTMLElementName.TR);
      for (Element row : rows) {
        List<Element> headerElements = row.getAllElements(HTMLElementName.TH);
        if (headerElements.size() > 0) {
          headers.clear();
        }
        for (Element headerElement : headerElements) {
          String header = headerElement.getTextExtractor().toString();
          headers.add(header);
          log.debug("header text: {}", header);
        }
        List<Element> cells = row.getAllElements(HTMLElementName.TD);
        if (cells.size() > 0) {
          for (int n = headers.size(); n < cells.size(); n++) {
            headers.add("col" + n);
          }
          int index = 0;
          for (Element cell : cells) {
            String label = headers.get(index++);
            String value = getValueFieldText(cell);
            extractedFields.add(new ScrapedField(label, value));
          }
          headers.clear();
        }
      }
    }
    return extractedFields;
  }

  private List<Field> extractFieldsFromDL(String html) {
    List<Field> extractedFields = new ArrayList<Field>();
    Source source = new Source(html);
    source.fullSequentialParse();
    List<Element> labels = source.getAllElements(HTMLElementName.DT);
    List<Element> values = source.getAllElements(HTMLElementName.DD);
    int cellCount = Math.min(labels.size(), values.size());
    for (int i = 0; i < cellCount; i++) {
      String label = labels.get(i).getTextExtractor().toString().trim().replaceAll(":$", "");
      Element valueElement = values.get(i);
      log.debug("looking at value element: {}", valueElement);
      String value = getValueFieldText(valueElement);
      extractedFields.add(new ScrapedField(label, value));
    }
    return extractedFields;
  }

  private List<Field> extractFieldsFromUL(String html) {
    List<Field> extractedFields = new ArrayList<Field>();
    Source source = new Source(html);
    source.fullSequentialParse();
    List<Element> lis = source.getAllElements(HTMLElementName.LI);
    for (Element li : lis) {
      log.debug("looking at li: {} w/text: {}", li, li.getTextExtractor().toString());
      String[] parts = li.getTextExtractor().toString().split(":");
      if (parts.length == 2) {
        Field field = new ScrapedField(parts[0], parts[1]);
        extractedFields.add(field);
        log.debug("found <li> to process: {}, added field: {}", li, field);
      } else if (tagsWithSpecificTagRemoved(HTMLElementName.BR, li.getAllTags()).size() == 4) {
        Tag enclosingTag = li.getAllTags().get(1);
        log.debug("enclosing tag: {}", enclosingTag);
        log
            .debug("first element of enclosing tag: {}", enclosingTag
                .getElement()
                .getTextExtractor()
                .toString());
        String tagText = enclosingTag
            .getElement()
            .getRenderer()
            .setMaxLineLength(Integer.MAX_VALUE)
            .toString()
            .trim()
            .replaceAll(":$", "");
        String allText = li
            .getRenderer()
            .setMaxLineLength(Integer.MAX_VALUE)
            .toString()
            .trim()
            .replaceAll(":$", "");
        log.debug("enclosing tag text starts at: {}", allText.indexOf(tagText));
        log.debug("tagText (length = {}): {} alltext (length = {}): {}", new Object[] { tagText.length(),
            tagText, allText.length(), allText });
        if (allText.startsWith(tagText)) {
          String valueText = (allText.length() > tagText.length()) ? allText.substring(tagText.length() + 1)
              : "";
          extractedFields.add(new ScrapedField(tagText, valueText));
          log.debug("extracted fields = {}", extractedFields);
        }
      }

    }
    return extractedFields;
  }

  private List<Tag> tagsWithSpecificTagRemoved(String tagNameToRemove, List<Tag> tags) {
    log.debug("Tag name to remove = {}, tags to operate on = {}", tagNameToRemove, tags);
    for (int i = 0; i < tags.size(); i++) {
      Tag currentTag = tags.get(i);
      log.debug("Current tag name = {}", currentTag.getName());
      if (currentTag.getName() == tagNameToRemove) {
        tags.remove(currentTag);
        i--;
      }
    }
    log.debug("returning tags = {}", tags);
    return tags;
  }

  private String getValueFieldText(Element valueElement) {
    String result = "";
        log.debug("valueElement: {}", valueElement);
        if (valueElement!=null){
            List<Element> preTags = valueElement.getAllElements(HTMLElementName.PRE);
            if (preTags.size() > 0) {
                result = preTags.get(0).getContent().toString();
            } else {

                log.debug("parsing element: {}", valueElement.toString());
                Segment valueText = valueElement.getContent();
                int elementsSize = valueText.getAllElements().size();
                String fieldText = "";
                if (elementsSize > 0) {
                    if (valueText.toString().contains("<br")) {
                        String delimitedText = valueText
                                .toString()
                                .replace("<br>", ";")
                                .replace("<br/>", ";")
                                .replace("<br />", ";");
                        log.debug("delimited text: {}", delimitedText);
                        Source newElement = new Source(delimitedText);
                        fieldText = newElement.getTextExtractor().toString();
                        if (fieldText.endsWith(";")) {
                            fieldText = fieldText.substring(0, fieldText.length() - 1).trim();
                        }
                    } else {
                        fieldText = valueText.getTextExtractor().toString();
                    }

                } else {
                    fieldText = valueText.getTextExtractor().toString();
                }
                // check for links..
                List<Element> links = valueElement.getAllElements(HTMLElementName.A);
                if (links.size() > 0) {
                    for (Element link : links) {
                        int linkcount = 0;
                        log.debug("found link inside field: {}", link);
                        if (link.getAttributeValue("href") != null) {
                            fieldText += (linkcount > 0) ? ";" : " ";
                            fieldText += link.getAttributeValue("href");
                            linkcount++;
                        }
                    }
                }

                result = fieldText;
                log.debug("returning value = {}", result);
            }
    }
    return result;
  }

  @Override
  public DefaultFieldExtractor field(String label, String element) {
    this.fieldsToGet.add(new DesignatedField(label, new TagOccurrence.Builder().tag(element).build()));
    return this;
  }

  private void removeInvalidFields(List<Element> fields) {
    java.util.Iterator<Element> iterator = fields.iterator();
    while (iterator.hasNext()) {
      Element field = iterator.next();
      if (!isAField(field.toString())) {
        log.debug("pruning invalid field: {}", field);
        iterator.remove();
      }
    }
  }

  private boolean isAField(String extract) {
    return !extract.contains(HTMLElementName.TABLE);
  }

  private boolean fieldHasMultipleValues(String fieldValue) {
    Source source = new Source(fieldValue);
    source.fullSequentialParse();
    return source.getAllElements(HTMLElementName.BR).size() > 1;
  }

  private String delimitFieldValues(String source) {
    Source result = new Source(source.replace("<br>", ";").replace("<br/>", ";"));
    return getValueFieldText(result.getFirstElement());
  }

  public DefaultFieldExtractor field(String label, ElementIdentifierType identierType, String id) {
    this.fieldsToGet.add(new DesignatedField(label, new TagOccurrence.Builder()
        .elementIdentifierType(identierType)
        .identifier(id)
        .build()));
    return this;
  }

}
TOP

Related Classes of com.ontometrics.scraper.extraction.DefaultFieldExtractor

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.