Package com.findwise.hydra.stage.webstages

Source Code of com.findwise.hydra.stage.webstages.AbstractJsoupSelector

package com.findwise.hydra.stage.webstages;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;

import com.findwise.hydra.stage.AbstractProcessStage;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import com.findwise.hydra.local.LocalDocument;
import com.findwise.hydra.stage.Parameter;
import com.findwise.hydra.stage.RequiredArgumentMissingException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
* This stage uses JSoup and its DOM selector queries (http://jsoup.org
* contains selector documentation) to parse HTML code from a specified field
* stored in the LocalDocument in hydra.
*
* It provides all the processing and selector operations for JSoup so the
* extending classes can retrieve anything from the selected content.
*
* selector = the jsoup selector query
* fieldname = the name of the field in which the result of the query is stored (Hydra's MongoDB storage)
* singlevalue = if you want to only get the first element found or all of them
*
* The example given would get the first h1 tag in the HTML and put it in the
* Hydra store with the fieldname "h1". Then it would get all the h2 tags and
* store them in Hydra store with fieldname "h2"
*
* @author jens.bengtsson
*
*/
public abstract class AbstractJsoupSelector extends AbstractProcessStage {
    private static Logger logger = LoggerFactory.getLogger(AbstractJsoupSelector.class);

  @Parameter(name = "htmlField", required = true, description = "The input field containing HTML/XML, or a listfield with fields containing HTML/XML")
  private String htmlField;

  @Parameter(name = "jSoupConfigs", required = true, description = "List of configs, where each config is a map with at least the keys 'selector', 'fieldname' and optionally 'singlevalue' (only output the first selected element; false if omitted)")
  private List<Map<String, String>> jSoupConfigs;

  @Override
  public void init() throws RequiredArgumentMissingException {
   
    if (this.htmlField == null || htmlField.length() == 0)
      throw new RequiredArgumentMissingException("htmlField missing");

    if (jSoupConfigs == null || jSoupConfigs.size() == 0)
      throw new RequiredArgumentMissingException("jSoupConfigs missing");
   
  }

  @Override
  public void process(LocalDocument doc) {
   
        Object value = doc.getContentField(htmlField);
       
        if(value instanceof String){
            String content = (String) doc.getContentField(htmlField);
            jsoupParse(doc, content, false);
        } else if (value instanceof List<?>){
          boolean append = false;
            for (Object val : (List<?>)value) {
        if (val instanceof String) {
                    jsoupParse(doc, (String)val, append);
                } else {
                    logger.warn("Field " + htmlField + " was a list but not a List<String>");
                }
        if (!append) append = true;
            }
        } else {
            logger.warn("Field " + htmlField + " did not contain String or List<String>");
        }
    }

    private void jsoupParse(LocalDocument doc, String content, boolean append) {
      if(content != null) {
      Document jsoupDoc = Jsoup.parse(content);
     
      for (Map<String, String> jsoupConfig : jSoupConfigs) {
        String fieldName = jsoupConfig.get("fieldname");
        if (jsoupConfig.get("singlevalue") != null && jsoupConfig.get("singlevalue").equalsIgnoreCase("true")){
          String fieldContent = getJsoupElement(jsoupDoc, jsoupConfig);
          if (fieldContent != null && !fieldContent.isEmpty()){
            doc.putContentField(fieldName, fieldContent);
          }
        } else {
          List<String> fieldContent = new ArrayList<String>();
          List<String> selectedContent = getJsoupElements(jsoupDoc, jsoupConfig);
          if (append) {
            @SuppressWarnings("unchecked")
            List<String> oldFieldContent = doc.hasContentField(fieldName) ? (List<String>)doc.getContentField(fieldName) : new ArrayList<String>();
            fieldContent.addAll(oldFieldContent);
          }
          fieldContent.addAll(selectedContent);
          if (!fieldContent.isEmpty()){
            doc.putContentField(fieldName, fieldContent);
          }
        }
      }
    }
   
    }
   
   
  /**
   * Takes a jsoup Document and gets all the elements that matches the
   * selector and returns a list of the selected content
   *
   * @param jsoupDoc
   * @param selector
   * @return A list of strings containing the text values of the elements
   *         selected
   */
  abstract ArrayList<String> getJsoupElements(Document jsoupDoc,
      Map<String, String> jSoupConfig);

  /**
   * Takes a jsoup Document and gets the first instance that matches the
   * selector and returns whatever selected
   *
   * @param jsoupDoc
   * @param selector
   * @return The text (without HTML code), or the HTML representation if getHTML is true, from the selected element
   */
  abstract String getJsoupElement(Document jsoupDoc, Map<String, String> jSoupConfig);

  public List<Map<String, String>> getjSoupConfigs() {
    return jSoupConfigs;
  }

  public void setjSoupConfigs(List<Map<String, String>> jSoupConfigs) {
    this.jSoupConfigs = jSoupConfigs;
  }

  public String getHtmlField() {
    return htmlField;
  }

  public void setHtmlField(String htmlField) {
    this.htmlField = htmlField;
  }
}
TOP

Related Classes of com.findwise.hydra.stage.webstages.AbstractJsoupSelector

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.