Source Code of fr.eolya.simplepipeline.stage.DocTextExtractor$HtmlParser

package fr.eolya.simplepipeline.stage;


import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.util.HashMap;


import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringEscapeUtils;




//import fr.eolya.extraction.MultiFormatTextExtractor;
import fr.eolya.extraction.ScriptsWrapper;
import fr.eolya.simplepipeline.config.PipelineConfig;
import fr.eolya.simplepipeline.document.Doc;
import fr.eolya.utils.Base64;
import fr.eolya.utils.Utils;
import fr.eolya.utils.http.HttpUtils;
import fr.eolya.extraction.tika.TikaWrapper;




/*
 * Configuration snippet sample :
 * 
 *  <stage classname="fr.eolya.simplepipeline.stage.DocTextExtractor">
 *    <param name="onaction">add</param>
 *    <param name="scriptspath">/opt/crawler/config/scripts</param>
 *    <param name="url">item_url</param>
 *    <param name="contenttype">item_contenttype</param>
 *      <param name="contentcharset">item_charset</param>
 *      <param name="cleanmethod">item_clean_method</param>
 *    <param name="cleanmethoddefault">boilerpipe_article|boilerpipe_default|boilerpipe_canola</param>
 *    <param name="source">content</param>
 *    <param name="target">text</param>
 *    <param name="pdftotextpath">/usr/bin/pdftotext</param>
 *    <param name="tmppath">/opt/crawler/tmp</param>
 *    <param name="stoppipelineonerror">yes</param>
 *    <param name="stoppipelineonemptytext">yes</param>
 *  </stage>
 */


public class DocTextExtractor extends Stage {


  private String parserLanguage = "";
  private String parserTitle = "";
  private String parserEncoding = "";
  private String parserContentType = "";
  private String parserContentSize = "";
  private String parserDate = "";
  private String parserText = "";
  private String parserImageUrl = "";


  private boolean stopPipelineOnError = false;
  private boolean stopPipelineOnEmptyText = false;
  private String scriptsPath = null;
  private String urlElement = null;
  private String contentTypeElement = null;
  private String contentCharsetElement = null;
  private String cleanMethodDefault = null;
  private String cleanMethodElement = null;
  private String sourceElement = null;
  private String targetElement = null;


  private String tmpPath = null;
  private String pdfToTextPath = null;


  private boolean VERBOSE = true;


  /**
   * Perform initialization.
   */
  public void initialize() {
    super.initialize();


    stopPipelineOnError = PipelineConfig.isEnabled(props.getProperty("stoppipelineonerror"));
    stopPipelineOnEmptyText = PipelineConfig.isEnabled(props.getProperty("stoppipelineonemptytext"));
    scriptsPath = props.getProperty("scriptspath");
    scriptsPath = Utils.getValidPropertyPath(scriptsPath, null, "HOME");
    urlElement = props.getProperty("url");
    contentTypeElement = props.getProperty("contenttype");
    contentCharsetElement = props.getProperty("contentcharset");
    cleanMethodDefault = props.getProperty("cleanmethoddefault");
    cleanMethodElement = props.getProperty("cleanmethod");
    sourceElement = props.getProperty("source");
    targetElement = props.getProperty("target");


    tmpPath = props.getProperty("tmppath");
    tmpPath = Utils.getValidPropertyPath(tmpPath, null, "HOME");


    pdfToTextPath = props.getProperty("pdftotextpath");
    pdfToTextPath = Utils.getValidPropertyPath(pdfToTextPath, null, "HOME");
  }




  @Override
  public void processDoc(Doc doc) throws Exception {


    // Check onaction
    if (!doProcess(doc)) {
      if (nextStage != null)
        nextStage.processDoc(doc);  
      return;
    }


    java.util.Date startTime = new java.util.Date();


    if (logger!=null) logger.log("    text extraction");


    // Input
    String contentType = "";
    String contentCharset = "";
    String cleanMethod = "";
    String source = "";
    String text = "";
    String url = "";
    String accountId = "";


    if (urlElement != null && !"".equals(urlElement)) {
      url = doc.getElementText("//" + urlElement);
    }


    if (contentTypeElement != null && !"".equals(contentTypeElement)) {
      contentType = doc.getElementText("//" + contentTypeElement);
    }


    if (contentCharsetElement != null && !"".equals(contentCharsetElement)) {
      contentCharset = doc.getElementText("//" + contentCharsetElement);
    }


    accountId = doc.getElementText("//account_id");


    String scriptName = "";
    if (accountId!=null && !"".equals(accountId))
      scriptName = ScriptsWrapper.getScriptName (scriptsPath + "/" + accountId, url);


    if (scriptName==null || "".equals(scriptName))
      scriptName = ScriptsWrapper.getScriptName (scriptsPath, url);


    if (scriptName!=null && !"".equals(scriptName) && logger!=null) logger.log("        using script : " + scriptName);


    if (cleanMethodElement != null && !"".equals(cleanMethodElement))
      cleanMethod = doc.getElementText("//" + cleanMethodElement);


    if ("".equals(cleanMethod) && !"".equals(cleanMethodDefault))
      cleanMethod = cleanMethodDefault;


    if (sourceElement != null && !"".equals(sourceElement)) {
      source = doc.getElementText("//" + sourceElement);
      if (source == null)
        source = "";
    }


    boolean success = true;
    try {
      if (!"".equals(source)) {
        String sourceEncoding = doc.getElementAttribute("//" + sourceElement, "encoding");


        InputStream inputSource = null;
        String rawData = null;


        if (contentType.startsWith("text/")) {  // or "application/x-shockwave-flash" ???
          if ("base64".equals(sourceEncoding)) {
            if (contentCharset==null || "".equals(contentCharset)) contentCharset = "UTF-8";
            rawData = IOUtils.toString(new Base64.InputStream(new ByteArrayInputStream(source.getBytes()), Base64.DECODE), contentCharset.toUpperCase());
          } else {
            rawData = StringEscapeUtils.unescapeHtml4(source);
          }
        }
        else {
          inputSource = new ByteArrayInputStream(source.getBytes());
          if ("base64".equals(sourceEncoding))
            inputSource = new Base64.InputStream(inputSource, Base64.DECODE);
        }


        if (inputSource != null || rawData != null) {


          if (inputSource != null) rawData = null;


          try {
            text = extract(inputSource, rawData, contentType, cleanMethod, url, scriptName);
            if (text==null) {
              success = false;
              stageList.setStagesStatus(StageList.STATUS_ERROR);
              if (logger!=null) logger.log("        parsing error (text null)");
              java.util.Date endTime = new java.util.Date();
              processingTime += (endTime.getTime() - startTime.getTime());
              return;              
            }
          }
          catch (Exception e){
            e.printStackTrace();
            success = false;
            if (logger!=null) logger.log("        parsing error");
            return;
          }


          if (text != null && !"".equals(text)) {
            doc.addElement("/job", targetElement, text);
          }
          else {
            if (logger!=null) logger.log("        empty document");
            if (stopPipelineOnEmptyText) {
              java.util.Date endTime = new java.util.Date();
              processingTime += (endTime.getTime() - startTime.getTime());
              return;
            }
          }


          if (parserLanguage!=null && !"".equals(parserLanguage))
            doc.addElement("/job", "parser_language", parserLanguage);


          if (parserTitle!=null && !"".equals(parserTitle))
            doc.addElement("/job", "parser_title", parserTitle);


          if (parserEncoding!=null && !"".equals(parserEncoding))
            doc.addElement("/job", "parser_charset", parserEncoding);


          if (parserContentType!=null && !"".equals(parserContentType))
            doc.addElement("/job", "parser_contentype", parserContentType);


          if (parserContentSize!=null && !"".equals(parserContentSize))
            doc.addElement("/job", "parser_contensize", parserContentSize);


          if (parserDate!=null && !"".equals(parserDate))
            doc.addElement("/job", "parser_date", parserDate);


          if (parserImageUrl!=null && !"".equals(parserImageUrl))
            doc.addElement("/job", "parser_imageurl", parserImageUrl);
        }
      }
    }
    catch (Exception e) {
      e.printStackTrace();
      success = false;
      if (logger!=null) logger.log("        parsing error");
    }


    java.util.Date endTime = new java.util.Date();
    processingTime += (endTime.getTime() - startTime.getTime());


    if (success || !stopPipelineOnError)
      if (nextStage != null)
        nextStage.processDoc(doc);
  }


  private String extract(InputStream input, String rawData, String contentType, String cleanMethod, String url, String scriptName) {
    String text = doExtract(input, rawData, contentType, cleanMethod, url, scriptName);
    if (text==null) return null;


    if (parserTitle==null || "".equals(parserTitle.trim()))            
      parserTitle = HttpUtils.urlGetFileName(url);


    if (text!=null && !"".equals(text)) {
      if (url!=null && !"".equals(url) && scriptName!=null && !"".equals(scriptName)) {
        if (logger!=null && verbose) {
          logger.log("    url          = " + url);
          logger.log("    scripts name = " + scriptName);
        }
        HashMap<String, String> m = ScriptsWrapper.cleanText(url, text, null, scriptName); 
        if (m!=null && m.size()>0) {
          text = m.get("text");
        }
      }
    }
    return text;
  }


  private class HtmlParser {
    private String title = null;
    private String date = null;
    private String html = null;


    public HtmlParser() {};


    public boolean parse(String rawData, String contentType, String url, String scriptName) {
      html = rawData;
      if (url!=null && !"".equals(url) && scriptName!=null && !"".equals(scriptName)) {
        if (logger!=null && verbose) {
          logger.log("    url          = " + url);
          logger.log("    scripts name = " + scriptName);
        }
        HashMap<String, String> m = ScriptsWrapper.htmlParse(url, rawData, contentType, null, scriptName); 
        if (m!=null && m.size()>0) {
          html = m.get("page");
          date = m.get("date");
          title = m.get("title");
          if (logger!=null && verbose) {
            logger.log("    title        = " + title);
            logger.log("    date         = " + date);
          }
        }
        else {
          return false;
        }
      }
      else {
        return false;
      }
      return true;
    }


    public String getBestTitle(String parserTitle) {
      if (title!=null && !"".equals(title)) return title;
      return parserTitle;
    }


    public String getBestDate(String parserDate) {
      if (date!=null && !"".equals(date)) return date;
      return parserDate;
    }


    public String getBestHtml(String rawData) {
      if (html!=null && !"".equals(html)) return html;
      return rawData;
    }
  }


  private String doExtract(InputStream input, String rawData, String contentType, String cleanMethod, String url, String scriptName) {


    try {      
      // text/plain
      if (contentType.startsWith("text/plain")) {
        parserText = rawData;
        parserTitle = Utils.strGetStartingText(parserText, 80);
        parserContentType = contentType;
        parserContentSize = String.valueOf(parserText.length());
        return parserText;
      } 


      HtmlParser htmlParser = new HtmlParser();


      TikaWrapper wrapper = null;
      String tikaContentType = null;


      // application/pdf
      if (contentType.startsWith("application/pdf")) {
        wrapper = new TikaWrapper(TikaWrapper.OUTPUT_FORMAT_TEXT);
        tikaContentType = TikaWrapper.CONTENT_TYPE_PDF;
        wrapper.setPdfToTextPath(pdfToTextPath);
      } else {
        // text/html
        if (contentType.startsWith("text/html")) {
          
          if (htmlParser.parse(rawData, contentType, url, scriptName)) {
            //cleanMethod = ""; // if parse script was use, disable any clean algorithm
            rawData = htmlParser.getBestHtml(rawData);
          }
          
          if (input==null && rawData!=null) input = new ByteArrayInputStream(rawData.getBytes());
          if (input==null) return "";


          String outputFormat = TikaWrapper.OUTPUT_FORMAT_TEXT;
          if ("boilerpipe_article".equals(cleanMethod)) 
            outputFormat = TikaWrapper.OUTPUT_FORMAT_TEXT_MAIN_BOILERPIPE_ARTICLE;
          if ("boilerpipe_default".equals(cleanMethod)) 
            outputFormat = TikaWrapper.OUTPUT_FORMAT_TEXT_MAIN_BOILERPIPE_DEFAULT;
          if ("boilerpipe_canola".equals(cleanMethod)) 
            outputFormat = TikaWrapper.OUTPUT_FORMAT_TEXT_MAIN_BOILERPIPE_CANOLA;
          if ("snacktory".equals(cleanMethod)) 
            outputFormat = TikaWrapper.OUTPUT_FORMAT_TEXT_MAIN_SNACKTORY;
          
          wrapper = new TikaWrapper(outputFormat);
          tikaContentType = TikaWrapper.CONTENT_TYPE_HTML;


          
        } else {
          wrapper = new TikaWrapper(TikaWrapper.OUTPUT_FORMAT_TEXT);
        }
      }
      
      wrapper.setTempPath(tmpPath);
      wrapper.process(input, tikaContentType);
      
      parserText = wrapper.getText();
      parserContentType = wrapper.getMetaContentType();
      if (contentType.startsWith("text/html")) {
        parserTitle = htmlParser.getBestTitle(wrapper.getMetaTitle());
        parserDate = htmlParser.getBestDate(wrapper.getMetaCreated());
      } else {
        parserTitle = wrapper.getMetaTitle();
        parserDate = wrapper.getMetaCreated();        
      }


      //parserContentSize = Long.toString(extractor.getContentSize());


      return parserText;
    } catch (Exception e) {
      e.printStackTrace();
      return null;
    }
  }


  @Override
  public String getName() {
    // TODO Auto-generated method stub
    return null;
  }


  @Override
  public String getDescription() {
    // TODO Auto-generated method stub
    return null;
  }
}
Source Code of fr.eolya.simplepipeline.stage.DocTextExtractor$HtmlParser

Related Classes of fr.eolya.simplepipeline.stage.DocTextExtractor$HtmlParser