Source Code of com.ikanow.infinit.e.harvest.enrichment.custom.UnstructuredAnalysisHarvester

/*******************************************************************************
* Copyright 2012, The Infinit.e Open Source Project.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License, version 3,
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
******************************************************************************/
package com.ikanow.infinit.e.harvest.enrichment.custom;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.StringWriter;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Scanner;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.script.ScriptEngine;
import javax.script.ScriptEngineManager;
import javax.script.ScriptException;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamReader;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;

import org.apache.commons.lang.StringEscapeUtils;
import org.apache.log4j.Logger;
import org.bson.types.ObjectId;
import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.DomSerializer;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.json.JSONException;
import org.json.JSONObject;
import org.json.XML;
import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.google.gson.stream.JsonReader;
import com.ikanow.infinit.e.data_model.InfiniteEnums.ExtractorDocumentLevelException;
import com.ikanow.infinit.e.data_model.store.config.source.SimpleTextCleanserPojo;
import com.ikanow.infinit.e.data_model.store.config.source.SourcePipelinePojo.ManualTextExtractionSpecPojo;
import com.ikanow.infinit.e.data_model.store.config.source.SourcePipelinePojo.MetadataSpecPojo;
import com.ikanow.infinit.e.data_model.store.config.source.SourcePojo;
import com.ikanow.infinit.e.data_model.store.config.source.SourceRssConfigPojo;
import com.ikanow.infinit.e.data_model.store.config.source.UnstructuredAnalysisConfigPojo;
import com.ikanow.infinit.e.data_model.store.config.source.UnstructuredAnalysisConfigPojo.Context;
import com.ikanow.infinit.e.data_model.store.config.source.UnstructuredAnalysisConfigPojo.metaField;
import com.ikanow.infinit.e.data_model.store.document.AssociationPojo;
import com.ikanow.infinit.e.data_model.store.document.DocumentPojo;
import com.ikanow.infinit.e.data_model.store.document.EntityPojo;
import com.ikanow.infinit.e.data_model.utils.IkanowSecurityManager;
import com.ikanow.infinit.e.harvest.HarvestContext;
import com.ikanow.infinit.e.harvest.HarvestController;
import com.ikanow.infinit.e.harvest.extraction.document.file.JsonToMetadataParser;
import com.ikanow.infinit.e.harvest.extraction.document.file.XmlToMetadataParser;
import com.ikanow.infinit.e.harvest.extraction.text.legacy.TextExtractorTika;
import com.ikanow.infinit.e.harvest.utils.HarvestExceptionUtils;
import com.ikanow.infinit.e.harvest.utils.PropertiesManager;
import com.ikanow.infinit.e.harvest.utils.ProxyManager;
import com.mongodb.BasicDBList;
import com.mongodb.BasicDBObject;

/**
* UnstructuredAnalysisHarvester - applies a source's "unstructuredAnalysis" configuration:
* text cleansing, header/footer extraction, and metadata field extraction
*/
public class UnstructuredAnalysisHarvester {
 
  ///////////////////////////////////////////////////////////////////////////////////////////
 
  // NEW PROCESSING PIPELINE INTERFACE

  //TODO (INF-1922): Handle headers and footers
 
  public void setContext(HarvestContext context) {
    _context = context;
    securityManager = _context.getSecurityManager();
  }
 
  // Transform the doc's text (go get it if necessary)
 
  public String doManualTextEnrichment(DocumentPojo doc, List<ManualTextExtractionSpecPojo> textExtractors, SourceRssConfigPojo feedConfig) throws IOException {
    String cachedFullText = null;
    // Map to the legacy format and then call the legacy code
    ArrayList<SimpleTextCleanserPojo> mappedTextExtractors = new ArrayList<SimpleTextCleanserPojo>(textExtractors.size());
    for (ManualTextExtractionSpecPojo textExtractor: textExtractors) {
      if (DocumentPojo.fullText_.equalsIgnoreCase(textExtractor.fieldName)) {
        boolean fullTextNeeded = (null == doc.getFullText()); // (check here so we can cache it)
        if (fullTextNeeded) {
          getRawTextFromUrlIfNeeded(doc, feedConfig);       
            // (if transforming full text then grab the raw body from the URL if necessary)
          cachedFullText = doc.getFullText();
        }//TESTED (by hand)
      }     
      SimpleTextCleanserPojo mappedTextExtractor = new SimpleTextCleanserPojo();
      mappedTextExtractor.setField(textExtractor.fieldName);
      mappedTextExtractor.setFlags(textExtractor.flags);
      mappedTextExtractor.setScript(textExtractor.script);
      mappedTextExtractor.setScriptlang(textExtractor.scriptlang);
      mappedTextExtractor.setReplacement(textExtractor.replacement);
      mappedTextExtractors.add(mappedTextExtractor);
    }
    this.cleanseText(mappedTextExtractors, doc);
   
    return cachedFullText;   
  }
  //TESTED (fulltext_regexTests.json)
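
  // Illustrative sketch (values hypothetical; field names as used in the mapping above, and
  // assuming DocumentPojo.fullText_ == "fullText") - a single text extractor that rewrites the
  // full text, which forces the raw body to be fetched first if it is missing:
  //
  //   ManualTextExtractionSpecPojo spec = new ManualTextExtractionSpecPojo();
  //   spec.fieldName = "fullText";
  //   spec.scriptlang = "regex";
  //   spec.script = "(?s)^.*?<body>";   // hypothetical pattern
  //   spec.replacement = "";
  //   String cached = uah.doManualTextEnrichment(doc, Arrays.asList(spec), source.getRssConfig());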
 
  public void processMetadataChain(DocumentPojo doc, List<MetadataSpecPojo> metadataFields, SourceRssConfigPojo feedConfig, HashSet<String> unstoredFields) throws IOException
  {   
    // Map metadata list to a legacy meta format (they're really similar...)
    UnstructuredAnalysisConfigPojo.metaField mappedEl = new UnstructuredAnalysisConfigPojo.metaField();
    boolean textSet = false;
    for (MetadataSpecPojo meta: metadataFields) {
      mappedEl.fieldName = meta.fieldName;
      mappedEl.context = Context.All;
      mappedEl.flags = meta.flags;
      if (null == mappedEl.flags) {
        mappedEl.flags = "";
      }
      boolean scriptLangNeedsText = doesScriptLangNeedText(meta.scriptlang);     
        // (in js you can operate on metadata alone; other languages need the text ...
        //  actually that's not strictly true because of chaining, ugh - we'll just ignore that for now)
     
      if (scriptLangNeedsText || (null == mappedEl.flags) || mappedEl.flags.isEmpty() || mappedEl.flags.contains("t")) {
        if (!textSet) {
          getRawTextFromUrlIfNeeded(doc, feedConfig);       
          textSet = true;
        }
      }//TESTED (content_needed_test)
     
      mappedEl.scriptlang = meta.scriptlang;
      mappedEl.script = meta.script;
      mappedEl.replace = meta.replace;
      mappedEl.groupNum = null;
      //(no group num - just use replace, and flags "o" for xpath/gN:-1)

      // Storage of fields:
      if ((null != meta.store) && !meta.store) {
        unstoredFields.add(meta.fieldName);
      }
      else if (unstoredFields.contains(meta.fieldName)) {
        unstoredFields.remove(meta.fieldName);
      }
      this.processMeta(doc, mappedEl, doc.getFullText(), null, null);           
    }
    //TESTED (storageSettings_advanced.json)
  }
  //TESTED (fulltext_regexTests.json, storageSettings_advanced.json)
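
  // Illustrative sketch (values hypothetical; field names from MetadataSpecPojo as mapped above) -
  // a regex metadata field that is extracted but flagged as not stored:
  //
  //   MetadataSpecPojo meta = new MetadataSpecPojo();
  //   meta.fieldName = "phoneNumbers";
  //   meta.scriptlang = "regex";
  //   meta.script = "\\d{3}-\\d{4}";    // hypothetical pattern
  //   meta.flags = "t";                 // "t" => run against the document text (fetched if needed)
  //   meta.store = false;               // => fieldName is added to unstoredFields
  //   uah.processMetadataChain(doc, Arrays.asList(meta), source.getRssConfig(), unstoredFields);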
 
  ///////////////////////////////////////////////////////////////////////////////////////////
 
  // PROCESSING PIPELINE - UTILITIES
 
  public void getRawTextFromUrlIfNeeded(DocumentPojo doc, SourceRssConfigPojo feedConfig) throws IOException {
    if (null != doc.getFullText()) { // Nothing to do
      return;
    }
    Scanner s = null;
    OutputStreamWriter wr = null;
    try {
      URL url = new URL(doc.getUrl());
      URLConnection urlConnect = null;
      String postContent = null;
      if (null != feedConfig) {
        urlConnect = url.openConnection(ProxyManager.getProxy(url, feedConfig.getProxyOverride()));
        if (null != feedConfig.getUserAgent()) {
          urlConnect.setRequestProperty("User-Agent", feedConfig.getUserAgent());
        }// TESTED (by hand)
        if (null != feedConfig.getHttpFields()) {
          for (Map.Entry<String, String> httpFieldPair: feedConfig.getHttpFields().entrySet()) {
            if (httpFieldPair.getKey().equalsIgnoreCase("content")) {
              postContent = httpFieldPair.getValue();
              urlConnect.setDoInput(true);
              urlConnect.setDoOutput(true);
            }
            else {
              urlConnect.setRequestProperty(httpFieldPair.getKey(), httpFieldPair.getValue());
            }
          }
        }//TESTED (by hand)
      }
      else {
        urlConnect = url.openConnection();       
      }
      InputStream urlStream = null;
      try {
        securityManager.setSecureFlag(true); // (disallow file/local URL access)
        if (null != postContent) {
          wr = new OutputStreamWriter(urlConnect.getOutputStream());
          wr.write(postContent.toCharArray());
          wr.flush();
        }//TESTED
        urlStream = urlConnect.getInputStream();
      }
      catch (SecurityException se) {
        throw se;
      }
      catch (Exception e) { // Try one more time, this time exception out all the way
        securityManager.setSecureFlag(false); // (some file stuff - so need to re-enable)
        if (null != feedConfig) {
          urlConnect = url.openConnection(ProxyManager.getProxy(url, feedConfig.getProxyOverride()));
          if (null != feedConfig.getUserAgent()) {
            urlConnect.setRequestProperty("User-Agent", feedConfig.getUserAgent());
          }// TESTED
          if (null != feedConfig.getHttpFields()) {
            for (Map.Entry<String, String> httpFieldPair: feedConfig.getHttpFields().entrySet()) {
              if (httpFieldPair.getKey().equalsIgnoreCase("content")) {
                urlConnect.setDoInput(true); // (need to do this again)
                urlConnect.setDoOutput(true);           
              }
              else {
                urlConnect.setRequestProperty(httpFieldPair.getKey(), httpFieldPair.getValue());                           
              }
            }
          }//TESTED
        }
        else {
          urlConnect = url.openConnection();       
        }
        securityManager.setSecureFlag(true); // (disallow file/local URL access)
        if (null != postContent) {
          wr = new OutputStreamWriter(urlConnect.getOutputStream());
          wr.write(postContent.toCharArray());
          wr.flush();
        }//TESTED
        urlStream = urlConnect.getInputStream();
      }
      finally {
        securityManager.setSecureFlag(false); // (turn security check for local URL/file access off)
      }
      // Grab any interesting header fields
      Map<String, List<String>> headers = urlConnect.getHeaderFields();
      BasicDBObject metadataHeaderObj = null;
      for (Map.Entry<String, List<String>> it: headers.entrySet()) {
        if (null != it.getKey()) {
          if (it.getKey().startsWith("X-") || it.getKey().startsWith("Set-") || it.getKey().startsWith("Location")) {
            if (null == metadataHeaderObj) {
              metadataHeaderObj = new BasicDBObject();
            }
            metadataHeaderObj.put(it.getKey(), it.getValue());
          }
        }
      }//TESTED
      // Grab the response code
      try {
        HttpURLConnection httpUrlConnect = (HttpURLConnection)urlConnect;
        int responseCode = httpUrlConnect.getResponseCode();
        if (200 != responseCode) {
          if (null == metadataHeaderObj) {
            metadataHeaderObj = new BasicDBObject();
          }
          metadataHeaderObj.put("responseCode", String.valueOf(responseCode));
        }
      }//TESTED
      catch (Exception e) {} // interesting, not an HTTP connect ... shrug and carry on
      if (null != metadataHeaderObj) {
        doc.addToMetadata("__FEED_METADATA__", metadataHeaderObj);
      }//TESTED
      s = new Scanner(urlStream, "UTF-8");
      doc.setFullText(s.useDelimiter("\\A").next());
    }
    catch (MalformedURLException me) { // This one is worthy of a more useful error message
      throw new MalformedURLException(me.getMessage() + ": Likely because the document has no full text (eg JSON) and you are calling a contentMetadata block without setting flags:'m' or 'd'");
    }
    finally { //(release resources)
      if (null != s) {
        s.close();
      }
      if (null != wr) {
        wr.close();
      }
    }   
   
  }//TESTED (cut-and-paste from existing code, so new testing very cursory)
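
  // Illustrative sketch (values hypothetical, and assuming SourceRssConfigPojo setters matching
  // the getters used above) - the special "content" HTTP field turns the fetch into a POST,
  // while every other entry becomes a plain request header:
  //
  //   SourceRssConfigPojo feedConfig = new SourceRssConfigPojo();
  //   feedConfig.setUserAgent("Mozilla/5.0 (compatible; TestHarvester)");
  //   HashMap<String, String> httpFields = new HashMap<String, String>();
  //   httpFields.put("content", "q=test&page=1"); // => setDoOutput(true), body written to the connection
  //   httpFields.put("Accept", "text/html");      // => setRequestProperty("Accept", ...)
  //   feedConfig.setHttpFields(httpFields);
  //   uah.getRawTextFromUrlIfNeeded(doc, feedConfig);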
 
  ///////////////////////////////////////////////////////////////////////////////////////////
  ///////////////////////////////////////////////////////////////////////////////////////////
  ///////////////////////////////////////////////////////////////////////////////////////////
 
  // LEGACY CODE - USE TO SUPPORT OLD CODE FOR NOW + AS UTILITY CODE FOR THE PIPELINE LOGIC
 
  // Per-source state
  private Pattern headerPattern = null;
  private Pattern footerPattern = null;
  private UnstructuredAnalysisConfigPojo savedUap = null;

  // Javascript handling, if needed
  private ScriptEngineManager factory = null;
  private ScriptEngine engine = null;
  private static String parsingScript = null;

  // Using Tika to process documents:
  TextExtractorTika tikaExtractor = null;
 
  private HarvestContext _context = null;
  private Logger logger = Logger.getLogger(UnstructuredAnalysisHarvester.class);

  // (some web scraping may be needed)
  private long nBetweenDocs_ms = -1;
  // (set this in execute harvest - makes it easy to only set once in the per doc version called in bulk from the SAH)

  // Ensure we don't get a long list of duplicates for commonly occurring words
  private HashSet<String> regexDuplicates = null;
  private HtmlCleaner cleaner = null;
 
  //if the sah already init'd an engine we'll just use it
  private ScriptEngine _sahEngine = null;
  private IkanowSecurityManager securityManager = null;
 
  /**
   * Default Constructor
   */
  public UnstructuredAnalysisHarvester() {
  }

  // For harvest pipeline, just ensures duplicate map exists and is empty for each doc
  public void resetForNewDoc() {
    if ((null == regexDuplicates) || (!regexDuplicates.isEmpty())) {
      regexDuplicates = new HashSet<String>();
    }
  }
 
  /**
   * executeHarvest(HarvestController contextController, SourcePojo source, List<DocumentPojo> documents)
   *
   * @param contextController
   * @param source
   * @param documents
   * @return List<DocumentPojo>
   */
  public List<DocumentPojo> executeHarvest(HarvestController contextController, SourcePojo source, List<DocumentPojo> documents)
  {
    nBetweenDocs_ms = -1;
    // Can override the default (feed) wait time from within the source (eg
    // for sites that we know don't get upset about getting hammered)
    if (null != source.getRssConfig()) {
      if (null != source.getRssConfig().getWaitTimeOverride_ms()) {
        nBetweenDocs_ms = source.getRssConfig().getWaitTimeOverride_ms();
      }
    }
    if (-1 == nBetweenDocs_ms) {
      PropertiesManager props = new PropertiesManager();
      nBetweenDocs_ms = props.getWebCrawlWaitTime();
    }
    // TESTED: default and overridden values

    _context = contextController;
    securityManager = _context.getSecurityManager();
    UnstructuredAnalysisConfigPojo uap = source.getUnstructuredAnalysisConfig();

    if (uap != null) {
      boolean bGetRawDoc = source.getExtractType().equalsIgnoreCase("feed");

      String headerRegEx = uap.getHeaderRegEx();
      String footerRegEx = uap.getFooterRegEx();
      List<metaField> meta = uap.getMeta();

      if (headerRegEx != null)
        headerPattern = createRegex(headerRegEx, uap.getHeaderRegExFlags());
      if (footerRegEx != null)
        footerPattern = createRegex(footerRegEx, uap.getFooterRegExFlags());

      Iterator<DocumentPojo> it = documents.iterator();
      int nDocs = 0;
      while (it.hasNext()) {
        nDocs++;
        DocumentPojo d = it.next();
        regexDuplicates = new HashSet<String>();
        cleaner = null;

        // For feeds we may need to go get the document text manually;
        // it's a bit horrible since we may then have to fetch the data
        // again for full text extraction
        boolean bFetchedUrl = false;
        if (bGetRawDoc && (null == d.getFullText())) {
          if (null == source.getRssConfig()) {
            source.setRssConfig(new SourceRssConfigPojo()); // (makes logic easier down the road)
          }
          // (first time through, sleep following a URL/RSS access)
          if ((1 == nDocs) && (null != source.getUrl())) { // (have already made a call to the RSS (or "searchConfig") URL)
            try {
              Thread.sleep(nBetweenDocs_ms);
            } catch (InterruptedException e) {
            }
          }
          // TESTED (first time only, correct value after searchConfig override)

          try {
            if ((null != source.useTextExtractor()) && source.useTextExtractor().equalsIgnoreCase("tika")) {
              // Special case: if tika enabled then do that first
              if (null == tikaExtractor) {
                tikaExtractor = new TextExtractorTika();
              }
              tikaExtractor.extractText(d); // (extract every doc, not just the first one)
            }
            else {
              this.getRawTextFromUrlIfNeeded(d, source.getRssConfig());
            }
            bFetchedUrl = true;
           
          } catch (Exception e) { // Failed to get full text twice, remove doc
            if (e instanceof SecurityException) { // This seems worthy of actually logging, even though it's a lowly doc error
              contextController.getHarvestStatus().logMessage(e.getMessage(), true);
            }
            contextController.handleExtractError(e, source); //handle extractor error if need be       
            it.remove();
            d.setTempSource(null); // (can safely corrupt this doc since it's been removed)           
            continue;
          }
        }
        long nTime_ms = System.currentTimeMillis();
        // ^^^ (end slight hack to get raw text to the UAH for RSS feeds)

        try {
          processBody(d, meta, true, source, uap);
        } catch (Exception e) {
          this._context.getHarvestStatus().logMessage("processBody1: " + e.getMessage(), true);
          //DEBUG (don't output log messages per doc)
          //logger.error("processBody1: " + e.getMessage(), e);
        }

        try {
          if (uap.getSimpleTextCleanser() != null) {
            cleanseText(uap.getSimpleTextCleanser(), d);
          }
        } catch (Exception e) {
          this._context.getHarvestStatus().logMessage("cleanseText: " + e.getMessage(), true);
          //DEBUG (don't output log messages per doc)
          //logger.error("cleanseText: " + e.getMessage(), e);
        }

        try {
          processHeader(headerPattern, d, meta, source, uap);
          processFooter(footerPattern, d, meta, source, uap);
         
        } catch (Exception e) {
          this._context.getHarvestStatus().logMessage("header/footerPattern: " + e.getMessage(), true);
          //DEBUG (don't output log messages per doc)
          //logger.error("header/footerPattern: " + e.getMessage(), e);
        }
        try {
          processBody(d, meta, false, source, uap);
         
        } catch (Exception e) {
          this._context.getHarvestStatus().logMessage("processBody2: " + e.getMessage(), true);
          //DEBUG (don't output log messages per doc)
          //logger.error("processBody2: " + e.getMessage(), e);
        }

        if (it.hasNext() && bFetchedUrl) {
          nTime_ms = nBetweenDocs_ms
              - (System.currentTimeMillis() - nTime_ms); // (ie delay time - processing time)
          if (nTime_ms > 0) {
            try {
              Thread.sleep(nTime_ms);
            } catch (InterruptedException e) {
            }
          }
        } // (end politeness delay for URL fetching from a single source (likely a single site))
      }
      return documents;
    }
    return new ArrayList<DocumentPojo>();
  }
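
  // (For reference: the politeness delay above is rssConfig.getWaitTimeOverride_ms() when set on
  //  the source, otherwise PropertiesManager.getWebCrawlWaitTime(), applied only between URL fetches.)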

  /**
   * executeHarvest - for single-document calls (note exception handling happens in
   * the SAH)
   *
   * @param context
   * @param source
   * @param doc
   * @param bFirstTime
   * @param bMoreDocs
   * @return true if the document's metadata changed
   * @throws ExtractorDocumentLevelException
   */
  public boolean executeHarvest(HarvestContext context, SourcePojo source, DocumentPojo doc, boolean bFirstTime, boolean bMoreDocs) throws ExtractorDocumentLevelException
  {   
    regexDuplicates = new HashSet<String>();
    cleaner = null;
    boolean bGetRawDoc = source.getExtractType().equalsIgnoreCase("feed")
                && (null == doc.getFullText());
    // (ie don't have full text and will need to go fetch it from network)

    if (bFirstTime) {
      nBetweenDocs_ms = -1; // (reset eg between searchConfig and SAH)
    }
    if ((-1 == nBetweenDocs_ms) && bGetRawDoc && (bMoreDocs || bFirstTime)) { // (don't bother if not using it...)
      // Can override the default (feed) wait time from within the source
      // (eg for sites that we know
      // don't get upset about getting hammered)
      if (null != source.getRssConfig()) {
        if (null != source.getRssConfig().getWaitTimeOverride_ms()) {
          nBetweenDocs_ms = source.getRssConfig().getWaitTimeOverride_ms();
        }
      }
      if (-1 == nBetweenDocs_ms) { // (ie not overridden so use default)
        PropertiesManager props = new PropertiesManager();
        nBetweenDocs_ms = props.getWebCrawlWaitTime();
      }
    } // TESTED (overridden and using system default)

    _context = context;
    securityManager = _context.getSecurityManager();
    UnstructuredAnalysisConfigPojo uap = source.getUnstructuredAnalysisConfig();

    int nChanges = 0;
    if (null != doc.getMetaData()) {
      nChanges = doc.getMetaData().size();
    }
    boolean bFetchedUrl = false;
    if (bGetRawDoc) {
      if (null == source.getRssConfig()) {
        source.setRssConfig(new SourceRssConfigPojo()); // (makes logic easier down the road)
      }
      try {
        // Workaround for observed twitter bug (first access after the
        // RSS was gzipped)
        if (bFirstTime) {
          // (first time through, sleep following a URL/RSS access)
          if (null != source.getUrl()) { // (have already made a call to the RSS (or "searchConfig") URL)
            try {
              Thread.sleep(nBetweenDocs_ms);
            } catch (InterruptedException e) {
            }
          }
          // TESTED
        }
       
        if ((null != source.useTextExtractor()) && source.useTextExtractor().equalsIgnoreCase("tika")) {
          // Special case: if tika enabled then do that first
          if (null == tikaExtractor) {
            tikaExtractor = new TextExtractorTika();
          }
          tikaExtractor.extractText(doc); // (extract every doc, not just the first one)
        }
        else {
          getRawTextFromUrlIfNeeded(doc, source.getRssConfig());
        }
        bFetchedUrl = true;
       
      }
      catch (SecurityException e) { // This seems worthy of actually logging, even though it's a lowly doc error
        _context.getHarvestStatus().logMessage(e.getMessage(), true);
        throw new ExtractorDocumentLevelException(e.getMessage());
      }//TESTED
      catch (Exception e) { // Failed to get full text twice... remove doc and carry on
        throw new ExtractorDocumentLevelException(e.getMessage());
      }
    }
    long nTime_ms = System.currentTimeMillis();
    // ^^^ (end slight hack to get raw text to the UAH for RSS feeds)

    if (uap != null) {
      List<metaField> meta = uap.getMeta();
      if (savedUap != uap) {
        String headerRegEx = uap.getHeaderRegEx();
        String footerRegEx = uap.getFooterRegEx();

        if (headerRegEx != null)
          headerPattern = Pattern.compile(headerRegEx, Pattern.DOTALL);
        if (footerRegEx != null)
          footerPattern = Pattern.compile(footerRegEx, Pattern.DOTALL);

        savedUap = uap;
      }
      try {
        processBody(doc, meta, true, source, uap);
       
      } catch (Exception e) {
        this._context.getHarvestStatus().logMessage("processBody1: " + e.getMessage(), true);
        //DEBUG (don't output log messages per doc)
        //logger.error("processBody1: " + e.getMessage(), e);
      }
      try {
        if (uap.getSimpleTextCleanser() != null) {
          cleanseText(uap.getSimpleTextCleanser(), doc);
        }
      } catch (Exception e) {
        this._context.getHarvestStatus().logMessage("cleanseText: " + e.getMessage(), true);
        //DEBUG (don't output log messages per doc)
        //logger.error("cleanseText: " + e.getMessage(), e);
      }
      try {
        processHeader(headerPattern, doc, meta, source, uap);
        processFooter(footerPattern, doc, meta, source, uap);
       
      } catch (Exception e) {
        this._context.getHarvestStatus().logMessage("header/footerPattern: " + e.getMessage(), true);
        //DEBUG (don't output log messages per doc)
        //logger.error("header/footerPattern: " + e.getMessage(), e);
      }
      try {
        processBody(doc, meta, false, source, uap);
       
      } catch (Exception e) {
        this._context.getHarvestStatus().logMessage("processBody2: " + e.getMessage(), true);
        //DEBUG (don't output log messages per doc)
        //logger.error("processBody2: " + e.getMessage(), e);
      }
    }
    if (bMoreDocs && bFetchedUrl) {
      nTime_ms = nBetweenDocs_ms - (System.currentTimeMillis() - nTime_ms); // (ie delay time - processing time)
      if (nTime_ms > 0) {
        try {
          Thread.sleep(nTime_ms);
        } catch (InterruptedException e) {
        }
      }
    } // (end politeness delay for URL fetching from a single source (likely a single site))

    if (null != doc.getMetaData()) {
      if (nChanges != doc.getMetaData().size()) {
        return true;
      }
    }
    return false;
  }

  /**
   * processHeader
   *
   * @param headerPattern
   * @param f
   * @param meta
   * @param source
   * @param uap
   */
  @SuppressWarnings("deprecation")
  private void processHeader(Pattern headerPattern, DocumentPojo f, List<metaField> meta, SourcePojo source, UnstructuredAnalysisConfigPojo uap)
  {
    if (headerPattern != null) {
      Matcher headerMatcher = headerPattern.matcher(f.getFullText());
      String headerText = null;
      while (headerMatcher.find()) {
        if (headerMatcher.start() == 0) {
          headerText = headerMatcher.group(0);
          f.setHeaderEndIndex(headerText.length());
          for (int i = 1; i < headerMatcher.groupCount() + 1; i++) {
            f.addToHeader(headerMatcher.group(i).trim());
          }
          break;
        }
      }
      if (null != headerText && null != meta) {
        for (metaField m : meta) {
          if (m.context == Context.Header || m.context == Context.All) {
            this.processMeta(f, m, headerText, source, uap);
          }
        }
      }
    }
  }
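
  // Illustrative sketch (pattern hypothetical) - a headerRegEx only counts if it matches at
  // offset 0; group(0) sets the header end index and groups 1..n are added via addToHeader():
  //
  //   "headerRegEx": "From: (.*?)\\nSubject: (.*?)\\n"
  //   // group(1) -> the sender, group(2) -> the subject, both trimmed into the header list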

  /**
   * processFooter
   *
   * @param footerPattern
   * @param f
   * @param meta
   * @param source
   * @param uap
   */
  @SuppressWarnings("deprecation")
  private void processFooter(Pattern footerPattern, DocumentPojo f, List<metaField> meta, SourcePojo source, UnstructuredAnalysisConfigPojo uap)
  {

    if (footerPattern != null) {
      Matcher footerMatcher = footerPattern.matcher(f.getFullText());
      String footerText = null;
      while (footerMatcher.find()) {
        footerText = footerMatcher.group(0);
        int docLength = f.getFullText().length();
        f.setFooterStartIndex(docLength - footerMatcher.group(0).length());
        for (int i = 1; i < footerMatcher.groupCount() + 1; i++) {
          f.addToHeader(footerMatcher.group(i).trim()); // (footer capture groups are also appended to the header list)
        }
        break;
      }

      if (null != footerText && null != meta) {
        for (metaField m : meta) {
          if (m.context == Context.Footer || m.context == Context.All) {
            this.processMeta(f, m, footerText, source, uap);
          }
        }
      }
    }
  }

  /**
   * processBody
   *
   * @param f
   * @param meta
   * @param bPreCleansing
   * @param source
   * @param uap
   */
  @SuppressWarnings("deprecation")
  private void processBody(DocumentPojo f, List<metaField> meta, boolean bPreCleansing, SourcePojo source, UnstructuredAnalysisConfigPojo uap)
  {
    if (null != meta) {
      for (metaField m : meta) {
        if ((bPreCleansing && (m.context == Context.First))
            || (!bPreCleansing && (m.context == Context.Body || m.context == Context.All))) {
         
          boolean scriptLangNeedsText = doesScriptLangNeedText(m.scriptlang);     
          // (in js you can operate on metadata alone; other languages need the text ...
          //  actually that's not strictly true because of chaining, ugh - we'll just ignore that for now)
       
          // If running with text (default) then need there to be non-null text
          String toProcess = null;
          if (scriptLangNeedsText || (null == m.flags) || m.flags.isEmpty() || m.flags.contains("t")) {
            toProcess = f.getBody();
            if (toProcess == null)
              toProcess = f.getDescription();
            if (toProcess == null)
              continue;
          }
          this.processMeta(f, m, toProcess, source, uap);
        }
      }
    }
  }
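
  // Flags recognized by processMeta and its callers (derived from the handling in each branch below):
  //   "t" - operate on the document text (callers fetch raw text if needed; in javascript, puts "text" into the engine)
  //   "U" - allow duplicate values (skip the regexDuplicates check)
  //   "c" - chained metadata: re-run the script over the existing values of fieldName
  //   "H" - HTML-unescape extracted values
  //   "d"/"m" - (javascript) pass the document JSON / the metadata JSON into the engine
  //   "o" - (xpath) convert matched nodes to JSON objects (equivalent to groupNum:-1); (stream) return objects not text
  //   "x" - (xpath) keep matched nodes as XML strings
  //   "g" - (xpath) parse matched <table> nodes via parseHtmlTable()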

  /**
   * processMeta - handle an individual field
   */
  private void processMeta(DocumentPojo f, metaField m, String text, SourcePojo source, UnstructuredAnalysisConfigPojo uap) {

    boolean bAllowDuplicates = false;
    if ((null != m.flags) && m.flags.contains("U")) {
      bAllowDuplicates = true;
    }   
    if ((null == m.scriptlang) || m.scriptlang.equalsIgnoreCase("regex")) {

      Pattern metaPattern = createRegex(m.script, m.flags);

      int timesToRun = 1;
      Object[] currField = null;
      if ((null != m.flags) && m.flags.contains("c")) {
        currField = f.getMetadata().get(m.fieldName);
      }
      if (null != currField) { // chained metadata
        timesToRun = currField.length;
        text = (String)currField[0];
      }//TESTED

      Matcher matcher = metaPattern.matcher(text);
      LinkedList<String> Llist = null;
     
      for (int ii = 0; ii < timesToRun; ++ii) {
        if (ii > 0) { // (else either just text, or initialized in the "chained metadata" block above)
          text = (String)currField[ii];   
          matcher = metaPattern.matcher(text);
        }//TESTED
     
        StringBuffer prefix = new StringBuffer(m.fieldName).append(':');
        int nFieldNameLen = m.fieldName.length() + 1;
 
        try {
          while (matcher.find()) {
            if (null == Llist) {
              Llist = new LinkedList<String>();
            }
            if (null == m.groupNum) {
              m.groupNum = 0;
            }
            String toAdd = matcher.group(m.groupNum);
            if (null != m.replace) {
              toAdd = metaPattern.matcher(toAdd).replaceFirst(
                  m.replace);
            }
            if ((null != m.flags) && m.flags.contains("H"))  {
              toAdd = StringEscapeUtils.unescapeHtml(toAdd);
            }
            prefix.setLength(nFieldNameLen);
            prefix.append(toAdd);
            String dupCheck = prefix.toString();
 
            if (!regexDuplicates.contains(dupCheck)) {
              Llist.add(toAdd);
              if (!bAllowDuplicates) {
                regexDuplicates.add(dupCheck);
              }
            }
          }
        } catch (Exception e) {
          this._context.getHarvestStatus().logMessage("processMeta1: " + e.getMessage(), true);
        }
      }//(end metadata chaining handling)
      if (null != Llist) {
        if (null != currField) { // (overwrite)
          f.getMetadata().put(m.fieldName, Llist.toArray());
        }
        else {
          f.addToMetadata(m.fieldName, Llist.toArray());
        }
      }//TESTED
    }
    else if (m.scriptlang.equalsIgnoreCase("javascript"))
    {
      if (null == f.getMetadata()) {
        f.setMetadata(new LinkedHashMap<String, Object[]>());
      }
      //set the script engine up if necessary
      if ((null != source) && (null != uap)) {
        //(these are null if called from new processing pipeline vs legacy code)
        intializeScriptEngine(source, uap);
      }
     
      try
      {
        //TODO (INF-2488): in new format, this should only happen in between contentMeta blocks/docs
        // (also should be able to use SAH _document object I think?)
       
        // Javascript: pass the requested inputs into the engine ("text"/"document"/"_metadata"/"_iterator"), depending on flags
        Object[] currField = f.getMetadata().get(m.fieldName);
        if ((null == m.flags) || m.flags.isEmpty()) {
          if (null == currField) {
            engine.put("text", text);
            engine.put("_iterator", null);
          }
          //(otherwise will just pass the current fields in there)
        }
        else { // flags specified
          if (m.flags.contains("t")) { // text
            engine.put("text", text);             
          }
          if (m.flags.contains("d")) { // entire document (minus ents and assocs)
            GsonBuilder gb = new GsonBuilder();
            Gson g = gb.create();
            List<EntityPojo> ents = f.getEntities();
            List<AssociationPojo> assocs = f.getAssociations();
            try {
              f.setEntities(null);
              f.setAssociations(null);
                  engine.put("document", g.toJson(f));
                  securityManager.eval(engine, JavaScriptUtils.initScript);
            }
            finally {
              f.setEntities(ents);
              f.setAssociations(assocs);
            }
          }
          if (m.flags.contains("m")) { // metadata
            GsonBuilder gb = new GsonBuilder();
            Gson g = gb.create();
            engine.put("_metadata", g.toJson(f.getMetadata()));
            securityManager.eval(engine, JavaScriptUtils.iteratorMetaScript);
          }
        }//(end flags processing)
       
        if (null != currField) {
          f.getMetadata().remove(m.fieldName);
         
          GsonBuilder gb = new GsonBuilder();
          Gson g = gb.create();
          engine.put("_iterator", g.toJson(currField));
          securityManager.eval(engine, JavaScriptUtils.iteratorDocScript);             
        }
        //TESTED (handling of flags, and replacing of existing fields, including when field is null but specified)

        Object returnVal = securityManager.eval(engine, m.script);

        if (null != returnVal) {
          if (returnVal instanceof String) { // The only easy case
            Object[] array = new Object[1];
            if ((null != m.flags) && m.flags.contains("H"))  {
              returnVal = StringEscapeUtils.unescapeHtml((String)returnVal);
            }
            array[0] = returnVal;
            f.addToMetadata(m.fieldName, array);
          } else { // complex object or array - in either case the engine turns these into
                // internal.NativeArray or internal.NativeObject
           
            BasicDBList outList = JavaScriptUtils.parseNativeJsObject(returnVal, engine);                       
            f.addToMetadata(m.fieldName, outList.toArray());
          }
        }
      } catch (ScriptException e) {

        _context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(), true);

        // Just do nothing and log
        // e.printStackTrace();
        //DEBUG (don't output log messages per doc)
        //logger.error(e.getMessage());
      } catch (Exception e) {
       
        _context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(), true);

        // Just do nothing and log
        // e.printStackTrace();
        //DEBUG (don't output log messages per doc)
        //logger.error(e.getMessage());
      }
    } else if (m.scriptlang.equalsIgnoreCase("xpath")) {

      String xpath = m.script;
     
      try {
        createHtmlCleanerIfNeeded();

        int timesToRun = 1;
        Object[] currField = null;
        if ((null != m.flags) && m.flags.contains("c")) {
          currField = f.getMetadata().get(m.fieldName);
        }
        if (null != currField) { // chained metadata
          f.getMetadata().remove(m.fieldName); // (so will add to the end)
          timesToRun = currField.length;
          text = (String)currField[0];
        }//TESTED

        for (int ii = 0; ii < timesToRun; ++ii) {
          if (ii > 0) { // (else either just text, or initialized in the "chained metadata" block above)
            text = (String)currField[ii];           
          }//TESTED
         
          TagNode node = cleaner.clean(new ByteArrayInputStream(text.getBytes()));
         
          // Only use HtmlCleaner for cleansing; use JAXP for the full XPath library
          Document doc = new DomSerializer(new CleanerProperties()).createDOM(node);
         
 
          String extraRegex = extractRegexFromXpath(xpath);
 
          if (extraRegex != null)
            xpath = xpath.replace(extraRegex, "");
         
          XPath xpa = XPathFactory.newInstance().newXPath();
          NodeList res = (NodeList)xpa.evaluate(xpath, doc, XPathConstants.NODESET);
         
          if (res.getLength() > 0)
          {
            if ((null != m.flags) && (m.flags.contains("o"))) { // "o" for object
              m.groupNum = -1; // (see bConvertToObject below)
            }
            StringBuffer prefix = new StringBuffer(m.fieldName).append(':');
            int nFieldNameLen = m.fieldName.length() + 1;
            ArrayList<Object> Llist = new ArrayList<Object>(res.getLength());
            boolean bConvertToObject = ((m.groupNum != null) && (m.groupNum == -1));
            boolean convertToXml =  ((null != m.flags) && (m.flags.contains("x")));
            for (int i= 0; i< res.getLength(); i++)
            {
              Node info_node = res.item(i);
              if ((null != m.flags) && (m.flags.contains("g"))) {
                Llist.add(parseHtmlTable(info_node, m.replace));
              }
              else if (bConvertToObject || convertToXml) {
                // Try to create a JSON object out of this
                StringWriter writer = new StringWriter();
                try {
                  Transformer transformer = TransformerFactory.newInstance().newTransformer();
                  transformer.transform(new DOMSource(info_node), new StreamResult(writer));
                } catch (TransformerException e1) {
                  continue;
                }
 
                if (bConvertToObject) {
                  try {
                    JSONObject subObj = XML.toJSONObject(writer.toString());
                    if (xpath.endsWith("*"))  { // (can have any number of different names here)
                      Llist.add(XmlToMetadataParser.convertJsonObjectToLinkedHashMap(subObj));
                    }//TESTED
                    else {
                      String[] rootNames = JSONObject.getNames(subObj);
                      if (1 == rootNames.length) {
                        // (don't think it can be any other number, in fact)
                        subObj = subObj.getJSONObject(rootNames[0]);
                      }
                      boolean bUnescapeHtml = ((null != m.flags) && m.flags.contains("H"));
                      Llist.add(XmlToMetadataParser.convertJsonObjectToLinkedHashMap(subObj, bUnescapeHtml));                   
                    }//TESTED
                  }
                  catch (JSONException e) { // Just carry on
                    continue;
                  }
                  //TESTED
                }
                else { // leave in XML form
                  Llist.add(writer.toString().substring(38)); // +38: (step over <?xml version="1.0" encoding="UTF-8"?>)
                }//TESTED (xpath_test.json)
              }
              else { // Treat this as string, either directly or via regex
                String info = info_node.getTextContent().trim();
                if (extraRegex == null || extraRegex.isEmpty()) {
                  prefix.setLength(nFieldNameLen);
                  prefix.append(info);
                  String dupCheck = prefix.toString();
   
                  if (!regexDuplicates.contains(dupCheck)) {
                    if ((null != m.flags) && m.flags.contains("H")) {
                      info = StringEscapeUtils.unescapeHtml(info);
                    }
                    Llist.add(info);
                    if (!bAllowDuplicates) {
                      regexDuplicates.add(dupCheck);
                    }
                  }
                }
                else { // Apply regex to the string
                  Pattern dataRegex = createRegex(extraRegex, m.flags);
                  Matcher dataMatcher = dataRegex.matcher(info);
                  boolean result = dataMatcher.find();
                  while (result) {
                    String toAdd;
                    if (m.groupNum != null)
                      toAdd = dataMatcher.group(m.groupNum);
                    else
                      toAdd = dataMatcher.group();
                    prefix.setLength(nFieldNameLen);
                    prefix.append(toAdd);
                    String dupCheck = prefix.toString();
   
                    if (!regexDuplicates.contains(dupCheck)) {
                      if ((null != m.flags) && m.flags.contains("H")) {
                        toAdd = StringEscapeUtils.unescapeHtml(toAdd);
                      }
                      Llist.add(toAdd);
                      if (!bAllowDuplicates) {
                        regexDuplicates.add(dupCheck);
                      }
                    }
   
                    result = dataMatcher.find();
                  }
                }//(regex vs no regex)
              }//(end string vs object)
            }
            if (Llist.size() > 0) {
              f.addToMetadata(m.fieldName, Llist.toArray());
            }
          }
        }//(end loop over metadata objects if applicable)

      } catch (IOException ioe) {
        _context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage(ioe).toString(), true);

        // Just do nothing and log
        //DEBUG (don't output log messages per doc)
        //logger.error(ioe.getMessage());
      } catch (ParserConfigurationException e1) {
        _context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage(e1).toString(), true);
        // Just do nothing and log
        //DEBUG (don't output log messages per doc)
        //logger.error(e1.getMessage());
      } catch (XPathExpressionException e1) {
        _context.getHarvestStatus().logMessage("Error evaluating xpath expression: " +  xpath, true);
      }
    }
    else if (m.scriptlang.equalsIgnoreCase("stream")) { // XML or JSON streaming interface
      // which one?
      try {
        boolean json = false;
        boolean xml = false;
        for (int i = 0; (i < 128) && (i < text.length()); ++i) {
          if ('<' == text.charAt(i)) {
            xml = true;
            break;
          }
          if ('{' == text.charAt(i)) {
            json = true;
            break;
          }
          if (!Character.isSpaceChar(text.charAt(i))) {
            break;
          }
        }//TESTED (too many spaces: meta_stream_test, test4; incorrect chars: test3, xml: test1, json: test2)
       
        boolean textNotObject = m.flags == null || !m.flags.contains("o");
       
        List<DocumentPojo> docs = new LinkedList<DocumentPojo>();
        List<String> levelOneFields = null;
        if (null != m.script) {
          levelOneFields = Arrays.asList(m.script.split("\\s*,\\s*"));
          if ((1 == levelOneFields.size()) && levelOneFields.get(0).isEmpty()) {
            // convert [""] to null
            levelOneFields = null;
          }
        }//TESTED (json and xml)
       
        if (xml) {
          XmlToMetadataParser parser = new XmlToMetadataParser(levelOneFields, null, null, null, null, null, Integer.MAX_VALUE);
          XMLInputFactory factory = XMLInputFactory.newInstance();
          factory.setProperty(XMLInputFactory.IS_COALESCING, true);
          factory.setProperty(XMLInputFactory.SUPPORT_DTD, false);
          XMLStreamReader reader = null;
          try {             
            reader = factory.createXMLStreamReader(new ByteArrayInputStream(text.getBytes()));
            docs = parser.parseDocument(reader, textNotObject);
          }
          finally {
            if (null != reader) reader.close();
          }
        }//TESTED (meta_stream_test, test1)
        if (json) {
          JsonReader jsonReader = null;
          try {         
            JsonToMetadataParser parser = new JsonToMetadataParser(null, levelOneFields, null, null, Integer.MAX_VALUE);
            jsonReader = new JsonReader(new InputStreamReader(new ByteArrayInputStream(text.getBytes()), "UTF-8"));
            jsonReader.setLenient(true);
            docs = parser.parseDocument(jsonReader, textNotObject);
          }
          finally {
            if (null != jsonReader) jsonReader.close();
          }
        }//TESTED (meta_stream_test test2)
       
        if (!docs.isEmpty()) {
          ArrayList<String> Llist = null;         
          ArrayList<Object> LlistObj = null;         
          if (textNotObject) {
            Llist = new ArrayList<String>(docs.size());
          }
          else {
            LlistObj = new ArrayList<Object>(docs.size());
          }
          for (DocumentPojo doc: docs) {
            if ((null != doc.getFullText()) || (null != doc.getMetadata())) {
              if (textNotObject) {
                Llist.add(doc.getFullText());
              }//TESTED
              else if (xml) {
                LlistObj.add(doc.getMetadata());               
              }//TESTED
              else if (json) {
                Object o = doc.getMetadata();
                if (null != o) {
                  o = doc.getMetadata().get("json");
                  if (null != o) {
                    LlistObj.add(o);
                  }
                }
              }//TESTED
            }
          }//TESTED
          if ((null != Llist) && !Llist.isEmpty()) {
            f.addToMetadata(m.fieldName, Llist.toArray());
          }//TESTED
          if ((null != LlistObj) && !LlistObj.isEmpty()) {
            f.addToMetadata(m.fieldName, LlistObj.toArray());
          }//TESTED
         
        }//TESTED (meta_stream_test test1,test2)
      }//(end try)
      catch (Exception e) { // various parsing errors
        _context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(), true);       
      }
    }//TESTED (meta_stream_test)
   
    // (don't currently support other script types)
  }

  private static String extractRegexFromXpath(String original_xpath) {
    Pattern addedRegex = Pattern.compile("regex\\(.*\\)\\s*$", Pattern.MULTILINE | Pattern.DOTALL);
    Matcher matcher = addedRegex.matcher(original_xpath);
    boolean matchFound = matcher.find();

    if (matchFound) {
      try {
        return matcher.group();
      } catch (Exception e) {
        return null;
      }
    }
    return null;

  }
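
  // Illustrative example (xpath hypothetical) - a trailing regex(...) clause is detected here and
  // stripped by the caller, leaving a plain xpath plus a regex to apply to each node's text:
  //
  //   extractRegexFromXpath("//div[@id='main']/text()regex(pattern)")
  //     // returns "regex(pattern)"; the caller removes it, leaving "//div[@id='main']/text()"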

  /**
   * cleanseText
   *
   * @param simpleTextCleanser
   * @param document
   */
  private void cleanseText(List<SimpleTextCleanserPojo> simpleTextCleanser, DocumentPojo document)
  {
    // Store these fields since we may need to re-generate them by concatenation
    StringBuffer fullTextBuilder = null;
    StringBuffer descriptionBuilder = null;
    StringBuffer titleBuilder = null;
    // (note no support for metadata concatenation, replace only)
   
    // Iterate over the cleanser functions that need to run on each feed
    for (SimpleTextCleanserPojo s : simpleTextCleanser) {
      boolean bConcat = (null != s.getFlags()) && s.getFlags().contains("+");
     
      boolean bUsingJavascript = ((null != s.getScriptlang()) && s.getScriptlang().equalsIgnoreCase("javascript"));
      if (s.getField().equalsIgnoreCase("fulltext")) {
        if ((null != document.getFullText()) || bUsingJavascript) {
          StringBuffer myBuilder = fullTextBuilder;
         
          if ((!bConcat) && (null != myBuilder) && (myBuilder.length() > 0)) {
            document.setFullText(myBuilder.toString());
            myBuilder.setLength(0);
          } //TESTED
         
          String res = cleanseField(document.getFullText(),
                        s.getScriptlang(), s.getScript(), s.getFlags(),
                        s.getReplacement(), document);         
          if (bConcat) {
            if (null == myBuilder) {
              fullTextBuilder = myBuilder = new StringBuffer();
            }
            myBuilder.append(res).append('\n');
          }
          else {
            document.setFullText(res);
          }         
        }
      } //TESTED
      else if (s.getField().equalsIgnoreCase("description")) {
        if ((null != document.getDescription()) || bUsingJavascript) {
          StringBuffer myBuilder = descriptionBuilder;
         
          if ((!bConcat) && (null != myBuilder) && (myBuilder.length() > 0)) {
            document.setDescription(myBuilder.toString());
            myBuilder.setLength(0);
          } //TESTED
         
          String res = cleanseField(document.getDescription(),
                        s.getScriptlang(), s.getScript(), s.getFlags(),
                        s.getReplacement(), document);
         
          if (bConcat) {
            if (null == myBuilder) {
              descriptionBuilder = myBuilder = new StringBuffer();
            }
            myBuilder.append(res).append('\n');
          }
          else {
            document.setDescription(res);
          }         
        }
      } //TESTED
      else if (s.getField().equalsIgnoreCase("title")) {
        if ((null != document.getTitle()) || bUsingJavascript) {
          StringBuffer myBuilder = titleBuilder;
         
          if ((!bConcat) && (null != myBuilder) && (myBuilder.length() > 0)) {
            document.setTitle(myBuilder.toString());
            myBuilder.setLength(0);
          } //TESTED
         
          String res = cleanseField(document.getTitle(),
                        s.getScriptlang(), s.getScript(), s.getFlags(),
                        s.getReplacement(), document);
          if (bConcat) {
            if (null == myBuilder) {
              titleBuilder = myBuilder = new StringBuffer();
            }
            myBuilder.append(res).append('\n');
          }
          else {
            document.setTitle(res);
          }         
        }
      } //TESTED
      else if (s.getField().startsWith("metadata.")) {
        // (note no support for metadata concatenation, replace only)
        String metaField = s.getField().substring(9); // (9 for "metadata.")
        Object[] meta = document.getMetadata().get(metaField);
        if ((null != meta) && (meta.length > 0)) {
          Object[] newMeta = new Object[meta.length];
          for (int i = 0; i < meta.length; ++i) {
            Object metaValue = meta[i];
            if (metaValue instanceof String) {
              newMeta[i] = (Object) cleanseField(
                  (String) metaValue, s.getScriptlang(),
                  s.getScript(), s.getFlags(),
                  s.getReplacement(), document);
            } else {
              newMeta[i] = metaValue;
            }
          }
          // Overwrite the old fields
          document.addToMetadata(metaField, newMeta);
        }
      }     
      // These fields are sufficient for the moment
     
    } // (end loop over fields)
   
    // Handle any left over cases:
    if ((null != fullTextBuilder) && (fullTextBuilder.length() > 0)) {
      document.setFullText(fullTextBuilder.toString());
    } //TESTED
    if ((null != descriptionBuilder) && (descriptionBuilder.length() > 0)) {
      document.setDescription(descriptionBuilder.toString());
    } //TESTED
    if ((null != titleBuilder) && (titleBuilder.length() > 0)) {
      document.setTitle(titleBuilder.toString());
    } //TESTED
   
  }// TESTED
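
  // Illustrative sketch (values hypothetical; JSON field names assumed to match the pojo setters) -
  // two fulltext cleansers, the second using the "+" flag so its output is concatenated via the
  // StringBuffer logic above instead of replacing the field:
  //
  //   "simpleTextCleanser": [
  //     { "field": "fulltext", "script": "\\r", "replacement": "" },
  //     { "field": "fulltext", "flags": "+", "script": "(?s)<p>(.*?)</p>", "replacement": "$1" }
  //   ]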

  /**
   * cleanseField
   *
   * @param field
   * @param scriptLang
   * @param script
   * @param flags
   * @param replaceWith
   * @param f
   */
  private String cleanseField(String field, String scriptLang, String script,
                  String flags, String replaceWith, DocumentPojo f)
  {
    if ((null == scriptLang) || scriptLang.equalsIgnoreCase("regex")) {
      if (null == flags) {
        return field.replaceAll(script, replaceWith);
      }
      else {
        if (flags.contains("H")) { // HTML decode
          return StringEscapeUtils.unescapeHtml(createRegex(script,flags).matcher(field).replaceAll(replaceWith));
        } else {
          return createRegex(script, flags).matcher(field).replaceAll(replaceWith);
        }
      }
    }
    else if (scriptLang.equalsIgnoreCase("xpath")) {
     
      try {
        createHtmlCleanerIfNeeded();

        TagNode node = cleaner.clean(new ByteArrayInputStream(field.getBytes()));

        Document doc = new DomSerializer(new CleanerProperties()).createDOM(node);
        XPath xpa = XPathFactory.newInstance().newXPath();       
       
        NodeList res = (NodeList)xpa.evaluate(script, doc, XPathConstants.NODESET);

        if (0 == res.getLength()) { // No match, just return "", unlike regex we don't want anything if we don't match...
          return "";         
        }
        else {
          StringBuffer sb = new StringBuffer();
          for (int i= 0; i< res.getLength(); i++) {
            if (0 != i) {
              sb.append('\n');
            }
            Node info_node = res.item(i);

            if ((null != flags) && flags.contains("H")) { // HTML decode
              sb.append(StringEscapeUtils.unescapeHtml(info_node.getTextContent().trim()));
            }
            else if ((null != flags) && flags.contains("x")) { // Leave as XML string
              StringWriter writer = new StringWriter();
              try {
                Transformer transformer = TransformerFactory.newInstance().newTransformer();
                transformer.transform(new DOMSource(info_node), new StreamResult(writer));
                sb.append(writer.toString().substring(38)); // (step over the <?xml ... ?> declaration - see the metadata field extraction above)
              }
              catch (TransformerException e1) { // (do nothing just skip)
              }
            }
            else {
              sb.append(info_node.getTextContent().trim());           
            }                   
          }                   
          return sb.toString();
        }//TESTED (xpath_test: object - multiple and single, text)
       
      } catch (IOException e) {
        _context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(), true);
      }
      catch (XPathExpressionException e) {
        _context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(), true);
      }
      catch (ParserConfigurationException e) {
        _context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(), true);
      }
    }
    else if (scriptLang.equalsIgnoreCase("javascript")) {
      try {
        SourcePojo src = f.getTempSource();
        intializeScriptEngine(src, src.getUnstructuredAnalysisConfig());

        // Setup input:
        if (null == flags) {
          flags = "t";
        }
        if (flags.contains("t")) { // text
          engine.put("text", field);             
        }
        if (flags.contains("d")) { // entire document
          GsonBuilder gb = new GsonBuilder();
          Gson g = gb.create();
          List<EntityPojo> ents = f.getEntities();
          List<AssociationPojo> assocs = f.getAssociations();
          try {
            f.setEntities(null);
            f.setAssociations(null);
                engine.put("document", g.toJson(f));
                securityManager.eval(engine, JavaScriptUtils.initScript);
          }
          finally {
            f.setEntities(ents);
            f.setAssociations(assocs);
          }
        }
        if (flags.contains("m")) { // metadata
          GsonBuilder gb = new GsonBuilder();
          Gson g = gb.create();
          engine.put("_metadata", g.toJson(f.getMetadata()));
          securityManager.eval(engine, JavaScriptUtils.iteratorMetaScript);
        }
        Object returnVal = securityManager.eval(engine, script);
        field = (String) returnVal; // (If not a string or is null then will exception out)
        if ((null != flags) && flags.contains("H") && (null != field)) { // HTML decode
          field = StringEscapeUtils.unescapeHtml(field);
        }
      }
      catch (Exception e) {
        _context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(), true);

        // Just do nothing and log
        // e.printStackTrace();
        //DEBUG (don't output log messages per doc)
        //logger.error(e.getMessage());
      }
    }
    return field;
  }
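
  // Illustrative sketch (not part of the original source): what the regex branch
  // of cleanseField does for a typical cleanser - compile the script as a Pattern
  // (see createRegex below), replace all matches, then HTML-decode when the "H"
  // flag is set. The field value and pattern here are made-up examples.
  private static String demoRegexCleanse() {
    String field = "Price:&nbsp;&pound;100  (SALE)";
    // Strip the parenthesised suffix case-insensitively ("i" flag):
    String stripped = createRegex("\\s*\\(sale\\)", "i").matcher(field).replaceAll("");
    // "H" flag behaviour - decode HTML entities:
    return StringEscapeUtils.unescapeHtml(stripped); // -> "Price: £100" (with a non-breaking space)
  }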
 
  // Handles parsing of HTML tables to Objects that can be easily printed as JSON. (flag = g)
  // 1] No Replace Value - The first row of the table is used as the headers
  // 2] Replace Value = "[]" - Headers are set to the column count number (beginning with 0), eg "0","1"
  // 3a] Replace Value = "[one,two,three]" - The provided headers are used as the headers
  // 3b] Replace Values set, but more data columns than values provided - Each additional column that was
  //    not specified is assigned its column count number, eg "specified","1","2"
  // 4] Replace Value = "[one,null,three]" - Columns specified as null in the provided header are skipped,
  //    eg "one","three"
  // (See the illustrative demo after this method.)
 
  private static HashMap<String, Object> parseHtmlTable(Node table_node, String replaceWith)
  {
    if (table_node.getNodeName().equalsIgnoreCase("table") && table_node.hasChildNodes())
    {
      Node topNode = table_node;
     
      boolean tbody = table_node.getFirstChild().getNodeName().equalsIgnoreCase("tbody");
     
      if (tbody)
        topNode = table_node.getFirstChild();
     
      if (topNode.hasChildNodes())
      {
        NodeList rows = topNode.getChildNodes();
       
        List<String> headers = null;
        ArrayList<HashMap<String, String>> data = null;
        int headerLength = 0;
        boolean[] skip = null;
       
        if (null != replaceWith)
        {
          if (replaceWith.equals("[]")){
            headers = new ArrayList<String>();
            headerLength = 0;
          } // TESTED (by eye - 2)
          else
          {
            //Remove square brackets
            if(replaceWith.startsWith("[") && replaceWith.endsWith("]"))
              replaceWith = replaceWith.substring(1, replaceWith.length()-1);
            //Turn the provided list of headers into a list object
            headers = Arrays.asList(replaceWith.split("\\s*,\\s*"));
            headerLength = headers.size();
            skip = new boolean[headerLength];
            for(int h = 0; h < headerLength; h++)
            {
              String val = headers.get(h);
              if (val.length() == 0 || val.equalsIgnoreCase("null"))
                skip[h] = true;
              else
                skip[h] = false;
            }
           
          }// TESTED (by eye - 3a)
        }
       
        //traverse rows
        for(int i = 0; i < rows.getLength(); i++)
        {
          Node row = rows.item(i);
          if (row.getNodeName().equalsIgnoreCase("tr") || row.getNodeName().equalsIgnoreCase("th"))
          {
            //If the header value has not been set, the first row will be set as the headers
            if (null == headers)
            {
              //Traverse through cells
              headers = new ArrayList<String>();
              if (row.hasChildNodes())
              {
                NodeList cells = row.getChildNodes();
                headerLength = cells.getLength();
                skip = new boolean[headerLength];
                for (int j = 0; j < headerLength; j++)
                {
                  headers.add(cells.item(j).getTextContent());
                  skip[j] = false;
                }
              } // TESTED (by eye - 1)
            }
            else
            {
              if (null == data)
              {
                data = new ArrayList<HashMap<String,String>>();
              }
              if (row.hasChildNodes())
              {
                HashMap<String,String> cellList = new HashMap<String,String>();
                NodeList cells = row.getChildNodes();
                for (int j = 0; j < cells.getLength(); j++)
                {
                  // Skip Code (TESTED by eye - 4)
                  if (headerLength == 0 || (j < headerLength && !skip[j]))
                  {
                    String key = Integer.toString(j); // TESTED (by eye - 3b)
                    if (j < headerLength)
                      key = headers.get(j);
 
                    cellList.put(key, cells.item(j).getTextContent());
                  }
                }
                data.add(cellList);
              }
             
            }
          }
        }
        //Create final hashmap containing attributes
        HashMap<String,Object> table_attrib = new HashMap<String, Object>();
       
        NamedNodeMap nnm = table_node.getAttributes();
        for (int i = 0; i < nnm.getLength(); i++)
        {
          Node att = nnm.item(i);
          table_attrib.put(att.getNodeName(), att.getNodeValue());
        }
        table_attrib.put("table", data);
       
        //TESTED (by eye) attributes added to table value
        // eg: {"id":"search","cellpadding":"1","table":[{"Status":"B","two":"ONE6313" ......
       
        return table_attrib;
      }     
    }
    return null;
  }
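
  // Minimal demo (illustrative only, not part of the original source): runs a
  // hypothetical two-row table through the same HtmlCleaner/DomSerializer path
  // used by cleanseField, then applies rule 1 above (no replace value, so the
  // first row becomes the headers).
  private static HashMap<String, Object> demoParseHtmlTable() throws ParserConfigurationException {
    String html = "<table id=\"t1\"><tr><td>Name</td><td>Age</td></tr>"
        + "<tr><td>Alice</td><td>30</td></tr></table>";
    TagNode cleaned = new HtmlCleaner().clean(html);
    Document dom = new DomSerializer(new CleanerProperties()).createDOM(cleaned);
    Node table = dom.getElementsByTagName("table").item(0);
    // Expected shape: {"id":"t1","table":[{"Name":"Alice","Age":"30"}]}
    return parseHtmlTable(table, null);
  }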

  private static Pattern createRegex(String regEx, String flags) {
    int nflags = 0;

    if (null != flags) {
      for (int i = 0; i < flags.length(); ++i) {
        char c = flags.charAt(i);
        switch (c) {
        case 'm':
          nflags |= Pattern.MULTILINE;
          break;
        case 'i':
          nflags |= Pattern.CASE_INSENSITIVE;
          break;
        case 'd':
          nflags |= Pattern.DOTALL;
          break;
        case 'u':
          nflags |= Pattern.UNICODE_CASE;
          break;
        case 'n':
          nflags |= Pattern.UNIX_LINES;
          break;
        }
      }
    }
    return Pattern.compile(regEx, nflags);
  }
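
  // Minimal demo (illustrative only, not in the original source): the flag
  // string maps one character per java.util.regex.Pattern flag, so "mi" is
  // equivalent to MULTILINE | CASE_INSENSITIVE; characters consumed elsewhere
  // (eg "H", "t", "x") are simply ignored here.
  private static boolean demoCreateRegex() {
    Pattern p = createRegex("^error:", "mi");
    // 'm' lets ^ match at each line start, 'i' ignores case:
    return p.matcher("ok\nERROR: disk full").find(); // true
  }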

  // Utility to minimise number of times the cleaner is created
 
  private void createHtmlCleanerIfNeeded()
  {
    if (null == cleaner) {
      cleaner = new HtmlCleaner();
      CleanerProperties props = cleaner.getProperties();
      props.setAllowHtmlInsideAttributes(true);
      props.setAllowMultiWordAttributes(true);
      props.setRecognizeUnicodeChars(true);
      props.setOmitComments(true);
      props.setTreatUnknownTagsAsContent(false);
      props.setTranslateSpecialEntities(true);
      props.setTransResCharsToNCR(true);
      props.setNamespacesAware(false);
    }   
  }

  public void set_sahEngine(ScriptEngine _sahEngine) {
    this._sahEngine = _sahEngine;
  }

  public ScriptEngine get_sahEngine() {
    return _sahEngine;
  }

  ///////////////////////////////////////////////////
 
  // Javascript scripting utilities:
 
  public void intializeScriptEngine(SourcePojo source, UnstructuredAnalysisConfigPojo uap) {
    if ( null == engine )
    {
      //use the passed in sah one if possible
      if ( null != this.get_sahEngine())
      {
        engine = this.get_sahEngine();
      }
      else if (null == factory) //otherwise create our own
      {
        factory = new ScriptEngineManager();
        engine = factory.getEngineByName("JavaScript");   
        //grab any json cache and make it available to the engine
      }
      //once engine is created, do some initialization
      if ( null != engine )
      {
        if (null != source) {
          loadLookupCaches(uap.getCaches(), source.getCommunityIds());
          List<String> scriptFiles = null;
          if (null != uap.getScriptFiles()) {
            scriptFiles = Arrays.asList(uap.getScriptFiles());
          }
          loadGlobalFunctions(scriptFiles, uap.getScript());
        }
        if (null == parsingScript)  {
          parsingScript = JavaScriptUtils.generateParsingScript();
        }
        try  {
          securityManager.eval(engine, parsingScript);           
        }
        catch (ScriptException e) { // Just do nothing and log
          e.printStackTrace();
          logger.error("intializeScriptEngine: " + e.getMessage());
        }
       
      }
    }//end start engine up   
   
  }//TESTED (legacy + imports_and_lookup_test.json + imports_and_lookup_test_uahSah.json)
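
  // Usage sketch (hypothetical, not part of the original source): once the
  // engine exists, callers bind inputs and evaluate cleanser scripts through
  // the security manager, mirroring the "t" (text) flag handling in cleanseField.
  private String demoJavascriptCleanse(SourcePojo source, String rawText) throws ScriptException {
    intializeScriptEngine(source, source.getUnstructuredAnalysisConfig());
    engine.put("text", rawText); // same binding the "t" flag uses
    return (String) securityManager.eval(engine, "text.trim().toLowerCase();");
  }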
 
  //////////////////////////////////////////////////////
 
  // Utilities that legacy mode calls from intializeScriptEngine, but which can
  // also be called standalone in pipelined mode:
 
  public void loadLookupCaches(Map<String, ObjectId> caches, Set<ObjectId> communityIds)
  {
    try
    {
      if (null != caches) {
        CacheUtils.addJSONCachesToEngine(caches, engine, securityManager, communityIds, _context);
      }
    }
    catch (Exception ex)
    {
      _context.getHarvestStatus().logMessage("JSONcache: " + ex.getMessage(), true);           
      //(no need to log this, appears in log under source -with URL- anyway):
      //logger.error("JSONcache: " + ex.getMessage(), ex);
    }
  }//TESTED (legacy + imports_and_lookup_test.json)
 
  public void loadGlobalFunctions(List<String> imports, String script)
  {
    // Pass scripts into the engine
    try
    {
      // Eval script passed in s.script
      if (script != null) securityManager.eval(engine, script);

      // Retrieve and eval script files in s.scriptFiles
      if (imports != null)
      {
        for (String file : imports)
        {
          try
          {
            securityManager.eval(engine, JavaScriptUtils.getJavaScriptFile(file, securityManager));
          }
          catch (Exception e)
          {
            this._context.getHarvestStatus().logMessage("ScriptException (imports): " + e.getMessage(), true);
            //DEBUG
            //logger.error("ScriptException (imports): " + e.getMessage(), e);
          }
        }
      }
    }
    catch (ScriptException e)
    {
      this._context.getHarvestStatus().logMessage("ScriptException (globals): " + e.getMessage(), true);
      //DEBUG
      //logger.error("ScriptException: " + e.getMessage(), e);
    }
  }//TESTED (legacy + imports_and_lookup_test.json)
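
  // Usage sketch (illustrative only; the import URL and inline helper are
  // hypothetical): pipelined mode can call this directly, just as legacy mode
  // wires in uap.getScriptFiles() and uap.getScript() from intializeScriptEngine.
  private void demoLoadGlobalFunctions() {
    loadGlobalFunctions(Arrays.asList("http://example.com/js/common.js"),
        "function cleanDate(s) { return s.replace(/-/g, '/'); }");
  }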
 
  // UTILITY - CURRENTLY ONLY JS CAN SURVIVE WITHOUT TEXT...
 
  private static boolean doesScriptLangNeedText(String scriptLang) {
    return !scriptLang.equalsIgnoreCase("javascript");
  }//TESTED (content_needed_test)
}