/*******************************************************************************
* Copyright 2012, The Infinit.e Open Source Project.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License, version 3,
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
******************************************************************************/
package com.ikanow.infinit.e.harvest.enrichment.custom;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.StringWriter;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Scanner;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.script.ScriptEngine;
import javax.script.ScriptEngineManager;
import javax.script.ScriptException;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamReader;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import org.apache.commons.lang.StringEscapeUtils;
import org.apache.log4j.Logger;
import org.bson.types.ObjectId;
import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.DomSerializer;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.json.JSONException;
import org.json.JSONObject;
import org.json.XML;
import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.google.gson.stream.JsonReader;
import com.ikanow.infinit.e.data_model.InfiniteEnums.ExtractorDocumentLevelException;
import com.ikanow.infinit.e.data_model.store.config.source.SimpleTextCleanserPojo;
import com.ikanow.infinit.e.data_model.store.config.source.SourcePipelinePojo.ManualTextExtractionSpecPojo;
import com.ikanow.infinit.e.data_model.store.config.source.SourcePipelinePojo.MetadataSpecPojo;
import com.ikanow.infinit.e.data_model.store.config.source.SourcePojo;
import com.ikanow.infinit.e.data_model.store.config.source.SourceRssConfigPojo;
import com.ikanow.infinit.e.data_model.store.config.source.UnstructuredAnalysisConfigPojo;
import com.ikanow.infinit.e.data_model.store.config.source.UnstructuredAnalysisConfigPojo.Context;
import com.ikanow.infinit.e.data_model.store.config.source.UnstructuredAnalysisConfigPojo.metaField;
import com.ikanow.infinit.e.data_model.store.document.AssociationPojo;
import com.ikanow.infinit.e.data_model.store.document.DocumentPojo;
import com.ikanow.infinit.e.data_model.store.document.EntityPojo;
import com.ikanow.infinit.e.data_model.utils.IkanowSecurityManager;
import com.ikanow.infinit.e.harvest.HarvestContext;
import com.ikanow.infinit.e.harvest.HarvestController;
import com.ikanow.infinit.e.harvest.extraction.document.file.JsonToMetadataParser;
import com.ikanow.infinit.e.harvest.extraction.document.file.XmlToMetadataParser;
import com.ikanow.infinit.e.harvest.extraction.text.legacy.TextExtractorTika;
import com.ikanow.infinit.e.harvest.utils.HarvestExceptionUtils;
import com.ikanow.infinit.e.harvest.utils.PropertiesManager;
import com.ikanow.infinit.e.harvest.utils.ProxyManager;
import com.mongodb.BasicDBList;
import com.mongodb.BasicDBObject;
/**
* UnstructuredAnalysisHarvester
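*
* Applies per-source unstructured analysis to harvested documents: header/footer extraction,
* simple text cleansing, and metadata extraction via regex, xpath, javascript, or streaming
* XML/JSON parsing. Supports both the legacy UnstructuredAnalysisConfigPojo format and the
* newer processing pipeline (setContext/doManualTextEnrichment/processMetadataChain).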
*/
public class UnstructuredAnalysisHarvester {
///////////////////////////////////////////////////////////////////////////////////////////
// NEW PROCESSING PIPELINE INTERFACE
//TODO (INF-1922): Handle headers and footers
public void setContext(HarvestContext context) {
_context = context;
securityManager = _context.getSecurityManager();
}
// Transform the doc's text (go get it if necessary)
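// (Maps each ManualTextExtractionSpecPojo onto the legacy SimpleTextCleanserPojo format and runs the
// legacy cleanseText logic; if "fullText" is being transformed and the doc has no text yet, the raw
// body is fetched first and returned so the caller can cache it)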
public String doManualTextEnrichment(DocumentPojo doc, List<ManualTextExtractionSpecPojo> textExtractors, SourceRssConfigPojo feedConfig) throws IOException {
String cachedFullText = null;
// Map to the legacy format and then call the legacy code
ArrayList<SimpleTextCleanserPojo> mappedTextExtractors = new ArrayList<SimpleTextCleanserPojo>(textExtractors.size());
for (ManualTextExtractionSpecPojo textExtractor: textExtractors) {
if (DocumentPojo.fullText_.equalsIgnoreCase(textExtractor.fieldName)) {
boolean fullTextNeeded = (null == doc.getFullText()); // (check here so we can cache it)
if (fullTextNeeded) {
getRawTextFromUrlIfNeeded(doc, feedConfig);
// (if transforming full text then grab the raw body from the URL if necessary)
cachedFullText = doc.getFullText();
}//TESTED (by hand)
}
SimpleTextCleanserPojo mappedTextExtractor = new SimpleTextCleanserPojo();
mappedTextExtractor.setField(textExtractor.fieldName);
mappedTextExtractor.setFlags(textExtractor.flags);
mappedTextExtractor.setScript(textExtractor.script);
mappedTextExtractor.setScriptlang(textExtractor.scriptlang);
mappedTextExtractor.setReplacement(textExtractor.replacement);
mappedTextExtractors.add(mappedTextExtractor);
}
this.cleanseText(mappedTextExtractors, doc);
return cachedFullText;
}
//TESTED (fulltext_regexTests.json)
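// Maps each MetadataSpecPojo onto the legacy metaField format (fetching the raw text first when the
// script language or flags require it) and runs the legacy processMeta logic for each field; fields
// with store==false are added to unstoredFields (and removed again if a later block re-enables storage).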
public void processMetadataChain(DocumentPojo doc, List<MetadataSpecPojo> metadataFields, SourceRssConfigPojo feedConfig, HashSet<String> unstoredFields) throws IOException
{
// Map metadata list to a legacy meta format (they're really similar...)
UnstructuredAnalysisConfigPojo.metaField mappedEl = new UnstructuredAnalysisConfigPojo.metaField();
boolean textSet = false;
for (MetadataSpecPojo meta: metadataFields) {
mappedEl.fieldName = meta.fieldName;
mappedEl.context = Context.All;
mappedEl.flags = meta.flags;
if (null == mappedEl.flags) {
mappedEl.flags = "";
}
boolean scriptLangNeedsText = doesScriptLangNeedText(meta.scriptlang);
// (with js you can operate on metadata alone; other languages need the text ...
// actually that's not strictly true because of chaining, but we'll just ignore that for now)
if (scriptLangNeedsText || (null == mappedEl.flags) || mappedEl.flags.isEmpty() || mappedEl.flags.contains("t")) {
if (!textSet) {
getRawTextFromUrlIfNeeded(doc, feedConfig);
textSet = true;
}
}//TESTED (content_needed_test)
mappedEl.scriptlang = meta.scriptlang;
mappedEl.script = meta.script;
mappedEl.replace = meta.replace;
mappedEl.groupNum = null;
//(no group num - just use replace, and flags "o" for xpath/gN:-1)
// Storage of fields:
if ((null != meta.store) && !meta.store) {
unstoredFields.add(meta.fieldName);
}
else if (unstoredFields.contains(meta.fieldName)) {
unstoredFields.remove(meta.fieldName);
}
this.processMeta(doc, mappedEl, doc.getFullText(), null, null);
}
//TESTED (storageSettings_advanced.json)
}
//TESTED (fulltext_regexTests.json, storageSettings_advanced.json)
///////////////////////////////////////////////////////////////////////////////////////////
// PROCESSING PIPELINE - UTILITIES
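// Fetches the document's raw text over HTTP if it isn't already populated: applies any proxy, user-agent
// and header overrides from the feed config (a "content" http field is sent as a POST body), retries once
// on failure, captures interesting response headers (X-*, Set-*, Location, non-200 response codes) into
// the "__FEED_METADATA__" metadata field, and stores the response body as the document's full text.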
public void getRawTextFromUrlIfNeeded(DocumentPojo doc, SourceRssConfigPojo feedConfig) throws IOException {
if (null != doc.getFullText()) { // Nothing to do
return;
}
Scanner s = null;
OutputStreamWriter wr = null;
try {
URL url = new URL(doc.getUrl());
URLConnection urlConnect = null;
String postContent = null;
if (null != feedConfig) {
urlConnect = url.openConnection(ProxyManager.getProxy(url, feedConfig.getProxyOverride()));
if (null != feedConfig.getUserAgent()) {
urlConnect.setRequestProperty("User-Agent", feedConfig.getUserAgent());
}// TESTED (by hand)
if (null != feedConfig.getHttpFields()) {
for (Map.Entry<String, String> httpFieldPair: feedConfig.getHttpFields().entrySet()) {
if (httpFieldPair.getKey().equalsIgnoreCase("content")) {
postContent = httpFieldPair.getValue();
urlConnect.setDoInput(true);
urlConnect.setDoOutput(true);
}
else {
urlConnect.setRequestProperty(httpFieldPair.getKey(), httpFieldPair.getValue());
}
}
}//TESTED (by hand)
}
else {
urlConnect = url.openConnection();
}
InputStream urlStream = null;
try {
securityManager.setSecureFlag(true); // (disallow file/local URL access)
if (null != postContent) {
wr = new OutputStreamWriter(urlConnect.getOutputStream());
wr.write(postContent.toCharArray());
wr.flush();
}//TESTED
urlStream = urlConnect.getInputStream();
}
catch (SecurityException se) {
throw se;
}
catch (Exception e) { // Try one more time, this time exception out all the way
securityManager.setSecureFlag(false); // (some file stuff - so need to re-enable)
if (null != feedConfig) {
urlConnect = url.openConnection(ProxyManager.getProxy(url, feedConfig.getProxyOverride()));
if (null != feedConfig.getUserAgent()) {
urlConnect.setRequestProperty("User-Agent", feedConfig.getUserAgent());
}// TESTED
if (null != feedConfig.getHttpFields()) {
for (Map.Entry<String, String> httpFieldPair: feedConfig.getHttpFields().entrySet()) {
if (httpFieldPair.getKey().equalsIgnoreCase("content")) {
urlConnect.setDoInput(true); // (need to do this again)
urlConnect.setDoOutput(true);
}
else {
urlConnect.setRequestProperty(httpFieldPair.getKey(), httpFieldPair.getValue());
}
}
}//TESTED
}
else {
urlConnect = url.openConnection();
}
securityManager.setSecureFlag(true); // (disallow file/local URL access)
if (null != postContent) {
wr = new OutputStreamWriter(urlConnect.getOutputStream());
wr.write(postContent.toCharArray());
wr.flush();
}//TESTED
urlStream = urlConnect.getInputStream();
}
finally {
securityManager.setSecureFlag(false); // (turn security check for local URL/file access off)
}
// Grab any interesting header fields
Map<String, List<String>> headers = urlConnect.getHeaderFields();
BasicDBObject metadataHeaderObj = null;
for (Map.Entry<String, List<String>> it: headers.entrySet()) {
if (null != it.getKey()) {
if (it.getKey().startsWith("X-") || it.getKey().startsWith("Set-") || it.getKey().startsWith("Location")) {
if (null == metadataHeaderObj) {
metadataHeaderObj = new BasicDBObject();
}
metadataHeaderObj.put(it.getKey(), it.getValue());
}
}
}//TESTED
// Grab the response code
try {
HttpURLConnection httpUrlConnect = (HttpURLConnection)urlConnect;
int responseCode = httpUrlConnect.getResponseCode();
if (200 != responseCode) {
if (null == metadataHeaderObj) {
metadataHeaderObj = new BasicDBObject();
}
metadataHeaderObj.put("responseCode", String.valueOf(responseCode));
}
}//TESTED
catch (Exception e) {} // interesting, not an HTTP connect ... shrug and carry on
if (null != metadataHeaderObj) {
doc.addToMetadata("__FEED_METADATA__", metadataHeaderObj);
}//TESTED
s = new Scanner(urlStream, "UTF-8");
doc.setFullText(s.useDelimiter("\\A").next());
}
catch (MalformedURLException me) { // This one is worthy of a more useful error message
throw new MalformedURLException(me.getMessage() + ": Likely because the document has no full text (eg JSON) and you are calling a contentMetadata block without setting flags:'m' or 'd'");
}
finally { //(release resources)
if (null != s) {
s.close();
}
if (null != wr) {
wr.close();
}
}
}//TESTED (cut-and-paste from existing code, so new testing very cursory)
///////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////
// LEGACY CODE - USE TO SUPPORT OLD CODE FOR NOW + AS UTILITY CODE FOR THE PIPELINE LOGIC
// Per-source state
private Pattern headerPattern = null;
private Pattern footerPattern = null;
private UnstructuredAnalysisConfigPojo savedUap = null;
// Javascript handling, if needed
private ScriptEngineManager factory = null;
private ScriptEngine engine = null;
private static String parsingScript = null;
// Using Tika to process documents:
TextExtractorTika tikaExtractor = null;
private HarvestContext _context = null;
private Logger logger = Logger
.getLogger(UnstructuredAnalysisHarvester.class);
// (some web scraping may be needed)
private long nBetweenDocs_ms = -1;
// (set this in executeHarvest - makes it easy to set it only once in the per-doc version called in bulk from the SAH)
// Ensure we don't get long list of duplicates for commonly occurring words
private HashSet<String> regexDuplicates = null;
private HtmlCleaner cleaner = null;
//if the sah already init'd an engine we'll just use it
private ScriptEngine _sahEngine = null;
private IkanowSecurityManager securityManager = null;
/**
* Default Constructor
*/
public UnstructuredAnalysisHarvester() {
}
// For harvest pipeline, just ensures duplicate map exists and is empty for each doc
public void resetForNewDoc() {
if ((null == regexDuplicates) || (!regexDuplicates.isEmpty())) {
regexDuplicates = new HashSet<String>();
}
}
/**
* executeHarvest - legacy bulk entry point
*
* @param contextController the harvest controller (used for status logging and extractor error handling)
* @param source the source being harvested
* @param documents the documents to enrich
* @return the (possibly pruned) list of documents, or an empty list if no unstructured analysis config is set
*/
public List<DocumentPojo> executeHarvest(HarvestController contextController, SourcePojo source, List<DocumentPojo> documents)
{
nBetweenDocs_ms = -1;
// Can override the default (feed) wait time from within the source (eg
// for sites that we know don't get upset about getting hammered)
if (null != source.getRssConfig()) {
if (null != source.getRssConfig().getWaitTimeOverride_ms()) {
nBetweenDocs_ms = source.getRssConfig().getWaitTimeOverride_ms();
}
}
if (-1 == nBetweenDocs_ms) {
PropertiesManager props = new PropertiesManager();
nBetweenDocs_ms = props.getWebCrawlWaitTime();
}
// TESTED: default and overridden values
_context = contextController;
securityManager = _context.getSecurityManager();
UnstructuredAnalysisConfigPojo uap = source.getUnstructuredAnalysisConfig();
if (uap != null) {
boolean bGetRawDoc = source.getExtractType().equalsIgnoreCase("feed");
String headerRegEx = uap.getHeaderRegEx();
String footerRegEx = uap.getFooterRegEx();
List<metaField> meta = uap.getMeta();
if (headerRegEx != null)
headerPattern = createRegex(headerRegEx, uap.getHeaderRegExFlags());
if (footerRegEx != null)
footerPattern = createRegex(footerRegEx, uap.getFooterRegExFlags());
Iterator<DocumentPojo> it = documents.iterator();
int nDocs = 0;
while (it.hasNext()) {
nDocs++;
DocumentPojo d = it.next();
regexDuplicates = new HashSet<String>();
cleaner = null;
// For feeds we may need to go and get the document text manually;
// it's a bit ugly since we may then fetch the same data again
// for full text extraction
boolean bFetchedUrl = false;
if (bGetRawDoc && (null == d.getFullText())) {
if (null == source.getRssConfig()) {
source.setRssConfig(new SourceRssConfigPojo()); // (makes logic easier down the road)
}
// (first time through, sleep following a URL/RSS access)
if ((1 == nDocs) && (null != source.getUrl())) { // (have already made a call to the RSS (or "searchConfig") URL)
try {
Thread.sleep(nBetweenDocs_ms);
} catch (InterruptedException e) {
}
}
// TESTED (first time only, correct value after searchConfig override)
try {
if ((null != source.useTextExtractor()) && source.useTextExtractor().equalsIgnoreCase("tika")) {
// Special case: if tika enabled then do that first
// (create the extractor lazily, but extract text for every document, not just the first)
if (null == tikaExtractor) {
tikaExtractor = new TextExtractorTika();
}
tikaExtractor.extractText(d);
}
else {
this.getRawTextFromUrlIfNeeded(d, source.getRssConfig());
}
bFetchedUrl = true;
} catch (Exception e) { // Failed to get full text twice, remove doc
if (e instanceof SecurityException) { // This seems worthy of actually logging, even though it's a lowly doc error
contextController.getHarvestStatus().logMessage(e.getMessage(), true);
}
contextController.handleExtractError(e, source); //handle extractor error if need be
it.remove();
d.setTempSource(null); // (can safely corrupt this doc since it's been removed)
continue;
}
}
long nTime_ms = System.currentTimeMillis();
// ^^^ (end slight hack to get raw text to the UAH for RSS feeds)
try {
processBody(d, meta, true, source, uap);
} catch (Exception e) {
this._context.getHarvestStatus().logMessage("processBody1: " + e.getMessage(), true);
//DEBUG (don't output log messages per doc)
//logger.error("processBody1: " + e.getMessage(), e);
}
try {
if (uap.getSimpleTextCleanser() != null) {
cleanseText(uap.getSimpleTextCleanser(), d);
}
} catch (Exception e) {
this._context.getHarvestStatus().logMessage("cleanseText: " + e.getMessage(), true);
//DEBUG (don't output log messages per doc)
//logger.error("cleanseText: " + e.getMessage(), e);
}
try {
processHeader(headerPattern, d, meta, source, uap);
processFooter(footerPattern, d, meta, source, uap);
} catch (Exception e) {
this._context.getHarvestStatus().logMessage("header/footerPattern: " + e.getMessage(), true);
//DEBUG (don't output log messages per doc)
//logger.error("header/footerPattern: " + e.getMessage(), e);
}
try {
processBody(d, meta, false, source, uap);
} catch (Exception e) {
this._context.getHarvestStatus().logMessage("processBody2: " + e.getMessage(), true);
//DEBUG (don't output log messages per doc)
//logger.error("processBody2: " + e.getMessage(), e);
}
if (it.hasNext() && bFetchedUrl) {
nTime_ms = nBetweenDocs_ms
- (System.currentTimeMillis() - nTime_ms); // (ie delay time - processing time)
if (nTime_ms > 0) {
try {
Thread.sleep(nTime_ms);
} catch (InterruptedException e) {
}
}
} // (end politeness delay for URL fetching from a single source (likely a single site))
}
return documents;
}
return new ArrayList<DocumentPojo>();
}
/**
* executeHarvest - per-document calls (note exception handling happens in
* the SAH)
*
* @param context
* @param source
* @param doc
* @param bFirstTime true for the first document from this source
* @param bMoreDocs true if more documents will follow (enables the politeness delay)
* @return true if the document's metadata was changed
* @throws ExtractorDocumentLevelException
*/
public boolean executeHarvest(HarvestContext context, SourcePojo source, DocumentPojo doc, boolean bFirstTime, boolean bMoreDocs) throws ExtractorDocumentLevelException
{
regexDuplicates = new HashSet<String>();
cleaner = null;
boolean bGetRawDoc = source.getExtractType().equalsIgnoreCase("feed")
&& (null == doc.getFullText());
// (ie don't have full text and will need to go fetch it from network)
if (bFirstTime) {
nBetweenDocs_ms = -1; // (reset eg between searchConfig and SAH)
}
if ((-1 == nBetweenDocs_ms) && bGetRawDoc && (bMoreDocs || bFirstTime)) { // (don't bother if not using it...)
// Can override the default (feed) wait time from within the source
// (eg for sites that we know
// don't get upset about getting hammered)
if (null != source.getRssConfig()) {
if (null != source.getRssConfig().getWaitTimeOverride_ms()) {
nBetweenDocs_ms = source.getRssConfig().getWaitTimeOverride_ms();
}
}
if (-1 == nBetweenDocs_ms) { // (ie not overridden so use default)
PropertiesManager props = new PropertiesManager();
nBetweenDocs_ms = props.getWebCrawlWaitTime();
}
} // TESTED (overridden and using system default)
_context = context;
securityManager = _context.getSecurityManager();
UnstructuredAnalysisConfigPojo uap = source.getUnstructuredAnalysisConfig();
int nChanges = 0;
if (null != doc.getMetaData()) {
nChanges = doc.getMetaData().size();
}
boolean bFetchedUrl = false;
if (bGetRawDoc) {
if (null == source.getRssConfig()) {
source.setRssConfig(new SourceRssConfigPojo()); // (makes logic easier down the road)
}
try {
// Workaround for observed twitter bug (first access after the
// RSS was gzipped)
if (bFirstTime) {
// (first time through, sleep following a URL/RSS access)
if (null != source.getUrl()) { // (have already made a call to the RSS (or "searchConfig") URL)
try {
Thread.sleep(nBetweenDocs_ms);
} catch (InterruptedException e) {
}
}
// TESTED
}
if ((null != source.useTextExtractor()) && source.useTextExtractor().equalsIgnoreCase("tika")) {
// Special case: if tika enabled then do that first
// (create the extractor lazily, but extract text for every document, not just the first)
if (null == tikaExtractor) {
tikaExtractor = new TextExtractorTika();
}
tikaExtractor.extractText(doc);
}
else {
getRawTextFromUrlIfNeeded(doc, source.getRssConfig());
}
bFetchedUrl = true;
}
catch (SecurityException e) { // This seems worthy of actually logging, even though it's a lowly doc error
_context.getHarvestStatus().logMessage(e.getMessage(), true);
throw new ExtractorDocumentLevelException(e.getMessage());
}//TESTED
catch (Exception e) { // Failed to get full text twice... remove doc and carry on
throw new ExtractorDocumentLevelException(e.getMessage());
}
}
long nTime_ms = System.currentTimeMillis();
// ^^^ (end slight hack to get raw text to the UAH for RSS feeds)
if (uap != null) {
List<metaField> meta = uap.getMeta();
if (savedUap != uap) {
String headerRegEx = uap.getHeaderRegEx();
String footerRegEx = uap.getFooterRegEx();
if (headerRegEx != null)
headerPattern = Pattern.compile(headerRegEx, Pattern.DOTALL);
if (footerRegEx != null)
footerPattern = Pattern.compile(footerRegEx, Pattern.DOTALL);
savedUap = uap;
}
try {
processBody(doc, meta, true, source, uap);
} catch (Exception e) {
this._context.getHarvestStatus().logMessage("processBody1: " + e.getMessage(), true);
//DEBUG (don't output log messages per doc)
//logger.error("processBody1: " + e.getMessage(), e);
}
try {
if (uap.getSimpleTextCleanser() != null) {
cleanseText(uap.getSimpleTextCleanser(), doc);
}
} catch (Exception e) {
this._context.getHarvestStatus().logMessage("cleanseText: " + e.getMessage(), true);
//DEBUG (don't output log messages per doc)
//logger.error("cleanseText: " + e.getMessage(), e);
}
try {
processHeader(headerPattern, doc, meta, source, uap);
processFooter(footerPattern, doc, meta, source, uap);
} catch (Exception e) {
this._context.getHarvestStatus().logMessage("header/footerPattern: " + e.getMessage(), true);
//DEBUG (don't output log messages per doc)
//logger.error("header/footerPattern: " + e.getMessage(), e);
}
try {
processBody(doc, meta, false, source, uap);
} catch (Exception e) {
this._context.getHarvestStatus().logMessage("processBody2: " + e.getMessage(), true);
//DEBUG (don't output log messages per doc)
//logger.error("processBody2: " + e.getMessage(), e);
}
}
if (bMoreDocs && bFetchedUrl) {
nTime_ms = nBetweenDocs_ms - (System.currentTimeMillis() - nTime_ms); // (ie delay time - processing time)
if (nTime_ms > 0) {
try {
Thread.sleep(nTime_ms);
} catch (InterruptedException e) {
}
}
} // (end politeness delay for URL fetching from a single source (likely a single site))
if (null != doc.getMetaData()) {
if (nChanges != doc.getMetaData().size()) {
return true;
}
}
return false;
}
/**
* processHeader
*
* @param headerPattern
* @param f
* @param meta
* @param source
* @param uap
*/
@SuppressWarnings("deprecation")
private void processHeader(Pattern headerPattern, DocumentPojo f, List<metaField> meta, SourcePojo source, UnstructuredAnalysisConfigPojo uap)
{
if (headerPattern != null) {
Matcher headerMatcher = headerPattern.matcher(f.getFullText());
String headerText = null;
while (headerMatcher.find()) {
if (headerMatcher.start() == 0) {
headerText = headerMatcher.group(0);
f.setHeaderEndIndex(headerText.length());
for (int i = 1; i < headerMatcher.groupCount() + 1; i++) {
f.addToHeader(headerMatcher.group(i).trim());
}
break;
}
}
if (null != headerText && null != meta) {
for (metaField m : meta) {
if (m.context == Context.Header || m.context == Context.All) {
this.processMeta(f, m, headerText, source, uap);
}
}
}
}
}
/**
* processFooter
*
* @param footerPattern
* @param f
* @param meta
* @param source
* @param uap
*/
@SuppressWarnings("deprecation")
private void processFooter(Pattern footerPattern, DocumentPojo f, List<metaField> meta, SourcePojo source, UnstructuredAnalysisConfigPojo uap)
{
if (footerPattern != null) {
Matcher footerMatcher = footerPattern.matcher(f.getFullText());
String footerText = null;
while (footerMatcher.find()) {
footerText = footerMatcher.group(0);
int docLength = f.getFullText().length();
f.setFooterStartIndex(docLength - footerMatcher.group(0).length());
for (int i = 1; i < footerMatcher.groupCount() + 1; i++) {
f.addToHeader(footerMatcher.group(i).trim());
}
break;
}
if (null != footerText && null != meta) {
for (metaField m : meta) {
if (m.context == Context.Footer || m.context == Context.All) {
this.processMeta(f, m, footerText, source, uap);
}
}
}
}
}
/**
* processBody
*
* @param f
* @param meta
* @param bPreCleansing
* @param source
* @param uap
*/
@SuppressWarnings("deprecation")
private void processBody(DocumentPojo f, List<metaField> meta, boolean bPreCleansing, SourcePojo source, UnstructuredAnalysisConfigPojo uap)
{
if (null != meta) {
for (metaField m : meta) {
if ((bPreCleansing && (m.context == Context.First))
|| (!bPreCleansing && (m.context == Context.Body || m.context == Context.All))) {
boolean scriptLangNeedsText = doesScriptLangNeedText(m.scriptlang);
// (with js you can operate on metadata alone; other languages need the text ...
// actually that's not strictly true because of chaining, but we'll just ignore that for now)
// If running with text (default) then need there to be non-null text
String toProcess = null;
if (scriptLangNeedsText || (null == m.flags) ||m.flags.isEmpty() || m.flags.contains("t")) {
toProcess = f.getBody();
if (toProcess == null)
toProcess = f.getDescription();
if (toProcess == null)
continue;
}
this.processMeta(f, m, toProcess, source, uap);
}
}
}
}
/**
* processMeta - handle an individual field
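*
* Flag summary (derived from the handling below): "U" allow duplicate values, "c" chain onto an
* existing metadata field, "H" HTML-unescape extracted values, "t"/"d"/"m" pass the text/document/
* metadata into the javascript engine, "o" treat xpath results as objects, "x" keep xpath results
* as XML strings, "g" parse an HTML table (grid) via parseHtmlTable.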
*/
private void processMeta(DocumentPojo f, metaField m, String text, SourcePojo source, UnstructuredAnalysisConfigPojo uap) {
boolean bAllowDuplicates = false;
if ((null != m.flags) && m.flags.contains("U")) {
bAllowDuplicates = true;
}
if ((null == m.scriptlang) || m.scriptlang.equalsIgnoreCase("regex")) {
Pattern metaPattern = createRegex(m.script, m.flags);
int timesToRun = 1;
Object[] currField = null;
if ((null != m.flags) && m.flags.contains("c")) {
currField = f.getMetadata().get(m.fieldName);
}
if (null != currField) { // chained metadata
timesToRun = currField.length;
text = (String)currField[0];
}//TESTED
Matcher matcher = metaPattern.matcher(text);
LinkedList<String> Llist = null;
for (int ii = 0; ii < timesToRun; ++ii) {
if (ii > 0) { // (else either just text, or in the above "chained metadata" initialization above)
text = (String)currField[ii];
matcher = metaPattern.matcher(text);
}//TESTED
StringBuffer prefix = new StringBuffer(m.fieldName).append(':');
int nFieldNameLen = m.fieldName.length() + 1;
try {
while (matcher.find()) {
if (null == Llist) {
Llist = new LinkedList<String>();
}
if (null == m.groupNum) {
m.groupNum = 0;
}
String toAdd = matcher.group(m.groupNum);
if (null != m.replace) {
toAdd = metaPattern.matcher(toAdd).replaceFirst(
m.replace);
}
if ((null != m.flags) && m.flags.contains("H")) {
toAdd = StringEscapeUtils.unescapeHtml(toAdd);
}
prefix.setLength(nFieldNameLen);
prefix.append(toAdd);
String dupCheck = prefix.toString();
if (!regexDuplicates.contains(dupCheck)) {
Llist.add(toAdd);
if (!bAllowDuplicates) {
regexDuplicates.add(dupCheck);
}
}
}
} catch (Exception e) {
this._context.getHarvestStatus().logMessage("processMeta1: " + e.getMessage(), true);
}
}//(end metadata chaining handling)
if (null != Llist) {
if (null != currField) { // (overwrite)
f.getMetadata().put(m.fieldName, Llist.toArray());
}
else {
f.addToMetadata(m.fieldName, Llist.toArray());
}
}//TESTED
}
else if (m.scriptlang.equalsIgnoreCase("javascript"))
{
if (null == f.getMetadata()) {
f.setMetadata(new LinkedHashMap<String, Object[]>());
}
//set the script engine up if necessary
if ((null != source) && (null != uap)) {
//(these are null if called from new processing pipeline vs legacy code)
intializeScriptEngine(source, uap);
}
try
{
//TODO (INF-2488): in new format, this should only happen in between contentMeta blocks/docs
// (also should be able to use SAH _document object I think?)
// Javascript: the user passes in
Object[] currField = f.getMetadata().get(m.fieldName);
if ((null == m.flags) || m.flags.isEmpty()) {
if (null == currField) {
engine.put("text", text);
engine.put("_iterator", null);
}
//(otherwise will just pass the current fields in there)
}
else { // flags specified
if (m.flags.contains("t")) { // text
engine.put("text", text);
}
if (m.flags.contains("d")) { // entire document (minus ents and assocs)
GsonBuilder gb = new GsonBuilder();
Gson g = gb.create();
List<EntityPojo> ents = f.getEntities();
List<AssociationPojo> assocs = f.getAssociations();
try {
f.setEntities(null);
f.setAssociations(null);
engine.put("document", g.toJson(f));
securityManager.eval(engine, JavaScriptUtils.initScript);
}
finally {
f.setEntities(ents);
f.setAssociations(assocs);
}
}
if (m.flags.contains("m")) { // metadata
GsonBuilder gb = new GsonBuilder();
Gson g = gb.create();
engine.put("_metadata", g.toJson(f.getMetadata()));
securityManager.eval(engine, JavaScriptUtils.iteratorMetaScript);
}
}//(end flags processing)
if (null != currField) {
f.getMetadata().remove(m.fieldName);
GsonBuilder gb = new GsonBuilder();
Gson g = gb.create();
engine.put("_iterator", g.toJson(currField));
securityManager.eval(engine, JavaScriptUtils.iteratorDocScript);
}
//TESTED (handling of flags, and replacing of existing fields, including when field is null but specified)
Object returnVal = securityManager.eval(engine, m.script);
if (null != returnVal) {
if (returnVal instanceof String) { // The only easy case
Object[] array = new Object[1];
if ((null != m.flags) && m.flags.contains("H")) {
returnVal = StringEscapeUtils.unescapeHtml((String)returnVal);
}
array[0] = returnVal;
f.addToMetadata(m.fieldName, array);
} else { // complex object or array - in either case the engine turns these into
// internal.NativeArray or internal.NativeObject
BasicDBList outList = JavaScriptUtils.parseNativeJsObject(returnVal, engine);
f.addToMetadata(m.fieldName, outList.toArray());
}
}
} catch (ScriptException e) {
_context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(), true);
// Just do nothing and log
// e.printStackTrace();
//DEBUG (don't output log messages per doc)
//logger.error(e.getMessage());
} catch (Exception e) {
_context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(), true);
// Just do nothing and log
// e.printStackTrace();
//DEBUG (don't output log messages per doc)
//logger.error(e.getMessage());
}
} else if (m.scriptlang.equalsIgnoreCase("xpath")) {
String xpath = m.script;
try {
createHtmlCleanerIfNeeded();
int timesToRun = 1;
Object[] currField = null;
if ((null != m.flags) && m.flags.contains("c")) {
currField = f.getMetadata().get(m.fieldName);
}
if (null != currField) { // chained metadata
f.getMetadata().remove(m.fieldName); // (so will add to the end)
timesToRun = currField.length;
text = (String)currField[0];
}//TESTED
for (int ii = 0; ii < timesToRun; ++ii) {
if (ii > 0) { // (else either just text, or in the above "chained metadata" initialization above)
text = (String)currField[ii];
}//TESTED
TagNode node = cleaner.clean(new ByteArrayInputStream(text.getBytes()));
//NewCode : Only use html cleaner for cleansing
//use JAXP for full Xpath lib
Document doc = new DomSerializer(new CleanerProperties()).createDOM(node);
String extraRegex = extractRegexFromXpath(xpath);
if (extraRegex != null)
xpath = xpath.replace(extraRegex, "");
XPath xpa = XPathFactory.newInstance().newXPath();
NodeList res = (NodeList)xpa.evaluate(xpath, doc, XPathConstants.NODESET);
if (res.getLength() > 0)
{
if ((null != m.flags) && (m.flags.contains("o"))) { // "o" for object
m.groupNum = -1; // (see bConvertToObject below)
}
StringBuffer prefix = new StringBuffer(m.fieldName).append(':');
int nFieldNameLen = m.fieldName.length() + 1;
ArrayList<Object> Llist = new ArrayList<Object>(res.getLength());
boolean bConvertToObject = ((m.groupNum != null) && (m.groupNum == -1));
boolean convertToXml = ((null != m.flags) && (m.flags.contains("x")));
for (int i= 0; i< res.getLength(); i++)
{
Node info_node = res.item(i);
if ((null != m.flags) && (m.flags.contains("g"))) {
Llist.add(parseHtmlTable(info_node, m.replace));
}
else if (bConvertToObject || convertToXml) {
// Try to create a JSON object out of this
StringWriter writer = new StringWriter();
try {
Transformer transformer = TransformerFactory.newInstance().newTransformer();
transformer.transform(new DOMSource(info_node), new StreamResult(writer));
} catch (TransformerException e1) {
continue;
}
if (bConvertToObject) {
try {
JSONObject subObj = XML.toJSONObject(writer.toString());
if (xpath.endsWith("*")) { // (can have any number of different names here)
Llist.add(XmlToMetadataParser.convertJsonObjectToLinkedHashMap(subObj));
}//TESTED
else {
String[] rootNames = JSONObject.getNames(subObj);
if (1 == rootNames.length) {
// (don't think it can be any other number in fact)
subObj = subObj.getJSONObject(rootNames[0]);
}
boolean bUnescapeHtml = ((null != m.flags) && m.flags.contains("H"));
Llist.add(XmlToMetadataParser.convertJsonObjectToLinkedHashMap(subObj, bUnescapeHtml));
}//TESTED
}
catch (JSONException e) { // Just carry on
continue;
}
//TESTED
}
else { // leave in XML form
Llist.add(writer.toString().substring(38)); // +38: (step over <?xml version="1.0" encoding="UTF-8"?>)
}//TESTED (xpath_test.json)
}
else { // Treat this as string, either directly or via regex
String info = info_node.getTextContent().trim();
if (extraRegex == null || extraRegex.isEmpty()) {
prefix.setLength(nFieldNameLen);
prefix.append(info);
String dupCheck = prefix.toString();
if (!regexDuplicates.contains(dupCheck)) {
if ((null != m.flags) && m.flags.contains("H")) {
info = StringEscapeUtils.unescapeHtml(info);
}
Llist.add(info);
if (!bAllowDuplicates) {
regexDuplicates.add(dupCheck);
}
}
}
else { // Apply regex to the string
Pattern dataRegex = createRegex(extraRegex, m.flags);
Matcher dataMatcher = dataRegex.matcher(info);
boolean result = dataMatcher.find();
while (result) {
String toAdd;
if (m.groupNum != null)
toAdd = dataMatcher.group(m.groupNum);
else
toAdd = dataMatcher.group();
prefix.setLength(nFieldNameLen);
prefix.append(toAdd);
String dupCheck = prefix.toString();
if (!regexDuplicates.contains(dupCheck)) {
if ((null != m.flags) && m.flags.contains("H")) {
toAdd = StringEscapeUtils.unescapeHtml(toAdd);
}
Llist.add(toAdd);
if (!bAllowDuplicates) {
regexDuplicates.add(dupCheck);
}
}
result = dataMatcher.find();
}
}//(regex vs no regex)
}//(end string vs object)
}
if (Llist.size() > 0) {
f.addToMetadata(m.fieldName, Llist.toArray());
}
}
}//(end loop over metadata objects if applicable)
} catch (IOException ioe) {
_context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage(ioe).toString(), true);
// Just do nothing and log
//DEBUG (don't output log messages per doc)
//logger.error(ioe.getMessage());
} catch (ParserConfigurationException e1) {
_context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage(e1).toString(), true);
// Just do nothing and log
//DEBUG (don't output log messages per doc)
//logger.error(e1.getMessage());
} catch (XPathExpressionException e1) {
_context.getHarvestStatus().logMessage("Error evaluating xpath expression: " + xpath, true);
}
}
else if (m.scriptlang.equalsIgnoreCase("stream")) { // XML or JSON streaming interface
// which one?
try {
boolean json = false;
boolean xml = false;
for (int i = 0; (i < 128) && (i < text.length()); ++i) { // (bounded scan; guard against short strings)
if ('<' == text.charAt(i)) {
xml = true;
break;
}
if ('{' == text.charAt(i)) {
json = true;
break;
}
if (!Character.isSpaceChar(text.charAt(i))) {
break;
}
}//TESTED (too many spaces: meta_stream_test, test4; incorrect chars: test3, xml: test1, json: test2)
boolean textNotObject = m.flags == null || !m.flags.contains("o");
List<DocumentPojo> docs = new LinkedList<DocumentPojo>();
List<String> levelOneFields = null;
if (null != m.script) {
levelOneFields = Arrays.asList(m.script.split("\\s*,\\s*"));
if ((1 == levelOneFields.size()) && levelOneFields.get(0).isEmpty()) {
// convert [""] to null
levelOneFields = null;
}
}//TESTED (json and xml)
if (xml) {
XmlToMetadataParser parser = new XmlToMetadataParser(levelOneFields, null, null, null, null, null, Integer.MAX_VALUE);
XMLInputFactory factory = XMLInputFactory.newInstance();
factory.setProperty(XMLInputFactory.IS_COALESCING, true);
factory.setProperty(XMLInputFactory.SUPPORT_DTD, false);
XMLStreamReader reader = null;
try {
reader = factory.createXMLStreamReader(new ByteArrayInputStream(text.getBytes()));
docs = parser.parseDocument(reader, textNotObject);
}
finally {
if (null != reader) reader.close();
}
}//TESTED (meta_stream_test, test1)
if (json) {
JsonReader jsonReader = null;
try {
JsonToMetadataParser parser = new JsonToMetadataParser(null, levelOneFields, null, null, Integer.MAX_VALUE);
jsonReader = new JsonReader(new InputStreamReader(new ByteArrayInputStream(text.getBytes()), "UTF-8"));
jsonReader.setLenient(true);
docs = parser.parseDocument(jsonReader, textNotObject);
}
finally {
if (null != jsonReader) jsonReader.close();
}
}//TESTED (meta_stream_test test2)
if (!docs.isEmpty()) {
ArrayList<String> Llist = null;
ArrayList<Object> LlistObj = null;
if (textNotObject) {
Llist = new ArrayList<String>(docs.size());
}
else {
LlistObj = new ArrayList<Object>(docs.size());
}
for (DocumentPojo doc: docs) {
if ((null != doc.getFullText()) || (null != doc.getMetadata())) {
if (textNotObject) {
Llist.add(doc.getFullText());
}//TESTED
else if (xml) {
LlistObj.add(doc.getMetadata());
}//TESTED
else if (json) {
Object o = doc.getMetadata();
if (null != o) {
o = doc.getMetadata().get("json");
if (null != o) {
LlistObj.add(o);
}
}
}//TESTED
}
}//TESTED
if ((null != Llist) && !Llist.isEmpty()) {
f.addToMetadata(m.fieldName, Llist.toArray());
}//TESTED
if ((null != LlistObj) && !LlistObj.isEmpty()) {
f.addToMetadata(m.fieldName, LlistObj.toArray());
}//TESTED
}//TESTED (meta_stream_test test1,test2)
}//(end try)
catch (Exception e) { // various parsing errors
_context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(), true);
}
}//TESTED (meta_stream_test)
// (don't currently support other script types)
}
private static String extractRegexFromXpath(String original_xpath) {
Pattern addedRegex = Pattern.compile("regex\\(.*\\)\\s*$", Pattern.MULTILINE | Pattern.DOTALL);
Matcher matcher = addedRegex.matcher(original_xpath);
boolean matchFound = matcher.find();
if (matchFound) {
try {
return matcher.group();
} catch (Exception e) {
return null;
}
}
return null;
}
/**
* cleanseText
*
* @param simpleTextCleanser the list of cleanser operations to apply
* @param document the document to cleanse
*/
private void cleanseText(List<SimpleTextCleanserPojo> simpleTextCleanser, DocumentPojo document)
{
// Buffers for fields that are re-generated by concatenation (where the "+" flag is used)
StringBuffer fullTextBuilder = null;
StringBuffer descriptionBuilder = null;
StringBuffer titleBuilder = null;
// (note no support for metadata concatenation, replace only)
// Iterate over the cleanser functions that need to run on each feed
for (SimpleTextCleanserPojo s : simpleTextCleanser) {
boolean bConcat = (null != s.getFlags()) && s.getFlags().contains("+");
boolean bUsingJavascript = ((null != s.getScriptlang()) && s.getScriptlang().equalsIgnoreCase("javascript"));
if (s.getField().equalsIgnoreCase("fulltext")) {
if ((null != document.getFullText()) || bUsingJavascript) {
StringBuffer myBuilder = fullTextBuilder;
if ((!bConcat) && (null != myBuilder) && (myBuilder.length() > 0)) {
document.setFullText(myBuilder.toString());
myBuilder.setLength(0);
} //TESTED
String res = cleanseField(document.getFullText(),
s.getScriptlang(), s.getScript(), s.getFlags(),
s.getReplacement(), document);
if (bConcat) {
if (null == myBuilder) {
fullTextBuilder = myBuilder = new StringBuffer();
}
myBuilder.append(res).append('\n');
}
else {
document.setFullText(res);
}
}
} //TESTED
else if (s.getField().equalsIgnoreCase("description")) {
if ((null != document.getDescription()) || bUsingJavascript) {
StringBuffer myBuilder = descriptionBuilder;
if ((!bConcat) && (null != myBuilder) && (myBuilder.length() > 0)) {
document.setDescription(myBuilder.toString());
myBuilder.setLength(0);
} //TESTED
String res = cleanseField(document.getDescription(),
s.getScriptlang(), s.getScript(), s.getFlags(),
s.getReplacement(), document);
if (bConcat) {
if (null == myBuilder) {
descriptionBuilder = myBuilder = new StringBuffer();
}
myBuilder.append(res).append('\n');
}
else {
document.setDescription(res);
}
}
} //TESTED
else if (s.getField().equalsIgnoreCase("title")) {
if ((null != document.getTitle()) || bUsingJavascript) {
StringBuffer myBuilder = titleBuilder;
if ((!bConcat) && (null != myBuilder) && (myBuilder.length() > 0)) {
document.setTitle(myBuilder.toString());
myBuilder.setLength(0);
} //TESTED
String res = cleanseField(document.getTitle(),
s.getScriptlang(), s.getScript(), s.getFlags(),
s.getReplacement(), document);
if (bConcat) {
if (null == myBuilder) {
titleBuilder = myBuilder = new StringBuffer();
}
myBuilder.append(res).append('\n');
}
else {
document.setTitle(res);
}
}
} //TESTED
else if (s.getField().startsWith("metadata.")) {
// (note no support for metadata concatenation, replace only)
String metaField = s.getField().substring(9); // (9 for"metadata.")
Object[] meta = document.getMetadata().get(metaField);
if ((null != meta) && (meta.length > 0)) {
Object[] newMeta = new Object[meta.length];
for (int i = 0; i < meta.length; ++i) {
Object metaValue = meta[i];
if (metaValue instanceof String) {
newMeta[i] = (Object) cleanseField(
(String) metaValue, s.getScriptlang(),
s.getScript(), s.getFlags(),
s.getReplacement(), document);
} else {
newMeta[i] = metaValue;
}
}
// Overwrite the old fields
document.addToMetadata(metaField, newMeta);
}
}
// (these fields are sufficient for the moment)
} // (end loop over fields)
// Handle any left over cases:
if ((null != fullTextBuilder) && (fullTextBuilder.length() > 0)) {
document.setFullText(fullTextBuilder.toString());
} //TESTED
if ((null != descriptionBuilder) && (descriptionBuilder.length() > 0)) {
document.setDescription(descriptionBuilder.toString());
} //TESTED
if ((null != titleBuilder) && (titleBuilder.length() > 0)) {
document.setTitle(titleBuilder.toString());
} //TESTED
}// TESTED
/**
* cleanseField
*
* @param field the current field value
* @param scriptLang "regex" (default if null), "xpath" or "javascript"
* @param script the regex/xpath/javascript to apply
* @param flags processing flags (eg "H" to HTML-decode the result)
* @param replaceWith the replacement string (regex only)
* @param f the document being processed
* @return the cleansed field value
*/
private String cleanseField(String field, String scriptLang, String script,
String flags, String replaceWith, DocumentPojo f)
{
if ((null == scriptLang) || scriptLang.equalsIgnoreCase("regex")) {
if (null == flags) {
return field.replaceAll(script, replaceWith);
}
else {
if (flags.contains("H")) { // HTML decode
return StringEscapeUtils.unescapeHtml(createRegex(script,flags).matcher(field).replaceAll(replaceWith));
} else {
return createRegex(script, flags).matcher(field).replaceAll(replaceWith);
}
}
}
else if (scriptLang.equalsIgnoreCase("xpath")) {
try {
createHtmlCleanerIfNeeded();
TagNode node = cleaner.clean(new ByteArrayInputStream(field.getBytes()));
Document doc = new DomSerializer(new CleanerProperties()).createDOM(node);
XPath xpa = XPathFactory.newInstance().newXPath();
NodeList res = (NodeList)xpa.evaluate(script, doc, XPathConstants.NODESET);
if (0 == res.getLength()) { // No match, just return "", unlike regex we don't want anything if we don't match...
return "";
}
else {
StringBuffer sb = new StringBuffer();
for (int i= 0; i< res.getLength(); i++) {
if (0 != i) {
sb.append('\n');
}
Node info_node = res.item(i);
if ((null != flags) && flags.contains("H")) { // HTML decode
sb.append(StringEscapeUtils.unescapeHtml(info_node.getTextContent().trim()));
}
else if ((null != flags) && flags.contains("x")) { // Leave as XML string
StringWriter writer = new StringWriter();
try {
Transformer transformer = TransformerFactory.newInstance().newTransformer();
transformer.transform(new DOMSource(info_node), new StreamResult(writer));
sb.append(writer.toString().substring(38)); // (step over the <?xml ...?> declaration - see under metadata field extraction)
}
catch (TransformerException e1) { // (do nothing just skip)
}
}
else {
sb.append(info_node.getTextContent().trim());
}
}
return sb.toString();
}//TESTED (xpath_test: object - multiple and single, text)
} catch (IOException e) {
_context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(), true);
}
catch (XPathExpressionException e) {
_context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(), true);
}
catch (ParserConfigurationException e) {
_context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(), true);
}
}
else if (scriptLang.equalsIgnoreCase("javascript")) {
try {
SourcePojo src = f.getTempSource();
intializeScriptEngine(src, src.getUnstructuredAnalysisConfig());
// Setup input:
if (null == flags) {
flags = "t";
}
if (flags.contains("t")) { // text
engine.put("text", field);
}
if (flags.contains("d")) { // entire document
GsonBuilder gb = new GsonBuilder();
Gson g = gb.create();
List<EntityPojo> ents = f.getEntities();
List<AssociationPojo> assocs = f.getAssociations();
try {
f.setEntities(null);
f.setAssociations(null);
engine.put("document", g.toJson(f));
securityManager.eval(engine, JavaScriptUtils.initScript);
}
finally {
f.setEntities(ents);
f.setAssociations(assocs);
}
}
if (flags.contains("m")) { // metadata
GsonBuilder gb = new GsonBuilder();
Gson g = gb.create();
engine.put("_metadata", g.toJson(f.getMetadata()));
securityManager.eval(engine, JavaScriptUtils.iteratorMetaScript);
}
Object returnVal = securityManager.eval(engine, script);
field = (String) returnVal; // (if not a string this will exception out and be logged; a null return is passed through)
if ((null != flags) && flags.contains("H") && (null != field)) { // HTML decode
field = StringEscapeUtils.unescapeHtml(field);
}
}
catch (Exception e) {
_context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(), true);
// Just do nothing and log
// e.printStackTrace();
//DEBUG (don't output log messages per doc)
//logger.error(e.getMessage());
}
}
return field;
}
// Handles parsing of HTML tables to Objects that can be easily printed as JSON. (flag = g)
// 1] No Replace Value - The first row of the table will be set as the headers
// 2] Replace Value = "[]" - Headers will be set to the column count number (beginning with 0) eg "0","1"
// 3a] Replace Value = "[one,two,three]" - The provided headers will be set as the headers
// 3b] Replace Values set, but more data columns than values provided - Additional columns that were not
// specified will be assigned their column count number. eg "specified","1","2"
// 4] Replace Value = "[one,null,three]" - Columns specified as null in the provided header will be skipped.
// eg "one","three"
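// Illustrative example (not taken from an actual test): with replaceWith = "[name,null,age]" and a
// table <table id="t1"><tr><td>Alice</td><td>x</td><td>34</td></tr></table>, the middle column is
// skipped and the result is {"id":"t1","table":[{"name":"Alice","age":"34"}]}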
private static HashMap<String, Object> parseHtmlTable(Node table_node, String replaceWith)
{
if (table_node.getNodeName().equalsIgnoreCase("table") && table_node.hasChildNodes())
{
Node topNode = table_node;
boolean tbody = table_node.getFirstChild().getNodeName().equalsIgnoreCase("tbody");
if (tbody)
topNode = table_node.getFirstChild();
if (topNode.hasChildNodes())
{
NodeList rows = topNode.getChildNodes();
List<String> headers = null;
ArrayList<HashMap<String, String>> data = null;
int headerLength = 0;
boolean[] skip = null;
if (null != replaceWith)
{
if (replaceWith.equals("[]")){
headers = new ArrayList<String>();
headerLength = 0;
} // TESTED (by eye - 2)
else
{
//Remove square brackets
if(replaceWith.startsWith("[") && replaceWith.endsWith("]"))
replaceWith = replaceWith.substring(1, replaceWith.length()-1);
//Turn the provided list of headers into a list object
headers = Arrays.asList(replaceWith.split("\\s*,\\s*"));
headerLength = headers.size();
skip = new boolean[headerLength];
for(int h = 0; h < headerLength; h++)
{
String val = headers.get(h);
if (val.length() == 0 || val.equalsIgnoreCase("null"))
skip[h] = true;
else
skip[h] = false;
}
}// TESTED (by eye - 3a)
}
//traverse rows
for(int i = 0; i < rows.getLength(); i++)
{
Node row = rows.item(i);
if (row.getNodeName().equalsIgnoreCase("tr") || row.getNodeName().equalsIgnoreCase("th"))
{
//If the header value has not been set, the first row will be set as the headers
if (null == headers)
{
//Traverse through cells
headers = new ArrayList<String>();
if (row.hasChildNodes())
{
NodeList cells = row.getChildNodes();
headerLength = cells.getLength();
skip = new boolean[headerLength];
for (int j = 0; j < headerLength; j++)
{
headers.add(cells.item(j).getTextContent());
skip[j] = false;
}
} // TESTED (by eye - 1)
}
else
{
if (null == data)
{
data = new ArrayList<HashMap<String,String>>();
}
if (row.hasChildNodes())
{
HashMap<String,String> cellList = new HashMap<String,String>();
NodeList cells = row.getChildNodes();
for (int j = 0; j < cells.getLength(); j++)
{
// Skip Code (TESTED by eye - 4)
if (headerLength == 0 || (j < headerLength && skip[j] == false))
{
String key = Integer.toString(j); // TESTED (by eye - 3b)
if (j < headerLength)
key = headers.get(j);
cellList.put(key, cells.item(j).getTextContent());
}
}
data.add(cellList);
}
}
}
}
//Create final hashmap containing attributes
HashMap<String,Object> table_attrib = new HashMap<String, Object>();
NamedNodeMap nnm = table_node.getAttributes();
for (int i = 0; i < nnm.getLength(); i++)
{
Node att = nnm.item(i);
table_attrib.put(att.getNodeName(), att.getNodeValue());
}
table_attrib.put("table", data);
//TESTED (by eye) attributes added to table value
// eg: {"id":"search","cellpadding":"1","table":[{"Status":"B","two":"ONE6313" ......
return table_attrib;
}
}
return null;
}
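// Compiles a regex, mapping single-character flags onto java.util.regex.Pattern flags:
// "m" MULTILINE, "i" CASE_INSENSITIVE, "d" DOTALL, "u" UNICODE_CASE, "n" UNIX_LINES
// (unrecognized characters are ignored).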
private static Pattern createRegex(String regEx, String flags) {
int nflags = 0;
if (null != flags) {
for (int i = 0; i < flags.length(); ++i) {
char c = flags.charAt(i);
switch (c) {
case 'm':
nflags |= Pattern.MULTILINE;
break;
case 'i':
nflags |= Pattern.CASE_INSENSITIVE;
break;
case 'd':
nflags |= Pattern.DOTALL;
break;
case 'u':
nflags |= Pattern.UNICODE_CASE;
break;
case 'n':
nflags |= Pattern.UNIX_LINES;
break;
}
}
}
return Pattern.compile(regEx, nflags);
}
// Utility to minimise number of times the cleaner is created
private void createHtmlCleanerIfNeeded()
{
if (null == cleaner) {
cleaner = new HtmlCleaner();
CleanerProperties props = cleaner.getProperties();
props.setAllowHtmlInsideAttributes(true);
props.setAllowMultiWordAttributes(true);
props.setRecognizeUnicodeChars(true);
props.setOmitComments(true);
props.setTreatUnknownTagsAsContent(false);
props.setTranslateSpecialEntities(true);
props.setTransResCharsToNCR(true);
props.setNamespacesAware(false);
}
}
public void set_sahEngine(ScriptEngine _sahEngine) {
this._sahEngine = _sahEngine;
}
public ScriptEngine get_sahEngine() {
return _sahEngine;
}
///////////////////////////////////////////////////
// Javascript scripting utilities:
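// Lazily creates the javascript engine (reusing the SAH's engine if one was passed in), loads any
// configured lookup caches and script files/imports into it, and evaluates the shared parsing script.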
public void intializeScriptEngine(SourcePojo source, UnstructuredAnalysisConfigPojo uap) {
if ( null == engine )
{
//use the passed in sah one if possible
if ( null != this.get_sahEngine())
{
engine = this.get_sahEngine();
}
else if (null == factory) //otherwise create our own
{
factory = new ScriptEngineManager();
engine = factory.getEngineByName("JavaScript");
//grab any json cache and make it available to the engine
}
//once engine is created, do some initialization
if ( null != engine )
{
if (null != source) {
loadLookupCaches(uap.getCaches(), source.getCommunityIds());
List<String> scriptFiles = null;
if (null != uap.getScriptFiles()) {
scriptFiles = Arrays.asList(uap.getScriptFiles());
}
loadGlobalFunctions(scriptFiles, uap.getScript());
}
if (null == parsingScript) {
parsingScript = JavaScriptUtils.generateParsingScript();
}
try {
securityManager.eval(engine, parsingScript);
}
catch (ScriptException e) { // Just do nothing and log
e.printStackTrace();
logger.error("intializeScriptEngine: " + e.getMessage());
}
}
}//end start engine up
}//TESTED (legacy + imports_and_lookup_test.json + imports_and_lookup_test_uahSah.json)
//////////////////////////////////////////////////////
// Utilities that in legacy mode are called from the initializeScriptEngine, but can be called
// standalone in the pipelined mode:
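// Makes any configured JSON lookup caches available to the javascript engine (via CacheUtils),
// logging (but not propagating) any errors.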
public void loadLookupCaches(Map<String, ObjectId> caches, Set<ObjectId> communityIds)
{
try
{
if (null != caches) {
CacheUtils.addJSONCachesToEngine(caches, engine, securityManager, communityIds, _context);
}
}
catch (Exception ex)
{
_context.getHarvestStatus().logMessage("JSONcache: " + ex.getMessage(), true);
//(no need to log this, appears in log under source -with URL- anyway):
//logger.error("JSONcache: " + ex.getMessage(), ex);
}
}//TESTED (legacy + imports_and_lookup_test.json)
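// Evaluates the source's inline script and any imported script files into the engine,
// logging (but not propagating) any script errors.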
public void loadGlobalFunctions(List<String> imports, String script)
{
// Pass scripts into the engine
try
{
// Eval the inline script passed in
if (script != null) securityManager.eval(engine, script);
// Retrieve and eval the imported script files
if (imports != null)
{
for (String file : imports)
{
try
{
securityManager.eval(engine, JavaScriptUtils.getJavaScriptFile(file, securityManager));
}
catch (Exception e)
{
this._context.getHarvestStatus().logMessage("ScriptException (imports): " + e.getMessage(), true);
//DEBUG
//logger.error("ScriptException (imports): " + e.getMessage(), e);
}
}
}
}
catch (ScriptException e)
{
this._context.getHarvestStatus().logMessage("ScriptException (globals): " + e.getMessage(), true);
//DEBUG
//logger.error("ScriptException: " + e.getMessage(), e);
}
}//TESTED (legacy + imports_and_lookup_test.json)
// UTILITY - CURRENTLY ONLY JS CAN SURVIVE WITHOUT TEXT...
private static boolean doesScriptLangNeedText(String scriptLang) {
return (null == scriptLang) || !scriptLang.equalsIgnoreCase("javascript"); // (null defaults to regex, which needs text)
}//TESTED (content_needed_test)
}