Gson g = _gson;
// Skip if the StructuredAnalysis object of the source is null
if (source.getStructuredAnalysisConfig() != null)
{
StructuredAnalysisConfigPojo s = source.getStructuredAnalysisConfig();
// (some pre-processing to expand the specs)
expandIterationLoops(s);
// Instantiate a new ScriptEngineManager and create an engine to execute
// the type of script specified in StructuredAnalysisPojo.scriptEngine
this.intializeScriptEngine();
this.loadLookupCaches(s.getCaches(), source.getCommunityIds());
// Iterate over each doc in docs, create entity and association pojo objects
// to add to the feed using the source entity and association spec pojos
Iterator<DocumentPojo> it = docs.iterator();
int nDocs = 0;
while (it.hasNext())
{
DocumentPojo f = it.next();
nDocs++;
try
{
resetEntityCache();
_document = null;
_docPojo = null;
// (don't create this until needed, since it might need to be (re)serialized after a call
// to the UAH which would obviously be undesirable)
// If the script engine has been instantiated pass the feed document and any scripts
if (_scriptEngine != null)
{
List<String> scriptList = null;
List<String> scriptFileList = null;
try {
// Script code embedded in source
scriptList = Arrays.asList(s.getScript());
}
catch (Exception e) {}
try {
// scriptFiles - can contain String[] of script files to import into the engine
scriptFileList = Arrays.asList(s.getScriptFiles());
}
catch (Exception e) {}
this.loadGlobalFunctions(scriptFileList, scriptList, s.getScriptEngine());
}//TESTED
// 1. Document level fields
// Extract Title if applicable
boolean bTryTitleLater = false;
try {
if (s.getTitle() != null)
{
intializeDocIfNeeded(f, g);
if (JavaScriptUtils.containsScript(s.getTitle()))
{
f.setTitle((String)getValueFromScript(s.getTitle(), null, null));
}
else
{
f.setTitle(getFormattedTextFromField(s.getTitle(), null));
}
if (null == f.getTitle()) {
bTryTitleLater = true;
}
}
}
catch (Exception e)
{
this._context.getHarvestStatus().logMessage("title: " + e.getMessage(), true);
//DEBUG (don't output log messages per doc)
//logger.error("title: " + e.getMessage(), e);
}
// Extract Display URL if applicable
boolean bTryDisplayUrlLater = false;
try {
if (s.getDisplayUrl() != null)
{
intializeDocIfNeeded(f, g);
if (JavaScriptUtils.containsScript(s.getDisplayUrl()))
{
f.setDisplayUrl((String)getValueFromScript(s.getDisplayUrl(), null, null));
}
else
{
f.setDisplayUrl(getFormattedTextFromField(s.getDisplayUrl(), null));
}
if (null == f.getDisplayUrl()) {
bTryDisplayUrlLater = true;
}
}
}
catch (Exception e)
{
this._context.getHarvestStatus().logMessage("displayUrl: " + e.getMessage(), true);
//DEBUG (don't output log messages per doc)
//logger.error("displayUrl: " + e.getMessage(), e);
}
//TOTEST
// Extract Description if applicable
boolean bTryDescriptionLater = false;
try {
if (s.getDescription() != null)
{
intializeDocIfNeeded(f, g);
if (JavaScriptUtils.containsScript(s.getDescription()))
{
f.setDescription((String)getValueFromScript(s.getDescription(), null, null));
}
else
{
f.setDescription(getFormattedTextFromField(s.getDescription(), null));
}
if (null == f.getDescription()) {
bTryDescriptionLater = true;
}
}
}
catch (Exception e)
{
this._context.getHarvestStatus().logMessage("description: " + e.getMessage(), true);
//DEBUG (don't output log messages per doc)
//logger.error("description: " + e.getMessage(), e);
}
// Extract fullText if applicable
boolean bTryFullTextLater = false;
try {
if (s.getFullText() != null)
{
intializeDocIfNeeded(f, g);
if (JavaScriptUtils.containsScript(s.getFullText()))
{
f.setFullText((String)getValueFromScript(s.getFullText(), null, null));
}
else
{
f.setFullText(getFormattedTextFromField(s.getFullText(), null));
}
if (null == f.getFullText()) {
bTryFullTextLater = true;
}
}
}
catch (Exception e)
{
this._context.getHarvestStatus().logMessage("fullText: " + e.getMessage(), true);
//DEBUG (don't output log messages per doc)
//logger.error("fullText: " + e.getMessage(), e);
}
// Published date is done after the UAH
// (since the UAH can't access it, and it might be populated via the UAH)
// 2. UAH/extraction properties
// Add fields to metadata that can be used to create entities and associations
// (Either with the UAH, or with the entity extractor)
try {
boolean bMetadataChanged = false;
if (null != this._unstructuredHandler)
{
try
{
this._unstructuredHandler.set_sahEngine(_scriptEngine);
bMetadataChanged = this._unstructuredHandler.executeHarvest(_context, source, f, (1 == nDocs), it.hasNext());
}
catch (Exception e) {
contextController.handleExtractError(e, source); //handle extractor error if need be
it.remove(); // remove the document from the list...
f.setTempSource(null); // (can safely corrupt this doc since it's been removed)
// (Note: this can't be source level error, so carry on harvesting - unlike below)
continue;
}
}
if (contextController.isEntityExtractionRequired(source))
{
bMetadataChanged = true;
// Text/Entity Extraction
List<DocumentPojo> toAdd = new ArrayList<DocumentPojo>(1);
toAdd.add(f);
try {
contextController.extractTextAndEntities(toAdd, source, false, false);
if (toAdd.isEmpty()) { // this failed...
it.remove(); // remove the document from the list...
f.setTempSource(null); // (can safely corrupt this doc since it's been removed)
continue;
}//TESTED
}
catch (Exception e) {
contextController.handleExtractError(e, source); //handle extractor error if need be
it.remove(); // remove the document from the list...
f.setTempSource(null); // (can safely corrupt this doc since it's been removed)
if (source.isHarvestBadSource())
{
// Source error, ignore all other documents
while (it.hasNext()) {
f = it.next();
f.setTempSource(null); // (can safely corrupt this doc since it's been removed)
it.remove();
}
break;
}
else {
continue;
}
//TESTED
}
}
if (bMetadataChanged) {
// Ugly, but need to re-create doc json because metadata has changed
String sTmpFullText = f.getFullText();
f.setFullText(null); // (no need to serialize this, can save some cycles)
_document = null;
_docPojo = null;
intializeDocIfNeeded(f, g);
f.setFullText(sTmpFullText); //(restore)
}
// Can copy metadata from old documents to new ones:
handleDocumentUpdates(s.getOnUpdateScript(), f);
// Check (based on the metadata and entities so far) whether to retain the doc
if (rejectDoc(s.getRejectDocCriteria(), f)) {
it.remove(); // remove the document from the list...
f.setTempSource(null); // (can safely corrupt this doc since it's been removed)
continue;
}
}
catch (Exception e) {
this._context.getHarvestStatus().logMessage("SAH->UAH: " + e.getMessage(), true);
//DEBUG (don't output log messages per doc)
//logger.error("SAH->UAH: " + e.getMessage(), e);
}
// Now create document since there's no risk of having to re-serialize
intializeDocIfNeeded(f, g);
// 3. final doc-level metadata fields:
// If title was null before, might need to get it from a UAH field
if (bTryTitleLater) {
try {
if (s.getTitle() != null)
{
intializeDocIfNeeded(f, g);
if (JavaScriptUtils.containsScript(s.getTitle()))
{
f.setTitle((String)getValueFromScript(s.getTitle(), null, null));
}
else
{
f.setTitle(getFormattedTextFromField(s.getTitle(), null));
}
}
}
catch (Exception e)
{
this._context.getHarvestStatus().logMessage("title: " + e.getMessage(), true);
//DEBUG (don't output log messages per doc)
//logger.error("title: " + e.getMessage(), e);
}
}
// Extract Display URL if needed
if (bTryDisplayUrlLater) {
try {
if (s.getDisplayUrl() != null)
{
intializeDocIfNeeded(f, g);
if (JavaScriptUtils.containsScript(s.getDisplayUrl()))
{
f.setDisplayUrl((String)getValueFromScript(s.getDisplayUrl(), null, null));
}
else
{
f.setDisplayUrl(getFormattedTextFromField(s.getDisplayUrl(), null));
}
}
}
catch (Exception e)
{
this._context.getHarvestStatus().logMessage("displayUrl: " + e.getMessage(), true);
//DEBUG (don't output log messages per doc)
//logger.error("displayUrl: " + e.getMessage(), e);
}
}
//TOTEST
// If description was null before might need to get it from a UAH field
if (bTryDescriptionLater) {
try {
if (s.getDescription() != null)
{
intializeDocIfNeeded(f, g);
if (JavaScriptUtils.containsScript(s.getDescription()))
{
f.setDescription((String)getValueFromScript(s.getDescription(), null, null));
}
else
{
f.setDescription(getFormattedTextFromField(s.getDescription(), null));
}
}
}
catch (Exception e)
{
this._context.getHarvestStatus().logMessage("description2: " + e.getMessage(), true);
//DEBUG (don't output log messages per doc)
//logger.error("description2: " + e.getMessage(), e);
}
}
// If fullText was null before might need to get it from a UAH field
if (bTryFullTextLater) {
try {
if (s.getFullText() != null)
{
intializeDocIfNeeded(f, g);
if (JavaScriptUtils.containsScript(s.getFullText()))
{
f.setFullText((String)getValueFromScript(s.getFullText(), null, null));
}
else
{
f.setFullText(getFormattedTextFromField(s.getFullText(), null));
}
}
}
catch (Exception e)
{
this._context.getHarvestStatus().logMessage("fullText2: " + e.getMessage(), true);
//DEBUG (don't output log messages per doc)
//logger.error("fullText2: " + e.getMessage(), e);
}
}
// Extract Published Date if applicable
if (s.getPublishedDate() != null)
{
if (JavaScriptUtils.containsScript(s.getPublishedDate()))
{
try
{
f.setPublishedDate(new Date(
DateUtility.parseDate((String)getValueFromScript(s.getPublishedDate(), null, null))));
}
catch (Exception e)
{
this._context.getHarvestStatus().logMessage("publishedDate: " + e.getMessage(), true);
}
}
else
{
try
{
f.setPublishedDate(new Date(
DateUtility.parseDate((String)getFormattedTextFromField(s.getPublishedDate(), null))));
}
catch (Exception e)
{
this._context.getHarvestStatus().logMessage("publishedDate: " + e.getMessage(), true);
}
}
}
// 4. Entity level fields
// Extract Document GEO if applicable
if (s.getDocumentGeo() != null)
{
try
{
f.setDocGeo(getDocGeo(s.getDocumentGeo()));
}
catch (Exception e)
{
this._context.getHarvestStatus().logMessage("docGeo: " + e.getMessage(), true);
}
}
// Extract Entities
if (s.getEntities() != null)
{
f.setEntities(getEntities(s.getEntities(), f));
}
// Extract Associations
if (s.getAssociations() != null)
{
f.setAssociations(getAssociations(s.getAssociations(), f));
}
// 5. Remove unwanted metadata fields
removeUnwantedMetadataFields(s.getMetadataFields(), f);
}
catch (Exception e)
{
this._context.getHarvestStatus().logMessage("Unknown: " + e.getMessage(), true);
//DEBUG (don't output log messages per doc)