HashSet<String> unstoredFields = new HashSet<String>();
int error_on_feed_count = 0, feed_count = 0;
LinkedList<DocumentPojo> splitterList = null;
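// (documents spawned by any 'splitter' pipeline elements are queued in splitterList and re-injected
// into the iterator once the original document list has been exhausted - see the end-of-list handling below)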
for (;;) {
DocumentPojo doc = null;
HashSet<String> currentBranches = null;
unstoredFields.clear();
if (!docIt.hasNext()) {
if ((null == splitterList) || (splitterList.isEmpty())) {
break;
} // all done!
else { // add all splitterList elements to toAdd
while (!splitterList.isEmpty()) {
docIt.add(splitterList.removeLast());
doc = docIt.previous();
}
}//TESTED (doc_splitter_test)
}
else {
doc = docIt.next();
}//TESTED
boolean processSpawnedDocOrNotSpawnedDoc = null == doc.getSpawnedFrom(); // (initially: only true if not spawned doc...)
// (Do this at the top so don't get foxed by any continues in the code)
long currTime = new Date().getTime();
if ( HarvestController.isHarvestKilled() // (harvest manually killed or because of global time)
||
((currTime - pipelineStartTime) > nMaxTimeSpentInPipeline_ms))
// Don't let any source spend too long in one iteration...
{
source.setReachedMaxDocs(); // (move to success iteration)
// Remove the rest of the documents
doc.setTempSource(null); // (can safely corrupt this doc since it's been removed)
docIt.remove();
while (docIt.hasNext()) {
doc = docIt.next();
doc.setTempSource(null); // (can safely corrupt this doc since it's been removed)
docIt.remove();
}
// Exit loop
break;
}//TESTED
feed_count++;
try {
// For cases where we grab the full text for early processing and then want it back
_cachedRawFullText = null;
_cachedRawFullText_available = true; // (this latch gets cleared if the raw full text is never seen, eg if a non-raw text engine is called)
if (null != _uah) {
_uah.resetForNewDoc();
}
if (null != _sah) {
_sah.resetForNewDoc();
}
_lastDocInPipeline = !docIt.hasNext();
//NOTE: inter-doc waiting needs to happen before the following processing elements:
// pxPipe.textEngine: always
// pxPipe.text: only if doc.fullText==null
// pxPipe.contentMetadata: only if doc.fullText==null
// pxPipe.featureEngine: only if doc.fullText==null
for (SourcePipelinePojo pxPipe: source.getProcessingPipeline()) { // (must be non-null if we get here)
//DEBUG
//System.out.println("PX EL: " + pxPipe.display + ", " + processSpawnedDocOrNotSpawnedDoc + ", " + doc.getUrl() + ": " + toAdd.size());
// Spawned documents only enter at their spot in the pipeline:
if (!processSpawnedDocOrNotSpawnedDoc) {
if (pxPipe == doc.getSpawnedFrom()) { // (intentionally ptr ==)
processSpawnedDocOrNotSpawnedDoc = true; // (next pipeline element, start processing)
}
continue; // (skip past elements, including the spawnee)
}//TESTED (doc_splitter_test);
// Run criteria for this pipeline element:
if ((null != pxPipe.criteria) && !pxPipe.criteria.isEmpty()) {
// Check branches (read)
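// (each branch name matched by BRANCH_MAP_GET is one this element requires the document to have entered;
// if any required branch is missing from currentBranches the element is skipped)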
boolean moddedCriteria = false;
String newCriteria = pxPipe.criteria;
Matcher m1 = this.BRANCH_MAP_GET.matcher(newCriteria);
boolean modCriteria = false;
boolean branchMismatch = false;
while (m1.find()) {
modCriteria = true;
if ((null == currentBranches) || !currentBranches.contains(m1.group(1))) {
branchMismatch = true;
break;
}
}
if (branchMismatch) {
continue;
}
if (modCriteria) {
newCriteria = m1.replaceAll("");
moddedCriteria = true;
}
//TESTED (complex_criteria_test)
// Check branches (write)
String branchYes = null;
String branchNo = null;
Matcher m2 = BRANCH_MAP_SET.matcher(newCriteria);
modCriteria = false;
if (m2.find()) {
modCriteria = true;
branchYes = m2.group(1);
branchNo = m2.group(2);
}
if (modCriteria) {
newCriteria = m2.replaceAll("");
moddedCriteria = true;
}
//TESTED (complex_criteria_test)
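// Only invoke the script engine if some criteria text remains once the branch directives have been
// stripped (ie skip evaluation when the criteria consisted purely of branch directives)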
if (!moddedCriteria || !newCriteria.isEmpty()) {
if (!newCriteria.startsWith("$SCRIPT")) {
newCriteria= "$SCRIPT(" + newCriteria + ")";
}//TESTED (basic_criteria_test)
if (((null != branchYes) || (null != branchNo)) && (null == currentBranches)) {
currentBranches = new HashSet<String>();
}
if (!_sah.rejectDoc(newCriteria, doc, false)) {
if (null != branchNo) {
currentBranches.add(branchNo);
Set<String> parentBranches = this._branchMappings.get(branchNo);
if (null != parentBranches) {
currentBranches.addAll(parentBranches);
}
}
continue;
}
else {
if (null != branchYes) {
currentBranches.add(branchYes);
Set<String> parentBranches = this._branchMappings.get(branchYes);
if (null != parentBranches) {
currentBranches.addAll(parentBranches);
}
}
}
//TESTED (complex_criteria_test)
}
}//TESTED (basic_criteria_test)
//TODO (INF-2218): improve performance of doc serialization by only updating spec'd fields (note: need to change the js engine)
// and by sharing engine state between the SAH and UAH
// Save metadata state so we know if we need to re-serialize the document
int nCurrMetaFields = 0;
Object ptr = doc.getMetadata();
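// (the metadata object reference and field count are re-checked after the pipeline element runs,
// see below, to decide whether the SAH document cache needs to be reset)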
// (Only needed for text engine or feature engine - otherwise the SAH cache is reset as needed)
if ((null != pxPipe.featureEngine) || (null != pxPipe.textEngine)) {
if ((null != _sah) && (null != ptr)) {
nCurrMetaFields = doc.getMetadata().size();
}
}//TESTED (metadata_doc_cache_reset)
try {
// 3] Create new documents from existing ones
if (null != pxPipe.splitter) {
if (null == splitterList) {
splitterList = new LinkedList<DocumentPojo>();
}
try {
splitDocuments(doc, source, pxPipe, splitterList);
}
catch (Exception e) {} // do nothing, still want to keep doc unless otherwise specified below
if ((null == pxPipe.splitter.getDeleteExisting()) || pxPipe.splitter.getDeleteExisting()) {
// Don't keep original doc
docIt.remove();
doc.setTempSource(null); // (can safely corrupt this doc since it's been removed)
break;
}//TESTED (test1,test2)
}//TESTED (doc_splitter)
// 4] Text and linked document extraction
if (null != pxPipe.text) {
// IN: doc (xpath/regex) or json(doc) (js)
// OUT: doc.fullText, doc.title, doc.desc, (less common) doc.metadata.*
// POST: reset
updateInterDocDelayState(doc, false);
String cachedFullText = _uah.doManualTextEnrichment(doc, pxPipe.text, source.getRssConfig());
if (null != _sah) {
_sah.resetDocumentCache();
}
// Cache the full text if available
if ((null == _cachedRawFullText) && _cachedRawFullText_available) {
_cachedRawFullText = cachedFullText;
}//(TESTED: cache available: text_raw_to_boilerpipe, no cache available: text_then_raw_then_content*)
}
//TESTED (fulltext_regexTests.json, basic_web_uahRawText.json, text_raw_to_boilerpipe)
if (null != pxPipe.textEngine) {
// IN: doc
// OUT: doc.*
// POST: reset sah ent cache (_should_ change only metadata and text (+ents/assocs) so don't need to reset sah doc cache)
if (!handleTextEngine(pxPipe, doc, source)) {
error_on_feed_count++;
if ((null == pxPipe.textEngine.exitOnError) || pxPipe.textEngine.exitOnError) {
doc.setTempSource(null); // (can safely corrupt this doc since it's been removed)
docIt.remove();
break; // (no more processing)
}//TESTED (engines_exit_on_error)
}
} //TESTED (basic_web_test_ocOptions.json, basic_web_test_textaaOptions.json)
// 5] Document level fields
if (null != pxPipe.docMetadata) {
// IN: sah.doc
// OUT: doc.*
// POST: reset
_sah.setDocumentMetadata(doc, pxPipe.docMetadata);
_sah.resetDocumentCache();
}
//TESTED (fulltext_docMetaTest.json)
if (null != pxPipe.contentMetadata) {
// IN: doc (xpath/regex) or json(doc) (js)
// OUT: doc.meta.*
// POST: reset
updateInterDocDelayState(doc, false);
_uah.processMetadataChain(doc, pxPipe.contentMetadata, source.getRssConfig(), unstoredFields);
if (null != _sah) {
_sah.resetDocumentCache();
}
// Cache the full text if available
if ((null == _cachedRawFullText) && _cachedRawFullText_available) {
_cachedRawFullText = doc.getFullText();
}//(TESTED: (cache available) text_content_then_raw_to_boilerpipe, (not available) text_default_then_content_then_default_test.json)
}
//TESTED (fulltext_regexTests.json, basic_web_uahRawText.json)
// 6] Entities and Associations
if (null != pxPipe.entities) {
// IN: sah.doc.*, sah.doc.metadata.*,
//(recalculate from scratch then use: sah.entityMap, sah.geoMap)
// OUT: doc.entities, sah.entityMap, sah.geoMap
// POST: no need to reset anything, sah.entities never read
_sah.setEntities(doc, pxPipe.entities);
}
//TESTED (fulltext_ents_and_assocs.json)
if (null != pxPipe.associations) {
// IN: sah.doc.*, sah.doc.metadata.*, doc.entities, sah.entityMap, sah.geoMap
// OUT: doc.associations
// POST: no need to reset anything, sah.associations never read
_sah.setAssociations(doc, pxPipe.associations);
}
//TESTED (fulltext_ents_and_assocs.json)
if (null != pxPipe.featureEngine) {
// IN: doc
// OUT: doc.*
// POST: reset sah ent cache (_should_ change only metadata, ents and assocs so don't need to reset sah doc cache)
if (!handleFeatureEngine(pxPipe, doc, source)) {
error_on_feed_count++;
if ((null == pxPipe.featureEngine.exitOnError) || pxPipe.featureEngine.exitOnError) {
doc.setTempSource(null); // (can safely corrupt this doc since it's been removed)
docIt.remove();
break; // (no more processing)
}//TESTED (engines_exit_on_error_test)
}
} //TESTED (basic_web_test_ocOptions.json, basic_web_test_textaaOptions.json)
// 7] Finishing steps:
if (null != pxPipe.storageSettings) {
// IN: doc
// OUT: doc.metadata.*
// POST: reset if metadata settings present
if (!handleStorageSettings(pxPipe, doc)) {
// (this is a manual rejection not an error so we're good)
doc.setTempSource(null); // (can safely corrupt this doc since it's been removed)
docIt.remove();
break; // (no more processing for this document)
}
if ((null != pxPipe.storageSettings.exitPipeline) && pxPipe.storageSettings.exitPipeline) {
break; // (no more processing for this document)
}//TESTED (basic_criteria_test)
}//TESTED (storageSettings_test; not update - need more infrastructure)
}
catch (Exception e) { // For now we'll just handle any exception by nuking either the doc or the source
// (in the future we could consider continuing, depending on which pipeline element failed,
// or perhaps even adding a common "continue on error" option)
throw e;
}
// Check metadata state so we know if we need to re-ingest the document
// (Only needed for text engine or feature engine - otherwise the SAH cache is reset as needed)
if ((null != pxPipe.featureEngine) || (null != pxPipe.textEngine)) {
Object ptrAfter = doc.getMetadata();
int nCurrMetaFieldsAfter = 0;
if (null != _sah) {
if (null != ptrAfter) {
nCurrMetaFieldsAfter = doc.getMetadata().size();
}
if ((ptr != ptrAfter) || (nCurrMetaFieldsAfter != nCurrMetaFields))
{
_sah.resetDocumentCache();
}
}
}//TESTED (metadata_doc_cache_reset)
}//end loop over per-document processing pipeline elements
}
catch (ExtractorSourceLevelException e) { // Delete all docs, log
this.handleDocOrSourceError(source, doc, docIt, e, true);
break;
} //TESTED (c/p file_textError)
catch (ExtractorDailyLimitExceededException e) {
this.handleDocOrSourceError(source, doc, docIt, e, true);
break;
} //TESTED (c/p file_textError)
catch (ExtractorSourceLevelMajorException e) {
this.handleDocOrSourceError(source, doc, docIt, e, true);
break;
} //TESTED (file_textError)
catch (ExtractorSourceLevelTransientException e) {
this.handleDocOrSourceError(source, doc, docIt, e, true);
break;
} //TESTED (c/p file_textError)
catch (Exception e) { // Misc doc error
//e.printStackTrace();
error_on_feed_count++;
this.handleDocOrSourceError(source, doc, docIt, e, false);
// (don't break)
} //TESTED (web_errors_test)
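// Strip out any metadata fields that were flagged during processing as not to be stored with the doc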
if (!unstoredFields.isEmpty()) {
if (null != doc.getMetadata()) {
for (String fieldToDelete: unstoredFields) {
doc.getMetadata().remove(fieldToDelete);
}
}
} //TESTED (storageSettings_advanced.json)
}//end loop over documents