// i.e. an extractor was specified but it doesn't exist...
StringBuffer errMsg = new StringBuffer("Skipping source=").append(source.getKey()).append(" no_extractor=").append(source.useExtractor());
logger.warn(errMsg.toString());
// No point trying this for the rest of the day
throw new ExtractorSourceLevelException(errMsg.toString());
}
else if (null == source.useExtractor()) { // Didn't specify one, just use default:
currentEntityExtractor = default_entity_extractor;
}
}//TESTED
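// "Finalize" mode: no documents to process here - just give a batching entity extractor the chance
// to flush anything it has queued (by convention in this code, extractEntities(null) acts as the
// end-of-batch signal) and return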
if (bFinalizeBatchOnly) {
try {
currentEntityExtractor.extractEntities(null);
}
catch (Exception e) {} // Do nothing - e.g. tolerate entity extractors that don't handle the null (finalize) document gracefully
return;
}
// A teeny bit of complex logic:
// toAdd by default use a text extractor
// DB/Files by default don't (but can override)
ITextExtractor currentTextExtractor = null;
boolean bUseRawContentWhereAvailable = false; // (only applies for feeds)
if (null != source.useTextExtractor()) {
currentTextExtractor = text_extractor_mappings.get(source.useTextExtractor().toLowerCase());
if (null == currentTextExtractor) { // (second chance)
currentTextExtractor = (ITextExtractor) lookForDynamicExtractor(source, true);
}
}
if (null == currentTextExtractor) { // none specified, or the specified one wasn't found (the latter is an error)
if (null != source.useTextExtractor()) {
if ((null == source.getStructuredAnalysisConfig()) && (null == source.getUnstructuredAnalysisConfig())
&& (null == source.getProcessingPipeline()))
{
//(UAH and SAH get raw access to the data if they need it, so can carry on - ditto processing pipeline)
StringBuffer errMsg = new StringBuffer("Skipping source=").append(source.getKey()).append(" no_txt_extractor=").append(source.useTextExtractor());
logger.warn(errMsg.toString());
// No point trying this for the rest of the day
throw new ExtractorSourceLevelException(errMsg.toString());
}
else {
bUseRawContentWhereAvailable = true; // (only checked for feeds)
}//TESTED
}
else if ("feed".equalsIgnoreCase(source.getExtractType())) // (null-safe check; DB/files just use their existing fullText)
{
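// Feeds have no full text yet: check whether the entity extractor can fetch the URL content
// itself (URLTextExtraction capability); if not, fall back to the default text extractor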
if (null != currentEntityExtractor) {
String selfExtraction = currentEntityExtractor.getCapability(EntityExtractorEnum.URLTextExtraction);
// Leave the text extractor as null unless the entity extractor has no built-in URL text extraction
if ((null == selfExtraction) || !selfExtraction.equals("true"))
{
currentTextExtractor = default_text_extractor;
}
}
else {
currentTextExtractor = default_text_extractor;
}
}//TESTED
}
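// (At this point currentTextExtractor is non-null only if a named extractor was found, or this is
// a feed whose entity extractor can't fetch URL content itself)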
// EXTRACTION
Iterator<DocumentPojo> i = toAdd.iterator(); // (iterator used so that elements can be removed from the toAdd list within the loop)
while ( i.hasNext() )
{
long nTime_ms = System.currentTimeMillis();
DocumentPojo doc = i.next();
boolean bExtractedText = false;
// If I've been stopped then just remove all remaining documents
// (pick them up next time through)
if (bIsKilled) {
i.remove();
if (!calledFromPipeline) {
doc.setTempSource(null); // (can safely corrupt this doc since it's been removed)
}
continue;
}
if ( calledFromPipeline || !urlsThatError.contains(doc.getUrl()) ) //only attempt if url is okay
{
feed_count++;
try {
// (Check whether the document exceeds the extractor's advertised max input size - this only warns, any truncation is up to the extractor)
if ((null != currentEntityExtractor) && (null != doc.getFullText())) {
try {
String s = currentEntityExtractor.getCapability(EntityExtractorEnum.MaxInputBytes);
if (null != s) {
int maxLength = Integer.parseInt(s);
if (doc.getFullText().length() > maxLength) { //just warn, it's up to the extractor to sort it out
getHarvestStatus().logMessage("Warning: truncating document to max length: " + s, false);
}
}
}
catch (Exception e) {} // max length not reported (or not numeric), just carry on
}
if (null != currentTextExtractor)
{
bExtractedText = true;
currentTextExtractor.extractText(doc);
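// (the text extractor populates the document's full text, which the entity extractor then processes)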
if (null != currentEntityExtractor) {
currentEntityExtractor.extractEntities(doc);
}
}//TESTED
else //db/filesys should already have full text extracted (unless otherwise specified)
{
if ("feed".equalsIgnoreCase(source.getExtractType())) { // Feeds need full text - use existing raw content where allowed, otherwise have the entity extractor fetch it
if ((null == doc.getFullText()) || !bUseRawContentWhereAvailable) {
bExtractedText = true;
if (null != currentEntityExtractor) {
currentEntityExtractor.extractEntitiesAndText(doc);
}
}//TESTED (AlchemyAPI case)
else { // Feed for which we've already extracted data
if (null != currentEntityExtractor) {
currentEntityExtractor.extractEntities(doc);
}
}//TESTED
}
else { // DB/File => use full text
if (null != currentEntityExtractor) {
currentEntityExtractor.extractEntities(doc);
}
}//TESTED
}
//statistics counting
if ( doc.getEntities() != null )
num_ent_extracted.addAndGet(doc.getEntities().size());
if ( doc.getAssociations() != null )
num_event_extracted.addAndGet(doc.getAssociations().size());
}
catch (ExtractorDailyLimitExceededException e) {
//extractor can't do anything else today, return
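// Remove this document and all remaining ones without marking them as errors -
// they will be picked up again on the next harvest cycle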
i.remove();
if (!calledFromPipeline) {
doc.setTempSource(null); // (can safely corrupt this doc since it's been removed)
}
// Source error, ignore all other documents
while (i.hasNext()) {
doc = i.next();
if (!calledFromPipeline) {
doc.setTempSource(null); // (can safely corrupt this doc since it's been removed)
}
i.remove();
}
//TESTED
throw e; // (ie stop processing this source)
}//TESTED
catch (Exception e) { // Anything except daily limit exceeded, expect it to be ExtractorDocumentLevelException
//TODO (INF-1922): put this in a separate function and call that from pipeline on failure...
// (not sure what to do about error_on_feed_count though, need to maintain a separate one of those in pipeline?)
// This can come from (sort-of/increasingly) "user" code so provide a bit more information
StringBuffer errMessage = HarvestExceptionUtils.createExceptionMessage(e);
_harvestStatus.logMessage(errMessage.toString(), true);
num_error_url.incrementAndGet();
nUrlErrorsThisSource++;
if (!calledFromPipeline) {
urlsThatError.add(doc.getUrl());
}
error_on_feed_count++;
i.remove();
if (!calledFromPipeline) {
doc.setTempSource(null); // (can safely corrupt this doc since it's been removed)
}
}
//TESTED
}
// (note this is only ever called in legacy mode - it's handled in the HarvestControllerPipeline)
if ((null != source.getExtractType()) && (source.getExtractType().equalsIgnoreCase("feed"))) {
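// Politeness delay between remote fetches: only sleep if web content was actually fetched
// (bExtractedText) and there are more documents to process; the configured inter-document
// delay is reduced by the time already spent processing this document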
if (i.hasNext() && bExtractedText) {
nTime_ms = nBetweenFeedDocs_ms - (System.currentTimeMillis() - nTime_ms); // (ie delay time - processing time)
if (nTime_ms > 0) {
try { Thread.sleep(nTime_ms); } catch (Exception e) {}
// (wait 10s between web-site accesses for politeness)
}
}
}//(TESTED)
} // end loop over documents
//check whether every document in toAdd errored (and there were more than 5 - arbitrary threshold);
//escalate to a major source-level error if more than 20 errored
//NOTE: this is duplicated in HarvestControllerPipeline for non-legacy cases
if ((error_on_feed_count == feed_count) && (feed_count > 5))
{
String errorMsg = new StringBuffer().append(feed_count).append(" docs, ").append(error_on_feed_count).append(" errors").toString();
if (error_on_feed_count > 20) {
throw new ExtractorSourceLevelMajorException(errorMsg);
}
else {
throw new ExtractorSourceLevelException(errorMsg);
}//TESTED
}
}
catch (ExtractorDailyLimitExceededException e) {
// Percolate upwards!