// A teeny bit of complex logic:
// docs in toAdd (ie feeds) use a text extractor by default
// DB/file sources by default don't (but can override)
ITextExtractor currentTextExtractor = null;
boolean bUseRawContentWhereAvailable = false; // (only applies for feeds)
if (null != source.useTextExtractor()) {
currentTextExtractor = text_extractor_mappings.get(source.useTextExtractor().toLowerCase());
if (null == currentTextExtractor) { // (second chance)
currentTextExtractor = (ITextExtractor) lookForDynamicExtractor(source, true);
}
}
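// (if neither the static mappings nor the dynamic second-chance lookup resolved an extractor,
//  the logic below decides whether that is fatal for this source)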
if (null == currentTextExtractor) { // none specified, or specified but not found (the latter is an error)
if (null != source.useTextExtractor()) {
if ((null == source.getStructuredAnalysisConfig()) && (null == source.getUnstructuredAnalysisConfig())
&& (null == source.getProcessingPipeline()))
{
//(UAH and SAH get raw access to the data if they need it, so can carry on - ditto processing pipeline)
StringBuilder errMsg = new StringBuilder("Skipping source=").append(source.getKey()).append(" no_txt_extractor=").append(source.useTextExtractor());
logger.warn(errMsg.toString());
// No point trying this for the rest of the day
throw new ExtractorSourceLevelException(errMsg.toString());
}
else {
bUseRawContentWhereAvailable = true; // (only checked for feeds)
}//TESTED
}
else if ("feed".equalsIgnoreCase(source.getExtractType())) // (null-safe check; DB/files just use their existing fullText)
{
if (null != currentEntityExtractor) {
String selfExtraction = currentEntityExtractor.getCapability(EntityExtractorEnum.URLTextExtraction);
// Leave as null unless the entity extractor has no built-in URL text extraction
if ((null == selfExtraction) || !selfExtraction.equals("true"))
{
currentTextExtractor = default_text_extractor;
}
}
else {
currentTextExtractor = default_text_extractor;
}
}//TESTED
}
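// (At this point currentTextExtractor is: the explicitly configured extractor; the default extractor
//  for feeds whose entity extractor can't fetch URL text itself; or null, meaning the existing/raw
//  fullText - or the entity extractor's own URL handling - is used instead)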
// EXTRACTION
Iterator<DocumentPojo> i = toAdd.iterator(); // (iterator used so that elements can be removed from the toAdd list within the loop)
while ( i.hasNext() )
{
long nTime_ms = System.currentTimeMillis();
DocumentPojo doc = i.next();
boolean bExtractedText = false;
// If I've been stopped then just remove all remaining documents
// (pick them up next time through)
if (bIsKilled) {
i.remove();
if (!calledFromPipeline) {
doc.setTempSource(null); // (can safely corrupt this doc since it's been removed)
}
continue;
}
if ( calledFromPipeline || !urlsThatError.contains(doc.getUrl()) ) //only attempt if url is okay
{
feed_count++;
try {
// (Check whether the doc exceeds the entity extractor's max input size, ie whether it will get truncated)
if ((null != currentEntityExtractor) && (null != doc.getFullText())) {
try {
String s = currentEntityExtractor.getCapability(EntityExtractorEnum.MaxInputBytes);
if (null != s) {
int maxLength = Integer.parseInt(s);
if (doc.getFullText().length() > maxLength) { //just warn, it's up to the extractor to sort it out
getHarvestStatus().logMessage("Warning: truncating document to max length: " + s, false);
}
}
}
catch (Exception e) {} // (max length not reported or unparseable - just carry on)
}
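// (Note: nothing is actually truncated here - this just logs a warning; the extractor is expected
//  to enforce its own MaxInputBytes limit)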
if (null != currentTextExtractor)
{
bExtractedText = true;
currentTextExtractor.extractText(doc);
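// (the entity extractor below then runs over the doc's newly extracted text)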
if (null != currentEntityExtractor) {
currentEntityExtractor.extractEntities(doc);
}
}//TESTED