// (applies this value to sleeps inside UAH.executeHarvest)
feedConfig.setWaitTimeOverride_ms(searchConfig.getWaitTimeBetweenPages_ms());
}
//TESTED (including RSS-level value being written back again and applied in SAH/UAH code)
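// Build a temporary "search" document (or reuse the doc being split) to drive the search engine query below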
DocumentPojo searchDoc = docToSplit;
Object[] savedMeta = null;
if (null == searchDoc) {
searchDoc = new DocumentPojo();
// Required terms:
searchDoc.setUrl(url);
searchDoc.setScore((double)nIteratingDepth); // (spidering param)
// Handy terms
if (null != src.getHarvestStatus()) {
searchDoc.setModified(src.getHarvestStatus().getHarvested()); // the last time the source was harvested - can use to determine how far back to go
}
// If these exist (they won't normally), fill them:
searchDoc.setFullText(currFullText);
searchDoc.setDescription(currDesc);
searchDoc.setTitle(currTitle);
}//TOTEST
else if (null != searchDoc.getMetadata()){
savedMeta = searchDoc.getMetadata().remove("searchEngineSubsystem");
// (this is normally null)
}//TOTEST
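// Run the unstructured analysis harvester over the search document; the per-page results are written into its "searchEngineSubsystem" metadata field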
UnstructuredAnalysisHarvester dummyUAH = new UnstructuredAnalysisHarvester();
boolean bMoreDocs = (nPage < nMaxPages - 1);
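// (true if another page will be requested after this one - passed to executeHarvest below)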
Object[] searchResults = null;
try {
dummyUAH.executeHarvest(context, src, searchDoc, false, bMoreDocs);
// (the leading false means that we never sleep *before* the query, only after)
searchResults = searchDoc.getMetadata().get("searchEngineSubsystem");
}
finally {
if (null != savedMeta) { // (this is really obscure, but handles the case where someone has already created this meta field)
searchDoc.getMetadata().put("searchEngineSubsystem", savedMeta);
}
else if ((null != searchDoc) && (null != searchDoc.getMetadata())) {
searchDoc.getMetadata().remove("searchEngineSubsystem");
}
}//TOTEST
//DEBUG
//System.out.println("NEW DOC MD: " + new com.google.gson.GsonBuilder().setPrettyPrinting().create().toJson(searchDoc.getMetadata()));
// Create extraUrl entries from the metadata
if ((null != searchResults) && (searchResults.length > 0)) {
for (Object searchResultObj: searchResults) {
try {
BasicDBObject bsonObj = (BasicDBObject)searchResultObj;
// Fields: url, title, description (optional), publishedDate, fullText, spiderOut
String linkUrl = bsonObj.getString(DocumentPojo.url_);
nLinksFound++;
if (!dedupSet.contains(linkUrl)) {
dedupSet.add(linkUrl);
String linkTitle = bsonObj.getString(DocumentPojo.title_);
String linkDesc = bsonObj.getString(DocumentPojo.description_);
String linkPubDate = bsonObj.getString(DocumentPojo.publishedDate_);
String linkFullText = bsonObj.getString(DocumentPojo.fullText_);
String spiderOut = bsonObj.getString("spiderOut");
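// ("spiderOut" flags result links that should themselves be spidered for further links - see below)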
if (null != linkUrl) {
SourceRssConfigPojo.ExtraUrlPojo link = new SourceRssConfigPojo.ExtraUrlPojo();
link.url = linkUrl;
link.title = linkTitle;
link.description = linkDesc;
link.publishedDate = linkPubDate;
link.fullText = linkFullText;
if (!stopLinkFollowing && (null != itUrls) && (null != spiderOut) && spiderOut.equalsIgnoreCase("true")) {
// In this case, add it back to the original list for chained processing
if (null == waitingList) {
waitingList = new LinkedList<ExtraUrlPojo>();
}
waitingList.add(link);
// (can't result in an infinite loop like this because links are only
// added if they aren't already in dedupSet)
} //TESTED
if (null != linkTitle) {
boolean isDuplicate = false;
if (!stopPaginating && searchConfig.getStopPaginatingOnDuplicate()) {
// Quick duplicate check (full one gets done later)
isDuplicate = context.getDuplicateManager().isDuplicate_Url(linkUrl, src, null);
}//TESTED
if (!isDuplicate) {
if (null == feedConfig.getExtraUrls()) {
feedConfig.setExtraUrls(new ArrayList<ExtraUrlPojo>(searchResults.length));
}
feedConfig.getExtraUrls().add(link);
}
else {
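// Duplicate found: stop paginating; if pagination isn't driven by a page-change regex, stop following links too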
stopPaginating = true;
if (null == feedConfig.getSearchConfig().getPageChangeRegex()) {
stopLinkFollowing = true;
}//TESTED
}//TESTED
}
}
}//(end if URL not already found)
}
catch (Exception e) {
// (just carry on)
//DEBUG
//e.printStackTrace();
}
}
}//TESTED
else if (0 == nPage) { // returned no links; if this is the first page and an _ONERROR_ message was saved, report it as an error
Object[] onError = searchDoc.getMetadata().get("_ONERROR_");
if ((null != onError) && (onError.length > 0) && (onError[0] instanceof String) && !((String)onError[0]).isEmpty()) {
throw new ExtractorSourceLevelTransientException("generateFeedFromSearch: _ONERROR_: " + onError[0]);
}
}//TESTED
if (context.isStandalone()) { // debug mode, will display some additional logging
Object[] onDebug = searchDoc.getMetadata().get("_ONDEBUG_");
if ((null != onDebug) && (onDebug.length > 0)) {
for (Object debug: onDebug) {
if (debug instanceof String) {
context.getHarvestStatus().logMessage("_ONDEBUG_: " + (String)debug, true);
}