// (probably slightly less efficient than checking duplicates here, but much simpler, can
// always change it later)
String savedUrl = src.getUrl();
SourceRssConfigPojo feedConfig = src.getRssConfig();
SourceSearchFeedConfigPojo searchConfig = feedConfig.getSearchConfig();
String savedProxyOverride = feedConfig.getProxyOverride();
if ((null == feedConfig) || (null == searchConfig)) {
return;
}
// Not allowed to stop paginating on duplicate in success_iteration/error cases (ie only when the last harvest status was "success")
if ((null == src.getHarvestStatus()) || (HarvestEnum.success != src.getHarvestStatus().getHarvest_status())) {
searchConfig.setStopPaginatingOnDuplicate(false);
}//TESTED
UnstructuredAnalysisConfigPojo savedUAHconfig = src.getUnstructuredAnalysisConfig(); // (can be null)
String savedUserAgent = feedConfig.getUserAgent();
LinkedHashMap<String, String> savedHttpFields = feedConfig.getHttpFields();
Integer savedWaitTimeOverride_ms = feedConfig.getWaitTimeOverride_ms();
// Create a deduplication set to ensure URLs derived from the search pages don't duplicate the originals
// (and also derived URLs)
HashSet<String> dedupSet = new HashSet<String>();
if (null != src.getRssConfig().getExtraUrls()) {
Iterator<ExtraUrlPojo> itDedupUrls = src.getRssConfig().getExtraUrls().iterator();
while (itDedupUrls.hasNext()) {
ExtraUrlPojo itUrl = itDedupUrls.next();
if (null != itUrl.title) {
String dedupUrl = itUrl.url;
dedupSet.add(dedupUrl);
if (maxDocsPerCycle != Integer.MAX_VALUE) {
maxDocsPerCycle++; // (ensure we get as far as adding these)
}
}
}
}//TESTED
Iterator<ExtraUrlPojo> itUrls = null;
// Spider parameters used in conjunction with itUrls
List<ExtraUrlPojo> iteratingList = null;
List<ExtraUrlPojo> waitingList = null;
int nIteratingDepth = 0;
// (ie no URL specified, so using extra URLs as search URLs - and optionally as real URLs also)
if ((null == savedUrl) && (null != src.getRssConfig().getExtraUrls()) && !src.getRssConfig().getExtraUrls().isEmpty()) {
// Spider logic:
iteratingList = src.getRssConfig().getExtraUrls();
// (end spidering logic)
itUrls = iteratingList.iterator();
src.getRssConfig().setExtraUrls(new LinkedList<ExtraUrlPojo>());
// (ie overwrite the original list)
}//TESTED
for (;;) { // The logic for this loop can vary...
if (dedupSet.size() >= maxDocsPerCycle) {
break;
}
String currTitle = null;
String currFullText = null;
String currDesc = null;
if (null != itUrls) {
ExtraUrlPojo urlPojo = itUrls.next();
savedUrl = urlPojo.url;
if (0 == nIteratingDepth) {
if (null != urlPojo.title) { // Also harvest this
src.getRssConfig().getExtraUrls().add(urlPojo);
if (maxDocsPerCycle != Integer.MAX_VALUE) {
maxDocsPerCycle--; // (now added, can remove)
}
}
}
currTitle = urlPojo.title;
currDesc = urlPojo.description;
currFullText = urlPojo.fullText;
}//TESTED
try { // If we error out, we're probably going to abandon the entire search
// We're going to loop over pages
// Apply the regex to the URL for pagination, part 1
int nResultOffset = 0;
int nMaxPages = 1;
Pattern pageChangeRegex = null;
Matcher pageChangeRegexMatcher = null;
if (null != feedConfig.getSearchConfig().getPageChangeRegex()) {
pageChangeRegex = Pattern.compile(feedConfig.getSearchConfig().getPageChangeRegex(), Pattern.CASE_INSENSITIVE);
pageChangeRegexMatcher = pageChangeRegex.matcher(savedUrl);
nMaxPages = feedConfig.getSearchConfig().getNumPages();
if (pageChangeRegexMatcher.find()) {
String group = pageChangeRegexMatcher.group(1);
if (null != group) {
try {
nResultOffset = Integer.parseInt(group);
}
catch (Exception e) {} // just carry on
}
}
else { // URL doesn't match
pageChangeRegexMatcher = null;
}//TESTED
}//TESTED
// Page limit check (see also nLinksFound/nCurrDedupSetSize inside loop)
int nMinLinksToExitLoop = 10; // (use to check one iteration past the point at which nothing happens)
// If checking vs duplicates then have a flag to exit (note: only applies to the current URL)
boolean stopPaginating = false;
boolean stopLinkFollowing = false;
// (if set to stop paginating but only link following occurs, assume this is treated like pagination, eg nextUrl sort of thing)
for (int nPage = 0; nPage < nMaxPages; ++nPage) {
if ((dedupSet.size() >= maxDocsPerCycle) || stopPaginating) {
if (dedupSet.size() >= maxDocsPerCycle) {
src.setReachedMaxDocs();
}
break;
}
// Will use this to check if we reached a page limit (eg some sites will just repeat the same page over and over again)
int nLinksFound = 0;
int nCurrDedupSetSize = dedupSet.size();
String url = savedUrl;
// Apply the regex to the URL for pagination, part 2
if ((null != pageChangeRegex) && (null != feedConfig.getSearchConfig().getPageChangeReplace())) {
int nResultStart = nPage*feedConfig.getSearchConfig().getNumResultsPerPage() + nResultOffset;
String replace = feedConfig.getSearchConfig().getPageChangeReplace().replace("$1", Integer.toString(nResultStart));
if (null == pageChangeRegexMatcher) {
url += replace;
}
else {
url = pageChangeRegexMatcher.replaceFirst(replace);
}
}//TESTED
//DEBUG
//System.out.println("URL=" + url);
// Create a custom UAH object to fetch and parse the search results
UnstructuredAnalysisConfigPojo dummyUAHconfig = new UnstructuredAnalysisConfigPojo();
if (null == feedConfig.getSearchConfig().getScriptflags()) { // Set flags if necessary
if (null == feedConfig.getSearchConfig().getExtraMeta()) {
feedConfig.getSearchConfig().setScriptflags("dt");
}
else {
feedConfig.getSearchConfig().setScriptflags("dtm");
}
}
if (null != feedConfig.getSearchConfig().getExtraMeta()) {
dummyUAHconfig.CopyMeta(feedConfig.getSearchConfig().getExtraMeta());
// Legacy -> Pipeline port
for (metaField extraMeta: dummyUAHconfig.getMeta()) {
if (null == extraMeta.context) { // mandatory in legacy, discarded in pipeline!
extraMeta.context = Context.First;
}
}
}
dummyUAHconfig.setScript(feedConfig.getSearchConfig().getGlobals());
dummyUAHconfig.AddMetaField("searchEngineSubsystem", Context.All, feedConfig.getSearchConfig().getScript(), "javascript", feedConfig.getSearchConfig().getScriptflags());
src.setUnstructuredAnalysisConfig(dummyUAHconfig);
if (null != searchConfig.getProxyOverride()) {
feedConfig.setProxyOverride(searchConfig.getProxyOverride());
}
if (null != searchConfig.getUserAgent()) {
feedConfig.setUserAgent(searchConfig.getUserAgent());
}
if (null != searchConfig.getHttpFields()) {
feedConfig.setHttpFields(searchConfig.getHttpFields());
}
if (null != searchConfig.getWaitTimeBetweenPages_ms()) {
// Web etiquette: don't hit the same site too often
// (applies this value to sleeps inside UAH.executeHarvest)
feedConfig.setWaitTimeOverride_ms(searchConfig.getWaitTimeBetweenPages_ms());
}
//TESTED (including RSS-level value being written back again and applied in SAH/UAH code)
DocumentPojo searchDoc = docToSplit;
Object[] savedMeta = null;
if (null == searchDoc) {
searchDoc = new DocumentPojo();
// Required terms:
searchDoc.setUrl(url);
searchDoc.setScore((double)nIteratingDepth); // (spidering param)
// Handy terms
if (null != src.getHarvestStatus()) {
searchDoc.setModified(src.getHarvestStatus().getHarvested()); // the last time the source was harvested - can use to determine how far back to go
}
// If these exist (they won't normally), fill them:
searchDoc.setFullText(currFullText);
searchDoc.setDescription(currDesc);
searchDoc.setTitle(currTitle);
}//TOTEST
else if (null != searchDoc.getMetadata()){
savedMeta = searchDoc.getMetadata().remove("searchEngineSubsystem");
// (this is normally null)
}//TOTEST
UnstructuredAnalysisHarvester dummyUAH = new UnstructuredAnalysisHarvester();
boolean bMoreDocs = (nPage < nMaxPages - 1);
Object[] searchResults = null;
try {
dummyUAH.executeHarvest(context, src, searchDoc, false, bMoreDocs);
// (the leading false means that we never sleep *before* the query, only after)
searchResults = searchDoc.getMetaData().get("searchEngineSubsystem");
}
finally {
if (null != savedMeta) { // (this is really obscure but handle the case where someone has created this meta field already)
searchDoc.getMetadata().put("searchEngineSubsystem", savedMeta);
}
else if ((null != searchDoc) && (null != searchDoc.getMetadata())) {
searchDoc.getMetadata().remove("searchEngineSubsystem");
}
}//TOTEST
//DEBUG
//System.out.println("NEW DOC MD: " + new com.google.gson.GsonBuilder().setPrettyPrinting().create().toJson(searchDoc.getMetadata()));
// Create extraUrl entries from the metadata
if ((null != searchResults) && (searchResults.length > 0)) {
for (Object searchResultObj: searchResults) {
try {
BasicDBObject bsonObj = (BasicDBObject)searchResultObj;
// 3 fields: url, title, description(=optional)
String linkUrl = bsonObj.getString(DocumentPojo.url_);
nLinksFound++;
if (!dedupSet.contains(linkUrl)) {
dedupSet.add(linkUrl);
String linkTitle = bsonObj.getString(DocumentPojo.title_);
String linkDesc = bsonObj.getString(DocumentPojo.description_);
String linkPubDate = bsonObj.getString(DocumentPojo.publishedDate_);
String linkFullText = bsonObj.getString(DocumentPojo.fullText_);
String spiderOut = bsonObj.getString("spiderOut");
if (null != linkUrl) {
SourceRssConfigPojo.ExtraUrlPojo link = new SourceRssConfigPojo.ExtraUrlPojo();
link.url = linkUrl;
link.title = linkTitle;
link.description = linkDesc;
link.publishedDate = linkPubDate;
link.fullText = linkFullText;
if (!stopLinkFollowing && (null != itUrls) && (null != spiderOut) && spiderOut.equalsIgnoreCase("true")) {
// In this case, add it back to the original list for chained processing
if (null == waitingList) {
waitingList = new LinkedList<ExtraUrlPojo>();
}
waitingList.add(link);
// (can't result in an infinite loop like this because we check
// dedupSet.size() and only allow links not already in dedupSet)
} //TESTED
if (null != linkTitle) {
boolean isDuplicate = false;
if (!stopPaginating && searchConfig.getStopPaginatingOnDuplicate()) {
// Quick duplicate check (full one gets done later)
isDuplicate = context.getDuplicateManager().isDuplicate_Url(linkUrl, src, null);
}//TESTED
if (!isDuplicate) {
if (null == feedConfig.getExtraUrls()) {
feedConfig.setExtraUrls(new ArrayList<ExtraUrlPojo>(searchResults.length));
}
feedConfig.getExtraUrls().add(link);
}
else {
stopPaginating = true;
if (null == feedConfig.getSearchConfig().getPageChangeRegex()) {
stopLinkFollowing = true;
}//TESTED
}//TESTED
}
}
}//(end if URL not already found)
}
catch (Exception e) {
// (just carry on)
//DEBUG
//e.printStackTrace();
}
}
}//TESTED
else if (0 == nPage) { // returned no links: if this is the first page and a non-empty _ONERROR_ message has been saved, throw it as a source-level error
Object[] onError = searchDoc.getMetaData().get("_ONERROR_");
if ((null != onError) && (onError.length > 0) && (onError[0] instanceof String) && !(((String)(onError[0]))).isEmpty()) {
throw new ExtractorSourceLevelTransientException("generateFeedFromSearch: _ONERROR_: " + onError[0]);
}
}//TESTED
if (context.isStandalone()) { // debug mode, will display some additional logging
Object[] onDebug = searchDoc.getMetaData().get("_ONDEBUG_");
if ((null != onDebug) && (onDebug.length > 0)) {
for (Object debug: onDebug) {
if (debug instanceof String) {
context.getHarvestStatus().logMessage("_ONDEBUG_: " + (String)debug, true);
}
else {
context.getHarvestStatus().logMessage("_ONDEBUG_: " + new com.google.gson.Gson().toJson(debug), true);
}
}
}
}//TESTED
// PAGINATION BREAK LOGIC:
// 1: All the links are duplicates of links already in the DB
// 2: No new links from last page
// LOGIC CASE 1: (All the links are duplicates of links already in the DB)
//(already handled above)
// LOGIC CASE 2: (No new links from last page)
//DEBUG
//System.out.println("LINKS_SIZE=" + feedConfig.getExtraUrls().size());
//System.out.println("LINKS=\n"+new com.google.gson.GsonBuilder().setPrettyPrinting().create().toJson(feedConfig.getExtraUrls()));
if (dedupSet.size() == nCurrDedupSetSize) { // All links were duplicate
//DEBUG
//System.out.println("FOUND " + nLinksFound + " vs " + nMinLinksToExitLoop + " duplicate URLs (" + nCurrDedupSetSize + ")");
if (nLinksFound >= nMinLinksToExitLoop) { // (at least 10 found so insta-quit)
break;
}
else { // (fewer than 10 found - including none - so try one more page, then exit if that adds nothing new either)
nMinLinksToExitLoop = 0; // (also handles the no links found case)
}
}//TESTED
else {
nMinLinksToExitLoop = 10; // (reset)
}//TESTED
}// end loop over pages
}
catch (Exception e) {
//DEBUG
//e.printStackTrace();
if ((null == dedupSet) || dedupSet.isEmpty()) {
throw new ExtractorSourceLevelTransientException("generateFeedFromSearch: " + e.getMessage());
}
else {
throw new ExtractorDocumentLevelException("generateFeedFromSearch: " + e.getMessage());
}
// (don't log since these errors will appear in the log under the source, ie more usefully)
}//TESTED
finally {
// Fix any temp changes we made to the source
src.setUnstructuredAnalysisConfig(savedUAHconfig);
feedConfig.setUserAgent(savedUserAgent);
feedConfig.setHttpFields(savedHttpFields);
feedConfig.setWaitTimeOverride_ms(savedWaitTimeOverride_ms);
feedConfig.setProxyOverride(savedProxyOverride);
}
if (null == itUrls) {
break;
}
else if (!itUrls.hasNext()) {
if (null != waitingList) {
// Spider logic:
if (null == searchConfig.getMaxDepth()) {
searchConfig.setMaxDepth(2); // (default max depth is 2 hops, ie original document, link, link from link)
}
nIteratingDepth++;
if (nIteratingDepth > searchConfig.getMaxDepth()) {
break;
}
itUrls = waitingList.iterator();
waitingList = null;
// (end spider logic)