// Not allowed to stop paginating on duplicate in success_iteration/error cases
if ((null == src.getHarvestStatus()) || (HarvestEnum.success != src.getHarvestStatus().getHarvest_status())) {
searchConfig.setStopPaginatingOnDuplicate(false);
}//TESTED
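// Save the source's current UAH config and the feed config's user agent / HTTP fields / wait time,
// since the search logic overrides them below (presumably they are restored once the search completes)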
UnstructuredAnalysisConfigPojo savedUAHconfig = src.getUnstructuredAnalysisConfig(); // (can be null)
String savedUserAgent = feedConfig.getUserAgent();
LinkedHashMap<String, String> savedHttpFields = feedConfig.getHttpFields();
Integer savedWaitTimeOverride_ms = feedConfig.getWaitTimeOverride_ms();
// Create a deduplication set to ensure URLs derived from the search pages don't duplicate the originals
// (or other derived URLs)
HashSet<String> dedupSet = new HashSet<String>();
if (null != src.getRssConfig().getExtraUrls()) {
Iterator<ExtraUrlPojo> itDedupUrls = src.getRssConfig().getExtraUrls().iterator();
while (itDedupUrls.hasNext()) {
ExtraUrlPojo itUrl = itDedupUrls.next();
if (null != itUrl.title) {
String dedupUrl = itUrl.url;
dedupSet.add(dedupUrl);
if (maxDocsPerCycle != Integer.MAX_VALUE) {
maxDocsPerCycle++; // (bump the limit so these pre-specified URLs don't eat into the per-cycle budget)
}
}
}
}//TESTED
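// (only extra URLs with a title are pre-seeded: per the loop below, titled extra URLs are also
// harvested as documents in their own right, so links derived from the search pages must not duplicate them)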
Iterator<ExtraUrlPojo> itUrls = null;
// Spider parameters used in conjunction with itUrls
List<ExtraUrlPojo> iteratingList = null;
List<ExtraUrlPojo> waitingList = null;
int nIteratingDepth = 0;
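// Spidering state used together with itUrls: iteratingList holds the URLs at the current depth,
// waitingList presumably collects newly discovered URLs for the next depth, and nIteratingDepth
// tracks how deep the spider currently is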
// (ie no URL was specified, so use the extra URLs as search URLs - and optionally also as documents in their own right)
if ((null == savedUrl) && (null != src.getRssConfig().getExtraUrls()) && !src.getRssConfig().getExtraUrls().isEmpty()) {
// Spider logic:
iteratingList = src.getRssConfig().getExtraUrls();
// (end spidering logic)
itUrls = iteratingList.iterator();
src.getRssConfig().setExtraUrls(new LinkedList<ExtraUrlPojo>());
// (ie overwrite the original list)
}//TESTED
for (;;) { // Each iteration processes one search URL - either the single saved URL, or the next extra/spidered URL when itUrls is set
if (dedupSet.size() >= maxDocsPerCycle) {
break;
}
String currTitle = null;
String currFullText = null;
String currDesc = null;
if (null != itUrls) {
ExtraUrlPojo urlPojo = itUrls.next();
savedUrl = urlPojo.url;
if (0 == nIteratingDepth) {
if (null != urlPojo.title) { // Also harvest this
src.getRssConfig().getExtraUrls().add(urlPojo);
if (maxDocsPerCycle != Integer.MAX_VALUE) {
maxDocsPerCycle--; // (now that it has been added, undo the earlier increment)
}
}
}
currTitle = urlPojo.title;
currDesc = urlPojo.description;
currFullText = urlPojo.fullText;
}//TESTED
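// savedUrl (plus currTitle/currDesc/currFullText when the URL came from the extra-URL list) now
// describes the search page to process in this iteration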
try { // If we error out, we're probably going to abandon the entire search
// We're going to loop over pages
// Apply the regex to the URL for pagination, part 1
int nResultOffset = 0;
int nMaxPages = 1;
Pattern pageChangeRegex = null;
Matcher pageChangeRegexMatcher = null;
if (null != feedConfig.getSearchConfig().getPageChangeRegex()) {
pageChangeRegex = Pattern.compile(feedConfig.getSearchConfig().getPageChangeRegex(), Pattern.CASE_INSENSITIVE);
pageChangeRegexMatcher = pageChangeRegex.matcher(savedUrl);
nMaxPages = feedConfig.getSearchConfig().getNumPages();
if (pageChangeRegexMatcher.find()) {
String group = pageChangeRegexMatcher.group(1);
if (null != group) {
try {
nResultOffset = Integer.parseInt(group);
}
catch (NumberFormatException e) {} // not a valid number - just carry on with offset 0
}
}
else { // URL doesn't match
pageChangeRegexMatcher = null;
}//TESTED
}//TESTED
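// nResultOffset now holds any result offset already embedded in the URL, and pageChangeRegexMatcher
// is non-null only if the URL actually matched the pagination regex (both drive the URL rewrite below)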
// Page limit check (see also nLinksFound/nCurrDedupSetSize inside the loop)
int nMinLinksToExitLoop = 10; // (used to check one iteration past the point at which nothing new happens)
// If checking against duplicates then use a flag to exit early (note: only applies to the current URL)
boolean stopPaginating = false;
boolean stopLinkFollowing = false;
// (if told to stop paginating but only link-following occurs, treat that like pagination too, eg a "next page" style URL)
for (int nPage = 0; nPage < nMaxPages; ++nPage) {
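// Stop if the per-cycle document budget is exhausted (flagging that on the source), or if a
// duplicate/limit check below has told us to stop paginating this URL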
if ((dedupSet.size() >= maxDocsPerCycle) || stopPaginating) {
if (dedupSet.size() >= maxDocsPerCycle) {
src.setReachedMaxDocs();
}
break;
}
// Will use this to check if we reached a page limit (eg some sites will just repeat the same page over and over again)
int nLinksFound = 0;
int nCurrDedupSetSize = dedupSet.size();
String url = savedUrl;
// Apply the regex to the URL for pagination, part 2
if ((null != pageChangeRegex) && (null != feedConfig.getSearchConfig().getPageChangeReplace())) {
int nResultStart = nPage*feedConfig.getSearchConfig().getNumResultsPerPage() + nResultOffset;
String replace = feedConfig.getSearchConfig().getPageChangeReplace().replace("$1", Integer.toString(nResultStart));
if (null == pageChangeRegexMatcher) {
url += replace;
}
else {
url = pageChangeRegexMatcher.replaceFirst(replace);
}
}//TESTED
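// url now points at the current page of results: savedUrl unchanged, savedUrl with the replacement
// appended (no regex match), or savedUrl with the matched section replaced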
//DEBUG
//System.out.println("URL=" + url);
// Create a custom UAH object to fetch and parse the search results
UnstructuredAnalysisConfigPojo dummyUAHconfig = new UnstructuredAnalysisConfigPojo();
if (null == feedConfig.getSearchConfig().getScriptflags()) { // Set flags if necessary
if (null == feedConfig.getSearchConfig().getExtraMeta()) {
feedConfig.getSearchConfig().setScriptflags("dt");
}
else {
feedConfig.getSearchConfig().setScriptflags("dtm");
}
}
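// Default script flags: "dt" normally, or "dtm" when extra metadata fields are configured
// (only applied when the source hasn't set its own flags)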
if (null != feedConfig.getSearchConfig().getExtraMeta()) {
dummyUAHconfig.CopyMeta(feedConfig.getSearchConfig().getExtraMeta());
// Legacy -> Pipeline port
for (metaField extraMeta: dummyUAHconfig.getMeta()) {
if (null == extraMeta.context) { // context is mandatory in legacy configs but discarded in pipeline configs, so default it here
extraMeta.context = Context.First;
}
}
}
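// (the extra meta fields now sit on the temporary UAH config, alongside the searchEngineSubsystem
// field added below)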
dummyUAHconfig.setScript(feedConfig.getSearchConfig().getGlobals());
dummyUAHconfig.AddMetaField("searchEngineSubsystem", Context.All, feedConfig.getSearchConfig().getScript(), "javascript", feedConfig.getSearchConfig().getScriptflags());
src.setUnstructuredAnalysisConfig(dummyUAHconfig);
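// The temporary config, carrying the search script, replaces the source's own UAH config for the
// search; savedUAHconfig above keeps the original, presumably so it can be restored afterwards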
if (null != searchConfig.getProxyOverride()) {
feedConfig.setProxyOverride(searchConfig.getProxyOverride());
}
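// (the search config's proxy and, presumably, user agent settings override the feed-level values
// saved earlier)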
if (null != searchConfig.getUserAgent()) {