/**
 * Runs the search described by a {@link WaybackRequest} against the index
 * exposed by {@code source} and returns the matching records.
 *
 * <p>The request URL is converted to an index key with {@code canonicalizer},
 * a filter chain is assembled according to the request type, and the
 * records produced by {@code source.getPrefixIterator} are passed through
 * that chain. Three families of request type are handled:
 * <ul>
 * <li>replay / closest: captures of the exact URL key inside the requested
 * date range, at most {@code resultsPerPage} of them;</li>
 * <li>URL query: captures of the exact URL key, paged with
 * {@code pageNum} / {@code resultsPerPage};</li>
 * <li>URL prefix query: URL results for keys starting with the requested
 * key, inside the requested date range, paged the same way.</li>
 * </ul>
 * Before returning, the effective query parameters and the result counts
 * are recorded on the returned object via {@code putFilter}.
 *
 * @param wbRequest the request carrying the URL, query type, optional
 *        dates, paging values and optional exclusion filter
 * @return the populated results; a {@code CaptureSearchResults} for replay,
 *         closest and URL queries, a {@code UrlSearchResults} for prefix
 *         queries
 * @throws ResourceIndexNotAvailableException if reading the index raises an
 *         {@link IOException}
 * @throws ResourceNotInArchiveException if no record reached the final
 *         counter and the exclusion filter was not the reason
 * @throws BadQueryException if the paging values are out of range, the URL
 *         cannot be converted to a key, or the query type is not one of
 *         the supported types (presumably also when a required request
 *         field is missing; see {@code getRequired})
 * @throws AccessControlException if an exclusion filter is present, at
 *         least one record reached it, and none reached the final counter
 */
public SearchResults query(WaybackRequest wbRequest)
        throws ResourceIndexNotAvailableException,
        ResourceNotInArchiveException, BadQueryException,
        AccessControlException {

    // first grab all the info from the WaybackRequest, and validate it:
    int resultsPerPage = wbRequest.getResultsPerPage();
    int pageNum = wbRequest.getPageNum();

    if (resultsPerPage < 1) {
        throw new BadQueryException("resultsPerPage cannot be < 1");
    }
    if (resultsPerPage > maxRecords) {
        throw new BadQueryException("resultsPerPage cannot be > "
                + maxRecords);
    }
    if (pageNum < 1) {
        throw new BadQueryException("pageNum must be > 0");
    }

    // Offset of the first wanted result: hits/page * (pagenum - 1).
    // Computed only after validation, and in long arithmetic, so that a
    // huge page number cannot silently wrap around to a bogus int offset.
    long firstWanted = (long) (pageNum - 1) * resultsPerPage;
    if (firstWanted > Integer.MAX_VALUE) {
        throw new BadQueryException("pageNum " + pageNum
                + " is too large");
    }
    int startResult = (int) firstWanted;

    String searchUrl = getRequired(wbRequest, WaybackConstants.REQUEST_URL);
    String searchType = getRequired(wbRequest,
            WaybackConstants.REQUEST_TYPE);

    // the three dates fall back to the widest possible values:
    String startDate = getRequired(wbRequest,
            WaybackConstants.REQUEST_START_DATE,
            Timestamp.earliestTimestamp().getDateStr());
    String endDate = getRequired(wbRequest,
            WaybackConstants.REQUEST_END_DATE,
            Timestamp.latestTimestamp().getDateStr());
    String exactDate = getRequired(wbRequest,
            WaybackConstants.REQUEST_EXACT_DATE,
            Timestamp.latestTimestamp().getDateStr());

    // "purified" URL request, in the form used as the index key:
    String keyUrl;
    try {
        keyUrl = canonicalizer.urlStringToKey(searchUrl);
    } catch (URIException e) {
        throw new BadQueryException("invalid "
                + WaybackConstants.REQUEST_URL + " " + searchUrl);
    }

    // set up the common Filters:

    // makes sure we don't inspect too many records: prevents DOS
    GuardRailFilter guardrail = new GuardRailFilter(maxRecords);

    // checks an exclusion service for every matching record
    ObjectFilter<SearchResult> exclusion = wbRequest.getExclusionFilter();

    // count how many results got to the ExclusionFilter:
    CounterFilter preExCounter = new CounterFilter();

    // count how many results got past the ExclusionFilter, or how
    // many total matched, if there was no ExclusionFilter:
    CounterFilter finalCounter = new CounterFilter();

    // has the user asked for only results on the exact host specified?
    HostMatchFilter hostMatchFilter = getExactHostFilter(wbRequest);

    // Each branch below fills in these three; the scan itself is shared.
    // The order in which filters are added to the chain is significant.
    SearchResults results;
    String startKey; // actual key where search will begin
    ObjectFilterChain<SearchResult> filters;

    if (searchType.equals(WaybackConstants.REQUEST_REPLAY_QUERY)
            || searchType.equals(WaybackConstants.REQUEST_CLOSEST_QUERY)) {

        results = new CaptureSearchResults();
        filters = new ObjectFilterChain<SearchResult>();

        filters.addFilter(guardrail);
        filters.addFilter(new DuplicateRecordFilter());

        // match URL key:
        filters.addFilter(new UrlMatchFilter(keyUrl));
        if (hostMatchFilter != null) {
            filters.addFilter(hostMatchFilter);
        }

        // be sure to only include records within the date range we want:
        // the scan may start at a record older than we want, and the
        // EndDateFilter below only guards the end of the range, so
        // without this filter a record before the requested range could
        // be included.
        DateRangeFilter drFilter = new DateRangeFilter(startDate, endDate);
        filters.addFilter(drFilter);

        // abort processing if we hit a date outside the search range:
        filters.addFilter(new EndDateFilter(endDate));

        // for replay, do not include records that redirect to
        // themselves.. We'll leave this for both closest and replays,
        // because the only application of closest at the moment is
        // timeline in which case, we don't want to show captures that
        // redirect to themselves in the timeline if they are not viewable.
        SelfRedirectFilter selfRedirectFilter = new SelfRedirectFilter();
        selfRedirectFilter.setCanonicalizer(canonicalizer);
        filters.addFilter(selfRedirectFilter);

        // possibly filter via exclusions:
        if (exclusion != null) {
            filters.addFilter(preExCounter);
            filters.addFilter(exclusion);
        }
        filters.addFilter(finalCounter);

        // NOTE(review): no WindowStartFilter in this branch, so pageNum
        // does not shift the window here even though startResult is still
        // reported below - confirm this is intended.
        filters.addFilter(new WindowEndFilter(resultsPerPage));

        startKey = keyUrl;

    } else if (searchType.equals(WaybackConstants.REQUEST_URL_QUERY)) {

        results = new CaptureSearchResults();
        filters = new ObjectFilterChain<SearchResult>();

        filters.addFilter(guardrail);
        filters.addFilter(new DuplicateRecordFilter());
        filters.addFilter(new UrlMatchFilter(keyUrl));
        if (hostMatchFilter != null) {
            filters.addFilter(hostMatchFilter);
        }

        // NOTE(review): only the end date is enforced in this branch;
        // startDate is reported on the results but not applied as a
        // filter - confirm this is intended.
        filters.addFilter(new EndDateFilter(endDate));

        // possibly filter via exclusions:
        if (exclusion != null) {
            filters.addFilter(preExCounter);
            filters.addFilter(exclusion);
        }
        filters.addFilter(finalCounter);

        // OPTIMIZ: beginning the search at the startDate causes problems
        // with deduplicated results. We need to be smarter about rolling
        // backwards a ways if we start on a deduped record.
        // (i.e. the key would be: keyUrl + " " + startDate)
        startKey = keyUrl + " ";

        // add the start and end windowing filters:
        filters.addFilter(new WindowStartFilter(startResult));
        filters.addFilter(new WindowEndFilter(resultsPerPage));

    } else if (searchType.equals(WaybackConstants.REQUEST_URL_PREFIX_QUERY)) {

        results = new UrlSearchResults();
        filters = new ObjectFilterChain<SearchResult>();

        filters.addFilter(guardrail);
        filters.addFilter(new DuplicateRecordFilter());
        filters.addFilter(new UrlPrefixMatchFilter(keyUrl));
        if (hostMatchFilter != null) {
            filters.addFilter(hostMatchFilter);
        }
        filters.addFilter(new DateRangeFilter(startDate, endDate));

        // possibly filter via exclusions:
        if (exclusion != null) {
            filters.addFilter(preExCounter);
            filters.addFilter(exclusion);
        }
        filters.addFilter(new CaptureToUrlResultFilter());
        filters.addFilter(finalCounter);

        startKey = keyUrl;

        // add the start and end windowing filters:
        filters.addFilter(new WindowStartFilter(startResult));
        filters.addFilter(new WindowEndFilter(resultsPerPage));

    } else {
        throw new BadQueryException("Unknown query type(" + searchType
                + "), must be " + WaybackConstants.REQUEST_REPLAY_QUERY
                + ", " + WaybackConstants.REQUEST_CLOSEST_QUERY + ", "
                + WaybackConstants.REQUEST_URL_QUERY + ", or "
                + WaybackConstants.REQUEST_URL_PREFIX_QUERY);
    }

    // run the (forward-only) scan through the chain built above:
    try {
        CloseableIterator<SearchResult> forward =
                source.getPrefixIterator(startKey);
        filterRecords(forward, filters, results, true);
    } catch (IOException e) {
        throw new ResourceIndexNotAvailableException(
                e.getLocalizedMessage());
    }

    int matched = finalCounter.getNumMatched();
    if (matched == 0) {
        // distinguish "everything was excluded" from "nothing was there":
        if (exclusion != null && preExCounter.getNumMatched() > 0) {
            throw new AccessControlException("All results Excluded");
        }
        throw new ResourceNotInArchiveException("the URL " + keyUrl
                + " is not in the archive.");
    }

    // now we need to set some filter properties on the results:
    results.putFilter(WaybackConstants.REQUEST_URL, keyUrl);
    results.putFilter(WaybackConstants.REQUEST_TYPE, searchType);
    results.putFilter(WaybackConstants.REQUEST_START_DATE, startDate);
    results.putFilter(WaybackConstants.REQUEST_EXACT_DATE, exactDate);
    results.putFilter(WaybackConstants.REQUEST_END_DATE, endDate);

    // window info
    results.putFilter(WaybackConstants.RESULTS_FIRST_RETURNED,
            String.valueOf(startResult));
    results.putFilter(WaybackConstants.RESULTS_REQUESTED,
            String.valueOf(resultsPerPage));

    // how many reached the final counter; it sits before the window
    // filters in the chain, so this may include records outside the
    // returned window:
    results.putFilter(WaybackConstants.RESULTS_NUM_RESULTS,
            String.valueOf(matched));

    // how many are actually in the results:
    results.putFilter(WaybackConstants.RESULTS_NUM_RETURNED,
            String.valueOf(results.getResultCount()));

    return results;
}