JSONObject parentUrlMetadata = jo.getJSONObject("parentUrlMetadata");
String parentHopPath = parentUrlMetadata.getString("pathFromSeed");
String hopPath = parentHopPath + Hop.INFERRED.getHopString();
CrawlURI curi = new CrawlURI(uuri, hopPath, via, LinkContext.INFERRED_MISC);
// set the heritable data from the parent url, passed back to us via amqp
// XXX brittle: only goes one level deep and only expects strings and arrays.
// A JSONArray value is converted to a Set<String>; any other value is stored unchanged.
// Example: 'heritableData': {'source': 'https://facebook.com/whitehouse/', 'heritable': ['source', 'heritable']}
JSONObject heritableData = parentUrlMetadata.getJSONObject("heritableData");
for (String key: (Set<String>) heritableData.keySet()) {
Object value = heritableData.get(key);
if (value instanceof JSONArray) {
Set<String> valueSet = new HashSet<String>();
JSONArray arr = ((JSONArray) value);
for (int i = 0; i < arr.length(); i++) {
valueSet.add(arr.getString(i));
}
curi.getData().put(key, valueSet);
} else {
curi.getData().put(key, heritableData.get(key));
}
}
// set the http headers from the amqp message
Map<String, String> customHttpRequestHeaders = new HashMap<String, String>();
for (Object key : joHeaders.keySet()) {
customHttpRequestHeaders.put(key.toString(),
joHeaders.getString(key.toString()));
}
curi.getData().put("customHttpRequestHeaders", customHttpRequestHeaders);
/* Use HighestUriQueuePrecedencePolicy to ensure these high priority
* urls really get crawled ahead of others.
* See https://webarchive.jira.com/wiki/display/Heritrix/Precedence+Feature+Notes
*/
curi.setSchedulingDirective(SchedulingConstants.HIGH);
curi.setPrecedence(1);
//curi.setForceFetch(true);
curi.getAnnotations().add(A_RECEIVED_FROM_AMQP);
return curi;
}