// Single-run guard: atomically claim runningJob for this instance. If it
// already holds a value, log the job key and return without doing any work.
if (!runningJob.compareAndSet(null, this)) {
logger.info(context.getJobDetail().getKey() + " is running.");
return;
}
// NOTE(review): the statements from here up to the try block run after the
// guard has been taken but outside the try/finally that resets it (the only
// reset visible here is runningJob.set(null) in the finally block). If one
// of them throws (e.g. settings is null), the guard looks like it stays
// set -- confirm.
final RiverName riverName = (RiverName) data.get(RIVER_NAME);
// A fresh random UUID identifies this run; it is the key for all per-run
// state created and deleted below.
final String sessionId = UUID.randomUUID().toString();
final Client client = getClient(data);
// Variables handed to executeScript for the "execute" and "finish" calls.
final Map<String, Object> vars = new HashMap<String, Object>();
vars.put("riverName", riverName);
vars.put("sessionId", sessionId);
vars.put("client", client);
// Assigned inside the try block; still null if the run returns or fails
// before that point, which is why the finally block null-checks it.
RiverConfig riverConfig = null;
final RiverSettings settings = (RiverSettings) data.get(SETTINGS);
final Map<String, Object> rootSettings = settings.settings();
try {
// Run the "execute" script hook with the root settings and the variables
// prepared before the try block.
executeScript(scriptSerivce, rootSettings, vars, "execute");
// Everything below is driven by the "crawl" section of the root settings.
// Returning from inside the try block still runs the finally block.
@SuppressWarnings("unchecked")
final Map<String, Object> crawlSettings = (Map<String, Object>) rootSettings
.get("crawl");
if (crawlSettings == null) {
logger.warn("No settings for crawling.");
return;
}
// "target" is the list of per-target maps consumed when the scraping rules
// are registered further down; a missing or empty list ends the run.
@SuppressWarnings("unchecked")
final List<Map<String, Object>> targetList = (List<Map<String, Object>>) crawlSettings
.get("target");
if (targetList == null || targetList.isEmpty()) {
logger.warn("No targets for crawling.");
return;
}
// Obtain the robot for this run and bind it to the session id.
s2Robot = SingletonS2Container.getComponent(S2Robot.class);
s2Robot.setSessionId(sessionId);

// HttpClient parameters: the map is handed to the client factory first and
// filled in afterwards, exactly in that order.
final Map<String, Object> paramMap = new HashMap<String, Object>();
s2Robot.getClientFactory().setInitParameterMap(paramMap);

// User agent: only set when the configured (or default) value is not blank.
final String configuredAgent = SettingsUtils.get(crawlSettings,
        "userAgent", DEFAULT_USER_AGENT);
if (StringUtil.isNotBlank(configuredAgent)) {
    paramMap.put(HcHttpClient.USER_AGENT_PROPERTY, configuredAgent);
}

// robots.txt handling: always set, Boolean.TRUE is the default argument.
final Boolean honourRobotsTxt = SettingsUtils.get(crawlSettings,
        "robotsTxt", Boolean.TRUE);
paramMap.put(HcHttpClient.ROBOTS_TXT_ENABLED_PROPERTY, honourRobotsTxt);

// Proxy: applied only when a "proxy" map with a non-null "host" exists.
// The port is used as given when it is an Integer, otherwise 8080.
final Map<String, Object> proxySettings = SettingsUtils.get(crawlSettings,
        "proxy", null);
final Object proxyHost = proxySettings == null ? null : proxySettings
        .get("host");
if (proxyHost != null) {
    paramMap.put(HcHttpClient.PROXY_HOST_PROPERTY, proxyHost);
    final Object proxyPort = proxySettings.get("port");
    paramMap.put(HcHttpClient.PROXY_PORT_PROPERTY,
            proxyPort instanceof Integer ? proxyPort : Integer
                    .valueOf(8080));
}
// authentications
// Expected settings shape:
// "authentications":[{"scope":{"scheme":"","host":"","port":0,"realm":""},
// "credentials":{"username":"","password":""}},{...}]
// Every usable entry becomes one Authentication. Entries with a blank
// scheme, a blank username, or a scheme other than BASIC/DIGEST/NTLM are
// logged and skipped.
final List<Map<String, Object>> authList = SettingsUtils.get(
        crawlSettings, "authentications", null);
if (authList != null && !authList.isEmpty()) {
    final List<Authentication> basicAuthList = new ArrayList<Authentication>();
    for (final Map<String, Object> authObj : authList) {
        @SuppressWarnings("unchecked")
        final Map<String, Object> scopeMap = (Map<String, Object>) authObj
                .get("scope");
        // The scheme is upper-cased once so the comparisons below do not
        // depend on how it was written in the settings.
        String scheme = SettingsUtils.get(scopeMap, "scheme",
                EMPTY_STRING).toUpperCase(Locale.ENGLISH);
        if (StringUtil.isBlank(scheme)) {
            logger.warn("Invalid authentication: " + authObj);
            continue;
        }
        @SuppressWarnings("unchecked")
        final Map<String, Object> credentialMap = (Map<String, Object>) authObj
                .get("credentials");
        final String username = SettingsUtils.get(credentialMap,
                "username", null);
        if (StringUtil.isBlank(username)) {
            logger.warn("Invalid authentication: " + authObj);
            continue;
        }
        // host/port/realm belong to the "scope" object in the format shown
        // above, so they are read from scopeMap. The top level of the entry
        // is still consulted as a fallback so that settings written against
        // the previous lookup location keep working.
        final String legacyHost = SettingsUtils.get(authObj, "host",
                AuthScope.ANY_HOST);
        final String host = SettingsUtils.get(scopeMap, "host",
                legacyHost);
        final int legacyPort = SettingsUtils.get(authObj, "port",
                AuthScope.ANY_PORT);
        final int port = SettingsUtils.get(scopeMap, "port", legacyPort);
        final String legacyRealm = SettingsUtils.get(authObj, "realm",
                AuthScope.ANY_REALM);
        final String realm = SettingsUtils.get(scopeMap, "realm",
                legacyRealm);
        final String password = SettingsUtils.get(credentialMap,
                "password", null);
        final AuthScheme authScheme;
        final Credentials credentials;
        if (BASIC_SCHEME.equalsIgnoreCase(scheme)) {
            authScheme = new BasicScheme();
            credentials = new UsernamePasswordCredentials(username,
                    password);
        } else if (DIGEST_SCHEME.equalsIgnoreCase(scheme)) {
            authScheme = new DigestScheme();
            credentials = new UsernamePasswordCredentials(username,
                    password);
        } else if (NTLM_SCHEME.equalsIgnoreCase(scheme)) {
            authScheme = new NTLMScheme(new JcifsEngine());
            // NTLM entries are registered for any scheme in the AuthScope.
            scheme = AuthScope.ANY_SCHEME;
            final String workstation = SettingsUtils.get(credentialMap,
                    "workstation", null);
            final String domain = SettingsUtils.get(credentialMap,
                    "domain", null);
            credentials = new NTCredentials(username, password,
                    workstation == null ? EMPTY_STRING : workstation,
                    domain == null ? EMPTY_STRING : domain);
        } else {
            // No scheme/credentials pair can be built for this entry; skip
            // it instead of registering an Authentication whose
            // credentials and scheme are both null.
            logger.warn("Invalid authentication: " + authObj);
            continue;
        }
        final AuthenticationImpl auth = new AuthenticationImpl(
                new AuthScope(host, port, realm, scheme), credentials,
                authScheme);
        basicAuthList.add(auth);
    }
    paramMap.put(HcHttpClient.BASIC_AUTHENTICATIONS_PROPERTY,
            basicAuthList.toArray(new Authentication[basicAuthList
                    .size()]));
}
// request header
// "headers":[{"name":"","value":""},{}]
// Entries missing either "name" or "value" are ignored. The header array is
// registered whenever the configured list is non-empty, even if every entry
// was ignored.
final List<Map<String, Object>> headerSettings = SettingsUtils.get(
        crawlSettings, "headers", null);
final boolean hasHeaderSettings = headerSettings != null
        && !headerSettings.isEmpty();
if (hasHeaderSettings) {
    final List<RequestHeader> collected = new ArrayList<RequestHeader>();
    for (final Map<String, Object> entry : headerSettings) {
        final String headerName = SettingsUtils.get(entry, "name", null);
        final String headerValue = SettingsUtils.get(entry, "value",
                null);
        if (headerName == null || headerValue == null) {
            continue;
        }
        collected.add(new RequestHeader(headerName, headerValue));
    }
    final RequestHeader[] headerArray = collected
            .toArray(new RequestHeader[collected.size()]);
    paramMap.put(HcHttpClient.REQUERT_HEADERS_PROPERTY, headerArray);
}
// Seed URLs: at least one is required, otherwise the run ends here (the
// finally block still runs).
@SuppressWarnings("unchecked")
final List<String> seedUrls = (List<String>) crawlSettings.get("url");
final boolean noSeedUrls = seedUrls == null || seedUrls.isEmpty();
if (noSeedUrls) {
    logger.warn("No url for crawling.");
    return;
}
for (final String seedUrl : seedUrls) {
    s2Robot.addUrl(seedUrl);
}

// Include patterns: optional; each entry is registered as an include filter.
@SuppressWarnings("unchecked")
final List<String> includePatterns = (List<String>) crawlSettings
        .get("includeFilter");
if (includePatterns != null) {
    for (final String includePattern : includePatterns) {
        s2Robot.addIncludeFilter(includePattern);
    }
}

// Exclude patterns: optional; each entry is registered as an exclude filter.
@SuppressWarnings("unchecked")
final List<String> excludePatterns = (List<String>) crawlSettings
        .get("excludeFilter");
if (excludePatterns != null) {
    for (final String excludePattern : excludePatterns) {
        s2Robot.addExcludeFilter(excludePattern);
    }
}
// Per-run crawl limits are applied to the robot's context. In each lookup
// the third argument to SettingsUtils.get is the default passed for the key.
final S2RobotContext robotContext = s2Robot.getRobotContext();
// max depth ("maxDepth", default argument -1)
final int maxDepth = SettingsUtils.get(crawlSettings,
"maxDepth", -1);
robotContext.setMaxDepth(maxDepth);
// max access count ("maxAccessCount", default argument 100)
final int maxAccessCount = SettingsUtils.get(crawlSettings,
"maxAccessCount", 100);
robotContext.setMaxAccessCount(maxAccessCount);
// number of crawler threads ("numOfThread", default argument 5)
final int numOfThread = SettingsUtils.get(crawlSettings,
"numOfThread", 5);
robotContext.setNumOfThread(numOfThread);
// interval ("interval", default argument 1000), applied through
// setDelayMillisForWaitingNewUrl on the interval controller.
// NOTE(review): the default is an int literal while the target is a long;
// a configured value that arrives as a Long may not convert cleanly here --
// confirm how SettingsUtils.get casts the stored value.
final long interval = SettingsUtils.get(crawlSettings,
"interval", 1000);
// The cast assumes the robot is wired with a WebRiverIntervalController --
// TODO confirm against the container configuration.
final WebRiverIntervalController intervalController = (WebRiverIntervalController) s2Robot
.getIntervalController();
intervalController.setDelayMillisForWaitingNewUrl(interval);
// river params
// Per-session parameters handed to riverConfig: "index" (default argument
// "web"), "type" (default argument: riverName.getName()), and the
// "overwrite" / "incremental" flags (default argument Boolean.FALSE each).
final Map<String, Object> riverParamMap = new HashMap<String, Object>();
riverParamMap.put("index",
SettingsUtils.get(crawlSettings, "index", "web"));
riverParamMap.put(
"type",
SettingsUtils.get(crawlSettings, "type",
riverName.getName()));
riverParamMap.put("overwrite", SettingsUtils.get(crawlSettings,
"overwrite", Boolean.FALSE));
riverParamMap.put("incremental", SettingsUtils.get(
crawlSettings, "incremental", Boolean.FALSE));
// crawl config
// From this assignment on riverConfig is non-null, so the finally block
// will call riverConfig.cleanup(sessionId) for this session.
riverConfig = SingletonS2Container
.getComponent(RiverConfig.class);
riverConfig.createLock(sessionId);
riverConfig.addRiverParams(sessionId, riverParamMap);
// Register one scraping rule per target. A target needs both "pattern" and
// "properties"; otherwise it is logged and skipped.
for (final Map<String, Object> targetMap : targetList) {
@SuppressWarnings("unchecked")
final Map<String, Object> patternMap = (Map<String, Object>) targetMap
.get("pattern");
@SuppressWarnings("unchecked")
final Map<String, Map<String, Object>> propMap = (Map<String, Map<String, Object>>) targetMap
.get("properties");
if (patternMap != null && propMap != null) {
if (logger.isDebugEnabled()) {
logger.debug("patternMap: " + patternMap);
logger.debug("propMap: " + propMap);
}
// "settings" is not null-checked here; whatever the target holds
// (possibly null) is passed through to addScrapingRule.
@SuppressWarnings("unchecked")
final Map<String, Object> settingMap = (Map<String, Object>) targetMap
.get("settings");
riverConfig.addScrapingRule(sessionId, settingMap,
patternMap, propMap);
} else {
logger.warn("Invalid pattern or target: patternMap: "
+ patternMap + ", propMap: " + propMap);
}
}
// run s2robot
// NOTE(review): stop() is only reached when execute() returns normally; if
// execute() throws, control goes straight to the finally block -- confirm
// that skipping stop() is acceptable in that case.
s2Robot.execute();
s2Robot.stop();
} finally {
try {
    // Run the "finish" script hook. A failure still propagates to the
    // caller, but only after the guard release and cleanup below have run.
    executeScript(scriptSerivce, rootSettings, vars, "finish");
} finally {
    // Always release the single-run guard; otherwise one failing finish
    // script would leave runningJob set and every later run would be
    // rejected as "is running".
    runningJob.set(null);
    // Each cleanup step is guarded on its own so that one failing step
    // neither prevents the remaining steps nor replaces an exception that
    // is already propagating from the crawl itself.
    if (riverConfig != null) {
        try {
            riverConfig.cleanup(sessionId);
        } catch (final Exception e) {
            logger.warn("Failed to clean up river config: " + sessionId, e);
        }
    }
    // clean up
    // NOTE(review): s2Robot.cleanup(sessionId) was commented out in the
    // original in favour of the explicit per-service deletes below --
    // confirm whether it should be restored or dropped.
    try {
        SingletonS2Container.getComponent(EsUrlQueueService.class)
                .delete(sessionId);
    } catch (final Exception e) {
        logger.warn("Failed to delete ", e);
    }
    try {
        SingletonS2Container.getComponent(EsDataService.class).delete(
                sessionId);
    } catch (final Exception e) {
        logger.warn("Failed to delete ", e);
    }
    try {
        SingletonS2Container.getComponent(EsUrlFilterService.class)
                .delete(sessionId);
    } catch (final Exception e) {
        logger.warn("Failed to delete ", e);
    }
}
// One-time rivers remove their own mapping from the "_river" index once the
// run is over. This needs both the ONE_TIME marker and a client.
final Object oneTime = data.get(ONE_TIME);
if (oneTime != null && client != null) {
    final DeleteMappingResponse response = client.admin().indices()
            .prepareDeleteMapping("_river").setType(riverName.name())
            .execute().actionGet();
    if (!response.isAcknowledged()) {
        logger.warn("Failed to delete " + riverName.name()
                + ". Resposne: " + response.toString());
    } else {
        logger.info("Deleted one time river: " + riverName.name());
    }
}