Package org.codelibs.elasticsearch.web.river

Source Code of org.codelibs.elasticsearch.web.river.WebRiver$CrawlJob

package org.codelibs.elasticsearch.web.river;

import static org.quartz.CronScheduleBuilder.cronSchedule;
import static org.quartz.JobBuilder.newJob;
import static org.quartz.JobKey.jobKey;
import static org.quartz.TriggerBuilder.newTrigger;

import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.UUID;
import java.util.concurrent.atomic.AtomicReference;

import org.apache.commons.lang3.time.DateUtils;
import org.apache.http.auth.AuthScheme;
import org.apache.http.auth.AuthScope;
import org.apache.http.auth.Credentials;
import org.apache.http.auth.NTCredentials;
import org.apache.http.auth.UsernamePasswordCredentials;
import org.apache.http.impl.auth.BasicScheme;
import org.apache.http.impl.auth.DigestScheme;
import org.apache.http.impl.auth.NTLMScheme;
import org.codelibs.elasticsearch.quartz.service.ScheduleService;
import org.codelibs.elasticsearch.util.lang.StringUtils;
import org.codelibs.elasticsearch.util.settings.SettingsUtils;
import org.codelibs.elasticsearch.web.WebRiverConstants;
import org.codelibs.elasticsearch.web.config.RiverConfig;
import org.codelibs.elasticsearch.web.robot.interval.WebRiverIntervalController;
import org.codelibs.elasticsearch.web.robot.service.EsDataService;
import org.codelibs.elasticsearch.web.robot.service.EsUrlFilterService;
import org.codelibs.elasticsearch.web.robot.service.EsUrlQueueService;
import org.codelibs.robot.S2Robot;
import org.codelibs.robot.S2RobotContext;
import org.codelibs.robot.client.http.Authentication;
import org.codelibs.robot.client.http.HcHttpClient;
import org.codelibs.robot.client.http.RequestHeader;
import org.codelibs.robot.client.http.impl.AuthenticationImpl;
import org.codelibs.robot.client.http.ntlm.JcifsEngine;
import org.elasticsearch.action.admin.indices.mapping.delete.DeleteMappingResponse;
import org.elasticsearch.client.Client;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.logging.ESLogger;
import org.elasticsearch.common.logging.Loggers;
import org.elasticsearch.river.AbstractRiverComponent;
import org.elasticsearch.river.River;
import org.elasticsearch.river.RiverName;
import org.elasticsearch.river.RiverSettings;
import org.elasticsearch.script.CompiledScript;
import org.elasticsearch.script.ExecutableScript;
import org.elasticsearch.script.ScriptService;
import org.elasticsearch.script.ScriptService.ScriptType;
import org.quartz.Job;
import org.quartz.JobDataMap;
import org.quartz.JobDetail;
import org.quartz.JobExecutionContext;
import org.quartz.JobExecutionException;
import org.quartz.Trigger;
import org.seasar.framework.container.SingletonS2Container;
import org.seasar.framework.container.factory.SingletonS2ContainerFactory;
import org.seasar.framework.util.StringUtil;

public class WebRiver extends AbstractRiverComponent implements River {

    private static final ESLogger logger = Loggers.getLogger(WebRiver.class);

    private static final String RIVER_NAME = "riverName";

    private static final String SETTINGS = "settings";

    private static final String RUNNING_JOB = "runningJob";

    private static final String SCRIPT_SERVICE = "scriptService";

    private static final String TRIGGER_ID_SUFFIX = "Trigger";

    private static final String JOB_ID_SUFFIX = "Job";

    private static final String ES_CLIENT = "esClient";

    private static final String DEFAULT_USER_AGENT = "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Elasticsearch River Web/"
            + WebRiverConstants.VERSION + ")";

    private static final String NTLM_SCHEME = "NTLM";

    private static final String DIGEST_SCHEME = "DIGEST";

    private static final String BASIC_SCHEME = "BASIC";

    private static final String ONE_TIME = "oneTime";

    private static final String EMPTY_STRING = "";

    private final Client client;

    private final ScheduleService scheduleService;

    private String groupId;

    private String id;

    private AtomicReference<CrawlJob> runningJob = new AtomicReference<CrawlJob>();

    private ScriptService scriptService;

    @Inject
    public WebRiver(final RiverName riverName, final RiverSettings settings,
            final Client client, final ScheduleService scheduleService,
            final ScriptService scriptService) {
        super(riverName, settings);
        this.client = client;
        this.scheduleService = scheduleService;
        this.scriptService = scriptService;

        groupId = riverName.type() == null ? "web" : riverName.type();
        id = riverName.name();

        logger.info("Creating WebRiver: " + id);
    }

    @Override
    public void start() {
        logger.info("Scheduling CrawlJob...");

        if (scheduleService == null) {
            logger.warn("Elasticsearch River Web plugin depends on Elasticsearch Quartz plugin, "
                    + "but it's not found. River Web plugin does not start.");
            return;
        }

        final JobDataMap jobDataMap = new JobDataMap();

        String cron = null;
        @SuppressWarnings("unchecked")
        final Map<String, Object> scheduleSettings = (Map<String, Object>) settings
                .settings().get("schedule");
        if (scheduleSettings != null) {
            cron = (String) scheduleSettings.get("cron");
        }

        if (cron == null) {
            Date now = new Date();
            now = DateUtils.addSeconds(now, 60);
            final SimpleDateFormat sdf = new SimpleDateFormat(
                    "s m H d M ? yyyy");
            cron = sdf.format(now);
            jobDataMap.put(ONE_TIME, Boolean.TRUE);
        }

        jobDataMap.put(RIVER_NAME, riverName);
        jobDataMap.put(SETTINGS, settings);
        jobDataMap.put(ES_CLIENT, client);
        jobDataMap.put(RUNNING_JOB, runningJob);
        jobDataMap.put(SCRIPT_SERVICE, scriptService);

        final Map<String, Object> vars = new HashMap<String, Object>();
        vars.put("riverName", riverName);
        vars.put("client", client);
        executeScript(scriptService, settings.settings(), vars, "start");

        final JobDetail crawlJob = newJob(CrawlJob.class)
                .withIdentity(id + JOB_ID_SUFFIX, groupId)
                .usingJobData(jobDataMap).build();

        final Trigger trigger = newTrigger()
                .withIdentity(id + TRIGGER_ID_SUFFIX, groupId)
                .withSchedule(cronSchedule(cron)).startNow().build();
        scheduleService.scheduleJob(crawlJob, trigger);
    }

    @Override
    public void close() {
        if (scheduleService == null) {
            return;
        }

        final Map<String, Object> vars = new HashMap<String, Object>();
        vars.put("riverName", riverName);
        vars.put("client", client);
        executeScript(scriptService, settings.settings(), vars, "close");

        logger.info("Unscheduling  CrawlJob...");

        final CrawlJob crawlJob = runningJob.get();
        if (crawlJob != null) {
            crawlJob.stop();
        }
        scheduleService.deleteJob(jobKey(id + JOB_ID_SUFFIX, groupId));
    }

    protected static void executeScript(final ScriptService scriptService,
            final Map<String, Object> settings, final Map<String, Object> vars,
            final String target) {
        final Map<String, Object> crawlSettings = SettingsUtils.get(settings,
                "crawl");
        final Map<String, Object> scriptSettings = SettingsUtils.get(
                crawlSettings, "script");
        final String script = SettingsUtils.get(scriptSettings, target);
        final String lang = SettingsUtils.get(scriptSettings, "lang", "groovy");
        final String scriptTypeValue = SettingsUtils.get(scriptSettings,
                "script_type", "inline");
        ScriptType scriptType;
        if (ScriptType.FILE.toString().equalsIgnoreCase(scriptTypeValue)) {
            scriptType = ScriptType.FILE;
        } else if (ScriptType.INDEXED.toString().equalsIgnoreCase(
                scriptTypeValue)) {
            scriptType = ScriptType.INDEXED;
        } else {
            scriptType = ScriptType.INLINE;
        }
        if (StringUtils.isNotBlank(script)) {
            final Map<String, Object> localVars = new HashMap<String, Object>(
                    vars);
            localVars.put("container",
                    SingletonS2ContainerFactory.getContainer());
            localVars.put("settings", settings);
            try {
                final CompiledScript compiledScript = scriptService.compile(
                        lang, script, scriptType);
                ExecutableScript executable = scriptService.executable(
                        compiledScript, localVars);
                Object result = executable.run();
                logger.info("[{}] \"{}\" => {}", target, script, result);
            } catch (final Exception e) {
                logger.warn("Failed to execute script: {}", e, script);
            }
        }
    }

    public static class CrawlJob implements Job {

        private S2Robot s2Robot;

        @Override
        public void execute(final JobExecutionContext context)
                throws JobExecutionException {

            final JobDataMap data = context.getMergedJobDataMap();
            @SuppressWarnings("unchecked")
            final AtomicReference<Job> runningJob = (AtomicReference<Job>) data
                    .get(RUNNING_JOB);
            final ScriptService scriptSerivce = (ScriptService) data
                    .get(SCRIPT_SERVICE);
            if (!runningJob.compareAndSet(null, this)) {
                logger.info(context.getJobDetail().getKey() + " is running.");
                return;
            }

            final RiverName riverName = (RiverName) data.get(RIVER_NAME);
            final String sessionId = UUID.randomUUID().toString();

            final Client client = getClient(data);
            final Map<String, Object> vars = new HashMap<String, Object>();
            vars.put("riverName", riverName);
            vars.put("sessionId", sessionId);
            vars.put("client", client);

            RiverConfig riverConfig = null;
            final RiverSettings settings = (RiverSettings) data.get(SETTINGS);
            final Map<String, Object> rootSettings = settings.settings();
            try {

                executeScript(scriptSerivce, rootSettings, vars, "execute");

                @SuppressWarnings("unchecked")
                final Map<String, Object> crawlSettings = (Map<String, Object>) rootSettings
                        .get("crawl");
                if (crawlSettings == null) {
                    logger.warn("No settings for crawling.");
                    return;
                }

                @SuppressWarnings("unchecked")
                final List<Map<String, Object>> targetList = (List<Map<String, Object>>) crawlSettings
                        .get("target");
                if (targetList == null || targetList.isEmpty()) {
                    logger.warn("No targets for crawling.");
                    return;
                }

                s2Robot = SingletonS2Container.getComponent(S2Robot.class);
                s2Robot.setSessionId(sessionId);

                // HttpClient Parameters
                final Map<String, Object> paramMap = new HashMap<String, Object>();
                s2Robot.getClientFactory().setInitParameterMap(paramMap);

                // user agent
                final String userAgent = SettingsUtils.get(crawlSettings,
                        "userAgent", DEFAULT_USER_AGENT);
                if (StringUtil.isNotBlank(userAgent)) {
                    paramMap.put(HcHttpClient.USER_AGENT_PROPERTY, userAgent);
                }

                // robots.txt parser
                final Boolean robotsTxtEnabled = SettingsUtils.get(
                        crawlSettings, "robotsTxt", Boolean.TRUE);
                paramMap.put(HcHttpClient.ROBOTS_TXT_ENABLED_PROPERTY,
                        robotsTxtEnabled);

                // proxy
                final Map<String, Object> proxyMap = SettingsUtils.get(
                        crawlSettings, "proxy", null);
                if (proxyMap != null) {
                    final Object host = proxyMap.get("host");
                    if (host != null) {
                        paramMap.put(HcHttpClient.PROXY_HOST_PROPERTY, host);
                        final Object portObj = proxyMap.get("port");
                        if (portObj instanceof Integer) {
                            paramMap.put(HcHttpClient.PROXY_PORT_PROPERTY,
                                    portObj);
                        } else {
                            paramMap.put(HcHttpClient.PROXY_PORT_PROPERTY,
                                    Integer.valueOf(8080));
                        }
                    }
                }

                // authentications
                // "authentications":[{"scope":{"scheme":"","host":"","port":0,"realm":""},
                //   "credentials":{"username":"","password":""}},{...}]
                final List<Map<String, Object>> authList = SettingsUtils.get(
                        crawlSettings, "authentications", null);
                if (authList != null && !authList.isEmpty()) {
                    final List<Authentication> basicAuthList = new ArrayList<Authentication>();
                    for (final Map<String, Object> authObj : authList) {
                        @SuppressWarnings("unchecked")
                        final Map<String, Object> scopeMap = (Map<String, Object>) authObj
                                .get("scope");
                        String scheme = SettingsUtils.get(scopeMap, "scheme",
                                EMPTY_STRING).toUpperCase(Locale.ENGLISH);
                        if (StringUtil.isBlank(scheme)) {
                            logger.warn("Invalid authentication: " + authObj);
                            continue;
                        }
                        @SuppressWarnings("unchecked")
                        final Map<String, Object> credentialMap = (Map<String, Object>) authObj
                                .get("credentials");
                        final String username = SettingsUtils.get(
                                credentialMap, "username", null);
                        if (StringUtil.isBlank(username)) {
                            logger.warn("Invalid authentication: " + authObj);
                            continue;
                        }
                        final String host = SettingsUtils.get(authObj, "host",
                                AuthScope.ANY_HOST);
                        final int port = SettingsUtils.get(authObj, "port",
                                AuthScope.ANY_PORT);
                        final String realm = SettingsUtils.get(authObj,
                                "realm", AuthScope.ANY_REALM);
                        final String password = SettingsUtils.get(
                                credentialMap, "password", null);

                        AuthScheme authScheme = null;
                        Credentials credentials = null;
                        if (BASIC_SCHEME.equalsIgnoreCase(scheme)) {
                            authScheme = new BasicScheme();
                            credentials = new UsernamePasswordCredentials(
                                    username, password);
                        } else if (DIGEST_SCHEME.equals(scheme)) {
                            authScheme = new DigestScheme();
                            credentials = new UsernamePasswordCredentials(
                                    username, password);
                        } else if (NTLM_SCHEME.equals(scheme)) {
                            authScheme = new NTLMScheme(new JcifsEngine());
                            scheme = AuthScope.ANY_SCHEME;
                            final String workstation = SettingsUtils.get(
                                    credentialMap, "workstation", null);
                            final String domain = SettingsUtils.get(
                                    credentialMap, "domain", null);
                            credentials = new NTCredentials(username, password,
                                    workstation == null ? EMPTY_STRING
                                            : workstation,
                                    domain == null ? EMPTY_STRING : domain);
                        }

                        final AuthenticationImpl auth = new AuthenticationImpl(
                                new AuthScope(host, port, realm, scheme),
                                credentials, authScheme);
                        basicAuthList.add(auth);
                    }
                    paramMap.put(HcHttpClient.BASIC_AUTHENTICATIONS_PROPERTY,
                            basicAuthList
                                    .toArray(new Authentication[basicAuthList
                                            .size()]));
                }

                // request header
                // "headers":[{"name":"","value":""},{}]
                final List<Map<String, Object>> headerList = SettingsUtils.get(
                        crawlSettings, "headers", null);
                if (headerList != null && !headerList.isEmpty()) {
                    final List<RequestHeader> requestHeaderList = new ArrayList<RequestHeader>();
                    for (final Map<String, Object> headerObj : headerList) {
                        final String name = SettingsUtils.get(headerObj,
                                "name", null);
                        final String value = SettingsUtils.get(headerObj,
                                "value", null);
                        if (name != null && value != null) {
                            requestHeaderList
                                    .add(new RequestHeader(name, value));
                        }
                    }
                    paramMap.put(
                            HcHttpClient.REQUERT_HEADERS_PROPERTY,
                            requestHeaderList
                                    .toArray(new RequestHeader[requestHeaderList
                                            .size()]));
                }

                // url
                @SuppressWarnings("unchecked")
                final List<String> urlList = (List<String>) crawlSettings
                        .get("url");
                if (urlList == null || urlList.isEmpty()) {
                    logger.warn("No url for crawling.");
                    return;
                }
                for (final String url : urlList) {
                    s2Robot.addUrl(url);
                }
                // include regex
                @SuppressWarnings("unchecked")
                final List<String> includeFilterList = (List<String>) crawlSettings
                        .get("includeFilter");
                if (includeFilterList != null) {
                    for (final String regex : includeFilterList) {
                        s2Robot.addIncludeFilter(regex);
                    }
                }
                // exclude regex
                @SuppressWarnings("unchecked")
                final List<String> excludeFilterList = (List<String>) crawlSettings
                        .get("excludeFilter");
                if (excludeFilterList != null) {
                    for (final String regex : excludeFilterList) {
                        s2Robot.addExcludeFilter(regex);
                    }
                }

                final S2RobotContext robotContext = s2Robot.getRobotContext();

                // max depth
                final int maxDepth = SettingsUtils.get(crawlSettings,
                        "maxDepth", -1);

                robotContext.setMaxDepth(maxDepth);
                // max access count
                final int maxAccessCount = SettingsUtils.get(crawlSettings,
                        "maxAccessCount", 100);
                robotContext.setMaxAccessCount(maxAccessCount);
                // num of thread
                final int numOfThread = SettingsUtils.get(crawlSettings,
                        "numOfThread", 5);
                robotContext.setNumOfThread(numOfThread);
                // interval
                final long interval = SettingsUtils.get(crawlSettings,
                        "interval", 1000);
                final WebRiverIntervalController intervalController = (WebRiverIntervalController) s2Robot
                        .getIntervalController();
                intervalController.setDelayMillisForWaitingNewUrl(interval);

                // river params
                final Map<String, Object> riverParamMap = new HashMap<String, Object>();
                riverParamMap.put("index",
                        SettingsUtils.get(crawlSettings, "index", "web"));
                riverParamMap.put(
                        "type",
                        SettingsUtils.get(crawlSettings, "type",
                                riverName.getName()));
                riverParamMap.put("overwrite", SettingsUtils.get(crawlSettings,
                        "overwrite", Boolean.FALSE));
                riverParamMap.put("incremental", SettingsUtils.get(
                        crawlSettings, "incremental", Boolean.FALSE));

                // crawl config
                riverConfig = SingletonS2Container
                        .getComponent(RiverConfig.class);
                riverConfig.createLock(sessionId);
                riverConfig.addRiverParams(sessionId, riverParamMap);
                for (final Map<String, Object> targetMap : targetList) {
                    @SuppressWarnings("unchecked")
                    final Map<String, Object> patternMap = (Map<String, Object>) targetMap
                            .get("pattern");
                    @SuppressWarnings("unchecked")
                    final Map<String, Map<String, Object>> propMap = (Map<String, Map<String, Object>>) targetMap
                            .get("properties");
                    if (patternMap != null && propMap != null) {
                        if (logger.isDebugEnabled()) {
                            logger.debug("patternMap: " + patternMap);
                            logger.debug("propMap: " + propMap);
                        }
                        @SuppressWarnings("unchecked")
                        final Map<String, Object> settingMap = (Map<String, Object>) targetMap
                                .get("settings");
                        riverConfig.addScrapingRule(sessionId, settingMap,
                                patternMap, propMap);
                    } else {
                        logger.warn("Invalid pattern or target: patternMap: "
                                + patternMap + ", propMap: " + propMap);
                    }
                }

                // run s2robot
                s2Robot.execute();

                s2Robot.stop();

            } finally {
                executeScript(scriptSerivce, rootSettings, vars, "finish");

                runningJob.set(null);
                if (riverConfig != null) {
                    riverConfig.cleanup(sessionId);
                }
                // clean up
                // s2Robot.cleanup(sessionId);
                try {
                    SingletonS2Container.getComponent(EsUrlQueueService.class)
                            .delete(sessionId);
                } catch (final Exception e) {
                    logger.warn("Failed to delete ", e);
                }
                SingletonS2Container.getComponent(EsDataService.class).delete(
                        sessionId);
                SingletonS2Container.getComponent(EsUrlFilterService.class)
                        .delete(sessionId);

                final Object oneTime = data.get(ONE_TIME);
                if (oneTime != null) {
                    if (client != null) {
                        final DeleteMappingResponse deleteMappingResponse = client
                                .admin().indices()
                                .prepareDeleteMapping("_river")
                                .setType(riverName.name()).execute()
                                .actionGet();
                        if (deleteMappingResponse.isAcknowledged()) {
                            logger.info("Deleted one time river: "
                                    + riverName.name());
                        } else {
                            logger.warn("Failed to delete " + riverName.name()
                                    + ". Resposne: "
                                    + deleteMappingResponse.toString());
                        }
                    }
                }
            }
        }

        private Client getClient(final JobDataMap data) {
            final Object clientObj = data.get(ES_CLIENT);
            if (clientObj instanceof Client) {
                return (Client) clientObj;
            }
            return null;
        }

        public void stop() {
            if (s2Robot != null) {
                s2Robot.stop();
            }
        }

    }

}
TOP

Related Classes of org.codelibs.elasticsearch.web.river.WebRiver$CrawlJob

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.