Package org.codelibs.elasticsearch.web.config

Examples of org.codelibs.elasticsearch.web.config.RiverConfig


            final Map<String, Object> vars = new HashMap<String, Object>();
            vars.put("riverName", riverName);
            vars.put("sessionId", sessionId);
            vars.put("client", client);

            RiverConfig riverConfig = null;
            final RiverSettings settings = (RiverSettings) data.get(SETTINGS);
            final Map<String, Object> rootSettings = settings.settings();
            try {

                executeScript(scriptSerivce, rootSettings, vars, "execute");

                @SuppressWarnings("unchecked")
                final Map<String, Object> crawlSettings = (Map<String, Object>) rootSettings
                        .get("crawl");
                if (crawlSettings == null) {
                    logger.warn("No settings for crawling.");
                    return;
                }

                @SuppressWarnings("unchecked")
                final List<Map<String, Object>> targetList = (List<Map<String, Object>>) crawlSettings
                        .get("target");
                if (targetList == null || targetList.isEmpty()) {
                    logger.warn("No targets for crawling.");
                    return;
                }

                s2Robot = SingletonS2Container.getComponent(S2Robot.class);
                s2Robot.setSessionId(sessionId);

                // HttpClient Parameters
                final Map<String, Object> paramMap = new HashMap<String, Object>();
                s2Robot.getClientFactory().setInitParameterMap(paramMap);

                // user agent
                final String userAgent = SettingsUtils.get(crawlSettings,
                        "userAgent", DEFAULT_USER_AGENT);
                if (StringUtil.isNotBlank(userAgent)) {
                    paramMap.put(HcHttpClient.USER_AGENT_PROPERTY, userAgent);
                }

                // robots.txt parser
                final Boolean robotsTxtEnabled = SettingsUtils.get(
                        crawlSettings, "robotsTxt", Boolean.TRUE);
                paramMap.put(HcHttpClient.ROBOTS_TXT_ENABLED_PROPERTY,
                        robotsTxtEnabled);

                // proxy
                final Map<String, Object> proxyMap = SettingsUtils.get(
                        crawlSettings, "proxy", null);
                if (proxyMap != null) {
                    final Object host = proxyMap.get("host");
                    if (host != null) {
                        paramMap.put(HcHttpClient.PROXY_HOST_PROPERTY, host);
                        final Object portObj = proxyMap.get("port");
                        if (portObj instanceof Integer) {
                            paramMap.put(HcHttpClient.PROXY_PORT_PROPERTY,
                                    portObj);
                        } else {
                            paramMap.put(HcHttpClient.PROXY_PORT_PROPERTY,
                                    Integer.valueOf(8080));
                        }
                    }
                }

                // authentications
                // "authentications":[{"scope":{"scheme":"","host":"","port":0,"realm":""},
                //   "credentials":{"username":"","password":""}},{...}]
                final List<Map<String, Object>> authList = SettingsUtils.get(
                        crawlSettings, "authentications", null);
                if (authList != null && !authList.isEmpty()) {
                    final List<Authentication> basicAuthList = new ArrayList<Authentication>();
                    for (final Map<String, Object> authObj : authList) {
                        @SuppressWarnings("unchecked")
                        final Map<String, Object> scopeMap = (Map<String, Object>) authObj
                                .get("scope");
                        String scheme = SettingsUtils.get(scopeMap, "scheme",
                                EMPTY_STRING).toUpperCase(Locale.ENGLISH);
                        if (StringUtil.isBlank(scheme)) {
                            logger.warn("Invalid authentication: " + authObj);
                            continue;
                        }
                        @SuppressWarnings("unchecked")
                        final Map<String, Object> credentialMap = (Map<String, Object>) authObj
                                .get("credentials");
                        final String username = SettingsUtils.get(
                                credentialMap, "username", null);
                        if (StringUtil.isBlank(username)) {
                            logger.warn("Invalid authentication: " + authObj);
                            continue;
                        }
                        final String host = SettingsUtils.get(authObj, "host",
                                AuthScope.ANY_HOST);
                        final int port = SettingsUtils.get(authObj, "port",
                                AuthScope.ANY_PORT);
                        final String realm = SettingsUtils.get(authObj,
                                "realm", AuthScope.ANY_REALM);
                        final String password = SettingsUtils.get(
                                credentialMap, "password", null);

                        AuthScheme authScheme = null;
                        Credentials credentials = null;
                        if (BASIC_SCHEME.equalsIgnoreCase(scheme)) {
                            authScheme = new BasicScheme();
                            credentials = new UsernamePasswordCredentials(
                                    username, password);
                        } else if (DIGEST_SCHEME.equals(scheme)) {
                            authScheme = new DigestScheme();
                            credentials = new UsernamePasswordCredentials(
                                    username, password);
                        } else if (NTLM_SCHEME.equals(scheme)) {
                            authScheme = new NTLMScheme(new JcifsEngine());
                            scheme = AuthScope.ANY_SCHEME;
                            final String workstation = SettingsUtils.get(
                                    credentialMap, "workstation", null);
                            final String domain = SettingsUtils.get(
                                    credentialMap, "domain", null);
                            credentials = new NTCredentials(username, password,
                                    workstation == null ? EMPTY_STRING
                                            : workstation,
                                    domain == null ? EMPTY_STRING : domain);
                        }

                        final AuthenticationImpl auth = new AuthenticationImpl(
                                new AuthScope(host, port, realm, scheme),
                                credentials, authScheme);
                        basicAuthList.add(auth);
                    }
                    paramMap.put(HcHttpClient.BASIC_AUTHENTICATIONS_PROPERTY,
                            basicAuthList
                                    .toArray(new Authentication[basicAuthList
                                            .size()]));
                }

                // request header
                // "headers":[{"name":"","value":""},{}]
                final List<Map<String, Object>> headerList = SettingsUtils.get(
                        crawlSettings, "headers", null);
                if (headerList != null && !headerList.isEmpty()) {
                    final List<RequestHeader> requestHeaderList = new ArrayList<RequestHeader>();
                    for (final Map<String, Object> headerObj : headerList) {
                        final String name = SettingsUtils.get(headerObj,
                                "name", null);
                        final String value = SettingsUtils.get(headerObj,
                                "value", null);
                        if (name != null && value != null) {
                            requestHeaderList
                                    .add(new RequestHeader(name, value));
                        }
                    }
                    paramMap.put(
                            HcHttpClient.REQUERT_HEADERS_PROPERTY,
                            requestHeaderList
                                    .toArray(new RequestHeader[requestHeaderList
                                            .size()]));
                }

                // url
                @SuppressWarnings("unchecked")
                final List<String> urlList = (List<String>) crawlSettings
                        .get("url");
                if (urlList == null || urlList.isEmpty()) {
                    logger.warn("No url for crawling.");
                    return;
                }
                for (final String url : urlList) {
                    s2Robot.addUrl(url);
                }
                // include regex
                @SuppressWarnings("unchecked")
                final List<String> includeFilterList = (List<String>) crawlSettings
                        .get("includeFilter");
                if (includeFilterList != null) {
                    for (final String regex : includeFilterList) {
                        s2Robot.addIncludeFilter(regex);
                    }
                }
                // exclude regex
                @SuppressWarnings("unchecked")
                final List<String> excludeFilterList = (List<String>) crawlSettings
                        .get("excludeFilter");
                if (excludeFilterList != null) {
                    for (final String regex : excludeFilterList) {
                        s2Robot.addExcludeFilter(regex);
                    }
                }

                final S2RobotContext robotContext = s2Robot.getRobotContext();

                // max depth
                final int maxDepth = SettingsUtils.get(crawlSettings,
                        "maxDepth", -1);

                robotContext.setMaxDepth(maxDepth);
                // max access count
                final int maxAccessCount = SettingsUtils.get(crawlSettings,
                        "maxAccessCount", 100);
                robotContext.setMaxAccessCount(maxAccessCount);
                // num of thread
                final int numOfThread = SettingsUtils.get(crawlSettings,
                        "numOfThread", 5);
                robotContext.setNumOfThread(numOfThread);
                // interval
                final long interval = SettingsUtils.get(crawlSettings,
                        "interval", 1000);
                final WebRiverIntervalController intervalController = (WebRiverIntervalController) s2Robot
                        .getIntervalController();
                intervalController.setDelayMillisForWaitingNewUrl(interval);

                // river params
                final Map<String, Object> riverParamMap = new HashMap<String, Object>();
                riverParamMap.put("index",
                        SettingsUtils.get(crawlSettings, "index", "web"));
                riverParamMap.put(
                        "type",
                        SettingsUtils.get(crawlSettings, "type",
                                riverName.getName()));
                riverParamMap.put("overwrite", SettingsUtils.get(crawlSettings,
                        "overwrite", Boolean.FALSE));
                riverParamMap.put("incremental", SettingsUtils.get(
                        crawlSettings, "incremental", Boolean.FALSE));

                // crawl config
                riverConfig = SingletonS2Container
                        .getComponent(RiverConfig.class);
                riverConfig.createLock(sessionId);
                riverConfig.addRiverParams(sessionId, riverParamMap);
                for (final Map<String, Object> targetMap : targetList) {
                    @SuppressWarnings("unchecked")
                    final Map<String, Object> patternMap = (Map<String, Object>) targetMap
                            .get("pattern");
                    @SuppressWarnings("unchecked")
                    final Map<String, Map<String, Object>> propMap = (Map<String, Map<String, Object>>) targetMap
                            .get("properties");
                    if (patternMap != null && propMap != null) {
                        if (logger.isDebugEnabled()) {
                            logger.debug("patternMap: " + patternMap);
                            logger.debug("propMap: " + propMap);
                        }
                        @SuppressWarnings("unchecked")
                        final Map<String, Object> settingMap = (Map<String, Object>) targetMap
                                .get("settings");
                        riverConfig.addScrapingRule(sessionId, settingMap,
                                patternMap, propMap);
                    } else {
                        logger.warn("Invalid pattern or target: patternMap: "
                                + patternMap + ", propMap: " + propMap);
                    }
                }

                // run s2robot
                s2Robot.execute();

                s2Robot.stop();

            } finally {
                executeScript(scriptSerivce, rootSettings, vars, "finish");

                runningJob.set(null);
                if (riverConfig != null) {
                    riverConfig.cleanup(sessionId);
                }
                // clean up
                // s2Robot.cleanup(sessionId);
                try {
                    SingletonS2Container.getComponent(EsUrlQueueService.class)
View Full Code Here


    @Override
    protected void doStart() throws ElasticsearchException {
        logger.info("Starting S2Container...");

        final RiverConfig riverConfig = SingletonS2Container
                .getComponent(RiverConfig.class);
        riverConfig.setClient(client);
        riverConfig.setScriptService(scriptService);
    }
View Full Code Here

import org.seasar.framework.util.ResourceUtil;

public class ScrapingTransformerTest {
    @Test
    public void fess_codelibs_org() {
        RiverConfig riverConfig = new RiverConfig();
        ScrapingTransformer transformer = new ScrapingTransformer() {
            @SuppressWarnings("unchecked")
            @Override
            protected void storeIndex(ResponseData responseData,
                    Map<String, Object> dataMap) {
                System.out.println(dataMap);
                assertThat(
                        ((List<String>) ((Map<String, Object>) dataMap.get("nav"))
                                .get("sideMenus")).size(), is(27));
                assertThat(
                        ((Map<String, Object>) dataMap.get("section1")).get(
                                "title").toString(), is("What is Fess?"));
                assertThat(
                        ((List<String>) ((Map<String, Object>) dataMap.get("section1"))
                                .get("body")).size(), is(2));
                assertThat(
                        ((Map<String, Object>) dataMap.get("section2")).get(
                                "title").toString(), is("Features"));
                assertThat(
                        ((List<String>) ((Map<String, Object>) dataMap.get("section2"))
                                .get("body")).size(), is(12));
            }
        };
        transformer.riverConfig = riverConfig;

        String sessionId = "test";
        String url = "http://fess.codelibs.org/";

        Map<String, Map<String, Object>> scrapingRuleMap = new HashMap<String, Map<String, Object>>();
        addScrapingRuleMap(scrapingRuleMap, "text", "nav.sideMenus",
                "div.sidebar-nav ul li", Boolean.TRUE, Boolean.TRUE);
        addScrapingRuleMap(scrapingRuleMap, "text", "section1.title",
                "div.section:eq(0) h2", null, null);
        addScrapingRuleMap(scrapingRuleMap, "text", "section1.body",
                "div.section:eq(0) p", Boolean.TRUE, Boolean.TRUE);
        addScrapingRuleMap(scrapingRuleMap, "text", "section2.title",
                "div.section:eq(1) h2", null, null);
        addScrapingRuleMap(scrapingRuleMap, "text", "section2.body",
                "div.section:eq(1) ul li", Boolean.TRUE, Boolean.TRUE);
        Map<String, Object> patternMap = new HashMap<String, Object>();
        patternMap.put("url", url);
        riverConfig.addScrapingRule(sessionId,null, patternMap, scrapingRuleMap);
        InputStream is = null;
        try {
            is = ResourceUtil
                    .getResourceAsStream("html/fess_codelibs_org.html");
            ResponseData responseData = new ResponseData();
View Full Code Here

TOP

Related Classes of org.codelibs.elasticsearch.web.config.RiverConfig

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.