Source Code of bixo.examples.webmining.DemoWebMiningTool

/*
* Copyright 2009-2013 Scale Unlimited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package bixo.examples.webmining;

import java.io.IOException;
import java.util.HashSet;
import java.util.Set;

import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.html.HtmlParser;
import org.kohsuke.args4j.CmdLineException;
import org.kohsuke.args4j.CmdLineParser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import bixo.config.BixoPlatform;
import bixo.config.FetcherPolicy;
import bixo.config.FetcherPolicy.FetcherMode;
import bixo.config.UserAgent;
import bixo.utils.CrawlDirUtils;
import cascading.flow.Flow;

import com.scaleunlimited.cascading.BasePath;
import com.scaleunlimited.cascading.BasePlatform;

@SuppressWarnings("deprecation")
public class DemoWebMiningTool {

    private static final Logger LOGGER = LoggerFactory.getLogger(DemoWebMiningTool.class);

    private static void printUsageAndExit(CmdLineParser parser) {
        parser.printUsage(System.err);
        System.exit(-1);
    }

    static void setupWorkingDir(BasePlatform platform, BasePath workingDirPath, String seedUrlsFileName) throws Exception {

        // If a crawl DB exists from a previous run, delete all of the old loop
        // directories so that we always start from scratch.
        BasePath loopDirPath = CrawlDirUtils.findLatestLoopDir(platform, workingDirPath);
        if (loopDirPath != null) {
            LOGGER.info("Deleting existing working dir");
            while (loopDirPath != null) {
                loopDirPath.delete(true);
                loopDirPath = CrawlDirUtils.findLatestLoopDir(platform, workingDirPath);
            }
        }

        // Create a "0-<timestamp>" loop sub-directory and import the seed urls.
        loopDirPath = CrawlDirUtils.makeLoopDir(platform, workingDirPath, 0);
        BasePath crawlDbPath = platform.makePath(loopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
        DemoWebMiningWorkflow.importSeedUrls(platform, crawlDbPath, seedUrlsFileName);
    }

    private static void error(String message, CmdLineParser parser) {
        System.err.println(message);
        printUsageAndExit(parser);
    }

    @SuppressWarnings("rawtypes")
    public static void main(String[] args) throws IOException {
       
        DemoWebMiningOptions options = new DemoWebMiningOptions();
        CmdLineParser parser = new CmdLineParser(options);

        try {
            parser.parseArgument(args);
        } catch (CmdLineException e) {
            System.err.println(e.getMessage());
            printUsageAndExit(parser);
        }

        // Build and run the flow.
       
        try {
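            // Build the execution platform (local vs. Hadoop, per the
            // command-line mode option) and resolve the working dir on it.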
            BixoPlatform platform = new BixoPlatform(DemoWebMiningTool.class, options.getPlatformMode());
            BasePath workingDirPath = platform.makePath(options.getWorkingDir());

            setupWorkingDir(platform, workingDirPath, CrawlConfig.SEED_URLS_FILENAME);
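            // setupWorkingDir() just created loop dir 0 with the imported seed
            // urls, so this lookup should succeed; the null check below guards
            // against a broken working directory.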
            BasePath latestDirPath = CrawlDirUtils.findLatestLoopDir(platform, workingDirPath);
            if (latestDirPath == null) {
                error("No previous cycle output dirs exist in " + workingDirPath, parser);
            }
           
            BasePath crawlDbPath = platform.makePath(latestDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
           
            UserAgent userAgent = new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS, CrawlConfig.WEB_ADDRESS);
           
            FetcherPolicy fetcherPolicy = new FetcherPolicy();
            fetcherPolicy.setCrawlDelay(CrawlConfig.DEFAULT_CRAWL_DELAY);
            fetcherPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
            fetcherPolicy.setFetcherMode(FetcherMode.EFFICIENT);
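            // EFFICIENT mode lets the fetcher skip URLs it can't get to in a
            // reasonable time, rather than stalling the whole fetch phase (see
            // FetcherPolicy.FetcherMode for the alternatives).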
           
            // Restrict fetching to the mime types that the Tika HTML parser can
            // handle, since those are the only ones we can process downstream.
            Set<String> validMimeTypes = new HashSet<String>();
            Set<MediaType> supportedTypes = new HtmlParser().getSupportedTypes(new ParseContext());
            for (MediaType supportedType : supportedTypes) {
                validMimeTypes.add(String.format("%s/%s", supportedType.getType(), supportedType.getSubtype()));
            }
            fetcherPolicy.setValidMimeTypes(validMimeTypes);
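            // For Tika's HtmlParser this typically yields text/html plus XHTML
            // variants such as application/xhtml+xml (the exact set depends on
            // the Tika version).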

            // Let's limit our crawl to two loops
            for (int curLoop = 1; curLoop <= 2; curLoop++) {
                BasePath curLoopDirPath = CrawlDirUtils.makeLoopDir(platform, workingDirPath, curLoop);
                Flow flow = DemoWebMiningWorkflow.createWebMiningWorkflow(platform, crawlDbPath, curLoopDirPath, fetcherPolicy, userAgent, options);
                flow.complete();
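                // complete() starts the Cascading flow and blocks until it
                // finishes, throwing if any step fails.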

                // Update crawlDbPath to point to the latest crawl db
                crawlDbPath = platform.makePath(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
            }
           
        } catch (Exception e) {
            System.err.println("Exception running job: " + e.getMessage());
            e.printStackTrace(System.err);
            System.exit(-1);
        }
    }

}
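
A minimal usage sketch, assuming the args4j flag names. The flags below (-workingdir, -agentname) are inferred from the option getters used in this class (getWorkingDir(), getAgentName()) and are not confirmed by the listing; check DemoWebMiningOptions for the actual @Option annotations.

package bixo.examples.webmining;

public class DemoWebMiningToolRunner {

    public static void main(String[] args) throws Exception {
        // Run the demo tool: two crawl/mine loops rooted at build/working.
        // NOTE: flag names are assumptions, not confirmed by the source above.
        DemoWebMiningTool.main(new String[] {
                "-workingdir", "build/working",
                "-agentname", "demo-web-mining-crawler"
        });
    }
}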