Package bixo.operations

Examples of bixo.operations.BaseScoreGenerator


       
        // This will force all URLs to get skipped because of the crawl end time limit.
        FetcherPolicy defaultPolicy = new FetcherPolicy();
        defaultPolicy.setCrawlEndTime(0);
        BaseFetcher fetcher = new FakeHttpFetcher(false, 1, defaultPolicy);
        BaseScoreGenerator scorer = new FixedScoreGenerator();
        BaseRobotsParser parser = new SimpleRobotRulesParser();
        BaseFetchJobPolicy fetchJobPolicy = new DefaultFetchJobPolicy(defaultPolicy);
        FetchPipe fetchPipe = new FetchPipe(pipe, scorer, fetcher, fetcher, parser, fetchJobPolicy, 1);

        // Create the output
View Full Code Here


       
        // This will limit us to one URL.
        final int maxUrls = 1;
        FetcherPolicy defaultPolicy = new FetcherPolicy();
        BaseFetcher fetcher = new FakeHttpFetcher(false, 1, defaultPolicy);
        BaseScoreGenerator scorer = new FixedScoreGenerator();
        BaseRobotsParser parser = new SimpleRobotRulesParser();
        BaseFetchJobPolicy fetchJobPolicy = new DefaultFetchJobPolicy(defaultPolicy.getMaxRequestsPerConnection(), maxUrls, BaseFetchJobPolicy.DEFAULT_CRAWL_DELAY);
        FetchPipe fetchPipe = new FetchPipe(pipe, scorer, fetcher, fetcher, parser, fetchJobPolicy, 1);

        // Create the output
View Full Code Here

            BasePath inputPath = platform.makePath(path.getFile());
            Tap in = platform.makeTap(platform.makeTextScheme(), inputPath);

            Pipe importPipe = new Each("url importer", new Fields("line"), new CreateUrlFunction());

            BaseScoreGenerator scorer = new FixedScoreGenerator();
            BaseFetcher fetcher = new FakeHttpFetcher(true, 10);
            FetchPipe fetchPipe = new FetchPipe(importPipe, scorer, fetcher, 1);

            // Create the output, which is a dual file sink tap.
            String output = "build/test/RunFakeFetchPipe/dual";
View Full Code Here

        // TODO add unit test
        urlsToFetchPipe = new GroupBy(urlsToFetchPipe, new Fields(CrawlDbDatum.LINKS_SCORE_FIELD), true);
        long maxToFetch = isLocal ? MAX_LOCAL_FETCH : MAX_DISTRIBUTED_FETCH;
        urlsToFetchPipe = new Each(urlsToFetchPipe, new CreateUrlDatumFromCrawlDbDatum(maxToFetch));

        BaseScoreGenerator scorer = new LinkScoreGenerator();

        // Create the sub-assembly that runs the fetch job
        int maxThreads = isLocal ? CrawlConfig.DEFAULT_NUM_THREADS_LOCAL :  CrawlConfig.DEFAULT_NUM_THREADS_CLUSTER;
        SimpleHttpFetcher fetcher = new SimpleHttpFetcher(maxThreads, fetcherPolicy, userAgent);
        fetcher.setMaxRetryCount(CrawlConfig.MAX_RETRIES);
View Full Code Here

TOP

Related Classes of bixo.operations.BaseScoreGenerator

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.