Package bixo.operations

Examples of bixo.operations.BaseScoreGenerator


        UserAgent userAgent = new FirefoxUserAgent();
        FetcherPolicy fetcherPolicy = new FetcherPolicy();
        fetcherPolicy.setMaxRequestsPerConnection(1);
        fetcherPolicy.setCrawlDelay(5 * 1000L);
        BaseFetcher fetcher = new SimpleHttpFetcher(2, fetcherPolicy, userAgent);
        BaseScoreGenerator scorer = new FixedScoreGenerator();
        FetchPipe fetchPipe = new FetchPipe(pipe, scorer, fetcher, 1);

        FlowConnector flowConnector = platform.makeFlowConnector();
        Flow flow = flowConnector.connect(in, FetchPipe.makeSinkMap(status, content), fetchPipe);
        flow.complete();
View Full Code Here


        fetcherPolicy.setValidMimeTypes(validMimeTypes);

        // The scorer is used by the FetchPipe to assign a score to every URL that passes the
        // robots.txt processing. The score is used to sort URLs such that higher scoring URLs
        // are fetched first. If URLs are skipped for any reason(s) lower scoring URLs are skipped.
        BaseScoreGenerator scorer = new FixedScoreGenerator();

        FetchPipe fetchPipe = new FetchPipe(urlsToFetchPipe, scorer, fetcher, platform.getNumReduceTasks());
        Pipe statusPipe = new Pipe("status pipe", fetchPipe.getStatusTailPipe());
        Pipe contentPipe = new Pipe("content pipe", fetchPipe.getContentTailPipe());
        contentPipe = TupleLogger.makePipe(contentPipe, true);
View Full Code Here

       
        Pipe pipe = new Pipe("urlSource");

        UserAgent userAgent = new FirefoxUserAgent();
        BaseFetcher fetcher = new SimpleHttpFetcher(10, userAgent);
        BaseScoreGenerator scorer = new FixedScoreGenerator();
        FetchPipe fetchPipe = new FetchPipe(pipe, scorer, fetcher, 1);

        FlowConnector flowConnector = platform.makeFlowConnector();

        Flow flow = flowConnector.connect(in, FetchPipe.makeSinkMap(status, content), fetchPipe);
View Full Code Here

           
            // Create the sub-assembly that runs the fetch job
            UserAgent userAgent = new UserAgent(options.getAgentName(), EMAIL_ADDRESS, WEB_ADDRESS);
            Pipe importPipe = new Each("url importer", new Fields("line"), new LoadUrlFunction());
           
            BaseScoreGenerator scorer = new FixedScoreGenerator();
           
            BaseFetcher fetcher = new SimpleHttpFetcher(MAX_THREADS, userAgent);
            FetchPipe fetchPagePipe = new FetchPipe(importPipe, scorer, fetcher, NUM_REDUCERS);
           
            // Here's the pipe that will output UrlDatum tuples, by extracting URLs from the mod_mbox-generated page.
View Full Code Here

    protected void testHeadersInStatus(BasePlatform platform) throws Exception {
        Tap in = makeInputData(platform, "testHeadersInStatus", 1, 1);

        Pipe pipe = new Pipe("urlSource");
        BaseFetcher fetcher = new FakeHttpFetcher(false, 1);
        BaseScoreGenerator scorer = new FixedScoreGenerator();
        BaseRobotsParser parser = new SimpleRobotRulesParser();
        BaseFetchJobPolicy fetchJobPolicy = new DefaultFetchJobPolicy();
        FetchPipe fetchPipe = new FetchPipe(pipe, scorer, fetcher, fetcher, parser, fetchJobPolicy, 1);
       
       
View Full Code Here

        final int port = 8089;
       
        Tap in = makeInputData(platform, "testFetchPipe", "localhost:" + port, numPages, new Payload());

        Pipe pipe = new Pipe("urlSource");
        BaseScoreGenerator scorer = new FixedScoreGenerator();
        BaseFetcher fetcher = new SimpleHttpFetcher(ConfigUtils.BIXO_TEST_AGENT);
        FetchPipe fetchPipe = new FetchPipe(pipe, scorer, fetcher, 1);
       
        String output = "build/test/FetchPipeTest/testFetchPipe";
        BasePath outputPath = platform.makePath(output);
View Full Code Here

        Payload payload = new Payload();
        payload.put("payload-field-1", 1);
        Tap in = makeInputData(platform, "testRedirectException", "localhost:" + port, numPages, payload);

        Pipe pipe = new Pipe("urlSource");
        BaseScoreGenerator scorer = new FixedScoreGenerator();
        FetcherPolicy policy = new FetcherPolicy();
        policy.setRedirectMode(RedirectMode.FOLLOW_TEMP);
        BaseFetcher fetcher = new SimpleHttpFetcher(1, policy, ConfigUtils.BIXO_TEST_AGENT);
        FetchPipe fetchPipe = new FetchPipe(pipe, scorer, fetcher, 1);
       
View Full Code Here

        final int port = 8089;
       
        Tap in = makeInputData(platform, "testTerminatingFetchPipe", "localhost:" + port, numPages, null);

        Pipe pipe = new Pipe("urlSource");
        BaseScoreGenerator scorer = new FixedScoreGenerator();
       
        FetcherPolicy policy = new FetcherPolicy();
        policy.setCrawlEndTime(System.currentTimeMillis() + 50000);
        // Assume we should only need 10ms for fetching all 10 URLs.
        policy.setRequestTimeout(10);
View Full Code Here

        payload.put("key", "value");
        Tap in = makeInputData(platform, "testPayloads", 1, 1, payload);

        Pipe pipe = new Pipe("urlSource");
        BaseFetcher fetcher = new FakeHttpFetcher(false, 10);
        BaseScoreGenerator scorer = new FixedScoreGenerator();
        BaseRobotsParser parser = new SimpleRobotRulesParser();
        BaseFetchJobPolicy fetchJobPolicy = new DefaultFetchJobPolicy();
        FetchPipe fetchPipe = new FetchPipe(pipe, scorer, fetcher, fetcher, parser, fetchJobPolicy, 1);
       
        String output = "build/test/FetchPipeTest/dual";
View Full Code Here

        // Create four pages, for domain0/page0, domain0/page1, domain1/page0, domain1/page1
        Tap in = makeInputData(platform, "testSkippingURLsByScore", 2, 2);

        Pipe pipe = new Pipe("urlSource");
        BaseFetcher fetcher = new FakeHttpFetcher(false, 1);
        BaseScoreGenerator scorer = new SkippedScoreGenerator();
        BaseRobotsParser parser = new SimpleRobotRulesParser();
        BaseFetchJobPolicy fetchJobPolicy = new DefaultFetchJobPolicy();
        FetchPipe fetchPipe = new FetchPipe(pipe, scorer, fetcher, fetcher, parser, fetchJobPolicy, 1);
       
        BasePath outputPath = makeOutputPath(platform, "testSkippingURLsByScore");
View Full Code Here

TOP

Related Classes of bixo.operations.BaseScoreGenerator

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.