Package bixo.robots

Examples of bixo.robots.SimpleRobotRulesParser


              if (!robotsUrl.getPath().toLowerCase().endsWith("/robots.txt")) {
                  robotsUrl = new URL(robotsUrl, "/robots.txt");
              }
             
              System.out.println("Processing " + robotsUrl.toExternalForm());
              BaseRobotRules rules = RobotUtils.getRobotRules(fetcher, new SimpleRobotRulesParser(), robotsUrl);
                System.out.println(String.format("Deferred visits = %s, allow all = %s, allow none = %s, top-level allowed = %s",
                                rules.isDeferVisits(),
                                rules.isAllowAll(),
                                rules.isAllowNone(),
                                rules.isAllowed(UrlUtils.makeProtocolAndDomain(url))));
View Full Code Here


        Tap in = makeInputData(platform, "testHeadersInStatus", 1, 1);

        Pipe pipe = new Pipe("urlSource");
        BaseFetcher fetcher = new FakeHttpFetcher(false, 1);
        BaseScoreGenerator scorer = new FixedScoreGenerator();
        BaseRobotsParser parser = new SimpleRobotRulesParser();
        BaseFetchJobPolicy fetchJobPolicy = new DefaultFetchJobPolicy();
        FetchPipe fetchPipe = new FetchPipe(pipe, scorer, fetcher, fetcher, parser, fetchJobPolicy, 1);
       
       
        BasePath outputPath = makeOutputPath(platform, "testHeadersInStatus");
View Full Code Here

        Tap in = makeInputData(platform, "testPayloads", 1, 1, payload);

        Pipe pipe = new Pipe("urlSource");
        BaseFetcher fetcher = new FakeHttpFetcher(false, 10);
        BaseScoreGenerator scorer = new FixedScoreGenerator();
        BaseRobotsParser parser = new SimpleRobotRulesParser();
        BaseFetchJobPolicy fetchJobPolicy = new DefaultFetchJobPolicy();
        FetchPipe fetchPipe = new FetchPipe(pipe, scorer, fetcher, fetcher, parser, fetchJobPolicy, 1);
       
        String output = "build/test/FetchPipeTest/dual";
        BasePath outputPath = platform.makePath(output);
View Full Code Here

        Tap in = makeInputData(platform, "testSkippingURLsByScore", 2, 2);

        Pipe pipe = new Pipe("urlSource");
        BaseFetcher fetcher = new FakeHttpFetcher(false, 1);
        BaseScoreGenerator scorer = new SkippedScoreGenerator();
        BaseRobotsParser parser = new SimpleRobotRulesParser();
        BaseFetchJobPolicy fetchJobPolicy = new DefaultFetchJobPolicy();
        FetchPipe fetchPipe = new FetchPipe(pipe, scorer, fetcher, fetcher, parser, fetchJobPolicy, 1);
       
        BasePath outputPath = makeOutputPath(platform, "testSkippingURLsByScore");
        BasePath contentPath = platform.makePath(outputPath, "content");
View Full Code Here

        // This will force all URLs to get skipped because of the crawl end time limit.
        FetcherPolicy defaultPolicy = new FetcherPolicy();
        defaultPolicy.setCrawlEndTime(0);
        BaseFetcher fetcher = new FakeHttpFetcher(false, 1, defaultPolicy);
        BaseScoreGenerator scorer = new FixedScoreGenerator();
        BaseRobotsParser parser = new SimpleRobotRulesParser();
        BaseFetchJobPolicy fetchJobPolicy = new DefaultFetchJobPolicy(defaultPolicy);
        FetchPipe fetchPipe = new FetchPipe(pipe, scorer, fetcher, fetcher, parser, fetchJobPolicy, 1);

        // Create the output
        BasePath outputPath = makeOutputPath(platform, "testDurationLimitSimple");
View Full Code Here

        // This will limit us to one URL.
        final int maxUrls = 1;
        FetcherPolicy defaultPolicy = new FetcherPolicy();
        BaseFetcher fetcher = new FakeHttpFetcher(false, 1, defaultPolicy);
        BaseScoreGenerator scorer = new FixedScoreGenerator();
        BaseRobotsParser parser = new SimpleRobotRulesParser();
        BaseFetchJobPolicy fetchJobPolicy = new DefaultFetchJobPolicy(defaultPolicy.getMaxRequestsPerConnection(), maxUrls, BaseFetchJobPolicy.DEFAULT_CRAWL_DELAY);
        FetchPipe fetchPipe = new FetchPipe(pipe, scorer, fetcher, fetcher, parser, fetchJobPolicy, 1);

        // Create the output
        BasePath outputPath = makeOutputPath(platform, "testMaxUrlsPerServer");
View Full Code Here

    public void testUsingAllThreads() throws Exception {
        final int maxThreads = 10;
       
        SimpleHttpFetcher fetcher = new SimpleHttpFetcher(maxThreads, ConfigUtils.BIXO_TEST_AGENT);
        BaseScoreGenerator scorer = new FixedScoreGenerator(1.0);
        BaseRobotsParser parser = new SimpleRobotRulesParser();
        FilterAndScoreByUrlAndRobots op = new FilterAndScoreByUrlAndRobots(fetcher, parser, scorer);
       
        FlowProcess fp = Mockito.mock(NullFlowProcess.class);
       
        OperationCall<NullContext> oc = Mockito.mock(OperationCall.class);
View Full Code Here

    public void testBlockedRobots() throws Exception {
        final int maxThreads = 1;
       
        SimpleHttpFetcher fetcher = new SimpleHttpFetcher(maxThreads, ConfigUtils.BIXO_TEST_AGENT);
        BaseScoreGenerator scorer = new FixedScoreGenerator(1.0);
        BaseRobotsParser parser = new SimpleRobotRulesParser();
        FilterAndScoreByUrlAndRobots op = new FilterAndScoreByUrlAndRobots(fetcher, parser, scorer);
       
        FlowProcess fp = Mockito.mock(NullFlowProcess.class);
       
        OperationCall<NullContext> oc = Mockito.mock(OperationCall.class);
View Full Code Here

     * @param numReducers
     */
   
    public FetchPipe(Pipe urlProvider, BaseScoreGenerator scorer, BaseFetcher fetcher, int numReducers) {
        // Convenience overload: delegates to the seven-argument constructor and
        // supplies the three arguments that this signature does not take.
        this(urlProvider, scorer, fetcher, RobotUtils.createFetcher(fetcher),
                        // A second fetcher is derived from the caller's fetcher (argument above);
                        // presumably it is the one used for robots.txt requests - TODO confirm.
                        // The robots parser is always a new SimpleRobotRulesParser.
                        new SimpleRobotRulesParser(),
                        // The fetch job policy is built from the caller's fetcher policy.
                        new DefaultFetchJobPolicy(fetcher.getFetcherPolicy()),
                        numReducers);
    }
View Full Code Here

TOP

Related Classes of bixo.robots.SimpleRobotRulesParser

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.