Package cascading.tap

Examples of cascading.tap.Tap


            // Now we should have an output/1-<timestamp>/ directory, where the
            // /urls dir has 11 entries: one that was previously crawled, and
            // the other 10 that are still pending.

            Tap crawldbTap = platform.makeTap(platform.makeBinaryScheme(CrawlDbDatum.FIELDS), crawlDbPath);
            TupleEntryIterator iter = crawldbTap.openForRead(platform.makeFlowProcess());

            int numFetched = 0;
            int numPending = 0;
            while (iter.hasNext()) {
                CrawlDbDatum datum = new CrawlDbDatum(iter.next());
                UrlStatus status = datum.getLastStatus();
                int crawlDepth = datum.getCrawlDepth();
                if (datum.getLastFetched() != 0) {
                    numFetched += 1;

                    assertEquals(UrlStatus.FETCHED, status);
                    assertEquals(0, crawlDepth);
                } else {
                    numPending += 1;
                    assertEquals(UrlStatus.UNFETCHED, status);
                    assertEquals(1, crawlDepth);
                }
            }

            assertEquals(1, numFetched);
            assertEquals(10, numPending);

            // Do it one more time, to verify status gets propagated forward.
            curLoopDirPath = CrawlDirUtils.makeLoopDir(platform, baseDirPath, 2);

            flow = DemoCrawlWorkflow.createFlow(curLoopDirPath, crawlDbPath, defaultPolicy, userAgent, urlFilter, options);
            flow.complete();
            // Update crawldb path
            crawlDbPath = platform.makePath(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

            crawldbTap = platform.makeTap(platform.makeBinaryScheme(CrawlDbDatum.FIELDS), crawlDbPath);
            iter = crawldbTap.openForRead(platform.makeFlowProcess());

            numFetched = 0;
            numPending = 0;
            int numDepth0 = 0;
            int numDepth1 = 0;
View Full Code Here


            pipe = new Each(pipe, new Debug());

            BixoPlatform platform = new BixoPlatform(LengthenUrlsTool.class, Platform.Local);
            BasePath filePath = platform.makePath(filename);
            TextLine textLineLocalScheme = new TextLine(new Fields("url"));
            Tap sourceTap = platform.makeTap(textLineLocalScheme, filePath, SinkMode.KEEP);
            SinkTap sinkTap = new NullSinkTap(new Fields("url"));
           
            FlowConnector flowConnector = platform.makeFlowConnector();
            Flow flow = flowConnector.connect(sourceTap, sinkTap, pipe);
View Full Code Here

        return server;
    }

    @SuppressWarnings({ "unchecked", "rawtypes" })
    private void validateEntryCount(BasePlatform platform, BasePath dataPath, Fields fields, int expected, String msgStr, boolean isTextLine) throws Exception {
        Tap sourceTap;
       
        if (isTextLine) {
            sourceTap = platform.makeTap(platform.makeTextScheme(), dataPath);
        } else {
            sourceTap = platform.makeTap(platform.makeBinaryScheme(fields), dataPath);
        }
       
        TupleEntryIterator tupleEntryIterator = sourceTap.openForRead(platform.makeFlowProcess());
        int numEntries = 0;
        while (tupleEntryIterator.hasNext()) {
          tupleEntryIterator.next();
          numEntries++;
        }
View Full Code Here

    @SuppressWarnings({ "unchecked", "rawtypes" })
    private boolean validatePageScores(BasePlatform platform, BasePath  dataPath) throws Exception {
        boolean allOK = false;
        int verifiedCnt = 0;
        Tap sourceTap = platform.makeTap(platform.makeTextScheme(), dataPath);
        TupleEntryIterator tupleEntryIterator = sourceTap.openForRead(platform.makeFlowProcess());
        while (tupleEntryIterator.hasNext()) {
          TupleEntry next = tupleEntryIterator.next();
          String line = next.getString("line");
          String[] split = line.split("\t");
          if (split[0].equals(PAGE1_URL)) {
View Full Code Here

            }
        }
    }

    protected void testHeadersInStatus(BasePlatform platform) throws Exception {
        Tap in = makeInputData(platform, "testHeadersInStatus", 1, 1);

        Pipe pipe = new Pipe("urlSource");
        BaseFetcher fetcher = new FakeHttpFetcher(false, 1);
        BaseScoreGenerator scorer = new FixedScoreGenerator();
        BaseRobotsParser parser = new SimpleRobotRulesParser();
        BaseFetchJobPolicy fetchJobPolicy = new DefaultFetchJobPolicy();
        FetchPipe fetchPipe = new FetchPipe(pipe, scorer, fetcher, fetcher, parser, fetchJobPolicy, 1);
       
       
        BasePath outputPath = makeOutputPath(platform, "testHeadersInStatus");
        BasePath statusPath = platform.makePath(outputPath, "status");
        Tap status = platform.makeTap(platform.makeBinaryScheme(StatusDatum.FIELDS), statusPath, SinkMode.REPLACE);
       
        // Finally we can run it.
        FlowDef flowDef = new FlowDef();
        flowDef.setName("testHeadersInStatus");
        flowDef.addSource(pipe, in);
        flowDef.addTailSink(fetchPipe.getStatusTailPipe(), status);
        flowDef.addTailSink(fetchPipe.getContentTailPipe(), new NullSinkTap());
       
        FlowConnector flowConnector = platform.makeFlowConnector();
        Flow flow = flowConnector.connect(flowDef);
        flow.writeDOT("build/test/FetchPipeLRTest/testHeadersInStatus/flow.dot");
        flow.complete();
       
        Tap validate = platform.makeTap(platform.makeBinaryScheme(StatusDatum.FIELDS), statusPath);
        TupleEntryIterator tupleEntryIterator = validate.openForRead(platform.makeFlowProcess());
        Assert.assertTrue(tupleEntryIterator.hasNext());
        StatusDatum sd = new StatusDatum(tupleEntryIterator.next());
        Assert.assertEquals(UrlStatus.FETCHED, sd.getStatus());
        HttpHeaders headers = sd.getHeaders();
        Assert.assertNotNull(headers);
View Full Code Here

    protected void testFetchPipe(BixoPlatform platform) throws Exception {
        // System.setProperty("bixo.root.level", "TRACE");
        final int numPages = 10;
        final int port = 8089;
       
        Tap in = makeInputData(platform, "testFetchPipe", "localhost:" + port, numPages, new Payload());

        Pipe pipe = new Pipe("urlSource");
        BaseScoreGenerator scorer = new FixedScoreGenerator();
        BaseFetcher fetcher = new SimpleHttpFetcher(ConfigUtils.BIXO_TEST_AGENT);
        FetchPipe fetchPipe = new FetchPipe(pipe, scorer, fetcher, 1);
       
        String output = "build/test/FetchPipeTest/testFetchPipe";
        BasePath outputPath = platform.makePath(output);
        BasePath statusPath = platform.makePath(outputPath, "status");
        BasePath contentPath = platform.makePath(outputPath, "content");
        Tap status = platform.makeTap(platform.makeBinaryScheme(StatusDatum.FIELDS), statusPath, SinkMode.REPLACE);
        Tap content = platform.makeTap(platform.makeBinaryScheme(FetchedDatum.FIELDS), contentPath, SinkMode.REPLACE);

        // Finally we can run it.
        FlowConnector flowConnector = platform.makeFlowConnector();
        Flow flow = flowConnector.connect(in, FetchPipe.makeSinkMap(status, content), fetchPipe);
        TestWebServer webServer = null;
       
        try {
            webServer = new TestWebServer(new NoRobotsResponseHandler(), port);
            flow.complete();
        } finally {
            webServer.stop();
        }
       
        // Verify numPages fetched and numPages status entries were saved.
        Tap validate = platform.makeTap(platform.makeBinaryScheme(FetchedDatum.FIELDS), contentPath);
        TupleEntryIterator tupleEntryIterator = validate.openForRead(platform.makeFlowProcess());
       
        int totalEntries = 0;
        boolean[] fetchedPages = new boolean[numPages];
        while (tupleEntryIterator.hasNext()) {
            TupleEntry entry = tupleEntryIterator.next();
            totalEntries += 1;

            // Verify we can convert properly
            FetchedDatum datum = new FetchedDatum(entry);
            String url = datum.getUrl();
            Assert.assertNotNull(url);
           
            // Verify that we got one of each page
            int idOffset = url.indexOf(".html") - 1;
            int pageId = Integer.parseInt(url.substring(idOffset, idOffset + 1));
            Assert.assertFalse(fetchedPages[pageId]);
            fetchedPages[pageId] = true;
        }
       
        Assert.assertEquals(numPages, totalEntries);
        tupleEntryIterator.close();
       
        validate = platform.makeTap(platform.makeBinaryScheme(StatusDatum.FIELDS), statusPath);
        tupleEntryIterator = validate.openForRead(platform.makeFlowProcess());
        totalEntries = 0;
        fetchedPages = new boolean[numPages];
        while (tupleEntryIterator.hasNext()) {
            TupleEntry entry = tupleEntryIterator.next();
            totalEntries += 1;
View Full Code Here

        final int numPages = 1;
        final int port = 8089;
       
        Payload payload = new Payload();
        payload.put("payload-field-1", 1);
        Tap in = makeInputData(platform, "testRedirectException", "localhost:" + port, numPages, payload);

        Pipe pipe = new Pipe("urlSource");
        BaseScoreGenerator scorer = new FixedScoreGenerator();
        FetcherPolicy policy = new FetcherPolicy();
        policy.setRedirectMode(RedirectMode.FOLLOW_TEMP);
        BaseFetcher fetcher = new SimpleHttpFetcher(1, policy, ConfigUtils.BIXO_TEST_AGENT);
        FetchPipe fetchPipe = new FetchPipe(pipe, scorer, fetcher, 1);
       
        String output = "build/test/FetchPipeTest/testRedirectException";
        BasePath outputPath = platform.makePath(output);
        BasePath statusPath = platform.makePath(outputPath, "status");
        BasePath contentPath = platform.makePath(outputPath, "content");
        Tap status = platform.makeTap(platform.makeBinaryScheme(StatusDatum.FIELDS), statusPath, SinkMode.REPLACE);
        Tap content = platform.makeTap(platform.makeBinaryScheme(FetchedDatum.FIELDS), contentPath, SinkMode.REPLACE);

        // Finally we can run it.
        FlowConnector flowConnector = platform.makeFlowConnector();
        Flow flow = flowConnector.connect(in, FetchPipe.makeSinkMap(status, content), fetchPipe);
        TestWebServer webServer = null;
       
        try {
            webServer = new TestWebServer(new RedirectResponseHandler(true), port);
            flow.complete();
        } finally {
            webServer.stop();
        }
       
        // Verify that no content entries were saved, then walk the saved status entries.
        Tap validate = platform.makeTap(platform.makeBinaryScheme(FetchedDatum.FIELDS), contentPath);
        TupleEntryIterator tupleEntryIterator = validate.openForRead(platform.makeFlowProcess());
        Assert.assertFalse(tupleEntryIterator.hasNext());
        tupleEntryIterator.close();
       
        validate = platform.makeTap(platform.makeBinaryScheme(StatusDatum.FIELDS), statusPath);
        tupleEntryIterator = validate.openForRead(platform.makeFlowProcess());
        int totalEntries = 0;
        boolean[] fetchedPages = new boolean[numPages];
        while (tupleEntryIterator.hasNext()) {
            TupleEntry entry = tupleEntryIterator.next();
            totalEntries += 1;
View Full Code Here

        // System.setProperty("bixo.root.level", "TRACE");
       
        final int numPages = 10;
        final int port = 8089;
       
        Tap in = makeInputData(platform, "testTerminatingFetchPipe", "localhost:" + port, numPages, null);

        Pipe pipe = new Pipe("urlSource");
        BaseScoreGenerator scorer = new FixedScoreGenerator();
       
        FetcherPolicy policy = new FetcherPolicy();
        policy.setCrawlEndTime(System.currentTimeMillis() + 50000);
        // Assume we should only need 10ms for fetching all 10 URLs.
        policy.setRequestTimeout(10);
       
        BaseFetcher fetcher = new SimpleHttpFetcher(1, policy, ConfigUtils.BIXO_TEST_AGENT);
        FetchPipe fetchPipe = new FetchPipe(pipe, scorer, fetcher, 1);
       
        BasePath outputPath = makeOutputPath(platform, "testTerminatingFetchPipe");
        BasePath statusPath = platform.makePath(outputPath, "status");
        Tap status = platform.makeTap(platform.makeBinaryScheme(StatusDatum.FIELDS), statusPath, SinkMode.REPLACE);

        // Finally we can run it.
        FlowConnector flowConnector = platform.makeFlowConnector();
        Flow flow = flowConnector.connect(in, status, fetchPipe.getStatusTailPipe());
        TestWebServer webServer = null;
       
        try {
            final int numBytes = 10000;
           
            // Pick a time way longer than the FetcherPolicy.getRequestTimeout().
            final long numMilliseconds = 100 * 1000L;
            webServer = new TestWebServer(new NoRobotsResponseHandler(numBytes, numMilliseconds), port);
            flow.complete();
        } finally {
            webServer.stop();
        }
       
        Tap validate = platform.makeTap(platform.makeBinaryScheme(StatusDatum.FIELDS), statusPath);
        TupleEntryIterator tupleEntryIterator = validate.openForRead(platform.makeFlowProcess());
        int totalEntries = 0;
        while (tupleEntryIterator.hasNext()) {
            TupleEntry entry = tupleEntryIterator.next();
            totalEntries += 1;

View Full Code Here

    }
   
    protected void testPayloads(BixoPlatform platform) throws Exception {
        Payload payload = new Payload();
        payload.put("key", "value");
        Tap in = makeInputData(platform, "testPayloads", 1, 1, payload);

        Pipe pipe = new Pipe("urlSource");
        BaseFetcher fetcher = new FakeHttpFetcher(false, 10);
        BaseScoreGenerator scorer = new FixedScoreGenerator();
        BaseRobotsParser parser = new SimpleRobotRulesParser();
        BaseFetchJobPolicy fetchJobPolicy = new DefaultFetchJobPolicy();
        FetchPipe fetchPipe = new FetchPipe(pipe, scorer, fetcher, fetcher, parser, fetchJobPolicy, 1);
       
        String output = "build/test/FetchPipeTest/dual";
        BasePath outputPath = platform.makePath(output);
        BasePath statusPath = platform.makePath(outputPath, "status");
        Tap status = platform.makeTap(platform.makeBinaryScheme(StatusDatum.FIELDS), statusPath, SinkMode.REPLACE);
        BasePath contentPath = platform.makePath(outputPath, "content");
        Tap content = platform.makeTap(platform.makeBinaryScheme(FetchedDatum.FIELDS), contentPath, SinkMode.REPLACE);

        // Finally we can run it.
        FlowConnector flowConnector = platform.makeFlowConnector();
        Flow flow = flowConnector.connect(in, FetchPipe.makeSinkMap(status, content), fetchPipe);
        flow.complete();
       
        Tap validate = platform.makeTap(platform.makeBinaryScheme(FetchedDatum.FIELDS), contentPath);
        TupleEntryIterator tupleEntryIterator = validate.openForRead(platform.makeFlowProcess());
       
        int totalEntries = 0;
        while (tupleEntryIterator.hasNext()) {
            TupleEntry entry = tupleEntryIterator.next();
            totalEntries += 1;
           
            FetchedDatum datum = new FetchedDatum(entry);
            String payloadValue = (String)datum.getPayloadValue("key");
            Assert.assertNotNull(payloadValue);
            Assert.assertEquals("value", payloadValue);
        }
       
        Assert.assertEquals(1, totalEntries);
        tupleEntryIterator.close();
       
        validate = platform.makeTap(platform.makeBinaryScheme(StatusDatum.FIELDS), statusPath);
        tupleEntryIterator = validate.openForRead(platform.makeFlowProcess());
        totalEntries = 0;
        while (tupleEntryIterator.hasNext()) {
            TupleEntry entry = tupleEntryIterator.next();
            totalEntries += 1;
View Full Code Here

        Assert.assertEquals(1, totalEntries);
    }
   
    protected void testSkippingURLsByScore(BixoPlatform platform) throws Exception {
        // Create four pages, for domain0/page0, domain0/page1, domain1/page0, domain1/page1
        Tap in = makeInputData(platform, "testSkippingURLsByScore", 2, 2);

        Pipe pipe = new Pipe("urlSource");
        BaseFetcher fetcher = new FakeHttpFetcher(false, 1);
        BaseScoreGenerator scorer = new SkippedScoreGenerator();
        BaseRobotsParser parser = new SimpleRobotRulesParser();
        BaseFetchJobPolicy fetchJobPolicy = new DefaultFetchJobPolicy();
        FetchPipe fetchPipe = new FetchPipe(pipe, scorer, fetcher, fetcher, parser, fetchJobPolicy, 1);
       
        BasePath outputPath = makeOutputPath(platform, "testSkippingURLsByScore");
        BasePath contentPath = platform.makePath(outputPath, "content");
        Tap content = platform.makeTap(platform.makeBinaryScheme(FetchedDatum.FIELDS), contentPath, SinkMode.REPLACE);
       
        // Finally we can run it.
        FlowConnector flowConnector = platform.makeFlowConnector();
        Flow flow = flowConnector.connect(in, FetchPipe.makeSinkMap(null, content), fetchPipe);
        flow.complete();
       
        Tap validate = platform.makeTap(platform.makeBinaryScheme(FetchedDatum.FIELDS), contentPath);
        TupleEntryIterator tupleEntryIterator = validate.openForRead(platform.makeFlowProcess());
        Assert.assertTrue(tupleEntryIterator.hasNext());
        TupleEntry te = tupleEntryIterator.next();
        String url = te.getString(FetchedDatum.URL_FN);
        Assert.assertTrue(url.contains("bixo-test-domain-1.com/page-1.html"));
       
View Full Code Here

TOP

Related Classes of cascading.tap.Tap

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle, Inc. Contact coftware#gmail.com.