Package cascading.tap

Examples of cascading.tap.Tap

    @SuppressWarnings({ "rawtypes", "unchecked" })
    private static void processCrawlDb(BixoPlatform platform, BasePath latestCrawlDirPath, boolean exportDb) throws Exception {
        TupleEntryIterator iter;
        int totalEntries;
        BasePath crawlDbPath = platform.makePath(latestCrawlDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
        Tap crawldbTap = platform.makeTap(platform.makeBinaryScheme(CrawlDbDatum.FIELDS), crawlDbPath);
        iter = crawldbTap.openForRead(platform.makeFlowProcess());
        totalEntries = 0;
        int fetchedUrls = 0;
        int unfetchedUrls = 0;"Analyzing: " +  CrawlConfig.CRAWLDB_SUBDIR_NAME);
    @SuppressWarnings({ "rawtypes", "unchecked" })
    public static void importOneDomain(BasePlatform platform, String targetDomain, BasePath crawlDbPath) throws Exception {
        try {
            Tap urlSink = platform.makeTap(platform.makeBinaryScheme(CrawlDbDatum.FIELDS), crawlDbPath, SinkMode.REPLACE);
            TupleEntryCollector writer = urlSink.openForWrite(platform.makeFlowProcess());
            SimpleUrlNormalizer normalizer = new SimpleUrlNormalizer();

            CrawlDbDatum datum = new CrawlDbDatum(normalizer.normalize("http://" + targetDomain), 0, 0, UrlStatus.UNFETCHED, 0);

        createDataFile(platform, unfetchedDatumsPath, unfetchedDatums);

        // create a workflow
        Tap inputSource1 = platform.makeTap(platform.makeBinaryScheme(UrlDatum.FIELDS), fetchedDatumsPath);
        Pipe fetchedPipe = new Pipe("fetched");
        Tap inputSource2 = platform.makeTap(platform.makeBinaryScheme(UrlDatum.FIELDS), unfetchedDatumsPath);
        Pipe unfetchedPipe = new Pipe("unfetched");

        Map<String, Tap> sources = new HashMap<String, Tap>();
        sources.put(fetchedPipe.getName(), inputSource1);
        sources.put(unfetchedPipe.getName(), inputSource2);

        BasePath resultsPath = platform.makePath(workingDirPath, "results");
        Tap resultSink = platform.makeTap(platform.makeBinaryScheme(UrlDatum.FIELDS), resultsPath, SinkMode.REPLACE);

        Pipe resultsPipe = new GroupBy("results pipe", Pipe.pipes(fetchedPipe, unfetchedPipe),
                        new Fields(UrlDatum.URL_FN));
        resultsPipe = new Every(resultsPipe, new LatestUrlDatumBuffer(), Fields.RESULTS);

        FlowConnector flowConnector = platform.makeFlowConnector();
        Flow flow = flowConnector.connect(sources, resultSink, resultsPipe);
        // verify that the resulting pipe has the latest tuple
        Tap testSink = platform.makeTap(platform.makeBinaryScheme(UrlDatum.FIELDS), resultsPath);
        TupleEntryIterator reader = testSink.openForRead(platform.makeFlowProcess());
        int count = 0;
        long latest = 0;
        while (reader.hasNext()) {
            TupleEntry next =;
            UrlDatum datum = new UrlDatum(next);
    @SuppressWarnings({ "unchecked", "rawtypes" })
    private void createDataFile(BasePlatform platform, BasePath filePath, List<UrlDatum> datums) throws Exception {
        Tap urlSink = platform.makeTap(platform.makeBinaryScheme(UrlDatum.FIELDS), filePath, SinkMode.REPLACE);
        TupleEntryCollector writer = urlSink.openForWrite(platform.makeFlowProcess());
        for (UrlDatum datum : datums) {
        // Input : the crawldb
        platform.assertPathExists(crawlDbPath, "CrawlDb doesn't exist");

        // Our crawl db is defined by the CrawlDbDatum
        Tap inputSource = platform.makeTap(platform.makeBinaryScheme(CrawlDbDatum.FIELDS), crawlDbPath);
        Pipe importPipe = new Pipe("import pipe");

        // Split into tuples that are still to be fetched and those that have already been fetched
        SplitterAssembly splitter = new SplitterAssembly(importPipe, new SplitFetchedUnfetchedCrawlDatums());

        Pipe finishedDatumsFromDb = splitter.getRHSPipe();
        Pipe urlsToFetchPipe = new Pipe("urls to Fetch", splitter.getLHSPipe());

        // Convert the urlsToFetchPipe so that we now deal with UrlDatums.
        urlsToFetchPipe = new Each(urlsToFetchPipe, new CreateUrlDatumFromCrawlDbFunction());
        // A TupleLogger is a good way to follow the tuples around in a flow. You can enable the output
        // of tuples by setting options.setDebugLogging() to true.
        TupleLogger tupleLogger = new TupleLogger();
        urlsToFetchPipe = TupleLogger.makePipe(urlsToFetchPipe, true);
        // Create the output sinks :
        //      crawldb
        //      content
        //      parse
        //      status
        BasePath outCrawlDbPath = platform.makePath(curWorkingDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
        Tap loopCrawldbSink = platform.makeTap(platform.makeBinaryScheme(CrawlDbDatum.FIELDS), outCrawlDbPath, SinkMode.REPLACE);

        BasePath contentDirPath = platform.makePath(curWorkingDirPath, CrawlConfig.CONTENT_SUBDIR_NAME);
        Tap contentSink = platform.makeTap(platform.makeBinaryScheme(FetchedDatum.FIELDS), contentDirPath, SinkMode.REPLACE);

        BasePath parseDirPath = platform.makePath(curWorkingDirPath, CrawlConfig.PARSE_SUBDIR_NAME);
        Tap parseSink = platform.makeTap(platform.makeBinaryScheme(ParsedDatum.FIELDS), parseDirPath, SinkMode.REPLACE);

        BasePath statusDirPath = platform.makePath(curWorkingDirPath, CrawlConfig.STATUS_SUBDIR_NAME);
        Tap statusSink = platform.makeTap(platform.makeTextScheme(), statusDirPath, SinkMode.REPLACE);

        // Create the sub-assembly that runs the fetch job
        SimpleHttpFetcher fetcher = new SimpleHttpFetcher(options.getMaxThreads(), fetcherPolicy, userAgent);

        // You can also provide a set of MIME types to restrict which content types you
        // want to deal with - for now keep it simple.
        Set<String> validMimeTypes = new HashSet<String>();

        // The scorer is used by the FetchPipe to assign a score to every URL that passes the
        // robots.txt processing. The score is used to sort URLs such that higher scoring URLs
        // are fetched first. If URLs have to be skipped for any reason, the lower-scoring URLs are the ones skipped.
        BaseScoreGenerator scorer = new FixedScoreGenerator();

        FetchPipe fetchPipe = new FetchPipe(urlsToFetchPipe, scorer, fetcher, platform.getNumReduceTasks());
        Pipe statusPipe = new Pipe("status pipe", fetchPipe.getStatusTailPipe());
        Pipe contentPipe = new Pipe("content pipe", fetchPipe.getContentTailPipe());
        contentPipe = TupleLogger.makePipe(contentPipe, true);
        // Take content and split it into content output plus parse to extract URLs.
        SimpleParser parser;
        if (options.isUseBoilerpipe()) {
            parser = new SimpleParser(new BoilerpipeContentExtractor(), new SimpleLinkExtractor(), new ParserPolicy());
        } else if (options.isGenerateHTML()) {
            parser = new SimpleParser(new HtmlContentExtractor(), new SimpleLinkExtractor(), new ParserPolicy(), true);
        } else {
            parser = new SimpleParser();
        ParsePipe parsePipe = new ParsePipe(contentPipe, parser);

        // Create the output map that connects each tail pipe to the appropriate sink, and the
        // list of tail pipes.
        Map<String, Tap> sinkMap = new HashMap<String, Tap>();
        List<Pipe> tailPipes = new ArrayList<Pipe>();
        if (options.isGenerateHTML()) {
            // Let's write out the parse as text:
            Pipe textParsePipe = new Pipe("text parse data", parsePipe.getTailPipe());
            textParsePipe = new Each(textParsePipe, new Fields(ParsedDatum.PARSED_TEXT_FN), new RegexReplace(new Fields(ParsedDatum.PARSED_TEXT_FN), "[\\r\\n\\t]+", " ", true), Fields.REPLACE);
            textParsePipe = new Each(textParsePipe, new Fields(ParsedDatum.URL_FN, ParsedDatum.PARSED_TEXT_FN), new Identity());
            BasePath textParsePath = platform.makePath(curWorkingDirPath, CrawlConfig.HTML_SUBDIR_NAME);
            Tap textParseTap = platform.makeTap(platform.makeTextScheme(), textParsePath, SinkMode.REPLACE);
            sinkMap.put(textParsePipe.getName(), textParseTap);
        // Let's output a WritableSequenceFile as an example - this file can
        // then be used as input when working with Mahout.
        // For now we only do it when we are running in Hadoop mode
          Tap writableSeqFileSink = null;
          Pipe writableSeqFileDataPipe = null;
            if (!options.isLocalPlatformMode()) {
                writableSeqFileDataPipe = new Pipe("writable seqfile data", new Each(parsePipe.getTailPipe(), new CreateWritableSeqFileData()));
                BasePath writableSeqFileDataPath = platform.makePath(curWorkingDirPath, CrawlConfig.EXTRACTED_TEXT_SUBDIR_NAME);
                WritableSequenceFile writableSeqScheme = new WritableSequenceFile(new Fields(CrawlConfig.WRITABLE_SEQ_FILE_KEY_FN, CrawlConfig.WRITABLE_SEQ_FILE_VALUE_FN), Text.class, Text.class);
        if (!crawlDBPath.exists()) {
            Pipe importPipe = new Pipe("import URLs");
            importPipe = new Each(importPipe, new LoadUrlsFunction());
            BasePath inputPath = platform.makePath(input);
            Tap sourceTap = platform.makeTap(platform.makeTextScheme(), inputPath);
            Tap sinkTap = platform.makeTap(platform.makeBinaryScheme(UrlDatum.FIELDS), crawlDBPath, SinkMode.REPLACE);
            FlowConnector flowConnector = platform.makeFlowConnector();
            Flow flow = flowConnector.connect(sourceTap, sinkTap, importPipe);
        String input = makeCrawlDb(workingFolder, "src/it/resources/apple-pages.txt");
        BixoPlatform platform = new BixoPlatform(FetcherTest.class, Platform.Local);
        BasePath inputPath = platform.makePath(input);

        Tap in = platform.makeTap(platform.makeBinaryScheme(UrlDatum.FIELDS), inputPath);
        String outputDir = "build/it/FetcherTest/testStaleConnection/out";
        BasePath outputPath = platform.makePath(outputDir);

        BasePath contentPath = platform.makePath(outputPath, "content");
        Tap content = platform.makeTap(platform.makeBinaryScheme(FetchedDatum.FIELDS), contentPath, SinkMode.REPLACE);

        BasePath statusPath = platform.makePath(outputPath, "status");
        Tap status = platform.makeTap(platform.makeBinaryScheme(StatusDatum.FIELDS), statusPath, SinkMode.REPLACE);

        Pipe pipe = new Pipe("urlSource");

        UserAgent userAgent = new FirefoxUserAgent();
        FetcherPolicy fetcherPolicy = new FetcherPolicy();
        fetcherPolicy.setCrawlDelay(5 * 1000L);
        BaseFetcher fetcher = new SimpleHttpFetcher(2, fetcherPolicy, userAgent);
        BaseScoreGenerator scorer = new FixedScoreGenerator();
        FetchPipe fetchPipe = new FetchPipe(pipe, scorer, fetcher, 1);

        FlowConnector flowConnector = platform.makeFlowConnector();
        Flow flow = flowConnector.connect(in, FetchPipe.makeSinkMap(status, content), fetchPipe);
        // Test for all valid fetches.
        Tap validate = platform.makeTap(platform.makeBinaryScheme(StatusDatum.FIELDS), statusPath);
        TupleEntryIterator tupleEntryIterator = validate.openForRead(platform.makeFlowProcess());
        while (tupleEntryIterator.hasNext()) {
            TupleEntry entry =;
            StatusDatum sd = new StatusDatum(entry);
            if (sd.getStatus() != UrlStatus.FETCHED) {
                LOGGER.error(String.format("Fetched failed! Status is %s for %s", sd.getStatus(), sd.getUrl()));
        String workingFolder = "build/test-it/FetcherTest/testRunFetcher";
        String input = makeCrawlDb(workingFolder, "src/it/resources/top10urls.txt");
        BixoPlatform platform = new BixoPlatform(FetcherTest.class, Platform.Local);
        BasePath workingPath = platform.makePath(workingFolder);
        BasePath inputPath = platform.makePath(input);
        Tap in = platform.makeTap(platform.makeBinaryScheme(UrlDatum.FIELDS), inputPath);
        BasePath contentPath = platform.makePath(workingPath, "content");
        Tap content = platform.makeTap(platform.makeBinaryScheme(FetchedDatum.FIELDS), contentPath, SinkMode.REPLACE);
        BasePath statusPath = platform.makePath(workingPath, "status");
        Tap status = platform.makeTap(platform.makeTextScheme(), statusPath, SinkMode.REPLACE);

        Pipe pipe = new Pipe("urlSource");

        UserAgent userAgent = new FirefoxUserAgent();
        BaseFetcher fetcher = new SimpleHttpFetcher(10, userAgent);
        BaseScoreGenerator scorer = new FixedScoreGenerator();
        FetchPipe fetchPipe = new FetchPipe(pipe, scorer, fetcher, 1);

        FlowConnector flowConnector = platform.makeFlowConnector();

        Flow flow = flowConnector.connect(in, FetchPipe.makeSinkMap(status, content), fetchPipe);
        // Test for 10 good fetches.
        Tap validate = platform.makeTap(platform.makeBinaryScheme(FetchedDatum.FIELDS), contentPath);
        TupleEntryIterator tupleEntryIterator = validate.openForRead(platform.makeFlowProcess());
        int fetchedPages = 0;
        while (tupleEntryIterator.hasNext()) {
            TupleEntry entry =;
            new FetchedDatum(entry);
            fetchedPages += 1;
        try {
            BixoPlatform platform = new BixoPlatform(AnalyzeEmail.class, options.getPlatformMode());
            // Create the input (source tap), which is just a text file reader
            BasePath inputPath = platform.makePath(inputFileName);
            Tap sourceTap = platform.makeTap(platform.makeTextScheme(), inputPath);
            // Create the sub-assembly that runs the fetch job
            UserAgent userAgent = new UserAgent(options.getAgentName(), EMAIL_ADDRESS, WEB_ADDRESS);
            Pipe importPipe = new Each("url importer", new Fields("line"), new LoadUrlFunction());
            BaseScoreGenerator scorer = new FixedScoreGenerator();
            BaseFetcher fetcher = new SimpleHttpFetcher(MAX_THREADS, userAgent);
            FetchPipe fetchPagePipe = new FetchPipe(importPipe, scorer, fetcher, NUM_REDUCERS);
            // Here's the pipe that will output UrlDatum tuples, by extracting URLs from the mod_mbox-generated page.
        Pipe mboxPagePipe = new Each(fetchPagePipe.getContentTailPipe(), new ParseModMboxPageFunction(), Fields.RESULTS);

        // Create a named pipe for the status of the mod_mbox-generated pages.
            Pipe mboxPageStatusPipe = new Pipe(MBOX_PAGE_STATUS_PIPE_NAME, fetchPagePipe.getStatusTailPipe());

            // Set up an appropriate FetcherPolicy, where we increase the max content size (since mailbox
            // files can be big, e.g. 4MB).
            FetcherPolicy defaultPolicy = new FetcherPolicy();
            fetcher = new SimpleHttpFetcher(MAX_THREADS, defaultPolicy, userAgent);
            // We can create the fetch pipe, and set up our Mbox splitter to run on content.
            FetchPipe fetchMboxPipe = new FetchPipe(mboxPagePipe, scorer, fetcher, NUM_REDUCERS);
            SplitEmails splitterPipe = new SplitEmails(fetchMboxPipe);
            // Now create the pipe that's going to analyze the emails we get after splitting them up.
            Pipe analysisPipe = new Pipe(ANALYZER_PIPE_NAME, splitterPipe.getTails()[0]);
            analysisPipe = new Each(analysisPipe, new ParseEmailFunction());
            // We'll get output that has ANALYZED_EMAIL_FIELDS in it. We want to group by
            // the message-id field, and then aggregate the scores within each group.
            analysisPipe = new GroupBy(analysisPipe, new Fields(FieldNames.MESSAGE_ID));
            analysisPipe = new Every(analysisPipe, new CalcMessageScoreBuffer(), Fields.RESULTS);

            // Now we want to sum the scores for each user, which is another grouping/summing.
            analysisPipe = new GroupBy(analysisPipe, new Fields(FieldNames.EMAIL_ADDRESS));
            analysisPipe = new Every(analysisPipe, new SumScoresBuffer(), Fields.RESULTS);
            // Let's filter out anybody with an uninteresting score.
            ExpressionFilter filter = new ExpressionFilter(String.format("%s <= 0.0", FieldNames.SUMMED_SCORE), Double.class);
            analysisPipe = new Each(analysisPipe, filter);
            // And let's sort in reverse order (high to low score)
            analysisPipe = new GroupBy(analysisPipe, new Fields(FieldNames.SUMMED_SCORE), true);

            // Create the sink taps
            BasePath outputPath = platform.makePath(outputDirName);
            Tap pageStatusSinkTap = platform.makeTap(platform.makeTextScheme(),
                            platform.makePath(outputPath, "page-status"), SinkMode.REPLACE);
            Tap mboxStatusSinkTap = platform.makeTap(platform.makeTextScheme(),
                            platform.makePath(outputPath, "mbox-status"), SinkMode.REPLACE);
            Tap contentSinkTap = platform.makeTap(platform.makeBinaryScheme(FetchedDatum.FIELDS),
                            platform.makePath(outputPath, "content"), SinkMode.REPLACE);
            Tap analyzerSinkTap = platform.makeTap(platform.makeTextScheme(),
                            platform.makePath(outputPath, "analysis"), SinkMode.REPLACE);

            HashMap<String, Tap> sinkTapMap = new HashMap<String, Tap>(2);
            sinkTapMap.put(MBOX_PAGE_STATUS_PIPE_NAME, pageStatusSinkTap);
            sinkTapMap.put(FetchPipe.STATUS_PIPE_NAME, mboxStatusSinkTap);
        PartitioningKey groupingKey = new PartitioningKey("key", 1);
        FetchSetDatum pfd = new FetchSetDatum(urls, fetchTime, 1000, groupingKey.getValue(), groupingKey.getRef());
        BixoPlatform platform = new BixoPlatform(ScoredUrlDatumTest.class, platformMode);
        BasePath path = platform.makePath("build/test/ScoredUrlDatumTest/testCascadingSerialization/in");
        Tap in = platform.makeTap(platform.makeBinaryScheme(FetchSetDatum.FIELDS), path, SinkMode.REPLACE);
        TupleEntryCollector write = in.openForWrite(platform.makeFlowProcess());
