Examples of makeTextScheme()


Examples of bixo.config.BixoPlatform.makeTextScheme()

        if (!crawlDBPath.exists()) {
            Pipe importPipe = new Pipe("import URLs");
            importPipe = new Each(importPipe, new LoadUrlsFunction());
           
            BasePath inputPath = platform.makePath(input);
            Tap sourceTap = platform.makeTap(platform.makeTextScheme(), inputPath);
            Tap sinkTap = platform.makeTap(platform.makeBinaryScheme(UrlDatum.FIELDS), crawlDBPath, SinkMode.REPLACE);
           
            FlowConnector flowConnector = platform.makeFlowConnector();
            Flow flow = flowConnector.connect(sourceTap, sinkTap, importPipe);
            flow.complete();
View Full Code Here

Examples of bixo.config.BixoPlatform.makeTextScheme()

        Tap in = platform.makeTap(platform.makeBinaryScheme(UrlDatum.FIELDS), inputPath);
       
        BasePath contentPath = platform.makePath(workingPath, "content");
        Tap content = platform.makeTap(platform.makeBinaryScheme(FetchedDatum.FIELDS), contentPath, SinkMode.REPLACE);
        BasePath statusPath = platform.makePath(workingPath, "status");
        Tap status = platform.makeTap(platform.makeTextScheme(), statusPath, SinkMode.REPLACE);

       
        Pipe pipe = new Pipe("urlSource");

        UserAgent userAgent = new FirefoxUserAgent();
View Full Code Here

Examples of bixo.config.BixoPlatform.makeTextScheme()

        BasePath parseDirPath = platform.makePath(curWorkingDirPath, CrawlConfig.PARSE_SUBDIR_NAME);
        Tap parseSink = platform.makeTap(platform.makeBinaryScheme(ParsedDatum.FIELDS), parseDirPath, SinkMode.REPLACE);

        BasePath statusDirPath = platform.makePath(curWorkingDirPath, CrawlConfig.STATUS_SUBDIR_NAME);
        Tap statusSink = platform.makeTap(platform.makeTextScheme(), statusDirPath, SinkMode.REPLACE);

        // Create the sub-assembly that runs the fetch job
        SimpleHttpFetcher fetcher = new SimpleHttpFetcher(options.getMaxThreads(), fetcherPolicy, userAgent);
        fetcher.setMaxRetryCount(CrawlConfig.MAX_RETRIES);
        fetcher.setSocketTimeout(CrawlConfig.SOCKET_TIMEOUT);
View Full Code Here

Examples of bixo.config.BixoPlatform.makeTextScheme()

            // Let's write out the parse as text:
            Pipe textParsePipe = new Pipe("text parse data", parsePipe.getTailPipe());
            textParsePipe = new Each(textParsePipe, new Fields(ParsedDatum.PARSED_TEXT_FN), new RegexReplace(new Fields(ParsedDatum.PARSED_TEXT_FN), "[\\r\\n\\t]+", " ", true), Fields.REPLACE);
            textParsePipe = new Each(textParsePipe, new Fields(ParsedDatum.URL_FN, ParsedDatum.PARSED_TEXT_FN), new Identity());
            BasePath textParsePath = platform.makePath(curWorkingDirPath, CrawlConfig.HTML_SUBDIR_NAME);
            Tap textParseTap = platform.makeTap(platform.makeTextScheme(), textParsePath, SinkMode.REPLACE);
            sinkMap.put(textParsePipe.getName(), textParseTap);
            tailPipes.add(textParsePipe);
        }
       
        // Let's output a WritableSequenceFile as an example - this file can
View Full Code Here

Examples of bixo.config.BixoPlatform.makeTextScheme()

        try {
            BixoPlatform platform = new BixoPlatform(AnalyzeEmail.class, options.getPlatformMode());
            // Create the input (source tap), which is just a text file reader
            BasePath inputPath = platform.makePath(inputFileName);
            Tap sourceTap = platform.makeTap(platform.makeTextScheme(), inputPath);
           
            // Create the sub-assembly that runs the fetch job
            UserAgent userAgent = new UserAgent(options.getAgentName(), EMAIL_ADDRESS, WEB_ADDRESS);
            Pipe importPipe = new Each("url importer", new Fields("line"), new LoadUrlFunction());
           
View Full Code Here

Examples of bixo.config.BixoPlatform.makeTextScheme()

            // And let's sort in reverse order (high to low score)
            analysisPipe = new GroupBy(analysisPipe, new Fields(FieldNames.SUMMED_SCORE), true);

            // Create the sink taps
            BasePath outputPath = platform.makePath(outputDirName);
            Tap pageStatusSinkTap = platform.makeTap(platform.makeTextScheme(),
                            platform.makePath(outputPath, "page-status"), SinkMode.REPLACE);
            Tap mboxStatusSinkTap = platform.makeTap(platform.makeTextScheme(),
                            platform.makePath(outputPath, "mbox-status"), SinkMode.REPLACE);
            Tap contentSinkTap = platform.makeTap(platform.makeBinaryScheme(FetchedDatum.FIELDS),
                            platform.makePath(outputPath, "content"), SinkMode.REPLACE);
View Full Code Here

Examples of bixo.config.BixoPlatform.makeTextScheme()

            // Create the sink taps
            BasePath outputPath = platform.makePath(outputDirName);
            Tap pageStatusSinkTap = platform.makeTap(platform.makeTextScheme(),
                            platform.makePath(outputPath, "page-status"), SinkMode.REPLACE);
            Tap mboxStatusSinkTap = platform.makeTap(platform.makeTextScheme(),
                            platform.makePath(outputPath, "mbox-status"), SinkMode.REPLACE);
            Tap contentSinkTap = platform.makeTap(platform.makeBinaryScheme(FetchedDatum.FIELDS),
                            platform.makePath(outputPath, "content"), SinkMode.REPLACE);
            Tap analyzerSinkTap = platform.makeTap(platform.makeTextScheme(),
                            platform.makePath(outputPath, "analysis"), SinkMode.REPLACE);
View Full Code Here

Examples of bixo.config.BixoPlatform.makeTextScheme()

                            platform.makePath(outputPath, "page-status"), SinkMode.REPLACE);
            Tap mboxStatusSinkTap = platform.makeTap(platform.makeTextScheme(),
                            platform.makePath(outputPath, "mbox-status"), SinkMode.REPLACE);
            Tap contentSinkTap = platform.makeTap(platform.makeBinaryScheme(FetchedDatum.FIELDS),
                            platform.makePath(outputPath, "content"), SinkMode.REPLACE);
            Tap analyzerSinkTap = platform.makeTap(platform.makeTextScheme(),
                            platform.makePath(outputPath, "analysis"), SinkMode.REPLACE);

            HashMap<String, Tap> sinkTapMap = new HashMap<String, Tap>(2);
            sinkTapMap.put(MBOX_PAGE_STATUS_PIPE_NAME, pageStatusSinkTap);
            sinkTapMap.put(FetchPipe.STATUS_PIPE_NAME, mboxStatusSinkTap);
View Full Code Here

Examples of bixo.config.BixoPlatform.makeTextScheme()

            }

            BixoPlatform platform = new BixoPlatform(RunFakeFetchPipe.class, Platform.Local);
           
            BasePath inputPath = platform.makePath(path.getFile());
            Tap in = platform.makeTap(platform.makeTextScheme(), inputPath);

            Pipe importPipe = new Each("url importer", new Fields("line"), new CreateUrlFunction());

            BaseScoreGenerator scorer = new FixedScoreGenerator();
            BaseFetcher fetcher = new FakeHttpFetcher(true, 10);
View Full Code Here

Examples of bixo.config.BixoPlatform.makeTextScheme()

            // Create the output, which is a dual file sink tap.
            String output = "build/test/RunFakeFetchPipe/dual";
            BasePath outputPath = platform.makePath(output);
            BasePath statusPath = platform.makePath(outputPath, "status");
            Tap status = platform.makeTap(platform.makeTextScheme(), statusPath, SinkMode.REPLACE);

            BasePath contentPath = platform.makePath(outputPath, "content");
            Tap content = platform.makeTap(platform.makeTextScheme(), contentPath, SinkMode.REPLACE);
           
            // Finally we can run it.
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., and is owned by ORACLE Inc. Contact coftware#gmail.com.