Package bixo.config

Examples of bixo.config.BixoPlatform

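BixoPlatform wraps Cascading's platform abstraction so that the same flow code can run locally or on a Hadoop cluster. Every example below follows the same basic pattern, sketched minimally here (MyTool and the paths are hypothetical placeholders, and the usual bixo.* and cascading.* imports are assumed):

        // Minimal sketch of the recurring BixoPlatform pattern (MyTool and paths are hypothetical).
        BixoPlatform platform = new BixoPlatform(MyTool.class, Platform.Local);

        // BasePath and Tap creation go through the platform, which hides local-vs-Hadoop details.
        BasePath inputPath = platform.makePath("build/test/MyTool/in");
        Tap source = platform.makeTap(platform.makeTextScheme(), inputPath);

        BasePath outputPath = platform.makePath("build/test/MyTool/out");
        Tap sink = platform.makeTap(platform.makeTextScheme(), outputPath, SinkMode.REPLACE);

        // The platform also supplies the FlowConnector that matches the chosen mode.
        Pipe pipe = new Pipe("urls");
        FlowConnector flowConnector = platform.makeFlowConnector();
        Flow flow = flowConnector.connect(source, sink, pipe);
        flow.complete();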

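From LengthenUrlsTool, running in local mode: each URL is passed through a UrlLengthener and the resulting tuples are discarded into a NullSinkTap.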
            Pipe pipe = new Pipe("urls");
            pipe = new Each(pipe, new UrlLengthener(fetcher));
            pipe = new Each(pipe, new Debug());

            BixoPlatform platform = new BixoPlatform(LengthenUrlsTool.class, Platform.Local);
            BasePath filePath = platform.makePath(filename);
            TextLine textLineLocalScheme = new TextLine(new Fields("url"));
            Tap sourceTap = platform.makeTap(textLineLocalScheme, filePath, SinkMode.KEEP);
            SinkTap sinkTap = new NullSinkTap(new Fields("url"));
           
            FlowConnector flowConnector = platform.makeFlowConnector();
            Flow flow = flowConnector.connect(sourceTap, sinkTap, pipe);

            flow.complete();
        } catch (Exception e) {
            System.err.println("Exception running tool: " + e.getMessage());


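From LatestUrlDatumBufferTest: fetched and unfetched UrlDatums for the same URL are grouped by URL, and LatestUrlDatumBuffer keeps the datum with the most recent fetch time.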
   
    @SuppressWarnings({ "rawtypes", "unchecked" })
    @Test
    public void testOperateWithGroupBy() throws Exception {
       
        BixoPlatform platform = new BixoPlatform(LatestUrlDatumBufferTest.class, Platform.Local);
       
        // Create a temp file with a fetched url
        BasePath workingDirPath = platform.makePath(WORKINGDIR);
        BasePath fetchedDatumsPath = platform.makePath(workingDirPath, "fetched");
        ArrayList<UrlDatum> fetchedDatums = new ArrayList<UrlDatum>();
        UrlDatum fetchedDatum1 = new UrlDatum("http://foo.com");
        fetchedDatum1.setPayloadValue(CrawlDbDatum.LAST_FETCHED_FIELD, 2L);
        fetchedDatums.add(fetchedDatum1);
        createDataFile(platform, fetchedDatumsPath, fetchedDatums);
       
        // And another with unfetched urls
        BasePath unfetchedDatumsPath = platform.makePath(workingDirPath, "unfetched");
        ArrayList<UrlDatum> unfetchedDatums = new ArrayList<UrlDatum>();
        UrlDatum unfetchedDatum1 = new UrlDatum("http://foo.com");
        unfetchedDatum1.setPayloadValue(CrawlDbDatum.LAST_FETCHED_FIELD, 0L);
        unfetchedDatums.add(unfetchedDatum1);
        UrlDatum unfetchedDatum2 = new UrlDatum("http://foo.com");
        unfetchedDatum2.setPayloadValue(CrawlDbDatum.LAST_FETCHED_FIELD, 0L);
        unfetchedDatums.add(unfetchedDatum2);
       
        createDataFile(platform, unfetchedDatumsPath, unfetchedDatums);

       
        // Create a workflow
        Tap inputSource1 = platform.makeTap(platform.makeBinaryScheme(UrlDatum.FIELDS), fetchedDatumsPath);
        Pipe fetchedPipe = new Pipe("fetched");
        Tap inputSource2 = platform.makeTap(platform.makeBinaryScheme(UrlDatum.FIELDS), unfetchedDatumsPath);
        Pipe unfetchedPipe = new Pipe("unfetched");

        Map<String, Tap> sources = new HashMap<String, Tap>();
        sources.put(fetchedPipe.getName(), inputSource1);
        sources.put(unfetchedPipe.getName(), inputSource2);

        BasePath resultsPath = platform.makePath(workingDirPath, "results");
        Tap resultSink = platform.makeTap(platform.makeBinaryScheme(UrlDatum.FIELDS), resultsPath, SinkMode.REPLACE);

        Pipe resultsPipe = new GroupBy("results pipe", Pipe.pipes(fetchedPipe, unfetchedPipe),
                        new Fields(UrlDatum.URL_FN));
        resultsPipe = new Every(resultsPipe, new LatestUrlDatumBuffer(), Fields.RESULTS);


        FlowConnector flowConnector = platform.makeFlowConnector();
        Flow flow = flowConnector.connect(sources, resultSink, resultsPipe);
        flow.complete();
       
        // Verify that the resulting pipe has the latest tuple
       
        Tap testSink = platform.makeTap(platform.makeBinaryScheme(UrlDatum.FIELDS), resultsPath);
        TupleEntryIterator reader = testSink.openForRead(platform.makeFlowProcess());
        int count = 0;
        long latest = 0;
        while (reader.hasNext()) {
            TupleEntry next = reader.next();
            UrlDatum datum = new UrlDatum(next);

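From DemoCrawlWorkflow: the full demo crawl workflow, which splits the crawl db into fetched and unfetched datums, fetches and parses new content, and merges all known URLs back into the next crawl db.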
   
    @SuppressWarnings("rawtypes")
    public static Flow createFlow(BasePath curWorkingDirPath, BasePath crawlDbPath, FetcherPolicy fetcherPolicy, UserAgent userAgent, BaseUrlFilter urlFilter, DemoCrawlToolOptions options) throws Throwable {

        BixoPlatform platform = new BixoPlatform(DemoCrawlWorkflow.class, options.getPlatformMode());
        platform.resetNumReduceTasks();

        // Input : the crawldb
        platform.assertPathExists(crawlDbPath, "CrawlDb doesn't exist");

        // Our crawl db is defined by the CrawlDbDatum
        Tap inputSource = platform.makeTap(platform.makeBinaryScheme(CrawlDbDatum.FIELDS), crawlDbPath);
        Pipe importPipe = new Pipe("import pipe");

        // Split into tuples that still need to be fetched and tuples that have already been fetched
        SplitterAssembly splitter = new SplitterAssembly(importPipe, new SplitFetchedUnfetchedCrawlDatums());

        Pipe finishedDatumsFromDb = splitter.getRHSPipe();
        Pipe urlsToFetchPipe = new Pipe("urls to Fetch", splitter.getLHSPipe());

        // Convert the urlsToFetchPipe so that we now deal with UrlDatums.
        urlsToFetchPipe = new Each(urlsToFetchPipe, new CreateUrlDatumFromCrawlDbFunction());
        // A TupleLogger is a good way to follow the tuples around in a flow. You can enable
        // the output of tuples by setting options.setDebugLogging() to true.
        urlsToFetchPipe = TupleLogger.makePipe(urlsToFetchPipe, true);
       
        // Create the output sinks:
        //      crawldb
        //      content
        //      parse
        //      status
        BasePath outCrawlDbPath = platform.makePath(curWorkingDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
        Tap loopCrawldbSink = platform.makeTap(platform.makeBinaryScheme(CrawlDbDatum.FIELDS), outCrawlDbPath, SinkMode.REPLACE);

        BasePath contentDirPath = platform.makePath(curWorkingDirPath, CrawlConfig.CONTENT_SUBDIR_NAME);
        Tap contentSink = platform.makeTap(platform.makeBinaryScheme(FetchedDatum.FIELDS), contentDirPath, SinkMode.REPLACE);

        BasePath parseDirPath = platform.makePath(curWorkingDirPath, CrawlConfig.PARSE_SUBDIR_NAME);
        Tap parseSink = platform.makeTap(platform.makeBinaryScheme(ParsedDatum.FIELDS), parseDirPath, SinkMode.REPLACE);

        BasePath statusDirPath = platform.makePath(curWorkingDirPath, CrawlConfig.STATUS_SUBDIR_NAME);
        Tap statusSink = platform.makeTap(platform.makeTextScheme(), statusDirPath, SinkMode.REPLACE);

        // Create the sub-assembly that runs the fetch job
        SimpleHttpFetcher fetcher = new SimpleHttpFetcher(options.getMaxThreads(), fetcherPolicy, userAgent);
        fetcher.setMaxRetryCount(CrawlConfig.MAX_RETRIES);
        fetcher.setSocketTimeout(CrawlConfig.SOCKET_TIMEOUT);
        fetcher.setConnectionTimeout(CrawlConfig.CONNECTION_TIMEOUT);

        // You can also provide a set of valid mime types to restrict which content types
        // get processed - for now, keep it simple.
        Set<String> validMimeTypes = new HashSet<String>();
        validMimeTypes.add("text/plain");
        validMimeTypes.add("text/html");
        fetcherPolicy.setValidMimeTypes(validMimeTypes);

        // The scorer is used by the FetchPipe to assign a score to every URL that passes the
        // robots.txt processing. The score is used to sort URLs so that higher-scoring URLs
        // are fetched first; if URLs must be skipped for any reason, the lower-scoring ones are dropped.
        BaseScoreGenerator scorer = new FixedScoreGenerator();

        FetchPipe fetchPipe = new FetchPipe(urlsToFetchPipe, scorer, fetcher, platform.getNumReduceTasks());
        Pipe statusPipe = new Pipe("status pipe", fetchPipe.getStatusTailPipe());
        Pipe contentPipe = new Pipe("content pipe", fetchPipe.getContentTailPipe());
        contentPipe = TupleLogger.makePipe(contentPipe, true);
       
        // Take content and split it into content output plus parse to extract URLs.
        SimpleParser parser;
        if (options.isUseBoilerpipe()) {
            parser = new SimpleParser(new BoilerpipeContentExtractor(), new SimpleLinkExtractor(), new ParserPolicy());
        } else if (options.isGenerateHTML()) {
            parser = new SimpleParser(new HtmlContentExtractor(), new SimpleLinkExtractor(), new ParserPolicy(), true);
        } else {
            parser = new SimpleParser();
        }
       
        parser.setExtractLanguage(false);
        ParsePipe parsePipe = new ParsePipe(contentPipe, parser);

       
        // Create the output map that connects each tail pipe to the appropriate sink, and the
        // list of tail pipes.
        Map<String, Tap> sinkMap = new HashMap<String, Tap>();
        List<Pipe> tailPipes = new ArrayList<Pipe>();
       
        if (options.isGenerateHTML()) {
            // Let's write out the parse as text:
            Pipe textParsePipe = new Pipe("text parse data", parsePipe.getTailPipe());
            textParsePipe = new Each(textParsePipe, new Fields(ParsedDatum.PARSED_TEXT_FN), new RegexReplace(new Fields(ParsedDatum.PARSED_TEXT_FN), "[\\r\\n\\t]+", " ", true), Fields.REPLACE);
            textParsePipe = new Each(textParsePipe, new Fields(ParsedDatum.URL_FN, ParsedDatum.PARSED_TEXT_FN), new Identity());
            BasePath textParsePath = platform.makePath(curWorkingDirPath, CrawlConfig.HTML_SUBDIR_NAME);
            Tap textParseTap = platform.makeTap(platform.makeTextScheme(), textParsePath, SinkMode.REPLACE);
            sinkMap.put(textParsePipe.getName(), textParseTap);
            tailPipes.add(textParsePipe);
        }
       
        // Let's output a WritableSequenceFile as an example - this file can then be used
        // as input when working with Mahout. For now we only do this when running in Hadoop mode.
        Tap writableSeqFileSink = null;
        Pipe writableSeqFileDataPipe = null;
        if (!options.isLocalPlatformMode()) {
            writableSeqFileDataPipe = new Pipe("writable seqfile data", new Each(parsePipe.getTailPipe(), new CreateWritableSeqFileData()));
            BasePath writableSeqFileDataPath = platform.makePath(curWorkingDirPath, CrawlConfig.EXTRACTED_TEXT_SUBDIR_NAME);
            WritableSequenceFile writableSeqScheme = new WritableSequenceFile(new Fields(CrawlConfig.WRITABLE_SEQ_FILE_KEY_FN, CrawlConfig.WRITABLE_SEQ_FILE_VALUE_FN), Text.class, Text.class);
            writableSeqFileSink = platform.makeTap(writableSeqScheme, writableSeqFileDataPath, SinkMode.REPLACE);
        }
       
        Pipe urlFromOutlinksPipe = new Pipe("url from outlinks", parsePipe.getTailPipe());
        urlFromOutlinksPipe = new Each(urlFromOutlinksPipe, new CreateUrlDatumFromOutlinksFunction(new SimpleUrlNormalizer(), new SimpleUrlValidator()));
        if (urlFilter != null) {
            urlFromOutlinksPipe = new Each(urlFromOutlinksPipe, new UrlFilter(urlFilter));
        }
       
        urlFromOutlinksPipe = TupleLogger.makePipe(urlFromOutlinksPipe, true);

        // Take the status output and extract URLs from it
        Pipe urlFromFetchPipe = new Pipe("url from fetch", statusPipe);
        urlFromFetchPipe = new Each(urlFromFetchPipe, new CreateUrlDatumFromStatusFunction());
        urlFromFetchPipe = TupleLogger.makePipe(urlFromFetchPipe, true);

        // Finally, join the URLs we get from parsing content with the URLs we got from
        // the status output, and the urls we didn't process from the db, so that we have
        // a unified stream of all known URLs for the crawldb.
        Pipe finishedUrlsFromDbPipe = new Each(finishedDatumsFromDb, new CreateUrlDatumFromCrawlDbFunction());
        finishedUrlsFromDbPipe = TupleLogger.makePipe(finishedUrlsFromDbPipe, true);

        // NOTE: Ideally you would just do a CoGroup instead of converting all the pipes to emit
        // UrlDatums and then doing the extra step of converting from UrlDatum to CrawlDbDatum.
        // This isn't done here because we share LatestUrlDatumBuffer() with JDBCCrawlTool.
        Pipe crawlDbPipe = new GroupBy("crawldb pipe", Pipe.pipes(urlFromFetchPipe, urlFromOutlinksPipe, finishedUrlsFromDbPipe),
                        new Fields(UrlDatum.URL_FN));
        crawlDbPipe = new Every(crawlDbPipe, new LatestUrlDatumBuffer(), Fields.RESULTS);
       
        Pipe outputPipe = new Pipe("output pipe", crawlDbPipe);
        outputPipe = new Each(outputPipe, new CreateCrawlDbDatumFromUrlFunction());
       
        // Create the output map that connects each tail pipe to the appropriate sink.
        sinkMap.put(statusPipe.getName(), statusSink);
        tailPipes.add(statusPipe);
       
        sinkMap.put(contentPipe.getName(), contentSink);
        tailPipes.add(contentPipe);

        sinkMap.put(parsePipe.getTailPipe().getName(), parseSink);
        tailPipes.add(parsePipe.getTailPipe());

        sinkMap.put(outputPipe.getName(), loopCrawldbSink);
        tailPipes.add(outputPipe);

        if (!options.isLocalPlatformMode()) {
            sinkMap.put(writableSeqFileDataPipe.getName(), writableSeqFileSink);
            tailPipes.add(writableSeqFileDataPipe);
        }
       
        FlowConnector flowConnector = platform.makeFlowConnector();
        Flow flow = flowConnector.connect(inputSource, sinkMap, tailPipes.toArray(new Pipe[tailPipes.size()]));

        return flow;
    }
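
A hypothetical driver for createFlow might look like the following sketch. The DemoCrawlToolOptions defaults, the UserAgent values, and the loop-directory layout are illustrative assumptions, not code from this class:

        // Hypothetical invocation of DemoCrawlWorkflow.createFlow() - names and paths
        // here are illustrative assumptions only.
        DemoCrawlToolOptions options = new DemoCrawlToolOptions();
        BixoPlatform platform = new BixoPlatform(DemoCrawlWorkflow.class, options.getPlatformMode());

        // Assumes a previous loop (or a seeding step) already wrote a crawl db here.
        BasePath prevLoopDirPath = platform.makePath("crawl/loop-0");
        BasePath crawlDbPath = platform.makePath(prevLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
        BasePath curLoopDirPath = platform.makePath("crawl/loop-1");

        UserAgent userAgent = new UserAgent("demo-crawler", "crawler@domain.com", "http://domain.com");
        FetcherPolicy fetcherPolicy = new FetcherPolicy();

        // A null BaseUrlFilter is fine - createFlow() checks for it before applying the filter.
        Flow flow = DemoCrawlWorkflow.createFlow(curLoopDirPath, crawlDbPath, fetcherPolicy,
                        userAgent, null, options);
        flow.complete();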

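From RunFakeFetchPipe: a FetchPipe wired up with a FakeHttpFetcher and a FixedScoreGenerator, writing fetch status and content to a dual file sink.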
            if (path == null) {
                System.err.println("File not found on classpath: " + args[0]);
                System.exit(-1);
            }

            BixoPlatform platform = new BixoPlatform(RunFakeFetchPipe.class, Platform.Local);
           
            BasePath inputPath = platform.makePath(path.getFile());
            Tap in = platform.makeTap(platform.makeTextScheme(), inputPath);

            Pipe importPipe = new Each("url importer", new Fields("line"), new CreateUrlFunction());

            BaseScoreGenerator scorer = new FixedScoreGenerator();
            BaseFetcher fetcher = new FakeHttpFetcher(true, 10);
            FetchPipe fetchPipe = new FetchPipe(importPipe, scorer, fetcher, 1);

            // Create the output, which is a dual file sink tap.
            String output = "build/test/RunFakeFetchPipe/dual";
            BasePath outputPath = platform.makePath(output);
            BasePath statusPath = platform.makePath(outputPath, "status");
            Tap status = platform.makeTap(platform.makeTextScheme(), statusPath, SinkMode.REPLACE);

            BasePath contentPath = platform.makePath(outputPath, "content");
            Tap content = platform.makeTap(platform.makeTextScheme(), contentPath, SinkMode.REPLACE);
           
            // Finally we can run it.
            FlowConnector flowConnector = platform.makeFlowConnector();
            Flow flow = flowConnector.connect(in, FetchPipe.makeSinkMap(status, content), fetchPipe);
            flow.complete();
        } catch (Throwable t) {
            System.err.println("Exception running fake fetch pipe assembly: " + t.getMessage());
            t.printStackTrace(System.err);

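From ParsePipeTest: FetchedDatum inputs are built from the records of an ARC archive and then run through a ParsePipe backed by a SimpleParser.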
    @SuppressWarnings({ "unchecked", "rawtypes" })
    @Test
    public void testParserPipe() throws Exception {

        BixoPlatform platform = new BixoPlatform(ParsePipeTest.class, Platform.Local);
       

        Pipe pipe = new Pipe("parse_source");
        ParsePipe parserPipe = new ParsePipe(pipe, new SimpleParser());
        BasePath inputPath = platform.makePath("build/test/ParserPipeTest/in");
        Tap in = platform.makeTap(platform.makeBinaryScheme(FetchedDatum.FIELDS), inputPath);
        BasePath outputPath = platform.makePath("build/test/ParserPipeTest/out");
        Tap out = platform.makeTap(platform.makeBinaryScheme(ParsedDatum.FIELDS), outputPath, SinkMode.REPLACE);

        TupleEntryCollector write = in.openForWrite(platform.makeFlowProcess());

        ArchiveReader archiveReader = ArchiveReaderFactory.get("src/test/resources/someHtml.arc");
        Iterator<ArchiveRecord> iterator = archiveReader.iterator();
        int max = 300;
        int count = 0;
        int validRecords = 0;
        while (count++ < max && iterator.hasNext()) {
            ArchiveRecord archiveRecord = iterator.next();
            ArchiveRecordHeader header = archiveRecord.getHeader();
            String url = header.getUrl();

            String protocol = "";
            try {
                protocol = new URL(url).getProtocol();
            } catch (MalformedURLException e) {
                // Ignore and skip
            }

            if (protocol.equals("http")) {
                validRecords += 1;
                int contentOffset = header.getContentBegin();
                long totalLength = header.getLength();
                int contentLength = (int) totalLength - contentOffset;

                archiveRecord.skip(contentOffset);
                byte[] content = new byte[contentLength];
                archiveRecord.read(content);

                String mimetype = header.getMimetype();
                // The Arc headers != HTTP headers, but it's at least some data we can jam
                // into the FetchedDatum as a test. Note that the Arc header values aren't
                // all Strings, so we have to do the conversion.
                HttpHeaders headers = new HttpHeaders();
                Set<String> keys = header.getHeaderFieldKeys();
                for (String key : keys) {
                    String value = header.getHeaderValue(key).toString();
                    headers.add(key, value);
                }
               
                FetchedDatum contentTuple = new FetchedDatum(url, url, System.currentTimeMillis(), headers, new ContentBytes(content), mimetype, 0);
                write.add(contentTuple.getTuple());
            }
        }

        write.close();
        FlowConnector flowConnector = platform.makeFlowConnector();
        Flow flow = flowConnector.connect(in, out, parserPipe);
        flow.complete();
       
        // Currently many of the docs fail parsing:
        // http://webtools.uiuc.edu/calendar/RSS?calId=504

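From FetchPipeLocalTest: each test delegates to the shared AbstractFetchPipeTest implementation, passing in a local-mode BixoPlatform.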
public class FetchPipeLocalTest extends AbstractFetchPipeTest {

    @Test
    public void testHeadersInStatus() throws Exception {
        testHeadersInStatus(new BixoPlatform(FetchPipeLocalTest.class, Platform.Local));
    }

    @Test
    public void testFetchPipe() throws Exception {
        testFetchPipe(new BixoPlatform(FetchPipeLocalTest.class, Platform.Local));
    }

    @Test
    public void testRedirectException() throws Exception {
        testRedirectException(new BixoPlatform(FetchPipeLocalTest.class, Platform.Local));
    }

    @Test
    public void testTerminatingFetchPipe() throws Exception {
        testTerminatingFetchPipe(new BixoPlatform(FetchPipeLocalTest.class, Platform.Local));
    }

    @Test
    public void testPayloads() throws Exception {
        testPayloads(new BixoPlatform(FetchPipeLocalTest.class, Platform.Local));
    }
}
