Package bixo.config

Examples of bixo.config.BixoPlatform
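
The snippets below share a common setup pattern: construct a BixoPlatform for a tool or test class, create BasePath and Tap objects through it, then build and run a Cascading Flow. Here is a minimal sketch of that pattern, assuming Platform.Local; the MyTool class, the file paths, and the pass-through pipe are placeholders rather than code taken from any one example, and error handling is omitted.

        // A minimal sketch of the shared setup pattern (assumptions: Platform.Local,
        // placeholder class and paths, no error handling).
        BixoPlatform platform = new BixoPlatform(MyTool.class, Platform.Local);

        // Paths, schemes, and taps are created through the platform, so the same code can
        // run locally or on a cluster depending on the platform mode.
        BasePath inputPath = platform.makePath("src/test/resources/urls.txt");     // hypothetical input
        Tap sourceTap = platform.makeTap(platform.makeTextScheme(), inputPath);

        BasePath outputPath = platform.makePath("build/test/sketch-output");       // hypothetical output
        Tap sinkTap = platform.makeTap(platform.makeTextScheme(), outputPath, SinkMode.REPLACE);

        // A pass-through pipe; the examples below insert Each/Every operations or a FetchPipe here.
        Pipe pipe = new Pipe("copy");

        FlowConnector flowConnector = platform.makeFlowConnector();
        Flow flow = flowConnector.connect(sourceTap, sinkTap, pipe);
        flow.complete();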


        }

        String crawlDirName = options.getWorkingDir();

        try {
            BixoPlatform platform = new BixoPlatform(DemoStatusTool.class, options.getPlatformMode());
            BasePath crawlDirPath = platform.makePath(crawlDirName);

            platform.assertPathExists(crawlDirPath, "Prior crawl output directory does not exist");

            // Skip Hadoop/Cascading DEBUG messages.
            org.apache.log4j.Logger.getRootLogger().setLevel(Level.INFO);

            boolean exportDb = options.isExportDb();


        if (!logsDir.endsWith("/")) {
            logsDir = logsDir + "/";
        }
       
        try {
            BixoPlatform platform = new BixoPlatform(DemoCrawlTool.class, options.getPlatformMode());
            BasePath outputPath = platform.makePath(outputDirName);

            // First check if the user wants to clean the output directory
            if (options.isCleanOutputDir()) {
                if (outputPath.exists()) {
                    outputPath.delete(true);
                }
            }
           
            // If the user is starting from scratch, set up the output
            // directory and seed an initial crawl db.
            if (!outputPath.exists()) {
                outputPath.mkdirs();

                // Create a "0-<timestamp>" sub-directory with just a /crawldb subdir
                // In the /crawldb dir the input file will have a single URL for the target domain.

                BasePath curLoopDir = CrawlDirUtils.makeLoopDir(platform, outputPath, 0);
                String curLoopDirName = curLoopDir.getName();
                setLoopLoggerFile(logsDir + curLoopDirName, 0);
                BasePath crawlDbPath = platform.makePath(curLoopDir, CrawlConfig.CRAWLDB_SUBDIR_NAME);
               
                if (domain != null) {
                    importOneDomain(platform, domain, crawlDbPath);
                } else {
                    BasePath urlsPath = platform.makePath(urlsFile);
                    UrlImporter urlImporter = new UrlImporter(platform, urlsPath, crawlDbPath);
                    urlImporter.importUrls(false);
                }
            }
           
            BasePath latestDirPath = CrawlDirUtils.findLatestLoopDir(platform, outputPath);

            if (latestDirPath == null) {
                System.err.println("No previous cycle output dirs exist in " + outputDirName);
                printUsageAndExit(parser);
            }

            BasePath crawlDbPath = platform.makePath(latestDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

            // Set up the start and end loop counts.
            int startLoop = CrawlDirUtils.extractLoopNumber(latestDirPath);
            int endLoop = startLoop + options.getNumLoops();

            // Set up the UserAgent for the fetcher.
            UserAgent userAgent = new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS, CrawlConfig.WEB_ADDRESS);

            // You can also customize the FetcherPolicy.
            FetcherPolicy defaultPolicy = new FetcherPolicy();
            defaultPolicy.setCrawlDelay(CrawlConfig.DEFAULT_CRAWL_DELAY);
            defaultPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
            defaultPolicy.setFetcherMode(FetcherMode.EFFICIENT);
           
            // It is a good idea to set a crawl duration when running long crawls, since the fetch
            // can slow down due to a 'long tail' of slow sites; by specifying a crawl duration
            // you know exactly when the crawl will end.
            int crawlDurationInMinutes = options.getCrawlDuration();
            boolean hasEndTime = crawlDurationInMinutes != DemoCrawlToolOptions.NO_CRAWL_DURATION;
            long targetEndTime = hasEndTime ? System.currentTimeMillis() + (crawlDurationInMinutes * CrawlConfig.MILLISECONDS_PER_MINUTE) :
                FetcherPolicy.NO_CRAWL_END_TIME;

            // By setting up a URL filter we only deal with the URLs we want,
            // instead of all the URLs that we extract.
            BaseUrlFilter urlFilter = null;
            List<String> patterns = null;
            String regexUrlFiltersFile = options.getRegexUrlFiltersFile();
            if (regexUrlFiltersFile != null) {
                patterns = RegexUrlFilter.getUrlFilterPatterns(regexUrlFiltersFile);
            } else {
                patterns = RegexUrlFilter.getDefaultUrlFilterPatterns();
                if (domain != null) {
                    String domainPatterStr = "+(?i)^(http|https)://([a-z0-9]*\\.)*" + domain;
                    patterns.add(domainPatterStr);
                } else {
                    String protocolPatterStr = "+(?i)^(http|https)://*";
                    patterns.add(protocolPatterStr);
                    LOGGER.warn("Defaulting to basic url regex filtering (just suffix and protocol)");
                }
            }
            urlFilter = new RegexUrlFilter(patterns.toArray(new String[patterns.size()]));

            // OK, now we're ready to start looping, since we've got our current settings.
            for (int curLoop = startLoop + 1; curLoop <= endLoop; curLoop++) {

                // Adjust target end time, if appropriate.
                if (hasEndTime) {
                    int remainingLoops = (endLoop - curLoop) + 1;
                    long now = System.currentTimeMillis();
                    long perLoopTime = (targetEndTime - now) / remainingLoops;
                    defaultPolicy.setCrawlEndTime(now + perLoopTime);
                }

                BasePath curLoopDirPath = CrawlDirUtils.makeLoopDir(platform, outputPath, curLoop);
                String curLoopDirName = curLoopDirPath.getName();
                setLoopLoggerFile(logsDir + curLoopDirName, curLoop);

                Flow flow = DemoCrawlWorkflow.createFlow(curLoopDirPath, crawlDbPath, defaultPolicy, userAgent, urlFilter, options);
                flow.complete();
               
                // Writing out .dot files is a good way to verify your flows.
//              flow.writeDOT("build/valid-flow.dot");

                // Update crawlDbPath to point to the latest crawl db
                crawlDbPath = platform.makePath(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
            }
        } catch (Throwable t) {
            System.err.println("Exception running tool: " + t.getMessage());
            t.printStackTrace(System.err);
            System.exit(-1);

    }
       
    @SuppressWarnings("rawtypes")
    private String makeCrawlDb(String workingFolder, String input) throws Exception {

        BixoPlatform platform = new BixoPlatform(FetcherTest.class, Platform.Local);
       
        // We don't want to regenerate this DB all the time.
        BasePath workingPath = platform.makePath(workingFolder);
        BasePath crawlDBPath = platform.makePath(workingPath, URL_DB_NAME);
        if (!crawlDBPath.exists()) {
            Pipe importPipe = new Pipe("import URLs");
            importPipe = new Each(importPipe, new LoadUrlsFunction());
           
            BasePath inputPath = platform.makePath(input);
            Tap sourceTap = platform.makeTap(platform.makeTextScheme(), inputPath);
            Tap sinkTap = platform.makeTap(platform.makeBinaryScheme(UrlDatum.FIELDS), crawlDBPath, SinkMode.REPLACE);
           
            FlowConnector flowConnector = platform.makeFlowConnector();
            Flow flow = flowConnector.connect(sourceTap, sinkTap, importPipe);
            flow.complete();
        }

        return crawlDBPath.getAbsolutePath();

        System.setProperty("bixo.root.level", "TRACE");

        String workingFolder = "build/it/FetcherTest/testStaleConnection/working";
        String input = makeCrawlDb(workingFolder, "src/it/resources/apple-pages.txt");
       
        BixoPlatform platform = new BixoPlatform(FetcherTest.class, Platform.Local);
        BasePath inputPath = platform.makePath(input);

        Tap in = platform.makeTap(platform.makeBinaryScheme(UrlDatum.FIELDS), inputPath);
        String outputDir = "build/it/FetcherTest/testStaleConnection/out";
        BasePath outputPath = platform.makePath(outputDir);

        BasePath contentPath = platform.makePath(outputPath, "content");
        Tap content = platform.makeTap(platform.makeBinaryScheme(FetchedDatum.FIELDS), contentPath, SinkMode.REPLACE);

        BasePath statusPath = platform.makePath(outputPath, "status");
        Tap status = platform.makeTap(platform.makeBinaryScheme(StatusDatum.FIELDS), statusPath, SinkMode.REPLACE);

       
        Pipe pipe = new Pipe("urlSource");

        UserAgent userAgent = new FirefoxUserAgent();
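        // A policy of one request per connection plus a 5-second crawl delay presumably lets
        // keep-alive connections sit idle long enough to go stale, matching the
        // testStaleConnection paths above.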
        FetcherPolicy fetcherPolicy = new FetcherPolicy();
        fetcherPolicy.setMaxRequestsPerConnection(1);
        fetcherPolicy.setCrawlDelay(5 * 1000L);
        BaseFetcher fetcher = new SimpleHttpFetcher(2, fetcherPolicy, userAgent);
        BaseScoreGenerator scorer = new FixedScoreGenerator();
        FetchPipe fetchPipe = new FetchPipe(pipe, scorer, fetcher, 1);

        FlowConnector flowConnector = platform.makeFlowConnector();
        Flow flow = flowConnector.connect(in, FetchPipe.makeSinkMap(status, content), fetchPipe);
        flow.complete();
       
        // Test for all valid fetches.
        Tap validate = platform.makeTap(platform.makeBinaryScheme(StatusDatum.FIELDS), statusPath);
        TupleEntryIterator tupleEntryIterator = validate.openForRead(platform.makeFlowProcess());
        while (tupleEntryIterator.hasNext()) {
            TupleEntry entry = tupleEntryIterator.next();
            StatusDatum sd = new StatusDatum(entry);
            if (sd.getStatus() != UrlStatus.FETCHED) {
                LOGGER.error(String.format("Fetch failed! Status is %s for %s", sd.getStatus(), sd.getUrl()));

    public void testRunFetcher() throws Exception {
        System.setProperty("bixo.root.level", "TRACE");
       
        String workingFolder = "build/test-it/FetcherTest/testRunFetcher";
        String input = makeCrawlDb(workingFolder, "src/it/resources/top10urls.txt");
        BixoPlatform platform = new BixoPlatform(FetcherTest.class, Platform.Local);
        BasePath workingPath = platform.makePath(workingFolder);
        BasePath inputPath = platform.makePath(input);
        Tap in = platform.makeTap(platform.makeBinaryScheme(UrlDatum.FIELDS), inputPath);
       
        BasePath contentPath = platform.makePath(workingPath, "content");
        Tap content = platform.makeTap(platform.makeBinaryScheme(FetchedDatum.FIELDS), contentPath, SinkMode.REPLACE);
        BasePath statusPath = platform.makePath(workingPath, "status");
        Tap status = platform.makeTap(platform.makeTextScheme(), statusPath, SinkMode.REPLACE);

       
        Pipe pipe = new Pipe("urlSource");

        UserAgent userAgent = new FirefoxUserAgent();
        BaseFetcher fetcher = new SimpleHttpFetcher(10, userAgent);
        BaseScoreGenerator scorer = new FixedScoreGenerator();
        FetchPipe fetchPipe = new FetchPipe(pipe, scorer, fetcher, 1);

        FlowConnector flowConnector = platform.makeFlowConnector();

        Flow flow = flowConnector.connect(in, FetchPipe.makeSinkMap(status, content), fetchPipe);
        flow.complete();
       
        // Test for 10 good fetches.
        Tap validate = platform.makeTap(platform.makeBinaryScheme(FetchedDatum.FIELDS), contentPath);
        TupleEntryIterator tupleEntryIterator = validate.openForRead(platform.makeFlowProcess());
        int fetchedPages = 0;
        while (tupleEntryIterator.hasNext()) {
            TupleEntry entry = tupleEntryIterator.next();
            new FetchedDatum(entry);
            fetchedPages += 1;

    BasePlatform _platform;
    BasePath _outputPath;
   
    @Before
    public void setUp() throws Exception {
        _platform = new BixoPlatform(CrawlDirUtilsTest.class, Platform.Local);
        _outputPath = _platform.makePath("./build/CrawlDirUtilsTest");
        _outputPath.mkdirs();
    }

        String inputFileName = options.getInputFile();
        String outputDirName = options.getOutputDir();

        try {
            BixoPlatform platform = new BixoPlatform(AnalyzeEmail.class, options.getPlatformMode());
            // Create the input (source tap), which is just a text file reader
            BasePath inputPath = platform.makePath(inputFileName);
            Tap sourceTap = platform.makeTap(platform.makeTextScheme(), inputPath);
           
            // Create the sub-assembly that runs the fetch job
            UserAgent userAgent = new UserAgent(options.getAgentName(), EMAIL_ADDRESS, WEB_ADDRESS);
            Pipe importPipe = new Each("url importer", new Fields("line"), new LoadUrlFunction());
           
            BaseScoreGenerator scorer = new FixedScoreGenerator();
           
            BaseFetcher fetcher = new SimpleHttpFetcher(MAX_THREADS, userAgent);
            FetchPipe fetchPagePipe = new FetchPipe(importPipe, scorer, fetcher, NUM_REDUCERS);
           
            // Here's the pipe that will output UrlDatum tuples, by extracting URLs from the mod_mbox-generated page.
            Pipe mboxPagePipe = new Each(fetchPagePipe.getContentTailPipe(), new ParseModMboxPageFunction(), Fields.RESULTS);

            // Create a named pipe for the status of the mod_mbox-generated pages.
            Pipe mboxPageStatusPipe = new Pipe(MBOX_PAGE_STATUS_PIPE_NAME, fetchPagePipe.getStatusTailPipe());

            // Set up appropriate FetcherPolicy, where we increase the max content size (since mailbox files
            // can be big, e.g. 4MB).
            FetcherPolicy defaultPolicy = new FetcherPolicy();
            defaultPolicy.setMaxContentSize(MAX_CONTENT_SIZE);
            fetcher = new SimpleHttpFetcher(MAX_THREADS, defaultPolicy, userAgent);
           
            // We can create the fetch pipe, and set up our Mbox splitter to run on content.
            FetchPipe fetchMboxPipe = new FetchPipe(mboxPagePipe, scorer, fetcher, NUM_REDUCERS);
            SplitEmails splitterPipe = new SplitEmails(fetchMboxPipe);
           
            // Now create the pipe that's going to analyze the emails we get after splitting them up.
            Pipe analysisPipe = new Pipe(ANALYZER_PIPE_NAME, splitterPipe.getTails()[0]);
            analysisPipe = new Each(analysisPipe, new ParseEmailFunction());
           
            // We'll get output that has ANALYZED_EMAIL_FIELDS in it. We want to group by
            // the message-id field, and then do an aggregation on that of the scores.
            analysisPipe = new GroupBy(analysisPipe, new Fields(FieldNames.MESSAGE_ID));
            analysisPipe = new Every(analysisPipe, new CalcMessageScoreBuffer(), Fields.RESULTS);

            // Now we want to sum the scores for each user, which is another grouping/summing.
            analysisPipe = new GroupBy(analysisPipe, new Fields(FieldNames.EMAIL_ADDRESS));
            analysisPipe = new Every(analysisPipe, new SumScoresBuffer(), Fields.RESULTS);
           
            // Let's filter out anybody with an uninteresting score.
            ExpressionFilter filter = new ExpressionFilter(String.format("%s <= 0.0", FieldNames.SUMMED_SCORE), Double.class);
            analysisPipe = new Each(analysisPipe, filter);
           
            // And let's sort in reverse order (high to low score)
            analysisPipe = new GroupBy(analysisPipe, new Fields(FieldNames.SUMMED_SCORE), true);

            // Create the sink taps
            BasePath outputPath = platform.makePath(outputDirName);
            Tap pageStatusSinkTap = platform.makeTap(platform.makeTextScheme(),
                            platform.makePath(outputPath, "page-status"), SinkMode.REPLACE);
            Tap mboxStatusSinkTap = platform.makeTap(platform.makeTextScheme(),
                            platform.makePath(outputPath, "mbox-status"), SinkMode.REPLACE);
            Tap contentSinkTap = platform.makeTap(platform.makeBinaryScheme(FetchedDatum.FIELDS),
                            platform.makePath(outputPath, "content"), SinkMode.REPLACE);
            Tap analyzerSinkTap = platform.makeTap(platform.makeTextScheme(),
                            platform.makePath(outputPath, "analysis"), SinkMode.REPLACE);

            HashMap<String, Tap> sinkTapMap = new HashMap<String, Tap>(2);
            sinkTapMap.put(MBOX_PAGE_STATUS_PIPE_NAME, pageStatusSinkTap);
            sinkTapMap.put(FetchPipe.STATUS_PIPE_NAME, mboxStatusSinkTap);
            sinkTapMap.put(SPLITTER_PIPE_NAME, contentSinkTap);
            sinkTapMap.put(ANALYZER_PIPE_NAME, analyzerSinkTap);
           
            LOGGER.info("Running fetch job with " + options);

            // Finally we can run it.
            FlowConnector flowConnector = platform.makeFlowConnector();
            Flow flow = flowConnector.connect(sourceTap, sinkTapMap, splitterPipe, mboxPageStatusPipe, analysisPipe);
            flow.writeDOT("build/goodFlow.dot");
            flow.complete();
        } catch (Throwable t) {
            System.err.println("Exception running AnalyzeEmail: " + t.getMessage());

    @SuppressWarnings({ "unchecked", "rawtypes" })
    @Test
    public void testNotLosingFetchedUrls() throws Throwable {
        String baseDirName = "build/test/DemoCrawlWorkflowLRTest/output";
       
        BixoPlatform platform = new BixoPlatform(DemoCrawlWorkflowLRTest.class, Platform.Local);
       
        BasePath baseDirPath = platform.makePath(baseDirName);
        baseDirPath.delete(true);
        BasePath curLoopDirPath = CrawlDirUtils.makeLoopDir(platform, baseDirPath, 0);
        BasePath crawlDbPath = platform.makePath(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

        DemoCrawlTool.importOneDomain(platform, "localhost:8089", crawlDbPath);
        curLoopDirPath = CrawlDirUtils.makeLoopDir(platform, baseDirPath, 1);

        FetcherPolicy defaultPolicy = new FetcherPolicy();
        defaultPolicy.setCrawlDelay(1);
        defaultPolicy.setFetcherMode(FetcherMode.COMPLETE);
        BaseUrlFilter urlFilter = new BaseUrlFilter() {

            @Override
            public boolean isRemove(UrlDatum datum) {
                return false;
            }
        };

        DemoCrawlToolOptions options = new DemoCrawlToolOptions();
        options.setUseBoilerpipe(true);
        options.setLocalPlatformMode(true);
        UserAgent userAgent = new UserAgent("test", "test@domain.com", "http://test.domain.com");
        Server server = null;
        try {
            server = startServer(new FakeWebSiteHandler(), 8089);
            Flow flow = DemoCrawlWorkflow.createFlow(curLoopDirPath, crawlDbPath, defaultPolicy, userAgent, urlFilter, options);
            flow.complete();

            // Update the crawlDb path
            crawlDbPath = platform.makePath(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

            // Now we should have an output/1-<timestamp>/ directory, where the /urls dir
            // has 11 entries: one previously crawled, and the other 10 pending.

            Tap crawldbTap = platform.makeTap(platform.makeBinaryScheme(CrawlDbDatum.FIELDS), crawlDbPath);
            TupleEntryIterator iter = crawldbTap.openForRead(platform.makeFlowProcess());

            int numFetched = 0;
            int numPending = 0;
            while (iter.hasNext()) {
                CrawlDbDatum datum = new CrawlDbDatum(iter.next());
                UrlStatus status = datum.getLastStatus();
                int crawlDepth = datum.getCrawlDepth();
                if (datum.getLastFetched() != 0) {
                    numFetched += 1;

                    assertEquals(UrlStatus.FETCHED, status);
                    assertEquals(0, crawlDepth);
                } else {
                    numPending += 1;
                    assertEquals(UrlStatus.UNFETCHED, status);
                    assertEquals(1, crawlDepth);
                }
            }

            assertEquals(1, numFetched);
            assertEquals(10, numPending);

            // Do it one more time, to verify status gets propagated forward.
            curLoopDirPath = CrawlDirUtils.makeLoopDir(platform, baseDirPath, 2);

            flow = DemoCrawlWorkflow.createFlow(curLoopDirPath, crawlDbPath, defaultPolicy, userAgent, urlFilter, options);
            flow.complete();
            // Update crawldb path
            crawlDbPath = platform.makePath(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

            crawldbTap = platform.makeTap(platform.makeBinaryScheme(CrawlDbDatum.FIELDS), crawlDbPath);
            iter = crawldbTap.openForRead(platform.makeFlowProcess());

            numFetched = 0;
            numPending = 0;
            int numDepth0 = 0;
            int numDepth1 = 0;

       
        long fetchTime = System.currentTimeMillis();
        PartitioningKey groupingKey = new PartitioningKey("key", 1);
        FetchSetDatum pfd = new FetchSetDatum(urls, fetchTime, 1000, groupingKey.getValue(), groupingKey.getRef());
       
        BixoPlatform platform = new BixoPlatform(ScoredUrlDatumTest.class, platformMode);
        BasePath path = platform.makePath("build/test/ScoredUrlDatumTest/testCascadingSerialization/in");
        Tap in = platform.makeTap(platform.makeBinaryScheme(FetchSetDatum.FIELDS), path, SinkMode.REPLACE);
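        // Writing the datum's tuple through this binary-scheme tap appears to be what exercises
        // Cascading serialization here (per the testCascadingSerialization path above).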
        TupleEntryCollector write = in.openForWrite(platform.makeFlowProcess());
        write.add(pfd.getTuple());
        write.close();
    }

        DemoWebMiningOptions options = new DemoWebMiningOptions();
        options.setWorkingDir(WORKING_DIR);
        options.setAgentName("test-agent");
        options.setLocalPlatformMode(true);
       
        BixoPlatform platform = new BixoPlatform(DemoWebMiningWorkflowTest.class, options.getPlatformMode());
        BasePath workingDirPath = platform.makePath(WORKING_DIR);
        DemoWebMiningTool.setupWorkingDir(platform, workingDirPath, "/test-seedurls.txt");
       
        BasePath latestDirPath = CrawlDirUtils.findLatestLoopDir(platform, workingDirPath);
        BasePath crawlDbPath = platform.makePath(latestDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
       
        FetcherPolicy fetcherPolicy = new FetcherPolicy();
        fetcherPolicy.setCrawlDelay(CrawlConfig.DEFAULT_CRAWL_DELAY);
        fetcherPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
        fetcherPolicy.setFetcherMode(FetcherMode.EFFICIENT);
        Set<String> validMimeTypes = new HashSet<String>();
        validMimeTypes.add("text/plain");
        validMimeTypes.add("text/html");
        fetcherPolicy.setValidMimeTypes(validMimeTypes);

        UserAgent userAgent = new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS, CrawlConfig.WEB_ADDRESS);

        Server server = null;
        try {
            server = startServer(new DirectoryResponseHandler("src/test/resources/test-pages"), 8089);
           
            BasePath curLoopDirPath = CrawlDirUtils.makeLoopDir(platform, workingDirPath, 1);

            Flow flow = DemoWebMiningWorkflow.createWebMiningWorkflow(platform, crawlDbPath, curLoopDirPath, fetcherPolicy, userAgent, options);
            flow.complete();
       
            // validate
            BasePath statusPath = platform.makePath(curLoopDirPath, CrawlConfig.STATUS_SUBDIR_NAME);
            validateEntryCount(platform, statusPath, null, 1, "status", true);
   
            BasePath contentPath = platform.makePath(curLoopDirPath, CrawlConfig.CONTENT_SUBDIR_NAME);
            validateEntryCount(platform, contentPath, FetchedDatum.FIELDS, 1, "content", false);

            crawlDbPath = platform.makePath(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
            validateEntryCount(platform, crawlDbPath, null, 3, "crawldb", true);
           
            // run the second loop
            curLoopDirPath =  CrawlDirUtils.makeLoopDir(platform, workingDirPath, 2);
            flow = DemoWebMiningWorkflow.createWebMiningWorkflow(platform, crawlDbPath, curLoopDirPath, fetcherPolicy, userAgent, options);
            flow.complete();
           
            // validate
            statusPath = platform.makePath(curLoopDirPath, CrawlConfig.STATUS_SUBDIR_NAME);
            validateEntryCount(platform, statusPath, null, 2, "status", true);
   
            contentPath = platform.makePath(curLoopDirPath, CrawlConfig.CONTENT_SUBDIR_NAME);
            validateEntryCount(platform, contentPath, FetchedDatum.FIELDS, 2, "content", false);

            crawlDbPath = platform.makePath(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
            validateEntryCount(platform, crawlDbPath, null, 8, "crawldb", true);
            assertTrue(validatePageScores(platform, crawlDbPath));
           
            BasePath resultsPath = platform.makePath(curLoopDirPath, CrawlConfig.RESULTS_SUBDIR_NAME);
            validateEntryCount(platform, resultsPath, null, 3, "page results", true);
        } finally {
            if (server != null) {
                server.stop();
            }
