Examples of TupleEntryIterator


Examples of cascading.tuple.TupleEntryIterator

  @SuppressWarnings({ "unchecked", "rawtypes" })
    private static void processStatus(BasePlatform platform, BasePath curDirPath) throws Exception {
        BasePath statusPath = platform.makePath(curDirPath, CrawlConfig.STATUS_SUBDIR_NAME);
        Tap statusTap = platform.makeTap(platform.makeTextScheme(), statusPath);
       
        TupleEntryIterator iter = statusTap.openForRead(platform.makeFlowProcess());
       
        LOGGER.info("Analyzing: " +  CrawlConfig.STATUS_SUBDIR_NAME);
        UrlStatus[] statusValues = UrlStatus.values();
        int[] statusCounts = new int[statusValues.length];
        int totalEntries = 0;
        while (iter.hasNext()) {
            TupleEntry entry = iter.next();
            totalEntries += 1;
   
            String statusLine = entry.getString("line");
            String[] pieces = statusLine.split("\t");
            int pos = StatusDatum.FIELDS.getPos(StatusDatum.STATUS_FN);
View Full Code Here

Examples of cascading.tuple.TupleEntryIterator

        LOGGER.info("");
    }

    @SuppressWarnings({ "rawtypes", "unchecked" })
    private static void processCrawlDb(BixoPlatform platform, BasePath latestCrawlDirPath, boolean exportDb) throws Exception {
        TupleEntryIterator iter;
        int totalEntries;
        BasePath crawlDbPath = platform.makePath(latestCrawlDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
        Tap crawldbTap = platform.makeTap(platform.makeBinaryScheme(CrawlDbDatum.FIELDS), crawlDbPath);
        iter = crawldbTap.openForRead(platform.makeFlowProcess());
        totalEntries = 0;
        int fetchedUrls = 0;
        int unfetchedUrls = 0;
        LOGGER.info("Analyzing: " +  CrawlConfig.CRAWLDB_SUBDIR_NAME);

        while (iter.hasNext()) {
            TupleEntry entry = iter.next();
            totalEntries += 1;
           
            CrawlDbDatum datum = new CrawlDbDatum(entry);
            if (exportDb) {
                LOGGER.info(datum.toString());
View Full Code Here

Examples of cascading.tuple.TupleEntryIterator

        flow.complete();
       
        // verify that the resulting pipe has the latest tuple
       
        Tap testSink = platform.makeTap(platform.makeBinaryScheme(UrlDatum.FIELDS), resultsPath);
        TupleEntryIterator reader = testSink.openForRead(platform.makeFlowProcess());
        int count = 0;
        long latest = 0;
        while (reader.hasNext()) {
            TupleEntry next = reader.next();
            UrlDatum datum = new UrlDatum(next);
            latest = (Long) datum.getPayloadValue(CrawlDbDatum.LAST_FETCHED_FIELD);
            count++;
        }
       
View Full Code Here

Examples of cascading.tuple.TupleEntryIterator

        Flow flow = flowConnector.connect(in, FetchPipe.makeSinkMap(status, content), fetchPipe);
        flow.complete();
       
        // Test for all valid fetches.
        Tap validate = platform.makeTap(platform.makeBinaryScheme(StatusDatum.FIELDS), statusPath);
        TupleEntryIterator tupleEntryIterator = validate.openForRead(platform.makeFlowProcess());
        while (tupleEntryIterator.hasNext()) {
            TupleEntry entry = tupleEntryIterator.next();
            StatusDatum sd = new StatusDatum(entry);
            if (sd.getStatus() != UrlStatus.FETCHED) {
                LOGGER.error(String.format("Fetched failed! Status is %s for %s", sd.getStatus(), sd.getUrl()));
                BaseFetchException e = sd.getException();
                if (e != null) {
View Full Code Here

Examples of cascading.tuple.TupleEntryIterator

        Flow flow = flowConnector.connect(in, FetchPipe.makeSinkMap(status, content), fetchPipe);
        flow.complete();
       
        // Test for 10 good fetches.
        Tap validate = platform.makeTap(platform.makeBinaryScheme(FetchedDatum.FIELDS), contentPath);
        TupleEntryIterator tupleEntryIterator = validate.openForRead(platform.makeFlowProcess());
        int fetchedPages = 0;
        while (tupleEntryIterator.hasNext()) {
            TupleEntry entry = tupleEntryIterator.next();
            new FetchedDatum(entry);
            fetchedPages += 1;
        }

        Assert.assertEquals(10, fetchedPages);
View Full Code Here

Examples of cascading.tuple.TupleEntryIterator

            // Now we should have an output/1-<timestamp>/ directory, where the
            // /urls dir has 11 entries with
            // one being previously crawled, and the other 10 being pending.

            Tap crawldbTap = platform.makeTap(platform.makeBinaryScheme(CrawlDbDatum.FIELDS), crawlDbPath);
            TupleEntryIterator iter = crawldbTap.openForRead(platform.makeFlowProcess());

            int numFetched = 0;
            int numPending = 0;
            while (iter.hasNext()) {
                CrawlDbDatum datum = new CrawlDbDatum(iter.next());
                UrlStatus status = datum.getLastStatus();
                int crawlDepth = datum.getCrawlDepth();
                if (datum.getLastFetched() != 0) {
                    numFetched += 1;

                    assertEquals(UrlStatus.FETCHED, status);
                    assertEquals(0, crawlDepth);
                } else {
                    numPending += 1;
                    assertEquals(UrlStatus.UNFETCHED, status);
                    assertEquals(1, crawlDepth);
                }
            }

            assertEquals(1, numFetched);
            assertEquals(10, numPending);

            // Do it one more time, to verify status gets propagated forward.
            curLoopDirPath = CrawlDirUtils.makeLoopDir(platform, baseDirPath, 2);

            flow = DemoCrawlWorkflow.createFlow(curLoopDirPath, crawlDbPath, defaultPolicy, userAgent, urlFilter, options);
            flow.complete();
            // Update crawldb path
            crawlDbPath = platform.makePath(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

            crawldbTap = platform.makeTap(platform.makeBinaryScheme(CrawlDbDatum.FIELDS), crawlDbPath);
            iter = crawldbTap.openForRead(platform.makeFlowProcess());

            numFetched = 0;
            numPending = 0;
            int numDepth0 = 0;
            int numDepth1 = 0;
            int numDepth2 = 0;
            while (iter.hasNext()) {
                CrawlDbDatum datum = new CrawlDbDatum(iter.next());
                UrlStatus status = datum.getLastStatus();
                int depth = datum.getCrawlDepth();

                if (datum.getLastFetched() != 0) {
                    numFetched += 1;
View Full Code Here

Examples of cascading.tuple.TupleEntryIterator

            sourceTap = platform.makeTap(platform.makeTextScheme(), dataPath);
        } else {
            sourceTap = platform.makeTap(platform.makeBinaryScheme(fields), dataPath);
        }
       
        TupleEntryIterator tupleEntryIterator = sourceTap.openForRead(platform.makeFlowProcess());
        int numEntries = 0;
        while (tupleEntryIterator.hasNext()) {
          tupleEntryIterator.next();
          numEntries++;
        }
       
        tupleEntryIterator.close();
        assertEquals(msgStr, expected, numEntries);
    }
View Full Code Here

Examples of cascading.tuple.TupleEntryIterator

    @SuppressWarnings({ "unchecked", "rawtypes" })
    private boolean validatePageScores(BasePlatform platform, BasePath  dataPath) throws Exception {
        boolean allOK = false;
        int verifiedCnt = 0;
        Tap sourceTap = platform.makeTap(platform.makeTextScheme(), dataPath);
        TupleEntryIterator tupleEntryIterator = sourceTap.openForRead(platform.makeFlowProcess());
        while (tupleEntryIterator.hasNext()) {
          TupleEntry next = tupleEntryIterator.next();
          String line = next.getString("line");
          String[] split = line.split("\t");
          if (split[0].equals(PAGE1_URL)) {
              allOK |= split[3].equals(PAGE1_SCORE);
              verifiedCnt++;
          } else if (split[0].equals(PAGE2_URL)) {
              allOK |= split[3].equals(PAGE2_SCORE);
              verifiedCnt++;
          }
        }
       
        tupleEntryIterator.close();
        return allOK && (verifiedCnt==2);
     }
View Full Code Here

Examples of cascading.tuple.TupleEntryIterator

        Flow flow = flowConnector.connect(flowDef);
        flow.writeDOT("build/test/FetchPipeLRTest/testHeadersInStatus/flow.dot");
        flow.complete();
       
        Tap validate = platform.makeTap(platform.makeBinaryScheme(StatusDatum.FIELDS), statusPath);
        TupleEntryIterator tupleEntryIterator = validate.openForRead(platform.makeFlowProcess());
        Assert.assertTrue(tupleEntryIterator.hasNext());
        StatusDatum sd = new StatusDatum(tupleEntryIterator.next());
        Assert.assertEquals(UrlStatus.FETCHED, sd.getStatus());
        HttpHeaders headers = sd.getHeaders();
        Assert.assertNotNull(headers);
        Assert.assertTrue(headers.getNames().size() > 0);
    }
View Full Code Here

Examples of cascading.tuple.TupleEntryIterator

            webServer.stop();
        }
       
        // Verify numPages fetched and numPages status entries were saved.
        Tap validate = platform.makeTap(platform.makeBinaryScheme(FetchedDatum.FIELDS), contentPath);
        TupleEntryIterator tupleEntryIterator = validate.openForRead(platform.makeFlowProcess());
       
        int totalEntries = 0;
        boolean[] fetchedPages = new boolean[numPages];
        while (tupleEntryIterator.hasNext()) {
            TupleEntry entry = tupleEntryIterator.next();
            totalEntries += 1;

            // Verify we can convert properly
            FetchedDatum datum = new FetchedDatum(entry);
            String url = datum.getUrl();
            Assert.assertNotNull(url);
           
            // Verify that we got one of each page
            int idOffset = url.indexOf(".html") - 1;
            int pageId = Integer.parseInt(url.substring(idOffset, idOffset + 1));
            Assert.assertFalse(fetchedPages[pageId]);
            fetchedPages[pageId] = true;
        }
       
        Assert.assertEquals(numPages, totalEntries);
        tupleEntryIterator.close();
       
        validate = platform.makeTap(platform.makeBinaryScheme(StatusDatum.FIELDS), statusPath);
        tupleEntryIterator = validate.openForRead(platform.makeFlowProcess());
        totalEntries = 0;
        fetchedPages = new boolean[numPages];
        while (tupleEntryIterator.hasNext()) {
            TupleEntry entry = tupleEntryIterator.next();
            totalEntries += 1;

            // Verify we can convert properly
            StatusDatum sd = new StatusDatum(entry);
            Assert.assertEquals(UrlStatus.FETCHED, sd.getStatus());
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.