Examples of TupleEntryCollector


Examples of cascading.tuple.TupleEntryCollector

    @SuppressWarnings({ "rawtypes", "unchecked" })
    public static void importOneDomain(BasePlatform platform, String targetDomain, BasePath crawlDbPath) throws Exception {
       
        try {
            Tap urlSink = platform.makeTap(platform.makeBinaryScheme(CrawlDbDatum.FIELDS), crawlDbPath, SinkMode.REPLACE);
            TupleEntryCollector writer = urlSink.openForWrite(platform.makeFlowProcess());
            SimpleUrlNormalizer normalizer = new SimpleUrlNormalizer();

            CrawlDbDatum datum = new CrawlDbDatum(normalizer.normalize("http://" + targetDomain), 0, 0, UrlStatus.UNFETCHED, 0);

            writer.add(datum.getTuple());
            writer.close();
        } catch (Exception e) {
            throw e;
        }
    }
View Full Code Here

Examples of cascading.tuple.TupleEntryCollector

    }
   
    @SuppressWarnings({ "unchecked", "rawtypes" })
    private void createDataFile(BasePlatform platform, BasePath filePath, List<UrlDatum> datums) throws Exception {
        Tap urlSink = platform.makeTap(platform.makeBinaryScheme(UrlDatum.FIELDS), filePath, SinkMode.REPLACE);
        TupleEntryCollector writer = urlSink.openForWrite(platform.makeFlowProcess());
        for (UrlDatum datum : datums) {
            writer.add(datum.getTuple());
        }
        writer.close();
    }
View Full Code Here

Examples of cascading.tuple.TupleEntryCollector

        // Bump the crawl depth value only on a successful parse
        int crawlDepth = (Integer) datum.getPayloadValue(CrawlDbDatum.CRAWL_DEPTH);
        datum.setPayloadValue(CrawlDbDatum.CRAWL_DEPTH, crawlDepth + 1);

        TupleEntryCollector collector = funcCall.getOutputCollector();

        for (Outlink outlink : outlinks) {
            String url = outlink.getToUrl();
            url = url.replaceAll("[\n\r]", "");
            url = _normalizer.normalize(url);
            if (_validator.isValid(url)) {
                UrlDatum urlDatum = new UrlDatum(url);
                urlDatum.setPayload(datum.getPayload());
                collector.add(urlDatum.getTuple());
            }
        }
    }
View Full Code Here

Examples of cascading.tuple.TupleEntryCollector

        FetchSetDatum pfd = new FetchSetDatum(urls, fetchTime, 1000, groupingKey.getValue(), groupingKey.getRef());
       
        BixoPlatform platform = new BixoPlatform(ScoredUrlDatumTest.class, platformMode);
        BasePath path = platform.makePath("build/test/ScoredUrlDatumTest/testCascadingSerialization/in");
        Tap in = platform.makeTap(platform.makeBinaryScheme(FetchSetDatum.FIELDS), path, SinkMode.REPLACE);
        TupleEntryCollector write = in.openForWrite(platform.makeFlowProcess());
        write.add(pfd.getTuple());
        write.close();
    }
View Full Code Here

Examples of cascading.tuple.TupleEntryCollector

   
    private Tap makeInputData(BasePlatform platform, String testname, int numDomains, int numPages, Payload payload) throws Exception {
        String platformName = platform.getClass().getSimpleName();
        BasePath defaultPath = platform.makePath(BASE_INPUT_PATH + testname + "/" + platformName + "/in");
        Tap in = platform.makeTap(platform.makeBinaryScheme(UrlDatum.FIELDS), defaultPath, SinkMode.REPLACE);
        TupleEntryCollector write = in.openForWrite(platform.makeFlowProcess());
        for (int i = 0; i < numDomains; i++) {
            for (int j = 0; j < numPages; j++) {
                // Use special domain name pattern so code deep inside of operations "knows" not
                // to try to resolve host names to IP addresses.
                write.add(makeTuple("bixo-test-domain-" + i + ".com", j, payload));
            }
        }
       
        write.close();
        return in;
    }
View Full Code Here

Examples of cascading.tuple.TupleEntryCollector

   
    private Tap makeInputData(BasePlatform platform, String testname, String domain, int numPages, Payload payload) throws Exception {
        String platformName = platform.getClass().getSimpleName();
        BasePath defaultPath = platform.makePath(BASE_INPUT_PATH + testname + "/" + platformName + "/in");
        Tap in = platform.makeTap(platform.makeBinaryScheme(UrlDatum.FIELDS), defaultPath, SinkMode.REPLACE);
        TupleEntryCollector write = in.openForWrite(platform.makeFlowProcess());
        for (int j = 0; j < numPages; j++) {
            write.add(makeTuple(domain, j, payload));
        }

        write.close();
        return in;
    }
View Full Code Here

Examples of cascading.tuple.TupleEntryCollector

    public void operate(FlowProcess process, FunctionCall<NullContext> funcCall) {
       
        AnalyzedDatum datum = new AnalyzedDatum(funcCall.getArguments().getTuple());
        Outlink outlinks[] = datum.getOutlinks();

        TupleEntryCollector collector = funcCall.getOutputCollector();

        if (outlinks.length > 0) {
            float pageScore = datum.getPageScore();
           
            // Give each outlink 1/N th the page score.
            // Note : Ideally you would deal with duplicates and also ensure that the
            // source url is excluded.
            float outlinkScore = pageScore/outlinks.length;
   
            for (Outlink outlink : outlinks) {
                String url = outlink.getToUrl().trim();
                url = url.replaceAll("[\n\r]", "");
                String normalizedUrl = _normalizer.normalize(url);
                if (_validator.isValid(normalizedUrl)) {
                    LinkDatum linkDatum = new LinkDatum(normalizedUrl, 0, outlinkScore);
                    collector.add(linkDatum.getTuple());
                }
            }
        }
    }
View Full Code Here

Examples of cascading.tuple.TupleEntryCollector

    @Override
    public void operate(FlowProcess process, FunctionCall<NullContext> funcCall) {
       
        AnalyzedDatum datum = new AnalyzedDatum(funcCall.getArguments().getTuple());

        TupleEntryCollector collector = funcCall.getOutputCollector();
        PageResult[] pageResults = datum.getPageResults();
        if (pageResults.length > 0) {
            for (PageResult pageResult : pageResults) {
                String outResult = String.format("%s\t%s\t%s", pageResult.getSourceUrl(), pageResult.getImageUrl(), pageResult.getDescription());
                collector.add(new Tuple(outResult));
            }
        }
    }
View Full Code Here

Examples of cascading.tuple.TupleEntryCollector

        FlowProcess fp = Mockito.mock(NullFlowProcess.class);
       
        OperationCall<NullContext> oc = Mockito.mock(OperationCall.class);
        BufferCall<NullContext> bc = Mockito.mock(BufferCall.class);
       
        TupleEntryCollector collector = Mockito.mock(TupleEntryCollector.class);
       
        Mockito.when(bc.getGroup()).thenReturn(new TupleEntry(new Tuple("http://localhost:8089")));
        Mockito.when(bc.getArgumentsIterator()).thenReturn(getGroupedurlDatumList("http://localhost:8089").iterator());
        Mockito.when(bc.getOutputCollector()).thenReturn(collector);
       
View Full Code Here

Examples of cascading.tuple.TupleEntryCollector

        FlowProcess fp = Mockito.mock(NullFlowProcess.class);
       
        OperationCall<NullContext> oc = Mockito.mock(OperationCall.class);
        BufferCall<NullContext> bc = Mockito.mock(BufferCall.class);
       
        TupleEntryCollector collector = Mockito.mock(TupleEntryCollector.class);
       
        Mockito.when(bc.getGroup()).thenReturn(new TupleEntry(new Tuple("http://localhost:8089")));
        Mockito.when(bc.getArgumentsIterator()).thenReturn(getGroupedurlDatumList("http://localhost:8089").iterator());
        Mockito.when(bc.getOutputCollector()).thenReturn(collector);
       
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.