Examples of bixo.datum.UrlDatum

bixo.datum.UrlDatum

                    // have a fetchTime of 0, so in order to preserve say a SKIPPED status
                    // we set the fetch time here.


        _numCreated += 1;


        UrlDatum urlDatum = new UrlDatum(url);
        urlDatum.setPayloadValue(CrawlDbDatum.LAST_FETCHED_FIELD, fetchTime);
        urlDatum.setPayloadValue(CrawlDbDatum.LAST_UPDATED_FIELD, statusTime);
        urlDatum.setPayloadValue(CrawlDbDatum.LAST_STATUS_FIELD, status.name());
        // Don't change the crawl depth here - we do that only in the case of a
        // successful parse
        urlDatum.setPayloadValue(CrawlDbDatum.CRAWL_DEPTH, datum.getPayloadValue(CrawlDbDatum.CRAWL_DEPTH));


        funcCall.getOutputCollector().add(urlDatum.getTuple());
    }

View Full Code Here

        
        // Create a temp file with a fetched url
        BasePath workingDirPath = platform.makePath(WORKINGDIR);
        BasePath fetchedDatumsPath = platform.makePath(workingDirPath, "fetched");
        ArrayList<UrlDatum> fetchedDatums = new ArrayList<UrlDatum>();
        UrlDatum fetchedDatum1 = new UrlDatum("http://foo.com");
        fetchedDatum1.setPayloadValue(CrawlDbDatum.LAST_FETCHED_FIELD, 2L);
        fetchedDatums.add(fetchedDatum1);
        createDataFile(platform, fetchedDatumsPath, fetchedDatums);
        
        // And another with unfetched urls
        BasePath unfetchedDatumsPath = platform.makePath(workingDirPath, "unfetched");
        ArrayList<UrlDatum> unfetchedDatums = new ArrayList<UrlDatum>();
        UrlDatum unfetchedDatum1 = new UrlDatum("http://foo.com");
        unfetchedDatum1.setPayloadValue(CrawlDbDatum.LAST_FETCHED_FIELD, 0L);
        unfetchedDatums.add(unfetchedDatum1);
        UrlDatum unfetchedDatum2 = new UrlDatum("http://foo.com");
        unfetchedDatum2.setPayloadValue(CrawlDbDatum.LAST_FETCHED_FIELD, 0L);
        unfetchedDatums.add(unfetchedDatum2);
        
        createDataFile(platform, unfetchedDatumsPath, unfetchedDatums);


        
        // create a workflow
        Tap inputSource1 = platform.makeTap(platform.makeBinaryScheme(UrlDatum.FIELDS), fetchedDatumsPath);
        Pipe fetchedPipe = new Pipe("fetched");
        Tap inputSource2 = platform.makeTap(platform.makeBinaryScheme(UrlDatum.FIELDS), unfetchedDatumsPath);
        Pipe unfetchedPipe = new Pipe("unfetched");


        Map<String, Tap> sources = new HashMap<String, Tap>();
        sources.put(fetchedPipe.getName(), inputSource1);
        sources.put(unfetchedPipe.getName(), inputSource2);


        BasePath resultsPath = platform.makePath(workingDirPath, "results");
        Tap resultSink = platform.makeTap(platform.makeBinaryScheme(UrlDatum.FIELDS), resultsPath, SinkMode.REPLACE);


        Pipe resultsPipe = new GroupBy("results pipe", Pipe.pipes(fetchedPipe, unfetchedPipe), 
                        new Fields(UrlDatum.URL_FN));
        resultsPipe = new Every(resultsPipe, new LatestUrlDatumBuffer(), Fields.RESULTS);




        FlowConnector flowConnector = platform.makeFlowConnector();
        Flow flow = flowConnector.connect(sources, resultSink, resultsPipe);
        flow.complete();
        
        // verify that the resulting pipe has the latest tuple
        
        Tap testSink = platform.makeTap(platform.makeBinaryScheme(UrlDatum.FIELDS), resultsPath);
        TupleEntryIterator reader = testSink.openForRead(platform.makeFlowProcess());
        int count = 0;
        long latest = 0;
        while (reader.hasNext()) {
            TupleEntry next = reader.next();
            UrlDatum datum = new UrlDatum(next);
            latest = (Long) datum.getPayloadValue(CrawlDbDatum.LAST_FETCHED_FIELD);
            count++;
        }
        
        assertEquals(1, count);
        assertEquals(2, latest);

View Full Code Here

        return platform.makePath(BASE_OUTPUT_PATH + testname + "/" + platformName + "/out");
    }




    private Tuple makeTuple(String domain, int pageNumber, Payload payload) {
        UrlDatum url = new UrlDatum("http://" + domain + "/page-" + pageNumber + ".html?size=10");
        url.setPayload(payload);
        return url.getTuple();
    }

View Full Code Here

        public void operate(FlowProcess process, FunctionCall<NullContext> funcCall) {
            String urlAsString = funcCall.getArguments().getString("line");
            try {
                URL url = new URL(urlAsString);


                UrlDatum urlDatum = new UrlDatum(url.toString());


                funcCall.getOutputCollector().add(BixoPlatform.clone(urlDatum.getTuple(), process));
            } catch (MalformedURLException e) {
                LOGGER.warn("Invalid URL: " + urlAsString);
                // throw new RuntimeException("Invalid URL: " + urlAsString, e);
            }
        }

View Full Code Here

        }
        
        @Override
        public void operate(FlowProcess flowProcess, FunctionCall<Limit.Context> funcCall) {
            CrawlDbDatum datum = new CrawlDbDatum(funcCall.getArguments());
            UrlDatum urlDatum = new UrlDatum(datum.getUrl());
            urlDatum.setPayloadValue(CustomFields.PAGE_SCORE_FN, datum.getPageScore());
            urlDatum.setPayloadValue(CustomFields.LINKS_SCORE_FN, datum.getLinksScore());
            urlDatum.setPayloadValue(CustomFields.STATUS_FN, datum.getLastStatus().toString());
            urlDatum.setPayloadValue(CustomFields.SKIP_BY_LIMIT_FN, funcCall.getContext().increment());
            
            funcCall.getOutputCollector().add(urlDatum.getTuple());
        }

View Full Code Here

    }


    @SuppressWarnings("rawtypes")
    @Override
    public void operate(FlowProcess process, FunctionCall<NullContext> funCall) {
        UrlDatum datum = new UrlDatum(funCall.getArguments());
        
        // Create copy, since we're setting a field, and the tuple is going to be unmodifiable.
        UrlDatum result = new UrlDatum(datum);
        result.setUrl(_normalizer.normalize(datum.getUrl()));
        funCall.getOutputCollector().add(BixoPlatform.clone(result.getTuple(), process));
    }

View Full Code Here

  }
  
  @SuppressWarnings("rawtypes")
    @Override
  public boolean isRemove(FlowProcess process, FilterCall<NullContext> filterCall) {
    UrlDatum datum = new UrlDatum(filterCall.getArguments());
    if (_filter.isRemove(datum)) {
        process.increment(ImportCounters.URLS_FILTERED, 1);
      _numFiltered += 1;
      return true;
    } else {

View Full Code Here

        
        try {
            // Validate the URL
            new URL(url);
            
            UrlDatum urlDatum = new UrlDatum(url);
            funcCall.getOutputCollector().add(BixoPlatform.clone(urlDatum.getTuple(), process));
            
            _numUrls += 1;
            process.increment(ImportCounters.URLS_ACCEPTED, 1);
        } catch (MalformedURLException e) {
            LOGGER.warn("Invalid URL in input data file: " + url);

View Full Code Here


    @Override
    public void operate(FlowProcess process, FunctionCall<NullContext> funCall) {
        String key;
        try {
            UrlDatum datum = new UrlDatum(funCall.getArguments());
            key = _generator.getGroupingKey(datum);
            GroupedUrlDatum result = new GroupedUrlDatum(datum, key);
            funCall.getOutputCollector().add(BixoPlatform.clone(result.getTuple(), process));
        } catch (Exception e) {
            // TODO KKr - don't lose the tuple (skipping support)

View Full Code Here

0 1

TOP

Related Classes of bixo.datum.UrlDatum

bixo.examples.crawl.CreateCrawlDbDatumFromUrlFunction

bixo.examples.crawl.CreateUrlDatumFromOutlinksFunction

bixo.examples.crawl.CreateUrlDatumFromStatusFunction

bixo.examples.crawl.DemoCrawlWorkflow$CreateUrlDatumFromCrawlDbFunction

bixo.examples.crawl.LatestUrlDatumBuffer

bixo.examples.crawl.LatestUrlDatumBufferTest

bixo.examples.crawl.RegexUrlFilterTest

bixo.examples.webmining.DemoWebMiningWorkflow$CreateUrlDatumFromCrawlDbDatum

bixo.operations.GroupFunction

bixo.operations.LoadUrlsFunction

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.