Package bixo.examples.crawl

Source Code of bixo.examples.crawl.LatestUrlDatumBufferTest

/*
* Copyright 2009-2013 Scale Unlimited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package bixo.examples.crawl;

import static org.junit.Assert.assertEquals;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.commons.io.FileUtils;
import org.junit.Before;
import org.junit.Test;

import bixo.config.BixoPlatform;
import bixo.config.BixoPlatform.Platform;
import bixo.datum.UrlDatum;
import cascading.flow.Flow;
import cascading.flow.FlowConnector;
import cascading.pipe.Every;
import cascading.pipe.GroupBy;
import cascading.pipe.Pipe;
import cascading.tap.SinkMode;
import cascading.tap.Tap;
import cascading.tuple.Fields;
import cascading.tuple.TupleEntry;
import cascading.tuple.TupleEntryCollector;
import cascading.tuple.TupleEntryIterator;

import com.scaleunlimited.cascading.BasePath;
import com.scaleunlimited.cascading.BasePlatform;


public class LatestUrlDatumBufferTest {

    private static final String WORKINGDIR = "build/test/LatestUrlDatumBufferTest";

    @Before
    public void setUp() throws IOException {
       
        File workingFolder = new File(WORKINGDIR);
        if (workingFolder.exists()) {
            FileUtils.deleteDirectory(workingFolder);
        }
    }
/*  The mock-based test below is disabled because it doesn't simulate the way Cascading
*  reuses tuples during a GroupBy operation.
*  In particular, it would fail to catch an incorrect assignment of the form
*      aDatum = datum
*  where what is actually needed is a copy:
*      aDatum = new DatumType(datum)
*/
    /*
    @Test
    public void testOperate() throws BaseFetchException, IOException {
        LatestUrlDatumBuffer op = new LatestUrlDatumBuffer();
       
        HadoopFlowProcess fp = Mockito.mock(HadoopFlowProcess.class);
        Mockito.when(fp.getJobConf()).thenReturn(new JobConf());

        OperationCall<NullContext> oc = Mockito.mock(OperationCall.class);
        BufferCall<NullContext> bc = Mockito.mock(BufferCall.class);
        TupleEntryCollector collector = Mockito.mock(TupleEntryCollector.class);

        List<TupleEntry> tupleEntryList = new ArrayList<TupleEntry>();
        UrlDatum urlDatum1 = new UrlDatum("http://foo.com");
        urlDatum1.setPayloadValue(CrawlDbDatum.LAST_FETCHED_FIELD, 0L);
        urlDatum1.setPayloadValue(CrawlDbDatum.LAST_STATUS_FIELD, UrlStatus.UNFETCHED);
        TupleEntry entry1 = new TupleEntry(UrlDatum.FIELDS);
        entry1.setTuple(urlDatum1.getTuple());
        tupleEntryList.add(entry1);

        UrlDatum urlDatum2 = new UrlDatum("http://foo.com");
        urlDatum2.setPayloadValue(CrawlDbDatum.LAST_FETCHED_FIELD, 2L);
        urlDatum2.setPayloadValue(CrawlDbDatum.LAST_STATUS_FIELD, UrlStatus.FETCHED);
        TupleEntry entry2 = new TupleEntry(UrlDatum.FIELDS);
        entry2.setTuple(urlDatum2.getTuple());
        tupleEntryList.add(entry2);

        UrlDatum urlDatum3 = new UrlDatum("http://foo.com");
        urlDatum3.setPayloadValue(CrawlDbDatum.LAST_FETCHED_FIELD, 0L);
        urlDatum3.setPayloadValue(CrawlDbDatum.LAST_STATUS_FIELD, UrlStatus.UNFETCHED);
        TupleEntry entry3 = new TupleEntry(UrlDatum.FIELDS);
        entry3.setTuple(urlDatum3.getTuple());
        tupleEntryList.add(entry3);
        Mockito.when(bc.getArgumentsIterator()).thenReturn(tupleEntryList.iterator());
        Mockito.when(bc.getOutputCollector()).thenReturn(collector);

        op.prepare(fp, oc);
        op.operate(fp, bc);
        op.cleanup(fp, oc);

        Mockito.verify(collector, Mockito.times(1)).add(Mockito.argThat(new MatchUrlDatum()));
        Mockito.verifyNoMoreInteractions(collector);

    }
   
    private static class MatchUrlDatum extends ArgumentMatcher<Tuple> {

        @Override
        public boolean matches(Object argument) {
            TupleEntry entry = new TupleEntry(UrlDatum.FIELDS);
            entry.setTuple((Tuple)argument);
            UrlDatum datum = new UrlDatum(entry);
            Long expectedVal = new Long(2);
            Long result = (Long)datum.getPayloadValue(CrawlDbDatum.LAST_FETCHED_FIELD);
            if (result.longValue() == expectedVal.longValue()) {
                return true;
            }
            return false;
        }
    }
*/
   
   
    @SuppressWarnings({ "rawtypes", "unchecked" })
    @Test
    public void testOperateWithGroupBy() throws Exception {
       
        BixoPlatform platform = new BixoPlatform(LatestUrlDatumBufferTest.class, Platform.Local);
       
        // Create a temp file with a fetched url
        BasePath workingDirPath = platform.makePath(WORKINGDIR);
        BasePath fetchedDatumsPath = platform.makePath(workingDirPath, "fetched");
        ArrayList<UrlDatum> fetchedDatums = new ArrayList<UrlDatum>();
        UrlDatum fetchedDatum1 = new UrlDatum("http://foo.com");
        fetchedDatum1.setPayloadValue(CrawlDbDatum.LAST_FETCHED_FIELD, 2L);
        fetchedDatums.add(fetchedDatum1);
        createDataFile(platform, fetchedDatumsPath, fetchedDatums);
       
        // And another with unfetched urls
        BasePath unfetchedDatumsPath = platform.makePath(workingDirPath, "unfetched");
        ArrayList<UrlDatum> unfetchedDatums = new ArrayList<UrlDatum>();
        UrlDatum unfetchedDatum1 = new UrlDatum("http://foo.com");
        unfetchedDatum1.setPayloadValue(CrawlDbDatum.LAST_FETCHED_FIELD, 0L);
        unfetchedDatums.add(unfetchedDatum1);
        UrlDatum unfetchedDatum2 = new UrlDatum("http://foo.com");
        unfetchedDatum2.setPayloadValue(CrawlDbDatum.LAST_FETCHED_FIELD, 0L);
        unfetchedDatums.add(unfetchedDatum2);
       
        createDataFile(platform, unfetchedDatumsPath, unfetchedDatums);

       
        // create a workflow
        Tap inputSource1 = platform.makeTap(platform.makeBinaryScheme(UrlDatum.FIELDS), fetchedDatumsPath);
        Pipe fetchedPipe = new Pipe("fetched");
        Tap inputSource2 = platform.makeTap(platform.makeBinaryScheme(UrlDatum.FIELDS), unfetchedDatumsPath);
        Pipe unfetchedPipe = new Pipe("unfetched");

        Map<String, Tap> sources = new HashMap<String, Tap>();
        sources.put(fetchedPipe.getName(), inputSource1);
        sources.put(unfetchedPipe.getName(), inputSource2);

        BasePath resultsPath = platform.makePath(workingDirPath, "results");
        Tap resultSink = platform.makeTap(platform.makeBinaryScheme(UrlDatum.FIELDS), resultsPath, SinkMode.REPLACE);

        Pipe resultsPipe = new GroupBy("results pipe", Pipe.pipes(fetchedPipe, unfetchedPipe),
                        new Fields(UrlDatum.URL_FN));
        resultsPipe = new Every(resultsPipe, new LatestUrlDatumBuffer(), Fields.RESULTS);


        FlowConnector flowConnector = platform.makeFlowConnector();
        Flow flow = flowConnector.connect(sources, resultSink, resultsPipe);
        flow.complete();
       
        // verify that the resulting pipe has the latest tuple
       
        Tap testSink = platform.makeTap(platform.makeBinaryScheme(UrlDatum.FIELDS), resultsPath);
        TupleEntryIterator reader = testSink.openForRead(platform.makeFlowProcess());
        int count = 0;
        long latest = 0;
        while (reader.hasNext()) {
            TupleEntry next = reader.next();
            UrlDatum datum = new UrlDatum(next);
            latest = (Long) datum.getPayloadValue(CrawlDbDatum.LAST_FETCHED_FIELD);
            count++;
        }
       
        assertEquals(1, count);
        assertEquals(2, latest);

       
       
    }
   
    @SuppressWarnings({ "unchecked", "rawtypes" })
    private void createDataFile(BasePlatform platform, BasePath filePath, List<UrlDatum> datums) throws Exception {
        Tap urlSink = platform.makeTap(platform.makeBinaryScheme(UrlDatum.FIELDS), filePath, SinkMode.REPLACE);
        TupleEntryCollector writer = urlSink.openForWrite(platform.makeFlowProcess());
        for (UrlDatum datum : datums) {
            writer.add(datum.getTuple());
        }
        writer.close();
    }
}
TOP

Related Classes of bixo.examples.crawl.LatestUrlDatumBufferTest

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.