Package org.apache.pig.data

Examples of org.apache.pig.data.DataBag


        }     
        //test null value in input
        input.append(null);
       
        Set<Integer> s = new HashSet<Integer>();
        DataBag db = tb.exec(input);
        for (Tuple t : db) {
            s.add((Integer) t.get(0));
        }

        // finally check the bag had everything we put in the tuple.
        assertEquals(101, s.size());
        for (int i = 0; i < 100; ++i) {
            assertTrue(s.contains(i));
        }
        assertTrue("null in tobag result", s.contains(null));
       
        TOTUPLE tt = new TOTUPLE();

        input = TupleFactory.getInstance().newTuple();
        for (int i = 0; i < 100; ++i) {
            input.append(i);
        }

        Tuple output = tt.exec(input);
        assertTrue(!(input == output));
        assertEquals(input, output);
       
        TOP top = new TOP();
        TupleFactory tupleFactory = TupleFactory.getInstance();
        BagFactory bagFactory = DefaultBagFactory.getInstance();
        Tuple inputTuple = tupleFactory.newTuple(3);
        DataBag dBag = bagFactory.newDefaultBag();
       
        // set N = 10 i.e retain top 10 tuples
        inputTuple.set(0, 10);
        // compare tuples by field number 1
        inputTuple.set(1, 1);
        // set the data bag containing the tuples
        inputTuple.set(2, dBag);

        // generate tuples of the form (group-1, 1), (group-2, 2) ...
        for (long i = 0; i < 100; i++) {
            Tuple nestedTuple = tupleFactory.newTuple(2);
            nestedTuple.set(0, "group-" + i);
            nestedTuple.set(1, i);
            dBag.add(nestedTuple);
        }
       
        DataBag outBag = top.exec(inputTuple);
        assertEquals(outBag.size(), 10L);
        checkItemsGT(outBag, 1, 89);
       
        // two initial results
        Tuple init1 = (new TOP.Initial()).exec(inputTuple);
        Tuple init2 = (new TOP.Initial()).exec(inputTuple);
        // two intermediate results

        DataBag intermedBag = bagFactory.newDefaultBag();
        intermedBag.add(init1);
        intermedBag.add(init2);
        Tuple intermedInput = tupleFactory.newTuple(intermedBag);
        Tuple intermedOutput1 = (new TOP.Intermed()).exec(intermedInput);
        Tuple intermedOutput2 = (new TOP.Intermed()).exec(intermedInput);
        checkItemsGT((DataBag)intermedOutput1.get(2), 1, 94);

        // final result
        DataBag finalInputBag = bagFactory.newDefaultBag();
        finalInputBag.add(intermedOutput1);
        finalInputBag.add(intermedOutput2);
        Tuple finalInput = tupleFactory.newTuple(finalInputBag);
        outBag = (new TOP.Final()).exec(finalInput);
        assertEquals(outBag.size(), 10L);
        checkItemsGT(outBag, 1, 96);
    }
View Full Code Here


   
    @Test
    public void testDistinct() throws Exception {
   
        Integer[] inp = new Integer[] { 1, 2 , 3, 1 ,4, 5, 3};
        DataBag inputBag = Util.createBagOfOneColumn(inp);
        EvalFunc<Tuple> initial = new Distinct.Initial();
        DataBag intermedInputBg1 = bagFactory.newDefaultBag();
        DataBag intermedInputBg2 = bagFactory.newDefaultBag();
        int i = 0;
        for (Tuple t : inputBag) {
            Tuple initialOutput = initial.exec(tupleFactory.newTuple(t));
            if(i < inp.length/2 ) {
                intermedInputBg1.add(initialOutput);
            } else {
                intermedInputBg2.add(initialOutput);
            }
            i++;
        }
       
        EvalFunc<Tuple> intermed = new Distinct.Intermediate();
       
        DataBag finalInputBg = bagFactory.newDefaultBag();
        finalInputBg.add(intermed.exec(tupleFactory.newTuple(intermedInputBg1)));
        finalInputBg.add(intermed.exec(tupleFactory.newTuple(intermedInputBg2)));
        EvalFunc<DataBag> fin = new Distinct.Final();
        DataBag result = fin.exec(tupleFactory.newTuple(finalInputBg));
       
        Integer[] exp = new Integer[] { 1, 2, 3, 4, 5};
        DataBag expectedBag = Util.createBagOfOneColumn(exp);
        assertEquals(expectedBag, result);
       
    }   
View Full Code Here

        for(int i = 0; i < inputSize; i+=2) {
            inp[i] = i/2;
            inp[i+1] = i/2;
        }

        DataBag inputBag = Util.createBagOfOneColumn(inp);
        EvalFunc<DataBag> distinct = new Distinct();
        DataBag result = distinct.exec(tupleFactory.newTuple(inputBag));
       
        Integer[] exp = new Integer[inputSize/2];
        for(int j = 0; j < inputSize/2; ++j) {
            exp[j] = j;
        }

        DataBag expectedBag = Util.createBagOfOneColumn(exp);
        assertEquals(expectedBag, result);
       
    }
View Full Code Here

        // Bag size
        Tuple t1 = Util.createTuple(new String[]{"a", "b", "c"});
        Tuple t2 = Util.createTuple(new String[]{"d", "e", "f"});
        Tuple t3 = Util.createTuple(new String[]{"g", "h", "i"});
        Tuple t4 = Util.createTuple(new String[]{"j", "k", "l"});
        DataBag b = Util.createBag(new Tuple[]{t1, t2, t3, t4});
        expected = new Long(4);
        t.set(0, b);
        size = new BagSize();
        msg = "[Testing BagSize on input type: Bag]";
        assertTrue(msg, expected.equals(size.exec(t)));
View Full Code Here

        Tuple t2 = tf.newTuple(1);
        t2.set(0, null);
        Tuple t3 = tf.newTuple(0);
       
        TOKENIZE f = new TOKENIZE();
        DataBag b = f.exec(t1);
        assertTrue(b.size()==3);
        Iterator<Tuple> i = b.iterator();
        Tuple rt = i.next();
        assertTrue(rt.get(0).equals("123"));
        rt = i.next();
        assertTrue(rt.get(0).equals("456"));
        rt = i.next();
View Full Code Here

    public void testDIFF() throws Exception {
        // Test it in the case with two bags.
        BagFactory bf = BagFactory.getInstance();
        TupleFactory tf = TupleFactory.getInstance();

        DataBag b1 = bf.newDefaultBag();
        DataBag b2 = bf.newDefaultBag();
        for (int i = 0; i < 10; i++) b1.add(tf.newTuple(new Integer(i)));
        for (int i = 0; i < 10; i += 2) b2.add(tf.newTuple(new Integer(i)));
        Tuple t = tf.newTuple(2);
        t.set(0, b1);
        t.set(1, b2);
        DIFF d = new DIFF();
        DataBag result = d.exec(t);

        assertEquals(5, result.size());
        Iterator<Tuple> i = result.iterator();
        int[] values = new int[5];
        for (int j = 0; j < 5; j++) values[j] = (Integer)i.next().get(0);
        Arrays.sort(values);
        for (int j = 1; j < 10; j += 2) assertEquals(j, values[j/2]);

        // Test it in the case of two objects that are equals
        t = tf.newTuple(2);
        t.set(0, new Integer(1));
        t.set(1, new Integer(1));
        result = d.exec(t);
        assertEquals(0, result.size());

        // Test it in the case of two objects that are not equal
        t = tf.newTuple(2);
        t.set(0, new Integer(1));
        t.set(1, new Integer(2));
        result = d.exec(t);
        assertEquals(2, result.size());
    }
View Full Code Here

  TupleFactory mTupleFactory = TupleFactory.getInstance();
    BagFactory mBagFactory = BagFactory.getInstance();
   
    public DataBag exec(Tuple input) throws IOException{
      try {
          DataBag output = mBagFactory.newDefaultBag();
          Object o = input.get(0);
          if (!(o instanceof String)) {
              throw new IOException("Expected input to be chararray, but  got " + o.getClass().getName());
          }
          Tokenizer source = new WhitespaceTokenizer(Version.LUCENE_43, new StringReader((String)o));
          TokenStream tokenstream = new LowerCaseEntityPreservingFilter(source);
          tokenstream.reset();
          while (tokenstream.incrementToken()){
            String token = tokenstream.getAttribute(CharTermAttribute.class).toString();
            output.add(mTupleFactory.newTuple(token));
          }
          return output;
      } catch (Exception e) {
          // error handling goes here
        throw new IOException("caught exception",e);
View Full Code Here

import org.apache.pig.data.Tuple;

// Sample usage: cc.twittertools.piggybank.GetLatitude($0#'geo'#'coordinates')
public class GetLatitude extends EvalFunc<String> {
  public String exec(Tuple input) throws IOException {
    DataBag bag = (DataBag) input.get(0);
    Iterator<Tuple> it = bag.iterator();
    if (!it.hasNext()) {
      return null;
    }
    Tuple tup = it.next();
View Full Code Here

import org.apache.pig.data.Tuple;

// Sample usage: cc.twittertools.piggybank.GetLongitude($0#'geo'#'coordinates');
public class GetLongitude extends EvalFunc<String> {
  public String exec(Tuple input) throws IOException {
    DataBag bag = (DataBag) input.get(0);
    Iterator<Tuple> it = bag.iterator();
    if (!it.hasNext()) {
      return null;
    }
    it.next();
    if (!it.hasNext()) {
View Full Code Here

    map.put("a", "x");
    map.put("b", "y");
    map.put("c", "z");
    tuple.set(2, map);

    DataBag bagColl = TypesUtils.createBag();
    Schema schColl = schema.getColumn(3).getSchema();
    Tuple tupColl1 = TypesUtils.createTuple(schColl);
    Tuple tupColl2 = TypesUtils.createTuple(schColl);
    byte[] abs1 = new byte[3];
    byte[] abs2 = new byte[4];
    tupColl1.set(0, 3.1415926);
    tupColl1.set(1, 1.6);
    abs1[0] = 11;
    abs1[1] = 12;
    abs1[2] = 13;
    tupColl1.set(2, new DataByteArray(abs1));
    bagColl.add(tupColl1);
    tupColl2.set(0, 123.456789);
    tupColl2.set(1, 100);
    abs2[0] = 21;
    abs2[1] = 22;
    abs2[2] = 23;
    abs2[3] = 24;
    tupColl2.set(2, new DataByteArray(abs2));
    bagColl.add(tupColl2);
    tuple.set(3, bagColl);

    int row = 0;
    inserter.insert(new BytesWritable(String.format("k%d%d", part + 1, row + 1)
        .getBytes()), tuple);

    // row 2
    row++;
    TypesUtils.resetTuple(tuple);
    TypesUtils.resetTuple(tupRecord);
    map.clear();
    tuple.set(0, false);
    tupRecord.set(0, 2);
    tupRecord.set(1, 1002L);
    tuple.set(1, tupRecord);
    map.put("boy", "girl");
    map.put("adam", "amy");
    map.put("bob", "becky");
    map.put("carl", "cathy");
    tuple.set(2, map);
    bagColl.clear();
    TypesUtils.resetTuple(tupColl1);
    TypesUtils.resetTuple(tupColl2);
    tupColl1.set(0, 7654.321);
    tupColl1.set(1, 0.0001);
    abs1[0] = 31;
    abs1[1] = 32;
    abs1[2] = 33;
    tupColl1.set(2, new DataByteArray(abs1));
    bagColl.add(tupColl1);
    tupColl2.set(0, 0.123456789);
    tupColl2.set(1, 0.3333);
    abs2[0] = 41;
    abs2[1] = 42;
    abs2[2] = 43;
    abs2[3] = 44;
    tupColl2.set(2, new DataByteArray(abs2));
    bagColl.add(tupColl2);
    tuple.set(3, bagColl);
    inserter.insert(new BytesWritable(String.format("k%d%d", part + 1, row + 1)
        .getBytes()), tuple);

    inserter.close();
View Full Code Here

TOP

Related Classes of org.apache.pig.data.DataBag

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.