Package cascading.tap.hadoop

Examples of cascading.tap.hadoop.Hfs$CombinedInputFormat
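Hfs$CombinedInputFormat is the InputFormat that Hfs swaps in when split combining is enabled, so that many small input files are read through fewer, larger splits; it is not instantiated directly. A minimal sketch of turning the behavior on, assuming the HfsProps helpers introduced around Cascading 2.2 (the 256 MB cap below is illustrative, not a recommendation):

  Map<Object, Object> properties = new HashMap<Object, Object>();
  // ask Hfs to read via its combining input format
  HfsProps.setUseCombinedInput( properties, true );
  // cap each combined split at ~256 MB (illustrative value)
  HfsProps.setCombinedInputMaxSize( properties, 256L * 1024 * 1024 );

  Tap source = new Hfs( new TextLine(), "input/path" );
  Tap sink = new Hfs( new TextLine(), "output/path", true );

  new HadoopFlowConnector( properties ).connect( source, sink, new Pipe( "copy" ) ).complete();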


  // Plans a trivial local-file (Lfs) to HDFS (Hfs) copy flow; the truncated
  // remainder presumably exercises the planned flow's ID.
  @Test
  public void testFlowID() throws Exception
    {
    Tap source = new Lfs( new TextLine(), "input/path" );
    Tap sink = new Hfs( new TextLine(), "output/path", true );

    Pipe pipe = new Pipe( "test" );

    Map<Object, Object> props = getProperties();
    Flow flow1 = getPlatform().getFlowConnector( props ).connect( source, sink, pipe );
    // … (excerpt truncated)
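The assertions are cut off above; a hedged guess at what testFlowID goes on to check (Flow#getID() is real Cascading API, the assertion itself is an assumption):

    assertNotNull( flow1.getID() );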


  // Same trivial flow, seeded from the platform's Hadoop Configuration; the
  // truncated remainder presumably verifies the configuration is copied, not shared.
  @Test
  public void testCopyConfig() throws Exception
    {
    Tap source = new Lfs( new TextLine(), "input/path" );
    Tap sink = new Hfs( new TextLine(), "output/path", true );

    Pipe pipe = new Pipe( "test" );

    Configuration conf = ( (BaseHadoopPlatform) getPlatform() ).getConfiguration();
    // … (excerpt truncated)

  // Round-trips Person protobufs through a GroupBy that carries the messages as
  // grouped values; ProtobufSerialization must be registered for the shuffle.
  public void testAsGroupByValue() throws Exception {
    // start from a clean slate on the test paths
    FileSystem.get(new Configuration()).delete(new Path("/tmp/input"), true);
    FileSystem.get(new Configuration()).delete(new Path("/tmp/output"), true);

    Tap t = new Hfs(new ProtobufScheme("value", Example.Person.class), "/tmp/input");
    TupleEntryCollector tec = t.openForWrite(new HadoopFlowProcess(new JobConf()));

    HashSet<Tuple> expectedTuples = new HashSet<Tuple>(){{
      add(new Tuple(Example.Person.newBuilder().setName("bryan").setId(1).build()));
      add(new Tuple(Example.Person.newBuilder().setName("lucas").setId(2).build()));
    }};

    for (Tuple tuple : expectedTuples) {
      tec.add(tuple);
    }

    tec.close();

    Pipe inPipe = new Pipe("input");
    // inject a constant grouping key so the protobufs ride through the shuffle as values
    Pipe injectedPipe = new Each(inPipe, Fields.NONE, new Insert(new Fields("key"), 7), new Fields("key", "value"));
    Pipe groupByPipe = new GroupBy(injectedPipe, new Fields("key"));

    Hfs sink = new Hfs(new ProtobufScheme("value", Example.Person.class), "/tmp/output");
    Map<Object, Object> properties = new HashMap<Object, Object>(){{
      // append ProtobufSerialization so Hadoop can (de)serialize Message instances
      put("io.serializations", new JobConf().get("io.serializations") + "," + ProtobufSerialization.class.getName());
    }};
    new HadoopFlowConnector(properties).connect(t, sink, groupByPipe).complete();

    TupleEntryIterator tei = sink.openForRead(new HadoopFlowProcess(new JobConf()));
    Set<Tuple> tuples = new HashSet<Tuple>();
    while (tei.hasNext()) {
      tuples.add(tei.next().getTupleCopy());
    }
    // … (excerpt truncated)
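Both GroupBy round-trip tests are truncated just before their assertions; presumably each ends by comparing the tuples read back with the expected set, along the lines of:

    assertEquals(expectedTuples, tuples);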

  // Same round-trip, but groups on the protobuf message itself as the key, which
  // additionally exercises comparison of serialized Message instances.
  public void testAsGroupByKey() throws Exception {
    // start from a clean slate on the test paths
    FileSystem.get(new Configuration()).delete(new Path("/tmp/input"), true);
    FileSystem.get(new Configuration()).delete(new Path("/tmp/output"), true);

    Tap t = new Hfs(new ProtobufScheme("value", Example.Person.class), "/tmp/input");
    TupleEntryCollector tec = t.openForWrite(new HadoopFlowProcess(new JobConf()));

    HashSet<Tuple> expectedTuples = new HashSet<Tuple>(){{
      add(new Tuple(Example.Person.newBuilder().setName("bryan").setId(1).build()));
      add(new Tuple(Example.Person.newBuilder().setName("lucas").setId(2).build()));
    }};

    for (Tuple tuple : expectedTuples) {
      tec.add(tuple);
    }

    tec.close();

    Pipe inPipe = new Pipe("input");
    Pipe groupByPipe = new GroupBy(inPipe, new Fields("value"));

    Hfs sink = new Hfs(new ProtobufScheme("value", Example.Person.class), "/tmp/output");
    Map<Object, Object> properties = new HashMap<Object, Object>(){{
      put("io.serializations",
          new JobConf().get("io.serializations") + "," + ProtobufSerialization.class.getName());
    }};
    new HadoopFlowConnector(properties).connect(t, sink, groupByPipe).complete();

    TupleEntryIterator tei = sink.openForRead(new HadoopFlowProcess(new JobConf()));
    Set<Tuple> tuples = new HashSet<Tuple>();
    while (tei.hasNext()) {
      tuples.add(tei.next().getTupleCopy());
    }
    // … (excerpt truncated)

    // write three fixture Person records (the last with no email), then read them back
    List<Tuple> expected = new ArrayList<Tuple>();
    expected.add(fixture("bryan", "bryan.duxbury@mail.com", 1));
    expected.add(fixture("lucas", "lucas@mail.com", 2));
    expected.add(fixture("vida", null, 3));

    Tap inputTap = new Hfs(new ProtobufScheme("value", Example.Person.class), "/tmp/input");
    TupleEntryCollector tec = inputTap.openForWrite(new HadoopFlowProcess(), null);

    for (Tuple t : expected) {
      tec.add(new TupleEntry(new Fields("value"), t));
    }
    tec.close();

    // read the records back from the same path through a fresh tap
    Tap outputTap = new Hfs(new ProtobufScheme("value", Example.Person.class), "/tmp/input");
    TupleEntryIterator iter = outputTap.openForRead(new HadoopFlowProcess(), null);
    List<Tuple> tuples = new ArrayList<Tuple>();
    while (iter.hasNext()) {
      tuples.add(iter.next().getTupleCopy());
    }
    // … (excerpt truncated)
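The fixture(...) helper called at the top of this excerpt is not shown; a plausible reconstruction, assuming it simply wraps an Example.Person (with an optional email) in a one-field Tuple:

  // hypothetical sketch of the fixture(...) helper used above
  private static Tuple fixture(String name, String email, int id) {
    Example.Person.Builder person = Example.Person.newBuilder().setName(name).setId(id);
    if (email != null) {
      person.setEmail(email);
    }
    return new Tuple(person.build());
  }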

    // pull the nested leader.name field out of each Partnership message
    Pipe p = new Each("input", new ExtractProto(Example.Partnership.class, "leader.name"));

    final String INPUT_PATH = "/tmp/ExtractProtoTest/input";
    final String OUTPUT_PATH = "/tmp/ExtractProtoTest/output";

    Hfs inputTap = new Hfs(new ProtobufScheme("partnership", Example.Partnership.class),
        INPUT_PATH);
    Hfs outputTap = new Hfs(new ProtobufScheme("leader.name", Example.Partnership.class),
        OUTPUT_PATH);

    // make sure the input path exists
    inputTap.openForWrite(new HadoopFlowProcess()).close();
    // make sure the output path does not exist
    // … (excerpt truncated)

  // Expands each Person protobuf into flat id/name/email/position fields inside a flow.
  public void testInFlow() throws Exception {
    // start from a clean slate on the test paths
    FileSystem.get(new Configuration()).delete(new Path("/tmp/input"), true);
    FileSystem.get(new Configuration()).delete(new Path("/tmp/output"), true);

    Hfs inTap = new Hfs(new ProtobufScheme("value", Example.Person.class), "/tmp/input");
    TupleEntryCollector collector = inTap.openForWrite(new HadoopFlowProcess());
    // BRYAN and LUCAS are Example.Person.Builder fixtures defined elsewhere in the test
    collector.add(new TupleEntry(new Fields("value"), new Tuple(BRYAN.build())));
    collector.add(new TupleEntry(new Fields("value"), new Tuple(LUCAS.build())));
    collector.close();

    Pipe inPipe = new Pipe("in");
    Pipe p = new Each(inPipe, new Fields("value"), new ExpandProto(Example.Person.class), new Fields("id", "name", "email", "position"));

    Hfs sink = new Hfs(new TextLine(), "/tmp/output");
    new HadoopFlowConnector().connect(inTap, sink, p).complete();

    TupleEntryIterator iter = sink.openForRead(new HadoopFlowProcess());
    List<Tuple> results = new ArrayList<Tuple>();
    while (iter.hasNext()) {
      results.add(iter.next().getTupleCopy());
    }
    assertEquals(2, results.size());
    // … (excerpt truncated)

      // snapshot this process's Hadoop configuration for the side-bucket write below
      JobConf conf = (JobConf) flowProcess.getConfigCopy();

      try {
        LOG.info("HLL counter found " + approxCounter.cardinality() + " distinct keys");

        // persist the counter's serialized form to the approximate-counts side bucket
        Hfs tap = new Hfs(new SequenceFile(new Fields("bytes")), BloomProps.getApproxCountsDir(conf));
        TupleEntryCollector out = tap.openForWrite(new HadoopFlowProcess(conf));
        out.add(new Tuple(new BytesWritable(approxCounter.getBytes())));
        out.close();

      } catch (IOException e) {
        throw new RuntimeException("couldn't write approximate counts to side bucket", e);
View Full Code Here

      String partsRoot = BloomProps.getBloomFilterPartsDir(conf);
      maxHashes = BloomProps.getMaxBloomHashes(conf);
      minHashes = BloomProps.getMinBloomHashes(conf);

      // open one (split, filter) side-bucket collector per candidate hash count
      for (int i = minHashes; i <= maxHashes; i++) {
        Hfs tap = new Hfs(new SequenceFile(new Fields("split", "filter")), partsRoot + "/" + i + "/");
        numHashesToCollector.put(i, tap.openForWrite(new HadoopFlowProcess(conf)));
      }

    } catch (IOException e) {
      throw new RuntimeException(e);
    }
    // … (excerpt truncated)
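The collectors opened above are long-lived, so after the last part is written they need the matching cleanup; a hedged sketch, assuming numHashesToCollector maps each hash count to its open collector:

      for (TupleEntryCollector collector : numHashesToCollector.values()) {
        collector.close();
      }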

  // Merges the (split, filter) bloom-filter parts written under tapPath into a
  // single bit set of numBloomBits bits.
  private static BloomFilter mergeBloomParts(String tapPath, long numBloomBits, long splitSize, int numBloomHashes, long numElems) throws IOException {
    FixedSizeBitSet bitSet = new FixedSizeBitSet(numBloomBits);

    if (FileSystemHelper.getFS().exists(new Path(tapPath))) {
      Hfs tap = new Hfs(new SequenceFile(new Fields("split", "filter")), tapPath);
      TupleEntryIterator itr = tap.openForRead(CascadingUtil.get().getFlowProcess());
      while (itr.hasNext()) {
        TupleEntry cur = itr.next();
        long split = cur.getLong(0);
        FixedSizeBitSet curSet = new FixedSizeBitSet(splitSize, ((BytesWritable) cur.getObject(1)).getBytes());
        for (long i = 0; i < curSet.numBits(); i++) {
    // … (excerpt truncated)
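The excerpt stops inside the bit-copy loop; presumably each split's bits are OR-ed into the global set at an offset of split * splitSize. A hedged sketch, assuming FixedSizeBitSet exposes get(long) and set(long):

      for (long i = 0; i < curSet.numBits(); i++) {
        if (curSet.get(i)) {
          bitSet.set(split * splitSize + i);
        }
      }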
