Package com.datasalt.pangool.tuplemr

Examples of com.datasalt.pangool.tuplemr.TupleMRBuilder$Input


    String input = "combiner-input";
    String output = "combiner-output";

    withInput(input, writable("hola don pepito hola don jose"));

    TupleMRBuilder jobBuilder = new TestCombiner().getBuilder(conf, input, output);
    try {
      Job job = jobBuilder.createJob();
      job.setNumReduceTasks(1);
      assertRun(job);
    } finally {
      jobBuilder.cleanUpInstanceFiles();
    }

    withOutput(output + "/part-r-00000", writable("don"), writable(2));
    withOutput(output + "/part-r-00000", writable("hola"), writable(2));
    withOutput(output + "/part-r-00000", writable("jose"), writable(1));
View Full Code Here


  public int run(String input, String output, Configuration conf) throws Exception {
    // Define the intermediate schema: It must match SOLR's schema.xml!
    final Schema schema = new Schema("iSchema", Fields.parse("user_id:string, message:string"));

    TupleMRBuilder job = new TupleMRBuilder(conf);
    job.addIntermediateSchema(schema);
    job.setGroupByFields("user_id");
    // Define the input and its associated mapper.
    // We'll just have a Mapper, reducer will be Identity
    job.addInput(new Path(input), new HadoopInputFormat(TextInputFormat.class),
        new TupleMapper<LongWritable, Text>() {

          Tuple tuple = new Tuple(schema);

          @Override
          public void map(LongWritable key, Text value, TupleMRContext context, Collector collector)
              throws IOException, InterruptedException {
            String[] fields = value.toString().split("\t");
            String language = fields[1];
            tuple.set("user_id", fields[0]);
            tuple.set("message", fields[2]);
            if(language.equals("en")) {
              // English -> write to main output
              collector.write(tuple);
            } else if(language.equals("fr")) {
              // French -> write to french index
              collector.getNamedOutput("FR").write(tuple, NullWritable.get());
            } else if(language.equals("es")) {
              // Spanish -> write to spanish index
              collector.getNamedOutput("ES").write(tuple, NullWritable.get());
            }
          }
        });
    // Add multi-output: French index
    job.addNamedOutput("FR", new TupleSolrOutputFormat(new File("src/test/resources/solr-fr"), conf),
        ITuple.class, NullWritable.class);
    // Add multi-output: Spanish index
    job.addNamedOutput("ES", new TupleSolrOutputFormat(new File("src/test/resources/solr-es"), conf),
        ITuple.class, NullWritable.class);
    job.setTupleReducer(new IdentityTupleReducer());
    // Add multi-output: English index
    job.setOutput(new Path(output), new TupleSolrOutputFormat(new File("src/test/resources/solr-en"),
        conf), ITuple.class, NullWritable.class);
    Job hadoopJob = job.createJob();
    try {
      hadoopJob.waitForCompletion(true);
      if(!hadoopJob.isSuccessful()) {
        throw new PangoolRuntimeException("Job was not sucessfull");
      }
    } finally {
      job.cleanUpInstanceFiles();
    }
    return 0;
  }
View Full Code Here

      withInput(input, writable(inputElement));
      tuples[i++] = createTuple(inputElement, schema);
    }
    Path outputPath = new Path(output);

    TupleMRBuilder builder = new TupleMRBuilder(getConf());
    builder.addIntermediateSchema(schema);
    builder.setGroupByFields("country", "age", "name");
    builder.setOrderBy(new OrderBy().add("country", Order.ASC).add("age", Order.ASC)
        .add("name", Order.ASC).add("height", Order.DESC));
    builder.setRollupFrom("country");
    builder.setTupleReducer(new IdentityRed());
    builder.setOutput(outputPath, new HadoopOutputFormat(SequenceFileOutputFormat.class), Text.class,
        Text.class);
    builder.addInput(new Path(input), new HadoopInputFormat(SequenceFileInputFormat.class), new Map());

    Job job = builder.createJob();
    try {
      job.setNumReduceTasks(1);
      assertRun(job);
    } finally {
      builder.cleanUpInstanceFiles();
    }

    FileSystem fs = FileSystem.get(getConf());
    Path outputFile = new Path(output + "/part-r-00000");
    checkRollupOutput(outputFile, 0, 2);
View Full Code Here

      withInput(input, writable(inputElement));
      tuples[i++] = createTuple(inputElement, schema);
    }
    Path outputPath = new Path(output);

    TupleMRBuilder builder = new TupleMRBuilder(getConf());
    builder.addIntermediateSchema(schema);
    builder.setGroupByFields("age", "name", "country");
    builder.setOrderBy(new OrderBy().add("country", Order.ASC).add("age", Order.ASC)
        .add("name", Order.ASC).add("height", Order.DESC));
    builder.setRollupFrom("age");
    builder.setTupleReducer(new IdentityRed());
    builder.setOutput(outputPath, new HadoopOutputFormat(SequenceFileOutputFormat.class), Text.class,
        Text.class);
    builder.addInput(new Path(input), new HadoopInputFormat(SequenceFileInputFormat.class), new Map());

    Job job = builder.createJob();
    try {
      job.setNumReduceTasks(1);
      assertRun(job);
    } finally {
      builder.cleanUpInstanceFiles();
    }

    FileSystem fs = FileSystem.get(getConf());
    Path outputFile = new Path(output + "/part-r-00000");
    checkRollupOutput(outputFile, 1, 2);
View Full Code Here

    Schema schema = new Schema("schema",
        Fields.parse("country:string, age:int, name:string, height:int"));
    Path outputPath = new Path(output);

    TupleMRBuilder builder = new TupleMRBuilder(getConf());
    builder.addIntermediateSchema(schema);
    builder.setGroupByFields("age", "name", "country");
    builder.setOrderBy(new OrderBy().add("country", Order.ASC).add("age", Order.ASC)
        .add("name", Order.ASC));
    builder.setRollupFrom("age");
    builder.setTupleReducer(new IdentityRed());
    builder.setOutput(outputPath, new HadoopOutputFormat(SequenceFileOutputFormat.class), Text.class,
        Text.class);
    builder.addInput(new Path(input), new HadoopInputFormat(SequenceFileInputFormat.class),
        new DoNothingMap());

    Job job = builder.createJob();
    try {
      job.setNumReduceTasks(1);
      assertRun(job);
    } finally {
      builder.cleanUpInstanceFiles();
    }
   
    cleanUp();
    trash(TEST_OUT);
  }
View Full Code Here

    writer = new BufferedWriter(new FileWriter(INPUT2));
    writer.write("4.5" + "\t" + "true" + "\n");
    writer.write("4.6" + "\t" + "false" + "\n");
    writer.close();

    TupleMRBuilder builder = new TupleMRBuilder(getConf());

    final Schema tupleSchema1 = new Schema("tupleSchema1", Fields.parse("a:string, b:int"));
    final Schema tupleSchema2 = new Schema("tupleSchema2", Fields.parse("c:double, d:boolean"));

    List<Field> fields = new ArrayList<Field>();
    fields.add(Field.create("partitionId", Type.INT));
    fields.add(Fields.createTupleField("tuple1", tupleSchema1));
    final Schema schema1 = new Schema("tupleInTuple1", fields);

    fields.clear();
    fields.add(Field.create("partitionId", Type.INT));
    fields.add(Fields.createTupleField("tuple2", tupleSchema2));
    final Schema schema2 = new Schema("tupleInTuple2", fields);

    builder.addIntermediateSchema(schema1);
    builder.addIntermediateSchema(schema2);

    builder.addInput(new Path(INPUT1), new HadoopInputFormat(TextInputFormat.class),
        new TupleMapper<LongWritable, Text>() {

          ITuple tupleInTuple1 = new Tuple(schema1);
          ITuple tuple1 = new Tuple(tupleSchema1);

          @Override
          public void map(LongWritable key, Text value, TupleMRContext context, Collector collector)
              throws IOException, InterruptedException {
            String[] split = value.toString().split("\t");
            tuple1.set("a", split[0]);
            tuple1.set("b", Integer.parseInt(split[1]));

            tupleInTuple1.set("partitionId", 0);
            tupleInTuple1.set("tuple1", tuple1);
            collector.write(tupleInTuple1);
          }
        });

    builder.addInput(new Path(INPUT2), new HadoopInputFormat(TextInputFormat.class),
        new TupleMapper<LongWritable, Text>() {

          ITuple tupleInTuple2 = new Tuple(schema2);
          ITuple tuple2 = new Tuple(tupleSchema2);

          @Override
          public void map(LongWritable key, Text value, TupleMRContext context, Collector collector)
              throws IOException, InterruptedException {
            String[] split = value.toString().split("\t");
            tuple2.set("c", Double.parseDouble(split[0]));
            tuple2.set("d", Boolean.parseBoolean(split[1]));

            tupleInTuple2.set("partitionId", 0);
            tupleInTuple2.set("tuple2", tuple2);
            collector.write(tupleInTuple2);
          }
        });

    builder.setTupleReducer(new TupleReducer<Text, NullWritable>() {

      public void reduce(ITuple group, Iterable<ITuple> tuples, TupleMRContext context, Collector collector)
          throws IOException, InterruptedException, TupleMRException {
       
        Iterator<ITuple> iterator = tuples.iterator();
        ITuple currentTuple;
       
        assertEquals(0, group.get("partitionId"));
       
        currentTuple = iterator.next();
        assertEquals("foo1", ((ITuple)currentTuple.get("tuple1")).get("a").toString());
        assertEquals(30, ((ITuple)currentTuple.get("tuple1")).get("b"));
       
        currentTuple = iterator.next();
        assertEquals("foo2", ((ITuple)currentTuple.get("tuple1")).get("a").toString());
        assertEquals(20, ((ITuple)currentTuple.get("tuple1")).get("b"));

        currentTuple = iterator.next();
        assertEquals("foo3", ((ITuple)currentTuple.get("tuple1")).get("a").toString());
        assertEquals(140, ((ITuple)currentTuple.get("tuple1")).get("b"));

        currentTuple = iterator.next();
        assertEquals("foo4", ((ITuple)currentTuple.get("tuple1")).get("a").toString());
        assertEquals(110, ((ITuple)currentTuple.get("tuple1")).get("b"));

        currentTuple = iterator.next();
        assertEquals("foo5", ((ITuple)currentTuple.get("tuple1")).get("a").toString());
        assertEquals(220, ((ITuple)currentTuple.get("tuple1")).get("b"));

        currentTuple = iterator.next();
        assertEquals("foo6", ((ITuple)currentTuple.get("tuple1")).get("a").toString());
        assertEquals(260, ((ITuple)currentTuple.get("tuple1")).get("b"));

        // Second data source BEGINS
        currentTuple = iterator.next();
        assertEquals(4.5, ((ITuple)currentTuple.get("tuple2")).get("c"));
        assertEquals(true, ((ITuple)currentTuple.get("tuple2")).get("d"));
       
        currentTuple = iterator.next();
        assertEquals(4.6, ((ITuple)currentTuple.get("tuple2")).get("c"));
        assertEquals(false, ((ITuple)currentTuple.get("tuple2")).get("d"));
      };
    });
    builder.setGroupByFields("partitionId");
    builder.setOutput(new Path(OUTPUT), new HadoopOutputFormat(TextOutputFormat.class), Text.class, NullWritable.class);
    Job job = builder.createJob();
    try {
      job.waitForCompletion(true);
    } finally {
      builder.cleanUpInstanceFiles();
    }
    trash(INPUT1, INPUT2, OUTPUT);
  }
View Full Code Here

    ITuple tuple = new Tuple(schema);
    for(int i = 0; i < NUM_ROWS_TO_GENERATE; i++) {
      withTupleInput(input, fillTuple(true, tuple));
    }

    TupleMRBuilder builder = new TupleMRBuilder(getConf(), "test");
    builder.addTupleInput(new Path(input), new IdentityTupleMapper());
    builder.setTupleReducer(new IdentityTupleReducer());
    builder.addIntermediateSchema(schema);
    builder.setGroupByFields(schema.getField(0).getName());
    builder.setTupleOutput(new Path(output), schema);

    Job job = builder.createJob();
    try {
      job.setNumReduceTasks(1);
      assertRun(job);
    } finally {
      builder.cleanUpInstanceFiles();
    }
    trash(input);
    trash(output);
  }
View Full Code Here

    }
    String inputExamples = args[0];
    String output = args[1];
    deleteOutput(output);

    TupleMRBuilder job = new TupleMRBuilder(conf, "Naive Bayes Model Generator");
    job.addIntermediateSchema(INTERMEDIATE_SCHEMA);
    // perform per-category word count mapping
    job.addInput(new Path(inputExamples), new HadoopInputFormat(TextInputFormat.class),
        new TupleMapper<LongWritable, Text>() {

          ITuple tuple = new Tuple(INTERMEDIATE_SCHEMA);

          @Override
          public void map(LongWritable toIgnore, Text value, TupleMRContext context, Collector collector)
              throws IOException, InterruptedException {

            Category category = Category.valueOf(value.toString().split("\t")[0]);
            StringTokenizer itr = new StringTokenizer(value.toString().split("\t")[1]);
            tuple.set("category", category);
            tuple.set("count", 1);
            while(itr.hasMoreTokens()) {
              tuple.set("word", normalizeWord(itr.nextToken()));
              collector.write(tuple);
            }
          }
        });

    TupleReducer countReducer = new TupleReducer<ITuple, NullWritable>() {

      public void reduce(ITuple group, Iterable<ITuple> tuples, TupleMRContext context, Collector collector)
          throws IOException, InterruptedException, TupleMRException {
        int count = 0;
        ITuple outputTuple = null;
        for(ITuple tuple : tuples) {
          count += (Integer) tuple.get("count");
          outputTuple = tuple;
        }
        outputTuple.set("count", count);
        collector.write(outputTuple, NullWritable.get());
      }
    };
    job.setTupleCombiner(countReducer);
    job.setTupleReducer(countReducer);
    job.setGroupByFields("word", "category");
    job.setTupleOutput(new Path(output), INTERMEDIATE_SCHEMA);
    if(job.createJob().waitForCompletion(true)) {
      return 1;
    }
    return -1;
  }
View Full Code Here

    Path tweetsPath = new Path(args[0]);
    Path retweetsPath = new Path(args[1]);
    Path outputPath = new Path(args[2]);
    deleteOutput(outputPath.toString());
   
    TupleMRBuilder mr = new TupleMRBuilder(conf, "AvroTweetsJoin");
    mr.addIntermediateSchema(getPangoolTweetSchema());
    mr.addIntermediateSchema(getPangoolRetweetSchema());
    mr.setGroupByFields("tweet_id");
    mr.setOrderBy(new OrderBy().add("tweet_id",Order.ASC).addSchemaOrder(Order.ASC));
   
    mr.addInput(tweetsPath,new AvroInputFormat<Record>(getAvroTweetSchema()),new TweetsMapper());
    mr.addInput(retweetsPath, new HadoopInputFormat(TextInputFormat.class), new RetweetsMapper());
    mr.setOutput(outputPath,new AvroOutputFormat<Record>(getAvroOutputSchema()),
        AvroWrapper.class,NullWritable.class);

    mr.setTupleReducer(new Red());

    Job job = mr.createJob();
    job.waitForCompletion(true);

    return 0;
  }
View Full Code Here

      return -1;
    }

    deleteOutput(args[1]);

    TupleMRBuilder mr = new TupleMRBuilder(conf, "Pangool Topical Word Count");
    mr.addIntermediateSchema(getSchema());
    mr.setGroupByFields("my_avro");
    //here the custom comparator that groups by "topic,word" is used.
    MyAvroComparator customComp = new MyAvroComparator(getAvroSchema(),"topic","word");
    mr.setOrderBy(new OrderBy().add("my_avro",Order.ASC,customComp));
    mr.addInput(new Path(args[0]), new HadoopInputFormat(TextInputFormat.class), new TokenizeMapper());
    // We'll use a TupleOutputFormat with the same schema as the intermediate schema
    mr.setTupleOutput(new Path(args[1]), getSchema());
    mr.setTupleReducer(new CountReducer());
    mr.setTupleCombiner(new CountReducer());

    mr.createJob().waitForCompletion(true);

    return 1;
  }
View Full Code Here

TOP

Related Classes of com.datasalt.pangool.tuplemr.TupleMRBuilder$Input

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.