Package com.datasalt.pangool.tuplemr.mapred.lib.input

Examples of com.datasalt.pangool.tuplemr.mapred.lib.input.HadoopInputFormat, the Pangool wrapper that lets a plain Hadoop InputFormat (e.g. TextInputFormat or SequenceFileInputFormat) be plugged in as a job input of a TupleMRBuilder or MapOnlyJobBuilder.


    TupleMRBuilder mr = new TupleMRBuilder(conf, "Pangool Topical Word Count");
    mr.addIntermediateSchema(getSchema());
    // We will count each (topicId, word) pair
    // Note that the order in which we defined the fields of the Schema is not relevant here
    mr.setGroupByFields("topic", "word");
    mr.addInput(new Path(args[0]), new HadoopInputFormat(TextInputFormat.class), new TokenizeMapper());
    // We'll use a TupleOutputFormat with the same schema as the intermediate schema
    mr.setTupleOutput(new Path(args[1]), getSchema());
    mr.setTupleReducer(new CountReducer());
    mr.setTupleCombiner(new CountReducer());
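
The excerpt above omits the schema, mapper and reducer. For reference, here is a self-contained sketch of the same job; getSchema(), TokenizeMapper and CountReducer are reconstructed from the builder calls (not copied from the original source), the input is assumed to be one "<topicId><TAB><text>" record per line, and the imports assume Pangool's 0.x package layout:

    import java.io.IOException;
    import java.util.StringTokenizer;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.NullWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

    import com.datasalt.pangool.io.Fields;
    import com.datasalt.pangool.io.ITuple;
    import com.datasalt.pangool.io.Schema;
    import com.datasalt.pangool.io.Tuple;
    import com.datasalt.pangool.tuplemr.TupleMRBuilder;
    import com.datasalt.pangool.tuplemr.TupleMRException;
    import com.datasalt.pangool.tuplemr.TupleMapper;
    import com.datasalt.pangool.tuplemr.TupleReducer;
    import com.datasalt.pangool.tuplemr.mapred.lib.input.HadoopInputFormat;

    public class TopicalWordCountSketch {

      static Schema getSchema() {
        // One tuple per (topicId, word) occurrence; field order does not matter for grouping
        return new Schema("schema", Fields.parse("topic:int, word:string, count:int"));
      }

      public static class TokenizeMapper extends TupleMapper<LongWritable, Text> {
        private Tuple tuple = new Tuple(getSchema());

        @Override
        public void map(LongWritable key, Text value, TupleMRContext context, Collector collector)
            throws IOException, InterruptedException {
          // Assumed line format: "<topicId>\t<free text>"
          String[] parts = value.toString().split("\t", 2);
          if(parts.length < 2) {
            return; // skip malformed lines
          }
          tuple.set("topic", Integer.parseInt(parts[0]));
          tuple.set("count", 1);
          StringTokenizer words = new StringTokenizer(parts[1]);
          while(words.hasMoreTokens()) {
            tuple.set("word", words.nextToken());
            collector.write(tuple); // tuples are serialized on write, so reusing the instance is safe
          }
        }
      }

      // Sums partial counts; the same class serves as combiner and reducer
      public static class CountReducer extends TupleReducer<ITuple, NullWritable> {
        @Override
        public void reduce(ITuple group, Iterable<ITuple> tuples, TupleMRContext context,
            Collector collector) throws IOException, InterruptedException, TupleMRException {
          int count = 0;
          ITuple outTuple = null;
          for(ITuple tuple : tuples) {
            count += (Integer) tuple.get("count");
            outTuple = tuple;
          }
          outTuple.set("count", count);
          collector.write(outTuple, NullWritable.get());
        }
      }

      public static void main(String[] args) throws Exception {
        TupleMRBuilder mr = new TupleMRBuilder(new Configuration(), "Pangool Topical Word Count");
        mr.addIntermediateSchema(getSchema());
        mr.setGroupByFields("topic", "word");
        mr.addInput(new Path(args[0]), new HadoopInputFormat(TextInputFormat.class), new TokenizeMapper());
        mr.setTupleOutput(new Path(args[1]), getSchema());
        mr.setTupleReducer(new CountReducer());
        mr.setTupleCombiner(new CountReducer());
        try {
          mr.createJob().waitForCompletion(true);
        } finally {
          mr.cleanUpInstanceFiles();
        }
      }
    }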


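Essentially a reduce-side join of two inputs: both intermediate schemas share the common order by "url" (then by schema), while "urlRegister" tuples are further ordered by "timestamp" via setSpecificOrderBy: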
    mr.setOrderBy(new OrderBy().add("url", Order.ASC).addSchemaOrder(Order.ASC));
    mr.setSpecificOrderBy("urlRegister", new OrderBy().add("timestamp", Order.ASC));
    mr.setTupleReducer(new Handler());
    mr.setOutput(new Path(output), new HadoopOutputFormat(TextOutputFormat.class), Text.class,
        NullWritable.class);
    mr.addInput(new Path(input1), new HadoopInputFormat(TextInputFormat.class), new UrlMapProcessor());
    mr.addInput(new Path(input2), new HadoopInputFormat(TextInputFormat.class), new UrlProcessor());

    try {
      mr.createJob().waitForCompletion(true);
    } finally {
      mr.cleanUpInstanceFiles();

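The same input format also works with map-only jobs: here the mapper is supplied inline as an anonymous MapOnlyMapper<LongWritable, Text, Text, BSONObject> that buffers each HTML page in memory and emits BSON documents: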
    // We only need to execute a Map-only job for this task.
    // Every map will process an HTML file and extract the reviews from it.
    MapOnlyJobBuilder builder = new MapOnlyJobBuilder(conf);

    builder.addInput(new Path(inputFolder), new HadoopInputFormat(TextInputFormat.class),
        new MapOnlyMapper<LongWritable, Text, Text, BSONObject>() {

          StringBuffer inMemoryHtml = new StringBuffer();

          @Override

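From a moving-average job: ordering by "url" and then "date" hands MovingAverageHandler each URL's visits in chronological order, so it can average over an nDaysAverage-day sliding window: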
    mr.setOrderBy(new OrderBy().add("url", Order.ASC).add("date", Order.ASC));
    // Input / output and such
    mr.setTupleReducer(new MovingAverageHandler(nDaysAverage));
    mr.setOutput(new Path(output), new HadoopOutputFormat(TextOutputFormat.class), Text.class,
        NullWritable.class);
    mr.addInput(new Path(input), new HadoopInputFormat(TextInputFormat.class), new URLVisitsProcessor());

    try {
      mr.createJob().waitForCompletion(true);
    } finally {
      mr.cleanUpInstanceFiles();

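Wrapping the schema with NullableSchema.nullableSchema() so its fields admit null values before registering it as the intermediate schema; the mapper is again given inline: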
    final Schema metaSchema1 = new Schema("schema1", fields);

    TupleMRBuilder builder = new TupleMRBuilder(new Configuration());
    builder.addIntermediateSchema(NullableSchema.nullableSchema(metaSchema1));

    builder.addInput(new Path(INPUT1), new HadoopInputFormat(TextInputFormat.class),
        new TupleMapper<LongWritable, Text>() {

          ITuple tupleInTuple1 = new Tuple(metaSchema1);

          @Override

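Secondary sort: tuples are grouped by "country" and, within each group, ordered by "money" descending and then "name" ascending (a sketch of the input processor follows the excerpt):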
    Schema baseSchema = new Schema("schema", Fields.parse("name:string, money:int, country:string"));
    builder.addIntermediateSchema(baseSchema);
    builder.setGroupByFields("country");
    builder.setOrderBy(new OrderBy().add("country", Order.ASC).add("money", Order.DESC)
        .add("name", Order.ASC));
    builder.addInput(new Path(INPUT), new HadoopInputFormat(TextInputFormat.class),
        new MyInputProcessor());
    builder.setTupleReducer(new MyGroupHandler());
    builder.setOutput(new Path(OUTPUT), new HadoopOutputFormat(SequenceFileOutputFormat.class),
        DoubleWritable.class, NullWritable.class);
    // Configure extra outputs
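
MyInputProcessor's body is not shown on this page. A minimal sketch, assuming one tab-separated "name<TAB>money<TAB>country" record per line (only the class name comes from the excerpt; everything else is an assumption):

    public static class MyInputProcessor extends TupleMapper<LongWritable, Text> {
      private Tuple tuple = new Tuple(
          new Schema("schema", Fields.parse("name:string, money:int, country:string")));

      @Override
      public void map(LongWritable key, Text value, TupleMRContext context, Collector collector)
          throws IOException, InterruptedException {
        // Parse "name<TAB>money<TAB>country" into the three schema fields
        String[] fields = value.toString().split("\t");
        tuple.set("name", fields[0]);
        tuple.set("money", Integer.parseInt(fields[1]));
        tuple.set("country", fields[2]);
        collector.write(tuple);
      }
    }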

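A pass-through job: group by "title", sort by "title" then "content", and let IdentityTupleReducer re-emit the tuples into a Tuple output with the original schema: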
    builder.setGroupByFields("title");
    builder.setOrderBy(new OrderBy().add("title", Order.ASC).add("content", Order.ASC));

    builder.setTupleReducer(new IdentityTupleReducer());
    builder.setTupleOutput(outPath, originalSchema);
    builder.addInput(inPath, new HadoopInputFormat(TextInputFormat.class), new MyInputProcessor());

    Job job = builder.createJob();
    try {
      job.waitForCompletion(true);
    } finally {

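From a unit test: the output path is deleted first, then a small resource file is mapped through MyHandler into a Tuple output that reuses the intermediate meta-schema: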
    TupleMRBuilder builder = new TupleMRBuilder(conf);
    fS.delete(out, true);
   
    builder.setTupleOutput(out, getMetaSchema2());
    builder.addIntermediateSchema(getMetaSchema2());
    builder.addInput(new Path("src/test/resources/foo-file.txt"), new HadoopInputFormat(TextInputFormat.class), new MyHandler());
    builder.setGroupByFields("group");
    builder.setTupleReducer(new IdentityTupleReducer());
    Job job = builder.createJob();
    try {
      job.waitForCompletion(true);

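From TestCombiner: a word count reading from a SequenceFile, grouping and sorting by "word", and writing (Utf8, IntWritable) pairs back out as a SequenceFile; note setJarByClass() so Hadoop knows which jar to ship: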
    fields.add(Field.create("count", Type.INT));

    TupleMRBuilder cg = new TupleMRBuilder(conf);
    cg.addIntermediateSchema(new Schema("schema", fields));
    cg.setJarByClass(TestCombiner.class);
    cg.addInput(new Path(input), new HadoopInputFormat(SequenceFileInputFormat.class), new Split());
    cg.setOutput(new Path(output), new HadoopOutputFormat(SequenceFileOutputFormat.class), Utf8.class,
        IntWritable.class);
    cg.setGroupByFields("word");
    cg.setOrderBy(new OrderBy().add("word", Order.ASC));
    cg.setTupleReducer(new Count());

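Field aliases: Aliases().add("country", "my_country") exposes the "user" schema's "my_country" field as "country" so all schemas can share the "country" group-by; "user" tuples additionally get a specific sort on "money":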
    builder.setFieldAliases("user", new Aliases().add("country", "my_country"));
    builder.setGroupByFields("country");
    builder.setOrderBy(new OrderBy().add("country", Order.ASC).addSchemaOrder(Order.DESC));
    builder.setSpecificOrderBy("user", new OrderBy().add("money", Order.ASC));

    builder.addInput(new Path("test-input"), new HadoopInputFormat(TextInputFormat.class),
        new FirstInputProcessor());
    builder.setTupleReducer(new MyGroupHandler());
    builder.setOutput(new Path("test-output"), new HadoopOutputFormat(TextOutputFormat.class),
        NullWritable.class, NullWritable.class);