Examples of TupleMRBuilder


Examples of com.datasalt.pangool.tuplemr.TupleMRBuilder

    String input1 = args[0];
    String input2 = args[1];
    String output = args[2];
 
    deleteOutput(output);
   
    TupleMRBuilder mr = new TupleMRBuilder(conf, "Pangool Url Resolution");
    mr.addIntermediateSchema(getURLMapSchema());
    mr.addIntermediateSchema(getURLRegisterSchema());
    mr.setGroupByFields("url");
    mr.setOrderBy(new OrderBy().add("url", Order.ASC).addSchemaOrder(Order.ASC));
    mr.setTupleReducer(new Handler());
    mr.setOutput(new Path(output), new HadoopOutputFormat(TextOutputFormat.class), Text.class, NullWritable.class);
    mr.addInput(new Path(input1), new HadoopInputFormat(TextInputFormat.class), new UrlMapProcessor());
    mr.addInput(new Path(input2), new HadoopInputFormat(TextInputFormat.class), new UrlProcessor());
    mr.createJob().waitForCompletion(true);

    return 1;
  }
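The helper methods getURLMapSchema() and getURLRegisterSchema() are not shown in this snippet. A minimal sketch of what they might return, assuming a url-map file of (url, canonicalUrl) pairs and a register file of URL visits; every field except "url" is an assumption:

    import com.datasalt.pangool.io.Fields;
    import com.datasalt.pangool.io.Schema;

    // Hypothetical reconstruction: both schemas share the "url" field, which is
    // what makes setGroupByFields("url") valid across two intermediate schemas.
    static Schema getURLMapSchema() {
      return new Schema("urlMap", Fields.parse("url:string, canonicalUrl:string"));
    }

    static Schema getURLRegisterSchema() {
      return new Schema("urlRegister", Fields.parse("url:string, timestamp:long, ip:string"));
    }

Because addSchemaOrder(Order.ASC) sorts by the order in which the schemas were added, the reducer sees the urlMap tuple for each url before its urlRegister tuples, so the canonical URL is known when the register entries arrive.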

Examples of com.datasalt.pangool.tuplemr.TupleMRBuilder

      return -1;
    }

    deleteOutput(args[1]);

    TupleMRBuilder mr = new TupleMRBuilder(conf, "Pangool Topical Word Count");
    mr.addIntermediateSchema(getSchema());
    mr.setGroupByFields("my_avro");
    // Here the custom comparator that groups by "topic,word" is used.
    MyAvroComparator customComp = new MyAvroComparator(getAvroSchema(), "topic", "word");
    mr.setOrderBy(new OrderBy().add("my_avro", Order.ASC, customComp));
    mr.addInput(new Path(args[0]), new HadoopInputFormat(TextInputFormat.class), new TokenizeMapper());
    // We'll use a TupleOutputFormat with the same schema as the intermediate schema
    mr.setTupleOutput(new Path(args[1]), getSchema());
    mr.setTupleReducer(new CountReducer());
    mr.setTupleCombiner(new CountReducer());

    mr.createJob().waitForCompletion(true);

    return 1;
  }
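CountReducer is set both as combiner and as reducer, which is safe because summing counts is associative. A plausible sketch, assuming each tuple's "my_avro" field holds an Avro Record with (topic, word, count); the real class is not shown here:

    import org.apache.avro.generic.GenericData.Record;

    public static class CountReducer extends TupleReducer<ITuple, NullWritable> {

      public void reduce(ITuple group, Iterable<ITuple> tuples, TupleMRContext context,
          Collector collector) throws IOException, InterruptedException, TupleMRException {
        int count = 0;
        ITuple outTuple = null;
        // Keep a reference to the last tuple and overwrite its count with the total.
        for (ITuple tuple : tuples) {
          outTuple = tuple;
          count += (Integer) ((Record) tuple.get("my_avro")).get("count");
        }
        ((Record) outTuple.get("my_avro")).put("count", count);
        collector.write(outTuple, NullWritable.get());
      }
    }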

Examples of com.datasalt.pangool.tuplemr.TupleMRBuilder

    Path tweetsPath = new Path(args[0]);
    Path retweetsPath = new Path(args[1]);
    Path outputPath = new Path(args[2]);
    deleteOutput(outputPath.toString());
   
    TupleMRBuilder mr = new TupleMRBuilder(conf, "AvroTweetsJoin");
    mr.addIntermediateSchema(getPangoolTweetSchema());
    mr.addIntermediateSchema(getPangoolRetweetSchema());
    mr.setGroupByFields("tweet_id");
    mr.setOrderBy(new OrderBy().add("tweet_id", Order.ASC).addSchemaOrder(Order.ASC));

    mr.addInput(tweetsPath, new AvroInputFormat<Record>(getAvroTweetSchema()), new TweetsMapper());
    mr.addInput(retweetsPath, new HadoopInputFormat(TextInputFormat.class), new RetweetsMapper());
    mr.setOutput(outputPath, new AvroOutputFormat<Record>(getAvroOutputSchema()),
        AvroWrapper.class, NullWritable.class);

    mr.setTupleReducer(new Red());

    Job job = mr.createJob();
    job.waitForCompletion(true);

    return 0;
  }
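The tweets input is Avro, so TweetsMapper receives AvroWrapper<Record> keys rather than text lines. A sketch under that assumption; the schema name "tweet" and the field names are guesses:

    import org.apache.avro.generic.GenericData.Record;
    import org.apache.avro.mapred.AvroWrapper;

    public static class TweetsMapper extends TupleMapper<AvroWrapper<Record>, NullWritable> {

      private Tuple tuple;

      public void setup(TupleMRContext context, Collector collector)
          throws IOException, InterruptedException {
        // Reuse one tuple built from the intermediate schema declared above
        tuple = new Tuple(context.getTupleMRConfig().getIntermediateSchema("tweet"));
      }

      public void map(AvroWrapper<Record> key, NullWritable value, TupleMRContext context,
          Collector collector) throws IOException, InterruptedException {
        Record tweet = key.datum();
        tuple.set("tweet_id", tweet.get("id"));
        tuple.set("text", tweet.get("text"));
        collector.write(tuple);
      }
    }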

Examples of com.datasalt.pangool.tuplemr.TupleMRBuilder

    Path tweetsPath = new Path(args[0]);
    Path retweetsPath = new Path(args[1]);
    Path outputPath = new Path(args[2]);
    delete(outputPath.toString());

    TupleMRBuilder mr = new TupleMRBuilder(conf, "AvroTweetsJoin");
    mr.addIntermediateSchema(getPangoolTweetSchema());
    mr.addIntermediateSchema(getPangoolRetweetSchema());
    mr.setGroupByFields("tweet_id");
    mr.setOrderBy(new OrderBy().add("tweet_id", Order.ASC).addSchemaOrder(Order.ASC));

    mr.addInput(tweetsPath, new AvroInputFormat<Record>(getAvroTweetSchema()), new TweetsMapper());
    mr.addInput(retweetsPath, new HadoopInputFormat(TextInputFormat.class), new RetweetsMapper());
    mr.setOutput(outputPath, new AvroOutputFormat<Record>(getAvroOutputSchema()), AvroWrapper.class,
        NullWritable.class);

    mr.setTupleReducer(new Red());

    try {
      Job job = mr.createJob();
      job.waitForCompletion(true);
    } finally {
      mr.cleanUpInstanceFiles();
    }

    return 0;
  }
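This variant wraps execution in try/finally so that cleanUpInstanceFiles() always deletes the temporary files Pangool creates when it serializes the mapper and reducer instances for the job. The Avro output schema could be declared roughly like this (a hypothetical reconstruction; the real field set is not shown):

    import org.apache.avro.Schema;

    // Hypothetical: an Avro record pairing each tweet with its retweet count.
    static Schema getAvroOutputSchema() {
      return new Schema.Parser().parse(
          "{\"type\": \"record\", \"name\": \"RetweetCount\", \"fields\": ["
              + "{\"name\": \"tweet_id\", \"type\": \"long\"},"
              + "{\"name\": \"retweets\", \"type\": \"int\"}]}");
    }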

Examples of com.datasalt.pangool.tuplemr.TupleMRBuilder

      return -1;
    }

    delete(args[1]);

    TupleMRBuilder mr = new TupleMRBuilder(conf, "Pangool Topical Word Count");
    mr.addIntermediateSchema(getSchema());
    mr.setGroupByFields("my_avro");
    // Here the custom comparator that groups by "topic,word" is used.
    MyAvroComparator customComp = new MyAvroComparator(getAvroSchema(), "topic", "word");
    mr.setOrderBy(new OrderBy().add("my_avro", Order.ASC, customComp));
    mr.addInput(new Path(args[0]), new HadoopInputFormat(TextInputFormat.class), new TokenizeMapper());
    // We'll use a TupleOutputFormat with the same schema as the intermediate schema
    mr.setTupleOutput(new Path(args[1]), getSchema());
    mr.setTupleReducer(new CountReducer());
    mr.setTupleCombiner(new CountReducer());

    try {
      mr.createJob().waitForCompletion(true);
    } finally {
      mr.cleanUpInstanceFiles();
    }
    return 1;
  }
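TokenizeMapper is not shown either. A sketch, assuming input lines of the form "topic<TAB>text" and an intermediate schema whose single "my_avro" field carries an Avro Record with (topic, word, count):

    import org.apache.avro.generic.GenericData.Record;

    public static class TokenizeMapper extends TupleMapper<LongWritable, Text> {

      private Tuple tuple;
      private Record record;

      public void setup(TupleMRContext context, Collector collector)
          throws IOException, InterruptedException {
        tuple = new Tuple(context.getTupleMRConfig().getIntermediateSchema(0));
        record = new Record(getAvroSchema());
      }

      public void map(LongWritable key, Text value, TupleMRContext context, Collector collector)
          throws IOException, InterruptedException {
        String[] parts = value.toString().split("\t");
        record.put("topic", parts[0]);
        record.put("count", 1);
        // Emit one tuple per token; the custom comparator groups them by (topic, word)
        for (String word : parts[1].split("\\s+")) {
          record.put("word", word);
          tuple.set("my_avro", record);
          collector.write(tuple);
        }
      }
    }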

Examples of com.datasalt.pangool.tuplemr.TupleMRBuilder

    String input1 = args[0];
    String input2 = args[1];
    String output = args[2];

    delete(output);

    TupleMRBuilder mr = new TupleMRBuilder(conf, "Pangool Url Resolution");
    mr.addIntermediateSchema(getURLMapSchema());
    mr.addIntermediateSchema(getURLRegisterSchema());
    mr.setFieldAliases("urlMap", new Aliases().add("url", "nonCanonicalUrl"));
    mr.setGroupByFields("url");
    mr.setOrderBy(new OrderBy().add("url", Order.ASC).addSchemaOrder(Order.ASC));
    mr.setTupleReducer(new Handler());
    mr.setOutput(new Path(output), new HadoopOutputFormat(TextOutputFormat.class), Text.class,
        NullWritable.class);
    mr.addInput(new Path(input1), new HadoopInputFormat(TextInputFormat.class), new UrlMapProcessor());
    mr.addInput(new Path(input2), new HadoopInputFormat(TextInputFormat.class), new UrlProcessor());

    try {
      mr.createJob().waitForCompletion(true);
    } finally {
      mr.cleanUpInstanceFiles();
    }

    return 1;
  }
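The setFieldAliases() call is the notable difference from the earlier Url Resolution example: it lets the urlMap schema keep a descriptive field name while still matching the common "url" group-by field. Under that reading, the schema would look roughly like this (hypothetical):

    // "url" now resolves to "nonCanonicalUrl" in the urlMap schema, so
    // setGroupByFields("url") still applies to both intermediate schemas.
    static Schema getURLMapSchema() {
      return new Schema("urlMap", Fields.parse("nonCanonicalUrl:string, canonicalUrl:string"));
    }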

Examples of com.datasalt.pangool.tuplemr.TupleMRBuilder

      failArguments("Invalid number of arguments");
      return -1;
    }
    delete(args[1]);

    TupleMRBuilder job = new TupleMRBuilder(conf);
    job.addIntermediateSchema(SCHEMA);
    job.setGroupByFields("line");

    String input = args[0], output = args[1];
    FileSystem fileSystem = FileSystem.get(conf);

    for(Category category : Category.values()) { // For each Category
      String categoryString = category.toString().toLowerCase();
      // Add the category, book title input spec with the associated CategoryMapper
      for(FileStatus fileStatus : fileSystem.listStatus(new Path(input + "/" + categoryString))) {
        job.addInput(fileStatus.getPath(), new HadoopInputFormat(TextInputFormat.class),
            new CategoryMapper(category, fileStatus.getPath().getName()));
      }
      // Add a named output for each category
      job.addNamedOutput(categoryString, new TupleSolrOutputFormat(new File(
          "src/test/resources/shakespeare-solr"), conf), ITuple.class, NullWritable.class);
    }
    job.setOutput(new Path(output), new HadoopOutputFormat(NullOutputFormat.class), ITuple.class,
        NullWritable.class);
    // The reducer will just emit the tuple to the corresponding Category output
    job.setTupleReducer(new TupleReducer<ITuple, NullWritable>() {

      ITuple outTuple = new Tuple(OUT_SCHEMA);

      public void reduce(ITuple group, Iterable<ITuple> tuples, TupleMRContext context,
          Collector collector) throws IOException, InterruptedException, TupleMRException {

        for(ITuple tuple : tuples) {
          Category category = (Category) tuple.get("category");
          outTuple.set("line", tuple.get("line"));
          outTuple.set("text", tuple.get("text"));
          outTuple.set("title", tuple.get("title"));
          collector.getNamedOutput(category.toString().toLowerCase())
              .write(outTuple, NullWritable.get());
        }
      }
    });

    try {
      Job hadoopJob = job.createJob();
      hadoopJob.waitForCompletion(true);
    } finally {
      job.cleanUpInstanceFiles();
    }
    return 0;
  }
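SCHEMA, OUT_SCHEMA and Category are defined elsewhere in this example. A plausible reconstruction, assuming Pangool enum fields; the exact category names are guesses:

    import java.util.List;
    import com.datasalt.pangool.io.Schema.Field;

    enum Category { COMEDIES, HISTORIES, POETRY, TRAGEDIES }

    static Schema buildSchema() {
      List<Field> fields = Fields.parse("line:int, text:string, title:string");
      fields.add(Field.createEnum("category", Category.class));
      return new Schema("line", fields);
    }

    static final Schema SCHEMA = buildSchema();
    // The output tuple drops "category": it is encoded in the named output instead.
    static final Schema OUT_SCHEMA = new Schema("out",
        Fields.parse("line:int, text:string, title:string"));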

Examples of com.datasalt.pangool.tuplemr.TupleMRBuilder

    String input1 = args[0];
    String input2 = args[1];
    String output = args[2];
 
    deleteOutput(output);
   
    TupleMRBuilder mr = new TupleMRBuilder(conf, "Pangool Url Resolution");
    mr.addIntermediateSchema(getURLMapSchema());
    mr.addIntermediateSchema(getURLRegisterSchema());
    mr.setFieldAliases("urlMap", new Aliases().add("url", "nonCanonicalUrl"));
    mr.setGroupByFields("url");
    mr.setOrderBy(new OrderBy().add("url", Order.ASC).addSchemaOrder(Order.ASC));
    mr.setTupleReducer(new Handler());
    mr.setOutput(new Path(output), new HadoopOutputFormat(TextOutputFormat.class), Text.class, NullWritable.class);
    mr.addInput(new Path(input1), new HadoopInputFormat(TextInputFormat.class), new UrlMapProcessor());
    mr.addInput(new Path(input2), new HadoopInputFormat(TextInputFormat.class), new UrlProcessor());
    mr.createJob().waitForCompletion(true);

    return 1;
  }

Examples of com.datasalt.pangool.tuplemr.TupleMRBuilder

    String input = args[0];
    String output = args[1];
   
    deleteOutput(output);
   
    TupleMRBuilder builder = new TupleMRBuilder(conf);
    builder.addIntermediateSchema(getSchema());
    builder.setGroupByFields("first");
    builder.setOrderBy(new OrderBy().add("first", Order.ASC).add("second", Order.ASC));
    // Input / output and such
    builder.setTupleReducer(new Handler());
    builder.setOutput(new Path(output), new HadoopOutputFormat(TextOutputFormat.class), Text.class, NullWritable.class);
    builder.addInput(new Path(input), new HadoopInputFormat(TextInputFormat.class), new IProcessor());
    builder.createJob().waitForCompletion(true);

    return 1;
  }
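This snippet is a classic secondary sort: grouping by "first" while ordering by ("first", "second") means each reducer group iterates its tuples with "second" ascending. A minimal sketch of the missing pieces, with names assumed:

    static Schema getSchema() {
      return new Schema("my_schema", Fields.parse("first:string, second:int"));
    }

    public static class Handler extends TupleReducer<Text, NullWritable> {

      public void reduce(ITuple group, Iterable<ITuple> tuples, TupleMRContext context,
          Collector collector) throws IOException, InterruptedException, TupleMRException {
        for (ITuple tuple : tuples) {
          // Within the group, "second" values arrive in ascending order.
          collector.write(new Text(tuple.toString()), NullWritable.get());
        }
      }
    }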

Examples of com.datasalt.pangool.tuplemr.TupleMRBuilder

      failArguments("Invalid number of arguments");
      return -1;
    }
    delete(args[1]);
   
    TupleMRBuilder job = new TupleMRBuilder(conf);
    job.addIntermediateSchema(SCHEMA);
    job.setGroupByFields("line");

    String input = args[0], output = args[1];
    FileSystem fileSystem = FileSystem.get(conf);
   
    for(Category category : Category.values()) { // For each Category
      String categoryString = category.toString().toLowerCase();
      // Add the category, book title input spec with the associated CategoryMapper
      for(FileStatus fileStatus : fileSystem.listStatus(new Path(input + "/" + categoryString))) {
        job.addInput(fileStatus.getPath(), new HadoopInputFormat(TextInputFormat.class),
            new CategoryMapper(category, fileStatus.getPath().getName()));
      }
      // Add a named output for each category
      job.addNamedOutput(categoryString, new TupleSolrOutputFormat(new File("src/test/resources/shakespeare-solr"),
          conf), ITuple.class, NullWritable.class);
    }
    job.setOutput(new Path(output), new HadoopOutputFormat(NullOutputFormat.class), ITuple.class, NullWritable.class);
    // The reducer will just emit the tuple to the corresponding Category output
    job.setTupleReducer(new TupleReducer<ITuple, NullWritable>() {

      ITuple outTuple = new Tuple(OUT_SCHEMA);
     
      public void reduce(ITuple group, Iterable<ITuple> tuples, TupleMRContext context, Collector collector)
          throws IOException, InterruptedException, TupleMRException {

        for(ITuple tuple: tuples) {
          Category category = (Category) tuple.get("category");
          outTuple.set("line",  tuple.get("line"));
          outTuple.set("text",  tuple.get("text"));
          outTuple.set("title", tuple.get("title"));
          collector.getNamedOutput(category.toString().toLowerCase()).write(outTuple, NullWritable.get());
        }
      }
    });

    Job hadoopJob = job.createJob();
    hadoopJob.waitForCompletion(true);
    return 0;
  }