Package com.datasalt.pangool.examples.avro

Source Code of com.datasalt.pangool.examples.avro.AvroTopicalWordCount$MyAvroComparator

/**
* Copyright [2012] [Datasalt Systems S.L.]
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*   http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.datasalt.pangool.examples.avro;

import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.StringTokenizer;

import org.apache.avro.Schema.Type;
import org.apache.avro.generic.GenericData.Record;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.util.ToolRunner;
import org.codehaus.jackson.map.ObjectMapper;

import com.datasalt.pangool.PangoolRuntimeException;
import com.datasalt.pangool.examples.BaseExampleJob;
import com.datasalt.pangool.examples.topicalwordcount.TopicalWordCount;
import com.datasalt.pangool.io.ITuple;
import com.datasalt.pangool.io.Schema;
import com.datasalt.pangool.io.Schema.Field;
import com.datasalt.pangool.io.Tuple;
import com.datasalt.pangool.tuplemr.Criteria.Order;
import com.datasalt.pangool.tuplemr.OrderBy;
import com.datasalt.pangool.tuplemr.TupleMRBuilder;
import com.datasalt.pangool.tuplemr.TupleMRException;
import com.datasalt.pangool.tuplemr.TupleMapper;
import com.datasalt.pangool.tuplemr.TupleReducer;
import com.datasalt.pangool.tuplemr.mapred.lib.input.HadoopInputFormat;
import com.datasalt.pangool.tuplemr.serialization.AvroFieldSerialization;
import com.datasalt.pangool.tuplemr.serialization.AvroFieldSerialization.AvroFieldDeserializer;

/**
* This is an advanced example that illustrates the usage of custom serializers and
* custom comparators.<br>
*
* In this example the intermediate schema contains just a single Avro Record, whose Avro schema is
* "word:string, topic:int, count:int".<br>
* The custom serialization used is defined in {@link AvroFieldSerialization}. In addition,
* a custom comparator, {@link MyAvroComparator}, is used to compare and group only by "topic, word".
* <br>
* The behaviour is identical to that of the {@link TopicalWordCount} example, but it uses the
* external serialization provided by Avro.
*/
public class AvroTopicalWordCount extends BaseExampleJob {

  @SuppressWarnings("serial")
  public static class TokenizeMapper extends TupleMapper<LongWritable, Text> {

    protected Tuple tuple;
    protected Record record;
    protected ObjectMapper mapper;

    public void setup(TupleMRContext context, Collector collector)
        throws IOException, InterruptedException {
      this.mapper = new ObjectMapper();
      tuple = new Tuple(context.getTupleMRConfig().getIntermediateSchema(0));
      record = new Record(getAvroSchema());
      tuple.set("my_avro",record);
    };

    @SuppressWarnings("rawtypes")
    @Override
    public void map(LongWritable key, Text value, TupleMRContext context, Collector collector)
        throws IOException, InterruptedException {

      Map document = mapper.readValue(value.toString(), Map.class);
      record.put("topic", (Integer) document.get("topicId"));
      StringTokenizer itr = new StringTokenizer((String) document.get("text"));
      record.put("count", 1);
      while(itr.hasMoreTokens()) {
        record.put("word", itr.nextToken());
        tuple.set("my_avro",record);
        collector.write(tuple);
      }
    }
  }

  @SuppressWarnings("serial")
  public static class CountReducer extends TupleReducer<ITuple, NullWritable> {

    @Override
    public void reduce(ITuple group, Iterable<ITuple> tuples, TupleMRContext context, Collector collector)
        throws IOException, InterruptedException, TupleMRException {

      int count = 0;
      ITuple outputTuple = null;
      Record outputRecord=null;
      for(ITuple tuple : tuples) {
        Record record = (Record)tuple.get("my_avro");
        count += (Integer) record.get("count");
        outputTuple = tuple;
        outputRecord = record;
      }
      outputRecord.put("count",count);
      outputTuple.set("my_avro",outputRecord);
      collector.write(outputTuple, NullWritable.get());
    }
  }

  public AvroTopicalWordCount() {
    super("Usage: AvroTopicalWordCount [input_path] [output_path]");
  }

  static Schema getSchema() {
    org.apache.avro.Schema avroSchema = getAvroSchema();
    Field avroField = Field.createObject("my_avro",Object.class);
    avroField.setObjectSerialization(AvroFieldSerialization.class);
    avroField.addProp("avro.schema",avroSchema.toString());
    return new Schema("schema",Arrays.asList(avroField));
  }
 
  static org.apache.avro.Schema getAvroSchema(){
    List<org.apache.avro.Schema.Field> avroFields =
        new ArrayList<org.apache.avro.Schema.Field>();
    avroFields.add(
        new org.apache.avro.Schema.Field("word",org.apache.avro.Schema.create(Type.STRING),null,null));
    avroFields.add(
        new org.apache.avro.Schema.Field("topic",org.apache.avro.Schema.create(Type.INT),null,null));
    avroFields.add(
        new org.apache.avro.Schema.Field("count",org.apache.avro.Schema.create(Type.INT),null,null));
    org.apache.avro.Schema result= org.apache.avro.Schema.createRecord("avro_schema",null,null,false);
    result.setFields(avroFields);
    return result;
  }

  /**
   * A custom comparator that deserializes bytes to Avro {@link Record} instances, and then
   * compares by "topic" and "word" fields.
   *
   */
  @SuppressWarnings("serial")
  public static class MyAvroComparator implements RawComparator<Record>,Serializable {

    //MyAvroComparator must be serializable so this must be transient
    private transient AvroFieldDeserializer<Record> deser;
    private transient Record record1,record2;
    private transient DataInputBuffer inputBuffer;
    private String avroSchema;
    private String[] fields;
    public MyAvroComparator(org.apache.avro.Schema avroSchema,String ... fields){
      this.avroSchema = avroSchema.toString();
      this.fields = fields;
    }
   
    //lazy loading of deserializer and buffers
    private void init(){
      if (deser == null){
        deser = new AvroFieldDeserializer<Record>(org.apache.avro.Schema.parse(avroSchema),false);
      }
      if (inputBuffer == null){
        inputBuffer = new DataInputBuffer();
      }
    }
   
    @Override
    @SuppressWarnings({"unchecked","rawtypes"})
    public int compare(Record record1, Record record2) {
      for (String field : fields){
      int comparison =  ((Comparable)record1.get(field)).compareTo(record2.get(field));
        if (comparison != 0){
          return comparison;
        }
      }
      return 0;   
    }

    @Override
    public int compare(byte[] b1, int o1, int l1, byte[] b2, int o2, int l2) {
      init();
      try{
        inputBuffer.reset(b1,o1,l1);
        deser.open(inputBuffer);
        record1 = deser.deserialize(record1);
        deser.close();
        inputBuffer.reset(b2,o2,l2);
        deser.open(inputBuffer);
        record2 = deser.deserialize(record2);
        deser.close();
        return compare(record1, record2);
      } catch(IOException e){
        throw new PangoolRuntimeException(e);
      }
    }
  }
 
 
  @Override
  public int run(String[] args) throws Exception {
    if(args.length != 2) {
      failArguments("Wrong number of arguments");
      return -1;
    }

    deleteOutput(args[1]);

    TupleMRBuilder mr = new TupleMRBuilder(conf, "Pangool Topical Word Count");
    mr.addIntermediateSchema(getSchema());
    mr.setGroupByFields("my_avro");
    //here the custom comparator that groups by "topic,word" is used.
    MyAvroComparator customComp = new MyAvroComparator(getAvroSchema(),"topic","word");
    mr.setOrderBy(new OrderBy().add("my_avro",Order.ASC,customComp));
    mr.addInput(new Path(args[0]), new HadoopInputFormat(TextInputFormat.class), new TokenizeMapper());
    // We'll use a TupleOutputFormat with the same schema than the intermediate schema
    mr.setTupleOutput(new Path(args[1]), getSchema());
    mr.setTupleReducer(new CountReducer());
    mr.setTupleCombiner(new CountReducer());

    mr.createJob().waitForCompletion(true);

    return 1;
  }
 
  public static void main(String[] args) throws Exception {
    ToolRunner.run(new AvroTopicalWordCount(), args);
  }
}
TOP

Related Classes of com.datasalt.pangool.examples.avro.AvroTopicalWordCount$MyAvroComparator

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.