// Source Code of com.datasalt.pangool.examples.topicalwordcount.TopicFingerprint

/**
 * Copyright [2012] [Datasalt Systems S.L.]
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.datasalt.pangool.examples.topicalwordcount;


import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;


import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.util.ToolRunner;


import com.datasalt.pangool.examples.BaseExampleJob;
import com.datasalt.pangool.io.ITuple;
import com.datasalt.pangool.io.Schema;
import com.datasalt.pangool.io.Schema.Field;
import com.datasalt.pangool.io.Schema.Field.Type;
import com.datasalt.pangool.io.Tuple;
import com.datasalt.pangool.tuplemr.Criteria.Order;
import com.datasalt.pangool.tuplemr.IdentityTupleMapper;
import com.datasalt.pangool.tuplemr.OrderBy;
import com.datasalt.pangool.tuplemr.TupleMRBuilder;
import com.datasalt.pangool.tuplemr.TupleMRException;
import com.datasalt.pangool.tuplemr.TupleReducer;


/**
 * This example uses the output of {@link TopicalWordCount} for calculating the "top n" words per each topic.
 * It also outputs the total appearances of these words together into an extra (named) output. This example
 * show how easy it is to perform secondary sort in Pangool as well as the use of "named outputs".
 */
public class TopicFingerprint extends BaseExampleJob {


  public final static String OUTPUT_TOTALCOUNT = "totalcount";
  
  @SuppressWarnings("serial")
  public static class TopNWords extends TupleReducer<ITuple, NullWritable> {


    int n; // We will emit as many words as this parameter (at maximum) for each group
    Tuple outputCountTuple;


    public TopNWords(int n) {
      this.n = n;
    }


    public void setup(TupleMRContext context, Collector collector) throws IOException, InterruptedException,
        TupleMRException {
      outputCountTuple = new Tuple(getOutputCountSchema());
    };


    public void reduce(ITuple group, Iterable<ITuple> tuples, TupleMRContext context, Collector collector)
        throws java.io.IOException, InterruptedException, TupleMRException {


      int totalCount = 0;
      Iterator<ITuple> iterator = tuples.iterator();
      for(int i = 0; i < n && iterator.hasNext(); i++) {
        ITuple tuple = iterator.next();
        collector.write(tuple, NullWritable.get());
        totalCount += (Integer) tuple.get("count");
      }
      
      outputCountTuple.set("topic", group.get("topic"));
      outputCountTuple.set("totalcount", totalCount);
      collector.getNamedOutput(OUTPUT_TOTALCOUNT).write(outputCountTuple, NullWritable.get());
    };
  }


  public TopicFingerprint() {
    super("Usage: TopicalWordCountWithStopWords [input_path] [output_path] [top_n_size]");
  }


  public static Schema getOutputCountSchema() {
    List<Field> fields = new ArrayList<Field>();
    fields.add(Field.create("topic", Type.INT));
    fields.add(Field.create("totalcount", Type.INT));
    return new Schema("outputcount", fields);
  }
  
  @Override
  public int run(String[] args) throws Exception {
    if(args.length != 3) {
      failArguments("Wrong number of arguments");
      return -1;
    }


    delete(args[1]);
    // Parse the size of the Top
    Integer n = Integer.parseInt(args[2]);


    TupleMRBuilder builder = new TupleMRBuilder(conf, "Pangool Topic Fingerprint From Topical Word Count");
    builder.addIntermediateSchema(TopicalWordCount.getSchema());
    // We need to group the counts by (topic)
    builder.setGroupByFields("topic");
    // Then we need to sort by topic and count (DESC) -> This way we will receive the most relevant words first.
    builder.setOrderBy(new OrderBy().add("topic", Order.ASC).add("count", Order.DESC));
    // Note that we are changing the grouping logic in the job configuration,
    // However, as we work with tuples, we don't need to write specific code for grouping the same data differently,
    // Therefore an IdentityTupleMapper is sufficient for this Job.
    builder.addTupleInput(new Path(args[0]), new IdentityTupleMapper()); // Note the use of "addTupleInput"
    /*
     * TODO Add Combiner as same Reducer when possible
     */
    builder.setTupleOutput(new Path(args[1]), TopicalWordCount.getSchema());
    builder.addNamedTupleOutput(OUTPUT_TOTALCOUNT, getOutputCountSchema());
    builder.setTupleReducer(new TopNWords(n));


    builder.createJob().waitForCompletion(true);


    return 1;
  }
  
  public static void main(String[] args) throws Exception {
    ToolRunner.run(new TopicFingerprint(), args);
  }
}
// Source Code of com.datasalt.pangool.examples.topicalwordcount.TopicFingerprint

// Related Classes of com.datasalt.pangool.examples.topicalwordcount.TopicFingerprint