Package com.datasalt.pangool.examples.topicalwordcount

Source Code of com.datasalt.pangool.examples.topicalwordcount.TopicFingerprint

/**
* Copyright [2012] [Datasalt Systems S.L.]
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*   http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.datasalt.pangool.examples.topicalwordcount;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.util.ToolRunner;

import com.datasalt.pangool.examples.BaseExampleJob;
import com.datasalt.pangool.io.ITuple;
import com.datasalt.pangool.io.Schema;
import com.datasalt.pangool.io.Schema.Field;
import com.datasalt.pangool.io.Schema.Field.Type;
import com.datasalt.pangool.io.Tuple;
import com.datasalt.pangool.tuplemr.Criteria.Order;
import com.datasalt.pangool.tuplemr.IdentityTupleMapper;
import com.datasalt.pangool.tuplemr.OrderBy;
import com.datasalt.pangool.tuplemr.TupleMRBuilder;
import com.datasalt.pangool.tuplemr.TupleMRException;
import com.datasalt.pangool.tuplemr.TupleReducer;

/**
* This example uses the output of {@link TopicalWordCount} for calculating the "top n" words per each topic.
* It also outputs the total appearances of these words together into an extra (named) output. This example
* show how easy it is to perform secondary sort in Pangool as well as the use of "named outputs".
*/
public class TopicFingerprint extends BaseExampleJob {

  public final static String OUTPUT_TOTALCOUNT = "totalcount";
 
  @SuppressWarnings("serial")
  public static class TopNWords extends TupleReducer<ITuple, NullWritable> {

    int n; // We will emit as many words as this parameter (at maximum) for each group
    Tuple outputCountTuple;

    public TopNWords(int n) {
      this.n = n;
    }

    public void setup(TupleMRContext context, Collector collector) throws IOException, InterruptedException,
        TupleMRException {
      outputCountTuple = new Tuple(getOutputCountSchema());
    };

    public void reduce(ITuple group, Iterable<ITuple> tuples, TupleMRContext context, Collector collector)
        throws java.io.IOException, InterruptedException, TupleMRException {

      int totalCount = 0;
      Iterator<ITuple> iterator = tuples.iterator();
      for(int i = 0; i < n && iterator.hasNext(); i++) {
        ITuple tuple = iterator.next();
        collector.write(tuple, NullWritable.get());
        totalCount += (Integer) tuple.get("count");
      }
     
      outputCountTuple.set("topic", group.get("topic"));
      outputCountTuple.set("totalcount", totalCount);
      collector.getNamedOutput(OUTPUT_TOTALCOUNT).write(outputCountTuple, NullWritable.get());
    };
  }

  public TopicFingerprint() {
    super("Usage: TopicalWordCountWithStopWords [input_path] [output_path] [top_n_size]");
  }

  public static Schema getOutputCountSchema() {
    List<Field> fields = new ArrayList<Field>();
    fields.add(Field.create("topic", Type.INT));
    fields.add(Field.create("totalcount", Type.INT));
    return new Schema("outputcount", fields);
  }
 
  @Override
  public int run(String[] args) throws Exception {
    if(args.length != 3) {
      failArguments("Wrong number of arguments");
      return -1;
    }

    delete(args[1]);
    // Parse the size of the Top
    Integer n = Integer.parseInt(args[2]);

    TupleMRBuilder builder = new TupleMRBuilder(conf, "Pangool Topic Fingerprint From Topical Word Count");
    builder.addIntermediateSchema(TopicalWordCount.getSchema());
    // We need to group the counts by (topic)
    builder.setGroupByFields("topic");
    // Then we need to sort by topic and count (DESC) -> This way we will receive the most relevant words first.
    builder.setOrderBy(new OrderBy().add("topic", Order.ASC).add("count", Order.DESC));
    // Note that we are changing the grouping logic in the job configuration,
    // However, as we work with tuples, we don't need to write specific code for grouping the same data differently,
    // Therefore an IdentityTupleMapper is sufficient for this Job.
    builder.addTupleInput(new Path(args[0]), new IdentityTupleMapper()); // Note the use of "addTupleInput"
    /*
     * TODO Add Combiner as same Reducer when possible
     */
    builder.setTupleOutput(new Path(args[1]), TopicalWordCount.getSchema());
    builder.addNamedTupleOutput(OUTPUT_TOTALCOUNT, getOutputCountSchema());
    builder.setTupleReducer(new TopNWords(n));

    builder.createJob().waitForCompletion(true);

    return 1;
  }
 
  public static void main(String[] args) throws Exception {
    ToolRunner.run(new TopicFingerprint(), args);
  }
}
TOP

Related Classes of com.datasalt.pangool.examples.topicalwordcount.TopicFingerprint

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.