Examples of com.datasalt.pangool.tuplemr.OrderBy

com.datasalt.pangool.tuplemr.OrderBy
OrderBy is a convenience builder used by {@link TupleMRConfig} , similar to{@link Criteria}. The main difference is that {@link OrderBy} is mutableusing the concatenation pattern and allows to specify schemaOrder. The OrderBy instances are converted to immutable Criteria objects by {@link TupleMRConfig}.

    
    TupleMRBuilder mr = new TupleMRBuilder(conf,"Pangool Url Resolution");
    mr.addIntermediateSchema(getURLMapSchema());
    mr.addIntermediateSchema(getURLRegisterSchema());
    mr.setGroupByFields("url");
    mr.setOrderBy(new OrderBy().add("url", Order.ASC).addSchemaOrder(Order.ASC));
    mr.setTupleReducer(new Handler());
    mr.setOutput(new Path(output), new HadoopOutputFormat(TextOutputFormat.class), Text.class, NullWritable.class);
    mr.addInput(new Path(input1), new HadoopInputFormat(TextInputFormat.class), new UrlMapProcessor());
    mr.addInput(new Path(input2), new HadoopInputFormat(TextInputFormat.class), new UrlProcessor());
    mr.createJob().waitForCompletion(true);

View Full Code Here

    TupleMRBuilder mr = new TupleMRBuilder(conf, "Pangool Topical Word Count");
    mr.addIntermediateSchema(getSchema());
    mr.setGroupByFields("my_avro");
    //here the custom comparator that groups by "topic,word" is used. 
    MyAvroComparator customComp = new MyAvroComparator(getAvroSchema(),"topic","word");
    mr.setOrderBy(new OrderBy().add("my_avro",Order.ASC,customComp));
    mr.addInput(new Path(args[0]), new HadoopInputFormat(TextInputFormat.class), new TokenizeMapper());
    // We'll use a TupleOutputFormat with the same schema than the intermediate schema
    mr.setTupleOutput(new Path(args[1]), getSchema());
    mr.setTupleReducer(new CountReducer());
    mr.setTupleCombiner(new CountReducer());

View Full Code Here

    
    TupleMRBuilder mr = new TupleMRBuilder(conf, "AvroTweetsJoin");
    mr.addIntermediateSchema(getPangoolTweetSchema());
    mr.addIntermediateSchema(getPangoolRetweetSchema());
    mr.setGroupByFields("tweet_id");
    mr.setOrderBy(new OrderBy().add("tweet_id",Order.ASC).addSchemaOrder(Order.ASC));
    
    mr.addInput(tweetsPath,new AvroInputFormat<Record>(getAvroTweetSchema()),new TweetsMapper());
    mr.addInput(retweetsPath, new HadoopInputFormat(TextInputFormat.class), new RetweetsMapper());
    mr.setOutput(outputPath,new AvroOutputFormat<Record>(getAvroOutputSchema()),
        AvroWrapper.class,NullWritable.class);

View Full Code Here


    TupleMRBuilder mr = new TupleMRBuilder(conf, "AvroTweetsJoin");
    mr.addIntermediateSchema(getPangoolTweetSchema());
    mr.addIntermediateSchema(getPangoolRetweetSchema());
    mr.setGroupByFields("tweet_id");
    mr.setOrderBy(new OrderBy().add("tweet_id", Order.ASC).addSchemaOrder(Order.ASC));


    mr.addInput(tweetsPath, new AvroInputFormat<Record>(getAvroTweetSchema()), new TweetsMapper());
    mr.addInput(retweetsPath, new HadoopInputFormat(TextInputFormat.class), new RetweetsMapper());
    mr.setOutput(outputPath, new AvroOutputFormat<Record>(getAvroOutputSchema()), AvroWrapper.class,
        NullWritable.class);

View Full Code Here

    TupleMRBuilder mr = new TupleMRBuilder(conf, "Pangool Topical Word Count");
    mr.addIntermediateSchema(getSchema());
    mr.setGroupByFields("my_avro");
    // here the custom comparator that groups by "topic,word" is used.
    MyAvroComparator customComp = new MyAvroComparator(getAvroSchema(), "topic", "word");
    mr.setOrderBy(new OrderBy().add("my_avro", Order.ASC, customComp));
    mr.addInput(new Path(args[0]), new HadoopInputFormat(TextInputFormat.class), new TokenizeMapper());
    // We'll use a TupleOutputFormat with the same schema than the intermediate schema
    mr.setTupleOutput(new Path(args[1]), getSchema());
    mr.setTupleReducer(new CountReducer());
    mr.setTupleCombiner(new CountReducer());

View Full Code Here

    TupleMRBuilder mr = new TupleMRBuilder(conf, "Pangool Url Resolution");
    mr.addIntermediateSchema(getURLMapSchema());
    mr.addIntermediateSchema(getURLRegisterSchema());
    mr.setFieldAliases("urlMap", new Aliases().add("url", "nonCanonicalUrl"));
    mr.setGroupByFields("url");
    mr.setOrderBy(new OrderBy().add("url", Order.ASC).addSchemaOrder(Order.ASC));
    mr.setTupleReducer(new Handler());
    mr.setOutput(new Path(output), new HadoopOutputFormat(TextOutputFormat.class), Text.class,
        NullWritable.class);
    mr.addInput(new Path(input1), new HadoopInputFormat(TextInputFormat.class), new UrlMapProcessor());
    mr.addInput(new Path(input2), new HadoopInputFormat(TextInputFormat.class), new UrlProcessor());

View Full Code Here

    TupleMRBuilder mr = new TupleMRBuilder(conf,"Pangool Url Resolution");
    mr.addIntermediateSchema(getURLMapSchema());
    mr.addIntermediateSchema(getURLRegisterSchema());
    mr.setFieldAliases("urlMap",new Aliases().add("url","nonCanonicalUrl"));
    mr.setGroupByFields("url");
    mr.setOrderBy(new OrderBy().add("url", Order.ASC).addSchemaOrder(Order.ASC));
    mr.setTupleReducer(new Handler());
    mr.setOutput(new Path(output), new HadoopOutputFormat(TextOutputFormat.class), Text.class, NullWritable.class);
    mr.addInput(new Path(input1), new HadoopInputFormat(TextInputFormat.class), new UrlMapProcessor());
    mr.addInput(new Path(input2), new HadoopInputFormat(TextInputFormat.class), new UrlProcessor());
    mr.createJob().waitForCompletion(true);

View Full Code Here

    deleteOutput(output);
    
    TupleMRBuilder builder = new TupleMRBuilder(conf);
    builder.addIntermediateSchema(getSchema());
    builder.setGroupByFields("first");
    builder.setOrderBy(new OrderBy().add("first",Order.ASC).add("second",Order.ASC));
    // Input / output and such
    builder.setTupleReducer(new Handler());
    builder.setOutput(new Path(output), new HadoopOutputFormat(TextOutputFormat.class), Text.class, NullWritable.class);
    builder.addInput(new Path(input), new HadoopInputFormat(TextInputFormat.class), new IProcessor());
    builder.createJob().waitForCompletion(true);

View Full Code Here

    Schema schema = new Schema("schema", fields);


    TupleMRBuilder mr = new TupleMRBuilder(conf, "Pangool Secondary Sort");
    mr.addIntermediateSchema(schema);
    mr.setGroupByFields("intField", "strField");
    mr.setOrderBy(new OrderBy().add("intField", Order.ASC).add("strField", Order.ASC).add("longField", Order.ASC));
    mr.setTupleReducer(new Handler());
    mr.addInput(new Path(input), new HadoopInputFormat(TextInputFormat.class), new IProcessor());
    mr.setOutput(new Path(output), new HadoopOutputFormat(TextOutputFormat.class), Text.class,
        DoubleWritable.class);
    mr.createJob().waitForCompletion(true);

View Full Code Here

    Schema schema = new Schema("my_schema", fields);


    TupleMRBuilder mr = new TupleMRBuilder(conf);
    mr.addIntermediateSchema(schema);
    mr.setGroupByFields("location", "date", "hashtag");
    mr.setOrderBy(new OrderBy().add("location", Order.ASC).add("date", Order.ASC).add("hashtag", Order.ASC));
    mr.setRollupFrom("date");
    // Input / output and such
    mr.setTupleReducer(new TweetsHandler(n));
    mr.setOutput(new Path(output), new HadoopOutputFormat(TextOutputFormat.class), Text.class, NullWritable.class);
    mr.addInput(new Path(input), new HadoopInputFormat(TextInputFormat.class), new TweetsProcessor());

View Full Code Here

0 1 2 3 4

TOP

Related Classes of com.datasalt.pangool.tuplemr.OrderBy

com.datasalt.pangool.examples.avro.AvroCustomSerializationJob

com.datasalt.pangool.examples.avro.AvroTopicalWordCount

com.datasalt.pangool.examples.avro.AvroTweetsJoin

com.datasalt.pangool.examples.movingaverage.MovingAverage

com.datasalt.pangool.examples.secondarysort.SecondarySort

com.datasalt.pangool.examples.simplesecondarysort.SimpleSecondarySort

com.datasalt.pangool.examples.topicalwordcount.TopicFingerprint

com.datasalt.pangool.examples.topnhashtags.TopNHashTags

com.datasalt.pangool.examples.urlresolution.UrlResolution

com.datasalt.pangool.examples.useractivitynormalizer.UserActivityNormalizer

All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.