Package com.datasalt.pangool.tuplemr

Examples of com.datasalt.pangool.tuplemr.TupleMRBuilder$Input

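The snippets below are excerpts from the run() methods of Pangool example and test jobs. Each one shows a different way to configure a TupleMRBuilder: single and multiple intermediate schemas, custom group-by and sort criteria, combiners, named outputs, and pluggable input/output formats. The delete() calls appear to be a small helper from the examples that removes the output path before the job runs.

Below is a minimal sketch of the driver skeleton these excerpts presumably live in, assuming the standard Hadoop Tool pattern (the class name is hypothetical; each snippet would be the body of run()):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class MyPangoolJob extends Configured implements Tool {

  @Override
  public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    // ... configure and run the job with TupleMRBuilder, as in the snippets below ...
    return 0;
  }

  public static void main(String[] args) throws Exception {
    System.exit(ToolRunner.run(new MyPangoolJob(), args));
  }
}

The first example builds a "topic fingerprint": it groups topical word counts by topic, sorts each group by count in descending order, and keeps the top N words per topic, using TopNWords as both combiner and reducer. Besides the main tuple output it registers a named output for total counts.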

    delete(args[1]);
    // Parse N, the size of the top-word list
    Integer n = Integer.parseInt(args[2]);

    TupleMRBuilder builder = new TupleMRBuilder(conf,
        "Pangool Topic Fingerprint From Topical Word Count");
    builder.addIntermediateSchema(TopicalWordCount.getSchema());
    // We need to group the counts by (topic)
    builder.setGroupByFields("topic");
    // Then we sort by topic (ASC) and count (DESC), so the most frequent words arrive first.
    builder.setOrderBy(new OrderBy().add("topic", Order.ASC).add("count", Order.DESC));
    // Note that we change the grouping logic purely in the job configuration.
    // Because we work with tuples, no custom code is needed to group the same data
    // differently, so an IdentityTupleMapper is sufficient for this job.
    builder.addTupleInput(new Path(args[0]), new IdentityTupleMapper()); // Note the use of "addTupleInput"

    builder.setTupleOutput(new Path(args[1]), TopicalWordCount.getSchema());
    builder.addNamedTupleOutput(OUTPUT_TOTALCOUNT, getOutputCountSchema());
    builder.setTupleCombiner(new TopNWords(n));
    builder.setTupleReducer(new TopNWords(n));

    try {
      builder.createJob().waitForCompletion(true);
    } finally {
      builder.cleanUpInstanceFiles();
    }

    return 1;
  }
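
Several of these examples reference TopicalWordCount.getSchema(). A minimal sketch of what such a schema definition could look like, with field names inferred from the group-by and sort calls above (the exact types are an assumption):

  static Schema getSchema() {
    List<Field> fields = new ArrayList<Field>();
    fields.add(Field.create("topic", Type.INT));   // assumption: topics are int ids
    fields.add(Field.create("word", Type.STRING));
    fields.add(Field.create("count", Type.INT));
    return new Schema("topical_word_count", fields);
  }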


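The next example performs a reduce-side join of two heterogeneous inputs: Avro tweet records and text-encoded retweets. Two intermediate schemas are registered, tuples are grouped by tweet_id, and a schema order plus a schema-specific sort on username control the order in which the reducer sees them: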
    Path tweetsPath = new Path(args[0]);
    Path retweetsPath = new Path(args[1]);
    Path outputPath = new Path(args[2]);
    delete(outputPath.toString());

    TupleMRBuilder mr = new TupleMRBuilder(conf, "AvroTweetsJoin");
    mr.addIntermediateSchema(getPangoolTweetSchema());
    mr.addIntermediateSchema(getPangoolRetweetSchema());
    mr.setGroupByFields("tweet_id");
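    // addSchemaOrder sorts tuples of different schemas within a group by the order in which
    // the schemas were added; setSpecificOrderBy adds a secondary sort that applies only to
    // tuples of the "retweet" schema.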
    mr.setOrderBy(new OrderBy().add("tweet_id", Order.ASC).addSchemaOrder(Order.ASC));
    mr.setSpecificOrderBy("retweet", new OrderBy().add("username", Order.ASC));

    mr.addInput(tweetsPath, new AvroInputFormat<Record>(getAvroTweetSchema()), new TweetsMapper());
    mr.addInput(retweetsPath, new HadoopInputFormat(TextInputFormat.class), new RetweetsMapper());
    mr.setOutput(outputPath, new AvroOutputFormat<Record>(getAvroOutputSchema()), AvroWrapper.class,
        NullWritable.class);

    mr.setTupleReducer(new Red());

    try {
      Job job = mr.createJob();
      job.waitForCompletion(true);
    } finally {
      mr.cleanUpInstanceFiles();
    }

    return 0;
  }


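This example is a topical word count that filters out stop words in the mapper: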
    delete(args[1]);
    List<String> stopWords = Files.readLines(new File(args[2]), Charset.forName("UTF-8"));

    TupleMRBuilder cg = new TupleMRBuilder(conf, "Pangool Topical Word Count With Stop Words");
    cg.addIntermediateSchema(TopicalWordCount.getSchema());
    // We will count each (topicId, word) pair
    // Note that the order in which we defined the fields of the Schema is not relevant here
    cg.setGroupByFields("topic", "word");
    // Here we instantiate a mapper with the stop words.
    // Note that we don't need the DistributedCache for this, because mappers, reducers,
    // etc. are themselves instantiable in Pangool.
    StopWordMapper mapper = new StopWordMapper(stopWords);
    cg.addInput(new Path(args[0]), new HadoopInputFormat(TextInputFormat.class), mapper);
    // We'll use a TupleOutputFormat with the same schema as the intermediate schema
    cg.setTupleOutput(new Path(args[1]), TopicalWordCount.getSchema());
    cg.setTupleReducer(new CountReducer());
    cg.setTupleCombiner(new CountReducer());

    try {
      cg.createJob().waitForCompletion(true);
    } finally {
      cg.cleanUpInstanceFiles();
    }

    return 1;
  }

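The following example generates a Naive Bayes model by counting (word, category) pairs, with an inline TupleMapper and a single reducer implementation reused as the combiner: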
    String inputExamples = args[0];
    String output = args[1];
    delete(output);

    TupleMRBuilder job = new TupleMRBuilder(conf, "Naive Bayes Model Generator");
    job.addIntermediateSchema(INTERMEDIATE_SCHEMA);
    // perform per-category word count mapping
    job.addInput(new Path(inputExamples), new HadoopInputFormat(TextInputFormat.class),
        new TupleMapper<LongWritable, Text>() {

          ITuple tuple = new Tuple(INTERMEDIATE_SCHEMA);

          @Override
          public void map(LongWritable toIgnore, Text value, TupleMRContext context, Collector collector)
              throws IOException, InterruptedException {

            String[] parts = value.toString().split("\t");
            Category category = Category.valueOf(parts[0]);
            StringTokenizer itr = new StringTokenizer(parts[1]);
            tuple.set("category", category);
            tuple.set("count", 1);
            while(itr.hasMoreTokens()) {
              tuple.set("word", normalizeWord(itr.nextToken()));
              collector.write(tuple);
            }
          }
        });

    TupleReducer<ITuple, NullWritable> countReducer = new TupleReducer<ITuple, NullWritable>() {

      @Override
      public void reduce(ITuple group, Iterable<ITuple> tuples, TupleMRContext context,
          Collector collector) throws IOException, InterruptedException, TupleMRException {
        int count = 0;
        ITuple outputTuple = null;
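        // Sum the partial counts; reuse the last tuple of the group as the output tuple.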
        for(ITuple tuple : tuples) {
          count += (Integer) tuple.get("count");
          outputTuple = tuple;
        }
        outputTuple.set("count", count);
        collector.write(outputTuple, NullWritable.get());
      }
    };
    job.setTupleCombiner(countReducer);
    job.setTupleReducer(countReducer);
    job.setGroupByFields("word", "category");
    job.setTupleOutput(new Path(output), INTERMEDIATE_SCHEMA);
    try {
      if(job.createJob().waitForCompletion(true)) {
        return 1;
      }
    } finally {
      job.cleanUpInstanceFiles();
    }
    return -1;
  }


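The plain topical word count, without stop-word filtering: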
    delete(args[1]);

    TupleMRBuilder mr = new TupleMRBuilder(conf, "Pangool Topical Word Count");
    mr.addIntermediateSchema(getSchema());
    // We will count each (topicId, word) pair
    // Note that the order in which we defined the fields of the Schema is not relevant here
    mr.setGroupByFields("topic", "word");
    mr.addInput(new Path(args[0]), new HadoopInputFormat(TextInputFormat.class), new TokenizeMapper());
    // We'll use a TupleOutputFormat with the same schema as the intermediate schema
    mr.setTupleOutput(new Path(args[1]), getSchema());
    mr.setTupleReducer(new CountReducer());
    mr.setTupleCombiner(new CountReducer());

    try {
      mr.createJob().waitForCompletion(true);
    } finally {
      mr.cleanUpInstanceFiles();
    }

    return 1;
  }

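This example resolves non-canonical URLs against a URL map, joining two schemas on a common "url" field by means of a field alias: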
    String input1 = args[0];
    String input2 = args[1];
    String output = args[2];

    delete(output);

    TupleMRBuilder mr = new TupleMRBuilder(conf, "Pangool Url Resolution");
    mr.addIntermediateSchema(getURLMapSchema());
    mr.addIntermediateSchema(getURLRegisterSchema());
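    // Alias the "nonCanonicalUrl" field of the "urlMap" schema as "url", so both schemas
    // expose a common "url" field that can be referenced in setGroupByFields.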
    mr.setFieldAliases("urlMap", new Aliases().add("url", "nonCanonicalUrl"));
    mr.setGroupByFields("url");
    mr.setOrderBy(new OrderBy().add("url", Order.ASC).addSchemaOrder(Order.ASC));
    mr.setSpecificOrderBy("urlRegister", new OrderBy().add("timestamp", Order.ASC));
    mr.setTupleReducer(new Handler());
    mr.setOutput(new Path(output), new HadoopOutputFormat(TextOutputFormat.class), Text.class,
        NullWritable.class);
    mr.addInput(new Path(input1), new HadoopInputFormat(TextInputFormat.class), new UrlMapProcessor());
    mr.addInput(new Path(input2), new HadoopInputFormat(TextInputFormat.class), new UrlProcessor());

    try {
      mr.createJob().waitForCompletion(true);
    } finally {
      mr.cleanUpInstanceFiles();
    }

    return 1;
  }

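A moving-average job: visits are grouped by URL and sorted by date, so the reducer can compute an n-day rolling average per URL: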
    fields.add(Field.create("date", Type.STRING));
    fields.add(Field.create("visits", Type.INT));

    Schema schema = new Schema("my_schema", fields);

    TupleMRBuilder mr = new TupleMRBuilder(conf);
    mr.addIntermediateSchema(schema);
    mr.setGroupByFields("url");
    mr.setOrderBy(new OrderBy().add("url", Order.ASC).add("date", Order.ASC));
    // Input / output and such
    mr.setTupleReducer(new MovingAverageHandler(nDaysAverage));
    mr.setOutput(new Path(output), new HadoopOutputFormat(TextOutputFormat.class), Text.class,
        NullWritable.class);
    mr.addInput(new Path(input), new HadoopInputFormat(TextInputFormat.class), new URLVisitsProcessor());

    try {
      mr.createJob().waitForCompletion(true);
    } finally {
      mr.cleanUpInstanceFiles();
    }
    return 1;
  }

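This test builds a Splout SQL store: a partition field is appended to the tuple schema, and the output format is obtained from an OutputFormatFactory for the chosen engine: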
    List<Field> fields = new ArrayList<Field>();
    fields.addAll(tupleSchema1.getFields());
    fields.add(Field.create(SQLite4JavaOutputFormat.PARTITION_TUPLE_FIELD, Schema.Field.Type.INT));
    final Schema metaSchema1 = new Schema("schema1", fields);

    TupleMRBuilder builder = new TupleMRBuilder(new Configuration());
    builder.addIntermediateSchema(NullableSchema.nullableSchema(metaSchema1));

    builder.addInput(new Path(INPUT1), new HadoopInputFormat(TextInputFormat.class),
        new TupleMapper<LongWritable, Text>() {

          ITuple tupleInTuple1 = new Tuple(metaSchema1);

          @Override
          public void map(LongWritable key, Text value, TupleMRContext context, Collector collector)
              throws IOException, InterruptedException {
            String[] split = value.toString().split("\t");
            tupleInTuple1.set("a", split[0]);
            tupleInTuple1.set("b", Integer.parseInt(split[1]));
            tupleInTuple1.set(SploutSQLOutputFormat.PARTITION_TUPLE_FIELD, 0);
            collector.write(tupleInTuple1);
          }
        });

    TableSpec table1 = new TableSpec(tupleSchema1, tupleSchema1.getField(0));
      
    builder.setTupleReducer(new IdentityTupleReducer());
    builder.setGroupByFields(SploutSQLOutputFormat.PARTITION_TUPLE_FIELD);
    builder.setOutput(new Path(OUTPUT), OutputFormatFactory.getOutputFormat(engine, 10000, new TableSpec[] { table1 }),
        ITuple.class, NullWritable.class);

    Job job = builder.createJob();
    try {
      job.waitForCompletion(true);
    } finally {
      builder.cleanUpInstanceFiles();
    }
  }

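A fragment from a Splout SQL view-generation job in which a plain TupleOutputFormat is set instead of the SQLite output format: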
            } else {
              partitionMap = PartitionMap.oneShardOpenedMap();
            }
            writeOutputMetadata(conf);

            TupleMRBuilder builder = createMRBuilder(nPartitions, conf);
            // Set a TupleOutput here instead of SQLiteOutput
            builder.setOutput(new Path(outputPath, OUT_STORE), new TupleOutputFormat(tableSchema),
                ITuple.class, NullWritable.class);
            executeViewGeneration(builder);
          }
        };
      } else {

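Finally, a round-trip test of the text tuple formats: TupleTextInputFormat parses tab-separated text into tuples and TupleTextOutputFormat writes them back out, so an identity mapper and reducer should reproduce the input verbatim: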
    fields.add(Field.create("booleanField", Type.BOOLEAN));
    fields.add(Field.createEnum("enumField", TestEnum.class));

    Schema schema = new Schema("schema", fields);

    TupleMRBuilder builder = new TupleMRBuilder(conf);
    builder.addIntermediateSchema(schema);
    builder.setGroupByFields("strField1"); // but we don't care, really
    /*
     * Define the Input Format and the Output Format!
     */
    InputFormat inputFormat = new TupleTextInputFormat(schema, false, false, '\t',
        TupleTextOutputFormat.NO_QUOTE_CHARACTER, TupleTextOutputFormat.NO_ESCAPE_CHARACTER,
        FieldSelector.NONE, TupleTextInputFormat.NO_NULL_STRING);
    OutputFormat outputFormat = new TupleTextOutputFormat(schema, false, '\t',
        TupleTextOutputFormat.NO_QUOTE_CHARACTER, TupleTextOutputFormat.NO_ESCAPE_CHARACTER);

    builder.addInput(inPath, inputFormat, new IdentityTupleMapper());
    builder.setTupleReducer(new IdentityTupleReducer());
    builder.setOutput(outPath, outputFormat, ITuple.class, NullWritable.class);

    Job job = builder.createJob();
    try {
      assertRun(job);
    } finally {
      builder.cleanUpInstanceFiles();
    }

    Assert.assertEquals(line1 + "\n" + line2 + "\n" + line3,
        Files.toString(new File(OUT + "/" + "part-r-00000"), Charset.forName("UTF-8")).trim());

