Package com.datasalt.pangool.tuplemr

Examples of com.datasalt.pangool.tuplemr.TupleMRBuilder$Input


    fields.add(Field.create("strField", Type.STRING));
    fields.add(Field.create("longField", Type.LONG));
    fields.add(Field.create("doubleField", Type.DOUBLE));
    Schema schema = new Schema("schema", fields);

    TupleMRBuilder mr = new TupleMRBuilder(conf, "Pangool Secondary Sort");
    mr.addIntermediateSchema(schema);
    mr.setGroupByFields("intField", "strField");
    mr.setOrderBy(new OrderBy().add("intField", Order.ASC).add("strField", Order.ASC).add("longField", Order.ASC));
    mr.setTupleReducer(new Handler());
    mr.addInput(new Path(input), new HadoopInputFormat(TextInputFormat.class), new IProcessor());
    mr.setOutput(new Path(output), new HadoopOutputFormat(TextOutputFormat.class), Text.class,
        DoubleWritable.class);
    mr.createJob().waitForCompletion(true);
    return 1;
  }
View Full Code Here


    }

    deleteOutput(args[1]);
    List<String> stopWords = Files.readLines(new File(args[2]), Charset.forName("UTF-8"));

    TupleMRBuilder cg = new TupleMRBuilder(conf, "Pangool Topical Word Count With Stop Words");
    cg.addIntermediateSchema(TopicalWordCount.getSchema());
    // We will count each (topicId, word) pair
    // Note that the order in which we defined the fields of the Schema is not relevant here
    cg.setGroupByFields("topic", "word");
    // Here we instantiate a mapper with stop words:
    // Note that we don't need to use the DistributedCache for that becasuse mappers, reducers, etc themselves are instantiable
    StopWordMapper mapper = new StopWordMapper(stopWords);
    cg.addInput(new Path(args[0]), new HadoopInputFormat(TextInputFormat.class), mapper);
    // We'll use a TupleOutputFormat with the same schema than the intermediate schema
    cg.setTupleOutput(new Path(args[1]), TopicalWordCount.getSchema());
    cg.setTupleReducer(new CountReducer());
    cg.setTupleCombiner(new CountReducer());

    cg.createJob().waitForCompletion(true);

    return 1;
  }
View Full Code Here

    List<Field> urlMapFields = new ArrayList<Field>();
    urlMapFields.add(Field.create("url",Type.STRING));
    urlMapFields.add(Field.create("canonicalUrl",Type.STRING));

    TupleMRBuilder mr = new TupleMRBuilder(conf,"Pangool Url Resolution");
    mr.addIntermediateSchema(new Schema("urlMap", urlMapFields));
    mr.addIntermediateSchema(new Schema("urlRegister", urlRegisterFields));
    mr.setGroupByFields("url");
    mr.setTupleReducer(new Handler());
    mr.setOutput(new Path(output), new HadoopOutputFormat(TextOutputFormat.class), Text.class, NullWritable.class);
    mr.addInput(new Path(input1), new HadoopInputFormat(TextInputFormat.class), new UrlMapProcessor());
    mr.addInput(new Path(input2), new HadoopInputFormat(TextInputFormat.class), new UrlProcessor());
    mr.createJob().waitForCompletion(true);

    return 1;
  }
View Full Code Here

      return -1;
    }

    deleteOutput(args[1]);

    TupleMRBuilder mr = new TupleMRBuilder(conf, "Pangool Topical Word Count");
    mr.addIntermediateSchema(getSchema());
    // We will count each (topicId, word) pair
    // Note that the order in which we defined the fields of the Schema is not relevant here
    mr.setGroupByFields("topic", "word");
    mr.addInput(new Path(args[0]), new HadoopInputFormat(TextInputFormat.class), new TokenizeMapper());
    // We'll use a TupleOutputFormat with the same schema than the intermediate schema
    mr.setTupleOutput(new Path(args[1]), getSchema());
    mr.setTupleReducer(new CountReducer());
    mr.setTupleCombiner(new CountReducer());

    mr.createJob().waitForCompletion(true);

    return 1;
  }
View Full Code Here

    String input = args[0];
    String output = args[1];

    delete(output);

    TupleMRBuilder builder = new TupleMRBuilder(conf);
    builder.addIntermediateSchema(getSchema());
    builder.setGroupByFields("first");
    builder.setOrderBy(new OrderBy().add("first", Order.ASC).add("second", Order.ASC));
    // Input / output and such
    builder.setTupleReducer(new Handler());
    builder.setOutput(new Path(output), new HadoopOutputFormat(TextOutputFormat.class), Text.class,
        NullWritable.class);
    builder.addInput(new Path(input), new HadoopInputFormat(TextInputFormat.class), new IProcessor());

    try {
      builder.createJob().waitForCompletion(true);
    } finally {
      builder.cleanUpInstanceFiles();
    }

    return 1;
  }
View Full Code Here

    fields.add(Field.create("all", Type.BOOLEAN));
    fields.add(Field.create("clicks", Type.INT));

    Schema schema = new Schema("my_schema", fields);

    TupleMRBuilder mr = new TupleMRBuilder(conf);
    mr.addIntermediateSchema(schema);
    mr.setGroupByFields("user", "all", "feature");
    mr.setOrderBy(new OrderBy().add("user", Order.ASC).add("all", Order.DESC).add("feature", Order.ASC));
    // Rollup from "user" - all features from same user will go to the same Reducer
    mr.setRollupFrom("user");
    // Input / output and such
    mr.setTupleCombiner(new CountCombinerHandler());
    mr.setTupleReducer(new NormalizingHandler());
    mr.setOutput(new Path(output), new HadoopOutputFormat(TextOutputFormat.class), Text.class,
        NullWritable.class);
    mr.addInput(new Path(input), new HadoopInputFormat(TextInputFormat.class),
        new UserActivityProcessor());

    try {
      mr.createJob().waitForCompletion(true);
    } finally {
      mr.cleanUpInstanceFiles();
    }
    return 1;
  }
View Full Code Here

    fields.add(Field.create("date", Type.STRING));
    fields.add(Field.create("hashtag", Type.STRING));
    fields.add(Field.create("count", Type.INT));
    Schema schema = new Schema("my_schema", fields);

    TupleMRBuilder mr = new TupleMRBuilder(conf);
    mr.addIntermediateSchema(schema);
    mr.setGroupByFields("location", "date", "hashtag");
    mr.setOrderBy(new OrderBy().add("location", Order.ASC).add("date", Order.ASC)
        .add("hashtag", Order.ASC));
    mr.setRollupFrom("date");
    // Input / output and such
    mr.setTupleReducer(new TweetsHandler(n));
    mr.setOutput(new Path(output), new HadoopOutputFormat(TextOutputFormat.class), Text.class,
        NullWritable.class);
    mr.addInput(new Path(input), new HadoopInputFormat(TextInputFormat.class), new TweetsProcessor());
    try {
      mr.createJob().waitForCompletion(true);
    } finally {
      mr.cleanUpInstanceFiles();
    }

    return 0;
  }
View Full Code Here

      failArguments("Invalid number of arguments");
      return -1;
    }
    delete(args[1]);

    TupleMRBuilder job = new TupleMRBuilder(conf);
    job.addIntermediateSchema(SCHEMA);
    job.setGroupByFields("line");

    String input = args[0], output = args[1];
    FileSystem fileSystem = FileSystem.get(conf);

    for(Category category : Category.values()) { // For each Category
      String categoryString = category.toString().toLowerCase();
      // Add the category, book title input spec with the associated CategoryMapper
      for(FileStatus fileStatus : fileSystem.listStatus(new Path(input + "/" + categoryString))) {
        job.addInput(fileStatus.getPath(), new HadoopInputFormat(TextInputFormat.class),
            new CategoryMapper(category, fileStatus.getPath().getName()));
      }
      // Add a named output for each category
      job.addNamedOutput(categoryString, new TupleSolrOutputFormat(new File(
          "src/test/resources/shakespeare-solr"), job.getConf()), ITuple.class, NullWritable.class);
    }
    job.setOutput(new Path(output), new HadoopOutputFormat(NullOutputFormat.class), ITuple.class,
        NullWritable.class);
    // The reducer will just emit the tuple to the corresponding Category output
    job.setTupleReducer(new TupleReducer<ITuple, NullWritable>() {

      ITuple outTuple = new Tuple(OUT_SCHEMA);

      public void reduce(ITuple group, Iterable<ITuple> tuples, TupleMRContext context,
          Collector collector) throws IOException, InterruptedException, TupleMRException {

        for(ITuple tuple : tuples) {
          Category category = (Category) tuple.get("category");
          outTuple.set("line", tuple.get("line"));
          outTuple.set("text", tuple.get("text"));
          outTuple.set("title", tuple.get("title"));
          collector.getNamedOutput(category.toString().toLowerCase())
              .write(outTuple, NullWritable.get());
        }
      }
    });

    try {
      Job hadoopJob = job.createJob();
      hadoopJob.waitForCompletion(true);
    } finally {
      job.cleanUpInstanceFiles();
    }
    return 0;
  }
View Full Code Here

        + iterations);

    // Define the intermediate schema: a pair of ints
    final Schema schema = new Schema("minMax", Fields.parse("min:int, max:int"));

    TupleMRBuilder job = new TupleMRBuilder(conf);
    job.addIntermediateSchema(schema);
    job.setGroupByFields("min", "max");
    job.setCustomPartitionFields("min");
    // Define the input and its associated mapper
    // The mapper will just emit the (min, max) pairs to the reduce stage
    job.addInput(new Path(input), new HadoopInputFormat(TextInputFormat.class),
        new TupleMapper<LongWritable, Text>() {

          Tuple tuple = new Tuple(schema);

          @Override
          public void map(LongWritable key, Text value, TupleMRContext context, Collector collector)
              throws IOException, InterruptedException {
            String[] fields = value.toString().split("\t");
            tuple.set("min", Integer.parseInt(fields[0]));
            tuple.set("max", Integer.parseInt(fields[1]));
            collector.write(tuple);
          }
        });

    // Define the reducer
    // The reducer will run as many games of life as (max - min) for each interval it receives
    // It will emit the inputs of GOL that converged together with the number of iterations
    // Note that inputs that produce grid overflow are ignored (but may have longer iteration convergence)
    job.setTupleReducer(new TupleReducer<Text, NullWritable>() {

      public void reduce(ITuple group, Iterable<ITuple> tuples, TupleMRContext context,
          Collector collector) throws IOException, InterruptedException, TupleMRException {

        int min = (Integer) group.get("min"), max = (Integer) group.get("max");
        for(int i = min; i < max; i++) {
          try {
            GameOfLife gameOfLife = new GameOfLife(gridSize, GameOfLife.longToBytes((long) i), maxX,
                maxY, iterations);
            while(true) {
              gameOfLife.nextCycle();
            }
          } catch(GameOfLifeException e) {
            context.getHadoopContext().progress();
            context.getHadoopContext().getCounter("stats", e.getCauseMessage() + "").increment(1);
            if(e.getCauseMessage().equals(CauseMessage.CONVERGENCE_REACHED)) {
              collector.write(
                  new Text(Arrays.toString(GameOfLife.longToBytes((long) i)) + "\t" + e.getIterations()),
                  NullWritable.get());
            }
          }
        }
      };
    });

    job.setOutput(new Path(output), new HadoopOutputFormat(TextOutputFormat.class), Text.class,
        NullWritable.class);
    try {
      job.createJob().waitForCompletion(true);
    } finally {
      job.cleanUpInstanceFiles();
    }
    delete(input);
    return 0;
  }
View Full Code Here

    fields.add(Field.create("strField", Type.STRING));
    fields.add(Field.create("longField", Type.LONG));
    fields.add(Field.create("doubleField", Type.DOUBLE));
    Schema schema = new Schema("schema", fields);

    TupleMRBuilder mr = new TupleMRBuilder(conf, "Pangool Secondary Sort");
    mr.addIntermediateSchema(schema);
    mr.setGroupByFields("intField", "strField");
    mr.setOrderBy(new OrderBy().add("intField", Order.ASC).add("strField", Order.ASC)
        .add("longField", Order.ASC));
    mr.setTupleReducer(new Handler());
    mr.addInput(new Path(input), new HadoopInputFormat(TextInputFormat.class), new IProcessor());
    mr.setOutput(new Path(output), new HadoopOutputFormat(TextOutputFormat.class), Text.class,
        DoubleWritable.class);

    try {
      mr.createJob().waitForCompletion(true);
    } finally {
      mr.cleanUpInstanceFiles();
    }
    return 1;
  }
View Full Code Here

TOP

Related Classes of com.datasalt.pangool.tuplemr.TupleMRBuilder$Input

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.