Package com.datasalt.pangool.tuplemr

Examples of com.datasalt.pangool.tuplemr.MapOnlyJobBuilder
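MapOnlyJobBuilder is Pangool's builder for Hadoop jobs that run mappers but no reducer: you add one or more inputs, set a single output, and call createJob() to obtain a regular Hadoop Job. For orientation, here is a minimal, self-contained sketch of such a job; the class name, paths, and upper-casing logic are illustrative only, and the import locations assume Pangool's usual package layout:

    import java.io.IOException;

    import org.apache.hadoop.conf.Configured;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.NullWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
    import org.apache.hadoop.util.Tool;
    import org.apache.hadoop.util.ToolRunner;

    import com.datasalt.pangool.tuplemr.MapOnlyJobBuilder;
    import com.datasalt.pangool.tuplemr.mapred.MapOnlyMapper;
    import com.datasalt.pangool.tuplemr.mapred.lib.input.HadoopInputFormat;
    import com.datasalt.pangool.tuplemr.mapred.lib.output.HadoopOutputFormat;

    public class UpperCaseJob extends Configured implements Tool {

      public int run(String[] args) throws Exception {
        MapOnlyJobBuilder builder = new MapOnlyJobBuilder(getConf());
        // Bind an anonymous mapper to the (single) text input.
        builder.addInput(new Path(args[0]), new HadoopInputFormat(TextInputFormat.class),
            new MapOnlyMapper<LongWritable, Text, Text, NullWritable>() {
              protected void map(LongWritable key, Text value, Context context)
                  throws IOException, InterruptedException {
                // Emit every input line upper-cased; no reducer runs afterwards.
                context.write(new Text(value.toString().toUpperCase()), NullWritable.get());
              }
            });
        builder.setOutput(new Path(args[1]), new HadoopOutputFormat(TextOutputFormat.class),
            Text.class, NullWritable.class);
        try {
          return builder.createJob().waitForCompletion(true) ? 0 : 1;
        } finally {
          // Remove the serialized mapper instance files the builder created.
          builder.cleanUpInstanceFiles();
        }
      }

      public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new UpperCaseJob(), args));
      }
    }

The excerpts below are taken from run() methods of projects that use the builder in practice.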


    // Excerpt: map-only job that appends a classification label to every input line.
    // classify() and init() (which loads the trained model) are defined in the full source.
    String output = args[2];
    delete(output);
   
    init(conf, new Path(modelFolder));
   
    MapOnlyJobBuilder job = new MapOnlyJobBuilder(conf);
    job.setOutput(new Path(output), new HadoopOutputFormat(TextOutputFormat.class), Text.class, NullWritable.class);
    job.addInput(new Path(input), new HadoopInputFormat(TextInputFormat.class), new MapOnlyMapper<LongWritable, Text, Text, NullWritable>() {
      protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        value.set(value.toString() + "\t" + classify(value.toString()));
        context.write(value, NullWritable.get());
      }
    });
    job.createJob().waitForCompletion(true);
   
    return 1;
  }


    // Excerpt: map-only grep; GrepHandler is a mapper that filters input lines against a regex.
    String input = args[1];
    String output = args[2];
   
    delete(output);
   
    MapOnlyJobBuilder b = new MapOnlyJobBuilder(conf);
    b.setOutput(new Path(output), new HadoopOutputFormat(TextOutputFormat.class), Text.class, NullWritable.class);
    b.addInput(new Path(input), new HadoopInputFormat(TextInputFormat.class), new GrepHandler(regex));
    b.createJob().waitForCompletion(true);
   
    return 0;
  }
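GrepHandler itself is not shown on this page. A plausible sketch of such a mapper follows; the class body and the lazy regex compilation are assumptions, not the project's actual code:

    import java.io.IOException;
    import java.util.regex.Pattern;

    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.NullWritable;
    import org.apache.hadoop.io.Text;

    import com.datasalt.pangool.tuplemr.mapred.MapOnlyMapper;

    // Hypothetical grep mapper: emits only the lines that match the given regex.
    public class GrepHandler extends MapOnlyMapper<LongWritable, Text, Text, NullWritable> {

      private String regex;
      private transient Pattern pattern;

      public GrepHandler(String regex) {
        this.regex = regex;
      }

      protected void map(LongWritable key, Text value, Context context)
          throws IOException, InterruptedException {
        if (pattern == null) {
          // Compile lazily so the mapper works after instance serialization.
          pattern = Pattern.compile(regex);
        }
        if (pattern.matcher(value.toString()).find()) {
          context.write(value, NullWritable.get());
        }
      }
    }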


    // Excerpt: the same grep job, wiring the mapper through setMapper() instead of addInput().
    String input = args[1];
    String output = args[2];
   
    delete(output);
   
    MapOnlyJobBuilder b = new MapOnlyJobBuilder(conf);
    b.setMapper(new GrepHandler(regex));
    b.setOutput(new Path(output), new HadoopOutputFormat(TextOutputFormat.class), Text.class, NullWritable.class);
    b.addInput(new Path(input), new HadoopInputFormat(TextInputFormat.class));
    b.createJob().waitForCompletion(true);
   
    return 0;
  }
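These excerpts show the two ways a mapper can be wired into a MapOnlyJobBuilder; both lines appear verbatim in the snippets above:

    // Style 1: bind the mapper to a specific input (three-argument addInput).
    b.addInput(new Path(input), new HadoopInputFormat(TextInputFormat.class), new GrepHandler(regex));

    // Style 2: register one job-wide mapper, then add inputs without one.
    b.setMapper(new GrepHandler(regex));
    b.addInput(new Path(input), new HadoopInputFormat(TextInputFormat.class));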

    // Excerpt: the classification job again, this time registering the mapper with setMapper().
    String output = args[2];
    delete(output);
   
    init(conf, new Path(modelFolder));
   
    MapOnlyJobBuilder job = new MapOnlyJobBuilder(conf);
    job.setMapper(new MapOnlyMapper<LongWritable, Text, Text, NullWritable>() {
      protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        value.set(value.toString() + "\t" + classify(value.toString()));
        context.write(value, NullWritable.get());
      }
    });
    job.setOutput(new Path(output), new HadoopOutputFormat(TextOutputFormat.class), Text.class, NullWritable.class);
    job.addInput(new Path(input), new HadoopInputFormat(TextInputFormat.class));
    job.createJob().waitForCompletion(true);
   
    return 1;
  }

    // Excerpt: the grep job once more; note the deleteOutput() helper used in this version.
    String input = args[1];
    String output = args[2];
   
    deleteOutput(output);
   
    MapOnlyJobBuilder b = new MapOnlyJobBuilder(conf);
    b.setMapper(new GrepHandler(regex));
    b.setOutput(new Path(output), new HadoopOutputFormat(TextOutputFormat.class), Text.class, NullWritable.class);
    b.addInput(new Path(input), new HadoopInputFormat(TextInputFormat.class));
    b.createJob().waitForCompletion(true);
   
    return 0;
  }

    // Excerpt: integration test that reads a Hive table through HCatTupleInputFormat
    // and prints every tuple it receives.
    String dbName = args[0];
    String tableName = args[1];
   
    MapOnlyJobBuilder builder = new MapOnlyJobBuilder(conf, "HCatTupleInputFormat Integration Test");
    // input path can't be null in Pangool so we enter anything
    builder.addInput(new Path("anything"), new HCatTupleInputFormat(dbName, tableName, conf), new MapOnlyMapper<ITuple, NullWritable, NullWritable, NullWritable>() {
     
      protected void map(ITuple key, NullWritable value, Context context) throws IOException, InterruptedException {
        System.out.println(key.toString());
      }
    });
    builder.setOutput(new Path(HCatTupleInputFormat.class + "-out"), new HadoopOutputFormat(NullOutputFormat.class), NullWritable.class, NullWritable.class);
    builder.createJob().waitForCompletion(true);
    return 1;
  }
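The mapper above only prints key.toString(). To get at individual fields, the tuple's Schema can be walked; a sketch of a replacement map() body, assuming Pangool's standard ITuple/Schema accessors:

    // Print each field of the tuple as a name/value pair.
    protected void map(ITuple key, NullWritable value, Context context)
        throws IOException, InterruptedException {
      for (int i = 0; i < key.getSchema().getFields().size(); i++) {
        String fieldName = key.getSchema().getFields().get(i).getName();
        System.out.println(fieldName + " = " + key.get(i));
      }
    }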

    // Excerpt: the grep job with try/finally, so the builder's serialized instance files
    // are cleaned up even if the job fails.
    String input = args[1];
    String output = args[2];
   
    delete(output);
   
    MapOnlyJobBuilder b = new MapOnlyJobBuilder(conf);
    b.setOutput(new Path(output), new HadoopOutputFormat(TextOutputFormat.class), Text.class, NullWritable.class);
    b.addInput(new Path(input), new HadoopInputFormat(TextInputFormat.class), new GrepHandler(regex));
    Job job = b.createJob();
    try {
      job.waitForCompletion(true);
    } finally {
      b.cleanUpInstanceFiles();
    }
   
    return 0;
  }

   
    // Excerpt: scraping user reviews out of crawled HTML pages and writing them to MongoDB.
    delete(outPath);

    // We only need to execute a Map-only job for this task.
    // Every map will process an HTML file and extract the reviews from it.
    MapOnlyJobBuilder builder = new MapOnlyJobBuilder(conf);

    builder.addInput(new Path(inputFolder), new HadoopInputFormat(TextInputFormat.class),
        new MapOnlyMapper<LongWritable, Text, Text, BSONObject>() {

          StringBuffer inMemoryHtml = new StringBuffer();

          @Override
          protected void map(LongWritable key, Text value, Context context) throws IOException,
              InterruptedException {
            // for every line in the HTML just add it to a string buffer
            // we will process the entire HTML in the end (cleanup())
            inMemoryHtml.append(value.toString());
          }

          @Override
          protected void cleanup(Context context, MultipleOutputsCollector coll) throws IOException, InterruptedException {
            String html = inMemoryHtml.toString();

            Matcher startMatcher = startPattern.matcher(html);
            Matcher endMatcher = endPattern.matcher(html);

            Text documentId = new Text();
           
            Matcher placeMatcher = placePattern.matcher(html);
            // we assume this will always match - otherwise fail fast!
            placeMatcher.find();
            String placeId = placeMatcher.group(1);

            // Now we will proceed as follows:
            // We create a regex matcher for start of reviews and end of reviews
            // Within each (start, end) pair, we will execute an arbitrary number of matchers
            // for matching all the other properties (username, date, rating, review text...).
            // finally we add all the properties to a Mongo BSONObject that can be used as output.
            while(startMatcher.find()) {
              BSONObject review = new BasicBSONObject();
              review.put("place_id", placeId);
              int reviewStart = startMatcher.start();
              endMatcher.find();
              int reviewEnd = endMatcher.start();

              // Focus only on (start, end) text for this review
              String reviewText = html.substring(reviewStart, reviewEnd);
             
              for(Map.Entry<String, Pattern> parsingProperty : parsingConfig.entrySet()) {
                Matcher matcher = parsingProperty.getValue().matcher(reviewText);
                if(matcher.find()) {
                  review.put(parsingProperty.getKey(), matcher.group(1).trim());
                }
              }
             
              // The Mongo documentId of the review will be the review_id.
              documentId.set((String) review.get("review_id"));
              // Write the pair (Id, document) to the output collector.
              context.write(documentId, review);
            }
          }
        });

    // --- This is the most important part (what makes it work with MongoDB): ---
    // Set the URI of the MongoDB instance we will write to. Here we specify the DB and the final collection.
    MongoConfigUtil.setOutputURI(conf, "mongodb://localhost/test.qype");
    // Set the output format to HadoopOutputFormat(MongoOutputFormat.class).
    // The key will be the document id for each Mongo record and the value a BSONObject with all the properties we want.
    builder.setOutput(new Path(outPath), new HadoopOutputFormat(MongoOutputFormat.class), Text.class,
        BSONObject.class);

    // Finally, build and execute the Pangool Job.
    try {
      builder.createJob().waitForCompletion(true);
    } finally {
      builder.cleanUpInstanceFiles();
    }
   
    // we are not interested in the output folder, so delete it
    delete(outPath);
   
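The fields the mapper relies on (startPattern, endPattern, placePattern, and the parsingConfig map) are declared elsewhere in the full class. Hypothetically, using java.util.regex.Pattern and java.util.HashMap, the declarations would look something like this; the regexes below are placeholders, not the scraper's real expressions:

    // Hypothetical regex configuration for the review scraper above.
    static Pattern startPattern = Pattern.compile("<div class=\"review\">");
    static Pattern endPattern   = Pattern.compile("</div><!-- /review -->");
    static Pattern placePattern = Pattern.compile("/places/(\\d+)");

    static Map<String, Pattern> parsingConfig = new HashMap<String, Pattern>();
    static {
      // One capturing group per review property; group(1) is stored under the key.
      parsingConfig.put("review_id", Pattern.compile("id=\"review-(\\d+)\""));
      parsingConfig.put("user_name", Pattern.compile("class=\"user\">([^<]+)<"));
      parsingConfig.put("rating",    Pattern.compile("class=\"rating\">([0-9.]+)<"));
    }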

    // Excerpt: the classification job, also cleaning up instance files in a finally block.
    String output = args[2];
    delete(output);
   
    init(conf, new Path(modelFolder));
   
    MapOnlyJobBuilder job = new MapOnlyJobBuilder(conf);
    job.setOutput(new Path(output), new HadoopOutputFormat(TextOutputFormat.class), Text.class, NullWritable.class);
    job.addInput(new Path(input), new HadoopInputFormat(TextInputFormat.class), new MapOnlyMapper<LongWritable, Text, Text, NullWritable>() {
      protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        value.set(value.toString() + "\t" + classify(value.toString()));
        context.write(value, NullWritable.get());
      }
    });
    Job j = job.createJob();
    try {
      j.waitForCompletion(true);
    } finally {
      job.cleanUpInstanceFiles();
    }
   
    return 1;
  }
