Examples of org.apache.hadoop.conf.Configurable

org.apache.hadoop.conf.Configurable
Something that may be configured with a {@link Configuration}.

/** Create and run the Bayes Trainer. */
public class BayesWeightSummerDriver implements BayesJob {


  @Override
  public void runJob(Path input, Path output, BayesParameters params) throws IOException {
    Configurable client = new JobClient();
    JobConf conf = new JobConf(BayesWeightSummerDriver.class);
    conf.setJobName("Bayes Weight Summer Driver running over input: " + input);
    
    conf.setOutputKeyClass(StringTuple.class);
    conf.setOutputValueClass(DoubleWritable.class);
    
    FileInputFormat.addInputPath(conf, new Path(output, "trainer-tfIdf/trainer-tfIdf"));
    Path outPath = new Path(output, "trainer-weights");
    FileOutputFormat.setOutputPath(conf, outPath);
    HadoopUtil.overwriteOutput(outPath);
    // conf.setNumReduceTasks(1);
    // conf.setNumMapTasks(100);
    conf.setMapperClass(BayesWeightSummerMapper.class);
    // see the javadoc for the spec for file input formats: first token is key,
    // rest is input. Whole document on one line
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setCombinerClass(BayesWeightSummerReducer.class);
    conf.setReducerClass(BayesWeightSummerReducer.class);
    conf.setOutputFormat(BayesWeightSummerOutputFormat.class);


    conf.set("bayes.parameters", params.toString());
    conf.set("output.table", output.toString());
    
    client.setConf(conf);
    
    JobClient.runJob(conf);
  }

View Full Code Here

/** Create and run the Bayes Feature Reader Step. */
public class BayesFeatureDriver implements BayesJob {


  @Override
  public void runJob(Path input, Path output, BayesParameters params) throws IOException {
    Configurable client = new JobClient();
    JobConf conf = new JobConf(BayesFeatureDriver.class);
    conf.setJobName("Bayes Feature Driver running over input: " + input);
    conf.setOutputKeyClass(StringTuple.class);
    conf.setOutputValueClass(DoubleWritable.class);
    conf.setPartitionerClass(FeaturePartitioner.class);
    conf.setOutputKeyComparatorClass(FeatureLabelComparator.class);
    FileInputFormat.setInputPaths(conf, input);
    FileOutputFormat.setOutputPath(conf, output);
    
    conf.setMapperClass(BayesFeatureMapper.class);
    
    conf.setInputFormat(KeyValueTextInputFormat.class);
    conf.setCombinerClass(BayesFeatureCombiner.class);
    conf.setReducerClass(BayesFeatureReducer.class);
    conf.setOutputFormat(BayesFeatureOutputFormat.class);
    conf.set("io.serializations",
          "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization");
    // this conf parameter needs to be set enable serialisation of conf values


    HadoopUtil.overwriteOutput(output);
    conf.set("bayes.parameters", params.toString());
    
    client.setConf(conf);
    JobClient.runJob(conf);
    
  }

View Full Code Here

  private static final Logger log = LoggerFactory.getLogger(BayesTfIdfDriver.class);


  @Override
  public void runJob(Path input, Path output, BayesParameters params) throws IOException {
    
    Configurable client = new JobClient();
    JobConf conf = new JobConf(BayesWeightSummerDriver.class);
    conf.setJobName("TfIdf Driver running over input: " + input);
    
    conf.setOutputKeyClass(StringTuple.class);
    conf.setOutputValueClass(DoubleWritable.class);
    
    FileInputFormat.addInputPath(conf, new Path(output, "trainer-termDocCount"));
    FileInputFormat.addInputPath(conf, new Path(output, "trainer-wordFreq"));
    FileInputFormat.addInputPath(conf, new Path(output, "trainer-featureCount"));
    Path outPath = new Path(output, "trainer-tfIdf");
    FileOutputFormat.setOutputPath(conf, outPath);
    
    // conf.setNumMapTasks(100);
    
    conf.setJarByClass(BayesTfIdfDriver.class);
    
    conf.setMapperClass(BayesTfIdfMapper.class);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setCombinerClass(BayesTfIdfReducer.class);
    
    conf.setReducerClass(BayesTfIdfReducer.class);
    
    conf.setOutputFormat(BayesTfIdfOutputFormat.class);
    
    conf
        .set("io.serializations",
          "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization");
    // Dont ever forget this. People should keep track of how hadoop conf
    // parameters and make or break a piece of code
    
    FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
    HadoopUtil.overwriteOutput(outPath);
    
    Path interimFile = new Path(output, "trainer-docCount/part-*");
    
    Map<String,Double> labelDocumentCounts = SequenceFileModelReader.readLabelDocumentCounts(dfs,
      interimFile, conf);
    
    DefaultStringifier<Map<String,Double>> mapStringifier = new DefaultStringifier<Map<String,Double>>(conf,
        GenericsUtil.getClass(labelDocumentCounts));
    
    String labelDocumentCountString = mapStringifier.toString(labelDocumentCounts);
    log.info("Counts of documents in Each Label");
    Map<String,Double> c = mapStringifier.fromString(labelDocumentCountString);
    log.info("{}", c);
    
    conf.set("cnaivebayes.labelDocumentCounts", labelDocumentCountString);
    log.info(params.print());
    if (params.get("dataSource").equals("hbase")) {
      String tableName = output.toString();
      HBaseConfiguration hc = new HBaseConfiguration(new Configuration());
      HTableDescriptor ht = new HTableDescriptor(tableName);
      HColumnDescriptor hcd = new HColumnDescriptor(BayesConstants.HBASE_COLUMN_FAMILY + ':');
      hcd.setBloomfilter(true);
      hcd.setInMemory(true);
      hcd.setMaxVersions(1);
      hcd.setBlockCacheEnabled(true);
      ht.addFamily(hcd);
      
      log.info("Connecting to hbase...");
      HBaseAdmin hba = new HBaseAdmin(hc);
      log.info("Creating Table {}", output);
      
      if (hba.tableExists(tableName)) {
        hba.disableTable(tableName);
        hba.deleteTable(tableName);
        hba.majorCompact(".META.");
      }
      hba.createTable(ht);
      conf.set("output.table", tableName);
    }
    conf.set("bayes.parameters", params.toString());
    
    client.setConf(conf);
    
    JobClient.runJob(conf);
  }

View Full Code Here

   * 
   * @param params
   *          The Job parameters containing the gramSize, input output folders, defaultCat, encoding
   */
  public static void runJob(Parameters params) throws IOException {
    Configurable client = new JobClient();
    JobConf conf = new JobConf(BayesClassifierDriver.class);
    conf.setJobName("Bayes Classifier Driver running over input: " + params.get("testDirPath"));
    conf.setOutputKeyClass(StringTuple.class);
    conf.setOutputValueClass(DoubleWritable.class);
    
    FileInputFormat.setInputPaths(conf, new Path(params.get("testDirPath")));
    Path outPath = new Path(params.get("testDirPath") + "-output");
    FileOutputFormat.setOutputPath(conf, outPath);
    
    conf.setInputFormat(KeyValueTextInputFormat.class);
    conf.setMapperClass(BayesClassifierMapper.class);
    conf.setCombinerClass(BayesClassifierReducer.class);
    conf.setReducerClass(BayesClassifierReducer.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
                                  + "org.apache.hadoop.io.serializer.WritableSerialization");
    
    HadoopUtil.overwriteOutput(outPath);
    conf.set("bayes.parameters", params.toString());
    
    client.setConf(conf);
    JobClient.runJob(conf);
    
    Path outputFiles = new Path(outPath, "part*");
    FileSystem dfs = FileSystem.get(outPath.toUri(), conf);    
    ConfusionMatrix matrix = readResult(dfs, outputFiles, conf, params);

View Full Code Here

  
  private static final Logger log = LoggerFactory.getLogger(BayesThetaNormalizerDriver.class);


  @Override
  public void runJob(Path input, Path output, BayesParameters params) throws IOException {
    Configurable client = new JobClient();
    JobConf conf = new JobConf(BayesThetaNormalizerDriver.class);
    
    conf.setJobName("Bayes Theta Normalizer Driver running over input: " + input);
    
    conf.setOutputKeyClass(StringTuple.class);
    conf.setOutputValueClass(DoubleWritable.class);
    FileInputFormat.addInputPath(conf, new Path(output, "trainer-tfIdf/trainer-tfIdf"));
    Path outPath = new Path(output, "trainer-thetaNormalizer");
    FileOutputFormat.setOutputPath(conf, outPath);
    // conf.setNumMapTasks(100);
    // conf.setNumReduceTasks(1);
    conf.setMapperClass(BayesThetaNormalizerMapper.class);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setCombinerClass(BayesThetaNormalizerReducer.class);
    conf.setReducerClass(BayesThetaNormalizerReducer.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
                                  + "org.apache.hadoop.io.serializer.WritableSerialization");
    // Dont ever forget this. People should keep track of how hadoop conf
    // parameters and make or break a piece of code
    
    HadoopUtil.overwriteOutput(outPath);
    FileSystem dfs = FileSystem.get(outPath.toUri(), conf);    
    
    Path sigmaKFiles = new Path(output, "trainer-weights/Sigma_k/*");
    Map<String,Double> labelWeightSum = SequenceFileModelReader.readLabelSums(dfs, sigmaKFiles, conf);
    DefaultStringifier<Map<String,Double>> mapStringifier = new DefaultStringifier<Map<String,Double>>(conf,
        GenericsUtil.getClass(labelWeightSum));
    String labelWeightSumString = mapStringifier.toString(labelWeightSum);
    
    log.info("Sigma_k for Each Label");
    Map<String,Double> c = mapStringifier.fromString(labelWeightSumString);
    log.info("{}", c);
    conf.set("cnaivebayes.sigma_k", labelWeightSumString);
    
    Path sigmaJSigmaKFile = new Path(output, "trainer-weights/Sigma_kSigma_j/*");
    double sigmaJSigmaK = SequenceFileModelReader.readSigmaJSigmaK(dfs, sigmaJSigmaKFile, conf);
    DefaultStringifier<Double> stringifier = new DefaultStringifier<Double>(conf, Double.class);
    String sigmaJSigmaKString = stringifier.toString(sigmaJSigmaK);
    
    log.info("Sigma_kSigma_j for each Label and for each Features");
    double retSigmaJSigmaK = stringifier.fromString(sigmaJSigmaKString);
    log.info("{}", retSigmaJSigmaK);
    conf.set("cnaivebayes.sigma_jSigma_k", sigmaJSigmaKString);
    
    Path vocabCountFile = new Path(output, "trainer-tfIdf/trainer-vocabCount/*");
    double vocabCount = SequenceFileModelReader.readVocabCount(dfs, vocabCountFile, conf);
    String vocabCountString = stringifier.toString(vocabCount);
    
    log.info("Vocabulary Count");
    conf.set("cnaivebayes.vocabCount", vocabCountString);
    double retvocabCount = stringifier.fromString(vocabCountString);
    log.info("{}", retvocabCount);
    conf.set("bayes.parameters", params.toString());
    conf.set("output.table", output.toString());
    client.setConf(conf);
    
    JobClient.runJob(conf);
    
  }

View Full Code Here

  
  private static final Logger log = LoggerFactory.getLogger(CBayesThetaNormalizerDriver.class);


  @Override
  public void runJob(Path input, Path output, BayesParameters params) throws IOException {
    Configurable client = new JobClient();
    JobConf conf = new JobConf(CBayesThetaNormalizerDriver.class);
    conf.setJobName("Complementary Bayes Theta Normalizer Driver running over input: " + input);
    
    conf.setOutputKeyClass(StringTuple.class);
    conf.setOutputValueClass(DoubleWritable.class);
    FileInputFormat.addInputPath(conf, new Path(output, "trainer-weights/Sigma_j"));
    FileInputFormat.addInputPath(conf, new Path(output, "trainer-tfIdf/trainer-tfIdf"));
    Path outPath = new Path(output, "trainer-thetaNormalizer");
    FileOutputFormat.setOutputPath(conf, outPath);
    // conf.setNumMapTasks(100);
    // conf.setNumReduceTasks(1);
    conf.setMapperClass(CBayesThetaNormalizerMapper.class);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setCombinerClass(CBayesThetaNormalizerReducer.class);
    conf.setReducerClass(CBayesThetaNormalizerReducer.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf
        .set("io.serializations",
          "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization");
    // Dont ever forget this. People should keep track of how hadoop conf
    // parameters and make or break a piece of code
    
    FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
    HadoopUtil.overwriteOutput(outPath);
    
    Path sigmaKFiles = new Path(output, "trainer-weights/Sigma_k/*");
    Map<String,Double> labelWeightSum = SequenceFileModelReader.readLabelSums(dfs, sigmaKFiles, conf);
    DefaultStringifier<Map<String,Double>> mapStringifier = new DefaultStringifier<Map<String,Double>>(conf,
        GenericsUtil.getClass(labelWeightSum));
    String labelWeightSumString = mapStringifier.toString(labelWeightSum);
    
    log.info("Sigma_k for Each Label");
    Map<String,Double> c = mapStringifier.fromString(labelWeightSumString);
    log.info("{}", c);
    conf.set("cnaivebayes.sigma_k", labelWeightSumString);
    
    Path sigmaKSigmaJFile = new Path(output, "trainer-weights/Sigma_kSigma_j/*");
    double sigmaJSigmaK = SequenceFileModelReader.readSigmaJSigmaK(dfs, sigmaKSigmaJFile, conf);
    DefaultStringifier<Double> stringifier = new DefaultStringifier<Double>(conf, Double.class);
    String sigmaJSigmaKString = stringifier.toString(sigmaJSigmaK);
    
    log.info("Sigma_kSigma_j for each Label and for each Features");
    double retSigmaJSigmaK = stringifier.fromString(sigmaJSigmaKString);
    log.info("{}", retSigmaJSigmaK);
    conf.set("cnaivebayes.sigma_jSigma_k", sigmaJSigmaKString);
    
    Path vocabCountFile = new Path(output, "trainer-tfIdf/trainer-vocabCount/*");
    double vocabCount = SequenceFileModelReader.readVocabCount(dfs, vocabCountFile, conf);
    String vocabCountString = stringifier.toString(vocabCount);
    
    log.info("Vocabulary Count");
    conf.set("cnaivebayes.vocabCount", vocabCountString);
    double retvocabCount = stringifier.fromString(vocabCountString);
    log.info("{}", retvocabCount);
    conf.set("bayes.parameters", params.toString());
    conf.set("output.table", output.toString());
    client.setConf(conf);
    
    JobClient.runJob(conf);
    
  }

View Full Code Here

                            String measureClassName,
                            double t1,
                            double t2,
                            double convergenceDelta) {
    
    Configurable client = new JobClient();
    JobConf conf = new JobConf(MeanShiftCanopyDriver.class);
    
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(MeanShiftCanopy.class);
    
    FileInputFormat.setInputPaths(conf, new Path(input));
    Path outPath = new Path(output);
    FileOutputFormat.setOutputPath(conf, outPath);
    
    conf.setMapperClass(MeanShiftCanopyMapper.class);
    conf.setReducerClass(MeanShiftCanopyReducer.class);
    conf.setNumReduceTasks(1);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.set(MeanShiftCanopyConfigKeys.DISTANCE_MEASURE_KEY, measureClassName);
    conf.set(MeanShiftCanopyConfigKeys.CLUSTER_CONVERGENCE_KEY, String.valueOf(convergenceDelta));
    conf.set(MeanShiftCanopyConfigKeys.T1_KEY, String.valueOf(t1));
    conf.set(MeanShiftCanopyConfigKeys.T2_KEY, String.valueOf(t2));
    conf.set(MeanShiftCanopyConfigKeys.CONTROL_PATH_KEY, control);
    
    client.setConf(conf);
    try {
      JobClient.runJob(conf);
    } catch (IOException e) {
      log.warn(e.toString(), e);
    }

View Full Code Here

   * @param output
   *          the output pathname String
   */
  public static void createCanopyFromVectors(String input, String output) {
    
    Configurable client = new JobClient();
    JobConf conf = new JobConf(MeanShiftCanopyDriver.class);
    
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(MeanShiftCanopy.class);
    
    FileInputFormat.setInputPaths(conf, new Path(input));
    Path outPath = new Path(output);
    FileOutputFormat.setOutputPath(conf, outPath);
    
    conf.setMapperClass(MeanShiftCanopyCreatorMapper.class);
    conf.setNumReduceTasks(0);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    
    client.setConf(conf);
    try {
      JobClient.runJob(conf);
    } catch (IOException e) {
      log.warn(e.toString(), e);
    }

View Full Code Here

/** Create and run the Bayes Feature Reader Step. */
public class BayesFeatureDriver implements BayesJob {


  @Override
  public void runJob(Path input, Path output, BayesParameters params) throws IOException {
    Configurable client = new JobClient();
    JobConf conf = new JobConf(BayesFeatureDriver.class);
    conf.setJobName("Bayes Feature Driver running over input: " + input);
    conf.setOutputKeyClass(StringTuple.class);
    conf.setOutputValueClass(DoubleWritable.class);
    conf.setPartitionerClass(FeaturePartitioner.class);
    conf.setOutputKeyComparatorClass(FeatureLabelComparator.class);
    FileInputFormat.setInputPaths(conf, input);
    FileOutputFormat.setOutputPath(conf, output);
    
    conf.setMapperClass(BayesFeatureMapper.class);
    
    conf.setInputFormat(KeyValueTextInputFormat.class);
    conf.setCombinerClass(BayesFeatureCombiner.class);
    conf.setReducerClass(BayesFeatureReducer.class);
    conf.setOutputFormat(BayesFeatureOutputFormat.class);
    conf.set("io.serializations",
          "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization");
    // this conf parameter needs to be set enable serialisation of conf values


    HadoopUtil.delete(conf, output);
    conf.set("bayes.parameters", params.toString());
    
    client.setConf(conf);
    JobClient.runJob(conf);
    
  }

View Full Code Here

  private static final Logger log = LoggerFactory.getLogger(BayesTfIdfDriver.class);


  @Override
  public void runJob(Path input, Path output, BayesParameters params) throws IOException {
    
    Configurable client = new JobClient();
    JobConf conf = new JobConf(BayesWeightSummerDriver.class);
    conf.setJobName("TfIdf Driver running over input: " + input);
    
    conf.setOutputKeyClass(StringTuple.class);
    conf.setOutputValueClass(DoubleWritable.class);
    
    FileInputFormat.addInputPath(conf, new Path(output, "trainer-termDocCount"));
    FileInputFormat.addInputPath(conf, new Path(output, "trainer-wordFreq"));
    FileInputFormat.addInputPath(conf, new Path(output, "trainer-featureCount"));
    Path outPath = new Path(output, "trainer-tfIdf");
    FileOutputFormat.setOutputPath(conf, outPath);
    
    // conf.setNumMapTasks(100);
    
    conf.setJarByClass(BayesTfIdfDriver.class);
    
    conf.setMapperClass(BayesTfIdfMapper.class);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setCombinerClass(BayesTfIdfReducer.class);
    
    conf.setReducerClass(BayesTfIdfReducer.class);
    
    conf.setOutputFormat(BayesTfIdfOutputFormat.class);
    
    conf.set("io.serializations",
             "org.apache.hadoop.io.serializer.JavaSerialization,"
                 + "org.apache.hadoop.io.serializer.WritableSerialization");
    // Dont ever forget this. People should keep track of how hadoop conf
    // parameters and make or break a piece of code
    HadoopUtil.delete(conf, outPath);
    Path interimFile = new Path(output, "trainer-docCount/part-*");
    
    Map<String,Double> labelDocumentCounts = SequenceFileModelReader.readLabelDocumentCounts(interimFile, conf);
    
    DefaultStringifier<Map<String,Double>> mapStringifier = new DefaultStringifier<Map<String,Double>>(conf,
        GenericsUtil.getClass(labelDocumentCounts));
    
    String labelDocumentCountString = mapStringifier.toString(labelDocumentCounts);
    log.info("Counts of documents in Each Label");
    Map<String,Double> c = mapStringifier.fromString(labelDocumentCountString);
    log.info("{}", c);
    
    conf.set("cnaivebayes.labelDocumentCounts", labelDocumentCountString);
    log.info(params.print());
    conf.set("bayes.parameters", params.toString());
    
    client.setConf(conf);
    
    JobClient.runJob(conf);
  }

View Full Code Here

0 1 2 3 4

TOP

Related Classes of org.apache.hadoop.conf.Configurable

com.asakusafw.runtime.directio.hadoop.SequenceFileFormat

com.fb2pdf.hadoop.cluster.MeanShiftCanopyDriver

org.apache.blur.manager.indexserver.DistributedIndexServer

org.apache.blur.mapreduce.BlurReducer

org.apache.blur.utils.BlurUtil

org.apache.hadoop.hive.thrift.HadoopThriftAuthBridge23

org.apache.hadoop.thriftfs.HadoopThriftAuthBridge$Server

org.apache.mahout.classifier.bayes.mapreduce.bayes.BayesClassifierDriver

org.apache.mahout.classifier.bayes.mapreduce.bayes.BayesThetaNormalizerDriver

org.apache.mahout.classifier.bayes.mapreduce.cbayes.CBayesThetaNormalizerDriver

All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.