Package org.nebulaframework.benchmark.scimark2

Examples of org.nebulaframework.benchmark.scimark2.commandline


   
    try {
     
      Parser parser = new Parser();
      parser.setGroup(group);
      CommandLine cmdLine = parser.parse(args);
     
      if (cmdLine.hasOption(helpOpt)) {
        CommandLineUtil.printHelp(group);
        return false;
      }
     
      inputDirectory = new Path((String) cmdLine.getValue(inputDirOpt));
      trainingOutputDirectory = new Path((String) cmdLine.getValue(trainingOutputDirOpt));
      testOutputDirectory = new Path((String) cmdLine.getValue(testOutputDirOpt));
    
      charset = Charset.forName((String) cmdLine.getValue(charsetOpt));

      if (cmdLine.hasOption(testSplitSizeOpt) && cmdLine.hasOption(testSplitPctOpt)) {
        throw new OptionException(testSplitSizeOpt, "must have either split size or split percentage option, not BOTH");
      } else if (!cmdLine.hasOption(testSplitSizeOpt) && !cmdLine.hasOption(testSplitPctOpt)) {
        throw new OptionException(testSplitSizeOpt, "must have either split size or split percentage option");
      }

      if (cmdLine.hasOption(testSplitSizeOpt)) {
        setTestSplitSize(Integer.parseInt((String) cmdLine.getValue(testSplitSizeOpt)));
      }
     
      if (cmdLine.hasOption(testSplitPctOpt)) {
        setTestSplitPct(Integer.parseInt((String) cmdLine.getValue(testSplitPctOpt)));
      }
     
      if (cmdLine.hasOption(splitLocationOpt)) {
        setSplitLocation(Integer.parseInt((String) cmdLine.getValue(splitLocationOpt)));
      }
     
      if (cmdLine.hasOption(randomSelectionSizeOpt)) {
        setTestRandomSelectionSize(Integer.parseInt((String) cmdLine.getValue(randomSelectionSizeOpt)));
      }
     
      if (cmdLine.hasOption(randomSelectionPctOpt)) {
        setTestRandomSelectionPct(Integer.parseInt((String) cmdLine.getValue(randomSelectionPctOpt)));
      }

      fs.mkdirs(trainingOutputDirectory);
      fs.mkdirs(testOutputDirectory);
    
View Full Code Here


  String getHadoopAliasConfFile() {
    return new File(getHadoopClientHome() + "/conf", hadoopAliasConf_).getAbsolutePath();
  }

  /**
   * Parses {@code argv_} with {@code parser} and copies the recognized option values
   * into this object's fields.
   *
   * <p>NOTE(review): only the beginning of this method is visible in this excerpt; the
   * comments below describe the visible part only.
   */
  void parseArgv(){
    CommandLine cmdLine = null;
    try{
      cmdLine = parser.parse(argv_);
    }catch(Exception oe){
      // Parsing failed: log the message only, then print usage via exitUsage.
      LOG.error(oe.getMessage());
      // Equivalent to exitUsage(detailedUsage_).
      if (detailedUsage_) {
        exitUsage(true);
      } else {
        exitUsage(false);
      }
    }
   
    // cmdLine stays null when parsing failed and exitUsage returned.
    if (cmdLine != null){
      verbose_ =  cmdLine.hasOption("-verbose");
      detailedUsage_ = cmdLine.hasOption("-info");
      // -debug bumps the current debug level by one.
      debug_ = cmdLine.hasOption("-debug")? debug_ + 1 : debug_;
     
      inputSpecs_.addAll(cmdLine.getValues("-input"));
      output_ = (String) cmdLine.getValue("-output");
     
      mapCmd_ = (String)cmdLine.getValue("-mapper");
      comCmd_ = (String)cmdLine.getValue("-combiner");
      redCmd_ = (String)cmdLine.getValue("-reducer");
     
      packageFiles_.addAll(cmdLine.getValues("-file"));
     
      cluster_ = (String)cmdLine.getValue("-cluster");
     
      configPath_.addAll(cmdLine.getValues("-config"));
     
      String fsName = (String)cmdLine.getValue("-dfs");
      if (null != fsName){
        userJobConfProps_.put("fs.default.name", fsName);       
      }
     
      // NOTE(review): the value looked up as "mapred.job.tracker" is stored under
      // "fs.default.name", the same key used for -dfs above, so it replaces that value.
      // This looks like a copy/paste slip (expected key: "mapred.job.tracker") - confirm.
      // Also note the lookup name has no leading "-" unlike the other options here.
      String jt = (String)cmdLine.getValue("mapred.job.tracker");
      if (null != jt){
        userJobConfProps_.put("fs.default.name", jt);       
      }
     
      additionalConfSpec_ = (String)cmdLine.getValue("-additionalconfspec");
      inputFormatSpec_ = (String)cmdLine.getValue("-inputformat");
      outputFormatSpec_ = (String)cmdLine.getValue("-outputformat");
      numReduceTasksSpec_ = (String)cmdLine.getValue("-numReduceTasks");
      partitionerSpec_ = (String)cmdLine.getValue("-partitioner");
      inReaderSpec_ = (String)cmdLine.getValue("-inputreader");
      mapDebugSpec_ = (String)cmdLine.getValue("-mapdebug");   
      reduceDebugSpec_ = (String)cmdLine.getValue("-reducedebug");
     
      // Append every -cacheArchive value to cacheArchives as a comma-separated list.
      List<String> car = cmdLine.getValues("-cacheArchive");
      if (null != car){
        for(String s : car){
          cacheArchives = (cacheArchives == null)?s :cacheArchives + "," + s; 
        }
      }

      // Same comma-separated accumulation for -cacheFile values.
      List<String> caf = cmdLine.getValues("-cacheFile");
      if (null != caf){
        for(String s : caf){
          cacheFiles = (cacheFiles == null)?s :cacheFiles + "," + s; 
        }
      }
     
      // Unchecked casts: getValue's result is cast to List<String> for both options.
      List<String> jobConfArgs = (List<String>)cmdLine.getValue(jobconf);
      List<String> envArgs = (List<String>)cmdLine.getValue(cmdenv);
     
      if (null != jobConfArgs){
        for(String s : jobConfArgs){
          // Split on the first "=" only, so the value part may itself contain "=".
          // NOTE(review): an entry without "=" yields a one-element array, and parts[1]
          // below would throw ArrayIndexOutOfBoundsException - confirm entries are validated.
          String []parts = s.split("=", 2);
          userJobConfProps_.put(parts[0], parts[1]);
View Full Code Here

    try {
      Parser parser = new Parser();
      parser.setGroup(group);
      parser.setHelpOption(helpOpt);
      CommandLine cmdLine = parser.parse(args);
      if (cmdLine.hasOption(helpOpt)) {
        CommandLineUtil.printHelp(group);
        return;
      }
      File input = new File((String) cmdLine.getValue(inputOpt));
      String outputDir = (String) cmdLine.getValue(outputDirOpt);

      int chunkSize = 64;
      if (cmdLine.hasOption(chunkSizeOpt)) {
        chunkSize = Integer.parseInt((String) cmdLine.getValue(chunkSizeOpt));
      }

      String prefix = "";
      if (cmdLine.hasOption(keyPrefixOpt)) {
        prefix = (String) cmdLine.getValue(keyPrefixOpt);
      }

      Charset charset = Charset.forName((String) cmdLine.getValue(charsetOpt));
      SequenceFilesFromMailArchives dir = new SequenceFilesFromMailArchives();
      MailOptions options = new MailOptions();
      options.setInput(input);
      options.setOutputDir(outputDir);
      options.setPrefix(prefix);
      options.setChunkSize(chunkSize);
      options.setCharset(charset);


      List<Pattern> patterns = new ArrayList<Pattern>(5);
      // patternOrder is used downstream so that we can know what order the text is in instead
      // of encoding it in the string, which
      // would require more processing later to remove it pre feature selection.
      Map<String, Integer> patternOrder = new HashMap<String, Integer>();
      int order = 0;
      if (cmdLine.hasOption(fromOpt)) {
        patterns.add(MailProcessor.FROM_PREFIX);
        patternOrder.put(MailOptions.FROM, order++);
      }
      if (cmdLine.hasOption(toOpt)) {
        patterns.add(MailProcessor.TO_PREFIX);
        patternOrder.put(MailOptions.TO, order++);
      }
      if (cmdLine.hasOption(refsOpt)) {
        patterns.add(MailProcessor.REFS_PREFIX);
        patternOrder.put(MailOptions.REFS, order++);
      }
      if (cmdLine.hasOption(subjectOpt)) {
        patterns.add(MailProcessor.SUBJECT_PREFIX);
        patternOrder.put(MailOptions.SUBJECT, order++);
      }
      options.setStripQuotedText(cmdLine.hasOption(quotedOpt));

      options.setPatternsToMatch(patterns.toArray(new Pattern[patterns.size()]));
      options.setPatternOrder(patternOrder);
      options.setIncludeBody(cmdLine.hasOption(bodyOpt));
      options.setSeparator("\n");
      if (cmdLine.hasOption(separatorOpt)) {
        options.setSeparator(cmdLine.getValue(separatorOpt).toString());
      }
      if (cmdLine.hasOption(bodySeparatorOpt)) {
        options.setBodySeparator(cmdLine.getValue(bodySeparatorOpt).toString());
      }
      if (cmdLine.hasOption(quotedRegexOpt)){
        options.setQuotedTextPattern(Pattern.compile(cmdLine.getValue(quotedRegexOpt).toString()));
      }
      long start = System.currentTimeMillis();
      dir.createSequenceFiles(options);
      long finish = System.currentTimeMillis();
      log.info("Conversion took {}ms", finish - start);
View Full Code Here

      vectorOpt).withOption(helpOpt).create();

    try {
      Parser parser = new Parser();
      parser.setGroup(group);
      CommandLine cmdLine = parser.parse(args);
      if (cmdLine.hasOption(helpOpt)) {
        CommandLineUtil.printHelp(group);
        return;
      }
     
      Path input = new Path(cmdLine.getValue(inputOpt, "testdata").toString());
      Path output = new Path(cmdLine.getValue(outputOpt, "output").toString());
      String vectorClassName = cmdLine.getValue(vectorOpt,
         "org.apache.mahout.math.RandomAccessSparseVector").toString();
      runJob(input, output, vectorClassName);
    } catch (OptionException e) {
      log.error("Exception parsing command line: ", e);
      CommandLineUtil.printHelp(group);
View Full Code Here

        .create();
    try {
      Parser parser = new Parser();
      parser.setGroup(group);
      parser.setHelpOption(helpOpt);
      CommandLine cmdLine = parser.parse(args);
     
      if (cmdLine.hasOption(helpOpt)) {
        CommandLineUtil.printHelp(group);
        return -1;
      }
     
      Path inputDir = new Path((String) cmdLine.getValue(inputDirOpt));
      Path outputDir = new Path((String) cmdLine.getValue(outputDirOpt));
     
      int chunkSize = 100;
      if (cmdLine.hasOption(chunkSizeOpt)) {
        chunkSize = Integer.parseInt((String) cmdLine.getValue(chunkSizeOpt));
      }
      int minSupport = 2;
      if (cmdLine.hasOption(minSupportOpt)) {
        String minSupportString = (String) cmdLine.getValue(minSupportOpt);
        minSupport = Integer.parseInt(minSupportString);
      }
     
      int maxNGramSize = 1;
     
      if (cmdLine.hasOption(maxNGramSizeOpt)) {
        try {
          maxNGramSize = Integer.parseInt(cmdLine.getValue(maxNGramSizeOpt).toString());
        } catch (NumberFormatException ex) {
          log.warn("Could not parse ngram size option");
        }
      }
      log.info("Maximum n-gram size is: {}", maxNGramSize);
     
      if (cmdLine.hasOption(overwriteOutput)) {
        HadoopUtil.delete(getConf(), outputDir);
      }
     
      float minLLRValue = LLRReducer.DEFAULT_MIN_LLR;
      if (cmdLine.hasOption(minLLROpt)) {
        minLLRValue = Float.parseFloat(cmdLine.getValue(minLLROpt).toString());
      }
      log.info("Minimum LLR value: {}", minLLRValue);
     
      int reduceTasks = 1;
      if (cmdLine.hasOption(numReduceTasksOpt)) {
        reduceTasks = Integer.parseInt(cmdLine.getValue(numReduceTasksOpt).toString());
      }
      log.info("Number of reduce tasks: {}", reduceTasks);

      Class<? extends Analyzer> analyzerClass = DefaultAnalyzer.class;
      if (cmdLine.hasOption(analyzerNameOpt)) {
        String className = cmdLine.getValue(analyzerNameOpt).toString();
        analyzerClass = Class.forName(className).asSubclass(Analyzer.class);
        // try instantiating it, b/c there isn't any point in setting it if
        // you can't instantiate it
        ClassUtils.instantiateAs(analyzerClass, Analyzer.class);
      }
     
      boolean processIdf;
     
      if (cmdLine.hasOption(weightOpt)) {
        String wString = cmdLine.getValue(weightOpt).toString();
        if ("tf".equalsIgnoreCase(wString)) {
          processIdf = false;
        } else if ("tfidf".equalsIgnoreCase(wString)) {
          processIdf = true;
        } else {
          throw new OptionException(weightOpt);
        }
      } else {
        processIdf = true;
      }
     
      int minDf = 1;
      if (cmdLine.hasOption(minDFOpt)) {
        minDf = Integer.parseInt(cmdLine.getValue(minDFOpt).toString());
      }
      int maxDFPercent = 99;
      if (cmdLine.hasOption(maxDFPercentOpt)) {
        maxDFPercent = Integer.parseInt(cmdLine.getValue(maxDFPercentOpt).toString());
      }
      double maxDFSigma = -1.0;
      if (cmdLine.hasOption(maxDFSigmaOpt)) {
        maxDFSigma = Double.parseDouble(cmdLine.getValue(maxDFSigmaOpt).toString());
      }
     
      float norm = PartialVectorMerger.NO_NORMALIZING;
      if (cmdLine.hasOption(powerOpt)) {
        String power = cmdLine.getValue(powerOpt).toString();
        if ("INF".equals(power)) {
          norm = Float.POSITIVE_INFINITY;
        } else {
          norm = Float.parseFloat(power);
        }
      }
     
      boolean logNormalize = false;
      if (cmdLine.hasOption(logNormalizeOpt)) {
        logNormalize = true;
      }

      Configuration conf = getConf();
      Path tokenizedPath = new Path(outputDir, DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER);
      //TODO: move this into DictionaryVectorizer , and then fold SparseVectorsFrom with EncodedVectorsFrom to have one framework for all of this.
      DocumentProcessor.tokenizeDocuments(inputDir, analyzerClass, tokenizedPath, conf);

      boolean sequentialAccessOutput = false;
      if (cmdLine.hasOption(sequentialAccessVectorOpt)) {
        sequentialAccessOutput = true;
      }

      boolean namedVectors = false;
      if (cmdLine.hasOption(namedVectorOpt)) {
        namedVectors = true;
      }
      boolean shouldPrune = maxDFSigma >=0.0;
      String tfDirName = shouldPrune ? DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER+"-toprune" : DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER;
View Full Code Here

      gBuilder = gBuilder.withOption(opt);
    }

    group = gBuilder.create();

    CommandLine cmdLine;
    try {
      Parser parser = new Parser();
      parser.setGroup(group);
      parser.setHelpOption(helpOpt);
      cmdLine = parser.parse(args);

    } catch (OptionException e) {
      log.error(e.getMessage());
      CommandLineUtil.printHelpWithGenericOptions(group, e);
      return null;
    }

    if (cmdLine.hasOption(helpOpt)) {
      CommandLineUtil.printHelpWithGenericOptions(group);
      return null;
    }

    try {
View Full Code Here

        .withOption(outputOpt).withOption(helpOpt).create();
   
    try {
      Parser parser = new Parser();
      parser.setGroup(group);
      CommandLine cmdLine = parser.parse(args);
     
      if (cmdLine.hasOption("help")) {
        CommandLineUtil.printHelp(group);
        return -1;
      }
     
      isPartial = cmdLine.hasOption(partialOpt);
      String dataName = cmdLine.getValue(dataOpt).toString();
      String datasetName = cmdLine.getValue(datasetOpt).toString();
      String outputName = cmdLine.getValue(outputOpt).toString();
      nbTrees = Integer.parseInt(cmdLine.getValue(nbtreesOpt).toString());
     
      if (cmdLine.hasOption(selectionOpt)) {
        m = Integer.parseInt(cmdLine.getValue(selectionOpt).toString());
      }
      complemented = !cmdLine.hasOption(noCompleteOpt);
      if (cmdLine.hasOption(minSplitOpt)) {
        minSplitNum = Integer.parseInt(cmdLine.getValue(minSplitOpt).toString());
      }
      if (cmdLine.hasOption(minPropOpt)) {
        minVarianceProportion = Double.parseDouble(cmdLine.getValue(minPropOpt).toString());
      }
      if (cmdLine.hasOption(seedOpt)) {
        seed = Long.valueOf(cmdLine.getValue(seedOpt).toString());
      }

      if (log.isDebugEnabled()) {
        log.debug("data : {}", dataName);
        log.debug("dataset : {}", datasetName);
View Full Code Here

                          .withOption(sizeOpt).withOption(numIndexesPerVectorOpt).create();

    try {
      Parser parser = new Parser();
      parser.setGroup(group);
      CommandLine cmdLine = parser.parse(args);

      if (cmdLine.hasOption(helpOpt)) {
        CommandLineUtil.printHelpWithGenericOptions(group);
        return;
      }

      if (cmdLine.hasOption(seqOpt)) {
        Configuration conf = new Configuration();
        Path pathPattern = new Path(cmdLine.getValue(seqOpt).toString());
        FileSystem fs = FileSystem.get(conf);
        FileStatus[] inputPaths = fs.globStatus(pathPattern);

        String dictionaryType = "text";
        if (cmdLine.hasOption(dictTypeOpt)) {
          dictionaryType = cmdLine.getValue(dictTypeOpt).toString();
        }

        boolean sortVectors = cmdLine.hasOption(sortVectorsOpt);
        log.info("Sort? " + sortVectors);

        String[] dictionary = null;
        if (cmdLine.hasOption(dictOpt)) {
          if ("text".equals(dictionaryType)) {
            dictionary = VectorHelper.loadTermDictionary(new File(cmdLine.getValue(dictOpt).toString()));
          } else if ("sequencefile".equals(dictionaryType)) {
            dictionary = VectorHelper.loadTermDictionary(conf, cmdLine.getValue(dictOpt).toString());
          } else {
            throw new OptionException(dictTypeOpt);
          }
        }

        Set<String> filters;
        if (cmdLine.hasOption(filtersOpt)) {
          filters = new HashSet<String>(cmdLine.getValues(filtersOpt));
        } else {
          filters = null;
        }
        boolean useCSV = cmdLine.hasOption(csvOpt);

        boolean sizeOnly = cmdLine.hasOption(sizeOpt);
        boolean namesAsComments = cmdLine.hasOption(namesAsCommentsOpt);
        boolean transposeKeyValue = cmdLine.hasOption(vectorAsKeyOpt);
        Writer writer;
        boolean shouldClose;
        if (cmdLine.hasOption(outputOpt)) {
          shouldClose = true;
          writer = Files.newWriter(new File(cmdLine.getValue(outputOpt).toString()), Charsets.UTF_8);
        } else {
          shouldClose = false;
          writer = new OutputStreamWriter(System.out);
        }
        try {
          boolean printKey = cmdLine.hasOption(printKeyOpt);
          if (useCSV && dictionary != null) {
            writer.write("#");
            for (int j = 0; j < dictionary.length; j++) {
              writer.write(dictionary[j]);
              if (j < dictionary.length - 1) {
                writer.write(',');
              }
            }
            writer.write('\n');
          }
          Long numItems = null;
          if (cmdLine.hasOption(numItemsOpt)) {
            numItems = Long.parseLong(cmdLine.getValue(numItemsOpt).toString());
            writer.append("#Max Items to dump: ").append(String.valueOf(numItems)).append('\n');
          }
          int maxIndexesPerVector = cmdLine.hasOption(numIndexesPerVectorOpt)
              ? Integer.parseInt(cmdLine.getValue(numIndexesPerVectorOpt).toString())
              : Integer.MAX_VALUE;
          long itemCount = 0;
          int fileCount = 0;
          for (FileStatus stat : inputPaths) {
            if (numItems != null && numItems <= itemCount) {
View Full Code Here

    Parser parser = new Parser();
    parser.setGroup(group);

    try {
      CommandLine cmdLine = parser.parse(args);

      if (cmdLine.hasOption(helpOpt)) {
        CommandLineUtil.printHelp(group);
        return;
      }

      String dataset = cmdLine.getValue(inputOpt).toString();
      int target = Integer.parseInt(cmdLine.getValue(labelOpt).toString());
      double threshold =
          cmdLine.hasOption(thresholdOpt) ? Double.parseDouble(cmdLine.getValue(thresholdOpt).toString()) : 0.5;
      int crosspnts =
          cmdLine.hasOption(crosspntsOpt) ? Integer.parseInt(cmdLine.getValue(crosspntsOpt).toString()) : 1;
      double mutrate = Double.parseDouble(cmdLine.getValue(mutrateOpt).toString());
      double mutrange =
          cmdLine.hasOption(mutrangeOpt) ? Double.parseDouble(cmdLine.getValue(mutrangeOpt).toString()) : 0.1;
      int mutprec = cmdLine.hasOption(mutprecOpt) ? Integer.parseInt(cmdLine.getValue(mutprecOpt).toString()) : 2;
      int popSize = Integer.parseInt(cmdLine.getValue(popsizeOpt).toString());
      int genCount = Integer.parseInt(cmdLine.getValue(gencntOpt).toString());

      long start = System.currentTimeMillis();

      runJob(dataset, target, threshold, crosspnts, mutrate, mutrange, mutprec, popSize, genCount);
View Full Code Here

    try {
      Parser parser = new Parser();

      parser.setGroup(group);
      parser.setHelpOption(helpOpt);
      CommandLine cmdLine = parser.parse(args);
      if (cmdLine.hasOption(helpOpt)) {
        CommandLineUtil.printHelp(group);
        return -1;
      }

      String inputDirString = (String) cmdLine.getValue(inputDirOpt);
      String dictDirString = cmdLine.hasOption(dictOpt) ? (String)cmdLine.getValue(dictOpt) : null;
      int numTopics = Integer.parseInt((String) cmdLine.getValue(numTopicsOpt));
      double alpha = Double.parseDouble((String)cmdLine.getValue(alphaOpt));
      double eta = Double.parseDouble((String)cmdLine.getValue(etaOpt));
      int maxIterations = Integer.parseInt((String)cmdLine.getValue(maxIterOpt));
      int burnInIterations = (Integer)cmdLine.getValue(burnInOpt);
      double minFractionalErrorChange = Double.parseDouble((String) cmdLine.getValue(convergenceOpt));
      int numTrainThreads = Integer.parseInt((String)cmdLine.getValue(numTrainThreadsOpt));
      int numUpdateThreads = Integer.parseInt((String)cmdLine.getValue(numUpdateThreadsOpt));
      String topicOutFile = (String)cmdLine.getValue(outputTopicFileOpt);
      String docOutFile = (String)cmdLine.getValue(outputDocFileOpt);
      String reInferDocTopics = (String)cmdLine.getValue(reInferDocTopicsOpt);
      boolean verbose = Boolean.parseBoolean((String) cmdLine.getValue(verboseOpt));
      double modelCorpusFraction = (Double) cmdLine.getValue(modelCorpusFractionOption);

      long start = System.nanoTime();

      if(conf.get("fs.default.name") == null) {
        String dfsNameNode = (String)cmdLine.getValue(dfsOpt);
        conf.set("fs.default.name", dfsNameNode);
      }
      String[] terms = loadDictionary(dictDirString, conf);
      logTime("dictionary loading", System.nanoTime() - start);
      start = System.nanoTime();
View Full Code Here

TOP

Related Classes of org.nebulaframework.benchmark.scimark2.commandline

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.