Package org.apache.mahout.math.stats

Examples of org.apache.mahout.math.stats.OnlineSummarizer


  // TODO: the speedup constant should be checked and oddly, the times don't increase as the counts increase
  @Ignore
  public void fasterThanBefore() {

    OnlineSummarizer s1 = new OnlineSummarizer();
    OnlineSummarizer s2 = new OnlineSummarizer();

    Matrix a = new DenseMatrix(60, 60).assign(Functions.random());

    decompositionSpeedCheck(new Decomposer() {
      @Override
      public QR decompose(Matrix a) {
        return new QRDecomposition(a);
      }
    }, s1, a, "new");

    decompositionSpeedCheck(new Decomposer() {
      @Override
      public QR decompose(Matrix a) {
        return new OldQRDecomposition(a);
      }
    }, s2, a, "old");

    // should be much more than twice as fast.
    System.out.printf("Speedup is about %.1f times\n", s2.getMedian() / s1.getMedian());
    assertTrue(s1.getMedian() < 0.5 * s2.getMedian());
  }
View Full Code Here


      encoder[i] = new ConstantValueEncoder("v" + 1);
    }

    OnlineSummarizer[] s = new OnlineSummarizer[FIELDS];
    for (int i = 0; i < FIELDS; i++) {
      s[i] = new OnlineSummarizer();
    }
    long t0 = System.currentTimeMillis();
    Vector v = new DenseVector(1000);
    if ("--generate".equals(args[0])) {
      PrintWriter out = new PrintWriter(new File(args[2]));
View Full Code Here

   * A quick and dirty hack to compute the median of a vector...
   * @param v
   * @return
   */
  private static double median(Vector v) {
    OnlineSummarizer med = new OnlineSummarizer();
    if (v.size() < 100) {
      return v.zSum() / v.size();
    }
    for (Vector.Element e : v) {
      med.add(e.get());
    }
    return med.getMedian();
  }
View Full Code Here

    // full distance computations as possible.  Our goal is to only do full distance computations for
    // vectors with hash distance at most as large as the searchSize biggest hash distance seen so far.

    OnlineSummarizer[] distribution = new OnlineSummarizer[BITS + 1];
    for (int i = 0; i < BITS + 1; i++) {
      distribution[i] = new OnlineSummarizer();
    }

    distanceEvaluations = 0;
   
    // We keep the counts of the hash distances here.  This lets us accurately
View Full Code Here

    List<OnlineSummarizer> summarizers = Lists.newArrayList();
    if (searcher.size() == 0) {
      return summarizers;
    }
    for (int i = 0; i < searcher.size(); ++i) {
      summarizers.add(new OnlineSummarizer());
    }
    for (Vector v : datapoints) {
      Centroid closest = (Centroid)searcher.search(v,  1).get(0).getValue();
      OnlineSummarizer summarizer = summarizers.get(closest.getIndex());
      summarizer.add(distanceMeasure.distance(v, closest));
    }
    return summarizers;
  }
View Full Code Here

  }

  public static void printSummaries(List<OnlineSummarizer> summarizers, String type, PrintWriter fileOut) {
    double maxDistance = 0;
    for (int i = 0; i < summarizers.size(); ++i) {
      OnlineSummarizer summarizer = summarizers.get(i);
      if (summarizer.getCount() > 1) {
        maxDistance = Math.max(maxDistance, summarizer.getMax());
        System.out.printf("Average distance in cluster %d [%d]: %f\n", i, summarizer.getCount(), summarizer.getMean());
        // If there is just one point in the cluster, quartiles cannot be estimated. We'll just assume all the quartiles
        // equal the only value.
        if (fileOut != null) {
          fileOut.printf("%d,%f,%f,%f,%f,%f,%f,%f,%d,%s\n", i, summarizer.getMean(),
              summarizer.getSD(),
              summarizer.getQuartile(0),
              summarizer.getQuartile(1),
              summarizer.getQuartile(2),
              summarizer.getQuartile(3),
              summarizer.getQuartile(4), summarizer.getCount(), type);
        }
      } else {
        System.out.printf("Cluster %d is has %d data point. Need atleast 2 data points in a cluster for" +
            " OnlineSummarizer.\n", i, summarizer.getCount());
      }
    }
    System.out.printf("Num clusters: %d; maxDistance: %f\n", summarizers.size(), maxDistance);
  }
View Full Code Here

    assertEquals("Total weight not preserved", totalWeight(syntheticData.getFirst()), totalWeight(clusterer), 1.0e-9);

    // Verify that each corner of the cube has a centroid very nearby.
    // This is probably FALSE for large-dimensional spaces!
    OnlineSummarizer summarizer = new OnlineSummarizer();
    for (Vector mean : syntheticData.getSecond()) {
      WeightedThing<Vector> v = searcher.search(mean, 1).get(0);
      summarizer.add(v.getWeight());
    }
    assertTrue(String.format("Median weight [%f] too large [>%f]", summarizer.getMedian(),
        DISTRIBUTION_RADIUS), summarizer.getMedian() < DISTRIBUTION_RADIUS);

    double clusterTime = (endTime - startTime) / 1000.0;
    System.out.printf("%s\n%.2f for clustering\n%.1f us per row\n\n",
        searcher.getClass().getName(), clusterTime,
        clusterTime / syntheticData.getFirst().size() * 1.0e6);
 
View Full Code Here

 
  private int incorrectlyClassified;
 
  public ResultAnalyzer(Collection<String> labelSet, String defaultLabel) {
    confusionMatrix = new ConfusionMatrix(labelSet, defaultLabel);
    summarizer = new OnlineSummarizer();
  }
View Full Code Here

   * A quick and dirty hack to compute the median of a vector...
   * @param v
   * @return
   */
  private static double median(Vector v) {
    OnlineSummarizer med = new OnlineSummarizer();
    if (v.size() < 100) {
      return v.zSum() / v.size();
    }
    for (Vector.Element e : v) {
      med.add(e.get());
    }
    return med.getMedian();
  }
View Full Code Here

     
      if (lmp.getTargetCategories().size() <=2 ) {
        collector = new Auc();
      }
     
      OnlineSummarizer slh = new OnlineSummarizer();
      ConfusionMatrix cm = new ConfusionMatrix(lmp.getTargetCategories(), defaultCategory);

      State<Wrapper, CrossFoldLearner> best = lr.getBest();
      if (best == null) {
        output.printf("%s\n",
            "AdaptiveLogisticRegression has not be trained probably.");
        return;
      }
      CrossFoldLearner learner = best.getPayload().getLearner();

      BufferedReader in = TrainLogistic.open(inputFile);
      String line = in.readLine();
      csv.firstLine(line);
      line = in.readLine();
      if (showScores) {
        output.printf(Locale.ENGLISH, "\"%s\", \"%s\", \"%s\", \"%s\"\n",
            "target", "model-output", "log-likelihood", "average-likelihood");
      }
      while (line != null) {
        Vector v = new SequentialAccessSparseVector(lmp.getNumFeatures());
        //TODO: How to avoid extra target values not shown in the training process.
        int target = csv.processLine(line, v);
        double likelihood = learner.logLikelihood(target, v);
        double score = learner.classifyFull(v).maxValue();
       
        slh.add(likelihood);
        cm.addInstance(csv.getTargetString(line), csv.getTargetLabel(target));       
       
        if (showScores) {
          output.printf(Locale.ENGLISH, "%8d, %.12f, %.13f, %.13f\n", target,
              score, learner.logLikelihood(target, v), slh.getMean());
        }
        if (collector != null) {
          collector.add(target, score);
        }
        line = in.readLine();
      }
     
      output.printf(Locale.ENGLISH,"\nLog-likelihood:");
      output.printf(Locale.ENGLISH, "Min=%.2f, Max=%.2f, Mean=%.2f, Median=%.2f\n",
          slh.getMin(), slh.getMax(), slh.getMean(), slh.getMedian());

      if (collector != null) {       
        output.printf(Locale.ENGLISH, "\nAUC = %.2f\n", collector.auc());       
      }
     
View Full Code Here

TOP

Related Classes of org.apache.mahout.math.stats.OnlineSummarizer

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.