Package org.apache.mahout.math.stats

Source Code of org.apache.mahout.math.stats.OnlineSummarizerTest

/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.mahout.math.stats;

import org.apache.mahout.common.RandomUtils;
import org.apache.mahout.math.MahoutTestCase;
import org.apache.mahout.math.jet.random.AbstractContinousDistribution;
import org.apache.mahout.math.jet.random.Gamma;
import org.junit.Test;

import java.util.Arrays;
import java.util.Random;

public final class OnlineSummarizerTest extends MahoutTestCase {

  @Test
  public void testStats() {
   /**
     the reference limits here were derived using a numerical simulation where I took
     10,000 samples from the distribution in question and computed the stats from that
     sample to get min, 25%-ile, median and so on. I did this 1000 times to get 5% and
     95% confidence limits for those values.
   */

    //symmetrical, well behaved
    System.out.printf("normal\n");
    check(normal(10000));

    //asymmetrical, well behaved. The range for the maximum was fudged slightly to all this to pass.
    System.out.printf("exp\n");
    check(exp(10000));

    //asymmetrical, wacko distribution where mean/median is about 200
    System.out.printf("gamma\n");
    check(gamma(10000, 0.1));
  }

  private static void check(double[] samples) {
    OnlineSummarizer s = new OnlineSummarizer();
    double mean = 0;
    double sd = 0;
    int n = 1;
    for (double x : samples) {
      s.add(x);
      double old = mean;
      mean += (x - mean) / n;
      sd += (x - old) * (x - mean);
      n++;
    }
    sd = Math.sqrt(sd / samples.length);

    Arrays.sort(samples);

    for (int i = 0; i < 5; i++) {
      int index = Math.abs(Arrays.binarySearch(samples, s.getQuartile(i)));
      assertEquals("quartile " + i, i * (samples.length - 1) / 4.0, index, 10);
    }
    assertEquals(s.getQuartile(2), s.getMedian(), 0);

    assertEquals("mean", s.getMean(), mean, 0);
    assertEquals("sd", s.getSD(), sd, 1e-8);
  }

  private static double[] normal(int n) {
    double[] r = new double[n];
    Random gen = RandomUtils.getRandom(1L);
    for (int i = 0; i < n; i++) {
      r[i] = gen.nextGaussian();
    }
    return r;
  }

  private static double[] exp(int n) {
    double[] r = new double[n];
    Random gen = RandomUtils.getRandom(1L);
    for (int i = 0; i < n; i++) {
      r[i] = -Math.log1p(-gen.nextDouble());
    }
    return r;
  }

  private static double[] gamma(int n, double shape) {
    double[] r = new double[n];
    Random gen = RandomUtils.getRandom();
    AbstractContinousDistribution gamma = new Gamma(shape, shape, gen);
    for (int i = 0; i < n; i++) {
      r[i] = gamma.nextDouble();
    }
    return r;
  }
}

TOP

Related Classes of org.apache.mahout.math.stats.OnlineSummarizerTest

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.