Package org.apache.mahout.classifier.df.data

Source Code of org.apache.mahout.classifier.df.data.Utils

/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.mahout.classifier.df.data;

import java.io.BufferedWriter;
import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import java.util.Random;

import com.google.common.base.Charsets;
import com.google.common.io.Closeables;
import com.google.common.io.Files;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.mahout.classifier.df.data.Dataset.Attribute;
import org.apache.mahout.common.MahoutTestCase;

/**
* Helper methods used by the tests
*
*/
public final class Utils {

  private Utils() {}

  /** Used when generating random CATEGORICAL values */
  private static final int CATEGORICAL_RANGE = 100;

  /**
   * Generates a random list of tokens
   * <ul>
   * <li>each attribute has 50% chance to be NUMERICAL ('N') or CATEGORICAL
   * ('C')</li>
   * <li>10% of the attributes are IGNORED ('I')</li>
   * <li>one randomly chosen attribute becomes the LABEL ('L')</li>
   * </ul>
   *
   * @param rng Random number generator
   * @param nbTokens number of tokens to generate
   */
  public static char[] randomTokens(Random rng, int nbTokens) {
    char[] result = new char[nbTokens];

    for (int token = 0; token < nbTokens; token++) {
      double rand = rng.nextDouble();
      if (rand < 0.1) {
        result[token] = 'I'; // IGNORED
      } else if (rand >= 0.5) {
        result[token] = 'C';
      } else {
        result[token] = 'N'; // NUMERICAL
      } // CATEGORICAL
    }

    // choose the label
    result[rng.nextInt(nbTokens)] = 'L';

    return result;
  }

  /**
   * Generates a space-separated String that contains all the tokens
   */
  public static String generateDescriptor(char[] tokens) {
    StringBuilder builder = new StringBuilder();

    for (char token : tokens) {
      builder.append(token).append(' ');
    }

    return builder.toString();
  }

  /**
   * Generates a random descriptor as follows:<br>
   * <ul>
   * <li>each attribute has 50% chance to be NUMERICAL or CATEGORICAL</li>
   * <li>10% of the attributes are IGNORED</li>
   * <li>one randomly chosen attribute becomes the LABEL</li>
   * </ul>
   */
  public static String randomDescriptor(Random rng, int nbAttributes) {
    return generateDescriptor(randomTokens(rng, nbAttributes));
  }

  /**
   * generates random data based on the given descriptor
   *
   * @param rng Random number generator
   * @param descriptor attributes description
   * @param number number of data lines to generate
   */
  public static double[][] randomDoubles(Random rng, CharSequence descriptor, boolean regression, int number)
    throws DescriptorException {
    Attribute[] attrs = DescriptorUtils.parseDescriptor(descriptor);

    double[][] data = new double[number][];

    for (int index = 0; index < number; index++) {
      data[index] = randomVector(rng, attrs, regression);
    }

    return data;
  }

  /**
   * Generates random data
   *
   * @param rng Random number generator
   * @param nbAttributes number of attributes
   * @param regression true is the label should be numerical
   * @param size data size
   */
  public static Data randomData(Random rng, int nbAttributes, boolean regression, int size) throws DescriptorException {
    String descriptor = randomDescriptor(rng, nbAttributes);
    double[][] source = randomDoubles(rng, descriptor, regression, size);
    String[] sData = double2String(source);
    Dataset dataset = DataLoader.generateDataset(descriptor, regression, sData);
   
    return DataLoader.loadData(dataset, sData);
  }

  /**
   * generates a random vector based on the given attributes.<br>
   * the attributes' values are generated as follows :<br>
   * <ul>
   * <li>each IGNORED attribute receives a Double.NaN</li>
   * <li>each NUMERICAL attribute receives a random double</li>
   * <li>each CATEGORICAL and LABEL attribute receives a random integer in the
   * range [0, CATEGORICAL_RANGE[</li>
   * </ul>
   *
   * @param attrs attributes description
   */
  private static double[] randomVector(Random rng, Attribute[] attrs, boolean regression) {
    double[] vector = new double[attrs.length];

    for (int attr = 0; attr < attrs.length; attr++) {
      if (attrs[attr].isIgnored()) {
        vector[attr] = Double.NaN;
      } else if (attrs[attr].isNumerical()) {
        vector[attr] = rng.nextDouble();
      } else if (attrs[attr].isCategorical()) {
        vector[attr] = rng.nextInt(CATEGORICAL_RANGE);
      } else { // LABEL
        if (regression) {
          vector[attr] = rng.nextDouble();
        } else {
          vector[attr] = rng.nextInt(CATEGORICAL_RANGE);
        }
      }
    }

    return vector;
  }

  /**
   * converts a double array to a comma-separated string
   *
   * @param v double array
   * @return comma-separated string
   */
  private static String double2String(double[] v) {
    StringBuilder builder = new StringBuilder();

    for (double aV : v) {
      builder.append(aV).append(',');
    }

    return builder.toString();
  }

  /**
   * converts an array of double arrays to an array of comma-separated strings
   *
   * @param source array of double arrays
   * @return array of comma-separated strings
   */
  public static String[] double2String(double[][] source) {
    String[] output = new String[source.length];

    for (int index = 0; index < source.length; index++) {
      output[index] = double2String(source[index]);
    }

    return output;
  }

  /**
   * Generates random data with same label value
   *
   * @param number data size
   * @param value label value
   */
  public static double[][] randomDoublesWithSameLabel(Random rng,
                                                      CharSequence descriptor,
                                                      boolean regression,
                                                      int number,
                                                      int value) throws DescriptorException {
    int label = findLabel(descriptor);
   
    double[][] source = randomDoubles(rng, descriptor, regression, number);
   
    for (int index = 0; index < number; index++) {
      source[index][label] = value;
    }

    return source;
  }

  /**
   * finds the label attribute's index
   */
  public static int findLabel(CharSequence descriptor) throws DescriptorException {
    Attribute[] attrs = DescriptorUtils.parseDescriptor(descriptor);
    return ArrayUtils.indexOf(attrs, Attribute.LABEL);
  }

  private static void writeDataToFile(String[] sData, Path path) throws IOException {
    BufferedWriter output = null;
    try {
      output = Files.newWriter(new File(path.toString()), Charsets.UTF_8);
      for (String line : sData) {
        output.write(line);
        output.write('\n');
      }
    } finally {
      Closeables.close(output, false);
    }
 
  }

  public static Path writeDataToTestFile(String[] sData) throws IOException {
    Path testData = new Path("testdata/Data");
    MahoutTestCase ca = new MahoutTestCase();
    FileSystem fs = testData.getFileSystem(ca.getConfiguration());
    if (!fs.exists(testData)) {
      fs.mkdirs(testData);
    }
 
    Path path = new Path(testData, "DataLoaderTest.data");
 
    writeDataToFile(sData, path);
 
    return path;
  }

  /**
   * Split the data into numMaps splits
   */
  public static String[][] splitData(String[] sData, int numMaps) {
    int nbInstances = sData.length;
    int partitionSize = nbInstances / numMaps;
 
    String[][] splits = new String[numMaps][];
 
    for (int partition = 0; partition < numMaps; partition++) {
      int from = partition * partitionSize;
      int to = partition == (numMaps - 1) ? nbInstances : (partition + 1) * partitionSize;
 
      splits[partition] = Arrays.copyOfRange(sData, from, to);
    }
 
    return splits;
  }
}
TOP

Related Classes of org.apache.mahout.classifier.df.data.Utils

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.