Package com.heatonresearch.aifh.normalize

Source Code of com.heatonresearch.aifh.normalize.DataSet

/*
 * Artificial Intelligence for Humans
 * Volume 1: Fundamental Algorithms
 * Java Version
 * Code repository:
 *
 * Copyright 2013 by Jeff Heaton
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * For more information on Heaton Research copyrights, licenses
 * and trademarks visit:
 */

package com.heatonresearch.aifh.normalize;

import com.heatonresearch.aifh.AIFHError;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.io.Writer;
import java.text.NumberFormat;
import java.text.ParseException;
import java.util.*;

* Holds a data set.  This is usually loaded from a CSV.  It can also be generated.
public class DataSet {

     * The data loaded from a CSV, or other source.
    private final List<Object[]> data = new ArrayList<Object[]>();
     * The column headers.
    private String[] headers;

     * The number formatter to use for this format.
    private final NumberFormat numberFormatter = NumberFormat.getInstance(Locale.US);

     * Create a blank data set.
     * @param theHeaders The column headers.
    public DataSet(final String[] theHeaders) {
        this.headers = theHeaders;

     * Convert a column to numeric.  Save the new Double object in place of the string.
     * @param obj    The column array.
     * @param column The column to change.
     * @return The numeric value.
    private double convertNumeric(final Object[] obj, final int column) {
        final double x;
        if (obj[column] instanceof Double) {
            x = (Double) obj[column];
        } else {
            try {
                x = this.numberFormatter.parse(obj[column].toString()).doubleValue();
                obj[column] = x;
            } catch (ParseException e) {
                throw new AIFHError(e);

        return x;

     * Load a CSV file from a file.
     * @param filename The filename.
     * @return The data set read.
    public static DataSet load(final File filename) {
        try {
            final FileInputStream fis = new FileInputStream(filename);
            final DataSet ds = load(fis);
            return ds;
        } catch (IOException ex) {
            throw (new AIFHError(ex));

     * Load a CSV from an input stream.
     * @param is The input stream.
     * @return The loaded file.
    public static DataSet load(final InputStream is) {
        final DataSet result;

        try {
            final Reader reader = new InputStreamReader(is);
            final CSVReader csv = new CSVReader(reader);

            final String[] headers = csv.readNext();

            result = new DataSet(headers);

            String[] nextLine;
            while ((nextLine = csv.readNext()) != null) {
                if (nextLine.length <= 1) {
                } else if (nextLine.length != result.getHeaderCount()) {
                    throw new AIFHError("Found a CSV line with "
                            + nextLine.length + " columns, when expecting " + result.getHeaderCount());
                final Object[] obj = new Object[result.getHeaderCount()];
                System.arraycopy(nextLine, 0, obj, 0, nextLine.length);
        } catch (IOException ex) {
            throw (new AIFHError(ex));

        return result;

     * Save the specified data set to a CSV file.
     * @param filename The filename.
     * @param ds       The data set to save.
    public static void save(final File filename, final DataSet ds) {
        try {
            final FileOutputStream fos = new FileOutputStream(filename);
            save(fos, ds);
        } catch (IOException ex) {
            throw (new AIFHError(ex));

     * Save the specified data to an output stream.
     * @param os The output stream.
     * @param ds The data set.
    public static void save(final OutputStream os, final DataSet ds) {
        try {
            final Writer writer = new OutputStreamWriter(os);
            final CSVWriter csv = new CSVWriter(writer);

            final String[] items2 = new String[ds.getHeaderCount()];

            for (final Object[] item : ds.getData()) {
                for (int i = 0; i < ds.getHeaderCount(); i++) {
                    items2[i] = item[i].toString();
        } catch (IOException ex) {
            throw new AIFHError(ex);

     * @return The number of columns (or headers).
    public int getHeaderCount() {
        return headers.length;

     * @return The column headers.
    public String[] getHeaders() {
        return this.headers;

     * Add a row.
     * @param row The row to add.
    public void add(final Object[] row) {;

     * @return The row data.
    public List<Object[]> getData() {

     * Get the maximum numeric value for a column.
     * @param column The column.
     * @return The max numeric value.
    public double getMax(final int column) {
        double result = Double.NEGATIVE_INFINITY;

        for (final Object[] obj : {
            result = Math.max(result, convertNumeric(obj, column));

        return result;

     * Get the minimum numeric value for a column.
     * @param column The column.
     * @return The min numeric value.
    public double getMin(final int column) {
        double result = Double.POSITIVE_INFINITY;

        for (final Object[] obj : {
            result = Math.min(result, convertNumeric(obj, column));

        return result;

     * Normalize a column using range normalization.
     * @param column         The column to normalize.
     * @param dataLow        The low value for the actual data.
     * @param dataHigh       The high value for the actual data.
     * @param normalizedLow  The desired low normalized value.
     * @param normalizedHigh The desired high normalized value.
    public void normalizeRange(final int column, final double dataLow, final double dataHigh, final double normalizedLow, final double normalizedHigh) {
        for (final Object[] obj : {
            final double x = convertNumeric(obj, column);

            obj[column] = ((x - dataLow)
                    / (dataHigh - dataLow))
                    * (normalizedHigh - normalizedLow) + normalizedLow;

     * Normalize a column using range normalization.  Automatically determine the actual data high and low.
     * @param column         The column to normalize.
     * @param normalizedLow  The desired low normalized value.
     * @param normalizedHigh The desired high normalized value.
    public void normalizeRange(final int column, final double normalizedLow, final double normalizedHigh) {
        final double dataLow = getMin(column);
        final double dataHigh = getMax(column);
        normalizeRange(column, dataLow, dataHigh, normalizedLow, normalizedHigh);

     * De-Normalize a column using range normalization.
     * @param column         The column to normalize.
     * @param dataLow        The low value for the actual data.
     * @param dataHigh       The high value for the actual data.
     * @param normalizedLow  The desired low normalized value.
     * @param normalizedHigh The desired high normalized value.
    public void deNormalizeRange(final int column, final double dataLow, final double dataHigh, final double normalizedLow, final double normalizedHigh) {
        for (final Object[] obj : {
            final double x = convertNumeric(obj, column);

            obj[column] = ((dataLow - dataHigh) * x - normalizedHigh
                    * dataLow + dataHigh * normalizedLow)
                    / (normalizedLow - normalizedHigh);

     * Normalize a column using reciprocal normalization.
     * @param column The column to encode.
    public void normalizeReciprocal(final int column) {
        for (final Object[] obj : {
            final double x = convertNumeric(obj, column);
            obj[column] = 1 / x;

     * De-Normalize a column using reciprocal normalization.
     * Note: normalization and de-normalization are the same mathematical operation.
     * @param column The column to encode.
    public void deNormalizeReciprocal(final int column) {

     * Enumerate classes (factors) into a numbered set.
     * @param column The column to enumerate.
     * @return The numbered set.
    public Map<String, Integer> enumerateClasses(final int column) {
        // determine classes
        final Set<String> classes = new HashSet<String>();
        for (final Object[] obj : {
        // assign numeric values to each class
        final Map<String, Integer> result = new HashMap<String,Integer>();
        int index = 0;
        for (final String className : classes) {
            result.put(className, index++);

        return result;

     * Encode (enumerate) a column with simple numeric index encoding.
     * @param column The column to encode.
     * @return The mapping from column names to indexes.
    public Map<String, Integer> encodeNumeric(final int column) {
        final Map<String, Integer> classes = enumerateClasses(column);

        for (final Object[] obj : {
            final int index = classes.get(obj[column].toString());
            obj[column] = index;

        return classes;

     * Encode a column using "one of n" encoding.  Use 0 for the off value, and 1 for on.
     * <p/>
     * @param column The column to use.
     * @return The column to index mapping (the same result as calling enumerateClasses).
    public Map<String, Integer> encodeOneOfN(final int column) {
        return encodeOneOfN(column, 0, 1);

     * Encode a column using "one of n" encoding.
     * <p/>
     * @param column   The column to use.
     * @param offValue The off value to use.
     * @param onValue  The on value to use.
     * @return The column to index mapping (the same result as calling enumerateClasses).
    public Map<String, Integer> encodeOneOfN(final int column, final double offValue, final double onValue) {
        // remember the column name
        final String name = this.headers[column];

        // make space for it
        final Map<String, Integer> classes = enumerateClasses(column);
        insertColumns(column + 1, classes.size() - 1);

        // perform the 1 of n encode
        for (final Object[] obj : {
            final int index = classes.get(obj[column].toString());
            final int classCount = classes.size();

            for (int i = 0; i < classCount; i++) {
                obj[column + i] = (i == index) ? onValue : offValue;

        // name the new columns
        for (int i = 0; i < classes.size(); i++) {
            this.headers[column + i] = name + "-" + i;

        return classes;

     * Use equilateral encoding to encode a column, use zero for the off value and one for the on value.
     * <p/>
     * @param column The column to encode.
     * @return The column to index mapping (the same result as calling enumerateClasses).
    public Map<String, Integer> encodeEquilateral(final int column) {
        return encodeEquilateral(column, 0, 1);

     * Use equilateral encoding to encode a column, use zero for the off value and one for the on value.
     * <p/>
     * @param column   The column to use.
     * @param offValue The off value to use.
     * @param onValue  The on value to use.
     * @return The column to index mapping (the same result as calling enumerateClasses).
    public Map<String, Integer> encodeEquilateral(final int column, final double offValue, final double onValue) {
        // remember the column name
        final String name = this.headers[column];

        // make space for it
        final Map<String, Integer> classes = enumerateClasses(column);
        final int classCount = classes.size();
        insertColumns(column + 1, classCount - 1);

        // perform the equilateral
        final Equilateral eq = new Equilateral(classCount, offValue, onValue);

        for (final Object[] obj : {
            final int index = classes.get(obj[column].toString());

            final double[] encoded = eq.encode(index);

            for (int i = 0; i < classCount - 1; i++) {
                obj[column + i] = encoded[i];

        // name the new columns
        for (int i = 0; i < classes.size(); i++) {
            this.headers[column + i] = name + "-" + i;

        return classes;

     * @return The number of rows.
    public int size() {
        return data.size();

     * Append new columns to the end of the existing columns.
     * @param count The number of new columns.
    public void appendColumns(final int count) {

        // add the headers
        final String[] newHeaders = new String[getHeaderCount() + count];
        System.arraycopy(this.headers, 0, newHeaders, 0, getHeaderCount());

        for (int i = 0; i < count; i++) {
            newHeaders[i + getHeaderCount()] = "new";

        this.headers = newHeaders;

        // add the data
        for (int rowIndex = 0; rowIndex < size(); rowIndex++) {
            final Object[] originalRow =;
            final Object[] newRow = new Object[getHeaderCount()];
            System.arraycopy(originalRow, 0, newRow, 0, originalRow.length);
            for (int i = 0; i < count; i++) {
                newRow[getHeaderCount() - 1 - i] = (double) 0;
  , newRow);

     * Insert columns at a specific location.
     * @param column      The column to insert BEFORE.
     * @param columnCount The count of columns to insert.
    public void insertColumns(final int column, final int columnCount) {
        // create space for new columns

        // insert headers
        System.arraycopy(this.headers, column + 1 - columnCount, this.headers, column + 1, getHeaderCount() - 1 - column);

        // mark new columns headers
        for (int i = 0; i < columnCount; i++) {
            this.headers[column + i] = "new";

        for (final Object[] obj : {
            // insert columns
            System.arraycopy(obj, column + 1 - columnCount, obj, column + 1, getHeaderCount() - 1 - column);

            // mark new columns
            for (int i = 0; i < columnCount; i++) {
                obj[column + i] = (double) 0;


     * {@inheritDoc}
    public boolean equals(final Object other) {
        if (!(other instanceof DataSet)) {
            return false;

        final DataSet otherSet = (DataSet) other;

        // do the basic sizes match

        if (getHeaderCount() != otherSet.getHeaderCount()) {
            return false;

        if (size() != otherSet.size()) {
            return false;

        // do the headers match?
        for (int i = 0; i < getHeaderCount(); i++) {
            if (!this.headers[i].equals(otherSet.getHeaders()[i])) {
                return false;

        // does the data match?
        for (int i = 0; i < size(); i++) {
            final Object[] row1 =;
            final Object[] row2 = ((DataSet) other).getData().get(i);

            for (int j = 0; j < getHeaderCount(); j++) {
                if (!row1[j].equals(row2[j])) {
                    return false;

        return true;

     * Extract and label an unsupervised training set.
     * @param labelIndex The column index to use for the label.
     * @return The training set.
    public List<BasicData> extractUnsupervisedLabeled(final int labelIndex) {
        final List<BasicData> result = new ArrayList<BasicData>();

        final int dimensions = getHeaderCount() - 1;

        for (int rowIndex = 0; rowIndex < size(); rowIndex++) {
            final Object[] raw =;
            final BasicData row = new BasicData(dimensions, 0, raw[labelIndex].toString());

            int colIndex = 0;
            for (int rawColIndex = 0; rawColIndex < getHeaderCount(); rawColIndex++) {
                if (rawColIndex != labelIndex) {
                    row.getInput()[colIndex++] = convertNumeric(raw, rawColIndex);


        return result;

     * Extract a supervised training set.  This has both input and expected (ideal) output.
     * @param inputBegin The first input column.
     * @param inputCount The number of columns for input.
     * @param idealBegin The first ideal column.
     * @param idealCount The number of columns for ideal.
     * @return The training set.
    public List<BasicData> extractSupervised(final int inputBegin, final int inputCount, final int idealBegin, final int idealCount) {
        final List<BasicData> result = new ArrayList<BasicData>();

        for (int rowIndex = 0; rowIndex < size(); rowIndex++) {
            final Object[] raw =;
            final BasicData row = new BasicData(inputCount, idealCount);

            for (int i = 0; i < inputCount; i++) {
                row.getInput()[i] = convertNumeric(raw, inputBegin + i);

            for (int i = 0; i < idealCount; i++) {
                row.getIdeal()[i] = convertNumeric(raw, idealBegin + i);


        return result;


     * Delete all rows that contain unknown data.  An unknown column has a "?" value.
    public void deleteUnknowns() {
        int rowIndex = 0;
        while (rowIndex < {
            final Object[] row = data.get(rowIndex);
            boolean remove = false;
            for (final Object aRow : row) {
                if (aRow.toString().equals("?")) {
                    remove = true;

            if (remove) {
            } else {

     * Delete the specified column.
     * @param col The column to delete.
    public void deleteColumn(final int col) {
        final String[] headers2 = new String[headers.length - 1];

        // first, remove the header
        int h2Index = 0;
        for (int i = 0; i < headers.length; i++) {
            if (i != col) {
                headers2[h2Index++] = headers[i];
        this.headers = headers2;

        // now process the data
        int rowIndex = 0;
        for (final Object[] row : {
            final Object[] row2 = new Object[headers.length];
            int r2Index = 0;
            for (int i = 0; i <= headers.length; i++) {
                if (i != col) {
                    row2[r2Index++] = row[i];
  , row2);

     * Replace all of the specified values in a column.
     * @param columnIndex The column index.
     * @param searchFor   What to search for.
     * @param replaceWith What to replace with.
     * @param others      What to fill in the others with that do not match.
    public void replaceColumn(final int columnIndex, final double searchFor, final double replaceWith, final double others) {
        for (final Object[] row : {
            final double d = convertNumeric(row, columnIndex);
            if (Math.abs(d - searchFor) < 0.0001) {
                row[columnIndex] = replaceWith;
            } else {
                row[columnIndex] = others;


Related Classes of com.heatonresearch.aifh.normalize.DataSet

Copyright © 2018 All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact