// Source Code of org.encog.util.normalize.DataNormalization

/*
 * Encog(tm) Core v3.3 - Java Version
 * http://www.heatonresearch.com/encog/
 * https://github.com/encog/encog-java-core
 
 * Copyright 2008-2014 Heaton Research, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *   
 * For more information on Heaton Research copyrights, licenses 
 * and trademarks visit:
 * http://www.heatonresearch.com/copyright
 */
package org.encog.util.normalize;


import java.io.File;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;


import org.encog.NullStatusReportable;
import org.encog.StatusReportable;
import org.encog.ml.data.MLData;
import org.encog.ml.data.MLDataPair;
import org.encog.ml.data.MLDataSet;
import org.encog.ml.data.basic.BasicMLData;
import org.encog.util.csv.CSVFormat;
import org.encog.util.csv.ReadCSV;
import org.encog.util.normalize.input.HasFixedLength;
import org.encog.util.normalize.input.InputField;
import org.encog.util.normalize.input.InputFieldCSV;
import org.encog.util.normalize.input.InputFieldCSVText;
import org.encog.util.normalize.input.InputFieldMLDataSet;
import org.encog.util.normalize.input.MLDataFieldHolder;
import org.encog.util.normalize.output.OutputField;
import org.encog.util.normalize.output.OutputFieldGroup;
import org.encog.util.normalize.output.OutputFieldGrouped;
import org.encog.util.normalize.output.RequireTwoPass;
import org.encog.util.normalize.segregate.Segregator;
import org.encog.util.normalize.target.NormalizationStorage;
import org.encog.util.obj.ReflectionUtil;


/**
 * This class is used to normalize both input and ideal data for neural
 * networks. This class can accept input from a variety of sources and output to
 * a variety of targets. Normalization is a process by which input data is
 * normalized so that it falls in specific ranges. Neural networks typically
 * require input to be in the range of 0 to 1, or -1 to 1, depending on how the
 * network is structured.
 * 
 * The normalize class is typically given four different types of objects to
 * tell it how to process data.
 * 
 * Input Fields:
 * 
 * Input fields specify the raw data that will be read by the Normalize class.
 * Input fields are added to the Normalize class by calling addInputField
 * method. Input fields must implement the InputField interface. There are a
 * number of different input fields provided. Input data can be read from
 * several different sources. For example, you can read the "neural network
 * input" data from one CSV file and the "ideal neural network output" from
 * another.
 * 
 * 
 * Output Fields:
 * 
 * The output fields are used to specify the final output from the Normalize
 * class. The output fields specify both the "neural network input" and "ideal
 * output". The output fields are flagged as either input or ideal. The output
 * fields are not necessarily one-to-one with the input fields. For example,
 * several input fields may combine to produce a single output field. Further
 * some input fields may be used only to segregate data, whereas other input
 * fields may be ignored all together. The type of output field that you specify
 * determines the type of processing that will be done on that field. An
 * OutputField is added by calling the addOutputField method.
 * 
 * 
 * Segregators:
 * 
 * Segregators are used generally for two related purposes. First, segregators
 * can be used to exclude rows of data based on certain input values. Perhaps
 * the data includes several classes of data, and you only want to train on one
 * class. Secondly, segregators can be used to segregate data into training and
 * evaluation sets. You may choose to use 80% of your data for training and 20%
 * for evaluation. A segregator is added by calling the addSegregator method.
 * 
 * 
 * Target Storage:
 * 
 * The data created by the Normalization class must be stored somewhere. The
 * storage targets allow this to be specified. The output can be sent to a CSV
 * file, a NeuralDataSet, or any other target supported by a
 * NormalizationStorage derived class. The target is specified by calling the
 * setTarget method.
 * 
 * The normalization process can take some time. The progress can be reported to
 * a StatusReportable object.
 * 
 * The normalization is a two pass process. The first pass counts the number of
 * records and computes important statistics that will be used to normalize the
 * output. The second pass actually performs the normalization and writes to the
 * target. Both passes are performed when the process method is called.
 * 
 */
public class DataNormalization implements Serializable {


  /**
   * The serial ID.
   */
  private static final long serialVersionUID = 4387885013771660300L;


  /**
   * The input fields.
   */
  private final List<InputField> inputFields = 
    new ArrayList<InputField>();


  /**
   * The output fields.
   */
  private final List<OutputField> outputFields = 
    new ArrayList<OutputField>();


  /**
   * Keep a collection of all of the ReadCSV classes to support all of the
   * distinct CSV files that are to be read.
   */
  private transient Collection<ReadCSV> readCSV;
    
  /**
   * Hold a map between the InputFieldCSV objects and the corresponding
   * ReadCSV object. There will likely be many fields read from a single file.
   * This allows only one ReadCSV object to need to be created per actual CSV
   * file.
   */
  private transient Map<InputField, ReadCSV> csvMap;


  /**
   * For each InputFieldNeuralDataSet input field an Iterator must be kept to
   * actually access the data. Only one Iterator should be kept per data set
   * actually used.
   */
  private transient Collection<Iterator<MLDataPair>> readDataSet;


  /**
   * Map each of the input fields to an internally-build NeuralDataFieldHolder
   * object. The NeuralDataFieldHolder object holds an Iterator, InputField
   * and last NeuralDataPair object loaded.
   */
  private transient Map<InputField, MLDataFieldHolder> dataSetFieldMap;


  /**
   * Map each of the NeuralDataSet Iterators to an internally-build
   * MLDataFieldHolder object. The MLDataFieldHolder object holds an
   * Iterator, InputField and last MLDataPair object loaded.
   */
  private transient Map<Iterator<MLDataPair>, 
    MLDataFieldHolder> dataSetIteratorMap;


  /**
   * Output fields can be grouped together, if the value of one output field
   * might affect all of the others. This collection holds a list of all of
   * the output field groups.
   */
  private final Set<OutputFieldGroup> groups = 
    new HashSet<OutputFieldGroup>();


  /**
   * A list of the segregators.
   */
  private final List<Segregator> segregators = 
    new ArrayList<Segregator>();


  /**
   * Where the final output from the normalization is sent.
   */
  private NormalizationStorage storage;


  /**
   * The object to report the progress of the normalization to.
   */
  private transient StatusReportable report = new NullStatusReportable();


  /**
   * The number of records that were found in the first pass.
   */
  private int recordCount;


  /**
   * The current record's index.
   */
  private int currentIndex;


  /**
   * The format to use for all CSV files.
   */
  private CSVFormat csvFormat = CSVFormat.ENGLISH;


  /**
   * How long has it been since the last report. This filters so that every
   * single record does not produce a message.
   */
  private int lastReport;


  /**
   * Add an input field.
   * 
   * @param f
   *            The input field to add.
   */
  public void addInputField(final InputField f) {
    this.inputFields.add(f);
  }


  /**
   * Add an output field. This output field will be added as a "ML network
   * input field", not an "ideal output field".
   * 
   * @param outputField
   *            The output field to add.
   */
  public void addOutputField(final OutputField outputField) {
    addOutputField(outputField, false);
  }


  /**
   * Add a field and allow it to be specified as an "ideal output field". An
   * "ideal" field is the expected output that the ML network is training
   * towards.
   * 
   * @param outputField
   *            The output field.
   * @param ideal
   *            True if this is an ideal field.
   */
  public void addOutputField(final OutputField outputField,
      final boolean ideal) {
    this.outputFields.add(outputField);
    outputField.setIdeal(ideal);
    if (outputField instanceof OutputFieldGrouped) {
      final OutputFieldGrouped ofg = (OutputFieldGrouped) outputField;
      this.groups.add(ofg.getGroup());
    }
  }


  /**
   * Add a segregator.
   * 
   * @param segregator
   *            The segregator to add.
   */
  public void addSegregator(final Segregator segregator) {
    this.segregators.add(segregator);
    segregator.init(this);
  }


  /**
   * Called internally to allow each of the input fields to update their
   * min/max values in the first pass.
   */
  private void applyMinMax() {
    for (final InputField field : this.inputFields) {
      final double value = field.getCurrentValue();
      field.applyMinMax(value);
    }
  }


  /**
   * Build "input data for a neural network" based on the input values
   * provided. This allows input for a neural network to be normalized. This
   * is typically used when data is to be presented to a trained neural
   * network.
   * 
   * @param data
   *            The input values to be normalized.
   * @return The data to be sent to the neural network.
   */
  public MLData buildForNetworkInput(final double[] data) {


    // feed the input fields
    int index = 0;
    for (final InputField field : this.inputFields) {
      if (field.getUsedForNetworkInput()) {
        if (index >= data.length) {
          throw new NormalizationError(
              "Can't build data, input fields used for neural input, must match provided data("
                  + data.length + ").");
        }
        field.setCurrentValue(data[index++]);
      }
    }


    // count the output fields
    int outputCount = 0;
    for (final OutputField ofield : this.outputFields) {
      if (!ofield.isIdeal()) {
        for (int sub = 0; sub < ofield.getSubfieldCount(); sub++) {
          outputCount++;
        }
      }
    }


    // process the output fields


    initForOutput();


    final MLData result = new BasicMLData(outputCount);


    // write the value
    int outputIndex = 0;
    for (final OutputField ofield : this.outputFields) {
      if (!ofield.isIdeal()) {
        for (int sub = 0; sub < ofield.getSubfieldCount(); sub++) {
          result.setData(outputIndex++, ofield.calculate(sub));
        }
      }
    }


    return result;
  }


  /**
   * Called internally to obtain the current value for an input field.
   * 
   * @param field
   *            The input field to determine.
   * @param index
   *            The current index.
   * @return The value for this input field.
   */
  private double determineInputFieldValue(final InputField field,
      final int index) {
    double result = 0;


    if( field instanceof InputFieldCSVText ) {
      final InputFieldCSVText fieldCSV = (InputFieldCSVText) field;
      final ReadCSV csv = this.csvMap.get(field);
      String v = csv.get(fieldCSV.getOffset());
      if( !fieldCSV.getMappings().containsKey(v) ) {
        throw new NormalizationError("Undefined class value: " + v);
      } else {
        result =  fieldCSV.getMappings().get(v);
      }
    } else if (field instanceof InputFieldCSV) {
      final InputFieldCSV fieldCSV = (InputFieldCSV) field;
      final ReadCSV csv = this.csvMap.get(field);
      result = csv.getDouble(fieldCSV.getOffset());
    } else if (field instanceof InputFieldMLDataSet) {
      final InputFieldMLDataSet neuralField = 
        (InputFieldMLDataSet) field;
      final MLDataFieldHolder holder = this.dataSetFieldMap
          .get(field);
      final MLDataPair pair = holder.getPair();
      int offset = neuralField.getOffset();
      if (offset < pair.getInput().size()) {
        result = pair.getInput().getData(offset);
      } else {
        offset -= pair.getInput().size();
        result = pair.getIdeal().getData(offset);
      }
    } else {
      result = field.getValue(index);
    }


    field.setCurrentValue(result);
    return result;
  }


  /**
   * Called internally to determine all of the input field values.
   * 
   * @param index
   *            The current index.
   */
  private void determineInputFieldValues(final int index) {
    for (final InputField field : this.inputFields) {
      determineInputFieldValue(field, index);
    }
  }


  /**
   * Find an input field by its class.
   * 
   * @param clazz
   *            The input field class type you are looking for.
   * @param count
   *            The instance of the input field needed, 0 for the first.
   * @return The input field if found, otherwise null.
   */
  public InputField findInputField(final Class< ? > clazz, final int count) {
    int i = 0;
    for (final InputField field : this.inputFields) {
      if (ReflectionUtil.isInstanceOf(field.getClass(), clazz)) {
        if (i == count) {
          return field;
        }
        i++;
      }
    }


    return null;
  }


  /**
   * Find an output field by its class.
   * 
   * @param clazz
   *            The output field class type you are looking for.
   * @param count
   *            The instance of the output field needed, 0 for the first.
   * @return The output field if found, otherwise null.
   */
  public OutputField findOutputField(final Class< ? > clazz, 
      final int count) {
    int i = 0;
    for (final OutputField field : this.outputFields) {
      if (ReflectionUtil.isInstanceOf(field.getClass(), clazz)) {
        if (i == count) {
          return field;
        }
        i++;
      }
    }


    return null;
  }


  /**
   * First pass, count everything, establish min/max.
   */
  private void firstPass() {
    openCSV();
    openDataSet();


    this.currentIndex = -1;
    this.recordCount = 0;


    this.report.report(0, 0, "Analyzing file");
    this.lastReport = 0;
    int index = 0;


    initForPass();


    // loop over all of the records
    while (next()) {


      determineInputFieldValues(index);


      if (shouldInclude()) {
        applyMinMax();
        this.recordCount++;
        reportResult("First pass, analyzing file", 0, this.recordCount);
      }
      index++;
    }
  }


  /**
   * @return The CSV format being used.
   */
  public CSVFormat getCSVFormat() {
    return this.csvFormat;
  }


  /**
   * @return The object groups.
   */
  public Set<OutputFieldGroup> getGroups() {
    return this.groups;
  }


  /**
   * @return The input fields.
   */
  public List<InputField> getInputFields() {
    return this.inputFields;
  }


  /**
   * @return The number of output fields that are not used as ideal values,
   *         these will be the input to the neural network. This is the input
   *         layer size for the neural network.
   */
  public int getNetworkInputLayerSize() {
    int result = 0;
    for (final OutputField field : this.outputFields) {
      if (!field.isIdeal()) {
        result += field.getSubfieldCount();
      }
    }
    return result;
  }


  /**
   * @return The number of output fields that are used as ideal values, these
   *         will be the ideal output from the neural network. This is the
   *         output layer size for the neural network.
   */
  public int getNetworkOutputLayerSize() {
    int result = 0;
    for (final OutputField field : this.outputFields) {
      if (field.isIdeal()) {
        result += field.getSubfieldCount();
      }
    }
    return result;
  }


  /**
   * @return The total size of all output fields. This takes into account
   *         output fields that generate more than one value.
   */
  public int getOutputFieldCount() {
    int result = 0;
    for (final OutputField field : this.outputFields) {
      result += field.getSubfieldCount();
    }
    return result;
  }


  /**
   * @return The output fields.
   */
  public List<OutputField> getOutputFields() {
    return this.outputFields;
  }


  /**
   * @return The record count.
   */
  public int getRecordCount() {
    return this.recordCount;
  }


  /**
   * @return The class that progress will be reported to.
   */
  public StatusReportable getReport() {
    return this.report;
  }


  /**
   * @return The segregators in use.
   */
  public List<Segregator> getSegregators() {
    return this.segregators;
  }


  /**
   * @return The place that the normalization output will be stored.
   */
  public NormalizationStorage getStorage() {
    return this.storage;
  }


  /**
   * Setup the row for output.
   */
  public void initForOutput() {


    // init groups
    for (final OutputFieldGroup group : this.groups) {
      group.rowInit();
    }


    // init output fields
    for (final OutputField field : this.outputFields) {
      field.rowInit();
    }
  }


  /**
   * Setup the row for output.
   */
  public void initForPass() {


    // init segregators
    for (final Segregator segregator : this.segregators) {
      segregator.passInit();
    }
  }


  /**
   * Called internally to advance to the next row.
   * 
   * @return True if there are more rows to reed.
   */
  private boolean next() {


    // see if any of the CSV readers want to stop
    for (final ReadCSV csv : this.readCSV) {
      if (!csv.next()) {
        return false;
      }
    }


    // see if any of the data sets want to stop
    for (final Iterator<MLDataPair> iterator : this.readDataSet) {
      if (!iterator.hasNext()) {
        return false;
      }
      final MLDataFieldHolder holder = this.dataSetIteratorMap
          .get(iterator);
      final MLDataPair pair = iterator.next();
      holder.setPair(pair);
    }


    // see if any of the arrays want to stop
    for (final InputField field : this.inputFields) {
      if (field instanceof HasFixedLength) {
        final HasFixedLength fixed = (HasFixedLength) field;
        if ((this.currentIndex + 1) >= fixed.length()) {
          return false;
        }
      }
    }


    this.currentIndex++;


    return true;
  }


  /**
   * Called internally to open the CSV file.
   */
  private void openCSV() {
    // clear out any CSV files already there
    this.csvMap.clear();
    this.readCSV.clear();


    // only add each CSV once
    final Map<File, ReadCSV> uniqueFiles = new HashMap<File, ReadCSV>();


    // find the unique files
    for (final InputField field : this.inputFields) {
      if (field instanceof InputFieldCSV) {
        final InputFieldCSV csvField = (InputFieldCSV) field;
        final File file = csvField.getFile();
        if (!uniqueFiles.containsKey(file)) {
          final ReadCSV csv = new ReadCSV(file.toString(), false,
              this.csvFormat);
          uniqueFiles.put(file, csv);
          this.readCSV.add(csv);
        }
        this.csvMap.put(csvField, uniqueFiles.get(file));
      }
    }
  }


  /**
   * Open any datasets that were used by the input layer.
   */
  private void openDataSet() {
    // clear out any data sets already there
    this.readDataSet.clear();
    this.dataSetFieldMap.clear();
    this.dataSetIteratorMap.clear();


    // only add each iterator once
    final Map<MLDataSet, MLDataFieldHolder> uniqueSets = 
      new HashMap<MLDataSet, MLDataFieldHolder>();


    // find the unique files
    for (final InputField field : this.inputFields) {
      if (field instanceof InputFieldMLDataSet) {
        final InputFieldMLDataSet dataSetField = 
          (InputFieldMLDataSet) field;
        final MLDataSet dataSet = dataSetField.getNeuralDataSet();
        if (!uniqueSets.containsKey(dataSet)) {
          final Iterator<MLDataPair> iterator = dataSet
              .iterator();
          final MLDataFieldHolder holder = 
            new MLDataFieldHolder(
              iterator, dataSetField);
          uniqueSets.put(dataSet, holder);
          this.readDataSet.add(iterator);
        }


        final MLDataFieldHolder holder = uniqueSets.get(dataSet);


        this.dataSetFieldMap.put(dataSetField, holder);
        this.dataSetIteratorMap.put(holder.getIterator(), holder);
      }
    }
  }
  
  public void init() {
    this.readCSV = new ArrayList<ReadCSV>();
      
    this.csvMap = 
      new HashMap<InputField, ReadCSV>();


    this.readDataSet = 
      new ArrayList<Iterator<MLDataPair>>();


    this.dataSetFieldMap = 
      new HashMap<InputField, MLDataFieldHolder>();


    this.dataSetIteratorMap = 
      new HashMap<Iterator<MLDataPair>, MLDataFieldHolder>();
    
    if( this.report==null ) {
      this.report = new NullStatusReportable();
    }
  }


  /**
   * Call this method to begin the normalization process. Any status updates
   * will be sent to the class specified in the constructor.
   */
  public void process() {
    
    init();


    if (twoPassesNeeded()) {
      firstPass();
    }


    secondPass();
  }


  /**
   * Report on the current progress.
   * 
   * @param message
   *            The message to report.
   * @param total
   *            The total number of records to process, 0 for unknown.
   * @param current
   *            The current record.
   */
  private void reportResult(final String message, final int total,
      final int current) {
    // count the records, report status
    this.lastReport++;
    if (this.lastReport >= 10000) {
      this.report.report(total, current, message);
      this.lastReport = 0;
    }
  }


  /**
   * The second pass actually writes the data to the output files.
   */
  private void secondPass() {


    final boolean twopass = twoPassesNeeded();


    // move any CSV and datasets files back to the beginning.
    openCSV();
    openDataSet();
    initForPass();


    this.currentIndex = -1;


    // process the records
    final int size = getOutputFieldCount();
    final double[] output = new double[size];
    
    if( storage==null ) {
      throw new NormalizationError("Must define storage target.");
    }


    this.storage.open(this);
    this.lastReport = 0;
    int index = 0;
    int current = 0;
    while (next()) {
      // read the value
      for (final InputField field : this.inputFields) {
        determineInputFieldValue(field, index);
      }


      if (shouldInclude()) {
        // handle groups
        initForOutput();


        // write the value
        int outputIndex = 0;
        for (final OutputField ofield : this.outputFields) {
          for (int sub = 0; sub < ofield.getSubfieldCount(); sub++) {
            output[outputIndex++] = ofield.calculate(sub);
          }
        }


        if (twopass) {
          reportResult("Second pass, normalizing data",
              this.recordCount, ++current);
        } else {
          reportResult("Processing data (single pass)",
              this.recordCount, ++current);
        }


        this.storage.write(output, 0);
      }


      index++;
    }
    this.storage.close();


  }


  /**
   * Set the CSV format to use.
   * 
   * @param csvFormat
   *            The CSV format to use.
   */
  public void setCSVFormat(final CSVFormat csvFormat) {
    this.csvFormat = csvFormat;
  }


  /**
   * Set the object that this one is reporting to.
   * 
   * @param report
   *            The object that progress reports should be sent to.
   */
  public void setReport(final StatusReportable report) {
    this.report = report;
  }


  /**
   * Determines where the normalized data will be sent.
   * 
   * @param target
   *            The target.
   */
  public void setTarget(final NormalizationStorage target) {
    this.storage = target;
  }


  /**
   * Should this row be included? Check the segregators.
   * 
   * @return True if the row should be included.
   */
  private boolean shouldInclude() {
    
    // If no segregators, then include
    if( this.segregators.size()==0 )
      return true;
    
    // include if one segregator says to include
    boolean included = false;
    for (final Segregator segregator : this.segregators) {


      if (segregator.shouldInclude()) {
        included = true;
      }
    }
    return included;
  }


  /**
   * @return True, if two passes are needed.
   */
  public boolean twoPassesNeeded() {
    for (final OutputField field : this.outputFields) {
      if (field instanceof RequireTwoPass) {
        return true;
      }
    }


    return false;
  }
}
// Source Code of org.encog.util.normalize.DataNormalization

// Related Classes of org.encog.util.normalize.DataNormalization