/**
* Copyright 2012, Wisdom Omuya.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.deafgoat.ml.prognosticator;
// Java
import java.io.FileReader;
import java.io.IOException;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
// Log4j
import org.apache.log4j.Logger;
// JSON
import org.json.JSONException;
// SuperCSV
import org.supercsv.cellprocessor.ift.CellProcessor;
import org.supercsv.io.CsvListReader;
import org.supercsv.io.ICsvListReader;
import org.supercsv.prefs.CsvPreference;
// Weka
import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instances;
import weka.core.Utils;
/**
* This class helps us create ARFF files from CSV dumps using custom
* configuration parameters.
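* <p>
* A typical usage sketch (the file names here are hypothetical and the
* {@link ConfigReader} construction depends on how your JSON configuration is
* loaded):
* </p>
*
* <pre>{@code
* ConfigReader config = ...; // built from your JSON configuration
* ARFFWriter writer = new ARFFWriter(config, "dump.csv", "dump.arff");
* writer.writeARFF(); // writes the WEKA-ready .ARFF file
* }</pre>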
*/
public final class ARFFWriter {
/**
* Allows the user to add composite features for learning
*/
private void addUserDefinedAttributes() {
_logger.info("Adding user defined features");
// initialize list of user defined attributes
_userDefinedAttributes = new ArrayList<UserDefinedAttribute>();
// add each of the user defined attributes. e.g.
// _userDefinedAttributes.add(new TempChange());
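// For reference, a hypothetical TempChange might look like the sketch below
// (this assumes UserDefinedAttribute declares the three methods this class
// calls; the method bodies are illustrative only):
//
// public class TempChange implements UserDefinedAttribute {
//     // describes the derived feature (name, type, 'include' flag)
//     public Attributes getUserDefinedAttribute() { ... }
//     // the feature's name, e.g. "tempChange"
//     public String getAttributeName() { ... }
//     // computes the feature's value from the raw CSV record
//     public String getAttributeValue(List<String> instance,
//             Map<String, Integer> internalMap, ConfigReader config) { ... }
// }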
// do not edit the lines below this
Attributes attribute = null;
for (UserDefinedAttribute userDefined : _userDefinedAttributes) {
attribute = userDefined.getUserDefinedAttribute();
setType(attribute, false);
_logger.info("Added " + attribute.getAttributeName());
}
}
/**
* Extracts the instances from the .CSV file
*
* @throws IOException
* @throws ParseException
*/
private void extractInstances() throws IOException, ParseException {
int index, count = 0;
String curValue = null;
String classValue = null;
List<String> dump;
Attributes attribute;
setReader(_inCSV);
String[] header = _reader.getCSVHeader(true);
_values = new double[_data.numAttributes()];
if (_logger.isDebugEnabled()) {
_logger.debug("Now extracting instances");
}
while ((dump = _reader.read(_processing)) != null) {
for (int i = 0; i < dump.size(); i++) {
curValue = dump.get(i).trim();
index = _config._attributeMap.get(_config._dumpFile).get(header[i].trim().toLowerCase());
attribute = _config._attributes.get(_config._dumpFile).get(index);
// only consider attributes marked as 'include'
if (attribute.isInclude()) {
// insert attribute into double instance array
insertValue(attribute, curValue);
if (attribute.getAttributeName().equals(_config._classValue)) {
classValue = curValue;
}
}
}
// insert user defined feature for this instance
setUserDefinedAttributes(dump);
count += 1;
if (count % 10000 == 0) {
if (_logger.isDebugEnabled()) {
_logger.debug("Processed " + count + " records.");
}
}
// add weights according to instance class value
if (classValue.equals(_config._positiveClassValue)) {
_data.add(new DenseInstance(_config._positiveClassWeight, _values));
} else {
_data.add(new DenseInstance(_config._negativeClassWeight, _values));
}
_values = new double[_data.numAttributes()];
}
_logger.info("Done. Processed " + count + " records!");
}
/**
* Generates a mapping from attribute name to its index. This is
* important in creating the .ARFF file in an acceptable format. Note: this
* must be called after all user-defined attributes have been added.
*/
private void generateAttributeMap() {
if (_logger.isDebugEnabled()) {
_logger.debug("Creating attribute map");
}
_attributeMap = new HashMap<String, Integer>();
int numAttributes = _attributes.size();
// WEKA map
for (int i = 0; i < numAttributes; i++) {
_attributeMap.put(_attributes.get(i).name(), i);
}
}
/**
* Initializes our instances object once we've added all our dump and
* user-defined features
*/
private void initializeInstances() {
_logger.info("Initializing instances object");
_data = new Instances(_config._relation, _attributes, 0);
}
/**
* Inserts the value of an attribute at the appropriate position
*
* @param attribute
* The attribute object
* @param value
* The attribute value to insert
* @throws ParseException
*/
private void insertValue(Attributes attribute, String value) throws ParseException {
String name = attribute.getAttributeName();
String type = attribute.getAttributeType();
if (value.equals("") || value.equals("?")) {
_values[_attributeMap.get(name)] = Utils.missingValue();
} else if (type.equals("numeric")) {
_values[_attributeMap.get(name)] = Double.parseDouble(value);
} else if (type.equals("string")) {
_values[_attributeMap.get(name)] = _data.attribute(_attributeMap.get(name)).addStringValue(value);
} else if (type.equals("date")) {
_values[_attributeMap.get(name)] = _data.attribute(_attributeMap.get(name)).parseDate(value);
} else if (type.equals("nominal")) {
_values[_attributeMap.get(name)] = _nominalRange.get(name).indexOf(value);
} else {
_logger.warn("Found unanticipated entry set: " + name + " = " + value);
}
}
/**
* Prints, for each attribute, the number of non-missing values across all
* instances (the missing count is the total record count minus this value)
*
* @throws IOException
*/
public void printMissingAttributeCount() throws IOException {
List<String> dump;
setReader(_inCSV);
String[] header = _reader.getCSVHeader(true);
Map<String, Integer> attCount = new HashMap<String, Integer>();
if (_logger.isDebugEnabled()) {
_logger.debug("Discovering missing attributes");
}
// iterate through all records, counting non-missing values per attribute
while ((dump = _reader.read(_processing)) != null) {
for (int i = 0; i < dump.size(); i++) {
if (!dump.get(i).equals("")) {
if (!attCount.containsKey(header[i])) {
attCount.put(header[i], 1);
} else {
attCount.put(header[i], attCount.get(header[i]) + 1);
}
}
}
}
// print counts
for (Map.Entry<String, Integer> entry : attCount.entrySet()) {
System.out.format("%-42s = %10d%n", entry.getKey(), entry.getValue());
}
}
/**
* Saves an Instances object to .ARFF
*
* @param instances
* The set of instances to write
* @param outFile
* The file to which to write the instances
* @throws IOException
* If instances object can not be written
*/
public void saveInstancesToARFF(Instances instances, String outFile) throws IOException {
if (_logger.isDebugEnabled()) {
_logger.debug("Now saving instances");
}
InstancesWriter.writeInstances(instances, outFile);
if (_logger.isDebugEnabled()) {
_logger.debug("Saved instances to " + outFile);
}
}
/**
* Sets the WEKA type for all included attributes (nominal attributes other
* than the class are added when their ranges are discovered)
*
* @throws IOException
*/
private void setAttributeType() throws IOException {
if (_logger.isDebugEnabled()) {
_logger.debug("Setting attribute types");
}
// iterate through all attributes and set their type
Attributes attribute = null;
for (int i = 0; i < _config._attributes.get(_config._dumpFile).size(); i++) {
attribute = _config._attributes.get(_config._dumpFile).get(i);
if (attribute.isInclude()) {
setType(attribute, false);
}
}
// set the class attribute type
setType(_target, true);
}
/**
* Sets the internal map used by user-defined features. Note that features
* defined by the user may use attributes that are not marked as 'include' in
* the configuration file.
*/
private void setInternalMap() {
_internalMap = new HashMap<String, Integer>();
// Internal map
for (int i = 0; i < _header.length; i++) {
_internalMap.put(_header[i].trim(), i);
}
}
/**
* Discovers the set of possible values for each nominal attribute
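* (e.g. a hypothetical "color" column whose cells contain only red, green
* and blue yields the nominal range {red, green, blue})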
*
* @throws IOException
*/
private void setNominalRange() throws IOException {
int index = 0;
List<String> dump;
Attributes attribute;
setReader(_inCSV);
String key, name, value = null;
_header = _reader.getCSVHeader(true);
Map<String, Set<String>> nominalAttRange = new HashMap<String, Set<String>>();
for (UserDefinedAttribute uda : _userDefinedAttributes) {
nominalAttRange.put(uda.getAttributeName(), new HashSet<String>());
}
setInternalMap();
if (_logger.isDebugEnabled()) {
_logger.debug("Discovering nominal ranges");
}
// iterate through all records to discover
// data set nominal attribute ranges.
while ((dump = _reader.read(_processing)) != null) {
for (int i = 0; i < dump.size(); i++) {
value = dump.get(i).trim();
if (!value.equals("")) {
key = _header[i].trim();
try {
index = _config._attributeMap.get(_config._dumpFile).get(key.toLowerCase());
} catch (Exception e) {
_logger.error(
"Could not find specified configuration attribute " + key + "\n" + e.getMessage(), e);
}
attribute = _config._attributes.get(_config._dumpFile).get(index);
// check that attribute is to be included and 'nominal'
if (attribute.getAttributeType().equals("nominal") && attribute.isInclude()) {
name = attribute.getAttributeName();
if (nominalAttRange.get(name) == null) {
nominalAttRange.put(name, new HashSet<String>());
}
nominalAttRange.get(name).add(value);
for (UserDefinedAttribute uda : _userDefinedAttributes) {
if (uda.getUserDefinedAttribute().isInclude()) {
name = uda.getAttributeName();
value = uda.getAttributeValue(dump, _internalMap, _config);
if (!value.equals("")) {
nominalAttRange.get(name).add(value);
}
}
}
}
}
}
}
ArrayList<String> curAttribute;
// convert the sets to ordered lists for later retrieval
_nominalRange = new HashMap<String, ArrayList<String>>();
for (Entry<String, Set<String>> entry : nominalAttRange.entrySet()) {
_nominalRange.put(entry.getKey(), new ArrayList<String>(entry.getValue()));
}
ArrayList<String> _value;
// add the list to our attributes object
for (Entry<String, ArrayList<String>> entry : _nominalRange.entrySet()) {
_value = entry.getValue();
curAttribute = new ArrayList<String>();
for (String s : _value) {
curAttribute.add(s);
}
// convenience check to ensure class attribute is added last
if (entry.getKey().equals(_config._classValue)) {
_targetRange = curAttribute;
} else {
_attributes.add(new Attribute(entry.getKey(), curAttribute));
}
}
}
/**
* Resets the .CSV file reader
*
* @param inCSV
* @throws IOException
*/
private void setReader(String inCSV) throws IOException {
_reader = new CsvListReader(new FileReader(inCSV), CsvPreference.EXCEL_PREFERENCE);
}
/**
* Adds an attribute to the WEKA attribute list based on its type
*
* @param attribute
* The attribute to set
* @param targetSet
* The flag that allows the class value to be set last
*/
private void setType(Attributes attribute, boolean targetSet) {
String name = attribute.getAttributeName();
String type = attribute.getAttributeType();
_internalAttributes = new ArrayList<Attributes>();
_internalAttributes.add(attribute);
if (name.equals(_config._classValue) && !targetSet) {
_target = attribute;
} else if (type.equals("nominal")) {
if (targetSet) {
_attributes.add(new Attribute(_target.getAttributeName(), _targetRange));
} else {
return;
}
} else if (type.equals("numeric")) {
_attributes.add(new Attribute(name));
} else if (type.equals("string")) {
_attributes.add(new Attribute(name, (ArrayList<String>) null));
} else if (type.equals("date")) {
_attributes.add(new Attribute(name, _config._dateFormat));
} else {
_logger.warn("Found unexpected key: " + attribute.getAttributeName());
}
}
/**
* Sets the values of all user defined features on current instance
*
* @throws ParseException
*/
private void setUserDefinedAttributes(List<String> instance) throws ParseException {
Attributes attribute = null;
String value = null;
for (UserDefinedAttribute userDefined : _userDefinedAttributes) {
attribute = userDefined.getUserDefinedAttribute();
value = userDefined.getAttributeValue(instance, _internalMap, _config);
insertValue(attribute, value);
}
}
/**
* Performs steps involved in creating the .ARFF file in an acceptable
* format.
*
* @throws Exception
*/
public void writeARFF() throws Exception {
_logger.info("Beginning ARFF creation");
// add any other interesting attributes
addUserDefinedAttributes();
// predetermines the range of nominal attributes
setNominalRange();
// sets the attribute types of all other attributes
setAttributeType();
// internal method used to track attribute index
generateAttributeMap();
// initializes the instances object
initializeInstances();
// extracts the instance from the dump
extractInstances();
// saves the instances to file
saveInstancesToARFF(_data, _outARFF);
_logger.info("Finished ARFF creation");
}
/**
* contains a mapping of WEKA attribute name to its location
*/
private Map<String, Integer> _attributeMap;
/**
* contains the list of all WEKA-transformed attributes from the configuration
*/
private ArrayList<Attribute> _attributes;
/**
* the handle to the configuration reader
*/
private ConfigReader _config;
/**
* the set of instances to be written
*/
private Instances _data;
/**
* the headers of the CSV dump
*/
private String[] _header;
/**
* the reference CSV dump
*/
private String _inCSV;
/**
* contains a list of all internal attributes described in configuration
*/
private ArrayList<Attributes> _internalAttributes;
/**
* contains a mapping of internal attribute name to its location
*/
private Map<String, Integer> _internalMap;
/**
* a handle to the logging object
*/
private Logger _logger;
/**
* contains a mapping of nominal attributes to their range
*/
private Map<String, ArrayList<String>> _nominalRange;
/**
* the target ARFF file to be written to
*/
private String _outARFF;
/**
* the cell processing object
*/
private CellProcessor[] _processing;
/**
* the CSV dump reader
*/
private ICsvListReader _reader;
/**
* holds the target class of the data set
*/
private Attributes _target;
/**
* holds the value range of the (possibly nominal) class attribute
*/
private ArrayList<String> _targetRange;
/**
* the list of user-defined attributes
*/
private List<UserDefinedAttribute> _userDefinedAttributes;
/**
* the array of values holding the current instance
*/
private double[] _values;
/**
* Class constructor
*
* @throws IOException
*/
public ARFFWriter() throws IOException {
_logger = Logger.getLogger(AppLogger.class.getName());
}
/**
* Class constructor - reads the input .CSV and output .ARFF paths from the configuration.
*
* @param config
* @throws IOException
* @throws JSONException
*/
public ARFFWriter(ConfigReader config) throws JSONException, IOException {
_logger = AppLogger.getLogger();
_config = config;
_config.readConfig();
_config.setAttributeMap();
_attributes = new ArrayList<Attribute>();
// We don't want to enforce any constraints for now
int columnSize = _config._attributes.get(_config._dumpFile).size();
_processing = new CellProcessor[columnSize];
_outARFF = _config._dumpARFF;
_inCSV = _config._dumpFile;
setReader(_inCSV);
}
/**
* Class constructor - accepts input .CSV file and saves to .ARFF file.
*
* @param config
* @param inCSV
* @param outARFF
* @throws IOException
* @throws JSONException
*/
public ARFFWriter(ConfigReader config, String inCSV, String outARFF) throws IOException, JSONException {
_logger = AppLogger.getLogger();
_config = config;
_config.readConfig();
_config.setAttributeMap();
_attributes = new ArrayList<Attribute>();
// We don't want to enforce any constraints for now
int columnSize = _config._attributes.get(_config._dumpFile).size();
_processing = new CellProcessor[columnSize];
_outARFF = outARFF;
_inCSV = inCSV;
setReader(_inCSV);
}
}