/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/*
* ConverterUtils.java
* Copyright (C) 2000 University of Waikato, Hamilton, New Zealand
*
*/
package org.integratedmodelling.riskwiz.learning.data.loader;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;
import java.io.StreamTokenizer;
import java.net.URL;
import java.util.Collections;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.Vector;
import org.integratedmodelling.riskwiz.learning.data.Instance;
import org.integratedmodelling.riskwiz.learning.data.Instances;
/**
* Utility routines for the converter package.
*
* @author Mark Hall (mhall@cs.waikato.ac.nz)
* @author FracPete (fracpete at waikato dot ac dot nz)
* @version $Revision: 1.14 $
* @see Serializable
*/
public class ConverterUtils
implements Serializable {
/**
* Helper class for loading data from files and URLs. It uses the ConverterUtils
* class to determine which converter to use for loading the data into
* memory. If the chosen converter is incremental, the data is loaded
* incrementally, otherwise as a batch. In both cases the same
* interface is used (<code>hasMoreElements</code>,
* <code>nextElement</code>). Before the data can be read again, the
* <code>reset</code> method has to be called.
* The data source can also be initialized with an Instances object, in
* order to provide a unified interface to files and already loaded datasets.
*
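* A minimal usage sketch for reading instance by instance (the file name
* <code>"data.arff"</code> is only a placeholder; the same loop works for
* incremental and batch loaders alike):
* <pre>
* DataSource source = new DataSource("data.arff");
* Instances structure = source.getStructure();
* while (source.hasMoreElements(structure)) {
*     Instance inst = source.nextElement(structure);
*     // ... process inst
* }
* source.reset();
* </pre>
*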
* @author FracPete (fracpete at waikato dot ac dot nz)
* @version $Revision: 1.14 $
* @see #hasMoreElements(Instances)
* @see #nextElement(Instances)
* @see #reset()
* @see DataSink
*/
public static class DataSource
implements Serializable {
/** for serialization. */
private static final long serialVersionUID = -613122395928757332L;
/** the file to load. */
protected File m_File;
/** the URL to load. */
protected URL m_URL;
/** the loader. */
protected Loader m_Loader;
/** whether the loader is incremental. */
protected boolean m_Incremental;
/** the instance counter for the batch case. */
protected int m_BatchCounter;
/** the last internally read instance. */
protected Instance m_IncrementalBuffer;
/** the batch buffer. */
protected Instances m_BatchBuffer;
/**
* Tries to load the data from the given location, which can be either a
* regular file or a web location (http://, https://, ftp:// or file://).
*
* @param location the name of the file or URL to load
* @param loader the loader to use if the location is not an ARFF file
* @throws Exception if initialization fails
*/
public DataSource(String location, Loader loader) throws Exception {
super();
// file or URL?
if (location.startsWith("http://")
|| location.startsWith("https://")
|| location.startsWith("ftp://")
|| location.startsWith("file://")) {
m_URL = new URL(location);
} else {
m_File = new File(location);
}
// quick check: is it ARFF?
if (isArff(location)) {
m_Loader = new ArffLoader();
} else {
m_Loader = loader;
// do we have a converter?
if (m_Loader == null) {
throw new IllegalArgumentException(
"No suitable converter found for '" + location
+ "'!");
}
}
// incremental loader?
m_Incremental = (m_Loader instanceof IncrementalConverter);
reset();
}
/**
* Tries to load the data from the given location. Can be either a regular
* file or a web location (http://, https://, ftp:// or file://); only ARFF
* locations are accepted by this constructor.
*
* @param location the name of the ARFF file or URL to load
* @throws Exception if initialization fails
*/
public DataSource(String location) throws Exception {
super();
// file or URL?
if (location.startsWith("http://")
|| location.startsWith("https://")
|| location.startsWith("ftp://")
|| location.startsWith("file://")) {
m_URL = new URL(location);
} else {
m_File = new File(location);
}
// quick check: is it ARFF?
if (isArff(location)) {
m_Loader = new ArffLoader();
} else {
throw new IllegalArgumentException(
"No suitable converter found for '" + location + "'!");
}
// incremental loader?
m_Incremental = (m_Loader instanceof IncrementalConverter);
reset();
}
/**
* Initializes the datasource with the given dataset.
*
* @param inst the dataset to use
*/
public DataSource(Instances inst) {
super();
m_BatchBuffer = inst;
m_Loader = null;
m_File = null;
m_URL = null;
m_Incremental = false;
}
/**
* Initializes the datasource with the given Loader.
*
* @param loader the Loader to use
*/
public DataSource(Loader loader) {
super();
m_BatchBuffer = null;
m_Loader = loader;
m_File = null;
m_URL = null;
m_Incremental = (m_Loader instanceof IncrementalConverter);
initBatchBuffer();
}
/**
* Initializes the datasource with the given input stream. This stream
* is always interpreted as ARFF.
*
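* A minimal sketch (the file name is only a placeholder; any stream
* containing ARFF data will do):
* <pre>
* DataSource source = new DataSource(new java.io.FileInputStream("data.arff"));
* Instances data = source.getDataSet();
* </pre>
*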
* @param stream the stream to use
*/
public DataSource(InputStream stream) {
super();
m_BatchBuffer = null;
m_Loader = new ArffLoader();
try {
m_Loader.setSource(stream);
} catch (Exception e) {
m_Loader = null;
}
m_File = null;
m_URL = null;
m_Incremental = (m_Loader instanceof IncrementalConverter);
initBatchBuffer();
}
/**
* initializes the batch buffer if necessary, i.e., for non-incremental
* loaders.
*/
protected void initBatchBuffer() {
try {
if ((m_Loader != null) && !isIncremental()) {
m_BatchBuffer = m_Loader.getDataSet();
} else {
m_BatchBuffer = null;
}
} catch (Exception e) {
e.printStackTrace();
}
}
/**
* returns whether the extension of the location is likely to be of ARFF
* format, i.e., ending in ".arff" or ".arff.gz" (case-insensitive).
*
* @param location the file location to check
* @return true if the location seems to be of ARFF format
*/
public static boolean isArff(String location) {
String loc = location.toLowerCase();
return loc.endsWith(ArffLoader.FILE_EXTENSION.toLowerCase())
|| loc.endsWith(ArffLoader.FILE_EXTENSION_COMPRESSED.toLowerCase());
}
/**
* returns whether the loader is an incremental one.
*
* @return true if the loader is a true incremental one
*/
public boolean isIncremental() {
return m_Incremental;
}
/**
* returns the determined loader, null if the DataSource was initialized
* with data alone and not a file/URL.
*
* @return the loader used for retrieving the data
*/
public Loader getLoader() {
return m_Loader;
}
/**
* returns the full dataset, can be null in case of an error.
*
* @return the full dataset
* @throws Exception if resetting of loader fails
*/
public Instances getDataSet() throws Exception {
Instances result;
result = null;
// reset the loader
reset();
try {
if (m_Loader != null) {
result = m_Loader.getDataSet();
} else {
result = m_BatchBuffer;
}
} catch (Exception e) {
e.printStackTrace();
result = null;
}
return result;
}
/**
* returns the full dataset with the specified class index set,
* can be null in case of an error.
*
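* For example, to treat the last attribute as the class (assuming, as in
* Weka, that <code>Instances</code> exposes <code>numAttributes()</code>):
* <pre>
* DataSource source = new DataSource("data.arff");
* Instances data = source.getDataSet(source.getStructure().numAttributes() - 1);
* </pre>
*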
* @param classIndex the class index for the dataset
* @return the full dataset
* @throws Exception if resetting of loader fails
*/
public Instances getDataSet(int classIndex) throws Exception {
Instances result;
result = getDataSet();
if (result != null) {
result.setClassIndex(classIndex);
}
return result;
}
/**
* resets the loader.
*
* @throws Exception if resetting fails
*/
public void reset() throws Exception {
if (m_File != null) {
((AbstractFileLoader) m_Loader).setFile(m_File);
} else if (m_URL != null) {
((URLSourcedLoader) m_Loader).setURL(m_URL.toString());
} else if (m_Loader != null) {
m_Loader.reset();
}
m_BatchCounter = 0;
m_IncrementalBuffer = null;
if (m_Loader != null) {
if (!isIncremental()) {
m_BatchBuffer = m_Loader.getDataSet();
} else {
m_BatchBuffer = null;
}
}
}
/**
* returns the structure of the data.
*
* @return the structure of the data
* @throws Exception if something goes wrong
*/
public Instances getStructure() throws Exception {
if (m_Loader != null) {
return m_Loader.getStructure();
} else {
return new Instances(m_BatchBuffer, 0);
}
}
/**
* returns the structure of the data, with the defined class index.
*
* @param classIndex the class index for the dataset
* @return the structure of the data
* @throws Exception if something goes wrong
*/
public Instances getStructure(int classIndex) throws Exception {
Instances result;
result = getStructure();
if (result != null) {
result.setClassIndex(classIndex);
}
return result;
}
/**
* returns whether there are more Instance objects in the data.
*
* @param structure the structure of the dataset
* @return true if there are more Instance objects
* available
* @see #nextElement(Instances)
*/
public boolean hasMoreElements(Instances structure) {
boolean result;
result = false;
if (isIncremental()) {
// user still hasn't collected the last one?
if (m_IncrementalBuffer != null) {
result = true;
} else {
try {
m_IncrementalBuffer = m_Loader.getNextInstance(structure);
result = (m_IncrementalBuffer != null);
} catch (Exception e) {
e.printStackTrace();
result = false;
}
}
} else {
result = (m_BatchCounter < m_BatchBuffer.numInstances());
}
return result;
}
/**
* returns the next element and sets the specified dataset, null if
* none available.
*
* @param dataset the dataset to set for the instance
* @return the next Instance
*/
public Instance nextElement(Instances dataset) {
Instance result;
result = null;
if (isIncremental()) {
// is there still an instance in the buffer?
if (m_IncrementalBuffer != null) {
result = m_IncrementalBuffer;
m_IncrementalBuffer = null;
} else {
try {
result = m_Loader.getNextInstance(dataset);
} catch (Exception e) {
e.printStackTrace();
result = null;
}
}
} else {
if (m_BatchCounter < m_BatchBuffer.numInstances()) {
result = m_BatchBuffer.instance(m_BatchCounter);
m_BatchCounter++;
}
}
if (result != null) {
result.setDataset(dataset);
}
return result;
}
/**
* convenience method for loading a dataset in batch mode from a file or URL.
*
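* A minimal sketch (the file name is a placeholder; for an ARFF location
* the supplied loader is ignored in favour of the built-in ArffLoader):
* <pre>
* Instances data = DataSource.read("data.arff", new ArffLoader());
* </pre>
*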
* @param location the dataset to load
* @param loader the loader to use if the location is not an ARFF file
* @return the dataset
* @throws Exception if loading fails
* @see #DataSource(String)
*/
public static Instances read(String location, Loader loader) throws Exception {
DataSource source;
Instances result;
source = new DataSource(location, loader);
result = source.getDataSet();
return result;
}
/**
* convenience method for loading a dataset in batch mode from a stream.
*
* @param stream the stream to load the dataset from
* @return the dataset
* @throws Exception if loading fails
* @see #DataSource(InputStream)
*/
public static Instances read(InputStream stream) throws Exception {
DataSource source;
Instances result;
source = new DataSource(stream);
result = source.getDataSet();
return result;
}
/**
* convenience method for loading a dataset in batch mode, using the given loader.
*
* @param loader the loader to get the dataset from
* @return the dataset
* @throws Exception if loading fails
* @see #DataSource(Loader)
*/
public static Instances read(Loader loader) throws Exception {
DataSource source;
Instances result;
source = new DataSource(loader);
result = source.getDataSet();
return result;
}
/**
* for testing only - takes a data file as input.
*
* @param args the commandline arguments
* @throws Exception if something goes wrong
*/
public static void main(String[] args) throws Exception {
if (args.length != 1) {
System.out.println(
"\nUsage: " + DataSource.class.getName() + " <file>\n");
System.exit(1);
}
DataSource loader = new DataSource(args[0], new ArffLoader());
System.out.println("Incremental? " + loader.isIncremental());
System.out.println(
"Loader: " + loader.getLoader().getClass().getName());
System.out.println("Data:\n");
Instances structure = loader.getStructure();
System.out.println(structure);
while (loader.hasMoreElements(structure)) {
System.out.println(loader.nextElement(structure));
}
Instances inst = loader.getDataSet();
loader = new DataSource(inst);
System.out.println("\n\nProxy-Data:\n");
System.out.println(loader.getStructure());
while (loader.hasMoreElements(structure)) {
System.out.println(loader.nextElement(inst));
}
}
}
/** for serialization. */
static final long serialVersionUID = -2460855349276148760L;
/** all available loaders (extension <-> classname). */
protected static Hashtable<String, String> m_FileLoaders;
/** all available URL loaders (extension <-> classname). */
protected static Hashtable<String, String> m_URLFileLoaders;
/**
* Gets token, skipping empty lines.
*
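* A small sketch of the empty-line skipping (the input string is only an
* illustration):
* <pre>
* StreamTokenizer st = new StreamTokenizer(new java.io.StringReader("\n\nfoo bar"));
* st.eolIsSignificant(true);
* ConverterUtils.getFirstToken(st); // skips the leading EOLs, st.sval is "foo"
* </pre>
*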
* @param tokenizer the stream tokenizer
* @throws IOException if reading the next token fails
*/
public static void getFirstToken(StreamTokenizer tokenizer)
throws IOException {
while (tokenizer.nextToken() == StreamTokenizer.TT_EOL) {}
if ((tokenizer.ttype == '\'') || (tokenizer.ttype == '"')) {
tokenizer.ttype = StreamTokenizer.TT_WORD;
} else if ((tokenizer.ttype == StreamTokenizer.TT_WORD)
&& (tokenizer.sval.equals("?"))) {
tokenizer.ttype = '?';
}
}
/**
* Gets token.
*
* @param tokenizer the stream tokenizer
* @throws IOException if reading the next token fails
*/
public static void getToken(StreamTokenizer tokenizer) throws IOException {
tokenizer.nextToken();
if (tokenizer.ttype == StreamTokenizer.TT_EOL) {
return;
}
if ((tokenizer.ttype == '\'') || (tokenizer.ttype == '"')) {
tokenizer.ttype = StreamTokenizer.TT_WORD;
} else if ((tokenizer.ttype == StreamTokenizer.TT_WORD)
&& (tokenizer.sval.equals("?"))) {
tokenizer.ttype = '?';
}
}
/**
* Throws error message with line number and last token read.
*
* @param theMsg the error message to be thrown
* @param tokenizer the stream tokenizer
* @throws IOException containing the error message
*/
public static void errms(StreamTokenizer tokenizer, String theMsg)
throws IOException {
throw new IOException(theMsg + ", read " + tokenizer.toString());
}
/**
* returns a vector with the classnames of all the loaders from the
* given hashtable.
*
* @param ht the hashtable with the extension/converter relation
* @return the classnames of the loaders
*/
protected static Vector<String> getConverters(Hashtable<String, String> ht) {
Vector<String> result;
Enumeration<String> enm;
String converter;
result = new Vector<String>();
// get all classnames
enm = ht.elements();
while (enm.hasMoreElements()) {
converter = enm.nextElement();
if (!result.contains(converter)) {
result.add(converter);
}
}
// sort names
Collections.sort(result);
return result;
}
/**
* tries to determine the converter to use for this kind of file, returns
* null if none can be found in the given hashtable.
*
* @param filename the file to return a converter for
* @param ht the hashtable with the relation extension/converter
* @return the converter if one was found, null otherwise
*/
protected static Object getConverterForFile(String filename, Hashtable<String, String> ht) {
Object result;
String extension;
int index;
result = null;
index = filename.lastIndexOf('.');
if (index > -1) {
extension = filename.substring(index).toLowerCase();
result = getConverterForExtension(extension, ht);
// is it a compressed format?
if (extension.equals(".gz") && result == null) {
index = filename.lastIndexOf('.', index - 1);
if (index > -1) {
extension = filename.substring(index).toLowerCase();
result = getConverterForExtension(extension, ht);
}
}
}
return result;
}
/**
* tries to determine the loader to use for this kind of extension, returns
* null if none can be found.
*
* @param extension the file extension to return a converter for
* @param ht the hashtable with the relation extension/converter
* @return the converter if one was found, null otherwise
*/
protected static Object getConverterForExtension(String extension, Hashtable<String, String> ht) {
Object result;
String classname;
result = null;
classname = ht.get(extension);
if (classname != null) {
try {
result = Class.forName(classname).newInstance();
} catch (Exception e) {
result = null;
e.printStackTrace();
}
}
return result;
}
}