/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/*
* ArffLoader.java
* Copyright (C) 2000 University of Waikato, Hamilton, New Zealand
*
*/
package org.integratedmodelling.riskwiz.learning.data.loader;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StreamTokenizer;
import java.io.StringReader;
import java.net.URL;
import java.text.ParseException;
import java.util.zip.GZIPInputStream;
import org.integratedmodelling.riskwiz.learning.data.Attribute;
import org.integratedmodelling.riskwiz.learning.data.FastVector;
import org.integratedmodelling.riskwiz.learning.data.Instance;
import org.integratedmodelling.riskwiz.learning.data.Instances;
import org.integratedmodelling.riskwiz.learning.data.SparseInstance;
/**
<!-- globalinfo-start -->
* Reads a source that is in ARFF (Attribute-Relation File Format).
* <p/>
<!-- globalinfo-end -->
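* <p/>
* Typical batch usage (a minimal sketch, using the same example path as
* the ArffReader documentation below):
* <pre>
* ArffLoader loader = new ArffLoader();
* loader.setFile(new File("/some/where/file.arff"));
* Instances data = loader.getDataSet();
* data.setClassIndex(data.numAttributes() - 1);
* </pre>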
*
* @author Mark Hall (mhall@cs.waikato.ac.nz)
* @author FracPete (fracpete at waikato dot ac dot nz)
* @version $Revision: 1.19 $
* @see Loader
*/
public class ArffLoader extends AbstractFileLoader
implements BatchConverter, IncrementalConverter, URLSourcedLoader {
/** for serialization */
static final long serialVersionUID = 2726929550544048587L;
/** the file extension */
public static String FILE_EXTENSION = Instances.FILE_EXTENSION;
/** the extension for compressed files */
public static String FILE_EXTENSION_COMPRESSED = FILE_EXTENSION + ".gz";
/** the url */
protected String m_URL = "http://";
/** The reader for the source file. */
protected transient Reader m_sourceReader = null;
/** The parser for the ARFF file */
protected transient ArffReader m_ArffReader = null;
/**
* Reads data from an ARFF file, either in incremental or batch mode. <p/>
*
* Typical code for batch usage:
* <pre>
* BufferedReader reader = new BufferedReader(new FileReader("/some/where/file.arff"));
* ArffReader arff = new ArffReader(reader);
* Instances data = arff.getData();
* data.setClassIndex(data.numAttributes() - 1);
* </pre>
*
* Typical code for incremental usage:
* <pre>
* BufferedReader reader = new BufferedReader(new FileReader("/some/where/file.arff"));
* ArffReader arff = new ArffReader(reader, 1000);
* Instances data = arff.getStructure();
* data.setClassIndex(data.numAttributes() - 1);
* Instance inst;
* while ((inst = arff.readInstance(data)) != null) {
* data.add(inst);
* }
* </pre>
*
* @author Eibe Frank (eibe@cs.waikato.ac.nz)
* @author Len Trigg (trigg@cs.waikato.ac.nz)
* @author fracpete (fracpete at waikato dot ac dot nz)
* @version $Revision: 1.19 $
*/
public static class ArffReader {
/** the tokenizer for reading the stream */
protected StreamTokenizer m_Tokenizer;
/** Buffer of values for sparse instance */
protected double[] m_ValueBuffer;
/** Buffer of indices for sparse instance */
protected int[] m_IndicesBuffer;
/** the actual data */
protected Instances m_Data;
/** the number of lines read so far */
protected int m_Lines;
/**
* Reads the data completely from the reader. The data can be accessed
* via the <code>getData()</code> method.
*
* @param reader the reader to use
* @throws IOException if something goes wrong
* @see #getData()
*/
public ArffReader(Reader reader) throws IOException {
m_Tokenizer = new StreamTokenizer(reader);
initTokenizer();
readHeader(1000);
initBuffers();
Instance inst;
while ((inst = readInstance(m_Data)) != null) {
m_Data.add(inst);
}
compactify();
}
/**
* Reads only the header and reserves the specified space for instances.
* Further instances can be read via <code>readInstance()</code>.
*
* @param reader the reader to use
* @param capacity the capacity of the new dataset
* @throws IOException if something goes wrong
* @throws IllegalArgumentException if capacity is negative
* @see #getStructure()
* @see #readInstance(Instances)
*/
public ArffReader(Reader reader, int capacity) throws IOException {
if (capacity < 0) {
throw new IllegalArgumentException(
"Capacity must not be negative!");
}
m_Tokenizer = new StreamTokenizer(reader);
initTokenizer();
readHeader(capacity);
initBuffers();
}
/**
* Reads the data without header according to the specified template.
* The data can be accessed via the <code>getData()</code> method.
*
* @param reader the reader to use
* @param template the template header
* @param lines the lines read so far
* @throws IOException if something goes wrong
* @see #getData()
*/
public ArffReader(Reader reader, Instances template, int lines) throws IOException {
this(reader, template, lines, 100);
Instance inst;
while ((inst = readInstance(m_Data)) != null) {
m_Data.add(inst);
}
compactify();
}
/**
* Initializes the reader without reading the header according to the
* specified template. The data must be read via the
* <code>readInstance()</code> method.
*
* @param reader the reader to use
* @param template the template header
* @param lines the lines read so far
* @param capacity the capacity of the new dataset
* @throws IOException if something goes wrong
* @see #getData()
*/
public ArffReader(Reader reader, Instances template, int lines, int capacity) throws IOException {
m_Lines = lines;
m_Tokenizer = new StreamTokenizer(reader);
initTokenizer();
m_Data = new Instances(template, capacity);
initBuffers();
}
/**
* initializes the buffers for sparse instances to be read
*
* @see #m_ValueBuffer
* @see #m_IndicesBuffer
*/
protected void initBuffers() {
m_ValueBuffer = new double[m_Data.numAttributes()];
m_IndicesBuffer = new int[m_Data.numAttributes()];
}
/**
* compactifies the data
*/
protected void compactify() {
if (m_Data != null) {
m_Data.compactify();
}
}
/**
* Throws an IOException containing the given message, the current
* line number, and the last token read.
*
* @param msg the error message to be thrown
* @throws IOException containing the error message
*/
protected void errorMessage(String msg) throws IOException {
String str = msg + ", read " + m_Tokenizer.toString();
if (m_Lines > 0) {
// The tokenizer's toString() ends in "... line <n>"; extract that
// relative line number and offset it by the lines read previously.
int line = Integer.parseInt(str.replaceAll(".* line ", ""));
str = str.replaceAll(" line .*", " line " + (m_Lines + line - 1));
}
throw new IOException(str);
}
/**
* returns the current line number
*
* @return the current line number
*/
public int getLineNo() {
return m_Lines + m_Tokenizer.lineno();
}
/**
* Gets next token, skipping empty lines.
*
* @throws IOException if reading the next token fails
*/
protected void getFirstToken() throws IOException {
while (m_Tokenizer.nextToken() == StreamTokenizer.TT_EOL) {}
if ((m_Tokenizer.ttype == '\'') || (m_Tokenizer.ttype == '"')) {
m_Tokenizer.ttype = StreamTokenizer.TT_WORD;
} else if ((m_Tokenizer.ttype == StreamTokenizer.TT_WORD)
&& (m_Tokenizer.sval.equals("?"))) {
m_Tokenizer.ttype = '?';
}
}
/**
* Gets the index of a sparse value, checking for a premature end of line.
*
* @throws IOException if it finds a premature end of line
*/
protected void getIndex() throws IOException {
if (m_Tokenizer.nextToken() == StreamTokenizer.TT_EOL) {
errorMessage("premature end of line");
}
if (m_Tokenizer.ttype == StreamTokenizer.TT_EOF) {
errorMessage("premature end of file");
}
}
/**
* Gets the next token and checks whether it is an end of line.
*
* @param endOfFileOk whether EOF is OK
* @throws IOException if it doesn't find an end of line
*/
protected void getLastToken(boolean endOfFileOk) throws IOException {
if ((m_Tokenizer.nextToken() != StreamTokenizer.TT_EOL)
&& ((m_Tokenizer.ttype != StreamTokenizer.TT_EOF)
|| !endOfFileOk)) {
errorMessage("end of line expected");
}
}
/**
* Gets next token, checking for a premature end of line.
*
* @throws IOException if it finds a premature end of line
*/
protected void getNextToken() throws IOException {
if (m_Tokenizer.nextToken() == StreamTokenizer.TT_EOL) {
errorMessage("premature end of line");
}
if (m_Tokenizer.ttype == StreamTokenizer.TT_EOF) {
errorMessage("premature end of file");
} else if ((m_Tokenizer.ttype == '\'') || (m_Tokenizer.ttype == '"')) {
m_Tokenizer.ttype = StreamTokenizer.TT_WORD;
} else if ((m_Tokenizer.ttype == StreamTokenizer.TT_WORD)
&& (m_Tokenizer.sval.equals("?"))) {
m_Tokenizer.ttype = '?';
}
}
/**
* Initializes the StreamTokenizer used for reading the ARFF file.
*/
protected void initTokenizer() {
m_Tokenizer.resetSyntax();
// Control characters and spaces are whitespace
m_Tokenizer.whitespaceChars(0, ' ');
// Any printable character may form part of a word
m_Tokenizer.wordChars(' ' + 1, '\u00FF');
// Commas separate values, so treat them as whitespace too
m_Tokenizer.whitespaceChars(',', ',');
// '%' starts a comment that runs to the end of the line
m_Tokenizer.commentChar('%');
// Values may be quoted with double or single quotes
m_Tokenizer.quoteChar('"');
m_Tokenizer.quoteChar('\'');
// Braces delimit sparse instances and nominal enumerations
m_Tokenizer.ordinaryChar('{');
m_Tokenizer.ordinaryChar('}');
// Instances and declarations are terminated by end-of-line tokens
m_Tokenizer.eolIsSignificant(true);
}
/**
* Reads a single instance using the tokenizer and returns it.
*
* @param structure the dataset header information, will get updated
* in case of string or relational attributes
* @return the instance read, or null if end of file has been reached
* @throws IOException if the information is not read
* successfully
*/
public Instance readInstance(Instances structure) throws IOException {
return readInstance(structure, true);
}
/**
* Reads a single instance using the tokenizer and returns it.
*
* @param structure the dataset header information, will get updated
* in case of string or relational attributes
* @param flag whether to check for an end of line after
* reading the instance
* @return the instance read, or null if end of file has been reached
* @throws IOException if the information is not read
* successfully
*/
public Instance readInstance(Instances structure, boolean flag) throws IOException {
return getInstance(structure, flag);
}
/**
* Reads a single instance using the tokenizer and returns it.
*
* @param structure the dataset header information, will get updated
* in case of string or relational attributes
* @param flag whether to check for an end of line after
* reading the instance
* @return the instance read, or null if end of file has been reached
* @throws IOException if the information is not read
* successfully
*/
protected Instance getInstance(Instances structure, boolean flag) throws IOException {
m_Data = structure;
// Check if any attributes have been declared.
if (m_Data.numAttributes() == 0) {
errorMessage("no header information available");
}
// Check if end of file reached.
getFirstToken();
if (m_Tokenizer.ttype == StreamTokenizer.TT_EOF) {
return null;
}
// Parse instance
if (m_Tokenizer.ttype == '{') {
return getInstanceSparse(flag);
} else {
return getInstanceFull(flag);
}
}
/**
* Reads a single instance using the tokenizer and returns it.
*
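* A sparse instance lists only the attributes with non-zero values, as
* space-separated index/value pairs inside braces. An illustrative data
* line in the standard sparse ARFF notation:
* <pre>
* {1 X, 3 Y, 4 "class A"}
* </pre>
*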
* @param flag whether to check for an end of line after
* reading the instance
* @return the instance read, or null if end of file has been reached
* @throws IOException if the information is not read
* successfully
*/
protected Instance getInstanceSparse(boolean flag) throws IOException {
int valIndex, numValues = 0, maxIndex = -1;
// Get values
do {
// Get index
getIndex();
if (m_Tokenizer.ttype == '}') {
break;
}
// Is index valid?
try {
m_IndicesBuffer[numValues] = Integer.valueOf(m_Tokenizer.sval).intValue();
} catch (NumberFormatException e) {
errorMessage("index number expected");
}
if (m_IndicesBuffer[numValues] <= maxIndex) {
errorMessage("indices have to be ordered");
}
if ((m_IndicesBuffer[numValues] < 0)
|| (m_IndicesBuffer[numValues] >= m_Data.numAttributes())) {
errorMessage("index out of bounds");
}
maxIndex = m_IndicesBuffer[numValues];
// Get value;
getNextToken();
// Check if value is missing.
if (m_Tokenizer.ttype == '?') {
m_ValueBuffer[numValues] = Instance.missingValue();
} else {
// Check if token is valid.
if (m_Tokenizer.ttype != StreamTokenizer.TT_WORD) {
errorMessage("not a valid value");
}
switch (m_Data.attribute(m_IndicesBuffer[numValues]).type()) {
case Attribute.NOMINAL:
// Check if value appears in header.
valIndex = m_Data.attribute(m_IndicesBuffer[numValues]).indexOfValue(
m_Tokenizer.sval);
if (valIndex == -1) {
errorMessage("nominal value not declared in header");
}
m_ValueBuffer[numValues] = valIndex;
break;
case Attribute.NUMERIC:
// Check if value is really a number.
try {
m_ValueBuffer[numValues] = Double.valueOf(m_Tokenizer.sval).doubleValue();
} catch (NumberFormatException e) {
errorMessage("number expected");
}
break;
case Attribute.STRING:
m_ValueBuffer[numValues] = m_Data.attribute(m_IndicesBuffer[numValues]).addStringValue(
m_Tokenizer.sval);
break;
case Attribute.DATE:
try {
m_ValueBuffer[numValues] = m_Data.attribute(m_IndicesBuffer[numValues]).parseDate(
m_Tokenizer.sval);
} catch (ParseException e) {
errorMessage("unparseable date: " + m_Tokenizer.sval);
}
break;
case Attribute.RELATIONAL:
try {
ArffReader arff = new ArffReader(
new StringReader(m_Tokenizer.sval),
m_Data.attribute(m_IndicesBuffer[numValues]).relation(),
0);
Instances data = arff.getData();
m_ValueBuffer[numValues] = m_Data.attribute(m_IndicesBuffer[numValues]).addRelation(
data);
} catch (Exception e) {
throw new IOException(
e.toString() + " of line " + getLineNo());
}
break;
default:
errorMessage(
"unknown attribute type in column "
+ m_IndicesBuffer[numValues]);
}
}
numValues++;
} while (true);
if (flag) {
getLastToken(true);
}
// Add instance to dataset
double[] tempValues = new double[numValues];
int[] tempIndices = new int[numValues];
System.arraycopy(m_ValueBuffer, 0, tempValues, 0, numValues);
System.arraycopy(m_IndicesBuffer, 0, tempIndices, 0, numValues);
Instance inst = new SparseInstance(1, tempValues, tempIndices,
m_Data.numAttributes());
inst.setDataset(m_Data);
return inst;
}
/**
* Reads a single instance using the tokenizer and returns it.
*
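* A full (non-sparse) instance supplies one comma-separated value per
* attribute, in declaration order. An illustrative data line:
* <pre>
* sunny, 85, 85.0, FALSE, no
* </pre>
*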
* @param flag whether to check for an end of line after
* reading the instance
* @return the instance read, or null if end of file has been reached
* @throws IOException if the information is not read
* successfully
*/
protected Instance getInstanceFull(boolean flag) throws IOException {
double[] instance = new double[m_Data.numAttributes()];
int index;
// Get values for all attributes.
for (int i = 0; i < m_Data.numAttributes(); i++) {
// Get next token
if (i > 0) {
getNextToken();
}
// Check if value is missing.
if (m_Tokenizer.ttype == '?') {
instance[i] = Instance.missingValue();
} else {
// Check if token is valid.
if (m_Tokenizer.ttype != StreamTokenizer.TT_WORD) {
errorMessage("not a valid value");
}
switch (m_Data.attribute(i).type()) {
case Attribute.NOMINAL:
// Check if value appears in header.
index = m_Data.attribute(i).indexOfValue(
m_Tokenizer.sval);
if (index == -1) {
errorMessage("nominal value not declared in header");
}
instance[i] = index;
break;
case Attribute.NUMERIC:
// Check if value is really a number.
try {
instance[i] = Double.valueOf(m_Tokenizer.sval).doubleValue();
} catch (NumberFormatException e) {
errorMessage("number expected");
}
break;
case Attribute.STRING:
instance[i] = m_Data.attribute(i).addStringValue(
m_Tokenizer.sval);
break;
case Attribute.DATE:
try {
instance[i] = m_Data.attribute(i).parseDate(
m_Tokenizer.sval);
} catch (ParseException e) {
errorMessage("unparseable date: " + m_Tokenizer.sval);
}
break;
case Attribute.RELATIONAL:
try {
ArffReader arff = new ArffReader(
new StringReader(m_Tokenizer.sval),
m_Data.attribute(i).relation(), 0);
Instances data = arff.getData();
instance[i] = m_Data.attribute(i).addRelation(data);
} catch (Exception e) {
throw new IOException(
e.toString() + " of line " + getLineNo());
}
break;
default:
errorMessage("unknown attribute type in column " + i);
}
}
}
if (flag) {
getLastToken(true);
}
// Add instance to dataset
Instance inst = new Instance(1, instance);
inst.setDataset(m_Data);
return inst;
}
/**
* Reads and stores header of an ARFF file.
*
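* A minimal ARFF header consists of a relation name, one or more
* attribute declarations, and the data marker, for example:
* <pre>
* &#64;relation weather
* &#64;attribute outlook {sunny, overcast, rainy}
* &#64;attribute temperature numeric
* &#64;data
* </pre>
*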
* @param capacity the number of instances to reserve in the data
* structure
* @throws IOException if the information is not read
* successfully
*/
protected void readHeader(int capacity) throws IOException {
m_Lines = 0;
String relationName = "";
// Get name of relation.
getFirstToken();
if (m_Tokenizer.ttype == StreamTokenizer.TT_EOF) {
errorMessage("premature end of file");
}
if (Instances.ARFF_RELATION.equalsIgnoreCase(m_Tokenizer.sval)) {
getNextToken();
relationName = m_Tokenizer.sval;
getLastToken(false);
} else {
errorMessage("keyword " + Instances.ARFF_RELATION + " expected");
}
// Create vectors to hold information temporarily.
FastVector attributes = new FastVector();
// Get attribute declarations.
getFirstToken();
if (m_Tokenizer.ttype == StreamTokenizer.TT_EOF) {
errorMessage("premature end of file");
}
while (Attribute.ARFF_ATTRIBUTE.equalsIgnoreCase(m_Tokenizer.sval)) {
attributes = parseAttribute(attributes);
}
// Check if data part follows. We can't easily check for EOL.
if (!Instances.ARFF_DATA.equalsIgnoreCase(m_Tokenizer.sval)) {
errorMessage("keyword " + Instances.ARFF_DATA + " expected");
}
// Check if any attributes have been declared.
if (attributes.size() == 0) {
errorMessage("no attributes declared");
}
m_Data = new Instances(relationName, attributes, capacity);
}
/**
* Parses the attribute declaration.
*
* @param attributes the current attributes vector
* @return the new attributes vector
* @throws IOException if the information is not read
* successfully
*/
protected FastVector parseAttribute(FastVector attributes) throws IOException {
String attributeName;
FastVector attributeValues;
// Get attribute name.
getNextToken();
attributeName = m_Tokenizer.sval;
getNextToken();
// Check if attribute is nominal.
if (m_Tokenizer.ttype == StreamTokenizer.TT_WORD) {
// Attribute is real, integer, string, date, or relational.
if (m_Tokenizer.sval.equalsIgnoreCase(
Attribute.ARFF_ATTRIBUTE_REAL)
|| m_Tokenizer.sval.equalsIgnoreCase(
Attribute.ARFF_ATTRIBUTE_INTEGER)
|| m_Tokenizer.sval.equalsIgnoreCase(
Attribute.ARFF_ATTRIBUTE_NUMERIC)) {
attributes.addElement(
new Attribute(attributeName, attributes.size()));
readTillEOL();
} else if (m_Tokenizer.sval.equalsIgnoreCase(
Attribute.ARFF_ATTRIBUTE_STRING)) {
attributes.addElement(
new Attribute(attributeName, (FastVector) null,
attributes.size()));
readTillEOL();
} else if (m_Tokenizer.sval.equalsIgnoreCase(
Attribute.ARFF_ATTRIBUTE_DATE)) {
String format = null;
if (m_Tokenizer.nextToken() != StreamTokenizer.TT_EOL) {
if ((m_Tokenizer.ttype != StreamTokenizer.TT_WORD)
&& (m_Tokenizer.ttype != '\'')
&& (m_Tokenizer.ttype != '\"')) {
errorMessage("not a valid date format");
}
format = m_Tokenizer.sval;
readTillEOL();
} else {
m_Tokenizer.pushBack();
}
attributes.addElement(
new Attribute(attributeName, format,
attributes.size()));
} else if (m_Tokenizer.sval.equalsIgnoreCase(
Attribute.ARFF_ATTRIBUTE_RELATIONAL)) {
readTillEOL();
// Read attributes for subrelation
// First, save current set of attributes
FastVector atts = attributes;
attributes = new FastVector();
// Now, read attributes until we hit end of declaration of relational value
getFirstToken();
if (m_Tokenizer.ttype == StreamTokenizer.TT_EOF) {
errorMessage("premature end of file");
}
do {
if (Attribute.ARFF_ATTRIBUTE.equalsIgnoreCase(
m_Tokenizer.sval)) {
attributes = parseAttribute(attributes);
} else if (Attribute.ARFF_END_SUBRELATION.equalsIgnoreCase(
m_Tokenizer.sval)) {
getNextToken();
if (!attributeName.equalsIgnoreCase(m_Tokenizer.sval)) {
errorMessage(
"declaration of subrelation "
+ attributeName
+ " must be terminated by "
+ "@end " + attributeName);
}
break;
} else {
errorMessage(
"declaration of subrelation "
+ attributeName
+ " must be terminated by "
+ "@end " + attributeName);
}
} while (true);
// Make relation and restore original set of attributes
Instances relation = new Instances(attributeName, attributes,
0);
attributes = atts;
attributes.addElement(
new Attribute(attributeName, relation,
attributes.size()));
} else {
errorMessage(
"no valid attribute type or invalid "
+ "enumeration");
}
} else {
// Attribute is nominal.
attributeValues = new FastVector();
m_Tokenizer.pushBack();
// Get values for nominal attribute.
if (m_Tokenizer.nextToken() != '{') {
errorMessage("{ expected at beginning of enumeration");
}
while (m_Tokenizer.nextToken() != '}') {
if (m_Tokenizer.ttype == StreamTokenizer.TT_EOL) {
errorMessage("} expected at end of enumeration");
} else {
attributeValues.addElement(m_Tokenizer.sval);
}
}
attributes.addElement(
new Attribute(attributeName, attributeValues,
attributes.size()));
}
getLastToken(false);
getFirstToken();
if (m_Tokenizer.ttype == StreamTokenizer.TT_EOF) {
errorMessage("premature end of file");
}
return attributes;
}
/**
* Reads and skips all tokens before next end of line token.
*
* @throws IOException in case something goes wrong
*/
protected void readTillEOL() throws IOException {
while (m_Tokenizer.nextToken() != StreamTokenizer.TT_EOL) {}
m_Tokenizer.pushBack();
}
/**
* Returns the header format
*
* @return the header format
*/
public Instances getStructure() {
return new Instances(m_Data, 0);
}
/**
* Returns the data that was read
*
* @return the data
*/
public Instances getData() {
return m_Data;
}
}
/**
* Returns a string describing this Loader
* @return a description of the Loader suitable for
* displaying in the explorer/experimenter gui
*/
public String globalInfo() {
return "Reads a source that is in arff (attribute relation file format) "
+ "format. ";
}
/**
* Get the file extension used for arff files
*
* @return the file extension
*/
@Override
public String getFileExtension() {
return FILE_EXTENSION;
}
/**
* Gets all the file extensions used for this type of file
*
* @return the file extensions
*/
@Override
public String[] getFileExtensions() {
return new String[] { FILE_EXTENSION, FILE_EXTENSION_COMPRESSED};
}
/**
* Returns a description of the file type.
*
* @return a short file description
*/
@Override
public String getFileDescription() {
return "Arff data files";
}
/**
* Resets the Loader ready to read a new data set
*
* @throws IOException if something goes wrong
*/
@Override
public void reset() throws IOException {
m_structure = null;
setRetrieval(NONE);
if (m_File != null && (new File(m_File)).isFile()) {
setFile(new File(m_File));
} else if (m_URL != null && !m_URL.equals("http://")) {
setURL(m_URL);
}
}
/**
* Resets the Loader object and sets the source of the data set to be
* the supplied url.
*
* @param url the source url.
* @throws IOException if an error occurs
*/
public void setSource(URL url) throws IOException {
m_structure = null;
setRetrieval(NONE);
setSource(url.openStream());
m_URL = url.toString();
}
/**
* get the File specified as the source
*
* @return the source file
*/
@Override
public File retrieveFile() {
return new File(m_File);
}
/**
* sets the source File
*
* @param file the source file
* @throws IOException if an error occurs
*/
@Override
public void setFile(File file) throws IOException {
m_File = file.getAbsolutePath();
setSource(file);
}
/**
* Resets the Loader object and sets the source of the data set to be
* the supplied File object. Files ending in ".arff.gz" are decompressed
* on the fly.
*
* @param file the source file.
* @throws IOException if an error occurs
*/
@Override
public void setSource(File file) throws IOException {
m_structure = null;
setRetrieval(NONE);
if (file == null) {
throw new IOException("Source file object is null!");
}
try {
if (file.getName().endsWith(FILE_EXTENSION_COMPRESSED)) {
setSource(new GZIPInputStream(new FileInputStream(file)));
} else {
setSource(new FileInputStream(file));
}
} catch (FileNotFoundException ex) {
throw new IOException("File not found");
}
m_sourceFile = file;
m_File = file.getAbsolutePath();
}
/**
* Set the url to load from
*
* @param url the url to load from
* @throws IOException if the url can't be set.
*/
@Override
public void setURL(String url) throws IOException {
m_URL = url;
setSource(new URL(url));
}
/**
* Return the current url
*
* @return the current url
*/
@Override
public String retrieveURL() {
return m_URL;
}
/**
* Resets the Loader object and sets the source of the data set to be
* the supplied InputStream.
*
* @param in the source InputStream.
* @throws IOException if an error occurs
*/
@Override
public void setSource(InputStream in) throws IOException {
m_File = (new File(System.getProperty("user.dir"))).getAbsolutePath();
m_URL = "http://";
m_sourceReader = new BufferedReader(new InputStreamReader(in));
}
/**
* Determines and returns (if possible) the structure (internally the
* header) of the data set as an empty set of instances.
*
* @return the structure of the data set as an empty set of Instances
* @throws IOException if an error occurs
*/
@Override
public Instances getStructure() throws IOException {
if (m_sourceReader == null) {
throw new IOException("No source has been specified");
}
if (m_structure == null) {
try {
m_ArffReader = new ArffReader(m_sourceReader, 1);
m_structure = m_ArffReader.getStructure();
} catch (Exception ex) {
throw new IOException(
"Unable to determine structure as arff (Reason: "
+ ex.toString() + ").");
}
}
return new Instances(m_structure, 0);
}
/**
* Return the full data set. If the structure hasn't yet been determined
* by a call to getStructure, this method determines it before processing
* the rest of the data set.
*
* @return the full data set as a set of Instances
* @throws IOException if there is no source or parsing fails
*/
@Override
public Instances getDataSet() throws IOException {
if (m_sourceReader == null) {
throw new IOException("No source has been specified");
}
if (getRetrieval() == INCREMENTAL) {
throw new IOException(
"Cannot mix getting Instances in both incremental and batch modes");
}
setRetrieval(BATCH);
if (m_structure == null) {
getStructure();
}
// Read all instances
Instance inst;
while ((inst = m_ArffReader.readInstance(m_structure)) != null) {
m_structure.add(inst);
}
Instances readIn = new Instances(m_structure);
return readIn;
}
/**
* Read the data set incrementally: returns the next instance in the
* data set, or null if there are no more instances to get. If the
* structure hasn't yet been determined by a call to getStructure, this
* method should do so before returning the next instance.
*
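* A minimal incremental-usage sketch (using the same example path as
* the ArffReader documentation above):
* <pre>
* ArffLoader loader = new ArffLoader();
* loader.setFile(new File("/some/where/file.arff"));
* Instances structure = loader.getStructure();
* Instance inst;
* while ((inst = loader.getNextInstance(structure)) != null) {
*     structure.add(inst);
* }
* </pre>
*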
* @param structure the dataset header information, will get updated in
* case of string or relational attributes
* @return the next instance in the data set as an Instance object or null
* if there are no more instances to be read
* @throws IOException if there is an error during parsing
*/
@Override
public Instance getNextInstance(Instances structure) throws IOException {
m_structure = structure;
if (getRetrieval() == BATCH) {
throw new IOException(
"Cannot mix getting Instances in both incremental and batch modes");
}
setRetrieval(INCREMENTAL);
Instance current = m_ArffReader.readInstance(m_structure);
if (current == null) {
try {
reset();
} catch (Exception ex) {
ex.printStackTrace();
}
}
return current;
}
/**
* Main method.
*
* @param args should contain the name of an input file.
*/
public static void main(String[] args) {
runFileLoader(new ArffLoader(), args);
}
}