/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/*
* ArffLoader.java
* Copyright (C) 2000 University of Waikato, Hamilton, New Zealand
*
*/
package weka.core.converters;
import weka.core.Attribute;
import weka.core.Instance;
import weka.core.DenseInstance;
import weka.core.Instances;
import weka.core.RevisionHandler;
import weka.core.RevisionUtils;
import weka.core.SparseInstance;
import weka.core.Utils;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StreamTokenizer;
import java.io.StringReader;
import java.net.URL;
import java.text.ParseException;
import java.util.ArrayList;
/**
<!-- globalinfo-start -->
* Reads a source that is in arff (attribute relation file format) format.
* <p/>
<!-- globalinfo-end -->
*
* @author Mark Hall (mhall@cs.waikato.ac.nz)
* @author FracPete (fracpete at waikato dot ac dot nz)
* @version $Revision: 7048 $
* @see Loader
*/
public class ArffLoader
extends AbstractFileLoader
implements BatchConverter, IncrementalConverter, URLSourcedLoader {
/** for serialization */
static final long serialVersionUID = 2726929550544048587L;
/** the file extension */
public static String FILE_EXTENSION = Instances.FILE_EXTENSION;
public static String FILE_EXTENSION_COMPRESSED = FILE_EXTENSION + ".gz";
/** the url */
protected String m_URL = "http://";
/** The reader for the source file. */
protected transient Reader m_sourceReader = null;
/** The parser for the ARFF file */
protected transient ArffReader m_ArffReader = null;
/**
* Reads data from an ARFF file, either in incremental or batch mode. <p/>
*
* Typical code for batch usage:
* <pre>
* BufferedReader reader = new BufferedReader(new FileReader("/some/where/file.arff"));
* ArffReader arff = new ArffReader(reader);
* Instances data = arff.getData();
* data.setClassIndex(data.numAttributes() - 1);
* </pre>
*
* Typical code for incremental usage:
* <pre>
* BufferedReader reader = new BufferedReader(new FileReader("/some/where/file.arff"));
* ArffReader arff = new ArffReader(reader, 1000);
* Instances data = arff.getStructure();
* data.setClassIndex(data.numAttributes() - 1);
* Instance inst;
* while ((inst = arff.readInstance(data)) != null) {
* data.add(inst);
* }
* </pre>
*
* @author Eibe Frank (eibe@cs.waikato.ac.nz)
* @author Len Trigg (trigg@cs.waikato.ac.nz)
* @author fracpete (fracpete at waikato dot ac dot nz)
* @version $Revision: 7048 $
*/
public static class ArffReader
implements RevisionHandler {
/** the tokenizer for reading the stream */
protected StreamTokenizer m_Tokenizer;
/** Buffer of values for sparse instance */
protected double[] m_ValueBuffer;
/** Buffer of indices for sparse instance */
protected int[] m_IndicesBuffer;
/** the actual data */
protected Instances m_Data;
/** the number of lines read so far */
protected int m_Lines;
/**
* Reads the data completely from the reader. The data can be accessed
* via the <code>getData()</code> method.
*
* @param reader the reader to use
* @throws IOException if something goes wrong
* @see #getData()
*/
public ArffReader(Reader reader) throws IOException {
m_Tokenizer = new StreamTokenizer(reader);
initTokenizer();
readHeader(1000);
initBuffers();
Instance inst;
while ((inst = readInstance(m_Data)) != null) {
m_Data.add(inst);
};
compactify();
}
/**
* Reads only the header and reserves the specified space for instances.
* Further instances can be read via <code>readInstance()</code>.
*
* @param reader the reader to use
* @param capacity the capacity of the new dataset
* @throws IOException if something goes wrong
* @throws IllegalArgumentException if capacity is negative
* @see #getStructure()
* @see #readInstance(Instances)
*/
public ArffReader(Reader reader, int capacity) throws IOException {
if (capacity < 0)
throw new IllegalArgumentException("Capacity has to be positive!");
m_Tokenizer = new StreamTokenizer(reader);
initTokenizer();
readHeader(capacity);
initBuffers();
}
/**
* Reads the data without header according to the specified template.
* The data can be accessed via the <code>getData()</code> method.
*
* @param reader the reader to use
* @param template the template header
* @param lines the lines read so far
* @throws IOException if something goes wrong
* @see #getData()
*/
public ArffReader(Reader reader, Instances template, int lines) throws IOException {
this(reader, template, lines, 100);
Instance inst;
while ((inst = readInstance(m_Data)) != null) {
m_Data.add(inst);
};
compactify();
}
/**
* Initializes the reader without reading the header according to the
* specified template. The data must be read via the
* <code>readInstance()</code> method.
*
* @param reader the reader to use
* @param template the template header
* @param lines the lines read so far
* @param capacity the capacity of the new dataset
* @throws IOException if something goes wrong
* @see #getData()
*/
public ArffReader(Reader reader, Instances template, int lines, int capacity) throws IOException {
m_Lines = lines;
m_Tokenizer = new StreamTokenizer(reader);
initTokenizer();
m_Data = new Instances(template, capacity);
initBuffers();
}
/**
* initializes the buffers for sparse instances to be read
*
* @see #m_ValueBuffer
* @see #m_IndicesBuffer
*/
protected void initBuffers() {
m_ValueBuffer = new double[m_Data.numAttributes()];
m_IndicesBuffer = new int[m_Data.numAttributes()];
}
/**
* compactifies the data
*/
protected void compactify() {
if (m_Data != null)
m_Data.compactify();
}
/**
* Throws error message with line number and last token read.
*
* @param msg the error message to be thrown
* @throws IOException containing the error message
*/
protected void errorMessage(String msg) throws IOException {
String str = msg + ", read " + m_Tokenizer.toString();
if (m_Lines > 0) {
int line = Integer.parseInt(str.replaceAll(".* line ", ""));
str = str.replaceAll(" line .*", " line " + (m_Lines + line - 1));
}
throw new IOException(str);
}
/**
* returns the current line number
*
* @return the current line number
*/
public int getLineNo() {
return m_Lines + m_Tokenizer.lineno();
}
/**
* Gets next token, skipping empty lines.
*
* @throws IOException if reading the next token fails
*/
protected void getFirstToken() throws IOException {
while (m_Tokenizer.nextToken() == StreamTokenizer.TT_EOL) {};
if ((m_Tokenizer.ttype == '\'') ||
(m_Tokenizer.ttype == '"')) {
m_Tokenizer.ttype = StreamTokenizer.TT_WORD;
} else if ((m_Tokenizer.ttype == StreamTokenizer.TT_WORD) &&
(m_Tokenizer.sval.equals("?"))){
m_Tokenizer.ttype = '?';
}
}
/**
* Gets index, checking for a premature and of line.
*
* @throws IOException if it finds a premature end of line
*/
protected void getIndex() throws IOException {
if (m_Tokenizer.nextToken() == StreamTokenizer.TT_EOL) {
errorMessage("premature end of line");
}
if (m_Tokenizer.ttype == StreamTokenizer.TT_EOF) {
errorMessage("premature end of file");
}
}
/**
* Gets token and checks if its end of line.
*
* @param endOfFileOk whether EOF is OK
* @throws IOException if it doesn't find an end of line
*/
protected void getLastToken(boolean endOfFileOk) throws IOException {
if ((m_Tokenizer.nextToken() != StreamTokenizer.TT_EOL) &&
((m_Tokenizer.ttype != StreamTokenizer.TT_EOF) || !endOfFileOk)) {
errorMessage("end of line expected");
}
}
/**
* Gets the value of an instance's weight (if one exists)
*
* @return the value of the instance's weight, or NaN
* if no weight has been supplied in the file
*/
protected double getInstanceWeight() throws IOException {
double weight = Double.NaN;
m_Tokenizer.nextToken();
if (m_Tokenizer.ttype == StreamTokenizer.TT_EOL ||
m_Tokenizer.ttype == StreamTokenizer.TT_EOF) {
return weight;
}
// see if we can read an instance weight
// m_Tokenizer.pushBack();
if (m_Tokenizer.ttype == '{') {
m_Tokenizer.nextToken();
String weightS = m_Tokenizer.sval;
// try to parse weight as a double
try {
weight = Double.parseDouble(weightS);
} catch (NumberFormatException e) {
// quietly ignore
return weight;
}
// see if we have the closing brace
m_Tokenizer.nextToken();
if (m_Tokenizer.ttype != '}') {
errorMessage("Problem reading instance weight");
}
}
return weight;
}
/**
* Gets next token, checking for a premature and of line.
*
* @throws IOException if it finds a premature end of line
*/
protected void getNextToken() throws IOException {
if (m_Tokenizer.nextToken() == StreamTokenizer.TT_EOL) {
errorMessage("premature end of line");
}
if (m_Tokenizer.ttype == StreamTokenizer.TT_EOF) {
errorMessage("premature end of file");
} else if ((m_Tokenizer.ttype == '\'') ||
(m_Tokenizer.ttype == '"')) {
m_Tokenizer.ttype = StreamTokenizer.TT_WORD;
} else if ((m_Tokenizer.ttype == StreamTokenizer.TT_WORD) &&
(m_Tokenizer.sval.equals("?"))){
m_Tokenizer.ttype = '?';
}
}
/**
* Initializes the StreamTokenizer used for reading the ARFF file.
*/
protected void initTokenizer(){
m_Tokenizer.resetSyntax();
m_Tokenizer.whitespaceChars(0, ' ');
m_Tokenizer.wordChars(' '+1,'\u00FF');
m_Tokenizer.whitespaceChars(',',',');
m_Tokenizer.commentChar('%');
m_Tokenizer.quoteChar('"');
m_Tokenizer.quoteChar('\'');
m_Tokenizer.ordinaryChar('{');
m_Tokenizer.ordinaryChar('}');
m_Tokenizer.eolIsSignificant(true);
}
/**
* Reads a single instance using the tokenizer and returns it.
*
* @param structure the dataset header information, will get updated
* in case of string or relational attributes
* @return null if end of file has been reached
* @throws IOException if the information is not read
* successfully
*/
public Instance readInstance(Instances structure) throws IOException {
return readInstance(structure, true);
}
/**
* Reads a single instance using the tokenizer and returns it.
*
* @param structure the dataset header information, will get updated
* in case of string or relational attributes
* @param flag if method should test for carriage return after
* each instance
* @return null if end of file has been reached
* @throws IOException if the information is not read
* successfully
*/
public Instance readInstance(Instances structure, boolean flag) throws IOException {
return getInstance(structure, flag);
}
/**
* Reads a single instance using the tokenizer and returns it.
*
* @param structure the dataset header information, will get updated
* in case of string or relational attributes
* @param flag if method should test for carriage return after
* each instance
* @return null if end of file has been reached
* @throws IOException if the information is not read
* successfully
*/
protected Instance getInstance(Instances structure, boolean flag) throws IOException {
m_Data = structure;
// Check if any attributes have been declared.
if (m_Data.numAttributes() == 0) {
errorMessage("no header information available");
}
// Check if end of file reached.
getFirstToken();
if (m_Tokenizer.ttype == StreamTokenizer.TT_EOF) {
return null;
}
// Parse instance
if (m_Tokenizer.ttype == '{') {
return getInstanceSparse(flag);
} else {
return getInstanceFull(flag);
}
}
/**
* Reads a single instance using the tokenizer and returns it.
*
* @param flag if method should test for carriage return after
* each instance
* @return null if end of file has been reached
* @throws IOException if the information is not read
* successfully
*/
protected Instance getInstanceSparse(boolean flag) throws IOException {
int valIndex, numValues = 0, maxIndex = -1;
// Get values
do {
// Get index
getIndex();
if (m_Tokenizer.ttype == '}') {
break;
}
// Is index valid?
try{
m_IndicesBuffer[numValues] = Integer.valueOf(m_Tokenizer.sval).intValue();
} catch (NumberFormatException e) {
errorMessage("index number expected");
}
if (m_IndicesBuffer[numValues] <= maxIndex) {
errorMessage("indices have to be ordered");
}
if ((m_IndicesBuffer[numValues] < 0) ||
(m_IndicesBuffer[numValues] >= m_Data.numAttributes())) {
errorMessage("index out of bounds");
}
maxIndex = m_IndicesBuffer[numValues];
// Get value;
getNextToken();
// Check if value is missing.
if (m_Tokenizer.ttype == '?') {
m_ValueBuffer[numValues] = Utils.missingValue();
} else {
// Check if token is valid.
if (m_Tokenizer.ttype != StreamTokenizer.TT_WORD) {
errorMessage("not a valid value");
}
switch (m_Data.attribute(m_IndicesBuffer[numValues]).type()) {
case Attribute.NOMINAL:
// Check if value appears in header.
valIndex =
m_Data.attribute(m_IndicesBuffer[numValues]).indexOfValue(m_Tokenizer.sval);
if (valIndex == -1) {
errorMessage("nominal value not declared in header");
}
m_ValueBuffer[numValues] = (double)valIndex;
break;
case Attribute.NUMERIC:
// Check if value is really a number.
try{
m_ValueBuffer[numValues] = Double.valueOf(m_Tokenizer.sval).
doubleValue();
} catch (NumberFormatException e) {
errorMessage("number expected");
}
break;
case Attribute.STRING:
m_ValueBuffer[numValues] =
m_Data.attribute(m_IndicesBuffer[numValues]).addStringValue(m_Tokenizer.sval);
break;
case Attribute.DATE:
try {
m_ValueBuffer[numValues] =
m_Data.attribute(m_IndicesBuffer[numValues]).parseDate(m_Tokenizer.sval);
} catch (ParseException e) {
errorMessage("unparseable date: " + m_Tokenizer.sval);
}
break;
case Attribute.RELATIONAL:
try {
ArffReader arff = new ArffReader(new StringReader(m_Tokenizer.sval), m_Data.attribute(m_IndicesBuffer[numValues]).relation(), 0);
Instances data = arff.getData();
m_ValueBuffer[numValues] = m_Data.attribute(m_IndicesBuffer[numValues]).addRelation(data);
}
catch (Exception e) {
throw new IOException(e.toString() + " of line " + getLineNo());
}
break;
default:
errorMessage("unknown attribute type in column " + m_IndicesBuffer[numValues]);
}
}
numValues++;
} while (true);
double weight = 1.0;
if (flag) {
// check for an instance weight
weight = getInstanceWeight();
if (!Double.isNaN(weight)) {
getLastToken(true);
} else {
weight = 1.0;
}
}
// Add instance to dataset
double[] tempValues = new double[numValues];
int[] tempIndices = new int[numValues];
System.arraycopy(m_ValueBuffer, 0, tempValues, 0, numValues);
System.arraycopy(m_IndicesBuffer, 0, tempIndices, 0, numValues);
Instance inst = new SparseInstance(weight, tempValues, tempIndices, m_Data.numAttributes());
inst.setDataset(m_Data);
return inst;
}
/**
* Reads a single instance using the tokenizer and returns it.
*
* @param flag if method should test for carriage return after
* each instance
* @return null if end of file has been reached
* @throws IOException if the information is not read
* successfully
*/
protected Instance getInstanceFull(boolean flag) throws IOException {
double[] instance = new double[m_Data.numAttributes()];
int index;
// Get values for all attributes.
for (int i = 0; i < m_Data.numAttributes(); i++){
// Get next token
if (i > 0) {
getNextToken();
}
// Check if value is missing.
if (m_Tokenizer.ttype == '?') {
instance[i] = Utils.missingValue();
} else {
// Check if token is valid.
if (m_Tokenizer.ttype != StreamTokenizer.TT_WORD) {
errorMessage("not a valid value");
}
switch (m_Data.attribute(i).type()) {
case Attribute.NOMINAL:
// Check if value appears in header.
index = m_Data.attribute(i).indexOfValue(m_Tokenizer.sval);
if (index == -1) {
errorMessage("nominal value not declared in header");
}
instance[i] = (double)index;
break;
case Attribute.NUMERIC:
// Check if value is really a number.
try{
instance[i] = Double.valueOf(m_Tokenizer.sval).
doubleValue();
} catch (NumberFormatException e) {
errorMessage("number expected");
}
break;
case Attribute.STRING:
instance[i] = m_Data.attribute(i).addStringValue(m_Tokenizer.sval);
break;
case Attribute.DATE:
try {
instance[i] = m_Data.attribute(i).parseDate(m_Tokenizer.sval);
} catch (ParseException e) {
errorMessage("unparseable date: " + m_Tokenizer.sval);
}
break;
case Attribute.RELATIONAL:
try {
ArffReader arff = new ArffReader(new StringReader(m_Tokenizer.sval), m_Data.attribute(i).relation(), 0);
Instances data = arff.getData();
instance[i] = m_Data.attribute(i).addRelation(data);
}
catch (Exception e) {
throw new IOException(e.toString() + " of line " + getLineNo());
}
break;
default:
errorMessage("unknown attribute type in column " + i);
}
}
}
double weight = 1.0;
if (flag) {
// check for an instance weight
weight = getInstanceWeight();
if (!Double.isNaN(weight)) {
getLastToken(true);
} else {
weight = 1.0;
}
}
// Add instance to dataset
Instance inst = new DenseInstance(weight, instance);
inst.setDataset(m_Data);
return inst;
}
/**
* Reads and stores header of an ARFF file.
*
* @param capacity the number of instances to reserve in the data
* structure
* @throws IOException if the information is not read
* successfully
*/
protected void readHeader(int capacity) throws IOException {
m_Lines = 0;
String relationName = "";
// Get name of relation.
getFirstToken();
if (m_Tokenizer.ttype == StreamTokenizer.TT_EOF) {
errorMessage("premature end of file");
}
if (Instances.ARFF_RELATION.equalsIgnoreCase(m_Tokenizer.sval)) {
getNextToken();
relationName = m_Tokenizer.sval;
getLastToken(false);
} else {
errorMessage("keyword " + Instances.ARFF_RELATION + " expected");
}
// Create vectors to hold information temporarily.
ArrayList<Attribute> attributes = new ArrayList<Attribute>();
// Get attribute declarations.
getFirstToken();
if (m_Tokenizer.ttype == StreamTokenizer.TT_EOF) {
errorMessage("premature end of file");
}
while (Attribute.ARFF_ATTRIBUTE.equalsIgnoreCase(m_Tokenizer.sval)) {
attributes = parseAttribute(attributes);
}
// Check if data part follows. We can't easily check for EOL.
if (!Instances.ARFF_DATA.equalsIgnoreCase(m_Tokenizer.sval)) {
errorMessage("keyword " + Instances.ARFF_DATA + " expected");
}
// Check if any attributes have been declared.
if (attributes.size() == 0) {
errorMessage("no attributes declared");
}
m_Data = new Instances(relationName, attributes, capacity);
}
/**
* Parses the attribute declaration.
*
* @param attributes the current attributes vector
* @return the new attributes vector
* @throws IOException if the information is not read
* successfully
*/
protected ArrayList<Attribute> parseAttribute(ArrayList<Attribute> attributes) throws IOException {
String attributeName;
ArrayList<String> attributeValues;
// Get attribute name.
getNextToken();
attributeName = m_Tokenizer.sval;
getNextToken();
// Check if attribute is nominal.
if (m_Tokenizer.ttype == StreamTokenizer.TT_WORD) {
// Attribute is real, integer, or string.
if (m_Tokenizer.sval.equalsIgnoreCase(Attribute.ARFF_ATTRIBUTE_REAL) ||
m_Tokenizer.sval.equalsIgnoreCase(Attribute.ARFF_ATTRIBUTE_INTEGER) ||
m_Tokenizer.sval.equalsIgnoreCase(Attribute.ARFF_ATTRIBUTE_NUMERIC)) {
attributes.add(new Attribute(attributeName, attributes.size()));
readTillEOL();
} else if (m_Tokenizer.sval.equalsIgnoreCase(Attribute.ARFF_ATTRIBUTE_STRING)) {
attributes.add(new Attribute(attributeName, (ArrayList<String>)null,
attributes.size()));
readTillEOL();
} else if (m_Tokenizer.sval.equalsIgnoreCase(Attribute.ARFF_ATTRIBUTE_DATE)) {
String format = null;
if (m_Tokenizer.nextToken() != StreamTokenizer.TT_EOL) {
if ((m_Tokenizer.ttype != StreamTokenizer.TT_WORD) &&
(m_Tokenizer.ttype != '\'') &&
(m_Tokenizer.ttype != '\"')) {
errorMessage("not a valid date format");
}
format = m_Tokenizer.sval;
readTillEOL();
} else {
m_Tokenizer.pushBack();
}
attributes.add(new Attribute(attributeName, format, attributes.size()));
} else if (m_Tokenizer.sval.equalsIgnoreCase(Attribute.ARFF_ATTRIBUTE_RELATIONAL)) {
readTillEOL();
// Read attributes for subrelation
// First, save current set of attributes
ArrayList<Attribute> atts = attributes;
attributes = new ArrayList<Attribute>();
// Now, read attributes until we hit end of declaration of relational value
getFirstToken();
if (m_Tokenizer.ttype == StreamTokenizer.TT_EOF) {
errorMessage("premature end of file");
}
do {
if (Attribute.ARFF_ATTRIBUTE.equalsIgnoreCase(m_Tokenizer.sval)) {
attributes = parseAttribute(attributes);
} else if (Attribute.ARFF_END_SUBRELATION.equalsIgnoreCase(m_Tokenizer.sval)) {
getNextToken();
if (!attributeName.equalsIgnoreCase(m_Tokenizer.sval)) {
errorMessage("declaration of subrelation " + attributeName +
" must be terminated by " + "@end " + attributeName);
}
break;
} else {
errorMessage("declaration of subrelation " + attributeName +
" must be terminated by " + "@end " + attributeName);
}
} while (true);
// Make relation and restore original set of attributes
Instances relation = new Instances(attributeName, attributes, 0);
attributes = atts;
attributes.add(new Attribute(attributeName, relation, attributes.size()));
} else {
errorMessage("no valid attribute type or invalid "+
"enumeration");
}
} else {
// Attribute is nominal.
attributeValues = new ArrayList<String>();
m_Tokenizer.pushBack();
// Get values for nominal attribute.
if (m_Tokenizer.nextToken() != '{') {
errorMessage("{ expected at beginning of enumeration");
}
while (m_Tokenizer.nextToken() != '}') {
if (m_Tokenizer.ttype == StreamTokenizer.TT_EOL) {
errorMessage("} expected at end of enumeration");
} else {
attributeValues.add(m_Tokenizer.sval);
}
}
attributes.add(new Attribute(attributeName, attributeValues,
attributes.size()));
}
getLastToken(false);
getFirstToken();
if (m_Tokenizer.ttype == StreamTokenizer.TT_EOF)
errorMessage("premature end of file");
return attributes;
}
/**
* Reads and skips all tokens before next end of line token.
*
* @throws IOException in case something goes wrong
*/
protected void readTillEOL() throws IOException {
while (m_Tokenizer.nextToken() != StreamTokenizer.TT_EOL) {};
m_Tokenizer.pushBack();
}
/**
* Returns the header format
*
* @return the header format
*/
public Instances getStructure() {
return new Instances(m_Data, 0);
}
/**
* Returns the data that was read
*
* @return the data
*/
public Instances getData() {
return m_Data;
}
/**
* Returns the revision string.
*
* @return the revision
*/
public String getRevision() {
return RevisionUtils.extract("$Revision: 7048 $");
}
}
/**
* Returns a string describing this Loader
* @return a description of the Loader suitable for
* displaying in the explorer/experimenter gui
*/
public String globalInfo() {
return "Reads a source that is in arff (attribute relation file format) "
+"format. ";
}
/**
* Get the file extension used for arff files
*
* @return the file extension
*/
public String getFileExtension() {
return FILE_EXTENSION;
}
/**
* Gets all the file extensions used for this type of file
*
* @return the file extensions
*/
public String[] getFileExtensions() {
return new String[]{FILE_EXTENSION, FILE_EXTENSION_COMPRESSED};
}
/**
* Returns a description of the file type.
*
* @return a short file description
*/
public String getFileDescription() {
return "Arff data files";
}
/**
* Resets the Loader ready to read a new data set or the
* same data set again.
*
* @throws IOException if something goes wrong
*/
public void reset() throws IOException {
m_structure = null;
setRetrieval(NONE);
if (m_File != null && !(new File(m_File).isDirectory())) {
setFile(new File(m_File));
} else if (m_URL != null && !m_URL.equals("http://")) {
setURL(m_URL);
}
}
/**
* Resets the Loader object and sets the source of the data set to be
* the supplied url.
*
* @param url the source url.
* @throws IOException if an error occurs
*/
public void setSource(URL url) throws IOException {
m_structure = null;
setRetrieval(NONE);
setSource(url.openStream());
m_URL = url.toString();
// make sure that the file is null so that any calls to
// reset() work properly
m_File = null;
}
/**
* get the File specified as the source
*
* @return the source file
*/
public File retrieveFile() {
return new File(m_File);
}
/**
* sets the source File
*
* @param file the source file
* @throws IOException if an error occurs
*/
public void setFile(File file) throws IOException {
m_File = file.getPath();
setSource(file);
}
/**
* Set the url to load from
*
* @param url the url to load from
* @throws IOException if the url can't be set.
*/
public void setURL(String url) throws IOException {
m_URL = url;
setSource(new URL(url));
}
/**
* Return the current url
*
* @return the current url
*/
public String retrieveURL() {
return m_URL;
}
/**
* Resets the Loader object and sets the source of the data set to be
* the supplied InputStream.
*
* @param in the source InputStream.
* @throws IOException always thrown.
*/
public void setSource(InputStream in) throws IOException {
m_File = (new File(System.getProperty("user.dir"))).getAbsolutePath();
m_URL = "http://";
m_sourceReader = new BufferedReader(new InputStreamReader(in));
}
/**
* Determines and returns (if possible) the structure (internally the
* header) of the data set as an empty set of instances.
*
* @return the structure of the data set as an empty set of Instances
* @throws IOException if an error occurs
*/
public Instances getStructure() throws IOException {
if (m_structure == null) {
if (m_sourceReader == null) {
throw new IOException("No source has been specified");
}
try {
m_ArffReader = new ArffReader(m_sourceReader, 1);
m_structure = m_ArffReader.getStructure();
} catch (Exception ex) {
throw new IOException("Unable to determine structure as arff (Reason: " + ex.toString() + ").");
}
}
return new Instances(m_structure, 0);
}
/**
* Return the full data set. If the structure hasn't yet been determined
* by a call to getStructure then method should do so before processing
* the rest of the data set.
*
* @return the structure of the data set as an empty set of Instances
* @throws IOException if there is no source or parsing fails
*/
public Instances getDataSet() throws IOException {
if (m_sourceReader == null) {
throw new IOException("No source has been specified");
}
if (getRetrieval() == INCREMENTAL) {
throw new IOException("Cannot mix getting Instances in both incremental and batch modes");
}
setRetrieval(BATCH);
if (m_structure == null) {
getStructure();
}
// Read all instances
Instance inst;
while ((inst = m_ArffReader.readInstance(m_structure)) != null)
m_structure.add(inst);
Instances readIn = new Instances(m_structure);
// close the stream
m_sourceReader.close();
return readIn;
}
/**
* Read the data set incrementally---get the next instance in the data
* set or returns null if there are no
* more instances to get. If the structure hasn't yet been
* determined by a call to getStructure then method should do so before
* returning the next instance in the data set.
*
* @param structure the dataset header information, will get updated in
* case of string or relational attributes
* @return the next instance in the data set as an Instance object or null
* if there are no more instances to be read
* @throws IOException if there is an error during parsing
*/
public Instance getNextInstance(Instances structure) throws IOException {
m_structure = structure;
if (getRetrieval() == BATCH) {
throw new IOException("Cannot mix getting Instances in both incremental and batch modes");
}
setRetrieval(INCREMENTAL);
Instance current = null;
if (m_sourceReader != null)
current = m_ArffReader.readInstance(m_structure);
if ((m_sourceReader != null) && (current == null)) {
try {
// close the stream
m_sourceReader.close();
m_sourceReader = null;
// reset();
} catch (Exception ex) {
ex.printStackTrace();
}
}
return current;
}
/**
* Returns the revision string.
*
* @return the revision
*/
public String getRevision() {
return RevisionUtils.extract("$Revision: 7048 $");
}
/**
* Main method.
*
* @param args should contain the name of an input file.
*/
public static void main(String [] args) {
runFileLoader(new ArffLoader(), args);
}
}