/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/*
* ConverterUtils.java
* Copyright (C) 2000 University of Waikato, Hamilton, New Zealand
*
*/
package org.integratedmodelling.riskwiz.learning.data.loader;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;
import java.io.StreamTokenizer;
import java.net.URL;
import java.util.Collections;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.Vector;
import org.integratedmodelling.riskwiz.learning.data.Instance;
import org.integratedmodelling.riskwiz.learning.data.Instances;
/**
* Utility routines for the converter package.
*
* @author Mark Hall (mhall@cs.waikato.ac.nz)
* @author FracPete (fracpete at waikato dot ac dot nz)
* @version $Revision: 1.14 $
* @see Serializable
*/
public class ConverterUtils
implements Serializable {
/**
* Helper class for loading data from files and URLs. It uses the ConverterUtils
* class to determine which converter to use for loading the data into
* memory. If the chosen converter is incremental, the data is loaded
* incrementally, otherwise as a batch. In both cases the same
* interface is used (<code>hasMoreElements</code>,
* <code>nextElement</code>). Before the data can be read again, the
* <code>reset</code> method has to be called.
* The data source can also be initialized with an Instances object, in
* order to provide a unified interface to files and already loaded datasets.
*
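* A minimal usage sketch for reading instance by instance (the file name
* <code>"data.arff"</code> is only a placeholder; the same loop works for
* incremental and batch loaders alike):
* <pre>
* DataSource source = new DataSource("data.arff");
* Instances structure = source.getStructure();
* while (source.hasMoreElements(structure)) {
*     Instance inst = source.nextElement(structure);
*     // ... process inst
* }
* source.reset();
* </pre>
*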
* @author FracPete (fracpete at waikato dot ac dot nz)
* @version $Revision: 1.14 $
* @see #hasMoreElements(Instances)
* @see #nextElement(Instances)
* @see #reset()
* @see DataSink
*/
public static class DataSource
implements Serializable {
/** for serialization. */
private static final long serialVersionUID = -613122395928757332L;
/** the file to load. */
protected File m_File;
/** the URL to load. */
protected URL m_URL;
/** the loader. */
protected Loader m_Loader;
/** whether the loader is incremental. */
protected boolean m_Incremental;
/** the instance counter for the batch case. */
protected int m_BatchCounter;
/** the last internally read instance. */
protected Instance m_IncrementalBuffer;
/** the batch buffer. */
protected Instances m_BatchBuffer;
/**
* Tries to load the data from the given location, which can be either a
* regular file or a web location (http://, https://, ftp:// or file://).
*
* @param location the name of the file or URL to load
* @param loader the loader to use if the location is not an ARFF file
* @throws Exception if initialization fails
*/
public DataSource(String location, Loader loader) throws Exception {
super();
// file or URL?
if (location.startsWith("http://")
|| location.startsWith("https://")
|| location.startsWith("ftp://")
|| location.startsWith("file://")) {
m_URL = new URL(location);
} else {
m_File = new File(location);
}
// quick check: is it ARFF?
if (isArff(location)) {
m_Loader = new ArffLoader();
} else {
m_Loader = loader;
// do we have a converter?
if (m_Loader == null) {
throw new IllegalArgumentException(
"No suitable converter found for '" + location
+ "'!");
}
}
// incremental loader?
m_Incremental = (m_Loader instanceof IncrementalConverter);
reset();
}
/**
* Tries to load the data from the given location. Can be either a regular
* file or a web location (http://, https://, ftp:// or file://); only ARFF
* locations are accepted by this constructor.
*
* @param location the name of the ARFF file or URL to load
* @throws Exception if initialization fails
*/
public DataSource(String location) throws Exception {
super();
// file or URL?
if (location.startsWith("http://")
|| location.startsWith("https://")
|| location.startsWith("ftp://")
|| location.startsWith("file://")) {
m_URL = new URL(location);
} else {
m_File = new File(location);
}
// quick check: is it ARFF?
if (isArff(location)) {
m_Loader = new ArffLoader();
} else {
throw new IllegalArgumentException(
"No suitable converter found for '" + location + "'!");
}
// incremental loader?
m_Incremental = (m_Loader instanceof IncrementalConverter);
reset();
}
/**
* Initializes the datasource with the given dataset.
*
* @param inst the dataset to use
*/
public DataSource(Instances inst) {
super();
m_BatchBuffer = inst;
m_Loader = null;
m_File = null;
m_URL = null;
m_Incremental = false;
}
/**
* Initializes the datasource with the given Loader.
*
* @param loader the Loader to use
*/
public DataSource(Loader loader) {
super();
m_BatchBuffer = null;
m_Loader = loader;
m_File = null;
m_URL = null;
m_Incremental = (m_Loader instanceof IncrementalConverter);
initBatchBuffer();
}
/**
* Initializes the datasource with the given input stream. This stream
* is always interpreted as ARFF.
*
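* A minimal sketch (the file name is only a placeholder; any stream
* containing ARFF data will do):
* <pre>
* DataSource source = new DataSource(new java.io.FileInputStream("data.arff"));
* Instances data = source.getDataSet();
* </pre>
*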
* @param stream the stream to use
*/
public DataSource(InputStream stream) {
super();
m_BatchBuffer = null;
m_Loader = new ArffLoader();
try {
m_Loader.setSource(stream);
} catch (Exception e) {
m_Loader = null;
}
m_File = null;
m_URL = null;
m_Incremental = (m_Loader instanceof IncrementalConverter);
initBatchBuffer();
}
/**
* initializes the batch buffer if necessary, i.e., for non-incremental
* loaders.
*/
protected void initBatchBuffer() {
try {
if ((m_Loader != null) && !isIncremental()) {
m_BatchBuffer = m_Loader.getDataSet();
} else {
m_BatchBuffer = null;
}
} catch (Exception e) {
e.printStackTrace();
}
}
/**
* returns whether the extension of the location is likely to be of ARFF
* format, i.e., ending in ".arff" or ".arff.gz" (case-insensitive).
*
* @param location the file location to check
* @return true if the location seems to be of ARFF format
*/
public static boolean isArff(String location) {
String loc = location.toLowerCase();
return loc.endsWith(ArffLoader.FILE_EXTENSION.toLowerCase())
|| loc.endsWith(ArffLoader.FILE_EXTENSION_COMPRESSED.toLowerCase());
}
/**
* returns whether the loader is an incremental one.
*
* @return true if the loader is a true incremental one
*/
public boolean isIncremental() {
return m_Incremental;
}
/**
* returns the determined loader, null if the DataSource was initialized
* with data alone and not a file/URL.
*
* @return the loader used for retrieving the data
*/
public Loader getLoader() {
return m_Loader;
}
/**
* returns the full dataset, can be null in case of an error.
*
* @return the full dataset
* @throws Exception if resetting of loader fails
*/
public Instances getDataSet() throws Exception {
Instances result;
result = null;
// reset the loader
reset();
try {
if (m_Loader != null) {
result = m_Loader.getDataSet();
} else {
result = m_BatchBuffer;
}
} catch (Exception e) {
e.printStackTrace();
result = null;
}
return result;
}
/**
* returns the full dataset with the specified class index set,
* can be null in case of an error.
*
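* For example, to treat the last attribute as the class (assuming, as in
* Weka, that <code>Instances</code> exposes <code>numAttributes()</code>):
* <pre>
* DataSource source = new DataSource("data.arff");
* Instances data = source.getDataSet(source.getStructure().numAttributes() - 1);
* </pre>
*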
* @param classIndex the class index for the dataset
* @return the full dataset
* @throws Exception if resetting of loader fails
*/
public Instances getDataSet(int classIndex) throws Exception {
Instances result;
result = getDataSet();
if (result != null) {
result.setClassIndex(classIndex);
}
return result;
}
/**
* resets the loader.
*
* @throws Exception if resetting fails
*/
public void reset() throws Exception {
if (m_File != null) {
((AbstractFileLoader) m_Loader).setFile(m_File);
} else if (m_URL != null) {
((URLSourcedLoader) m_Loader).setURL(m_URL.toString());
} else if (m_Loader != null) {
m_Loader.reset();
}
m_BatchCounter = 0;
m_IncrementalBuffer = null;
if (m_Loader != null) {
if (!isIncremental()) {
m_BatchBuffer = m_Loader.getDataSet();
} else {
m_BatchBuffer = null;
}
}
}
/**
* returns the structure of the data.
*
* @return the structure of the data
* @throws Exception if something goes wrong
*/
public Instances getStructure() throws Exception {
if (m_Loader != null) {
return m_Loader.getStructure();
} else {
return new Instances(m_BatchBuffer, 0);
}
}
/**
* returns the structure of the data, with the defined class index.
*
* @param classIndex the class index for the dataset
* @return the structure of the data
* @throws Exception if something goes wrong
*/
public Instances getStructure(int classIndex) throws Exception {
Instances result;
result = getStructure();
if (result != null) {
result.setClassIndex(classIndex);
}
return result;
}
/**
* returns whether there are more Instance objects in the data.
*
* @param structure the structure of the dataset
* @return true if there are more Instance objects
* available
* @see #nextElement(Instances)
*/
public boolean hasMoreElements(Instances structure) {
boolean result;
result = false;
if (isIncremental()) {
// user still hasn't collected the last one?
if (m_IncrementalBuffer != null) {
result = true;
} else {
try {
m_IncrementalBuffer = m_Loader.getNextInstance(structure);
result = (m_IncrementalBuffer != null);
} catch (Exception e) {
e.printStackTrace();
result = false;
}
}
} else {
result = (m_BatchCounter < m_BatchBuffer.numInstances());
}
return result;
}
/**
* returns the next element and sets the specified dataset, null if
* none available.
*
* @param dataset the dataset to set for the instance
* @return the next Instance
*/
public Instance nextElement(Instances dataset) {
Instance result;
result = null;
if (isIncremental()) {
// is there still an instance in the buffer?
if (m_IncrementalBuffer != null) {
result = m_IncrementalBuffer;
m_IncrementalBuffer = null;
} else {
try {
result = m_Loader.getNextInstance(dataset);
} catch (Exception e) {
e.printStackTrace();
result = null;
}
}
} else {
if (m_BatchCounter < m_BatchBuffer.numInstances()) {
result = m_BatchBuffer.instance(m_BatchCounter);
m_BatchCounter++;
}
}
if (result != null) {
result.setDataset(dataset);
}
return result;
}
/**
* convenience method for loading a dataset in batch mode from a file or URL.
*
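* A minimal sketch (the file name is a placeholder; for an ARFF location
* the supplied loader is ignored in favour of the built-in ArffLoader):
* <pre>
* Instances data = DataSource.read("data.arff", new ArffLoader());
* </pre>
*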
* @param location the dataset to load
* @param loader the loader to use if the location is not an ARFF file
* @return the dataset
* @throws Exception if loading fails
* @see #DataSource(String)
*/
public static Instances read(String location, Loader loader) throws Exception {
DataSource source;
Instances result;
source = new DataSource(location, loader);
result = source.getDataSet();
return result;
}
/**
* convenience method for loading a dataset in batch mode from a stream.
*
* @param stream the stream to load the dataset from
* @return the dataset
* @throws Exception if loading fails
* @see #DataSource(InputStream)
*/
public static Instances read(InputStream stream) throws Exception {
DataSource source;
Instances result;
source = new DataSource(stream);
result = source.getDataSet();
return result;
}
/**
* convenience method for loading a dataset in batch mode, using the given loader.
*
* @param loader the loader to get the dataset from
* @return the dataset
* @throws Exception if loading fails
* @see #DataSource(Loader)
*/
public static Instances read(Loader loader) throws Exception {
DataSource source;
Instances result;
source = new DataSource(loader);
result = source.getDataSet();
return result;
}
/**
* for testing only - takes a data file as input.
*
* @param args the commandline arguments
* @throws Exception if something goes wrong
*/
public static void main(String[] args) throws Exception {
if (args.length != 1) {
System.out.println(
"\nUsage: " + DataSource.class.getName() + " <file>\n");
System.exit(1);
}
DataSource loader = new DataSource(args[0], new ArffLoader());
System.out.println("Incremental? " + loader.isIncremental());
System.out.println(
"Loader: " + loader.getLoader().getClass().getName());
System.out.println("Data:\n");
Instances structure = loader.getStructure();
System.out.println(structure);
while (loader.hasMoreElements(structure)) {
System.out.println(loader.nextElement(structure));
}
Instances inst = loader.getDataSet();
loader = new DataSource(inst);
System.out.println("\n\nProxy-Data:\n");
System.out.println(loader.getStructure());
while (loader.hasMoreElements(structure)) {
System.out.println(loader.nextElement(inst));
}
}
}
/** for serialization. */
static final long serialVersionUID = -2460855349276148760L;
/** all available loaders (extension <-> classname). */
protected static Hashtable<String, String> m_FileLoaders;
/** all available URL loaders (extension <-> classname). */
protected static Hashtable<String, String> m_URLFileLoaders;
/**
* Gets token, skipping empty lines.
*
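* A small sketch of the empty-line skipping (the input string is only an
* illustration):
* <pre>
* StreamTokenizer st = new StreamTokenizer(new java.io.StringReader("\n\nfoo bar"));
* st.eolIsSignificant(true);
* ConverterUtils.getFirstToken(st); // skips the leading EOLs, st.sval is "foo"
* </pre>
*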
* @param tokenizer the stream tokenizer
* @throws IOException if reading the next token fails
*/
public static void getFirstToken(StreamTokenizer tokenizer)
throws IOException {
while (tokenizer.nextToken() == StreamTokenizer.TT_EOL) {}
if ((tokenizer.ttype == '\'') || (tokenizer.ttype == '"')) {
tokenizer.ttype = StreamTokenizer.TT_WORD;
} else if ((tokenizer.ttype == StreamTokenizer.TT_WORD)
&& (tokenizer.sval.equals("?"))) {
tokenizer.ttype = '?';
}
}
/**
* Gets token.
*
* @param tokenizer the stream tokenizer
* @throws IOException if reading the next token fails
*/
public static void getToken(StreamTokenizer tokenizer) throws IOException {
tokenizer.nextToken();
if (tokenizer.ttype == StreamTokenizer.TT_EOL) {
return;
}
if ((tokenizer.ttype == '\'') || (tokenizer.ttype == '"')) {
tokenizer.ttype = StreamTokenizer.TT_WORD;
} else if ((tokenizer.ttype == StreamTokenizer.TT_WORD)
&& (tokenizer.sval.equals("?"))) {
tokenizer.ttype = '?';
}
}
/**
* Throws error message with line number and last token read.
*
* @param theMsg the error message to be thrown
* @param tokenizer the stream tokenizer
* @throws IOException containing the error message
*/
public static void errms(StreamTokenizer tokenizer, String theMsg)
throws IOException {
throw new IOException(theMsg + ", read " + tokenizer.toString());
}
/**
* returns a vector with the classnames of all the loaders from the
* given hashtable.
*
* @param ht the hashtable with the extension/converter relation
* @return the classnames of the loaders
*/
protected static Vector<String> getConverters(Hashtable<String, String> ht) {
Vector<String> result;
Enumeration<String> enm;
String converter;
result = new Vector<String>();
// get all classnames
enm = ht.elements();
while (enm.hasMoreElements()) {
converter = enm.nextElement();
if (!result.contains(converter)) {
result.add(converter);
}
}
// sort names
Collections.sort(result);
return result;
}
/**
* tries to determine the converter to use for this kind of file, returns
* null if none can be found in the given hashtable.
*
* @param filename the file to return a converter for
* @param ht the hashtable with the relation extension/converter
* @return the converter if one was found, null otherwise
*/
protected static Object getConverterForFile(String filename, Hashtable<String, String> ht) {
Object result;
String extension;
int index;
result = null;
index = filename.lastIndexOf('.');
if (index > -1) {
extension = filename.substring(index).toLowerCase();
result = getConverterForExtension(extension, ht);
// is it a compressed format?
if (extension.equals(".gz") && result == null) {
index = filename.lastIndexOf('.', index - 1);
if (index > -1) {
extension = filename.substring(index).toLowerCase();
result = getConverterForExtension(extension, ht);
}
}
}
return result;
}
/**
* tries to determine the loader to use for this kind of extension, returns
* null if none can be found.
*
* @param extension the file extension to return a converter for
* @param ht the hashtable with the relation extension/converter
* @return the converter if one was found, null otherwise
*/
protected static Object getConverterForExtension(String extension, Hashtable<String, String> ht) {
Object result;
String classname;
result = null;
classname = ht.get(extension);
if (classname != null) {
try {
result = Class.forName(classname).newInstance();
} catch (Exception e) {
result = null;
e.printStackTrace();
}
}
return result;
}
}