Package edu.ucla.sspace.matrix

Source Code of edu.ucla.sspace.matrix.MatrixIO

/*
* Copyright 2009 David Jurgens
*
* This file is part of the S-Space package and is covered under the terms and
* conditions therein.
*
* The S-Space package is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as published
* by the Free Software Foundation and distributed hereunder to you.
*
* THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
* EXPRESS OR IMPLIED ARE MADE.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
* NO REPRESENTATIONS OR WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY
* PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
* WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
* RIGHTS.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

package edu.ucla.sspace.matrix;

import edu.ucla.sspace.matrix.Matrix.Type;

import edu.ucla.sspace.vector.CompactSparseVector;
import edu.ucla.sspace.vector.DoubleVector;
import edu.ucla.sspace.vector.SparseDoubleVector;
import edu.ucla.sspace.vector.SparseVector;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.EOFException;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOError;
import java.io.IOException;
import java.io.PrintWriter;

import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Scanner;
import java.util.SortedMap;
import java.util.TreeMap;

import java.util.logging.Logger;
import java.util.logging.Level;


/**
* A shared utility for printing matrices to files in a uniform manner and
* converting between different formats.
*
* @see MatrixIO.Format
* @see Matrix
* @see Matrix.Type
*
* @author David Jurgens
*/
public class MatrixIO {

    private static final Logger MATRIX_IO_LOGGER =
        Logger.getLogger(MatrixIO.class.getName());

    /**
     * An enum that specifies the formatting used for the matrix that is
     * serialized in a file.  Format specifications are as follows: <br>
     *
     * <center>
     * <table valign="top" border="1" width="800">
     *
     * <tr><td><center>format</center></td>
     *   <td><center>description</center></td></tr>
     *
     *
     * <tr><td valign="top">{@link Format#DENSE_TEXT DENSE_TEXT}</td>
     *
     *       <td>Each row is on its own line, with each column being delimited
     *       by one or more white space characters.  All rows should have the
     *       same number of columns.</td>
     * </tr>
     *
     *
     * <tr><td valign="top">{@link Format#MATLAB_SPARSE MATLAB_SPARSE}</td>
     *
     *   <td> The sparse format supported by Matlab.  See <a
     *   href="http://www.mathworks.com/access/helpdesk/help/techdoc/index.html?/access/helpdesk/help/techdoc/ref/spconvert.html">here</a>
     *   for full details.</td>
     *
     * </tr>   
     *
     *
     * <tr><td valign="top">{@link Format#SVDLIBC_SPARSE_TEXT SVDLIBC_SPARSE_TEXT}</td>    
     *
     *    <td>The sparse human readable format supported by SVDLIBC.  See <a
     *    href="http://tedlab.mit.edu/~dr/svdlibc/SVD_F_ST.html">here</a> for
     *    full details.</td>
     *
     * </tr>
     *
     *
     * <tr><td valign="top">{@link Format#SVDLIBC_DENSE_TEXT SVDLIBC_DENSE_TEXT}</td>
     *
     *   <td> The dense human readable format supported by SVDLIBC.  See <a
     *   href="http://tedlab.mit.edu/~dr/svdlibc/SVD_F_DT.html">here</a> for
     *   full details.</td>
     *
     * </tr>
     *
     *
     * <tr><td valign="top">{@link Format#SVDLIBC_SPARSE_BINARY SVDLIBC_SPARSE_BINARY}</td>
     *
     *   <td>The sparse binary format supported by SVDLIBC.  See <a
     *   href="http://tedlab.mit.edu/~dr/svdlibc/SVD_F_SB.html">here</a> for
     *   full details.</td>
    
     * </tr>
     *
     *
     * <tr><td valign="top">{@link Format#SVDLIBC_DENSE_BINARY SVDLIBC_DENSE_BINARY}</td>
     *
     *   <td>The dense binary format supported by SVDLIBC.  See <a
     *   href="http://tedlab.mit.edu/~dr/svdlibc/SVD_F_DB.html">here</a> for
     *   full details.</td>
     *
     * </tr>
     * </table>
     * </center>
     */
    public enum Format {
        /**
         * A human readable format where each row has its own line and all
         * column values are provided.
         */
        DENSE_TEXT,

        /**
         * The sparse format supported by Matlab.  See <a
         * href="http://www.mathworks.com/access/helpdesk/help/techdoc/index.html?/access/helpdesk/help/techdoc/ref/spconvert.html">here</a>
         * for full details.
         */
        MATLAB_SPARSE,

        /**
         * The sparse human readable format supported by SVDLIBC.  See <a
         * href="http://tedlab.mit.edu/~dr/svdlibc/SVD_F_ST.html">here</a> for
         * full details.
         */
        SVDLIBC_SPARSE_TEXT,

        /**
         * The dense human readable format supported by SVDLIBC.  See <a
         * href="http://tedlab.mit.edu/~dr/svdlibc/SVD_F_DT.html">here</a> for
         * full details.
         */
        SVDLIBC_DENSE_TEXT,

        /**
         * The sparse binary format supported by SVDLIBC.  See <a
         * href="http://tedlab.mit.edu/~dr/svdlibc/SVD_F_SB.html">here</a> for
         * full details.
         */
        SVDLIBC_SPARSE_BINARY,

        /**
         * The dense binary format supported by SVDLIBC.  See <a
         * href="http://tedlab.mit.edu/~dr/svdlibc/SVD_F_DB.html">here</a> for
         * full details.
         */
        SVDLIBC_DENSE_BINARY,

        /**
         * The sparse text format supported by <a
         * href="http://glaros.dtc.umn.edu/gkhome/cluto/cluto/overview">
         * CLUTO</a>.  See more details in the <a
         * href="http://glaros.dtc.umn.edu/gkhome/fetch/sw/cluto/manual.pdf">
         * CLUTO manual</a>.
         */
        CLUTO_DENSE,

        /**
         * The sparse text format supported by <a
         * href="http://glaros.dtc.umn.edu/gkhome/cluto/cluto/overview">
         * CLUTO</a>.  See more details in the <a
         * href="http://glaros.dtc.umn.edu/gkhome/fetch/sw/cluto/manual.pdf">
         * CLUTO manual</a>.
         */
        CLUTO_SPARSE,
    }

    /**
     * Uninstantiable
     */
    private MatrixIO() { }

    /**
     * Converts the format of the input {@code matrix}, returning a temporary
     * file containing the matrix's data in the desired format.
     *
     * @param matrix a file containing a matrix to convert
     * @param current the format of the {@code matrix} file
     * @param desired the format of the returned matrix file
     *
     * @return a matrix file with the same data in the desired format
     *
     * @throws IOException if any error occurs while reading the input matrix or
     *         wring the output matrix
     */
    public static File convertFormat(File matrix, Format current,
                                     Format desired) throws IOException {
        return convertFormat(matrix, current, desired, false);
    }      

    /**
     * Converts the format of the input {@code matrix}, returning a temporary
     * file containing the matrix's data in the desired format.
     *
     * @param matrix a file containing a matrix to convert
     * @param current the format of the {@code matrix} file
     * @param desired the format of the returned matrix file
     * @param transpose {@code true} if data in the input matrix should be
     *        transposed while converting formats to the output matrix.
     *
     * @return a matrix file with the same data in the desired format
     *
     * @throws IOException if any error occurs while reading the input matrix or
     *         wring the output matrix
     */
    public static File convertFormat(File matrix, Format current,
                                     Format desired, boolean transpose)
            throws IOException {
        if (!transpose && current.equals(desired)) {
            return matrix;
        }
       
        // Special cases for optimized formats
        switch (current) {
        case MATLAB_SPARSE: {
            if (desired.equals(Format.SVDLIBC_SPARSE_TEXT)) {
                File output = File.createTempFile(
                    "matlab-to-SVDLIBC-sparse-text",".dat");
                output.deleteOnExit();
                matlabToSvdlibcSparseText(matrix, output, transpose);
                return output;
            }
            if (desired.equals(Format.SVDLIBC_SPARSE_BINARY)) {
                File output = File.createTempFile(
                    "matlab-to-SVDLIBC-sparse-binary",".dat");
                output.deleteOnExit();
                matlabToSvdlibcSparseBinary(matrix, output, transpose);
                return output;
            }
            break;
        }
        case SVDLIBC_SPARSE_BINARY: {
            if (desired.equals(Format.MATLAB_SPARSE)) {
                File output = File.createTempFile(
                    "SVDLIBC-sparse-binary-to-Matlab",".dat");
                output.deleteOnExit();
                svdlibcSparseBinaryToMatlab(matrix, output, transpose);
                return output;
            }
        }
        }

        // NOTE: the current default implementation does not try to keep the
        // matrix data on disk, which could present a memory bottleneck.
        File output = File.createTempFile("transposed",".dat");           
        // NOTE: the matrix type is not intelligently selected.  Futher work is
        // needed to switch based on the format.
        Matrix transposed = readMatrix(matrix, current,
                                       Matrix.Type.SPARSE_IN_MEMORY, true);
        writeMatrix(transposed, output, desired);
        transposed = null; // for explicit GC
        return output;
    }      

    /**
     * Returns an iterator over the matrix entries in the data file.  For sparse
     * formats that specify only non-zero, no zero valued entries will be
     * returened.  Conversely, for dense matrix formats, all of the entries,
     * including zero entries will be returned.
     *
     * @param matrixFile the file containing matrix data in the specified format
     * @param fileFormat the format of the matrix file
     *
     * @return an interator over the entries in the matrix file
     */
    public static Iterator<MatrixEntry> getMatrixFileIterator(
             File matrixFile, Format fileFormat) throws IOException {
        switch(fileFormat) {
            case DENSE_TEXT:
                return new DenseTextFileIterator(matrixFile);
            case SVDLIBC_SPARSE_BINARY:
                return new SvdlibcSparseBinaryFileIterator(matrixFile);
            case SVDLIBC_SPARSE_TEXT:
                return new SvdlibcSparseTextFileIterator(matrixFile);
            case SVDLIBC_DENSE_BINARY:
                return new SvdlibcDenseBinaryFileIterator(matrixFile);
            case CLUTO_SPARSE:
                return new ClutoSparseFileIterator(matrixFile);
            case CLUTO_DENSE:
                // Cluto dense and svdlibc's dense text formats are equivalent.
            case SVDLIBC_DENSE_TEXT:
                return new SvdlibcDenseTextFileIterator(matrixFile);
            case MATLAB_SPARSE:
                return new MatlabSparseFileIterator(matrixFile);
        }
        throw new Error("Iterating over matrices of " + fileFormat +
                        " format is not "+
                        "currently supported. Email " +
                        "s-space-research-dev@googlegroups.com to request its" +
                        "inclusion and it will be quickly added");
    }

    /**
     * Returns a {@link FileTransformer} for an {@link Matrix} file encodeded in
     * the provided {@link Format}.
     *
     * @param format the format of a {@link Matrix} that will be transformed
     *
     * @return A {@link FileTransformer} for the given format
     */
    public static FileTransformer fileTransformer(Format format) {
        switch (format) {
            case MATLAB_SPARSE:
                return new MatlabSparseFileTransformer();
            case SVDLIBC_SPARSE_TEXT:
                return new SvdlibcSparseTextFileTransformer();
            case SVDLIBC_DENSE_TEXT:
                return new SvdlibcDenseTextFileTransformer();
            case SVDLIBC_SPARSE_BINARY:
                return new SvdlibcSparseBinaryFileTransformer();
            case SVDLIBC_DENSE_BINARY:
                return new SvdlibcDenseBinaryFileTransformer();
            default:
                throw new UnsupportedOperationException(
                        "Transforming format " + format + " is currently " +
                        "not implemented.  Please email " +
                        "s-space-research-dev@googlegroups.com to have this " +
                        "implemented.");
        }
    }

    /**
     * Reads in a matrix in the {@link Format#MATLAB_SPARSE} format and writes
     * it to the output file in {@link Format#SVDLIBC_SPARSE_TEXT} format.
     */
    private static void matlabToSvdlibcSparseText(File input, File output,
                                                  boolean transpose)
            throws IOException {
        MATRIX_IO_LOGGER.info("Converting from Matlab double values to " +
                              "SVDLIBC float values; possible loss of " +
                              "precision");           
        BufferedReader br = new BufferedReader(new FileReader(input));
        Map<Integer,Integer> colToNonZero = new HashMap<Integer,Integer>();

        // read through once to get matrix dimensions
        int rows = 0, cols = 0, nonZero = 0;       
        for (String line = null; (line = br.readLine()) != null; ) {
            String[] rowColVal = line.split("\\s+");
            int row, col;
            if (transpose) {
                 row = Integer.parseInt(rowColVal[1]);
                 col = Integer.parseInt(rowColVal[0]);
            }
            else {
                 row = Integer.parseInt(rowColVal[0]);
                 col = Integer.parseInt(rowColVal[1]);
            }
            if (row > rows)
                rows = row;
            if (col > cols)
                cols = col;
            ++nonZero;

            // NOTE: subtract by 1 here because Matlab arrays start at 1, while
            // SVDLIBC arrays start at 0.
            Integer colCount = colToNonZero.get(col-1);
            colToNonZero.put(col-1, (colCount == null) ? 1 : colCount + 1);
        }
        br.close();

        // print out the header information
        PrintWriter pw = new PrintWriter(output);
        pw.println(rows + "\t" + cols + "\t" + nonZero);

        // Process the entire array in chunks in case the matlab array is too
        // big to fit into memory.

        // REMINDER: this should probably be chosen based on the number of rows
        // and their expected density
        int chunkSize = 1000;

        // This keeps track of the last columns printed.  We need this outside
        // the loop to ensure that blank columns at the end of a chunk are still
        // printed by the next non-zero chunk
        int lastCol = -1;
       
        // lower bound inclusive, upper bound exclusive
        for (int lowerBound = 0, upperBound = chunkSize ; lowerBound < rows;
                 lowerBound = upperBound, upperBound += chunkSize) {
            // Once the dimensions and number of non-zero values are known,
            // reprocess the matrix, storing the rows and values for each column
            // that are inside the bounds
            br = new BufferedReader(new FileReader(input));

           
            // for each column, keep track of which in the next index into the
            // rows array that should be used to store the row index.  Also keep
            // track of the value associated for that row
            int[] colIndices = new int[cols];

            // columns are kept in sorted order
            SortedMap<Integer,int[]> colToRowIndex =
                new TreeMap<Integer,int[]>();
            SortedMap<Integer,float[]> colToRowValues =
                new TreeMap<Integer,float[]>();

            for (String line = null; (line = br.readLine()) != null; ) {
                String[] rowColVal = line.split("\\s+");
                int row, col;
                if (transpose) {
                    row = Integer.parseInt(rowColVal[1]) - 1;
                    col = Integer.parseInt(rowColVal[0]) - 1;
                }
                else {
                    row = Integer.parseInt(rowColVal[0]) - 1;
                    col = Integer.parseInt(rowColVal[1]) - 1;
                }
                // NOTE: SVDLIBC uses floats instead of doubles, which can cause
                // a loss of precision
                float val = Double.valueOf(rowColVal[2]).floatValue();
               
                // check that the current column is within the current chunk
                if (col < lowerBound || col >= upperBound)
                    continue;

                // get the arrays used to store the non-zero row indices for
                // this column and the parallel array that stores the
                // row-index's value
                int[] rowIndices = colToRowIndex.get(col);
                float[] rowValues = colToRowValues.get(col);
                if (rowIndices == null) {
                    rowIndices = new int[colToNonZero.get(col)];
                    rowValues = new float[colToNonZero.get(col)];
                    colToRowIndex.put(col,rowIndices);
                    colToRowValues.put(col,rowValues);
                }
               
                // determine what is the current index in the non-zero row array
                // that can be used to store this row.
                int curColIndex = colIndices[col];
                rowIndices[curColIndex] = row;
                rowValues[curColIndex] = val;
                colIndices[col] += 1;
            }   
            br.close();

            // loop through the stored column and row values, printing out for
            // each column, the number of non zero rows, followed by each row
            // index and the value.  This is the SVDLIBC sparse text format.
            for (Map.Entry<Integer,int[]> e : colToRowIndex.entrySet()) {
                int col = e.getKey().intValue();
                int[] nonZeroRows = e.getValue();
                float[] values = colToRowValues.get(col);
           
                if (col != lastCol) {
                    // print any missing columns in case not all the columns
                    // have data
                    for (int i = lastCol + 1; i < col; ++i)
                        pw.println(0);

                    // print the new header
                    int colCount = colToNonZero.get(col);
                    lastCol = col;
                    pw.println(colCount);           
                }
           
                for (int i = 0; i < nonZeroRows.length; ++i)
                    pw.println(nonZeroRows[i] + " " + values[i]);
            }
        }

        pw.flush();
        pw.close();
    }


    /**
     * Reads in a matrix in the {@link Format#MATLAB_SPARSE} format and writes
     * it to the output file in {@link Format#SVDLIBC_SPARSE_BINARY} format.
     */
    private static void matlabToSvdlibcSparseBinary(File input, File output,
                                                    boolean transpose)
            throws IOException {
        MATRIX_IO_LOGGER.info("Converting from Matlab double values to " +
                              "SVDLIBC float values; possible loss of " +
                              "precision");           
        BufferedReader br = new BufferedReader(new FileReader(input));
        Map<Integer,Integer> colToNonZero = new HashMap<Integer,Integer>();

        // read through once to get matrix dimensions
        int rows = 0, cols = 0, nonZero = 0;       
        for (String line = null; (line = br.readLine()) != null; ) {
            String[] rowColVal = line.split("\\s+");
            int row, col;
            if (transpose) {
                 row = Integer.parseInt(rowColVal[1]);
                 col = Integer.parseInt(rowColVal[0]);
            }
            else {
                 row = Integer.parseInt(rowColVal[0]);
                 col = Integer.parseInt(rowColVal[1]);
            }
            if (row > rows)
                rows = row;
            if (col > cols)
                cols = col;
            ++nonZero;

            // NOTE: subtract by 1 here because Matlab arrays start at 1, while
            // SVDLIBC arrays start at 0.
            Integer colCount = colToNonZero.get(col-1);
            colToNonZero.put(col-1, (colCount == null) ? 1 : colCount + 1);
        }
        br.close();

        // Print out the header information
        DataOutputStream dos = new DataOutputStream(
            new BufferedOutputStream(new FileOutputStream(output)));
        dos.writeInt(rows);
        dos.writeInt(cols);
        dos.writeInt(nonZero);

        // Process the entire array in chunks in case the matlab array is too
        // big to fit into memory.

        // REMINDER: this should probably be chosen based on the number of rows
        // and their expected density
        int chunkSize = 1000;

        // This keeps track of the last columns printed.  We need this outside
        // the loop to ensure that blank columns at the end of a chunk are still
        // printed by the next non-zero chunk
        int lastCol = -1;
       
        // lower bound inclusive, upper bound exclusive
        for (int lowerBound = 0, upperBound = chunkSize ; lowerBound < rows;
                 lowerBound = upperBound, upperBound += chunkSize) {
            // Once the dimensions and number of non-zero values are known,
            // reprocess the matrix, storing the rows and values for each column
            // that are inside the bounds
            br = new BufferedReader(new FileReader(input));

            // for each column, keep track of which in the next index into the
            // rows array that should be used to store the row index.  Also keep
            // track of the value associated for that row
            int[] colIndices = new int[cols];

            // columns are kept in sorted order
            SortedMap<Integer,int[]> colToRowIndex =
                new TreeMap<Integer,int[]>();
            SortedMap<Integer,float[]> colToRowValues =
                new TreeMap<Integer,float[]>();

            for (String line = null; (line = br.readLine()) != null; ) {
                String[] rowColVal = line.split("\\s+");
                int row, col;
                if (transpose) {
                    row = Integer.parseInt(rowColVal[1]) - 1;
                    col = Integer.parseInt(rowColVal[0]) - 1;
                }
                else {
                    row = Integer.parseInt(rowColVal[0]) - 1;
                    col = Integer.parseInt(rowColVal[1]) - 1;
                }
                // NOTE: SVDLIBC uses floats instead of doubles, which can cause
                // a loss of precision
                float val = Double.valueOf(rowColVal[2]).floatValue();
               
                // check that the current column is within the current chunk
                if (col < lowerBound || col >= upperBound)
                    continue;

                // get the arrays used to store the non-zero row indices for
                // this column and the parallel array that stores the
                // row-index's value
                int[] rowIndices = colToRowIndex.get(col);
                float[] rowValues = colToRowValues.get(col);
                if (rowIndices == null) {
                    rowIndices = new int[colToNonZero.get(col)];
                    rowValues = new float[colToNonZero.get(col)];
                    colToRowIndex.put(col,rowIndices);
                    colToRowValues.put(col,rowValues);
                }
               
                // determine what is the current index in the non-zero row array
                // that can be used to store this row.
                int curColIndex = colIndices[col];
                rowIndices[curColIndex] = row;
                rowValues[curColIndex] = val;
                colIndices[col] += 1;
            }   
            br.close();

            // loop through the stored column and row values, printing out for
            // each column, the number of non zero rows, followed by each row
            // index and the value.  This is the SVDLIBC sparse text format.
            for (Map.Entry<Integer,int[]> e : colToRowIndex.entrySet()) {
                int col = e.getKey().intValue();
                int[] nonZeroRows = e.getValue();
                float[] values = colToRowValues.get(col);
           
                if (col != lastCol) {
                    // print any missing columns in case not all the columns
                    // have data
                    for (int i = lastCol + 1; i < col; ++i)
                        dos.writeInt(0);

                    // print the new header
                    int colCount = colToNonZero.get(col);
                    lastCol = col;
                    dos.writeInt(colCount);           
                }
           
                for (int i = 0; i < nonZeroRows.length; ++i) {
                    dos.writeInt(nonZeroRows[i]);
                    dos.writeFloat(values[i]);
                }
            }
        }
        dos.close();
    }

    /**
     * Reads in a matrix in the {@link Format#SVDLIBC_SPARSE_BINARY} format and
     * writes it to the output file in the {@link Format#MATLAB_SPARSE} format.
     */
    private static void svdlibcSparseBinaryToMatlab(File input, File output,
                                                    boolean transpose)
            throws IOException {
       
        DataInputStream dis = new DataInputStream(
            new BufferedInputStream(new FileInputStream(input)));

        PrintWriter pw = new PrintWriter(output);

        int rows = dis.readInt();
        int cols = dis.readInt();
        int entries = dis.readInt();

        // SVDLIBC sparse binary is organized as column data. 
        int entriesSeen = 0;
        int col = 0;
        for (; entriesSeen < entries; ++col) {
            int nonZero = dis.readInt();

            for (int i = 0; i < nonZero; ++i, ++entriesSeen) {
                int row = dis.readInt();
                float val = dis.readFloat();
                if (transpose)
                    pw.println((1 + col) + " " + (1 + row) + " " + val);
                else
                    pw.println((1 + row) + " " + (1 + col) + " " + val);
            }
        }
       
        dis.close();
        pw.close();
    }
   
    /**
     * Reads in the content of the file as a two dimensional Java array matrix.
     * This method has been deprecated in favor of using purely {@link Matrix}
     * objects for all operations.  If a Java array is needed, the {@link
     * Matrix#toArray()} method may be used to generate one.
     *
     * @return a two-dimensional array of the matrix contained in provided file
     */       
    @Deprecated public static double[][] readMatrixArray(File input,
                                                         Format format)
            throws IOException {
        // Use the same code path for reading the Matrix object and then dump to
        // an array.  This comes at a potential 2X space usage, but greatly
        // reduces the possibilities for bugs.
        return readMatrix(input, format).toDenseArray();
    }
   
    /**
     * Converts the contents of a matrix file as a {@link Matrix} object, using
      * the provided format as a hint for what kind to create.  The
      * type of {@code Matrix} object created will assuming that the entire
      * matrix can fit in memory based on the format of the file speicfied
      * Note that the returned {@link Matrix} instance is not backed by the data
      * on file; changes to the {@code Matrix} will <i>not</i> be reflected in
      * the original file's data.
      *
      * @param matrix a file contain matrix data
      * @param format the format of the file
      *
      * @return the {@code Matrix} instance that contains the data in the
      *         provided file
      */
     public static Matrix readMatrix(File matrix, Format format)
             throws IOException {
         switch (format) {
             // Assume all sparse formats will fit in memory.
             case SVDLIBC_SPARSE_TEXT:
             case SVDLIBC_SPARSE_BINARY:
             case MATLAB_SPARSE:
             case CLUTO_SPARSE:
                 return readMatrix(matrix, format,
                                   Type.SPARSE_IN_MEMORY, false);
             // Assume all dense formats will fit in memory.
             case SVDLIBC_DENSE_TEXT:
             case SVDLIBC_DENSE_BINARY:
             case DENSE_TEXT:
             case CLUTO_DENSE:
                 return readMatrix(matrix, format,
                                   Type.DENSE_IN_MEMORY, false);
             default:
                 throw new Error(
                         "Reading matrices of " + format + " format is not "+
                         "currently supported. Email " +
                         "s-space-research-dev@googlegroups.com to request " +
                         "its inclusion and it will be quickly added");
         }
     }
    /**
     * Converts the contents of a matrix file as a {@link Matrix} object, using
     * the provided type description as a hint for what kind to create.  The
     * type of {@code Matrix} object created will be based on an estimate of
     * whether the data will fit into the available memory.  Note that the
     * returned {@link Matrix} instance is not backed by the data on file;
     * changes to the {@code Matrix} will <i>not</i> be reflected in the
     * original file's data.
     *
     * @param matrix a file contain matrix data
     * @param format the format of the file
     * @param matrixType the expected type and behavior of the matrix in
     *        relation to memory.  This value will be used as a hint for what
     *        kind of {@code Matrix} instance to create
     *
     * @return the {@code Matrix} instance that contains the data in the
     *         provided file
     */
    public static Matrix readMatrix(File matrix, Format format,
                                    Type matrixType)
            throws IOException {
        return readMatrix(matrix, format, matrixType, false);
    }

    /**
     * Converts the contents of a matrix file as a {@link Matrix} object, using
     * the provided type description as a hint for what kind to create.  The
     * type of {@code Matrix} object created will be based on an estimate of
     * whether the data will fit into the available memory.  Note that the
     * returned {@link Matrix} instance is not backed by the data on file;
     * changes to the {@code Matrix} will <i>not</i> be reflected in the
     * original file's data.
     *
     * @param matrix a file contain matrix data
     * @param format the format of the file
     * @param matrixType the expected type and behavior of the matrix in
     *        relation to memory.  This value will be used as a hint for what
     *        kind of {@code Matrix} instance to create
     * @param transposeOnRead {@code true} if the matrix should be transposed as
     *        its data is read in.  For certain formats, this is more efficient
     *        than reading the data in and then transposing it directly.
     *
     * @return the {@code Matrix} instance that contains the data in the
     *         provided file, optionally transposed from its original format
     *
     * @throws IOException if any error occurs while reading in the matrix data
     */
    public static Matrix readMatrix(File matrix, Format format,
                                    Type matrixType, boolean transposeOnRead)
            throws IOException {

        try {
            switch(format) {
            case DENSE_TEXT:
                return readDenseTextMatrix(matrix, matrixType, transposeOnRead);
               
            case MATLAB_SPARSE:
                return readMatlabSparse(matrix, matrixType, transposeOnRead);
               
            case CLUTO_SPARSE:
                return readClutoSparse(matrix, matrixType, transposeOnRead);
               
            case SVDLIBC_SPARSE_TEXT:
                return readSparseSVDLIBCtext(matrix, matrixType, transposeOnRead);
               
                // These two formats are equivalent
            case CLUTO_DENSE:
            case SVDLIBC_DENSE_TEXT:
                return readDenseSVDLIBCtext(matrix, matrixType, transposeOnRead);
               
            case SVDLIBC_SPARSE_BINARY:
                return readSparseSVDLIBCbinary(matrix, matrixType, transposeOnRead);
               
            case SVDLIBC_DENSE_BINARY:
                return readDenseSVDLIBCbinary(matrix, matrixType, transposeOnRead);
            }
        } catch (EOFException eofe) {
            // Rethrow with more specific type information
            throw new MatrixIOException("Matrix file " + matrix + " appeared "
                + "truncated, or was missing expected values at the end of its "
                + "contents.");
        }
        throw new Error("Reading matrices of " + format + " format is not "+
                        "currently supported. Email " +
                        "s-space-research-dev@googlegroups.com to request its "+
                        "inclusion and it will be quickly added");
    }

    /**
     * Creates a {@code Matrix} from the data encoded as {@link
     * Format#CLUTO_SPARSE} in provided file.
     *
     * @param matrix The matrix file to read from
     * @param matrixType The expected format of {@code matrix}
     * @param transposeOnRead If true, the returned matrix will be a transposed
     *        form of the data in {@code matrix}.
     *
     * @return A {@link SparseMatrix} containing the data in {@code matrix}.
     */
    private static Matrix readClutoSparse(File matrix, Type matrixType,
                                          boolean transposeOnRead)
            throws IOException {
        BufferedReader br = new BufferedReader(new FileReader(matrix));

        String[] rowCol = br.readLine().split("\\s+");

        // unknown number of rows, so do a quick scan to determine it
        int rows = Integer.parseInt(rowCol[0]);
        int cols = Integer.parseInt(rowCol[1]);

        // Create the matrix.
        Matrix m = (transposeOnRead)
            ? Matrices.create(cols, rows, matrixType)
            : Matrices.create(rows, cols, matrixType);

        // Cluto Sparse stores each row's values on a single line in the form of
        // "col value" tuples with everything separated by space.
        int row = 0;
        for (String line = null; (line = br.readLine()) != null; ++row) {
            String[] colValuePairs = line.split("\\s+");
            for (int i = 0; i < colValuePairs.length; i+=2) {
                int col = Integer.parseInt(colValuePairs[i]) - 1;
                double value = Double.parseDouble(colValuePairs[i+1]);
                // Store the value in the matrix.
                if (transposeOnRead)
                    m.set(col, row, value);
                else
                    m.set(row, col, value);
            }
        }

        return m;
    }

    /**
     * Creates a {@code Matrix} from the data encoded as {@link
     * Format#DENSE_TEXT} in provided file.
     *
     * @param matrix
     * @param matrixType
     *
     * @return as
     */
    private static Matrix readDenseTextMatrix(File matrix, Type matrixType,
                                              boolean transposeOnRead)
            throws IOException {
        BufferedReader br = new BufferedReader(new FileReader(matrix));

        // unknown number of rows, so do a quick scan to determine it
        int rows = 0;
        int cols = -1;
        for (String line = null; (line = br.readLine()) != null; rows++) {
            // Create a scanner to parse out the values from the current line of
            // the file.
            Scanner s = new Scanner(line);
            s.useLocale(Locale.US);
            int vals = 0;
            // Count how many columns the row has
            for (; s.hasNextDouble(); vals++, s.nextDouble())
                ;

            // Base case if the number of columns has not been set
            if (cols == -1)
                cols = vals;
            // Otherwise, ensure that the number of values in the current row
            // matches the number of values seen in the first
            else {
                if (cols != vals) {
                    throw new MatrixIOException("line " + (rows + 1) +
                        " contains an inconsistent number of columns. " +
                        cols + " columns expected versus " + vals + " found.");
                }
            }
        }
        br.close();

        if (MATRIX_IO_LOGGER.isLoggable(Level.FINE)) {
            MATRIX_IO_LOGGER.fine("reading in text matrix with " + rows  +
                                  " rows and " + cols + " cols");
        }

        // Once the matrix has had its dimensions determined, re-open the file
        // to load the data into a Matrix instance.  Use a Scanner to parse the
        // text for us.
        Scanner scanner = new Scanner(matrix);
        scanner.useLocale(Locale.US);
        Matrix m = (transposeOnRead)
            ? Matrices.create(cols, rows, matrixType)
            : Matrices.create(rows, cols, matrixType);
       
        for (int row = 0; row < rows; ++row) {
            for (int col = 0; col < cols; ++col) {
                double d = scanner.nextDouble();
                if (transposeOnRead)
                    m.set(col, row, d);
                else
                    m.set(row, col, d);
            }
        }
        scanner.close();
       
        return m;
    }

    /**
     * Creates a {@code Matrix} from the data encoded as {@link
     * Format#SVDLIBC_DENSE_TEXT} in provided file.
     *
     * @param matrix
     * @param matrixType
     *
     * @return a matrix whose data was specified by the provided file
     */
    private static Matrix readDenseSVDLIBCtext(File matrix, Type matrixType,
                                               boolean transposeOnRead)
            throws IOException {
        BufferedReader br = new BufferedReader(new FileReader(matrix));

        // Note that according to the formatting, spaces and new lines are
        // equivalent.  Therefore, someone could just print all of the matrix
        // values on a single line.
       
        int rows = -1;
        int cols = -1;
        int valuesSeen = 0;
        // REMINDER: possibly use on disk if the matrix is too big
        Matrix m = null;

        for (String line = null; (line = br.readLine()) != null; ) {
            String[] vals = line.split("\\s+");
            for (int i = 0; i < vals.length; ++i) {
                // rows is specified first
                if (rows == -1) {
                    rows = Integer.parseInt(vals[i]);
                }
                // cols will be second
                else if (cols == -1) {
                    cols = Integer.parseInt(vals[i]);

                    // once both rows and cols have been assigned, create the
                    // matrix
                    m = (transposeOnRead)
                        ? Matrices.create(cols, rows, matrixType)
                        : Matrices.create(rows, cols, matrixType);
                    MATRIX_IO_LOGGER.log(Level.FINE,
                        "created matrix of size {0} x {1}",
                        new Object[] {Integer.valueOf(rows),
                                      Integer.valueOf(cols)});
                }
                else {
                    int row = valuesSeen / cols;
                    int col = valuesSeen % cols;

                    double val = Double.parseDouble(vals[i]);

                    if (transposeOnRead)
                        m.set(col, row, val);
                    else
                        m.set(row, col, val);
               
                    // increment the number of values seen to properly set the
                    // next index of the matrix
                    ++valuesSeen;
                }
            }
        }
       
        br.close();
        return m;
    }   

    /**
     * Creates a {@code Matrix} from the data encoded as {@link
     * Format#SVDLIBC_DENSE_BINARY} in provided file.
     *
     * @param matrix
     * @param matrixType
     *
     * @return a matrix whose data was specified by the provided file
     */
    private static Matrix readDenseSVDLIBCbinary(File matrix, Type matrixType,
                                                 boolean transposeOnRead)
            throws IOException {
        DataInputStream dis = new DataInputStream(
            new BufferedInputStream(new FileInputStream(matrix)));

        int rows = dis.readInt();
        int cols = dis.readInt();
        Matrix m = (transposeOnRead)
            ? Matrices.create(cols, rows, matrixType)
            : Matrices.create(rows, cols, matrixType);
       
        if (transposeOnRead) {
            for (int row = 0; row < rows; ++row) {
                for (int col = 0; col < cols; ++col) {
                    m.set(col, row, dis.readFloat());
                }
            }
        }
        else {
            for (int row = 0; row < rows; ++row) {
                for (int col = 0; col < cols; ++col) {
                    m.set(row, col, dis.readFloat());
                }
            }
        }
        dis.close();
        return m;
    }   

    /**
     * Creates a {@code Matrix} from the data encoded as {@link
     * Format#SVDLIBC_SPARSE_BINARY} in provided file.
     *
     * @param matrix
     * @param matrixType
     *
     * @return a matrix whose data was specified by the provided file
     */
    private static Matrix readSparseSVDLIBCbinary(File matrix, Type matrixType,
                                                  boolean transposeOnRead)
            throws IOException {
        DataInputStream dis = new DataInputStream(
            new BufferedInputStream(new FileInputStream(matrix)));

        int rows = dis.readInt();
        int cols = dis.readInt();
        int nz = dis.readInt();
        MATRIX_IO_LOGGER.fine(
            String.format("Creating %s matrix %d rows, %d cols, %d nz%n",
                          ((transposeOnRead) ? "transposed" : ""),
                          rows, cols, nz));
        Matrix m = null;
       
        // Special case for reading transposed data.  This avoids the log(n)
        // overhead from resorting the row data for the matrix, which can be
        // significant in large matrices.
        if (transposeOnRead) {
            SparseDoubleVector[] rowArr = new SparseDoubleVector[cols];
            int entriesSeen = 0;
            int col = 0;
            int curRow = 0;
            for (; entriesSeen < nz; ++col) {
                int nzInCol = dis.readInt();
                int[] indices = new int[nzInCol];
                double[] vals = new double[nzInCol];
                for (int i = 0; i < nzInCol; ++i, ++entriesSeen) {
                    indices[i] = dis.readInt();
                    vals[i] = dis.readFloat();
                }
                SparseDoubleVector rowVec =
                    new CompactSparseVector(indices, vals, rows);
                rowArr[curRow] = rowVec;
                ++curRow;               
            }
            m = Matrices.asSparseMatrix(Arrays.asList(rowArr));
        }
        else {
            m = new SparseHashMatrix(rows, cols);
            int entriesSeen = 0;
            int col = 0;
            for (; entriesSeen < nz; ++col) {
                int nzInCol = dis.readInt();
                for (int i = 0; i < nzInCol; ++i, ++entriesSeen) {
                    m.set(dis.readInt(), col, dis.readFloat());
                }
            }
        }
        dis.close();

        MATRIX_IO_LOGGER.fine("Completed loading matrix");
        return m;
    }   

    /**
     * Creates a {@code Matrix} from the data encoded as {@link
     * Format#SVDLIBC_SPARSE_TEXT} in provided file.
     *
     * @param matrix
     * @param matrixType
     * @param transposeOnRead
     *
     * @return a matrix whose data was specified by the provided file
     */
     private static Matrix readMatlabSparse(
             File matrixFile,
             Type matrixType,
             boolean transposeOnRead) throws IOException {

         Matrix matrix = new GrowingSparseMatrix();
         BufferedReader br = new BufferedReader(new FileReader(matrixFile));
         for (String line = null; (line = br.readLine()) != null; ) {
             String[] rowColVal = line.split("\\s+");
             int row = Integer.parseInt(rowColVal[0]) - 1;
             int col = Integer.parseInt(rowColVal[1]) - 1;
             double value = Double.parseDouble(rowColVal[2]);
             if (transposeOnRead)
                 matrix.set(col, row, value);
             else
                 matrix.set(row, col, value);
         }
         br.close();
         return matrix;
     }


    /**
     * Creates a {@code Matrix} from the data encoded as {@link
     * Format#SVDLIBC_SPARSE_TEXT} in provided file.
     *
     * @param matrix
     * @param matrixType
     * @param transposeOnRead
     *
     * @return a matrix whose data was specified by the provided file
     */
     private static Matrix readSparseSVDLIBCtext(
             File matrix,
             Type matrixType,
             boolean transposeOnRead) throws IOException {
         BufferedReader br = new BufferedReader(new FileReader(matrix));
         String line = br.readLine();
         if (line == null)
             throw new IOException("Empty input Matrix");
             String[] numRowsColsNonZeros = line.split("\\s");
             int rows = Integer.parseInt(numRowsColsNonZeros[0]);
             int cols = Integer.parseInt(numRowsColsNonZeros[1]);
         Matrix m = (transposeOnRead)
             ? Matrices.create(cols, rows, matrixType)
             : Matrices.create(rows, cols, matrixType);
        
         for (int j = 0; j < cols && (line = br.readLine()) != null; ++j) {
             int numNonZeros = Integer.parseInt(line);
             for (int i = 0; i < numNonZeros &&
                     (line = br.readLine()) != null; ++i) {
                 String[] rowValue = line.split("\\s");
                 int row = Integer.parseInt(rowValue[0]);
                 double value = Double.parseDouble(rowValue[1]);
                 if (value != 0d) {
                     if (transposeOnRead)
                         m.set(j, row, value);
                     else
                         m.set(row, j, value);
                 }
             }
         }
         br.close();
         return m;
     }
    /**
     * Writes the matrix to the specified output file in the provided format
     *
     * @param matrix the matrix to be written
     * @param output the file in which the matrix should be written
     * @param format the data format in which the matrix's data should be
     *        written
     *
     * @throws IllegalArgumentException if the input matrix is 0-dimensional
     * @throws IOException if an error occurs while writing to the output file
     */
    public static void writeMatrix(Matrix matrix, File output, Format format)
        throws IOException {
        if (matrix.rows() == 0 || matrix.columns() == 0)
            throw new IllegalArgumentException(
                "cannot write 0-dimensional matrix");
        switch (format) {
          
        case DENSE_TEXT: {
            PrintWriter pw = new PrintWriter(output);
            for (int i = 0; i < matrix.rows(); ++i) {
                StringBuffer sb = new StringBuffer(matrix.columns() 5);
                for (int j = 0; j < matrix.columns(); ++j) {
                    sb.append(matrix.get(i,j)).append(" ");
                }
                pw.println(sb.toString());
            }
            pw.close();
            break;
        }
           
        // These two formats are equivalent
        case CLUTO_DENSE:
        case SVDLIBC_DENSE_TEXT: {
            PrintWriter pw = new PrintWriter(output);
            pw.println(matrix.rows() + " " +  matrix.columns());
            for (int i = 0; i < matrix.rows(); ++i) {
                StringBuffer sb = new StringBuffer(32);
                for (int j = 0; j < matrix.columns(); ++j) {
                    sb.append((float)(matrix.get(i,j))).append(" ");
                }
                pw.println(sb.toString());
            }
            pw.close();
            break;
        }

        case SVDLIBC_DENSE_BINARY: {
            DataOutputStream outStream = new DataOutputStream(
                new BufferedOutputStream(new FileOutputStream(output)));
            outStream.writeInt(matrix.rows());
            outStream.writeInt(matrix.columns());
            for (int i = 0; i < matrix.rows(); ++i) {
                for (int j = 0; j < matrix.columns(); ++j) {
                    outStream.writeFloat((float)matrix.get(i,j));
                }
            }
            outStream.close();
            break;
        }

        case CLUTO_SPARSE: {
            PrintWriter pw = new PrintWriter(output);
            // Count the number of non-zero values in the matrix
            int nonZero = 0;
            int rows = matrix.rows();
            for (int i = 0; i < rows; ++i) {
                DoubleVector v = matrix.getRowVector(i);
                if (v instanceof SparseVector)
                    nonZero += ((SparseVector)v).getNonZeroIndices().length;
                else {
                    for (int col = 0; col < v.length(); ++col) {
                        if (v.get(col) != 0)
                            nonZero++;
                    }
                }
            }
            // Write the header: rows cols non-zero
            pw.println(matrix.rows() + " " + matrix.columns() + " " + nonZero);
            for (int row = 0; row < rows; ++row) {
                StringBuilder sb = new StringBuilder(nonZero / rows);
                // NOTE: the columns in CLUTO start at 1, not 0, so increment
                // one to each of the columns
                DoubleVector v = matrix.getRowVector(row);
                if (v instanceof SparseVector) {
                    int[] nzIndices = ((SparseVector)v).getNonZeroIndices();
                    for (int nz : nzIndices) {
                        sb.append(nz + 1).append(" ").
                            append(v.get(nz)).append(" ");
                    }
                }
                else {
                    for (int col = 0; col < v.length(); ++col) {
                        double d = v.get(col);
                        if (d != 0)
                            sb.append(col+1).append(" ").append(d).append(" ");
                    }
                }
                pw.println(sb.toString());
            }
            pw.close();
            break;
        }

        case SVDLIBC_SPARSE_TEXT: {
            PrintWriter pw = new PrintWriter(output);
            // count the number of non-zero values for each column as well as
            // the total
            int nonZero = 0;
            int[] nonZeroPerCol = new int[matrix.columns()];
            for (int i = 0; i < matrix.rows(); ++i) {
                for (int j = 0; j < matrix.columns(); ++j) {
                    if (matrix.get(i, j) != 0) {
                        nonZero++;
                        nonZeroPerCol[j]++;
                    }
                }
            }

            // loop through the matrix a second time, printing out the number of
            // non-zero values for each column, followed by those values and
            // their associated row
            pw.println(matrix.rows() + " " + matrix.columns() + " " + nonZero);
            for (int col = 0; col < matrix.columns(); ++col) {
                pw.println(nonZeroPerCol[col]);
                if (nonZeroPerCol[col] > 0) {
                    for (int row = 0; row < matrix.rows(); ++row) {
                        double val = matrix.get(row, col);
                        if (val != 0d) {
                            // NOTE: need to convert to float since this is what
                            // SVDLIBC uses
                            pw.println(row + " " +
                                       Double.valueOf(val).floatValue());
                        }
                    }
                }
            }
            pw.close();
            break;
        }

        case SVDLIBC_SPARSE_BINARY: {
            DataOutputStream outStream = new DataOutputStream(
                new BufferedOutputStream(new FileOutputStream(output)));         
           
            // count the number of non-zero values for each column as well as
            // the total
            int nonZero = 0;
            int[] nonZeroPerCol = new int[matrix.columns()];
            for (int i = 0; i < matrix.rows(); ++i) {
                for (int j = 0; j < matrix.columns(); ++j) {
                    if (matrix.get(i, j) != 0) {
                        nonZero++;
                        nonZeroPerCol[j]++;
                    }
                }
            }

            // Write the 12 byte header data
            outStream.writeInt(matrix.rows());
            outStream.writeInt(matrix.columns());
            outStream.writeInt(nonZero);
           
            // loop through the matrix a second time, printing out the number of
            // non-zero values for each column, followed by those values and
            // their associated row
            for (int col = 0; col < matrix.columns(); ++col) {
                outStream.writeInt(nonZeroPerCol[col]);
                if (nonZeroPerCol[col] > 0) {
                    for (int row = 0; row < matrix.rows(); ++row) {
                        double val = matrix.get(row, col);
                        if (val != 0) {
                            // NOTE: need to convert to float since this is what
                            // SVDLIBC uses
                            outStream.writeInt(row);
                            outStream.writeFloat((float)val);
                        }
                    }
                }
            }
            outStream.close();
            break;
        }

        case MATLAB_SPARSE: {
            PrintWriter pw = new PrintWriter(output);
            // NOTE: Matlab's sparse matrix offers no way of specifying the
            // original matrix's dimensions.  This is only problematic if the
            // matrix contains trailing rows or columns that are all
            // zeros. Therefore to ensure that the matrix has the correct size,
            // we track the maximum values written and write a 0-value to extend
            // the matrix to its correct size
            int maxRowSeen = 0;
            int maxColSeen = 0;
            for (int i = 0; i < matrix.rows(); ++i) {
                for (int j = 0; j < matrix.columns(); ++j) {
                    if (matrix.get(i,j) == 0)
                        continue;
                    if (j > maxColSeen)
                        maxColSeen = j;
                    if (i > maxRowSeen)
                        maxRowSeen = i;
                    StringBuffer sb = new StringBuffer(32);
                    // Add 1 to index values since Matlab arrays are 1-based,
                    // not 0-based
                    sb.append(i+1).append(" ").append(j+1);
                    sb.append(" ").append(matrix.get(i,j));
                    pw.println(sb.toString());
                }
            }
            // Check whether we need to extend the matrix
            if (maxRowSeen + 1 !=  matrix.rows()
                    || maxColSeen + 1 != matrix.columns()) {
                pw.println(matrix.rows() + " " + matrix.columns() + " 0");
            }
            pw.close();
            break;                       
        }

        default:
            throw new UnsupportedOperationException(
                "writing to " + format + " is currently unsupported");
        }
    }

    @Deprecated
    public static void writeMatrixArray(double[][] matrix, File output)
        throws IOException {
        if (matrix.length == 0 || matrix[0].length == 0)
            throw new IllegalArgumentException("invalid matrix dimensions");

        PrintWriter pw = new PrintWriter(output);
        int rows = matrix.length;
        int cols = matrix[0].length;
        for (int row = 0; row < rows; ++row) {
            for (int col = 0; col < cols; ++col)
                pw.print(matrix[row][col] + ((col + 1 == cols) ? "\n" : " "));
        }

        pw.close();
        return;
    }
}
TOP

Related Classes of edu.ucla.sspace.matrix.MatrixIO

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.