// Package edu.ucla.sspace.common
//
// Source code of edu.ucla.sspace.common.StaticSemanticSpace

/*
* Copyright 2009 Keith Stevens
*
* This file is part of the S-Space package and is covered under the terms and
* conditions therein.
*
* The S-Space package is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as published
* by the Free Software Foundation and distributed hereunder to you.
*
* THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
* EXPRESS OR IMPLIED ARE MADE.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
* NO REPRESENTATIONS OR WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY
* PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
* WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
* RIGHTS.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

package edu.ucla.sspace.common;

import edu.ucla.sspace.common.SemanticSpaceIO.SSpaceFormat;

import edu.ucla.sspace.matrix.ArrayMatrix;
import edu.ucla.sspace.matrix.Matrices;
import edu.ucla.sspace.matrix.Matrix;
import edu.ucla.sspace.matrix.SparseMatrix;

import edu.ucla.sspace.vector.CompactSparseVector;
import edu.ucla.sspace.vector.Vector;
import edu.ucla.sspace.vector.Vectors;

import edu.ucla.sspace.util.IntegerMap;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOError;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;

import java.util.Arrays;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Properties;
import java.util.Set;

import java.util.logging.Level;
import java.util.logging.Logger;


/**
* An unmodifiable {@link SemanticSpace} whose data is loaded into memory from
* an {@code .sspace} file.  Instance of this class perform no document
* processing, and the {@code processDocument} and {@code processSpace} methods
* throw an {@link UnsupportedOperationException}
*
* <p> In general, users should call {@link
* edu.ucla.sspace.common.SemanticSpaceUtils#loadSemanticSpace(File)
* SemanticSpaceUtils.loadSemanticSpace(File)} rather than create an instance of
* this class directly.<p>
*
* This class is thread-safe
*
* @see OnDiskSemanticSpace
* @see SemanticSpaceUtils
* @see SemanticSpaceUtils.SSpaceFormat
*/
public class StaticSemanticSpace implements SemanticSpace {

    private static final Logger LOGGER =
        Logger.getLogger(StaticSemanticSpace.class.getName());

    /**
     * The {@code Matrix} which contains the data read from a finished {@link
     * SemanticSpace}.
     */
    private Matrix wordSpace;

    /**
     * A mapping of terms to row indexes.  Also serves as a quick means of
     * retrieving the words known by this {@link SemanticSpace}.
     */
    private Map<String, Integer> termToIndex ;

    /**
     * The name of this semantic space.
     */
    private String spaceName;   

    /**
     * Creates the {@link StaticSemanticSpace} from the file.
     *
     * @param filename the name of a file containing {@code SemanticSpace} data.
     *
     * @throws IOException if any I/O exception occurs when reading the semantic
     *         space data from the file
     */
    public StaticSemanticSpace(String filename) throws IOException {
        this(new File(filename));
    }

    /**
     * Creates the {@link StaticSemanticSpace} from the provided file.
     *
     * @param file a file containing the data of a {@link
     *        edu.ucla.sspace.common.SemanticSpace}.
     *
     * @throws IOException if any I/O exception occurs when reading the semantic
     *         space data from the file
     */
    public StaticSemanticSpace(File file) throws IOException {
        spaceName = file.getName();
        SSpaceFormat format = SemanticSpaceIO.getFormat(file);
        if (format == null)
            throw new Error("Unrecognzied format in " +
                            "file: " + file.getName());
        DataInputStream dis = new DataInputStream(
            new BufferedInputStream(new FileInputStream(file)));
        // Read off the four byte header from the stream so the loading methods
        // do not see the data.  This is necessary to support older formats that
        // did not include the header.
        dis.readInt();
        loadFromFormat(dis, format);
    }

    /**
     * Creates the {@link StaticSemanticSpace} from the provided file in the
     * specified format.  This method is only to be used in accessing {@code
     * SemanticSpace} files that do not include the format in their file
     * contents.
     *
     * @param file a file containing the data of a {@link
     *        edu.ucla.sspace.common.SemanticSpace}.
     * @param format the format of the semantic space
     *
     * @throws IOException if any I/O exception occurs when reading the semantic
     *         space data from the file
     */
    @Deprecated public StaticSemanticSpace(File file, SSpaceFormat format)
            throws IOException {
        loadFromFormat(new BufferedInputStream(
                           new FileInputStream(file)), format);
        spaceName = file.getName();
    }

    /**
     * Loads the semantic space data from the specified stream, using the format
     * to determine how the data is layed out internally within the stream.
     *
     * @param is the input stream from which the semantic space will be read
     * @param format the internal data formatting of the semantic space
     */
    private void loadFromFormat(InputStream is, SSpaceFormat format)
            throws IOException {
        // NOTE: Use a LinkedHashMap here because this will ensure that the
        // words are returned in the same row-order as the matrix.  This
        // generates better disk I/O behavior for accessing the matrix since
        // each word is directly after the previous on disk.
        termToIndex = new LinkedHashMap<String, Integer>();
        Matrix m = null;
        long start = System.currentTimeMillis();

            switch (format) {
            case TEXT:
                m = Matrices.synchronizedMatrix(loadText(is));
                break;
            case BINARY:
                m = Matrices.synchronizedMatrix(loadBinary(is));
                break;
           
            // REMINDER: we don't use synchronized here because the current
            // sparse matrix implementations are thread-safe.  We really should
            // be aware of this for when the file-based sparse matrix gets
            // implemented.  -jurgens 05/29/09
            case SPARSE_TEXT:
                m = loadSparseText(is);
                break;
            case SPARSE_BINARY:
                m = loadSparseBinary(is);
                break;
        }
                   
        if (LOGGER.isLoggable(Level.FINE)) {
            LOGGER.fine("loaded " + format + " .sspace file in " +
                (System.currentTimeMillis() - start) + "ms");
       
        wordSpace = m;
    }

    /**
     * Loads the {@link SemanticSpace} from the text formatted file, adding its
     * words to {@link #termToIndex} and returning the {@code Matrix} containing
     * the space's vectors.
     *
     * @param sspaceFile a file in {@link SSpaceFormat#TEXT text} format
     */
    private Matrix loadText(InputStream fileStream) throws IOException {
        Matrix matrix = null;

        BufferedReader br =
                new BufferedReader(new InputStreamReader(fileStream));
        String line = br.readLine();
        if (line == null)
            throw new IOException("Empty .sspace file");
            // Strip off the 4-byte (2 char) header
        String[] dimensions = line.split("\\s");
        int rows = Integer.parseInt(dimensions[0]);
        int columns = Integer.parseInt(dimensions[1]);
        int index = 0;
       
        // reusable array for writing rows into the matrix
        double[] row = new double[columns];
       
        matrix = new ArrayMatrix(rows, columns);

        while ((line = br.readLine()) != null) {
            if (index >= rows)
                throw new IOException("More rows than specified");
            String[] termVectorPair = line.split("\\|");
            String[] values = termVectorPair[1].split("\\s");
            termToIndex.put(termVectorPair[0], index);
            if (values.length != columns) {
                throw new IOException(
                            "improperly formated semantic space file");
            }
            for (int c = 0; c < columns; ++c) {
                double d = Double.parseDouble(values[c]);
                row[c] = d;
                // matrix.set(index, c, d);
            }
            matrix.setRow(index, row);
            index++;
        }
        if (index != rows)
            throw new IOException(String.format(
                "Expected %d rows; saw %d", rows, index));
        return matrix;   
    }

    /**
     * Loads the {@link SemanticSpace} from the text formatted file, adding its
     * words to {@link #termToIndex} and returning the {@code Matrix} containing
     * the space's vectors.
     *
     * @param sspaceFile a file in {@link SSpaceFormat#TEXT text} format
     */
    private Matrix loadSparseText(InputStream fileStream) throws IOException {
        Matrix matrix = null;

        BufferedReader br =
                new BufferedReader(new InputStreamReader(fileStream));
        String line = br.readLine();
        if (line == null)
            throw new IOError(new Throwable(
                        "An empty file has been passed in"));
        String[] dimensions = line.split("\\s");
        int rows = Integer.parseInt(dimensions[0]);
        int columns = Integer.parseInt(dimensions[1]);

        int row = 0;
       
        // create a sparse matrix
        matrix = Matrices.create(rows, columns, false);
        while ((line = br.readLine()) != null) {
            String[] termVectorPair = line.split("\\|");
            String[] values = termVectorPair[1].split(",");
            termToIndex.put(termVectorPair[0], row);

            // even indicies are columns, odd are the values
            for (int i = 0; i < values.length; i +=2 ) {
                int col = Integer.parseInt(values[i]);
                double val = Double.parseDouble(values[i+1]);
                matrix.set(row, col, val);
            }
            row++;
        }
        return matrix;   
    }

    /**
     * Loads the {@link SemanticSpace} from the binary formatted file, adding
     * its words to {@link #termToIndex} and returning the {@code Matrix}
     * containing the space's vectors.
     *
     * @param sspaceFile a file in {@link SSpaceFormat#BINARY binary} format
     */
    private Matrix loadBinary(InputStream fileStream) throws IOException {
        DataInputStream dis = new DataInputStream(fileStream);
        int rows = dis.readInt();
        int cols = dis.readInt();

        // create a dense matrix
        Matrix m = new ArrayMatrix(rows, cols);
        double[] d = new double[cols];
        for (int row = 0; row < rows; ++row) {
            String word = dis.readUTF();
            termToIndex.put(word, row);

            for (int col = 0; col < cols; ++col) {
                d[col] = dis.readDouble();
            }
            m.setRow(row, d);
        }
        return m;
    }

    /**
     * Loads the {@link SemanticSpace} from the binary formatted file, adding
     * its words to {@link #termToIndex} and returning the {@code Matrix}
     * containing the space's vectors.
     *
     * @param sspaceFile a file in {@link SSpaceFormat#BINARY binary} format
     */
    private Matrix loadSparseBinary(InputStream fileStream) throws IOException {
        DataInputStream dis = new DataInputStream(fileStream);
        int rows = dis.readInt();
        int cols = dis.readInt();
        // Create the sparse matrix as individual rows since we can fully
        // allocate the indices values at once, rather than pay the log(n)
        // overhead of sorting them
        CompactSparseVector[] rowVectors = new CompactSparseVector[rows];

        for (int row = 0; row < rows; ++row) {
            String word = dis.readUTF();
            termToIndex.put(word, row);
           
            int nonZero = dis.readInt();
            int[] indices = new int[nonZero];
            double[] values = new double[nonZero];
            for (int i = 0; i < nonZero; ++i) {
                int nz = dis.readInt();
                double val = dis.readDouble();
                indices[i] = nz;
                values[i] = val;
            }
            rowVectors[row] = new CompactSparseVector(indices, values, cols);
        }
        return Matrices.asSparseMatrix(Arrays.asList(rowVectors));
    }

    /**
     * {@inheritDoc}
     */
    public Set<String> getWords() {
        return Collections.unmodifiableSet(termToIndex.keySet());
    }
 
    /**
     * {@inheritDoc}
     */
    public Vector getVector(String term) {
        Integer index = termToIndex.get(term);
        return (index == null)
            ? null
            : wordSpace.getRowVector(index.intValue());
    }

    /**
     * {@inheritDoc}
     */
    public String getSpaceName() {
        return spaceName;
    }

    /**
     * {@inheritDoc}
     */
    public int getVectorLength() {
        return wordSpace.columns();
    }

    /**
     * Not supported; throws an {@link UnsupportedOperationException} if called.
     *
     * @throws an {@link UnsupportedOperationException} if called
     */
    public void processDocument(BufferedReader document) {
        throw new UnsupportedOperationException(
            "StaticSemanticSpace instances cannot be updated");
    }

    /**
     * Not supported; throws an {@link UnsupportedOperationException} if called.
     *
     * @throws an {@link UnsupportedOperationException} if called
     */
    public void processSpace(Properties props) {
        throw new UnsupportedOperationException(
            "StaticSemanticSpace instances cannot be updated");
    }
}
// TOP
//
// Related Classes of edu.ucla.sspace.common.StaticSemanticSpace
//
// TOP
// Copyright © 2018 www.massapi.com. All rights reserved.
// All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.