Package edu.ucla.sspace.esa

Source Code of edu.ucla.sspace.esa.ExplicitSemanticAnalysis

/*
* Copyright 2009 Keith Stevens
*
* This file is part of the S-Space package and is covered under the terms and
* conditions therein.
*
* The S-Space package is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as published
* by the Free Software Foundation and distributed hereunder to you.
*
* THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
* EXPRESS OR IMPLIED ARE MADE.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
* NO REPRESENTATIONS OR WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY
* PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
* WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
* RIGHTS.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

package edu.ucla.sspace.esa;

import edu.ucla.sspace.basis.BasisMapping;
import edu.ucla.sspace.basis.StringBasisMapping;

import edu.ucla.sspace.common.SemanticSpace;
import edu.ucla.sspace.common.GenericTermDocumentVectorSpace;

import edu.ucla.sspace.matrix.MatrixBuilder;
import edu.ucla.sspace.matrix.MatrixFile;
import edu.ucla.sspace.matrix.MatrixIO;
import edu.ucla.sspace.matrix.MatrixIO.Format;
import edu.ucla.sspace.matrix.SvdlibcSparseBinaryMatrixBuilder;
import edu.ucla.sspace.matrix.TfIdfTransform;

import edu.ucla.sspace.util.GrowableArrayList;
import edu.ucla.sspace.util.SparseArray;
import edu.ucla.sspace.util.SparseHashArray;

import edu.ucla.sspace.vector.SparseVector;
import edu.ucla.sspace.vector.Vector;

import java.io.File;
import java.io.IOError;
import java.io.IOException;

import java.util.Collections;
import java.util.List;
import java.util.Properties;

import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;


/**
* An implementation of Explicit Semanic Analysis proposed by Evgeniy
* Gabrilovich and Shaul Markovitch.    For full details see:
*
* <ul>
*
*     <li style="font-family:Garamond, Georgia, serif"> Evgeniy Gabrilovich and
*         Shaul Markovitch. (2007). "Computing Semantic Relatedness using
*         Wikipedia-based Explicit Semantic Analysis," Proceedings of The 20th
*         International Joint Conference on Artificial Intelligence (IJCAI),
*         Hyderabad, India, January 2007. </li>
*
* </ul>
*
* @author Keith Stevens
*/
public class ExplicitSemanticAnalysis extends GenericTermDocumentVectorSpace {

    public static final String ESA_SSPACE_NAME =
        "esa-semantic-space";

    /**
     * A mapping from document indices to document labels.  This {@link List}
     * must both be thread safe and able to dynamically grow it's current length
     * since multiple calls to {@link List#set} can be made in parallel, and the
     * final number of documents is unknown.
     */
    private final List<String> documentLabels;

    /**
     * Constructs a new {@link ExplicitSemanticAnalysis} instance.
     */
    public ExplicitSemanticAnalysis() throws IOException {
        super(true, new StringBasisMapping(),
              new SvdlibcSparseBinaryMatrixBuilder());

        // We use a synchronized and growable array list in order to save space.
        // Since the GrowableArrayList does the growing whenever set is called,
        // the number of synchronization calls are minimized.
        documentLabels = Collections.synchronizedList(
                new GrowableArrayList<String>());
    }

    /**
     * Constructs a new {@code ExplicitSemanticAnalysis} using the provided
     * objects for processing.
     *
     * @param termToIndex The {@link BasisMapping} used to map strings to
     *        indices.
     * @param termDocumentMatrixBuilder The {@link MatrixBuilder} used to write
     *        document vectors to disk which later get processed in {@link
     *        #processSpace(Properties) processSpace}.
     *
     * @throws IOException if this instance encounters any errors when creatng
     *         the backing array files required for processing
     */
    public ExplicitSemanticAnalysis(
            BasisMapping<String, String> termToIndex,
            MatrixBuilder termDocumentMatrixBuilder) throws IOException {
        super(true, termToIndex, termDocumentMatrixBuilder);

        // We use a synchronized and growable array list in order to save space.
        // Since the GrowableArrayList does the growing whenever set is called,
        // the number of synchronization calls are minimized.
        documentLabels = Collections.synchronizedList(
                new GrowableArrayList<String>());
    }

    /**
     * Stores {@link header} at index {@link docIndex}.
     */
    protected void handleDocumentHeader(int docIndex, String header) {
        documentLabels.set(docIndex, header);
    }

    /**
     * Returns a {@link SparseArray} containing document labels for any non zero
     * value in the given {@link Vector}.  The given {@link Vector}s are
     * expected to have the same dimensionality as this {@link
     * ExplicitSemanticAnalysis} word space.  Under ESA, these returned document
     * labels can be considered the wikipedia articles that best describe the
     * vector created by combining each of the term vectors in a fragment of
     * text.
     */
    public SparseArray<String> getDocumentDescriptors(Vector documentVector) {
        if (documentVector.length() != getVectorLength())
            throw new IllegalArgumentException(
                    "An documentVector with an invalid length cannot be " +
                    "interpreted by ESA.");

        SparseArray<String> docLabels = new SparseHashArray<String>();

        // Extract the indices which are non zero and add the corresponding
        // document label to docLabels.
        if (documentVector instanceof SparseVector) {
            int[] nonZeros = ((SparseVector) documentVector).getNonZeroIndices();
            for (int index : nonZeros)
                docLabels.set(index, documentLabels.get(index));
        } else {
            for (int index = 0; index < documentVector.length(); index++)
                if (documentVector.getValue(index).doubleValue() != 0d)
                    docLabels.set(index, documentLabels.get(index));
        }

        return docLabels;
    }

    /**
     * {@inheritDoc}
     */
    public void processSpace(Properties properties) {
        try {
            MatrixFile processedSpace = processSpace(
                    new TfIdfTransform());
            wordSpace = MatrixIO.readMatrix(
                    processedSpace.getFile(), processedSpace.getFormat());
        } catch (IOException ioe) {
            throw new IOError(ioe);
        }
    }

    /**
     * {@inheritDoc}
     */
    public String getSpaceName() {
        return ESA_SSPACE_NAME;
    }
}
TOP

Related Classes of edu.ucla.sspace.esa.ExplicitSemanticAnalysis

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.