Package edu.ucla.sspace.matrix

Source Code of edu.ucla.sspace.matrix.TfIdfDocStripedTransform$TfIdfGlobalTransform

/*
* Copyright 2011 Keith Stevens
*
* This file is part of the S-Space package and is covered under the terms and
* conditions therein.
*
* The S-Space package is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as published
* by the Free Software Foundation and distributed hereunder to you.
*
* THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
* EXPRESS OR IMPLIED ARE MADE.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
* NO REPRESENTATIONS OR WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY
* PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
* WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
* RIGHTS.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/


package edu.ucla.sspace.matrix;

import edu.ucla.sspace.matrix.MatrixIO.Format;
import edu.ucla.sspace.matrix.TransformStatistics.MatrixStatistics;

import edu.ucla.sspace.vector.DoubleVector;

import java.io.File;


/**
* Tranforms a matrix according to the <a
* href="http://en.wikipedia.org/wiki/Tf%E2%80%93idf">Term frequency-Inverse
* Document Frequency</a> weighting.  The input matrix is assumed to be
* formatted as rows representing documents and columns representing words.  Each
* matrix cell indicates the number of times the columns's word occurs within
* the rows's document.  For full details see:
*
* <ul><li style="font-family:Garamond, Georgia, serif">Spärck Jones, Karen
*      (1972). "A statistical interpretation of term specificity and its
*      application in retrieval". <i>Journal of Documentation</i> <b>28</b>
*      (1): 11–21.</li></ul>
*
* @see TfIdfTransform
*
* @author Keith Stevens
*/
public class TfIdfDocStripedTransform extends BaseTransform {

    /**
     * {@inheritDoc}
     */
    protected GlobalTransform getTransform(Matrix matrix) {
        return new TfIdfGlobalTransform(matrix);
    }

    /**
     * {@inheritDoc}
     */
    protected GlobalTransform getTransform(File inputMatrixFile,
                                           MatrixIO.Format format) {
        return new TfIdfGlobalTransform(inputMatrixFile, format);
    }

    /**
     * Returns the name of this transform.
     */
    public String toString() {
        return "TF-IDF-DOC-STRIPED";
    }

    public class TfIdfGlobalTransform implements GlobalTransform {

        /**
         * The total number of terms that occur in each document.
         */
        private double[] docTermCount;

        /**
         * The total number of documents that each term occurs in.
         */
        private double[] termDocCount;

        /**
         * The total number of documents present in the matrix.
         */
        private int totalDocCount;

        /**
         * Creates an instance of {@code TfIdfGlobalTransform} from a {@link
         * Matrix}.
         */
        public TfIdfGlobalTransform(Matrix matrix) {
            MatrixStatistics stats =
                TransformStatistics.extractStatistics(matrix, false, true);
            docTermCount = stats.rowSums;
            termDocCount = stats.columnSums;
            totalDocCount = docTermCount.length;
        }
       
        /**
         * Creates an instance of {@code TfIdfGlobalTransform} from a {@code
         * File} in the format {@link Format}.
         */
        public TfIdfGlobalTransform(File inputMatrixFile, Format format) {
            MatrixStatistics stats = TransformStatistics.extractStatistics(
                    inputMatrixFile, format, false, true);
            docTermCount = stats.rowSums;
            termDocCount = stats.columnSums;
            totalDocCount = docTermCount.length;
        }

        /**
         * Computes the Term Frequency-Inverse Document Frequency for a given
         * value where {@code value} is the observed frequency of term {@code
         * row} in document {@code column}.
         *
         * @param row The index speicifying the term being observed
         * @param column The index specifying the document being observed
         * @param value The number of occurances of the term in the document.
         *
         * @return the TF-IDF of the observed value
         */
        public double transform(int row, int column, double value) {
            double tf = value / docTermCount[row];
            double idf =
                Math.log(totalDocCount / (1 + termDocCount[column]));
            return tf * idf;
        }

        /**
         * Throws an {@link UnsupportedOperationException} if called.
         */
        public double transform(int row, DoubleVector column) {
            throw new UnsupportedOperationException(
                "Cannot compute weighting given a column an a " +
                "doc-striped matrix.");
        }
    }
}
TOP

Related Classes of edu.ucla.sspace.matrix.TfIdfDocStripedTransform$TfIdfGlobalTransform

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.