Package edu.ucla.sspace.ri

Source Code of edu.ucla.sspace.ri.HadoopRandomIndexing

/*
* Copyright 2010 David Jurgens
*
* This file is part of the S-Space package and is covered under the terms and
* conditions therein.
*
* The S-Space package is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as published
* by the Free Software Foundation and distributed hereunder to you.
*
* THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
* EXPRESS OR IMPLIED ARE MADE.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
* NO REPRESENTATIONS OR WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY
* PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
* WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
* RIGHTS.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

package edu.ucla.sspace.ri;

import edu.ucla.sspace.common.Filterable;
import edu.ucla.sspace.common.SemanticSpace;
import edu.ucla.sspace.common.SemanticSpaceWriter;

import edu.ucla.sspace.hadoop.CooccurrenceExtractor;
import edu.ucla.sspace.hadoop.WordCooccurrence;
import edu.ucla.sspace.hadoop.WordCooccurrenceCountingJob;

import edu.ucla.sspace.index.IntegerVectorGenerator;
import edu.ucla.sspace.index.PermutationFunction;
import edu.ucla.sspace.index.RandomIndexVectorGenerator;
import edu.ucla.sspace.index.TernaryPermutationFunction;

import edu.ucla.sspace.text.IteratorFactory;

import edu.ucla.sspace.util.GeneratorMap;

import edu.ucla.sspace.vector.CompactSparseIntegerVector;
import edu.ucla.sspace.vector.DenseIntVector;
import edu.ucla.sspace.vector.IntegerVector;
import edu.ucla.sspace.vector.TernaryVector;
import edu.ucla.sspace.vector.Vector;
import edu.ucla.sspace.vector.VectorMath;
import edu.ucla.sspace.vector.Vectors;

import java.io.BufferedReader;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.File;
import java.io.IOException;

import java.lang.reflect.Constructor;

import java.util.ArrayDeque;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Properties;
import java.util.Queue;
import java.util.Random;
import java.util.Set;

import java.util.concurrent.ConcurrentHashMap;

import java.util.logging.Logger;
import java.util.logging.Level;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.*;
import org.apache.hadoop.mapreduce.lib.output.*;
import org.apache.hadoop.util.*;

/**
* An implementation of Random Indexing that uses Hadoop to perform a
* distributed co-occurrence counting.  This class is designed to only be used
* on systems where a Hadoop system is currently installed and running.
* Furthermore, this class depends on all external resources and corpora being
* present in the Hadoop distributed file system.
*
* <p> This class supports the following properties.
*
* <dl style="margin-left: 1em">
*
* <dt> <i>Property:</i> <code><b>{@value #WINDOW_SIZE_PROPERTY}
*      </b></code> <br>
*      <i>Default:</i> {@value #DEFAULT_WINDOW_SIZE}
*
* <dd style="padding-top: .5em">This property sets the number of words before
*      and after that are counted as co-occurring.  With the default value,
*      {@value #DEFAULT_WINDOW_SIZE} words are counted before and {@value
*      #DEFAULT_WINDOW_SIZE} words are counter after.  This class always uses a
*      symmetric window. <p>
*
* <dt> <i>Property:</i> <code><b>{@value #VECTOR_LENGTH_PROPERTY}
*      </b></code> <br>
*      <i>Default:</i> {@value #DEFAULT_VECTOR_LENGTH}
*
* <dd style="padding-top: .5em">This property sets the number of dimensions to
*      be used for the index and semantic vectors. <p>
*
* <dt> <i>Property:</i> <code><b>{@value #USE_PERMUTATIONS_PROPERTY}
*      </b></code> <br>
*      <i>Default:</i> {@code false}
*
* <dd style="padding-top: .5em">This property specifies whether to enable
*      permuting the index vectors of co-occurring words.  Enabling this option
*      will cause the word semantics to include word-ordering information.
*      However this option is best used with a larger corpus.<p>
*
* <dt> <i>Property:</i> <code><b>{@value #PERMUTATION_FUNCTION_PROPERTY}
*      </b></code> <br>
*      <i>Default:</i> {@link edu.ucla.sspace.index.DefaultPermutationFunction
*      DefaultPermutationFunction}
*
* <dd style="padding-top: .5em">This property specifies the fully qualified
*      class name of a {@link PermutationFunction} instance that will be used
*      to permute index vectors.  If the {@value #USE_PERMUTATIONS_PROPERTY} is
*      set to {@code false}, the value of this property has no effect.<p>
*
* <dt> <i>Property:</i> <code><b>{@value #USE_SPARSE_SEMANTICS_PROPERTY}
*      </b></code> <br>
*      <i>Default:</i> {@code true}
*
* <dd style="padding-top: .5em">This property specifies whether to use a sparse
*       encoding for each word's semantics.  Using a sparse encoding can result
*       in a large saving in memory, while requiring more time to process each
*       document.<p>
*
* </dl> <p>
*
* @see RandomIndexing
* @see WordCooccurrenceCountingJob
*/
public class HadoopRandomIndexing {

    private static final Logger LOGGER =
        Logger.getLogger(HadoopRandomIndexing.class.getName());

    /**
     * The prefix for naming public properties.
     */
    private static final String PROPERTY_PREFIX =
        "edu.ucla.sspace.ri.HadoopRandomIndexing";

    /**
     * The property to specify the number of dimensions to be used by the index
     * and semantic vectors.
     */
    public static final String VECTOR_LENGTH_PROPERTY =
        PROPERTY_PREFIX + ".vectorLength";

    /**
     * The property to specify the number of words to view before and after each
     * word in focus.
     */
    public static final String WINDOW_SIZE_PROPERTY =
        PROPERTY_PREFIX + ".windowSize";

    /**
     * The property to specify whether the index vectors for co-occurrent words
     * should be permuted based on their relative position.
     */
    public static final String USE_PERMUTATIONS_PROPERTY =
        PROPERTY_PREFIX + ".usePermutations";

    /**
     * The property to specify the fully qualified named of a {@link
     * PermutationFunction} if using permutations is enabled.
     */
    public static final String PERMUTATION_FUNCTION_PROPERTY =
        PROPERTY_PREFIX + ".permutationFunction";

    /**
     * Specifies whether to use a sparse encoding for each word's semantics,
     * which saves space but requires more computation.
     */
    public static final String USE_SPARSE_SEMANTICS_PROPERTY =
        PROPERTY_PREFIX + ".sparseSemantics";

    /**
     * The default number of words to view before and after each word in focus.
     */
    public static final int DEFAULT_WINDOW_SIZE = 2; // +2/-2

    /**
     * The default number of dimensions to be used by the index and semantic
     * vectors.
     */
    public static final int DEFAULT_VECTOR_LENGTH = 4000;

    /**
     * A mapping from each word to its associated index vector
     */
    private final Map<String,TernaryVector> wordToIndexVector;

    /**
     * The number of dimensions for the semantic and index vectors.
     */
    private final int vectorLength;

    /**
     * The number of words to view before and after each focus word in a window.
     */
    private final int windowSize;

    /**
     * Whether the index vectors for co-occurrent words should be permuted based
     * on their relative position.
     */
    private final boolean usePermutations;

    /**
     * If permutations are enabled, the permutation function to use on the
     * index vectors.
     */
    private final PermutationFunction<TernaryVector> permutationFunc;

    /**
     * A flag for whether this instance should use {@code SparseIntegerVector}
     * instances for representic a word's semantics, which saves space but
     * requires more computation.
     */
    private final boolean useSparseSemantics;

    /**
     * An optional set of words that restricts the set of semantic vectors that
     * this instance will retain.
     */
    private final Set<String> semanticFilter;

    /**
     * Creates an instance of {@code HadoopRandomIndexing} using the system
     * properties for setting the configuratons.
     */
    public HadoopRandomIndexing() {
        this(System.getProperties());
    }

    /**
     * Creates an instance of {@code HadoopRandomIndexing} using the provided
     * properties for configuring the instance.
     */
    public HadoopRandomIndexing(Properties properties) {
        String vectorLengthProp =
            properties.getProperty(VECTOR_LENGTH_PROPERTY);
        vectorLength = (vectorLengthProp != null)
            ? Integer.parseInt(vectorLengthProp)
            : DEFAULT_VECTOR_LENGTH;

        String windowSizeProp = properties.getProperty(WINDOW_SIZE_PROPERTY);
        windowSize = (windowSizeProp != null)
            ? Integer.parseInt(windowSizeProp)
            : DEFAULT_WINDOW_SIZE;

        String usePermutationsProp =
            properties.getProperty(USE_PERMUTATIONS_PROPERTY);
        usePermutations = (usePermutationsProp != null)
            ? Boolean.parseBoolean(usePermutationsProp)
            : false;

        String permutationFuncProp =
            properties.getProperty(PERMUTATION_FUNCTION_PROPERTY);
        permutationFunc = (permutationFuncProp != null)
            ? loadPermutationFunction(permutationFuncProp)
            : new TernaryPermutationFunction();

        RandomIndexVectorGenerator indexVectorGenerator =
            new RandomIndexVectorGenerator(vectorLength, properties);

        String useSparseProp =
        properties.getProperty(USE_SPARSE_SEMANTICS_PROPERTY);
        useSparseSemantics = (useSparseProp != null)
            ? Boolean.parseBoolean(useSparseProp)
            : true;

        wordToIndexVector = new GeneratorMap<TernaryVector>(
                indexVectorGenerator);
        semanticFilter = new HashSet<String>();
    }


    /**
     * Creates an {@link IntegerVector} of the kind specified by the user.
     */
    private IntegerVector createSemanticVector() {
        IntegerVector v = (useSparseSemantics)
            ? new CompactSparseIntegerVector(vectorLength)
            : new DenseIntVector(vectorLength);
        return v;
    }

    /**
     * Returns an instance of the the provided class name, that implements
     * {@code PermutationFunction}.
     *
     * @param className the fully qualified name of a class
     */
    @SuppressWarnings("unchecked")
    private static PermutationFunction<TernaryVector> loadPermutationFunction(
            String className) {
        try {
            Class clazz = Class.forName(className);
            return (PermutationFunction<TernaryVector>)(clazz.newInstance());
        } catch (Exception e) {
            // catch all of the exception and rethrow them as an error
            throw new Error(e);
        }
    }

    /**
     * Computes the co-occurrences present in the documents in the input
     * directory and writes the {@link SemanticSpace} to the provided writer.
     *
     * @param inputDirs one or more a directories in the Hadoop file system that
     *        contain all the documents to process
     * @param writer a writer to which the output {@link SemanticSpace} will be
     *        written upon completion of the map-reduce analysis.
     *
     * @throws Exception if any exception occurs during the Hadoop processing or
     *         when writing the {@link SemanticSpace}
     */
    public void execute(Collection<String> inputDirs,
                        SemanticSpaceWriter writer) throws Exception {
        // Set the window size property used the the Cooccurrence Mapper
        Properties props = System.getProperties();
        props.setProperty(CooccurrenceExtractor.WINDOW_SIZE_PROPERTY,
                          String.valueOf(windowSize));
        LOGGER.info("Beginning Hadoop corpus processing");
        // Construct the counting job that will use Hadoop to count the
        // co-occurrences
        WordCooccurrenceCountingJob job =
            new WordCooccurrenceCountingJob(props);
        Iterator<WordCooccurrence> occurrences = job.execute(inputDirs);
        LOGGER.info("Finished Hadoop corpus processing; calculating sspace");
        int wordCount =  0;
        // Local state variables for updating the current word's vector.
        String curWord = null;
        IntegerVector semantics = null;
        while (occurrences.hasNext()) {
            WordCooccurrence occ = occurrences.next();
            String word = occ.focusWord();
            // Base case for the first word seen
            if (curWord == null) {
                curWord = word;
                semantics = createSemanticVector();
            }
            // If we've seen a new word, write the previous word's vector
            else if (!curWord.equals(word)) {
                writer.write(curWord, semantics);
                if (LOGGER.isLoggable(Level.FINE)) {
                    LOGGER.fine(String.format(
                        "processed word #%d: %s%n ", ++wordCount, curWord));
                }
                curWord = word;
                semantics = createSemanticVector();
            }

            // Check if this is the last word, if yes then write
            if(!occurrences.hasNext()) {
                writer.write(curWord, semantics);
            }

            // NOTE: because we are using a GeneratorMap, this call will create
            // a new index vector for the word if it didn't exist prior.
            TernaryVector indexVector =
                wordToIndexVector.get(occ.relativeWord());

            if (usePermutations) {
                indexVector =
                    permutationFunc.permute(indexVector, occ.getDistance());
            }
            // Scale the index vector by the number of times this occurrence
            // happened
            VectorMath.addWithScalars(
                semantics, 1, indexVector, occ.getCount());
        }
        writer.close();
    }

    /**
     * Returns an unmodifiable view on the token to {@link IntegerVector}
     * mapping used by this instance.  Any further changes made by this instance
     * to its token to {@code IntegerVector} mapping will be reflected in the
     * returned map.
     *
     * @return a mapping from the current set of tokens to the index vector used
     *         to represent them
     */
    public Map<String,TernaryVector> getWordToIndexVector() {
        return Collections.unmodifiableMap(wordToIndexVector);
    }

    /**
     * Assigns the token to {@link IntegerVector} mapping to be used by this
     * instance.  The contents of the map are copied, so any additions of new
     * index words by this instance will not be reflected in the parameter's
     * mapping.
     *
     * @param m a mapping from token to the {@code IntegerVector} that should be
     *        used represent it when calculating other word's semantics
     */
    public void setWordToIndexVector(Map<String,TernaryVector> m) {
        wordToIndexVector.clear();
        wordToIndexVector.putAll(m);
    }
}
TOP

Related Classes of edu.ucla.sspace.ri.HadoopRandomIndexing

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.