Source Code of edu.ucla.sspace.wordsi.psd.PseudoWordContextExtractor

/*
 * Copyright 2010 Keith Stevens 
 *
 * This file is part of the S-Space package and is covered under the terms and
 * conditions therein.
 *
 * The S-Space package is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation and distributed hereunder to you.
 *
 * THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
 * EXPRESS OR IMPLIED ARE MADE.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
 * NO REPRESENTATIONS OR WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY
 * PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
 * WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
 * RIGHTS.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */


package edu.ucla.sspace.wordsi.psd;


import edu.ucla.sspace.wordsi.ContextExtractor;
import edu.ucla.sspace.wordsi.ContextGenerator;
import edu.ucla.sspace.wordsi.Wordsi;


import edu.ucla.sspace.text.IteratorFactory;


import edu.ucla.sspace.vector.SparseDoubleVector;


import java.io.BufferedReader;


import java.util.ArrayDeque;
import java.util.Iterator;
import java.util.Queue;
import java.util.Map;




/**
 * A pseudo word based {@link ContextExtractor}.  A mapping from real tokens to
 * pseudo words is used to automatically replace tokens in a corpus while {@link
 * Wordsi} processes contexts.  Only pseudo words are represented in the {@link
 * Wordsi} space.  When a token is encountered, if it has a pseudo word mapping,
 * that instance is replaced with the pseudo word mapping.  A context vector
 * will be generated for the context surrounded that word instance, and the
 * pseudo word replacement will serve as the primary key for the reporter and
 * the raw token will serve as the secondary key.  The pseudo word will then
 * replace the raw token in the context for all other words, and thus serve as a
 * feature in place of the real token.
 *
 * @author Keith Stevens
 */
public class PseudoWordContextExtractor implements ContextExtractor {


    /**
     * A single empty token.
     */
    private static final String EMPTY = "";


    /**
     * The mapping from real tokens to their pseudo word replacements.
     */
    private final Map<String, String> pseudoWordMap;


    /**
     * The generator responsible for creating context vectors from a sliding
     * window of text.
     */
    private final ContextGenerator generator;


    /**
     * The size of the sliding window of text, in a single direction.
     */
    private final int windowSize;


    /**
     * Creates a new {@link PseudoWordContextExtracto}.
     *
     * @param generator The {@link ContextGenerator} responsible for creating
     *        context vectors
     * @param windowSize The number of words before and after the focus word
     *        which compose a context
     * @param pseudoWordMap The mapping from real words to their pseudo word
     *        replacements
     */
    public PseudoWordContextExtractor(ContextGenerator generator,
                                      int windowSize,
                                      Map<String, String> pseudoWordMap) {
        this.pseudoWordMap = pseudoWordMap;
        this.generator = generator;
        this.windowSize = windowSize;
    }


    /**
     * {@inheritDoc}
     */
    public int getVectorLength() {
        return generator.getVectorLength();
    }


    /**
     * {@inheritDoc}
     */
    public void processDocument(BufferedReader document, Wordsi wordsi) {
        Queue<String> prevWords = new ArrayDeque<String>();
        Queue<String> nextWords = new ArrayDeque<String>();
        Queue<String> nextRealWord = new ArrayDeque<String>();


        Iterator<String> it = IteratorFactory.tokenizeOrdered(document);


        // Fill up the words after the context so that when the real processing
        // starts, the context is fully prepared.
        for (int i = 0 ; i < windowSize && it.hasNext(); ++i)
            addNextToken(it.next(), nextWords, nextRealWord);


        // Iterate through each of the words in the context, generating context
        // vectors for each acceptable word.
        String focusWord = null;
        String replacementWord = null;
        while (!nextWords.isEmpty()) {
            focusWord = nextWords.remove();
            replacementWord = nextRealWord.remove();


            // Advance the sliding window to the right.
            if (it.hasNext())
                addNextToken(it.next(), nextWords, nextRealWord);


            // Represent the word if wordsi is willing to process it.
            if (!replacementWord.equals(EMPTY)) {
                SparseDoubleVector contextVector = generator.generateContext(
                        prevWords, nextWords);
                wordsi.handleContextVector(
                        focusWord, replacementWord, contextVector);
            }


            // Advance the sliding window to the right.
            prevWords.offer(focusWord);
            if (prevWords.size() > windowSize)
                prevWords.remove();
        }
    }


    /**
     * If {@code token} is part of a pseudo word, the replacement psedo word
     * will be stored in {@code nextWords} and {@code token} is stored in {@code
     * nextRealWords}.  Otherwise, {@code token} is stored in {@code nextWords}
     * and and empty string is stored in {@code nextRealWords}.
     */
    private void addNextToken(String token,
                              Queue<String> nextWords,
                              Queue<String> nextRealWords) {
        String replacement = pseudoWordMap.get(token);
        if (replacement == null) {
            nextWords.offer(token);
            nextRealWords.offer(EMPTY);
        } else {
            nextWords.offer(replacement);
            nextRealWords.offer(token);
        }
    }
}
Source Code of edu.ucla.sspace.wordsi.psd.PseudoWordContextExtractor

Related Classes of edu.ucla.sspace.wordsi.psd.PseudoWordContextExtractor