Source Code of edu.ucla.sspace.wordsi.semeval.SemEvalContextExtractor

/*
 * Copyright 2010 Keith Stevens 
 *
 * This file is part of the S-Space package and is covered under the terms and
 * conditions therein.
 *
 * The S-Space package is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation and distributed hereunder to you.
 *
 * THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
 * EXPRESS OR IMPLIED ARE MADE.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
 * NO REPRESENTATIONS OR WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY
 * PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
 * WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
 * RIGHTS.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */


package edu.ucla.sspace.wordsi.semeval;


import edu.ucla.sspace.wordsi.ContextExtractor;
import edu.ucla.sspace.wordsi.ContextGenerator;
import edu.ucla.sspace.wordsi.Wordsi;


import edu.ucla.sspace.text.IteratorFactory;


import edu.ucla.sspace.vector.SparseDoubleVector;


import edu.ucla.sspace.vector.VectorIO;


import java.io.BufferedReader;


import java.util.ArrayDeque;
import java.util.Iterator;
import java.util.Queue;




/**
 * A {@link ContextExtractor} for handling SemEval or SenseEval corpora.  For
 * each document, there should be an instance identifier, which uniquely
 * identifies the context.  There should also be some marker, i.e., "|||", that
 * marks where the focus word is in the document.  Only one context vector will
 * be generated for each document.  This class depends on a {@link
 * ContextGenerator} for generating the context vectors.
 *
 * @author Keith Stevens
 */
public class SemEvalContextExtractor implements ContextExtractor {


    /**
     * The default separator used.
     */
    private static final String DEFAULT_SEPARATOR = "||||";


    /**
     * The {@link ContextGenerator} responsible for creating context vectors.
     */
    private final ContextGenerator generator;


    /**
     * The number of words before and after a focus word which compose the
     * context.
     */
    private final int windowSize;


    /**
     * The token used to separate the previous context from the focus word.
     */
    private final String separator;


    /**
     * Creates a new {@link SemEvalContextExtractor}.
     *
     * @param generator The {@link ContextGenerator} responsible for creating
     *        context vectors
     * @param windowSize the number of words before and after a focus word which
     *        compose the context.
     */
    public SemEvalContextExtractor(ContextGenerator generator,
                                   int windowSize) {
        this(generator, windowSize, DEFAULT_SEPARATOR);
    }


    /**
     * Creates a new {@link SemEvalContextExtractor}.
     *
     * @param generator The {@link ContextGenerator} responsible for creating
     *        context vectors
     * @param windowSize the number of words before and after a focus word which
     *        compose the context.
     */
    public SemEvalContextExtractor(ContextGenerator generator,
                                   int windowSize,
                                   String separator) {
        this.generator = generator;
        this.windowSize = windowSize;
        this.separator = separator;
    }


    /**
     * {@inheritDoc}
     */
    public int getVectorLength() {
        return generator.getVectorLength();
    }


    /**
     * {@inheritDoc}
     */
    public void processDocument(BufferedReader document, Wordsi wordsi) {
        Queue<String> prevWords = new ArrayDeque<String>();
        Queue<String> nextWords = new ArrayDeque<String>();


        Iterator<String> it = IteratorFactory.tokenizeOrdered(document);


        // Skip empty documents.
        if (!it.hasNext())
            return;


        String instanceId = it.next();


        // Fill up the words after the context so that when the real processing
        // starts, the context is fully prepared.
        for (int i = 0 ; it.hasNext(); ++i) {
            String term = it.next();
            if (term.equals(separator))
                break;
            prevWords.offer(term.intern());
        }


        // Eliminate the first set of words that we don't want to inspect.
        while (prevWords.size() > windowSize)
            prevWords.remove();


        // It's possible that the SenseEval/SemEval parser failed to find the
        // focus word.  For these cases, skip the context.
        if (!it.hasNext())
            return;


        String focusWord = it.next().intern();


        // Extract the set of words to consider after the focus word.
        while (it.hasNext() && nextWords.size() < windowSize)
            nextWords.offer(it.next().intern());


        // Create the context vector and have wordsi handle it.
        SparseDoubleVector contextVector = generator.generateContext(
                prevWords, nextWords);
        wordsi.handleContextVector(focusWord, instanceId, contextVector);
    }
}
Source Code of edu.ucla.sspace.wordsi.semeval.SemEvalContextExtractor

Related Classes of edu.ucla.sspace.wordsi.semeval.SemEvalContextExtractor