Package edu.ucla.sspace.wordsi.semeval

Source Code of edu.ucla.sspace.wordsi.semeval.SemEvalContextExtractor

/*
* Copyright 2010 Keith Stevens
*
* This file is part of the S-Space package and is covered under the terms and
* conditions therein.
*
* The S-Space package is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as published
* by the Free Software Foundation and distributed hereunder to you.
*
* THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
* EXPRESS OR IMPLIED ARE MADE.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
* NO REPRESENTATIONS OR WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY
* PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
* WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
* RIGHTS.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

package edu.ucla.sspace.wordsi.semeval;

import edu.ucla.sspace.wordsi.ContextExtractor;
import edu.ucla.sspace.wordsi.ContextGenerator;
import edu.ucla.sspace.wordsi.Wordsi;

import edu.ucla.sspace.text.IteratorFactory;

import edu.ucla.sspace.vector.SparseDoubleVector;

import edu.ucla.sspace.vector.VectorIO;

import java.io.BufferedReader;

import java.util.ArrayDeque;
import java.util.Iterator;
import java.util.Queue;


/**
* A {@link ContextExtractor} for handling SemEval or SenseEval corpora.  For
* each document, there should be an instance identifier, which uniquely
* identifies the context.  There should also be some marker, i.e., "|||", that
* marks where the focus word is in the document.  Only one context vector will
* be generated for each document.  This class depends on a {@link
* ContextGenerator} for generating the context vectors.
*
* @author Keith Stevens
*/
public class SemEvalContextExtractor implements ContextExtractor {

    /**
     * The default separator used.
     */
    private static final String DEFAULT_SEPARATOR = "||||";

    /**
     * The {@link ContextGenerator} responsible for creating context vectors.
     */
    private final ContextGenerator generator;

    /**
     * The number of words before and after a focus word which compose the
     * context.
     */
    private final int windowSize;

    /**
     * The token used to separate the previous context from the focus word.
     */
    private final String separator;

    /**
     * Creates a new {@link SemEvalContextExtractor}.
     *
     * @param generator The {@link ContextGenerator} responsible for creating
     *        context vectors
     * @param windowSize the number of words before and after a focus word which
     *        compose the context.
     */
    public SemEvalContextExtractor(ContextGenerator generator,
                                   int windowSize) {
        this(generator, windowSize, DEFAULT_SEPARATOR);
    }

    /**
     * Creates a new {@link SemEvalContextExtractor}.
     *
     * @param generator The {@link ContextGenerator} responsible for creating
     *        context vectors
     * @param windowSize the number of words before and after a focus word which
     *        compose the context.
     */
    public SemEvalContextExtractor(ContextGenerator generator,
                                   int windowSize,
                                   String separator) {
        this.generator = generator;
        this.windowSize = windowSize;
        this.separator = separator;
    }

    /**
     * {@inheritDoc}
     */
    public int getVectorLength() {
        return generator.getVectorLength();
    }

    /**
     * {@inheritDoc}
     */
    public void processDocument(BufferedReader document, Wordsi wordsi) {
        Queue<String> prevWords = new ArrayDeque<String>();
        Queue<String> nextWords = new ArrayDeque<String>();

        Iterator<String> it = IteratorFactory.tokenizeOrdered(document);

        // Skip empty documents.
        if (!it.hasNext())
            return;

        String instanceId = it.next();

        // Fill up the words after the context so that when the real processing
        // starts, the context is fully prepared.
        for (int i = 0 ; it.hasNext(); ++i) {
            String term = it.next();
            if (term.equals(separator))
                break;
            prevWords.offer(term.intern());
        }

        // Eliminate the first set of words that we don't want to inspect.
        while (prevWords.size() > windowSize)
            prevWords.remove();

        // It's possible that the SenseEval/SemEval parser failed to find the
        // focus word.  For these cases, skip the context.
        if (!it.hasNext())
            return;

        String focusWord = it.next().intern();

        // Extract the set of words to consider after the focus word.
        while (it.hasNext() && nextWords.size() < windowSize)
            nextWords.offer(it.next().intern());

        // Create the context vector and have wordsi handle it.
        SparseDoubleVector contextVector = generator.generateContext(
                prevWords, nextWords);
        wordsi.handleContextVector(focusWord, instanceId, contextVector);
    }
}
TOP

Related Classes of edu.ucla.sspace.wordsi.semeval.SemEvalContextExtractor

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.