Source Code of edu.ucla.sspace.grefenstette.Grefenstette

/*
 * Copyright 2009 Grace Park
 *
 * This file is part of the S-Space package and is covered under the terms and
 * conditions therein.
 *
 * The S-Space package is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation and distributed hereunder to you.
 *
 * THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
 * EXPRESS OR IMPLIED ARE MADE.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
 * NO REPRESENTATIONS OR WARRANTIES OF MERCHANTABILITY OR FITNESS FOR ANY
 * PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
 * WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
 * RIGHTS.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */


package edu.ucla.sspace.grefenstette;


import edu.ucla.sspace.common.SemanticSpace;


import edu.ucla.sspace.matrix.GrowingSparseMatrix;
import edu.ucla.sspace.matrix.Matrix;


import edu.ucla.sspace.text.Document;


import edu.ucla.sspace.util.Pair;


import edu.ucla.sspace.vector.Vector;


import java.io.BufferedReader;
import java.io.FileReader;
import java.io.File;
import java.io.IOException;
import java.io.IOError;
import java.io.PrintWriter;


import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import java.util.Properties;


import java.util.concurrent.atomic.AtomicInteger;


import java.util.logging.Level;
import java.util.logging.Logger;


/**
 * An implementation of a semantic space built from syntactic co-occurrence, as
 * described by Grefenstette.  See the following references for full details.
 * <ul>
 *
 *   <li style="font-family:Garamond, Georgia, serif">G. Grefenstette,
 *   <i>Explorations in Automatic Thesaurus Discovery</i>. Indiana University
 *   Press, 1994.</li>
 *
 * </ul>
 *
 *
 * @author Grace Park 
 */
public class Grefenstette implements SemanticSpace {


    /**
     * The logger for reporting all debugging information
     */
    private static final Logger LOGGER = 
        Logger.getLogger(Grefenstette.class.getName());


    /**
     * The temporary file used to record syntactic word relations while the
     * documents are parsed.  Relations are written to a file to save memory.
     */
    private final File wordRelations;


    /**
     * The writer to the {@code wordRelations} file.
     */
    private final PrintWriter wordRelationsWriter;


    /**
     * A mapping from a string token to the integer that represents that token's
     * row in the {@code syntacticCooccurrence} matrix.
     */
    private final Map<String,Integer> objectTable;


    /**
     * A mapping from a token in a specific syntactic position to the integer
     * that represents that token configuration's column.
     */
    private final Map<String,Integer> attributeTable;


    /**
     * A matrix where rows correspond to tokens and columns correspond to the
     * syntactic co-occurrence of a specific token in a specific syntactic
     * position.
     */
    private final Matrix syntacticCooccurrence;


    /**
     * An incremental counter used for assigning tokens to matrix row indices
     */
    private final AtomicInteger objectCounter;


    /**
     * An incremental counter used for assigning token syntax positions to
     * matrix column indices
     */
    private final AtomicInteger attributeCounter;


    /**
     * Constructs an instance using the system properties for any required
     * configuration
     *
     * @throws IOError if unable to create the backing file to hold data while
     *         processing
     */
    public Grefenstette() {
        try {
            wordRelations = File.createTempFile("word-relation-list","txt");
            wordRelationsWriter = new PrintWriter(wordRelations);
          
            objectTable = new HashMap<String,Integer>();
            attributeTable = new HashMap<String,Integer>();
          
            syntacticCooccurrence = new GrowingSparseMatrix();
          
            objectCounter = new AtomicInteger(0);
            attributeCounter = new AtomicInteger(0);
        } catch (IOException ioe) {
            throw new IOError(ioe);
        }
    }


    /**
     * {@inheritDoc}
     */
    public void processDocument(BufferedReader document) throws IOException {
        ArrayList<Pair<String>> wordsInPhrase = new ArrayList<Pair<String>>();


        String nounPhrase = "";
        String lastNoun = "";
        String lastVerb = "";
        String secondPrevPhrase = "";
        String prevPhrase = "";


        nounPhrase = document.readLine();


        for( String tag = getNextTag(nounPhrase); 
             tag != null; tag = getNextTag(nounPhrase) ) {
            String word;


            int startOfTag = nounPhrase.indexOf(tag);


            nounPhrase = nounPhrase.substring(startOfTag);
            wordsInPhrase.clear();


            if( tag.equals("NP") ) {
                while( nounPhrase.charAt(0) != ')' ) {


                    // extract tag of word in noun phrase
                    tag = getNextTag(nounPhrase); 


                    if( isPhraseOrClause(tag) || isPreposition(tag) ) {
                        nounPhrase = nounPhrase.
                            substring(nounPhrase.indexOf(tag) + tag.length());
                        // stop processing NP
                        break;
                    } else if( inStartSet(tag) || inReceiveSet(tag) ) {
                        // note to self: find out why this broke
                        try {
                            word = nounPhrase.
                                substring(nounPhrase.indexOf(" ", 
                                              nounPhrase.indexOf(tag)) + 1, 
                                          nounPhrase.indexOf(")"));


                            wordsInPhrase.add(new Pair<String>(tag,word));


                            nounPhrase = nounPhrase.
                                substring(nounPhrase.indexOf(")",
                                              nounPhrase.indexOf(word))+1);
                        } catch (StringIndexOutOfBoundsException e) {
                            nounPhrase = nounPhrase.substring(nounPhrase.indexOf(")"));
                        }
                        // else it's not a tag I care about
                    } else {
                        nounPhrase = nounPhrase.substring(nounPhrase.indexOf(")")+1);
                    }
                }


                // note to self: is this if statement represent the same thing
                // as the next if statement??
                if( !wordsInPhrase.isEmpty() ) {
                    // set head noun to last word in noun phrase
                    String headNoun = wordsInPhrase.get(wordsInPhrase.size()-1).y;


                    // create the relations from pass two
                    if( prevPhrase.equals("PP") && secondPrevPhrase.equals("NP") 
                        && lastNoun.length() != 0 ) {
                        wordRelationsWriter.println(lastNoun + " " + headNoun);


                        addRelation(lastNoun, headNoun);
                    }


                    // create relations from pass four
                    if( prevPhrase.equals("PP") && secondPrevPhrase.equals("VP")
                        && lastVerb.length() != 0 ) {


                        wordRelationsWriter.println(lastVerb + " " + headNoun);


                        addRelation(lastVerb, headNoun);
                    } else if( prevPhrase.equals("VP") ) {


                        wordRelationsWriter.println(lastVerb + " " + headNoun);


                        addRelation(lastVerb, headNoun);
                    }


                    lastNoun = headNoun;
                }


                // reached end of noun phrase
                if( nounPhrase.charAt(0) == ')' ) {
                    // create relations between words in noun phrase
                    // relations from pass one
                    processWordsInNP(wordsInPhrase);


                    if( !"NP".equals(prevPhrase) ) {
                        secondPrevPhrase = prevPhrase;
                        prevPhrase = "NP";
                    }
                }
            } //end processing NP
            else if( tag.equals("VP") ) {
                while( tag != null && tag.startsWith("V") ) {
                    // nonphrase verb
                    if( tag.startsWith("VB") ) {
                        word = nounPhrase.substring( nounPhrase.indexOf(" ", 
                                                                        nounPhrase.indexOf(tag))+1, nounPhrase.indexOf(")"));
                        lastVerb = word;
                    }


                    nounPhrase = nounPhrase.substring(nounPhrase.indexOf(tag)+1);
                    tag = getNextTag(nounPhrase);
                }


                // relations from pass three
                if( prevPhrase.equals("NP") && lastNoun.length() != 0 ) {


                    wordRelationsWriter.println(lastNoun + " " + lastVerb);


                    addRelation(lastNoun, lastVerb);
                }


                if( !prevPhrase.equals("VP") ) {
                    secondPrevPhrase = prevPhrase;
                    prevPhrase = "VP";
                }
            }
            else if( isPhraseOrClause(tag) || isPreposition(tag) ) {
                nounPhrase = nounPhrase.substring( nounPhrase.indexOf(tag)
                                                   + tag.length());
                if( !tag.equals(prevPhrase) ) {
                    secondPrevPhrase = prevPhrase;
                    prevPhrase = tag;
                }
            }
            else {
                nounPhrase = nounPhrase.substring( nounPhrase.indexOf(tag)
                                                   + tag.length());
            }
        }
    }


    /**
     * Adds a relation pair to the matrix
     */
    private void addRelation(String object, String attribute) {
        double val;
        int row, col;


        object = object.toLowerCase();
        attribute = attribute.toLowerCase();


        // get row in matrix
        if( objectTable.containsKey(object) ) {
            // if the object already exists in matrix, find its index
            row = objectTable.get(object);
        } else {
            // otherwise give the object a new index number
            row = Integer.valueOf(objectCounter.getAndIncrement());
            // insert new object/index pair into lookup table
            objectTable.put( object, row );
            System.out.println(object + " " + row);
        }


        // get column in matrix
        if( attributeTable.containsKey(attribute) ) {
            col = attributeTable.get(attribute);
        } else {
            col = Integer.valueOf(attributeCounter.getAndIncrement());
            attributeTable.put( attribute, col );
        }


        // update entry in matrix which records how many times the
        // object/attribute pair has been seen
        if( row < syntacticCooccurrence.rows() && 
            col < syntacticCooccurrence.columns()) {
            // if there's already an entry for the object and attribute, get the
            // current value for the pair of words
            val = syntacticCooccurrence.get(row, col);
            // increment the current value by one and store in matrix
            syntacticCooccurrence.set(row, col, val+1);
        } else {
            // otherwise set the row, col value to 1
            syntacticCooccurrence.set(row, col, 1.0);
        }
    }


    /**
     * Creates relations between words in a noun phrase
     */
    private void processWordsInNP(ArrayList<Pair<String>> wordsInPhrase) {
        if( wordsInPhrase.size() > 1 ) {
            // this is from Grefenstette's pseudo code
            for (int i = 0; i < wordsInPhrase.size()-1; i++) {


                if (inStartSet(wordsInPhrase.get(i).x) ) {
                
                    for (int j = i+1; j < wordsInPhrase.size(); j++ ) {
                        
                        if (inReceiveSet( wordsInPhrase.get(j).x ) ) {


                            wordRelationsWriter.
                                println(wordsInPhrase.get(j).y + " "
                                        + wordsInPhrase.get(i).y);
                            
//                             System.out.println(wordsInPhrase.get(j).y + " "
//                                                + wordsInPhrase.get(i).y);
                            
                            addRelation(wordsInPhrase.get(j).y, 
                                        wordsInPhrase.get(i).y);
                        }
                    }
                }
            }
        }
    }


  
    /**
     * Checks to see if the tag can modify another word
     *
     * @param tag A tag from the parsed corpus to be checked
     */
    private boolean inStartSet(String tag) {
        return
            // noun
            tag.startsWith("NN") ||
            // adjective
            tag.startsWith("JJ") ||
            // adverb
            tag.startsWith("RB") ||
            // cardinal number
            tag.startsWith("CD");
    }




    /**
     * Checks to see if tag can be modified by a word in StartSet
     */
    private boolean inReceiveSet(String tag) {
        return
            tag.startsWith("NN") ||
            tag.startsWith("VB");
    }




    /**
     * Checks to see if tag is a preposition
     */
    private boolean isPreposition(String tag) {
        return tag.startsWith("PP");
    }




    /**
     * Checks to see if tag marks a phrase or clause
     */
    private boolean isPhraseOrClause(String tag) {
        // find out why adding more reduced the number of relations
        return
            (!tag.equals("SYM") &&
             tag.startsWith("S")) ||
            tag.equals("ADJP") ||
            tag.equals("ADVP") ||
            tag.equals("CONJP") ||
            tag.equals("FRAG") ||
            tag.equals("INTJ") ||
            tag.equals("LST") ||
            tag.equals("NAC") ||
            tag.equals("NP") ||
            tag.equals("NX") ||
            tag.equals("PP") ||
            tag.equals("PRN") ||
            /* removing prt adds 1% more relations */
            tag.equals("PRT") ||
            tag.equals("QP") ||
            tag.equals("RRC") ||
            tag.equals("UCP") ||
            tag.equals("VP") ||
            tag.startsWith("WH") ||
            tag.equals("X");
    }


    /**
     * Returns the next tag in the sentence or null if there are no more tags
     * @param str The sentence that the tag is extracted from
     */
    private String getNextTag(String str) {
        String tag;
        int endIndex;
        int tagIndex = str.indexOf("(");


        if( tagIndex < 0 ) {
            return null;
        }


        // in case there's nothing in the sentence
        endIndex = str.indexOf(" ", tagIndex);


        if( endIndex < 0 ) {
            return null;
        }


        tag = str.substring( tagIndex+1, endIndex ); 


        if( tag.length() > 0 ) {
            return tag;
        } else {
            str = str.substring( tagIndex+1 );
            return getNextTag(str);
        }
    }


    /**
     * {@inheritDoc}
     */
    public Set<String> getWords() {
        return Collections.unmodifiableSet(objectTable.keySet());
    }


    /**
     * {@inheritDoc}
     */
    public Vector getVector(String word) {
        word = word.toLowerCase();
        if(objectTable.containsKey(word)) {
            int wordIndex = objectTable.get(word);
            if(wordIndex < syntacticCooccurrence.rows()) {
                return syntacticCooccurrence.getRowVector(wordIndex);
            }
            // At this section, several exception handlers were removed.  These
            // may have been superfluous, or the code may have relied on them
            // being caught.
        }
        return null;
    }
    
    /**
     * Does nothing.
     */
    public void processSpace(Properties properties) {
    }


    /**
     * {@inheritDoc}
     */
    public String getSpaceName() {
        return "grefenstette-syntatic-analysis";
    }


    /**
     * {@inheritDoc}
     */
    public int getVectorLength() {
      return syntacticCooccurrence.columns();
    }
}
Source Code of edu.ucla.sspace.grefenstette.Grefenstette

Related Classes of edu.ucla.sspace.grefenstette.Grefenstette