Source Code of opennlp.ccg.ngrams.RepetitionScorer

///////////////////////////////////////////////////////////////////////////////
// Copyright (C) 2004 University of Edinburgh (Michael White)
// 
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
// 
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU Lesser General Public License for more details.
// 
// You should have received a copy of the GNU Lesser General Public
// License along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
//////////////////////////////////////////////////////////////////////////////


package opennlp.ccg.ngrams;


import opennlp.ccg.synsem.Sign;
import opennlp.ccg.synsem.SignScorer;
import opennlp.ccg.lexicon.Word;


import java.util.*;
import gnu.trove.*;


/**
 * Scores a sign according to how repetitive its words are given the 
 * observed context.  Relevant repeated items (eg stems) are counted, 
 * with full counts given to items in the previous words or recent context, 
 * and fractional counts to older items.  The score is then assigned according 
 * to the number of repeated items and the configured penalty, as 
 * 10 to the minus (penalty times repeated items).
 *
 * @author      Michael White
 * @version     $Revision: 1.6 $, $Date: 2011/03/20 20:11:58 $
 */
@SuppressWarnings({"unchecked","rawtypes"})
public class RepetitionScorer implements SignScorer
{
    /** The repetition penalty (defaults to 1.0). */
    public double penalty = 1.0;


    /** The fractional count for the older items (defaults to 0.5). */
    public double olderCount = 0.5;
    
    /** The fractional count for the even older items (defaults to 0.25). */
    public double evenOlderCount = 0.25;
    
    /** The fractional count for the oldest items (defaults to 0.125). */
    public double oldestCount = 0.125;
    
    /** The interned POS values to use for repetition scoring purposes. */
    protected Set posValsToUse = new THashSet(new TObjectIdentityHashingStrategy());
    
    /** The interned stems to ignore for repetition scoring purposes. */
    protected Set stemsToIgnore = new THashSet(new TObjectIdentityHashingStrategy());
    
    /** The interned items (eg stems) seen in the previous words. */
    protected Set previousItems = new THashSet(new TObjectIdentityHashingStrategy());
    
    /** The interned items (eg stems) seen in the recent context. */
    protected Set contextItems = new THashSet(new TObjectIdentityHashingStrategy());
    
    /** The interned items (eg stems) seen in the older context. */
    protected Set olderContextItems = new THashSet(new TObjectIdentityHashingStrategy());
    
    /** The interned items (eg stems) seen in the even older context. */
    protected Set evenOlderContextItems = new THashSet(new TObjectIdentityHashingStrategy());
    
    /** The interned items (eg stems) seen in the oldest context. */
    protected Set oldestContextItems = new THashSet(new TObjectIdentityHashingStrategy());
    
    /**
     * Default constructor.
     * Adds "NNP", "N", "V", "Adj" and "Adv" to posValsToUse, 
     * and "do" and "not" to stemsToIgnore.
     */
    public RepetitionScorer() {
        String[] posVals = { "NNP", "N", "V", "Adj", "Adv" };
        posValsToUse.addAll(Arrays.asList(posVals));
        String[] stems = { "do", "not" };
        stemsToIgnore.addAll(Arrays.asList(stems));
    }
    
    /** Resets all the context items. */
    public void resetContext() { 
        contextItems.clear(); olderContextItems.clear(); 
        evenOlderContextItems.clear(); oldestContextItems.clear();
    }
    
    /** Ages the context items, clearing the recent ones. */
  public void ageContext() { 
        oldestContextItems.clear(); oldestContextItems.addAll(evenOlderContextItems);
        evenOlderContextItems.clear(); evenOlderContextItems.addAll(olderContextItems);
        olderContextItems.clear(); olderContextItems.addAll(contextItems);
        contextItems.clear(); 
    }
    
    /** Adds the items (eg stems) from the given sign's words to the context items. */
    public void updateContext(Sign sign) {
        List words = sign.getWords(); 
        if (words == null) return;
        for (int i = 0; i < words.size(); i++) {
            Word word = (Word) words.get(i);
            updateItems(word, contextItems);
        }
    }
    
    /** 
     * Adds the items (eg stems) from the given word to the given set.
     * By default, adds the relevant stems, per the relevantStem method.
     */
    protected void updateItems(Word word, Set set) {
        String stem = relevantStem(word);
        if (stem != null) set.add(stem);
    }


    /**
     * Returns the stem of the given word if its POS is in posValsToUse, 
     * unless the stem is in stemsToIgnore; otherwise returns null.
     */
    protected String relevantStem(Word word) {
        if (!(posValsToUse.contains(word.getPOS()))) return null;
        String stem = word.getStem();
        if (!(stemsToIgnore.contains(stem))) return stem;
        return null;
    }
    
    /** 
     * Returns a score between 0 (worst) and 1 (best) for the given sign 
     * and completeness flag, according to how repetitive its word are compared to 
     * the observed context.  
     * In particular, returns 10 to the minus (penalty times repeated items), 
     * or zero if there are no words.
     */
    public double score(Sign sign, boolean complete) {
        List words = sign.getWords(); 
        if (words == null) return 0;
        return Math.pow(10, -1 * penalty * repeatedItems(words));
    }
    
    /** 
     * Returns the number of repeated items (eg stems) in the given word list, 
     * using fractional counts for repetitions of older items.
     * The previous items set is cleared, and then the repeated items 
     * are summed for each word, updating the previous items along the way.
     */
    protected double repeatedItems(List words) {
        previousItems.clear();
        double retval = 0;
        for (int i = 0; i < words.size(); i++) {
            Word word = (Word) words.get(i);
            retval += repeatedItems(word);
            updateItems(word, previousItems);
        }
        return retval;
    }
    
    /** 
     * Returns the number of repeated items (eg stems) in the given word, 
     * using fractional counts for repetitions of older items.
     * By default, returns 1 (or a fractional count) if the stem is relevant, 
     * per the relevantStem method. 
     */
    protected double repeatedItems(Word word) {
        String stem = relevantStem(word);
        if (stem == null) return 0;
        if (contextItems.contains(stem) || previousItems.contains(stem)) return 1;
        if (olderContextItems.contains(stem)) return olderCount;
        if (evenOlderContextItems.contains(stem)) return evenOlderCount;
        if (oldestContextItems.contains(stem)) return oldestCount;
        return 0;
    }
}
Source Code of opennlp.ccg.ngrams.RepetitionScorer

Related Classes of opennlp.ccg.ngrams.RepetitionScorer