Source Code of edu.stanford.nlp.parser.lexparser.BaseUnknownWordModel

package edu.stanford.nlp.parser.lexparser;


import java.util.Map;
import java.util.Set;


import edu.stanford.nlp.ling.Label;
import edu.stanford.nlp.ling.Tag;
import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counter;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.Index;




/**
 *  An unknown word model for a generic language.  This was originally designed for
 *  German, changing only to remove German-specific numeric features.  Models unknown
 *  words based on their prefix and suffixes, as well as capital letters.
 *
 * @author Roger Levy
 * @author Greg Donaker (corrections and modeling improvements)
 * @author Christopher Manning (generalized and improved what Greg did)
 * @author Anna Rafferty
 *
 */
public class BaseUnknownWordModel implements UnknownWordModel {


  private static final long serialVersionUID = 6355171148751673822L;


  protected static final boolean VERBOSE = false;


  protected boolean useFirst; //= true;
  private final boolean useEnd;
  protected boolean useGT;
  private final boolean useFirstCap; // Only care if first is capitalized


  private int endLength = 2; // only used if useEnd==true


  /** What type of equivalence classing is done in getSignature */
  protected final int unknownLevel;


  protected static final String unknown = "UNK";


  protected static final int nullWord = -1;
  protected static final short nullTag = -1;


  protected static final IntTaggedWord NULL_ITW = new IntTaggedWord(nullWord, nullTag);


  protected final TrainOptions trainOptions;


  protected final Index<String> wordIndex;
  protected final Index<String> tagIndex;


  /**
   * Has counts for taggings in terms of unseen signatures. The IntTagWords are
   * for (tag,sig), (tag,null), (null,sig), (null,null). (None for basic UNK if
   * there are signatures.)
   */
  protected final ClassicCounter<IntTaggedWord> unSeenCounter;


  /** This maps from a tag (as a label) to a Counter from word signatures to
   *  their P(sig|tag), as estimated in the model. For Chinese, the word
   *  signature is just the first character or its unicode type for things
   *  that aren't Chinese characters.
   */
  protected final Map<Label,ClassicCounter<String>> tagHash;


  /** This is the set of all signatures that we have seen. */
  final private Set<String> seenEnd;


  final Map<String,Float> unknownGT;


  /** All classes that implement UnknownWordModel must call the constructor that initializes this variable. */
  private final Lexicon lex;




  public BaseUnknownWordModel(Options op, Lexicon lex,
                              Index<String> wordIndex,
                              Index<String> tagIndex,
                              ClassicCounter<IntTaggedWord> unSeenCounter,
                              Map<Label,ClassicCounter<String>> tagHash,
                              Map<String,Float> unknownGT,
                              Set<String> seenEnd) {
    endLength = op.lexOptions.unknownSuffixSize;
    // TODO: refactor these terms into BaseUnknownWordModelTrainer
    useEnd = (op.lexOptions.unknownSuffixSize > 0 &&
              op.lexOptions.useUnknownWordSignatures > 0);
    useFirstCap = op.lexOptions.useUnknownWordSignatures > 0;
    useGT = (op.lexOptions.useUnknownWordSignatures == 0);
    useFirst = false;
    this.lex = lex;
    this.trainOptions = op.trainOptions;
    this.wordIndex = wordIndex;
    this.tagIndex = tagIndex;
    this.unSeenCounter = unSeenCounter;
    this.tagHash = tagHash;
    this.seenEnd = seenEnd;
    this.unknownGT = unknownGT;
    unknownLevel = op.lexOptions.useUnknownWordSignatures;
  }




  /**
   * This constructor creates an UWM with empty data structures.  Only
   * use if loading in the data separately, such as by reading in text
   * lines containing the data.
   */
  public BaseUnknownWordModel(Options op, Lexicon lex,
                              Index<String> wordIndex,
                              Index<String> tagIndex) {
    this(op, lex, wordIndex, tagIndex,
         new ClassicCounter<IntTaggedWord>(),
         Generics.<Label,ClassicCounter<String>>newHashMap(),
         Generics.<String,Float>newHashMap(),
         Generics.<String>newHashSet());
  }




  /**
   * Currently we don't consider loc or the other parameters in determining
   * score in the default implementation; only English uses them.
   */
  @Override
  public float score(IntTaggedWord itw, int loc, double c_Tseen, double total, double smooth, String word) {
    return score(itw, word);
  }




  // todo [cdm 2010]: Recheck that this method really does the right thing in making a P(W|T) estimate....
  public float score(IntTaggedWord itw, String word) {
    float logProb;


    // Label tag = itw.tagLabel();
    String tagStr = itw.tagString(tagIndex);
    Label tag = new Tag(tagStr);


    // testing
    //EncodingPrintWriter.out.println("Scoring unknown word " + word + " with tag " + tag,encoding);
    // end testing


    if (useEnd || useFirst || useFirstCap) {
      String end = getSignature(word, -1); // The getSignature here doesn't use sentence position
      if (useGT && ! seenEnd.contains(end)) {
        logProb = scoreGT(tagStr);
      } else {
        if ( ! seenEnd.contains(end)) {
          end = unknown;
        }
        //System.out.println("using end-character model for for unknown word "+  word + " for tag " + tag);


        /* get the Counter of terminal rewrites for the relevant tag */
        ClassicCounter<String> wordProbs = tagHash.get(tag);
        /* if the proposed tag has never been seen before, issue a
         * warning and return probability 0
         */
        if (wordProbs == null) {
          System.err.println("Warning: proposed tag is unseen in training data:\t"+tagStr);
          logProb = Float.NEGATIVE_INFINITY;
        } else if (wordProbs.keySet().contains(end)) {
          logProb = (float) wordProbs.getCount(end);
        } else {
          logProb = (float) wordProbs.getCount(unknown);
        }
      }
    } else if (useGT) {
      logProb = scoreGT(tagStr);
    } else {
      System.err.println("Warning: no unknown word model in place!\nGiving the combination " + word + ' ' + tagStr + " zero probability.");
      logProb = Float.NEGATIVE_INFINITY; // should never get this!
    }


    //EncodingPrintWriter.out.println("Unknown word estimate for " + word + " as " + tag + ": " + logProb,encoding); //debugging
    return logProb;
  }




  /** Calculate P(Tag|Signature) with Bayesian smoothing via just P(Tag|Unknown) */
  @Override
  public double scoreProbTagGivenWordSignature(IntTaggedWord iTW, int loc, double smooth, String word) {
    throw new UnsupportedOperationException();
  }




  // todo [cdm 2012, based on error report from Thang]: this is broken because the Label passed in is a Tag, which will never match on the CoreLabel's now in unknownGT.keySet()
  // todo [cdm 2012]: But see if this bug is only if you use Lexicon's main method, or also when training a parser in the usual way.
  protected float scoreGT(String tag) {
    if (VERBOSE) System.err.println("using GT for unknown word and tag " + tag);
    if (unknownGT.containsKey(tag)) {
      return unknownGT.get(tag).floatValue();
    } else {
      return Float.NEGATIVE_INFINITY;
    }
  }


  /**
   * Signature for a specific word; loc parameter is ignored.
   * @param word The word
   * @param loc Its sentence position
   * @return A "signature" (which represents an equivalence class of Strings), e.g., a suffix of the string
   */
  @Override
  public String getSignature(String word, int loc) {
    StringBuilder subStr = new StringBuilder("UNK-");
    int n = word.length() - 1;
    char first = word.charAt(0);
    if (useFirstCap) {
      if (Character.isUpperCase(first) || Character.isTitleCase(first)) {
        subStr.append('C');
      } else {
        subStr.append('c');
      }
    }
    if (useFirst) {
      subStr.append(first);
    }
    if (useEnd) {
      subStr.append(word.substring(n - endLength > 0 ? n - endLength : 0, n));
    }
    return subStr.toString();
  }


  @Override
  public int getSignatureIndex(int wordIndex, int sentencePosition, String word) {
    return 0;
  }


  /**
   * Get the lexicon associated with this unknown word model; usually not used, but
   * might be useful to tell you if a related word is known or unknown, for example.
   */
  @Override
  public Lexicon getLexicon() {
    return lex;
  }




  @Override
  public int getUnknownLevel() {
    return unknownLevel;
  }


  /**
   * Adds the tagging with count to the data structures in this Lexicon.
   */
  @Override
  public void addTagging(boolean seen, IntTaggedWord itw, double count) {
    if (seen) {
      System.err.println("UWM.addTagging: Shouldn't call with seen word!");
   } else {
      unSeenCounter.incrementCount(itw, count);
      // if (itw.tag() == nullTag) {
      // sigs.add(itw);
      // }
    }
  }


  @Override
  public Counter<IntTaggedWord> unSeenCounter() {
    return unSeenCounter;
  }


}
Source Code of edu.stanford.nlp.parser.lexparser.BaseUnknownWordModel

Related Classes of edu.stanford.nlp.parser.lexparser.BaseUnknownWordModel