Source Code of org.sf.mustru.utils.TrainSpellChecker$Externalizer

/*
 * LingPipe v. 2.0
 * Copyright (C) 2003-5 Alias-i
 *
 * This program is licensed under the Alias-i Royalty Free License
 * Version 1 WITHOUT ANY WARRANTY, without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the Alias-i
 * Royalty Free License Version 1 for more details.
 * 
 * You should have received a copy of the Alias-i Royalty Free License
 * Version 1 along with this program; if not, visit
 * http://www.alias-i.com/lingpipe/licenseV1.txt or contact
 * Alias-i, Inc. at 181 North 11th Street, Suite 401, Brooklyn, NY 11211,
 * +1 (718) 290-9170.
 */


package org.sf.mustru.utils;


import com.aliasi.lm.CompiledNGramProcessLM;
import com.aliasi.lm.NGramProcessLM;


import com.aliasi.spell.CompiledSpellChecker;
import com.aliasi.spell.WeightedEditDistance;
import com.aliasi.tokenizer.Tokenizer;
import com.aliasi.tokenizer.TokenizerFactory;


import com.aliasi.util.AbstractExternalizable;
import com.aliasi.util.Compilable;
import com.aliasi.util.ObjectToCounter;
import com.aliasi.util.ObjectToCounterMap;
import com.aliasi.util.Strings;


import java.io.ObjectInput;
import java.io.ObjectOutput;
import java.io.IOException;
import java.io.ObjectOutputStream;
import java.io.Serializable;


import java.util.HashSet;
import java.util.Set;


/**
 * A <code>TrainSpellChecker</code> instance provides a mechanism for
 * collecting training data for a compiled spell checker.  Training
 * instances are nothing more than character sequences which represent
 * likely user queries.  
 *
 * <P>After training, a model is written out through the
 * <code>Compilable</code> interface using {@link
 * #compileTo(ObjectOutput)}.  When this model is read back in, it
 * will be an instance of {@link CompiledSpellChecker}.  The compiled
 * spell checkers allow many runtime parameters to be tuned; see the
 * class documentation for full details.
 * 
 * <P>In training the source language model, all training data is
 * whitespace normalized with an initial whitespace, final whitespace,
 * and all internal whitespace sequences converted to a single space
 * character.
 *
 * <P>A tokenization factory may be optionally specified for training
 * token-sensitive spell checkers.  With tokenization, input is
 * further normalized to insert a single whitespace between all
 * tokens not already separated by a space in the input.  The tokens
 * are then output during compilation and read back into the compiled
 * spell checker.  The set of tokens output may be pruned to remove
 * any below a given count threshold.  The resulting set of tokens
 * is used to constrain the set of alternative spellings suggested
 * during spelling correction to include only tokens in the observed
 * token set.  
 *
 * <P>In constructing a spell checker trainer, a compilable weighted
 * edit distance must be specified.  This edit distance model will be
 * compiled along with the language model and token set and used as
 * the channel model in the compiled spell checker.
 *
 * <P>As an alternative to using the spell checker, a language model
 * may be trained directly and supplied in compiled form along with
 * a weighted edit distance to the public constructors for compiled
 * spell checkers.
 *
 * @author Bob Carpenter
 * @version 2.0
 * @since   LingPipe2.0
 */
public class TrainSpellChecker implements Compilable, Serializable {
    private static final long serialVersionUID = 4907338741905144267L;
    private final WeightedEditDistance mEditDistance;
    private final NGramProcessLM mLM;
    private final TokenizerFactory mTokenizerFactory;
    private final ObjectToCounterMap mTokenCounter = new ObjectToCounterMap();


    /**
     * Construct a non-tokenizing spell checker trainer from the
     * specified language model and edit distance. 
     *
     * @param lm Compilable language model.
     * @param editDistance Compilable weighted edit distance.
     * @throws IllegalArgumentException If the edit distance is not
     * compilable.
     */
    public TrainSpellChecker(NGramProcessLM lm,
           WeightedEditDistance editDistance) {
  this(lm,editDistance,null);
    }


    /**
     * Construct a spell checker trainer from the specified n-gram
     * process language model, tokenizer factory and edit distance.
     * The language model must be an instance of the character-level
     * n-gram process language model class.  The edit distance must be
     * compilable.  The tokenizer factory may be <code>null</code>, in
     * which case tokens are not saved as part of training and the
     * compiled spell checker is not token sensitive.  If the
     * tokenizer factory is specified, it must be compilable.
     *
     * @param lm Compilable language model.
     * @param editDistance Compilable weighted edit distance.
     * @param tokenizerFactory Optional tokenizer factory.
     * @throws IllegalArgumentException If the edit distance is not
     * compilable or if the tokenizer factory is non-null and not compilable.
     */
    public TrainSpellChecker(NGramProcessLM lm,
           WeightedEditDistance editDistance, 
           TokenizerFactory tokenizerFactory) {
  assertCompilable("Edit distance",editDistance);
  if (tokenizerFactory != null)
      assertCompilable("Tokenizer factory",tokenizerFactory);
  mLM = lm;
  mTokenizerFactory = tokenizerFactory;
  mEditDistance = editDistance;
    }




    /**
     * Returns the counter for the tokens in the training set.  This
     * may be used to print out the tokens with their counts for later
     * perusal.  The value returned is the actual counter, so any
     * changes made to it will be reflected in this spell checker.
     * Pruning the token counts may have eliminated tokens in the
     * training data from the counter.
     *
     * @return The counter for the tokens in the training set.
     */
    public ObjectToCounter tokenCounter() {
  return mTokenCounter;
    }


    /**
     * Train the spelling checker on the specified character sequence.
     * The sequence is normalized by normalizing all whitespace
     * sequences to a single space character and inserting an initial
     * and final whitespace.  If a tokenization factory is specified,
     * a single space character is insterted between any tokens
     * not already separated by a white space.
     *
     * @param cSeq Character sequence for training.
     */
    public void train(CharSequence cSeq) {
  mLM.train(normalizeQuery(cSeq));
    }


    /**
     * Prunes the set of collected tokens of all tokens with count
     * less than the specified minimum.  If there was no tokenization
     * factory specified for this spell checker, this method will
     * have no effect.
     *
     * @param minCount Minimum count of preserved token.
     */
    public void pruneTokens(int minCount) {
  mTokenCounter.prune(minCount);
    }


    /**
     * Prunes the underlying character language model to remove
     * substring counts of less than the specified minimum.
     *
     * @param minCount Minimum count of preserved substrings.
     */
    public void pruneLM(int minCount) {
  mLM.substringCounter().prune(minCount);
    }


    /**
     * Writes a compiled spell checker to the specified object output.
     * The class of the spell checker read back in is {@link
     * CompiledSpellChecker}.
     *
     * @param objOut Object output to which this spell checker is
     * written.
     * @throws IOException If there is an I/O error while writing.
     */
    public void compileTo(ObjectOutput objOut) throws IOException {
  objOut.writeObject(new Externalizer(this));
    }


    /**
     * Writes the NGramProcess language model to the output stream
     * The class is read back using the static <code> readFrom </code>
     * method in the NGramProcessLM class.
     * @param objOut
     * @throws IOException
     */
    public void dumpTo(ObjectOutputStream objOut) throws IOException {
  mLM.writeTo(objOut);
    }


    StringBuffer normalizeQuery(CharSequence cSeq) {
  StringBuffer sb = new StringBuffer();
  sb.append(' ');
  if (mTokenizerFactory != null) {
      char[] cs = Strings.toCharArray(cSeq);
      Tokenizer tokenizer = mTokenizerFactory.tokenizer(cs,0,cs.length);
      String nextToken;
      while ((nextToken = tokenizer.nextToken()) != null) {
    mTokenCounter.increment(nextToken);
    sb.append(nextToken);
    sb.append(' '); 
      }
  } else {
      Strings.normalizeWhitespace(cSeq,sb);
      sb.append(' ');
  }
  return sb;
    }


    static void assertCompilable(String description, Object x) {
  if (!(x instanceof Compilable)) {
      String msg = description
    + " must implement com.aliasi.util.Compilable."
    + " Found class=" + x.getClass();
      throw new IllegalArgumentException(msg);
  }
    }


    static class Externalizer extends AbstractExternalizable {
  private static final long serialVersionUID = 4907338741905144267L;
  private final TrainSpellChecker mTrainer;
  public Externalizer() { 
      this(null);
  }
  public Externalizer(TrainSpellChecker trainer) {
      mTrainer = trainer;
  }
  public void writeExternal(ObjectOutput objOut) throws IOException {
      mTrainer.mLM.compileTo(objOut);
      boolean tokenizing = mTrainer.mTokenizerFactory != null;
      objOut.writeBoolean(tokenizing);
      if (tokenizing) {
    Set keySet = mTrainer.mTokenCounter.keySet();
    objOut.writeObject(new HashSet(keySet));
      }
      ((Compilable) mTrainer.mEditDistance).compileTo(objOut);
  }
  public Object read(ObjectInput objIn) 
      throws ClassNotFoundException, IOException {


      CompiledNGramProcessLM lm 
    = (CompiledNGramProcessLM) objIn.readObject();
      boolean tokenizing = objIn.readBoolean();
      Set tokenSet = tokenizing 
    ? (Set) objIn.readObject()
    : null;
      WeightedEditDistance editDistance 
    = (WeightedEditDistance) objIn.readObject();
      return new CompiledSpellChecker(lm,editDistance,tokenSet);
  }
    }
    
}
Source Code of org.sf.mustru.utils.TrainSpellChecker$Externalizer

Related Classes of org.sf.mustru.utils.TrainSpellChecker$Externalizer