/*
* LingPipe v. 2.0
* Copyright (C) 2003-5 Alias-i
*
* This program is licensed under the Alias-i Royalty Free License
* Version 1 WITHOUT ANY WARRANTY, without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Alias-i
* Royalty Free License Version 1 for more details.
*
* You should have received a copy of the Alias-i Royalty Free License
* Version 1 along with this program; if not, visit
* http://www.alias-i.com/lingpipe/licenseV1.txt or contact
* Alias-i, Inc. at 181 North 11th Street, Suite 401, Brooklyn, NY 11211,
* +1 (718) 290-9170.
*/
package org.sf.mustru.utils;
import com.aliasi.lm.CompiledNGramProcessLM;
import com.aliasi.lm.NGramProcessLM;
import com.aliasi.spell.CompiledSpellChecker;
import com.aliasi.spell.WeightedEditDistance;
import com.aliasi.tokenizer.Tokenizer;
import com.aliasi.tokenizer.TokenizerFactory;
import com.aliasi.util.AbstractExternalizable;
import com.aliasi.util.Compilable;
import com.aliasi.util.ObjectToCounter;
import com.aliasi.util.ObjectToCounterMap;
import com.aliasi.util.Strings;
import java.io.ObjectInput;
import java.io.ObjectOutput;
import java.io.IOException;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.util.HashSet;
import java.util.Set;
/**
* A <code>TrainSpellChecker</code> instance provides a mechanism for
* collecting training data for a compiled spell checker. Training
* instances are nothing more than character sequences which represent
* likely user queries.
*
* <P>After training, a model is written out through the
* <code>Compilable</code> interface using {@link
* #compileTo(ObjectOutput)}. When this model is read back in, it
* will be an instance of {@link CompiledSpellChecker}. The compiled
* spell checkers allow many runtime parameters to be tuned; see the
* class documentation for full details.
*
* <P>In training the source language model, all training data is
* whitespace normalized with an initial whitespace, final whitespace,
* and all internal whitespace sequences converted to a single space
* character.
*
* <P>A tokenization factory may be optionally specified for training
* token-sensitive spell checkers. With tokenization, input is
* further normalized to insert a single whitespace between all
* tokens not already separated by a space in the input. The tokens
* are then output during compilation and read back into the compiled
* spell checker. The set of tokens output may be pruned to remove
* any below a given count threshold. The resulting set of tokens
* is used to constrain the set of alternative spellings suggested
* during spelling correction to include only tokens in the observed
* token set.
*
* <P>In constructing a spell checker trainer, a compilable weighted
* edit distance must be specified. This edit distance model will be
* compiled along with the language model and token set and used as
* the channel model in the compiled spell checker.
*
* <P>As an alternative to using the spell checker, a language model
* may be trained directly and supplied in compiled form along with
* a weighted edit distance to the public constructors for compiled
* spell checkers.
*
* @author Bob Carpenter
* @version 2.0
* @since LingPipe2.0
*/
public class TrainSpellChecker implements Compilable, Serializable {

    private static final long serialVersionUID = 4907338741905144267L;

    private final WeightedEditDistance mEditDistance;
    private final NGramProcessLM mLM;
    private final TokenizerFactory mTokenizerFactory;
    private final ObjectToCounterMap mTokenCounter = new ObjectToCounterMap();

    /**
     * Construct a non-tokenizing spell checker trainer from the
     * specified language model and edit distance.
     *
     * @param lm Compilable language model.
     * @param editDistance Compilable weighted edit distance.
     * @throws IllegalArgumentException If the edit distance is
     * <code>null</code> or not compilable.
     */
    public TrainSpellChecker(NGramProcessLM lm,
                             WeightedEditDistance editDistance) {
        this(lm,editDistance,null);
    }

    /**
     * Construct a spell checker trainer from the specified n-gram
     * process language model, tokenizer factory and edit distance.
     * The language model must be an instance of the character-level
     * n-gram process language model class.  The edit distance must be
     * compilable.  The tokenizer factory may be <code>null</code>, in
     * which case tokens are not saved as part of training and the
     * compiled spell checker is not token sensitive.  If the
     * tokenizer factory is specified, it must be compilable.
     *
     * @param lm Compilable language model.
     * @param editDistance Compilable weighted edit distance.
     * @param tokenizerFactory Optional tokenizer factory.
     * @throws IllegalArgumentException If the edit distance is
     * <code>null</code> or not compilable, or if the tokenizer factory
     * is non-null and not compilable.
     */
    public TrainSpellChecker(NGramProcessLM lm,
                             WeightedEditDistance editDistance,
                             TokenizerFactory tokenizerFactory) {
        assertCompilable("Edit distance",editDistance);
        // The tokenizer factory is optional; only a supplied one is checked.
        if (tokenizerFactory != null) {
            assertCompilable("Tokenizer factory",tokenizerFactory);
        }
        mLM = lm;
        mTokenizerFactory = tokenizerFactory;
        mEditDistance = editDistance;
    }

    /**
     * Returns the counter for the tokens in the training set.  This
     * may be used to print out the tokens with their counts for later
     * perusal.  The value returned is the actual counter, so any
     * changes made to it will be reflected in this spell checker.
     * Pruning the token counts may have eliminated tokens in the
     * training data from the counter.
     *
     * @return The counter for the tokens in the training set.
     */
    public ObjectToCounter tokenCounter() {
        return mTokenCounter;
    }

    /**
     * Train the spelling checker on the specified character sequence.
     * The sequence is normalized by normalizing all whitespace
     * sequences to a single space character and inserting an initial
     * and final whitespace.  If a tokenization factory is specified,
     * a single space character is inserted between any tokens
     * not already separated by a white space.
     *
     * @param cSeq Character sequence for training.
     */
    public void train(CharSequence cSeq) {
        mLM.train(normalizeQuery(cSeq));
    }

    /**
     * Prunes the set of collected tokens of all tokens with count
     * less than the specified minimum.  If there was no tokenization
     * factory specified for this spell checker, this method will
     * have no effect.
     *
     * @param minCount Minimum count of preserved token.
     */
    public void pruneTokens(int minCount) {
        mTokenCounter.prune(minCount);
    }

    /**
     * Prunes the underlying character language model to remove
     * substring counts of less than the specified minimum.
     *
     * @param minCount Minimum count of preserved substrings.
     */
    public void pruneLM(int minCount) {
        mLM.substringCounter().prune(minCount);
    }

    /**
     * Writes a compiled spell checker to the specified object output.
     * The class of the spell checker read back in is {@link
     * CompiledSpellChecker}.
     *
     * @param objOut Object output to which this spell checker is
     * written.
     * @throws IOException If there is an I/O error while writing.
     */
    public void compileTo(ObjectOutput objOut) throws IOException {
        objOut.writeObject(new Externalizer(this));
    }

    /**
     * Writes the underlying n-gram process language model to the
     * specified output stream in its uncompiled form.  The model is
     * read back using the static <code>readFrom</code> method in the
     * <code>NGramProcessLM</code> class.  Only the language model is
     * written; the token counts and edit distance are not.
     *
     * @param objOut Output stream to which the language model is
     * written.
     * @throws IOException If there is an I/O error while writing.
     */
    public void dumpTo(ObjectOutputStream objOut) throws IOException {
        mLM.writeTo(objOut);
    }

    /**
     * Returns the normalized form of the specified character sequence
     * used for training the language model.  The result always starts
     * with a single space.  Without a tokenizer factory, the
     * whitespace-normalized input is appended, followed by a single
     * space.  With a tokenizer factory, each token produced for the
     * input is appended followed by a single space, and as a side
     * effect the count of each token is incremented in the token
     * counter.
     *
     * @param cSeq Character sequence to normalize.
     * @return Buffer holding the normalized sequence.
     */
    StringBuffer normalizeQuery(CharSequence cSeq) {
        StringBuffer sb = new StringBuffer();
        sb.append(' ');
        if (mTokenizerFactory != null) {
            char[] cs = Strings.toCharArray(cSeq);
            Tokenizer tokenizer = mTokenizerFactory.tokenizer(cs,0,cs.length);
            String nextToken;
            while ((nextToken = tokenizer.nextToken()) != null) {
                mTokenCounter.increment(nextToken);
                sb.append(nextToken);
                sb.append(' ');
            }
        } else {
            Strings.normalizeWhitespace(cSeq,sb);
            sb.append(' ');
        }
        return sb;
    }

    /**
     * Verifies that the specified object implements {@link Compilable}.
     * A <code>null</code> argument is treated as not compilable and is
     * reported with the same exception type, rather than failing with a
     * <code>NullPointerException</code> while building the message.
     *
     * @param description Name of the argument, used as the prefix of
     * the error message.
     * @param x Object to check; may be <code>null</code>.
     * @throws IllegalArgumentException If the object is
     * <code>null</code> or does not implement <code>Compilable</code>.
     */
    static void assertCompilable(String description, Object x) {
        if (x instanceof Compilable) {
            return;
        }
        // instanceof is false for null, so guard the getClass() call.
        String found = (x == null) ? "null" : String.valueOf(x.getClass());
        String msg = description
            + " must implement com.aliasi.util.Compilable."
            + " Found class=" + found;
        throw new IllegalArgumentException(msg);
    }

    /**
     * Serialization proxy used by {@link #compileTo(ObjectOutput)}.
     * It writes the compiled language model, a flag indicating whether
     * a tokenizer factory was configured, the token set when that flag
     * is set, and the compiled edit distance; reading them back in the
     * same order yields a {@link CompiledSpellChecker}.
     */
    static class Externalizer extends AbstractExternalizable {

        // NOTE(review): same value as the enclosing class; left unchanged
        // so that previously written data stays readable.
        private static final long serialVersionUID = 4907338741905144267L;

        private final TrainSpellChecker mTrainer;

        /**
         * Constructs an externalizer with no trainer.  An instance
         * built this way has nothing to write; only {@link
         * #read(ObjectInput)} is usable on it.
         */
        public Externalizer() {
            this(null);
        }

        /**
         * Constructs an externalizer for the specified trainer.
         *
         * @param trainer Trainer whose state is written out.
         */
        public Externalizer(TrainSpellChecker trainer) {
            mTrainer = trainer;
        }

        /**
         * Writes the trainer's compiled state to the specified output.
         * The order of writes here must match the order of reads in
         * {@link #read(ObjectInput)}.
         *
         * @param objOut Object output to which the state is written.
         * @throws IOException If there is an I/O error while writing.
         */
        public void writeExternal(ObjectOutput objOut) throws IOException {
            mTrainer.mLM.compileTo(objOut);
            boolean tokenizing = mTrainer.mTokenizerFactory != null;
            objOut.writeBoolean(tokenizing);
            if (tokenizing) {
                // Copy the keys into a fresh HashSet so that only the
                // tokens, not their counts, are written.
                Set keySet = mTrainer.mTokenCounter.keySet();
                objOut.writeObject(new HashSet(keySet));
            }
            // Compilability was checked in the trainer's constructor.
            ((Compilable) mTrainer.mEditDistance).compileTo(objOut);
        }

        /**
         * Reads the compiled language model, the optional token set
         * and the edit distance, and assembles them into a compiled
         * spell checker.  The token set is <code>null</code> when the
         * trainer had no tokenizer factory.
         *
         * @param objIn Object input from which the state is read.
         * @return The compiled spell checker.
         * @throws ClassNotFoundException If a class required for
         * deserialization cannot be found.
         * @throws IOException If there is an I/O error while reading.
         */
        public Object read(ObjectInput objIn)
            throws ClassNotFoundException, IOException {

            CompiledNGramProcessLM lm
                = (CompiledNGramProcessLM) objIn.readObject();
            boolean tokenizing = objIn.readBoolean();
            Set tokenSet = tokenizing
                ? (Set) objIn.readObject()
                : null;
            WeightedEditDistance editDistance
                = (WeightedEditDistance) objIn.readObject();
            return new CompiledSpellChecker(lm,editDistance,tokenSet);
        }
    }
}