.getProperty("compact.attribute.nums"));
String format = props.getProperty("format");
// load the lexicon
Lexicon lexicon = new Lexicon(lexiconF);
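// configure the term weighting used when vectors are generated
// (falls back to TF-IDF when no scheme is specified)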
String weightingScheme = props.getProperty(
        "classification_weight_scheme", "tfidf");
WeightingMethod method = WeightingMethod
        .methodFromString(weightingScheme);
lexicon.setMethod(method);
// load the raw file as a training corpus
FileTrainingCorpus ftc = new FileTrainingCorpus(new File(raw));
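// optional attribute selection: keep only the N highest-scoring
// attributes (-1, the default, disables this step)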
int keepNBestAttributes = Integer.parseInt(props.getProperty(
        "keepNBestAttributes", "-1"));
if (keepNBestAttributes != -1) {
    // score the attributes with the log-likelihood ratio and keep
    // only the N best ones
    AttributeScorer scorer = logLikelihoodAttributeScorer.getScorer(
            ftc, lexicon);
    lexicon.setAttributeScorer(scorer);
    lexicon.applyAttributeFilter(scorer, keepNBestAttributes);
} else {
    // no attribute scoring: prune the lexicon by document frequency
    // instead, dropping terms whose document frequency is below
    // classification_minFreq (no upper bound)
    int minFreq = Integer.parseInt(props
            .getProperty("classification_minFreq"));
    int maxFreq = Integer.MAX_VALUE;
    lexicon.pruneTermsDocFreq(minFreq, maxFreq);
}
// change the attribute indices to remove gaps between them
Map<Integer, Integer> equiv = null;
if (compact) {
    // keep the mapping between old and new attribute indices so the
    // vectors can be written with the compacted ones
    equiv = lexicon.compact();
}
// save the modified lexicon file
if (newLexicon != null)
    lexicon.saveToFile(newLexicon);
// dump a new vector file
Utils.writeExamples(ftc, lexicon, true, vector_location, equiv, format);
}
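// For reference, the configuration properties consumed by the step above
// (descriptions inferred from how each value is used in this method):
//   compact.attribute.nums       - whether to compact the attribute indices
//   format                       - output format passed to Utils.writeExamples
//   classification_weight_scheme - term weighting scheme (default "tfidf")
//   keepNBestAttributes          - keep only the N best-scoring attributes
//                                  (default -1, i.e. disabled)
//   classification_minFreq       - minimum document frequency used when
//                                  pruning the lexicon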