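/**
* Determines the dictionary from the instances of the input format. Each
* non-missing value of an attribute in the selected range is tokenized; the
* tokens are optionally lower-cased, then stemmed, and optionally filtered
* against the stopword list.
*/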
private void determineDictionary() {
// initialize stopwords
Stopwords stopwords = new Stopwords();
if (getUseStoplist()) {
try {
if (getStopwords().exists() && !getStopwords().isDirectory())
stopwords.read(getStopwords());
}
catch (Exception e) {
e.printStackTrace();
}
}
// Operate on a per-class basis if class attribute is set
int classInd = getInputFormat().classIndex();
int values = 1;
if (!m_doNotOperateOnPerClassBasis && (classInd != -1)) {
values = getInputFormat().attribute(classInd).numValues();
}
//TreeMap dictionaryArr [] = new TreeMap[values];
TreeMap [] dictionaryArr = new TreeMap[values];
for (int i = 0; i < values; i++) {
dictionaryArr[i] = new TreeMap();
}
// Make sure we know which fields to convert
determineSelectedRange();
// Tokenize all training text into an orderedMap of "words".
long pruneRate =
Math.round((m_PeriodicPruningRate/100.0)*getInputFormat().numInstances());
for (int i = 0; i < getInputFormat().numInstances(); i++) {
Instance instance = getInputFormat().instance(i);
int vInd = 0;
if (!m_doNotOperateOnPerClassBasis && (classInd != -1)) {
vInd = (int)instance.classValue();
}
// Iterate through all relevant string attributes of the current instance
Hashtable h = new Hashtable();
for (int j = 0; j < instance.numAttributes(); j++) {
if (m_SelectedRange.isInRange(j) && (instance.isMissing(j) == false)) {
// Hand the attribute's string value to the tokenizer
m_Tokenizer.tokenize(instance.stringValue(j));
// Iterate through tokens, perform stemming, and remove stopwords
// (if required)
while (m_Tokenizer.hasMoreElements()) {
String word = ((String)m_Tokenizer.nextElement()).intern();
if(this.m_lowerCaseTokens==true)
word = word.toLowerCase();
word = m_Stemmer.stem(word);
if(this.m_useStoplist==true)
if(stopwords.is(word))
continue;
if(!(h.contains(word)))
h.put(word, new Integer(0));