/**
 * Reads the CSV file {@code inCSV} row by row and updates the word statistics.
 *
 * <p>{@code loadBagOfWords()} and {@code loadStopWords()} are called once before
 * the rows are read, and {@code listBagOfWords()} once after the reader is closed.
 *
 * <p>For each row:
 * <ul>
 *   <li>the document id is the concatenation of fields 0 and 1 (also written
 *       back into field 0);</li>
 *   <li>field 3 is lower-cased, stop words standing alone between whitespace
 *       (or at the start/end of the text) are removed, and the text is split on
 *       runs of non-ASCII-letter characters;</li>
 *   <li>tokens shorter than 3 characters are dropped, the rest are stemmed with
 *       the English Snowball stemmer, and stems not in {@code loadedWords} are
 *       ignored;</li>
 *   <li>{@code stemWordsTotal} counts every occurrence of a stem,
 *       {@code stemWords} counts the distinct document ids a stem appears in,
 *       and {@code documentsWords} holds those ids as a space-separated list.</li>
 * </ul>
 *
 * <p>Rows with fewer than four fields are skipped.
 *
 * @throws IOException propagated from the I/O calls made by this method
 */
public void run() throws IOException
{
    loadBagOfWords();
    loadStopWords();

    // Everything below is identical for every row, so it is built only once.
    // A stop word is matched literally (quoted) and only when it is not glued to
    // other non-whitespace characters; the lookarounds do not consume the
    // surrounding whitespace, so adjacent stop words are all removed.
    Pattern[] stopWordPatterns = new Pattern[stopWords.size()];
    for(int i = 0; i < stopWordPatterns.length; i++)
    {
        stopWordPatterns[i] = Pattern.compile(
            "(?<!\\S)" + Pattern.quote(String.valueOf(stopWords.get(i))) + "(?!\\S)");
    }
    Pattern nonLetters = Pattern.compile("[^a-zA-Z]+");
    SnowballStemmer stemmer = new org.tartarus.snowball.ext.englishStemmer();

    CSVFileReader in = new CSVFileReader(inCSV, ',');
    try
    {
        for(Vector<String> fields = in.readFields(); fields != null; fields = in.readFields())
        {
            // Fields 0, 1 and 3 are required; a shorter row cannot be processed.
            if(fields.size() < 4)
            {
                continue;
            }

            // generate here the id of the element
            String docId = fields.get(0) + fields.get(1);
            fields.set(0, docId);

            // Locale.ROOT keeps the lower-casing independent of the default
            // locale, so ASCII letters always stay inside [a-z].
            String text = fields.get(3).toLowerCase(java.util.Locale.ROOT);
            for(int i = 0; i < stopWordPatterns.length; i++)
            {
                text = stopWordPatterns[i].matcher(text).replaceAll(" ");
            }

            // Stems already indexed for this row; repeated words only need to
            // bump the total counter.
            java.util.Set<String> seenInRow = new java.util.HashSet<String>();

            String[] words = nonLetters.split(text);
            for(int i = 0; i < words.length; i++)
            {
                if(words[i].length() < 3)
                {
                    continue;
                }
                stemmer.setCurrent(words[i]);
                stemmer.stem();
                String stemmed = stemmer.getCurrent();
                if(!loadedWords.contains(stemmed))
                {
                    continue;
                }

                // Total number of occurrences across all documents.
                if(stemWordsTotal.containsKey(stemmed))
                {
                    stemWordsTotal.put(stemmed, stemWordsTotal.get(stemmed) + 1);
                }
                else
                {
                    stemWordsTotal.put(stemmed, 1);
                }

                // Empty stems are counted above but never indexed; a stem seen
                // earlier in this row has already been indexed for docId.
                if(stemmed.length() == 0 || !seenInRow.add(stemmed))
                {
                    continue;
                }

                if(documentsWords.containsKey(stemmed))
                {
                    // Compare whole space-delimited ids so that an id which is
                    // merely a substring of another one (e.g. "a1" in "a10")
                    // is not mistaken for an existing entry.
                    String docs = documentsWords.get(stemmed);
                    if(!(" " + docs + " ").contains(" " + docId + " "))
                    {
                        documentsWords.put(stemmed, docs + " " + docId);
                        stemWords.put(stemmed, stemWords.get(stemmed) + 1);
                    }
                }
                else
                {
                    stemWords.put(stemmed, 1);
                    documentsWords.put(stemmed, docId);
                }
            }
        }
    }
    finally
    {
        // Release the reader even when a row fails to process.
        in.close();
    }
    listBagOfWords();
}