package dmt.features.definitions;
import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import java.util.Vector;
import java.util.regex.Pattern;
import org.tartarus.snowball.SnowballStemmer;
import dmt.config.Configuration;
import dmt.features.Feature;
import dmt.tools.CSVFileReader;
import dmt.tools.CSVFileWriter;
/**
 * Feature "Final Number of Chars": the total number of characters that remain
 * in a row's text after lower-casing, stop-word removal, splitting on
 * non-letters, dropping words shorter than three characters and stemming the
 * rest with the Snowball English stemmer.
 *
 * <p>As a side effect, rows for which {@code VerifyParents.getParent} returns
 * {@code null} also feed class-wide stem statistics that
 * {@link #printWords()} later writes out.
 *
 * <p>All statistics live in unsynchronized static fields, so this class is not
 * safe for concurrent use.
 */
public class NumberOfCharsStemmedFeature extends Feature
{
    /** Separates words: any run of characters that are not ASCII letters. */
    private static final Pattern NON_LETTERS = Pattern.compile("[^a-zA-Z]+");

    /** Words shorter than this many characters are ignored. */
    private static final int MIN_WORD_LENGTH = 3;

    /** One compiled pattern per stop word, in list order; {@code null} until loaded. */
    private static Vector<Pattern> stopWordPatterns = null;

    /** Stem -> number of distinct ids (field 0) of the rows in which it occurred. */
    private static HashMap<String, Integer> stemWords = new HashMap<String, Integer>();

    /** Stem -> total number of occurrences over all counted rows. */
    private static HashMap<String, Integer> stemWordsTotal = new HashMap<String, Integer>();

    /** Stem -> ids (field 0) of the rows in which it has already been counted. */
    private static HashMap<String, Set<String>> documentsWords = new HashMap<String, Set<String>>();

    /**
     * @return the single feature name, "Final Number of Chars"
     */
    @Override
    public String[] getNames()
    {
        return new String[] { "Final Number of Chars" };
    }

    /**
     * Computes the feature value for one row.
     *
     * @param fields  the row; field 3 is the text to analyse, field 1 is passed
     *                to {@code VerifyParents.getParent}, and field 0 is used as
     *                the row's identifier in the stem statistics (presumably a
     *                document id; confirm against the input format)
     * @param hashMap not used by this feature
     * @return a one-element array holding the summed length of all stemmed
     *         words as an {@link Integer}
     * @throws IllegalStateException if the stop-word list cannot be read
     */
    @Override
    public Object[] getValues(Vector<String> fields,
            HashMap<String, Boolean> hashMap)
    {
        // Locale.ROOT keeps the ASCII-only letter split below independent of
        // the JVM default locale (e.g. Turkish dotless i).
        String text = fields.get(3).toLowerCase(Locale.ROOT);
        loadStopWords();
        for (Pattern stopWord : stopWordPatterns)
        {
            text = stopWord.matcher(text).replaceAll(" ");
        }

        SnowballStemmer stemmer = new org.tartarus.snowball.ext.englishStemmer();

        // Evaluated once per row instead of twice per word.
        // NOTE(review): assumes getParent is a side-effect-free lookup; confirm.
        boolean collectStatistics = VerifyParents.getParent(fields.get(1)) == null;
        String documentId = fields.get(0);

        // Only the total length is needed, so sum lengths instead of
        // concatenating the stems into a throw-away string.
        int totalChars = 0;
        for (String word : NON_LETTERS.split(text))
        {
            if (word.length() < MIN_WORD_LENGTH)
            {
                continue;
            }
            stemmer.setCurrent(word);
            stemmer.stem();
            String stemmed = stemmer.getCurrent();
            if (collectStatistics)
            {
                recordStem(stemmed, documentId);
            }
            totalChars += stemmed.length();
        }
        return new Object[] { Integer.valueOf(totalChars) };
    }

    /**
     * Counts one occurrence of {@code stem} in the row identified by
     * {@code documentId}: always bumps the total count, and bumps the
     * distinct-row count only the first time this id is seen for the stem.
     */
    private static void recordStem(String stem, String documentId)
    {
        increment(stemWordsTotal, stem);

        // A set gives exact id matching; a substring search over joined ids
        // would treat id "1" as already present once "12" had been recorded.
        Set<String> documents = documentsWords.get(stem);
        if (documents == null)
        {
            documents = new HashSet<String>();
            documentsWords.put(stem, documents);
        }
        if (documents.add(documentId))
        {
            increment(stemWords, stem);
        }
    }

    /** Adds one to the counter stored under {@code key}, starting from zero. */
    private static void increment(HashMap<String, Integer> counts, String key)
    {
        Integer current = counts.get(key);
        counts.put(key, Integer.valueOf(current == null ? 1 : current.intValue() + 1));
    }

    /**
     * Loads the stop-word list on first use and compiles one pattern per word.
     * Later calls return immediately.
     *
     * @throws IllegalStateException if the list cannot be read; failing here
     *         with the cause attached replaces the bare
     *         {@code NullPointerException} the caller would otherwise hit
     */
    private static void loadStopWords()
    {
        if (stopWordPatterns != null)
        {
            return;
        }
        Vector<String> words;
        try
        {
            CSVFileReader in = new CSVFileReader(Configuration.STOP_WORDS_LIST_WITH_CONTRACTIONS, ',');
            try
            {
                words = in.readFields();
            }
            finally
            {
                // Close the reader even when reading fails.
                in.close();
            }
        }
        catch (IOException e)
        {
            throw new IllegalStateException("Unable to load stop words from "
                    + Configuration.STOP_WORDS_LIST_WITH_CONTRACTIONS, e);
        }

        Vector<Pattern> patterns = new Vector<Pattern>();
        if (words != null)
        {
            for (String word : words)
            {
                if (word == null || word.length() == 0)
                {
                    continue;
                }
                // Pattern.quote makes the word match literally. The lookarounds
                // require whitespace or a text boundary on both sides without
                // consuming it, so a stop word at the start or end of the text,
                // or repeated back to back, is removed as well.
                patterns.add(Pattern.compile("(?<!\\S)" + Pattern.quote(word) + "(?!\\S)"));
            }
        }
        stopWordPatterns = patterns;
    }

    /**
     * Writes the collected stem statistics to
     * {@code Configuration.OUTPUT_BAG_OF_WORDS_ALL}, one row per stem: the
     * stem, the number of distinct row ids it occurred in, and its total
     * number of occurrences.
     *
     * @throws IOException if the output cannot be written or closed
     */
    public static void printWords() throws IOException
    {
        CSVFileWriter out = new CSVFileWriter(Configuration.OUTPUT_BAG_OF_WORDS_ALL, ',');
        try
        {
            for (String stem : stemWords.keySet())
            {
                Vector<String> fields = new Vector<String>();
                fields.add(stem);
                fields.add(stemWords.get(stem).toString());
                // Both maps are only ever updated together, so every stem in
                // stemWords also has a total.
                fields.add(stemWordsTotal.get(stem).toString());
                out.writeFields(fields);
            }
        }
        finally
        {
            // Close the writer even when a row fails to write.
            out.close();
        }
    }
}