// Example pipeline: configure the tool, build a word list from two document
// sets, store the word list, then write TF-IDF word vectors to a file.
WVTConfiguration config = new WVTConfiguration();
// NOTE(review): these two stemmers are not referenced in the visible part of
// this example; the stemmer step below is configured with a
// ToLowerCaseConverter instead. Confirm whether later code still needs them.
final WVTStemmer dummyStemmer = new DummyStemmer();
final WVTStemmer porterStemmer = new PorterStemmerWrapper();
// Use a fixed component (a lower-case converter) for the stemmer step
config.setConfigurationRule(WVTConfiguration.STEP_STEMMER, new WVTConfigurationFact(new ToLowerCaseConverter()));
// Initialize the input list. The constructor argument is 0 -- presumably the
// number of classes; the entries below carry no class label (TODO confirm
// against WVTFileInputList).
WVTFileInputList list = new WVTFileInputList(0);
// Add entries (arguments are presumably: source location, content type,
// character encoding, language -- verify against WVTDocumentInfo)
list.addEntry(new WVTDocumentInfo("../data/german_utf8", "txt", "utf-8", "german"));
list.addEntry(new WVTDocumentInfo("../data/german_iso", "txt", "iso-8859-1", "german"));
// Generate the word list
WVTWordList wordList = wvt.createWordList(list, config);
// Prune the word list
//wordList.pruneByFrequency(2, 5);
// Alternative I: read an already created word list from a file
// WVTWordList wordList2 =
// new WVTWordList(new FileReader("/home/wurst/tmp/wordlisttest.txt"));
// Alternative II: use predefined dimensions
// List dimensions = new Vector();
// dimensions.add("atheist");
// dimensions.add("christian");
// wordList =
// wvt.createWordList(list, config, dimensions, false);
// Store the word list in a file. The writer is closed explicitly so the file
// handle is released and buffered output reaches the disk even if storePlain
// throws; closing an already-closed FileWriter has no effect, so this is safe
// whether or not storePlain closes the writer itself.
FileWriter wordListFile = new FileWriter("wordlist.txt");
try {
    wordList.storePlain(wordListFile);
} finally {
    wordListFile.close();
}
// Create the word vectors
// Set up an output filter (write sparse vectors to a file)
// NOTE(review): neither outFile nor wvw is closed in the visible part of this
// example -- confirm that the writer is closed after vector creation.
FileWriter outFile = new FileWriter("wv.txt");
WordVectorWriter wvw = new WordVectorWriter(outFile, true);
config.setConfigurationRule(WVTConfiguration.STEP_OUTPUT, new WVTConfigurationFact(wvw));
// Weight the vectors with the TFIDF vector creation component
config.setConfigurationRule(WVTConfiguration.STEP_VECTOR_CREATION, new WVTConfigurationFact(new TFIDF()));
// Create the vectors
wvt.createVectors(list, config, wordList);
// Alternatively: create word list and vectors together