// Choose the corpus to process. Each candidate below calls
// FileTools.getFilesFromDirectoryByName with a corpus directory and the file
// name "source.xml"; only the roughPubMed assignment is active, the others are
// kept commented out as alternative inputs.
// NOTE(review): the directories are hard-coded absolute paths - consider
// passing the corpus location in instead of editing this list.
//files = FileTools.getFilesFromDirectoryByName(new File("/home/ptc24/oscarworkspace/corpora/paperset1"), "source.xml");
//files = FileTools.getFilesFromDirectoryByName(new File("/home/ptc24/newows/corpora/paperset1"), "source.xml");
//files = FileTools.getFilesFromDirectoryByName(new File("/home/ptc24/newows/corpora/BioIE"), "source.xml");
files = FileTools.getFilesFromDirectoryByName(new File("/home/ptc24/newows/corpora/roughPubMed"), "source.xml");
//files = FileTools.getFilesFromDirectoryByName(new File("/scratch/pubmed/2005"), "source.xml");
// Wrap the selected files in a StringSource; the loop that follows iterates it
// as a sequence of Strings.
// NOTE(review): the meaning of the boolean argument (false) is not visible
// here - confirm against StringSource.
StringSource ss = new StringSource(files, false);
// Bag keyed by String. Presumably the per-word tally filled in by the loop
// that follows - its use is not visible in this span, so confirm.
Bag<String> wordCounts = new Bag<String>();
// Reset the source before the for-each loop starts reading from it.
ss.reset();
for(String s : ss) {
TokenSequence t = Tokeniser.getInstance().tokenise(s);
for(String word : t.getTokenStringList()) {
if(!word.matches(".*[a-z][a-z].*")) continue;
word = StringTools.normaliseName(word);