import java.io.File;
import LBJ2.nlp.*;
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Scanner;
import java.util.Set;
import java.util.TreeSet;
import org.apache.commons.collections4.Bag;
import org.apache.commons.collections4.bag.HashBag;
import weka.associations.FPGrowth;
import weka.core.Instances;
/**
 * Text-mining pipeline over patient records: loads records, keeps sentences
 * that mention a family member, builds a sparse ARFF file of frequent words,
 * runs Weka FP-Growth on it, and post-processes the association rules into
 * word lists ranked by frequency.
 *
 * NOTE(review): the lowercase class name violates Java naming conventions but
 * is kept so existing callers keep compiling.
 */
class tools
{
    // Sentences containing a family member: outputList has stop words removed,
    // outputListWSW keeps them ("WSW" = with stop words).
    List<String> outputList;
    List<String> outputListWSW;
    // Words that occur in more than minsupp medical records.
    ArrayList<String> frequent;
    ArrayList<String> wordList;
    // Bug fix: "\\\\" compiled to a doubled backslash (C:\\Users\\...). It only
    // worked because java.io.File collapses repeated separators on Windows;
    // the single escaped backslash is the intended path.
    final String path = "C:\\Users\\Jonathan\\Desktop\\";
    // Directory containing all patient data files.
    final String recordsDir = path + "Data";
    // Dictionary files: family-member terms, stop words, disease phrases.
    final String familyDict = path + "family.txt";
    final String stopDict = path + "stopwords.txt";
    final String diseaseDict = path + "disease.txt";
    // Output files.
    final String arffout = path + "smalldog.arff";
    final String fpout = path + "fpout.txt";
    final String wordAssoK5 = path + "wordAssociationsK5.txt";
    final String wordListsOutput = path + "wordListsOutput.txt";
    // Prefix for the per-k word-association output file: prefix + k + ".txt".
    String wordAssociations = path + "wordAssociationsK";
    // Characters stripped from sentences before tokenizing.
    final String sentenceStripper = "[\\Q][()\"{},.;:'!?<>%\\E]";
    final String famMember = "sister";
    // A word must occur in more than this many records to count as frequent.
    final int minsupp = 4;
    // k: maximum word span for co-occurrence; n: how many ranked patterns vi() prints.
    int k, n;
    // User-selected filtering mode for vi().
    char c;

    /** Initializes the four instance-level collections. Call before loadProgram(). */
    public void loadLists()
    {
        outputList = new ArrayList<String>();
        outputListWSW = new ArrayList<String>();
        frequent = new ArrayList<String>();
        wordList = new ArrayList<String>();
    }

    /**
     * Loads every file in {@link #recordsDir} and fills {@link #frequent} with
     * the words that occur in more than {@link #minsupp} records.
     *
     * @throws FileNotFoundException if the records directory (or a dictionary)
     *         cannot be read
     */
    public void loadProgram() throws FileNotFoundException
    {
        Bag<String> wordBag = new HashBag<String>();
        File[] fileList = new File(recordsDir).listFiles();
        // Bug fix: listFiles() returns null (not an empty array) when the
        // directory is missing or unreadable, which caused an NPE below.
        if (fileList == null)
            throw new FileNotFoundException("cannot list directory: " + recordsDir);
        for (File record : fileList)
            loadFile(record.toString(), wordBag);
        // Each file contributes each word at most once (loadFile deduplicates
        // per file), so the count is the number of records containing the word.
        for (String word : wordBag.uniqueSet())
        {
            if (wordBag.getCount(word) > minsupp) // was a literal 4 duplicating minsupp
                frequent.add(word);
        }
    }

    /**
     * Writes a sparse ARFF file: one boolean {0,1} attribute per frequent word
     * and one instance per family sentence in {@link #outputList}.
     *
     * @throws IOException if the output file cannot be written
     */
    public void makeArf() throws IOException
    {
        PrintWriter writer = new PrintWriter(new FileWriter(arffout));
        try
        {
            writer.println("@relation MyRelation");
            for (String word : frequent)
                writer.println("@attribute " + word + " {0,1}");
            writer.println("@data");
            for (String tempSentence : outputList)
                sentence(tempSentence, writer);
        }
        finally
        {
            writer.close();
        }
    }

    /**
     * Called by loadProgram for each record file: keeps only sentences that
     * mention a family member, appending them to {@link #outputList} (stop
     * words removed) and {@link #outputListWSW} (stop words kept), and adds
     * each kept word once to {@code wordBag} so the bag counts records, not
     * occurrences.
     *
     * @param file    path of the record file to read
     * @param wordBag receives this file's distinct non-stop words
     * @throws FileNotFoundException if a dictionary file is missing
     */
    public void loadFile(String file, Bag<String> wordBag) throws FileNotFoundException
    {
        Set<String> familySet = dictionaryLoad(familyDict);
        Set<String> stopSet = dictionaryLoad(stopDict);
        // Distinct non-stop words of this file (one contribution per record).
        Set<String> fileWords = new TreeSet<String>();
        // Bug fix: a fixed String[500] buffer overflowed on long sentences;
        // use a growable list instead.
        List<String> buffer = new ArrayList<String>();
        boolean hasFamilyMember = false;
        StringBuilder output = new StringBuilder();
        SentenceSplitter sp = new SentenceSplitter(file);
        for (Sentence s : sp.splitAll())
        {
            // Strip punctuation, lowercase, and split on single spaces.
            for (String word : s.toString().replaceAll(sentenceStripper, "")
                    .toLowerCase().replaceAll("\\s+", " ").split(" "))
            {
                buffer.add(word);
                if (familySet.contains(word))
                    hasFamilyMember = true;
            }
            if (hasFamilyMember)
            {
                // Bug fix: outputWSW was never reset between sentences, so each
                // outputListWSW entry cumulatively repeated all earlier family
                // sentences of the file. Build it per sentence instead.
                StringBuilder outputWSW = new StringBuilder();
                for (String word : buffer)
                {
                    outputWSW.append(word).append(' ');
                    if (!stopSet.contains(word))
                    {
                        fileWords.add(word);
                        output.append(word).append(' ');
                    }
                }
                outputList.add(output.toString());
                outputListWSW.add(outputWSW.toString());
                output.setLength(0);
            }
            hasFamilyMember = false;
            buffer.clear();
        }
        wordBag.addAll(fileWords);
    }

    /**
     * Loads a whitespace-separated dictionary file into a sorted set of words.
     *
     * @param fileLocation path of the dictionary file
     * @return the distinct words of the file
     * @throws FileNotFoundException if the file does not exist
     */
    public Set<String> dictionaryLoad(String fileLocation) throws FileNotFoundException
    {
        Scanner sc = new Scanner(new File(fileLocation));
        Set<String> set = new TreeSet<String>();
        try
        {
            while (sc.hasNextLine())
            {
                for (String member : sc.nextLine().split(" "))
                    set.add(member);
            }
        }
        finally
        {
            sc.close(); // was leaked in the original
        }
        return set;
    }

    /**
     * Writes one sparse ARFF instance line: "{i 1,j 1,...}" marking which
     * frequent-word attributes occur in the sentence.
     */
    private void sentence(String tempSentence, PrintWriter writer)
    {
        TreeSet<String> sentenceSet =
                new TreeSet<String>(Arrays.asList(tempSentence.split(" ")));
        boolean first = true;
        for (int i = 0; i < frequent.size(); i++)
        {
            // remove() doubles as the containment test.
            if (sentenceSet.remove(frequent.get(i)))
            {
                writer.print((first ? "{" : ",") + i + " 1");
                first = false;
            }
        }
        // Bug fix: a sentence with no frequent word used to print "}" with no
        // opening brace, producing a malformed ARFF line.
        if (first)
            writer.print("{");
        writer.println("}");
    }

    /**
     * Runs Weka FP-Growth on the sparse ARFF file and writes each distinct
     * "premise consequence" pattern (cleaned of punctuation and "=1" value
     * markers) to {@link #fpout}.
     *
     * @throws Exception Weka's buildAssociations declares a bare Exception
     */
    public void fpGrowth() throws IOException, Exception
    {
        BufferedReader br = new BufferedReader(new FileReader(arffout));
        Instances inst;
        try
        {
            inst = new Instances(br);
        }
        finally
        {
            br.close();
        }
        FPGrowth fp = new FPGrowth();
        fp.setLowerBoundMinSupport(5.0);
        fp.setNumRulesToFind(100000);
        fp.setUpperBoundMinSupport(1.0);
        fp.setMinMetric(.0);
        fp.buildAssociations(inst);
        List<FPGrowth.AssociationRule> rules = fp.getAssociationRules();
        Set<String> fpgrowth = new TreeSet<String>();
        PrintWriter writer = new PrintWriter(new FileWriter(fpout));
        try
        {
            for (FPGrowth.AssociationRule ar : rules)
            {
                String premise = ar.getPremise().toString()
                        .replaceAll(sentenceStripper, "").replace("=1", "");
                String consequence = ar.getConsequence().toString()
                        .replaceAll(sentenceStripper, "").replace("=1", "");
                String pattern = premise + " " + consequence;
                // add() returns false for duplicates, so each pattern is
                // written exactly once.
                if (fpgrowth.add(pattern))
                    writer.println(pattern);
            }
        }
        finally
        {
            writer.close();
        }
    }

    /** Sets the word-span threshold k used by wordAssociation(). */
    public void setK(int input)
    {
        k = input;
    }

    /**
     * Scans the FP-Growth patterns and keeps those whose words do NOT usually
     * (60% of the time or more) occur within a span of k words of each other,
     * writing the kept patterns to "wordAssociationsK" + k + ".txt".
     *
     * @throws IOException if the output file cannot be written
     */
    public void wordAssociation() throws FileNotFoundException, IOException
    {
        // Bug fix: the original appended to the wordAssociations FIELD, so a
        // second call wrote to "...K5.txt5.txt". Build the file name locally.
        String outName = wordAssociations + k + ".txt";
        PrintWriter writer = new PrintWriter(new FileWriter(outName));
        Scanner sc = new Scanner(new File(fpout));
        int tempDist = 10, numberOfTimesLessK = 0, seen = 0, numberOfTimesTotal = 0;
        Set<String> patternSet = new TreeSet<String>();
        try
        {
            while (sc.hasNextLine())
            {
                String patternString = sc.nextLine();
                for (String word : patternString.split(" "))
                    patternSet.add(word);
                // For each family sentence (with stop words): count the
                // sentences containing the whole pattern (numberOfTimesTotal)
                // and those where consecutive pattern words appear within a
                // span of k (numberOfTimesLessK).
                for (String tempSentence : outputListWSW)
                {
                    TreeSet<String> tempPatternSet = new TreeSet<String>(patternSet);
                    for (String tempWord : tempSentence.split(" "))
                    {
                        if (tempPatternSet.contains(tempWord))
                        {
                            tempPatternSet.remove(tempWord);
                            if (tempDist < k && seen >= 1)
                            {
                                tempDist = 0;
                                seen++;
                                if (seen == patternSet.size())
                                    numberOfTimesLessK++;
                            }
                            else if (seen == 0)
                            {
                                tempDist = 0;
                                seen = 1;
                            }
                        }
                        tempDist++;
                    }
                    if (tempPatternSet.isEmpty())
                        numberOfTimesTotal++;
                    // NOTE(review): the first sentence starts with tempDist=10
                    // but later sentences reset to 15 — looks unintentional;
                    // preserved pending confirmation.
                    tempDist = 15;
                    seen = 0;
                }
                // Bug fix: this was integer division (always 0 or 1) and threw
                // ArithmeticException when the pattern never co-occurred.
                if (numberOfTimesTotal > 0
                        && (double) numberOfTimesLessK / numberOfTimesTotal < .60)
                    writer.println(patternString);
                patternSet.clear();
                numberOfTimesLessK = 0;
                numberOfTimesTotal = 0;
            }
        }
        finally
        {
            sc.close();
            writer.close();
        }
    }

    /**
     * For each pattern in the K=5 association file, records (in
     * {@link #wordList} and {@link #wordListsOutput}) the pattern's words in
     * the order they appear in each sentence that contains all of them.
     *
     * @throws IOException if the output file cannot be written
     */
    void wordLists() throws FileNotFoundException, IOException
    {
        Scanner sc = new Scanner(new File(wordAssoK5));
        Set<String> patternSet = new TreeSet<String>();
        ArrayList<String> orderedWords = new ArrayList<String>();
        try
        {
            while (sc.hasNextLine())
            {
                String patternString = sc.nextLine();
                for (String word : patternString.split(" "))
                    patternSet.add(word);
                for (String tempSentence : outputListWSW)
                {
                    // Collect the pattern's words in sentence order.
                    for (String tempWord : tempSentence.split(" "))
                    {
                        if (patternSet.contains(tempWord) && !orderedWords.contains(tempWord))
                            orderedWords.add(tempWord);
                    }
                    // All pattern words present -> record the ordered form.
                    if (orderedWords.size() == patternSet.size())
                    {
                        StringBuilder output = new StringBuilder();
                        for (String word : orderedWords)
                            output.append(word).append(' ');
                        wordList.add(output.toString());
                    }
                    orderedWords.clear();
                }
                patternSet.clear();
            }
        }
        finally
        {
            sc.close();
        }
        PrintWriter writer = new PrintWriter(new FileWriter(wordListsOutput));
        try
        {
            for (String wordItem : wordList)
                writer.println(wordItem);
        }
        finally
        {
            writer.close();
        }
    }

    /** Sets the number of ranked patterns vi() prints. */
    void setN(int input)
    {
        n = input;
    }

    /** Sets the filtering mode ('a'..'g') used by vi(). */
    void setChar(char option)
    {
        c = option;
    }

    /**
     * Tries to start a disease-phrase match at {@code word}: clears
     * {@code tempDiseaseList} and, if some dictionary entry begins with the
     * word, copies that entry in.
     *
     * @return 2 if a one-word entry matched completely, 1 if a multi-word
     *         match is now in progress, 0 if no entry starts with the word
     */
    private int startDiseaseMatch(String word,
            ArrayList<ArrayList<String>> diseaseList,
            ArrayList<String> tempDiseaseList)
    {
        // Bug fix: the original addAll'd every matching entry without ever
        // clearing, so continuation matching indexed stale, concatenated data.
        tempDiseaseList.clear();
        ArrayList<String> partial = null;
        for (ArrayList<String> diseaseEntry : diseaseList)
        {
            if (word.equals(diseaseEntry.get(0)))
            {
                // A single-word entry is an immediate complete match (the
                // original set diseaseFlag for any matching one-word entry).
                if (diseaseEntry.size() == 1)
                {
                    tempDiseaseList.addAll(diseaseEntry);
                    return 2;
                }
                if (partial == null)
                    partial = diseaseEntry; // first multi-word match wins
            }
        }
        if (partial == null)
            return 0;
        tempDiseaseList.addAll(partial);
        return 1;
    }

    /**
     * Compares the disease dictionary and the family dictionary against the
     * generated word lists, filters the lists according to mode {@link #c},
     * and prints the top {@link #n} surviving patterns by frequency.
     *
     * Modes: 'a' any family word; 'b' disease phrase found; 'c' one family
     * word and no disease; 'e'/'g' one family word and a disease ('g' limits
     * the family set to {@link #famMember}); 'f' neither.
     *
     * @throws FileNotFoundException if a dictionary or the word-list file is missing
     */
    void vi() throws FileNotFoundException
    {
        // Load the disease dictionary: one multi-word phrase per line.
        ArrayList<ArrayList<String>> diseaseList = new ArrayList<ArrayList<String>>();
        Scanner sc = new Scanner(new File(diseaseDict));
        try
        {
            while (sc.hasNextLine())
            {
                ArrayList<String> diseaseEntry = new ArrayList<String>();
                for (String diseaseWord : sc.nextLine().toLowerCase().split(" "))
                    diseaseEntry.add(diseaseWord);
                diseaseList.add(diseaseEntry);
            }
        }
        finally
        {
            sc.close();
        }
        // Load the family dictionary; mode 'g' narrows it to a single member.
        Set<String> familySet = dictionaryLoad(familyDict);
        if (c == 'g')
        {
            familySet.clear();
            familySet.add(famMember);
        }
        // Load the word lists produced by wordLists().
        ArrayList<ArrayList<String>> wordList = new ArrayList<ArrayList<String>>();
        Scanner sca = new Scanner(new File(wordListsOutput));
        try
        {
            while (sca.hasNextLine())
            {
                ArrayList<String> wordListEntry = new ArrayList<String>();
                for (String wordListWord : sca.nextLine().toLowerCase().split(" "))
                    wordListEntry.add(wordListWord);
                wordList.add(wordListEntry);
            }
        }
        finally
        {
            sca.close();
        }
        ArrayList<ArrayList<String>> output = new ArrayList<ArrayList<String>>();
        for (ArrayList<String> wordEntry : wordList)
        {
            int familyFlag = 0, diseaseFlag = 0;
            // Position of the next expected word of the phrase being matched;
            // 0 means no match in progress. Declared per entry (bug fix: the
            // original's tempDiseaseList leaked state across entries).
            int matchPos = 0;
            ArrayList<String> tempDiseaseList = new ArrayList<String>();
            for (String word : wordEntry)
            {
                if (familySet.contains(word))
                {
                    familyFlag++;
                }
                else if (diseaseFlag == 0)
                {
                    if (matchPos != 0 && word.equals(tempDiseaseList.get(matchPos)))
                    {
                        // Word continues the phrase in progress.
                        matchPos++;
                        if (tempDiseaseList.size() == matchPos)
                            diseaseFlag = 1;
                    }
                    else
                    {
                        // No match in progress (or it just broke): try to
                        // start a new phrase at this word.
                        int started = startDiseaseMatch(word, diseaseList, tempDiseaseList);
                        if (started == 2)
                            diseaseFlag = 1;
                        else
                            matchPos = started; // 1 = in progress, 0 = none
                    }
                }
            }
            // Keep or drop the entry according to the user's mode.
            switch (c)
            {
                case 'a':
                    if (familyFlag > 0)
                        output.add(wordEntry);
                    break;
                case 'b':
                    if (diseaseFlag == 1)
                        output.add(wordEntry);
                    break;
                case 'c':
                    if (familyFlag == 1 && diseaseFlag == 0)
                        output.add(wordEntry);
                    break;
                case 'e':
                case 'g': // same predicate; 'g' differs only in familySet above
                    if (familyFlag == 1 && diseaseFlag == 1)
                        output.add(wordEntry);
                    break;
                case 'f':
                    if (familyFlag == 0 && diseaseFlag == 0)
                        output.add(wordEntry);
                    break;
            }
        }
        // Count identical entries with a bag.
        Bag<String> bagOfEntries = new HashBag<String>();
        for (ArrayList<String> temp : output)
        {
            StringBuilder entryBuilder = new StringBuilder();
            for (String tempWord : temp)
                entryBuilder.append(tempWord).append(' ');
            bagOfEntries.add(entryBuilder.toString());
        }
        // Sort pattern -> count descending by count and print the top n.
        Map<String, Integer> frequentWords = new HashMap<String, Integer>();
        for (String pattern : bagOfEntries.uniqueSet())
            frequentWords.put(pattern, bagOfEntries.getCount(pattern));
        Map.Entry<String, Integer>[] entries =
                frequentWords.entrySet().toArray(new Map.Entry[0]);
        Arrays.sort(entries, new Comparator<Map.Entry<String, Integer>>() {
            public int compare(Map.Entry<String, Integer> o1, Map.Entry<String, Integer> o2) {
                return o2.getValue().compareTo(o1.getValue());
            }
        });
        int rank = 1;
        System.out.println("Rank Freq Pattern");
        for (Map.Entry<String, Integer> entry : entries)
        {
            if (rank >= (n + 1))
                break; // only the top n are printed
            System.out.printf("%4d %4d %s\n", rank++, entry.getValue(), entry.getKey());
        }
    }
}