Package

Source Code of tools

import LBJ2.nlp.*;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Scanner;
import java.util.Set;
import java.util.TreeSet;
import org.apache.commons.collections4.Bag;
import org.apache.commons.collections4.bag.HashBag;
import weka.associations.FPGrowth;
import weka.core.Instances;




class tools
{
  //outputList contains all sentences split by List (and WSW withstopwords)
  List<String> outputList;// = new ArrayList<String>();
  List<String> outputListWSW;
  //frequent contains a list of all frequent words
  ArrayList<String> frequent; //= new ArrayList<String>();
  ArrayList<String> wordList;
  final String path = "C:\\\\Users\\\\Jonathan\\\\Desktop\\\\";
  //dir containing all patient Data files
  final String recordsDir = path + "Data";
  //path to files containing family dict and stop dict
  final String familyDict = path + "family.txt";
  final String stopDict = path + "stopwords.txt";
  final String diseaseDict = path + "disease.txt";
  final String arffout = path + "smalldog.arff";
  final String fpout = path + "fpout.txt";
  final String wordAssoK5 = path + "wordAssociationsK5.txt";
  final String wordListsOutput = path + "wordListsOutput.txt";

  //String builder for wordAssociationsFile based on K
  String wordAssociations = path + "wordAssociationsK";

  //String of discarded characters
  final String sentenceStripper = "[\\Q][()\"{},.;:'!?<>%\\E]";
        final String famMember = "sister";
       
  final int minsupp = 4;
  int k, n;
  char c;

  //creates four data structures to be held in instance memory
  public void loadLists()
  {
    outputList = new ArrayList<String>();
    outputListWSW = new ArrayList<String>();
    frequent = new ArrayList<String>();
    wordList = new ArrayList<String>();
  }

  //loads the files and removes words which occur less than 5 tims
  public void loadProgram() throws FileNotFoundException
  {
    Bag wordBag = new HashBag();

    //Load all the medical files in the directory
    File[] fileList = new File(recordsDir).listFiles();
    for (int i = 0; i<fileList.length; i++) {
      loadFile(fileList[i].toString(), wordBag);
    }

    //dump bag into frequent who appear in 5 or more medical records
    for (Object obj : wordBag.uniqueSet()) {
      if (wordBag.getCount(obj) > 4){
        frequent.add((String)obj);
      }
    }

  }

  //manually constructs and arff file
  public void makeArf() throws IOException{

    //this portion writes to the .arff file
    PrintWriter writer;
    writer = new PrintWriter(new FileWriter(arffout));

    //relation header   
    writer.println("@relation MyRelation");

    //loading attributes
    for (String temp : frequent)
    {
      writer.println("@attribute " + temp + " {0,1}");
    }
    //data header
    writer.println("@data");

    for (String tempSentence : outputList)
    {
      sentence(tempSentence, writer);
    }

    writer.close();

  }

  //goes called by loadprogram, this goes through a file and makes sure it has a family member in it and
  //fills output and outputwithstopwords data structures.  it also returns the input bag with word counts
  public void loadFile(String file, Bag wordBag) throws FileNotFoundException
  {
    //load dictionaries
    Set familySet = dictionaryLoad(familyDict);
    Set stopSet = dictionaryLoad(stopDict);
    Set tempBag = new TreeSet<String>();
    //Buffer array to store sentences as we pass them looking for family members
    String[] buffer = new String[500];
    int familyFlag = 0, bufferCount = 0;
    String sentence, output = "", outputWSW = "";
    SentenceSplitter sp = new SentenceSplitter(file);
    for (Sentence s : sp.splitAll()) //iterates over Sentence objects
    {
      sentence = s.toString(); //sentence is now a string

      //breaking up the sentence
      for (String temp : sentence.replaceAll(sentenceStripper, "").toLowerCase().replaceAll("\\s+", " ").split(" "))
      {
        //saving each word in buffer incase the sentence contains family member
        buffer[bufferCount] = temp;
        bufferCount++;
        //if sentence contains family mamber we set flag
        if (familySet.contains(temp))
        {
          familyFlag = 1;
        }
      }
      //after going through the sentence, if flag was raised we go through the saved sentence
      if (familyFlag == 1)
      {
        //go through the buffer and remove stopwords before adding it to the bag of words
        for (int i = 0; i < bufferCount; i++)
        {
          outputWSW += buffer[i] + " ";
          //if the word is not a stop word build buffer string
          if (!stopSet.contains(buffer[i]))
          {
            tempBag.add(buffer[i]);
            output += buffer[i] + " ";
          }
        }
        //add buffered string to output list
        outputList.add(output);
        outputListWSW.add(outputWSW);
        output = "";
      }
      //reset the flags
      familyFlag = 0;
      bufferCount = 0;
    }
    wordBag.addAll(tempBag);
  }
  //loads the dictionaries into a set
  public Set dictionaryLoad(String fileLocation) throws FileNotFoundException
  {
    String dictionary = fileLocation;
    File dict = new File(dictionary);
    Scanner sc = new Scanner(dict);
    List<String> lines = new ArrayList<String>();
    while (sc.hasNextLine()) {
      lines.add(sc.nextLine());
    }
    String[] arr = lines.toArray(new String[0]);
    Set <String> set = new TreeSet<String>();
    for (int i = 0; i < arr.length; ++i)
    {
      for (String member : arr[i].split(" "))
      {
        set.add(member);
      }
    }
    return set;
  }

  //checks to see if the given sentence matches frequent patterns constructing arf output
  private void sentence(String tempSentence, PrintWriter writer) {
    int first = 1;
    TreeSet sentenceSet = new TreeSet<String>();
    for (String tempword : tempSentence.split(" "))
    {
      sentenceSet.add(tempword);
    }
    //go through all the frequent words
    for (int i = 0; i < frequent.size(); i++)
    {
      //if the sentence contains a frequent word, mark the attribute as true
      if (sentenceSet.contains(frequent.get(i)))
      {
        if (first != 1)
          writer.print("," + i + " 1");
        else
        {
          writer.print("{" + i + " 1");
          first = 0;
        }
        sentenceSet.remove(frequent.get(i));
      }
    }
    writer.println("}");
  }

  //conducts fp growth analysis on the sparse arff file
  public void fpGrowth() throws IOException, Exception
  {
    String premise, consequence;

    BufferedReader br = new BufferedReader(new FileReader(arffout));
    Instances inst = new Instances(br);
    br.close();

    FPGrowth fp = new FPGrowth();
    fp.setLowerBoundMinSupport(5.0);
    fp.setNumRulesToFind(100000);
    fp.setUpperBoundMinSupport(1.0);
    fp.setMinMetric(.0);
    fp.buildAssociations(inst);
    List<FPGrowth.AssociationRule> rules = fp.getAssociationRules();

    Set fpgrowth = new TreeSet();
    String pattern;
    PrintWriter writer;
    writer = new PrintWriter(new FileWriter(fpout));
    //go through all rules cleaning string and adding it to fpgrowth file if it hasn't been added
    for (FPGrowth.AssociationRule ar : rules)
    {
      premise = ar.getPremise().toString().replaceAll(sentenceStripper, "").replace("=1", "");

      consequence = ar.getConsequence().toString().replaceAll(sentenceStripper, "").replace("=1", "");
      pattern = premise + " " + consequence;
      if (!fpgrowth.contains(pattern))
      {
        fpgrowth.add(pattern);
        writer.println(pattern);
      }
    }
    writer.close();
  }
  //set k
  public void setK(int input)
  {
    k = input;
  }
  //scan associations and see which occur together with less than k span
  public void wordAssociation() throws FileNotFoundException, IOException
  {
    String filePrefix = k + ".txt";
    wordAssociations += filePrefix;
    PrintWriter writer;
    writer = new PrintWriter(new FileWriter(wordAssociations));
    File fpfile = new File(fpout);
    String patternString;
    Scanner sc = new Scanner(fpfile);
    int tempDist = 10, numberOfTimesLessK = 0, seen = 0, numberOfTimesTotal = 0;
    Set<String> patternSet = new TreeSet();
    //scan fp file
    while (sc.hasNextLine()) {
      patternString = sc.nextLine();
      //take each entry and add all words to a set
      for (String word : patternString.split(" "))
      {
        patternSet.add(word);
      }
      //for the output withstop words check to see if it contains items in the set
      //if it does then you remove the item and start counting to the next item
      //if the item occurs less than k span increase numberOfTimesLessK and seen
      //otherwise (if you see the other entry beyond k) increase only seen
      for (String tempSentence : outputListWSW)
      {
        TreeSet<String> tempPatternSet = new TreeSet<String>(patternSet);
        for (String tempWord : tempSentence.split(" "))
        {
          //if the pattern contains the temp word
          if (tempPatternSet.contains(tempWord))
          {
            //remove the word
            tempPatternSet.remove(tempWord);
            if (tempDist < k && seen >= 1){
              tempDist = 0;
              seen++;
              if (seen == patternSet.size())
                numberOfTimesLessK++;
            }
            else if (seen == 0)
            {
              tempDist = 0;
              seen = 1;
            }
          }
          tempDist++;
        }
        if (tempPatternSet.isEmpty())
          numberOfTimesTotal++;
        tempDist = 15;
        seen = 0;
        tempPatternSet.clear();
      }
      if (numberOfTimesLessK / numberOfTimesTotal < .60)
        writer.println(patternString);
      //reset flags
      patternSet.clear();
      numberOfTimesLessK = 0;
      numberOfTimesTotal = 0;
    }

    writer.close();
  }
  //go through the patterns and construct array based on order
  void wordLists() throws FileNotFoundException, IOException
  {
    File wordA = new File(wordAssoK5);
    Scanner sc = new Scanner(wordA);
    String patternString, output;
    Set patternSet = new TreeSet();

    ArrayList <String>tempPatternSet = new ArrayList<String>();
    while (sc.hasNextLine()) {
      patternString = sc.nextLine();
      for (String word : patternString.split(" "))
      {
        patternSet.add(word);
      }
      for (String tempSentence : outputListWSW)
      {
        for (String tempWord : tempSentence.split(" "))
        {
          //if the pattern contains the temp word
          if (patternSet.contains(tempWord))
          {
            if (!tempPatternSet.contains(tempWord))
              tempPatternSet.add(tempWord);
          }

        }
        if (tempPatternSet.size() == patternSet.size())
        {
          output = "";
          for (String temp : tempPatternSet)
            output += temp + " ";
          wordList.add(output);
        }
        tempPatternSet.clear();
      }
      patternSet.clear();
    }
    //print the wordlist
    PrintWriter writer;
    writer = new PrintWriter(new FileWriter(wordListsOutput));
    for (String wordItem : wordList)
      writer.println(wordItem);
    writer.close();
  }
  //set n
  void setN(int input)
  {
    n = input;
  }
  //set C
  void setChar(char option)
  {
    c = option;
  }
  //compares the disease word lists and the pattern word lists to get frequency
  void vi() throws FileNotFoundException
  {
    //load the disease list
    File diseaseFile = new File(diseaseDict);
    Scanner sc = new Scanner(diseaseFile);
    ArrayList<ArrayList<String>> diseaseList = new ArrayList<ArrayList<String>>();
    while (sc.hasNextLine()) {
      ArrayList<String> diseaseEntry = new ArrayList<String>();
      for (String diseaseWord : sc.nextLine().toLowerCase().split(" "))
      {
        diseaseEntry.add(diseaseWord);
      }
      diseaseList.add(diseaseEntry);
    }

    //load the familty list
    Set <String>familySet = new TreeSet();
    familySet = dictionaryLoad(familyDict);

                if (c == 'g')
                {
                    familySet.clear();
                    familySet.add(famMember);
                }
               
    //load the word list
    File wordListFile = new File(wordListsOutput);
    Scanner sca = new Scanner(wordListFile);
    ArrayList<ArrayList<String>> wordList = new ArrayList<ArrayList<String>>();
    while (sca.hasNextLine()) {
      ArrayList<String> wordListEntry = new ArrayList<String>();
      for (String wordListWord : sca.nextLine().toLowerCase().split(" "))
      {
        wordListEntry.add(wordListWord);
      }
      wordList.add(wordListEntry);
    }

    ArrayList<ArrayList<String>> output = new ArrayList <ArrayList<String>>();
    ArrayList <String> tempDiseaseList = new ArrayList<String>();
    for (ArrayList<String> wordEntry : wordList)
    {
      //FLAGS
      int familyFlag = 0, diseaseFlag = 0, firstDiseaseWordEncountered = 0;
      for (String word : wordEntry)
      {
        //check if word is family word
        if (familySet.contains(word))
        {
          familyFlag++;
        }
        else if (diseaseFlag == 0)
        {
          //if a diseaseword has not yet been encountered
          if (firstDiseaseWordEncountered == 0)
          {
            //check each entries first item and check if it matches the current word
            for (ArrayList diseaseEntry : diseaseList)
            {
              if (word.equals(diseaseEntry.get(0)))
              {
                firstDiseaseWordEncountered = 1;
                tempDiseaseList.addAll(diseaseEntry);
                //if the disease entry is only one word long we have a match
                if (diseaseEntry.size() == 1)
                {
                  diseaseFlag = 1;
                }
              }
            }
          }
          //we've already found first disease word now we check if it matches the next disease word
          else
          {
            //if the word matches the second position
            if (word.equals(tempDiseaseList.get(firstDiseaseWordEncountered)))
            {
              firstDiseaseWordEncountered++;
              if (tempDiseaseList.size() == firstDiseaseWordEncountered)
              {
                diseaseFlag = 1;
              }
            }
            else
            {
              firstDiseaseWordEncountered = 0;
              //check each entries first item and check if it matches the current word
              for (ArrayList diseaseEntry : diseaseList)
              {
                if (word.equals(diseaseEntry.get(0)))
                {
                  firstDiseaseWordEncountered = 1;
                  tempDiseaseList.addAll(diseaseEntry);
                  //if the disease entry is only one word long we have a match
                  if (diseaseEntry.size() == 1)
                  {
                    diseaseFlag = 1;
                  }
                }
              }
            }
          }
        }
      }
      //based on users 'c' we will decide what to add to the output
      switch (c)
      {
      case 'a':
        //check flags
        if (familyFlag > 0)
          output.add(wordEntry);
        break;
      case 'b':
        //check flags
        if (diseaseFlag == 1)
          output.add(wordEntry);
        break;
      case 'c':
        //check flags
        if (familyFlag == 1 && diseaseFlag == 0)
          output.add(wordEntry);
        break;
      case 'e':
        //check flags
        if (familyFlag == 1 && diseaseFlag == 1)
          output.add(wordEntry);
        break;
      case 'f':
        //check flags
        if (familyFlag == 0 && diseaseFlag == 0)
          output.add(wordEntry);
        break;
                        case 'g':
        //check flags
        if (familyFlag == 1 && diseaseFlag == 1)
          output.add(wordEntry);
        break;
      }
    }



    //put entries in bag to get count
    String entryBuilder;
    Bag <String> bagOfEntries = new <String>HashBag();
    for (ArrayList<String> temp : output)
    {
      entryBuilder = "";
      for (String tempWord : temp)
      {
        String scrap = tempWord + " ";
        entryBuilder += scrap;
      }
      bagOfEntries.add(entryBuilder);
    }

    //make a map with values - count and keys - string and sort by frequency
    Map<String, Integer> frequentWords = new <String, Integer>HashMap();
    for (String pattern : bagOfEntries.uniqueSet()) {
      frequentWords.put(pattern, bagOfEntries.getCount(pattern));
    }
    Map.Entry<String, Integer>[] entries = frequentWords.entrySet().toArray(new Map.Entry[0]);
    Arrays.sort(entries, new Comparator<Map.Entry<String, Integer>>() {
        public int compare(Map.Entry<String, Integer> o1, Map.Entry<String, Integer> o2) {
          return o2.getValue().compareTo(o1.getValue());
        }
    });
    int rank = 1;
    System.out.println("Rank Freq Pattern");
    for (Map.Entry<String, Integer> entry : entries){
      int count = entry.getValue();
      if (rank < (n + 1)){
        System.out.printf("%4d %4d %s\n", rank++, count, entry.getKey());
      }
    }
  }

}
TOP

Related Classes of tools

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.