// Source code of owlOntologies.CreateOntologyFromThesaurus

package owlOntologies;




import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;


import org.semanticweb.owlapi.apibinding.OWLManager;
import org.semanticweb.owlapi.io.RDFXMLOntologyFormat;
import org.semanticweb.owlapi.model.IRI;
import org.semanticweb.owlapi.model.OWLClass;
import org.semanticweb.owlapi.model.OWLDataFactory;
import org.semanticweb.owlapi.model.OWLOntology;
import org.semanticweb.owlapi.model.OWLOntologyCreationException;
import org.semanticweb.owlapi.model.OWLOntologyManager;
import org.semanticweb.owlapi.model.OWLOntologyStorageException;
import org.semanticweb.owlapi.util.OWLOntologyMerger;




public class CreateOntologyFromThesaurus {
  
  
  /*
   * This class is designed to read in a normalized corpus, and then using an thesaurus, generate an Owl ontology that represents that corpus.  it does this by
   * saying that every synonym word shares the equivilance class "is a" 
   * we can then add subclass by saying that if the synonym relationship is not symmetrical (e.g, cow is a synonym of steer, but steer is not a synonym of cow, we
   * say that all steers are cows, but not all cows are steers...therefore, steer is a subclass of cow).  it should be noted that this type of inference is far from
   * perfect and in fact will add many non-meaningful relationships.  however, as our coverage goal will analyze associations from existing ontologies and compare
   * those to the corpus, it shouldnt inadvertently impact our results
   */
  
  //this method generates a set with words from the corpus.  it then returns a set containing each unique words from the corpus
  //@param filePath:  this is the absolute path to the file you want to use as your corpus
  public HashSet<String> fetchCorpus(String filePath) throws IOException{
    HashSet<String> wordSet = new HashSet<String>();  //we use a set here because it doesnt matter how many times a word happens, only that it appears in the corpus
    //if you want to make a coprus that changes based upon word frequency or n-grams, this has to change
    
    BufferedReader in = new BufferedReader(new FileReader(new File(filePath)));
    String[] wordsFromLine;
    
    Pattern p = Pattern.compile("[^a-z0-9]", Pattern.CASE_INSENSITIVE);    
    String current = in.readLine();
    while(current != null){
//      wordsFromLine = current.split("\\s+");
      wordsFromLine = p.split(current); //this is better because if there are weird characters, we split, they shouldnt be there
      for(String word : wordsFromLine){


        wordSet.add(word.trim());
      }
      current = in.readLine();
    }
    return wordSet;
  }
  
  
  //@param filePath: this is the path to the thesaurus you want to you
  public ThesaurusManager generateThesaurusManager(String filePath) throws IOException{
    return new ThesaurusManager(filePath);
  }
  
  
  
  public MyOwlOntologyManager generateOwlOntologyManager(){
    return new MyOwlOntologyManager();
  }
  
  
  //@param args:  args[0] needs to be the absolute path to the corpus, and 
  //args[1] needs to be the absolute path to where you want it stored
  //args[2] needs to be "true" or "false" on whether you want equivalent axioms or not
  public static void main(String[] args) throws IOException, OWLOntologyCreationException, OWLOntologyStorageException{
//    java.util.Date timeStamp = new java.util.Date();
//    System.out.println(timeStamp.toString());
    String corpusPath = args[0];
    String storagePath = args[1];
    boolean equivalent = Boolean.valueOf(args[2]);
    String thesaurusPath = args[3];
    
    CreateOntologyFromThesaurus cot = new CreateOntologyFromThesaurus();
    cot.buildOntologyFromScratch(corpusPath, storagePath, equivalent, thesaurusPath);
  
  }
  
  
  /*
   * @param firstOntPath: the absolute path to the first ontology file
   * @param secondOntPath: the absolute path to the second ontology file
   * @param outputPath: the path and name of the second ontology file
   * this takes in existing ontology files and merges their elements.  It then stores the newly generated ontology.
   */
  public void mergeOntology(String firstOntPath, String secondOntPath, String outputPath) throws OWLOntologyStorageException, OWLOntologyCreationException{
    // Just load two arbitrary ontologies for the purposes of this example
        OWLOntologyManager man = OWLManager.createOWLOntologyManager();
        man.loadOntologyFromOntologyDocument(new File(firstOntPath));
        man.loadOntologyFromOntologyDocument(new File(secondOntPath));
        
        // Create our ontology merger
        OWLOntologyMerger merger = new OWLOntologyMerger(man);
        // We merge all of the loaded ontologies. Since an OWLOntologyManager is an OWLOntologySetProvider we just pass this in. 
        //We also need to specify the URI of the new ontology that will be created.
        IRI mergedOntologyIRI = IRI.create(outputPath);
        OWLOntology merged = merger.createMergedOntology(man, mergedOntologyIRI);
        // Print out the axioms in the merged ontology.


        // Save to RDF/XML
        man.saveOntology(merged, new RDFXMLOntologyFormat(), IRI.create("file:" + outputPath));
  }
  
  
  /*
   * given an existing ontology, add a class to it.  this not only adds the class itself, but adds any synonyms, all equivalent classes
   * and all subclasses.  
   * @param ontPath: the absolute file path to the existing ontology
   * @param className the name of the class you want to add (e.g., car, or wonderstruck)
   * @param thesaurusPath: the absolute file path to the thesaurus file
   */
  public void addToOntology(String ontPath, String className, String thesaurusPath) throws OWLOntologyCreationException, IOException, OWLOntologyStorageException{
    ThesaurusManager tm = generateThesaurusManager(thesaurusPath);  //get the thesaurus
    MyOwlOntologyManager owl = generateOwlOntologyManager();  //create an ontology manager
    OWLOntologyManager manager = owl.loadOntologyFromFile(ontPath);
    
    owl.shouldAddClass(manager, className);
    HashSet<String> classes = new HashSet<String>();  //put out word in a hashSet to use existing correct functionality  
    classes.add(className);  
    
    addEquivilentClasses(classes, owl, manager, tm);
    
    //add subclasses
    addSubClasses(classes, owl, manager, tm);


    owl.shouldSaveOntologies(manager, ontPath);  //re-save now that we have new data
    System.out.println("i took the existing ontology and added your specific class and all its implications.  im DONE!");
    
  }
  
  
  
  
  /*
   * given a path to the corpus, build an ontology from it and store it at the requested location.  this adds each word in the corpus
   * document, all the synonyms, all the equivalence between them, and then checks for subclasses and adds those too.  its takes quite
   * some time to finish
   * @param corpusPath: the absolute path to the corpus
   * @param storagePath: the absolute path to the location where you want to save the owl ontology
   * @param equivalent: whether or not you want to add equivalent axioms
   * @param thesaurusPath: the absolute path to the thesaurus file we are using.
   */
  public void buildOntologyFromScratch(String corpusPath, String storagePath, boolean equivalent, String thesaurusPath) throws IOException, OWLOntologyCreationException, OWLOntologyStorageException{
    


    HashSet<String> wordSet = fetchCorpus(corpusPath);   //get the word set from the corpus
    ThesaurusManager tm = generateThesaurusManager(thesaurusPath);  //get the thesaurus
    MyOwlOntologyManager owl = generateOwlOntologyManager();  //create an ontology manager
    
    OWLOntologyManager manager = owl.shouldCreateOntology("corpusOntology");//initialize empty ontology 
//    OWLOntology ontology = manager.getOntology(owl.getCurrentOntologyID());
    
    //for each word in the word set, make it a class in the ontology
    addClasses(wordSet, owl, manager);
    


    if(equivalent)
      addEquivilentClasses(wordSet, owl, manager, tm);//one by one add "equal to" references from thesaurus
    else
      addSynonymClasses(wordSet, owl, manager, tm);//just add the synonyms as classes, dont worry about the equal to references
    
    //one by one add "subclass" references from thesaurus
    addSubClasses(wordSet, owl, manager, tm);
    
    //save ontology
    owl.shouldSaveOntologies(manager, storagePath);  //re-save now that we have new data
    System.out.println("i have gathered the corpus, the synonyms, and made an ontology of it.  im DONE!");
  }
  
  
  
  //given a set of words, add them all individually as classes into the ontology
  public void addClasses(HashSet<String> wordSet, MyOwlOntologyManager owl, OWLOntologyManager manager){
    for(String currentWord : wordSet){
      owl.shouldAddClass(manager, currentWord);
    }
  }
  
  


  
  
  /*
   * given a set of words, go through and see if there is symmetry within the synonyms.  for example, assume you have two words, A, and B
   * if A is a synonym of B, but B is not a synonym of A, then we say that A is a subclass of B
   * note that using this will add a lot of unreasonable classes, but shouldnt hurt the overall scores and should help as the ontology we
   * test shouldnt have these random associations
   *   @param wordSet the words that already exist in the ontology
   *  @param owl access to myManager class
     *  @param manager access to the manager of the ontologies as decided by the API
   *  @param tm access to the thesaurus manager
   */
  public void addSubClasses(HashSet<String> wordSet, MyOwlOntologyManager owl, OWLOntologyManager manager, ThesaurusManager tm) throws IOException, OWLOntologyCreationException, OWLOntologyStorageException{
    int counter = 0;
    HashSet<String> synonyms;
    
    for(String currentWord : wordSet){
      counter ++;
      if (counter % 100 == 0)
        System.out.println("we have finished " + Integer.toString(counter) + " words for subclasses, out of " + Integer.toString(wordSet.size()));
      
      synonyms = tm.getSynonyms(currentWord);
      
      for(String currentSynonym: synonyms){
        if (tm.getSynonyms(currentSynonym).contains(currentWord)) //both words are synonyms of each other, move on
          continue;
        else{ //the currentWord is a subclass of the currentSynonym
          OWLClass cls1 = owl.getClassFromName(manager, currentWord); 
          OWLClass cls2 = owl.getClassFromName(manager, currentSynonym);
          
          if(cls1 == null || cls2 == null) //if one of the super classes/sub classes had a weird name that breaks OWL, just move on
            continue;
          
          owl.shouldAddSubclassAxiom(manager, cls1, cls2);
        }
      }//end of currentSynonym for loop
    }//end of currentWord for loop
  }
  
  
  
  /*
   * given a set of words, add all the synonyms as classes, but dont do any equivilance associations
   * note that using this will add a lot of unreasonable classes, but shouldnt hurt the overall scores and should help as the ontology we
   * test shouldnt have these random associations
   *   @param wordSet the words that already exist in the ontology
   *  @param owl access to myManager class
     *  @param manager access to the manager of the ontologies as decided by the API
   *  @param tm access to the thesaurus manager
   */
  public void addSynonymClasses(HashSet<String> wordSet, MyOwlOntologyManager owl, OWLOntologyManager manager, ThesaurusManager tm) throws IOException{
    int counter = 0;
    
    for(String currentWord : wordSet){
      counter ++;
      
      if (counter % 100 == 0)
        System.out.println("we have finished " + Integer.toString(counter) + " words with their synonyms, out of " + Integer.toString(wordSet.size()));
      
      HashSet<String> synonyms = tm.getSynonyms(currentWord);
      HashSet<String> filteredSyn = new HashSet<String>();
      
      if (synonyms.isEmpty()) //this word had no synonyms, move on
        continue;
      
      Pattern p = Pattern.compile("[^a-z]",Pattern.CASE_INSENSITIVE);
      Matcher m;
      for(String s: synonyms){
        m = p.matcher(s);
        if(! m.find())
          filteredSyn.add(s);
      }
      
      addClasses(filteredSyn, owl, manager); //add all the synonyms into the ontology
    }
  }
  
  
  //given a set of words, and a thesaurus, add each class from teh thesaurus into the ontology, and then connect all words as eqivilent
  //@param wordSet the words that already exist in the ontology
  //@param owl access to myManager class
  //@param manager access to the manager of the ontologies as decided by the API
  //@param tm access to the thesaurus manager
  public void addEquivilentClasses(HashSet<String> wordSet, MyOwlOntologyManager owl, OWLOntologyManager manager, ThesaurusManager tm) throws IOException{
    int counter = 0;
    
    for(String currentWord : wordSet){
      counter ++;
      
      if (counter % 100 == 0)
        System.out.println("we have finished " + Integer.toString(counter) + " words with their synonyms, out of " + Integer.toString(wordSet.size()));
      
      HashSet<String> synonyms = tm.getSynonyms(currentWord);
      
      if (synonyms.isEmpty()) //this word had no synonyms, move on
        continue;
      
      addClasses(synonyms, owl, manager); //add all the synonyms into the ontology    
      ArrayList<String> equivalents = new ArrayList<String>();
      equivalents.addAll(synonyms);
      equivalents.add(currentWord);
      
      for(int i =0; i < equivalents.size(); i++){//every word in this set is "equivalent" to each other word...but dont bother doing it to itself.
        for(int j = 0; j < equivalents.size(); j++){
          if (i != j){
            
            //these could be passed into the method, but for readability, i make them separate calls
            OWLClass cls1 = owl.getClassFromName(manager, equivalents.get(i)); 
            OWLClass cls2 = owl.getClassFromName(manager, equivalents.get(j));
            if ( cls1 == null || cls2 == null) //one of the classes has weird characters that cannot be represented, move on
              continue;
            
            owl.shouldAddEquivalentClassAxiom(manager, cls1, cls2); 
          }
        }
      }//end of for loop doing equivalents
    }//end of for loop dealing with words from teh word set
  }//end of method
  
  
}
// Source code of owlOntologies.CreateOntologyFromThesaurus

// Related classes of owlOntologies.CreateOntologyFromThesaurus