// Source Code of simplenlg.lexicon.NIHDBLexicon

/*
 * The contents of this file are subject to the Mozilla Public License
 * Version 1.1 (the "License"); you may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS"
 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * The Original Code is "Simplenlg".
 *
 * The Initial Developer of the Original Code is Ehud Reiter, Albert Gatt and Dave Westwater.
 * Portions created by Ehud Reiter, Albert Gatt and Dave Westwater are Copyright (C) 2010-11 The University of Aberdeen. All Rights Reserved.
 *
 * Contributor(s): Ehud Reiter, Albert Gatt, Dave Westwater, Roman Kutlak, Margaret Mitchell.
 */
package simplenlg.lexicon;


import gov.nih.nlm.nls.lexAccess.Api.LexAccessApi;
import gov.nih.nlm.nls.lexAccess.Api.LexAccessApiResult;
import gov.nih.nlm.nls.lexCheck.Lib.AdjEntry;
import gov.nih.nlm.nls.lexCheck.Lib.AdvEntry;
import gov.nih.nlm.nls.lexCheck.Lib.InflVar;
import gov.nih.nlm.nls.lexCheck.Lib.LexRecord;
import gov.nih.nlm.nls.lexCheck.Lib.NounEntry;
import gov.nih.nlm.nls.lexCheck.Lib.VerbEntry;


import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;
import java.util.Vector;


import simplenlg.features.Inflection;
import simplenlg.features.LexicalFeature;
import simplenlg.framework.LexicalCategory;
import simplenlg.framework.NLGElement;
import simplenlg.framework.WordElement;


/**
 * This class gets Words from the NIH Specialist Lexicon
 * 
 * @author ereiter
 * 
 */
public class NIHDBLexicon extends Lexicon {


  // default DB parameters
  private static String DB_HSQL_DRIVER = "org.hsqldb.jdbcDriver"; // DB driver
  private static String DB_HQSL_JDBC = "jdbc:hsqldb:"; // JDBC specifier for
  // HSQL
  private static String DB_DEFAULT_USERNAME = "sa"; // DB username
  private static String DB_DEFAULT_PASSWORD = ""; // DB password
  private static String DB_HSQL_EXTENSION = ".data"; // filename extension for
  // HSQL DB


  // class variables
  private Connection conn = null; // DB connection
  private LexAccessApi lexdb = null; // Lexicon access object


  // if false, don't keep standard inflections in the Word object
  private boolean keepStandardInflections = false;


  /****************************************************************************/
  // constructors
  /****************************************************************************/


  /**
   * set up lexicon using file which contains downloaded lexAccess HSQL DB and
   * default passwords
   * 
   * @param filename
   *            of HSQL DB
   */
  public NIHDBLexicon(String filename) {
    super();
    // get rid of .data at end of filename if necessary
    String dbfilename = filename;
    if (dbfilename.endsWith(DB_HSQL_EXTENSION))
      dbfilename = dbfilename.substring(0, dbfilename.length()
          - DB_HSQL_EXTENSION.length());


    // try to open DB and set up lexicon
    try {
      Class.forName(DB_HSQL_DRIVER);
      conn = DriverManager.getConnection(DB_HQSL_JDBC + dbfilename,
          DB_DEFAULT_USERNAME, DB_DEFAULT_PASSWORD);
      // now set up lexical access object
      lexdb = new LexAccessApi(conn);


    } catch (Exception ex) {
      System.out.println("Cannot open lexical db: " + ex.toString());
      // probably should thrown an exception
    }
  }


  /**
   * set up lexicon using general DB parameters; DB must be NIH specialist
   * lexicon from lexAccess
   * 
   * @param driver
   * @param url
   * @param username
   * @param password
   */
  public NIHDBLexicon(String driver, String url, String username,
      String password) {


    super();


    // try to open DB and set up lexicon
    try {
      Class.forName(driver);
      conn = DriverManager.getConnection(url, username, password);
      // now set up lexical access object
      lexdb = new LexAccessApi(conn);
    } catch (Exception ex) {
      System.out.println("Cannot open lexical db: " + ex.toString());
      // probably should thrown an exception
    }
  }


  // need more constructors for general case...


  /***************** methods to set global parameters ****************************/


  /**
   * reports whether Words include standard (derivable) inflections
   * 
   * @return true if standard inflections are kept
   */
  public boolean isKeepStandardInflections() {
    return keepStandardInflections;
  }


  /**
   * set whether Words should include standard (derivable) inflections
   * 
   * @param keepStandardInflections
   *            - if true, standard inflections are kept
   */
  public void setKeepStandardInflections(boolean keepStandardInflections) {
    this.keepStandardInflections = keepStandardInflections;
  }


  /****************************************************************************/
  // core methods to retrieve words from DB
  /****************************************************************************/


  /*
   * (non-Javadoc)
   * 
   * @see simplenlg.lexicon.Lexicon#getWords(java.lang.String,
   * simplenlg.features.LexicalCategory)
   */
  @Override
  public synchronized List<WordElement> getWords(String baseForm, LexicalCategory category) {
    // get words from DB
    try {
      LexAccessApiResult lexResult = lexdb.GetLexRecordsByBase(baseForm,
          LexAccessApi.B_EXACT);      
      return getWordsFromLexResult(category, lexResult);
    } catch (SQLException ex) {
      System.out.println("Lexical DB error: " + ex.toString());
      // probably should thrown an exception
    }
    return null;
  }


  /*
   * (non-Javadoc)
   * 
   * @see simplenlg.lexicon.Lexicon#getWordsByID(java.lang.String)
   */
  @Override
  public synchronized List<WordElement> getWordsByID(String id) {
    // get words from DB
    try {
      LexAccessApiResult lexResult = lexdb.GetLexRecords(id);
      return getWordsFromLexResult(LexicalCategory.ANY, lexResult);
    } catch (SQLException ex) {
      System.out.println("Lexical DB error: " + ex.toString());
      // probably should thrown an exception
    }
    return null;
  }


  /*
   * (non-Javadoc)
   * 
   * @see simplenlg.lexicon.Lexicon#getWordsFromVariant(java.lang.String,
   * simplenlg.features.LexicalCategory)
   */
  @Override
  public synchronized List<WordElement> getWordsFromVariant(String variant,
      LexicalCategory category) {
    // get words from DB
    try {
      LexAccessApiResult lexResult = lexdb.GetLexRecords(variant);
      return getWordsFromLexResult(category, lexResult);
    } catch (SQLException ex) {
      System.out.println("Lexical DB error: " + ex.toString());
      // probably should thrown an exception
    }
    return null;
  }


  /****************************************************************************/
  // other methods
  /****************************************************************************/


  /*
   * (non-Javadoc)
   * 
   * @see simplenlg.lexicon.Lexicon#close()
   */
  @Override
  public void close() {
    if (lexdb != null)
      lexdb.CleanUp();
  }


  /**
   * make a WordElement from a lexical record. Currently just specifies basic
   * params and inflections Should do more in the future!
   * 
   * @param record
   * @return
   */
  private WordElement makeWord(LexRecord record) {
    // get basic data
    String baseForm = record.GetBase();    
    LexicalCategory category = getSimplenlgCategory(record);
    String id = record.GetEui();


    // create word class
    WordElement wordElement = new WordElement(baseForm, category, id);


    // now add type information
    switch (category) {
    case ADJECTIVE:
      addAdjectiveInfo(wordElement, record.GetCatEntry().GetAdjEntry());
      break;
    case ADVERB:
      addAdverbInfo(wordElement, record.GetCatEntry().GetAdvEntry());
      break;
    case NOUN:
      addNounInfo(wordElement, record.GetCatEntry().GetNounEntry());
      break;
    case VERB:
      addVerbInfo(wordElement, record.GetCatEntry().GetVerbEntry());
      break;
    // ignore closed class words
    }


    Inflection defaultInfl = (Inflection) wordElement
        .getDefaultInflectionalVariant();


    // now add inflected forms
    // if (keepStandardInflections || !standardInflections(record,
    // category)) {
    for (InflVar inflection : record.GetInflVarsAndAgreements()
        .GetInflValues()) {
      String simplenlgInflection = getSimplenlgInflection(inflection
          .GetInflection());


      if (simplenlgInflection != null) {
        String inflectedForm = inflection.GetVar();
        Inflection inflType = Inflection.getInflCode(inflection
            .GetType());


        // store all inflectional variants, except for regular ones
        // unless explicitly set
        if (inflType != null
            && !(Inflection.REGULAR.equals(inflType) && !this.keepStandardInflections)) {
          wordElement.addInflectionalVariant(inflType,
              simplenlgInflection, inflectedForm);
        }


        // if the infl variant is the default, also set this feature on
        // the word
        if (defaultInfl == null
            || (defaultInfl.equals(inflType) && !(Inflection.REGULAR
                .equals(inflType) && !this.keepStandardInflections))) {
          wordElement.setFeature(simplenlgInflection, inflectedForm);
        }


        // wordElement
        // .setFeature(simplenlgInflection, inflection.GetVar());
      }
    }
    // }


    // add acronym info
    addAcronymInfo(wordElement, record);


    // now add spelling variants
    addSpellingVariants(wordElement, record);


    return wordElement;
  }


  /**
   * return list of WordElement from LexAccessApiResult
   * 
   * @param category
   *            - desired category (eg, NOUN) (this filters list)
   * @param lexResult
   *            - the LexAccessApiResult
   * @return list of WordElement
   */
  private List<WordElement> getWordsFromLexResult(LexicalCategory category,
      LexAccessApiResult lexResult) {
    List<LexRecord> records = lexResult.GetJavaObjs();


    // set up array of words to return
    List<WordElement> wordElements = new ArrayList<WordElement>();


    // iterate through result records, adding to words as appropriate
    for (LexRecord record : records) {


      if (category == LexicalCategory.ANY
          || category == getSimplenlgCategory(record))
        wordElements.add(makeWord(record));
    }
    return wordElements;
  }


  /**
   * check if this record has a standard (regular) inflection
   * 
   * @param record
   * @param simplenlg
   *            syntactic category
   * @return true if standard (regular) inflection
   */
  @SuppressWarnings("unused")
  private boolean standardInflections(LexRecord record,
      LexicalCategory category) {
    List<String> variants = null;
    switch (category) {
    case NOUN:
      variants = record.GetCatEntry().GetNounEntry().GetVariants();
      break;
    case ADJECTIVE:
      variants = record.GetCatEntry().GetAdjEntry().GetVariants();
      break;
    case ADVERB:
      variants = record.GetCatEntry().GetAdvEntry().GetVariants();
      break;
    case MODAL:
      variants = record.GetCatEntry().GetModalEntry().GetVariant();
      break;
    case VERB:
      if (record.GetCatEntry().GetVerbEntry() != null) // aux verbs (eg
        // be) won't
        // have verb
        // entries
        variants = record.GetCatEntry().GetVerbEntry().GetVariants();
      break;
    }


    return notEmpty(variants) && variants.contains("reg");
  }


  /***********************************************************************************/
  // The following methods map codes in the NIH Specialist Lexicon
  // into the codes used in simplenlg
  /***********************************************************************************/


  /**
   * get the simplenlg LexicalCategory of a record
   * 
   * @param cat
   * @return
   */
  private LexicalCategory getSimplenlgCategory(LexRecord record) {
    String cat = record.GetCategory();
    if (cat == null)
      return LexicalCategory.ANY;
    else if (cat.equalsIgnoreCase("noun"))
      return LexicalCategory.NOUN;
    else if (cat.equalsIgnoreCase("verb"))
      return LexicalCategory.VERB;
    else if (cat.equalsIgnoreCase("aux")
        && record.GetBase().equalsIgnoreCase("be")) // return aux "be"
      // as a VERB
      // not needed for other aux "have" and "do", they have a verb entry
      return LexicalCategory.VERB;
    else if (cat.equalsIgnoreCase("adj"))
      return LexicalCategory.ADJECTIVE;
    else if (cat.equalsIgnoreCase("adv"))
      return LexicalCategory.ADVERB;
    else if (cat.equalsIgnoreCase("pron"))
      return LexicalCategory.PRONOUN;
    else if (cat.equalsIgnoreCase("det"))
      return LexicalCategory.DETERMINER;
    else if (cat.equalsIgnoreCase("prep"))
      return LexicalCategory.PREPOSITION;
    else if (cat.equalsIgnoreCase("conj"))
      return LexicalCategory.CONJUNCTION;
    else if (cat.equalsIgnoreCase("compl"))
      return LexicalCategory.COMPLEMENTISER;
    else if (cat.equalsIgnoreCase("modal"))
      return LexicalCategory.MODAL;


    // return ANY for other cats
    else
      return LexicalCategory.ANY;
  }


  /**
   * convert an inflection type in NIH lexicon into one used by simplenlg
   * return null if no simplenlg equivalent to NIH inflection type
   * 
   * @param NIHInflection
   *            - inflection type in NIH lexicon
   * @return inflection type in simplenlg
   */
  private String getSimplenlgInflection(String NIHInflection) {
    if (NIHInflection == null)
      return null;
    else if (NIHInflection.equalsIgnoreCase("comparative"))
      return LexicalFeature.COMPARATIVE;
    else if (NIHInflection.equalsIgnoreCase("superlative"))
      return LexicalFeature.SUPERLATIVE;
    else if (NIHInflection.equalsIgnoreCase("plural"))
      return LexicalFeature.PLURAL;
    else if (NIHInflection.equalsIgnoreCase("pres3s"))
      return LexicalFeature.PRESENT3S;
    else if (NIHInflection.equalsIgnoreCase("past"))
      return LexicalFeature.PAST;
    else if (NIHInflection.equalsIgnoreCase("pastPart"))
      return LexicalFeature.PAST_PARTICIPLE;
    else if (NIHInflection.equalsIgnoreCase("presPart"))
      return LexicalFeature.PRESENT_PARTICIPLE;
    else
      // no equvalent in simplenlg, eg clitic or negative
      return null;
  }


  /**
   * extract adj information from NIH AdjEntry record, and add to a simplenlg
   * WordElement For now just extract position info
   * 
   * @param wordElement
   * @param AdjEntry
   */
  private void addAdjectiveInfo(WordElement wordElement, AdjEntry adjEntry) {
    boolean qualitativeAdj = false;
    boolean colourAdj = false;
    boolean classifyingAdj = false;
    boolean predicativeAdj = false;
    List<String> positions = adjEntry.GetPosition();
    for (String position : positions) {
      if (position.startsWith("attrib(1)"))
        qualitativeAdj = true;
      else if (position.startsWith("attrib(2)"))
        colourAdj = true;
      else if (position.startsWith("attrib(3)"))
        classifyingAdj = true;
      else if (position.startsWith("pred"))
        predicativeAdj = true;
      // ignore other positions
    }
    // ignore (for now) other info in record
    wordElement.setFeature(LexicalFeature.QUALITATIVE, qualitativeAdj);
    wordElement.setFeature(LexicalFeature.COLOUR, colourAdj);
    wordElement.setFeature(LexicalFeature.CLASSIFYING, classifyingAdj);
    wordElement.setFeature(LexicalFeature.PREDICATIVE, predicativeAdj);
    return;
  }


  /**
   * extract adv information from NIH AdvEntry record, and add to a simplenlg
   * WordElement For now just extract modifier type
   * 
   * @param wordElement
   * @param AdvEntry
   */
  private void addAdverbInfo(WordElement wordElement, AdvEntry advEntry) {
    boolean verbModifier = false;
    boolean sentenceModifier = false;
    boolean intensifier = false;


    List<String> modifications = advEntry.GetModification();
    for (String modification : modifications) {
      if (modification.startsWith("verb_modifier"))
        verbModifier = true;
      else if (modification.startsWith("sentence_modifier"))
        sentenceModifier = true;
      else if (modification.startsWith("intensifier"))
        intensifier = true;
      // ignore other modification types
    }
    // ignore (for now) other info in record
    wordElement.setFeature(LexicalFeature.VERB_MODIFIER, verbModifier);
    wordElement.setFeature(LexicalFeature.SENTENCE_MODIFIER,
        sentenceModifier);
    wordElement.setFeature(LexicalFeature.INTENSIFIER, intensifier);
    return;
  }


  /**
   * extract noun information from NIH NounEntry record, and add to a
   * simplenlg WordElement For now just extract whether count/non-count and
   * whether proper or not
   * 
   * @param wordElement
   * @param nounEntry
   */
  private void addNounInfo(WordElement wordElement, NounEntry nounEntry) {
    boolean proper = nounEntry.IsProper();
    // boolean nonCountVariant = false;
    // boolean regVariant = false;


    // add the inflectional variants
    List<String> variants = nounEntry.GetVariants();


    if (!variants.isEmpty()) {
      List<Inflection> wordVariants = new ArrayList<Inflection>();


      for (String v : variants) {
        int index = v.indexOf("|");
        String code;


        if (index > -1) {
          code = v.substring(0, index).toLowerCase().trim();


        } else {
          code = v.toLowerCase().trim();
        }


        Inflection infl = Inflection.getInflCode(code);


        if (infl != null) {
          wordVariants.add(infl);
          wordElement.addInflectionalVariant(infl);
        }
      }


      // if the variants include "reg", this is the default, otherwise
      // just a random pick
      Inflection defaultVariant = wordVariants
          .contains(Inflection.REGULAR)
          || wordVariants.isEmpty() ? Inflection.REGULAR
          : wordVariants.get(0);      
      wordElement.setFeature(LexicalFeature.DEFAULT_INFL, defaultVariant);
      wordElement.setDefaultInflectionalVariant(defaultVariant);
    }


    // for (String variant : variants) {
    // if (variant.startsWith("uncount")
    // || variant.startsWith("groupuncount"))
    // nonCountVariant = true;
    //
    // if (variant.startsWith("reg"))
    // regVariant = true;
    // // ignore other variant info
    // }


    // lots of words have both "reg" and "unCount", indicating they
    // can be used in either way. Regard such words as normal,
    // only flag as nonCount if unambiguous
    // wordElement.setFeature(LexicalFeature.NON_COUNT, nonCountVariant
    // && !regVariant);
    wordElement.setFeature(LexicalFeature.PROPER, proper);
    // ignore (for now) other info in record


    return;
  }


  /**
   * extract verb information from NIH VerbEntry record, and add to a
   * simplenlg WordElement For now just extract transitive, instransitive,
   * and/or ditransitive
   * 
   * @param wordElement
   * @param verbEntry
   */
  private void addVerbInfo(WordElement wordElement, VerbEntry verbEntry) {
    if (verbEntry == null) { // should only happen for aux verbs, which have
      // auxEntry instead of verbEntry in NIH Lex
      // just flag as transitive and return
      wordElement.setFeature(LexicalFeature.INTRANSITIVE, false);
      wordElement.setFeature(LexicalFeature.TRANSITIVE, true);
      wordElement.setFeature(LexicalFeature.DITRANSITIVE, false);
      return;
    }


    boolean intransitiveVerb = notEmpty(verbEntry.GetIntran());
    boolean transitiveVerb = notEmpty(verbEntry.GetTran())
        || notEmpty(verbEntry.GetCplxtran());
    boolean ditransitiveVerb = notEmpty(verbEntry.GetDitran());


    wordElement.setFeature(LexicalFeature.INTRANSITIVE, intransitiveVerb);
    wordElement.setFeature(LexicalFeature.TRANSITIVE, transitiveVerb);
    wordElement.setFeature(LexicalFeature.DITRANSITIVE, ditransitiveVerb);


    // add the inflectional variants
    List<String> variants = verbEntry.GetVariants();


    if (!variants.isEmpty()) {
      List<Inflection> wordVariants = new ArrayList<Inflection>();


      for (String v : variants) {
        int index = v.indexOf("|");
        String code;
        Inflection infl;


        if (index > -1) {
          code = v.substring(0, index).toLowerCase().trim();
          infl = Inflection.getInflCode(code);          


        } else {
          infl = Inflection.getInflCode(v.toLowerCase().trim());
        }


        if (infl != null) {
          wordElement.addInflectionalVariant(infl);
          wordVariants.add(infl);
        }
      }


      // if the variants include "reg", this is the default, otherwise
      // just a random pick
      Inflection defaultVariant = wordVariants
          .contains(Inflection.REGULAR)
          || wordVariants.isEmpty() ? Inflection.REGULAR
          : wordVariants.get(0);
//      wordElement.setFeature(LexicalFeature.INFLECTIONS, wordVariants);
//      wordElement.setFeature(LexicalFeature.DEFAULT_INFL, defaultVariant);
      wordElement.setDefaultInflectionalVariant(defaultVariant);
    }


    // ignore (for now) other info in record
    return;
  }


  /**
   * convenience method to test that a list is not null and not empty
   * 
   * @param list
   * @return
   */
  @SuppressWarnings("unchecked")
  private boolean notEmpty(List list) {
    return list != null && !list.isEmpty();
  }


  /**
   * extract information about acronyms from NIH record, and add to a
   * simplenlg WordElement.
   * 
   * <P>
   * Acronyms are represented as lists of word elements. Any acronym will have
   * a list of full form word elements, retrievable via
   * {@link LexicalFeature#ACRONYM_OF}
   * 
   * @param wordElement
   * @param record
   */
  private void addAcronymInfo(WordElement wordElement, LexRecord record) {
    // NB: the acronyms are actually the full forms of which the word is an
    // acronym
    List<String> acronyms = record.GetAcronyms();


    if (!acronyms.isEmpty()) {
      // the list of full forms of which this word is an acronym
      List<NLGElement> acronymOf = wordElement
          .getFeatureAsElementList(LexicalFeature.ACRONYM_OF);


      // keep all acronym full forms and set them up as wordElements
      for (String fullForm : acronyms) {
        if (fullForm.contains("|")) {
          // get the acronym id
          String acronymID = fullForm.substring(
              fullForm.indexOf("|") + 1, fullForm.length());
          // create the full form element
          WordElement fullFormWE = this.getWordByID(acronymID);


          if (fullForm != null) {
            // add as full form of this acronym
            acronymOf.add(fullFormWE);


            // List<NLGElement> fullFormAcronyms = fullFormWE
            // .getFeatureAsElementList(LexicalFeature.ACRONYMS);
            // fullFormAcronyms.add(wordElement);
            // fullFormWE.setFeature(LexicalFeature.ACRONYMS,
            // fullFormAcronyms);
          }
        }
      }


      // set all the full forms for this acronym
      wordElement.setFeature(LexicalFeature.ACRONYM_OF, acronymOf);
    }


    // if (!acronyms.isEmpty()) {
    //
    // String acronym = acronyms.get(0);
    // // remove anything after a |, this will be an NIH ID
    // if (acronym.contains("|"))
    // acronym = acronym.substring(0, acronym.indexOf("|"));
    // wordElement.setFeature(LexicalFeature.ACRONYM_OF, acronym);
    // }


    return;
  }


  /**
   * Extract info about the spelling variants of a word from an NIH record,
   * and add to the simplenlg Woordelement.
   * 
   * <P>
   * Spelling variants are represented as lists of strings, retrievable via
   * {@link LexicalFeature#SPELL_VARS}
   * 
   * @param wordElement
   * @param record
   */
  private void addSpellingVariants(WordElement wordElement, LexRecord record) {
    Vector<String> vars = record.GetSpellingVars();


    if (vars != null && !vars.isEmpty()) {
      List<String> wordVars = new ArrayList<String>();
      wordVars.addAll(vars);
      wordElement.setFeature(LexicalFeature.SPELL_VARS, wordVars);
    }


    // we set the default spelling var as the baseForm
    wordElement.setFeature(LexicalFeature.DEFAULT_SPELL, wordElement
        .getBaseForm());
  }


}
// Source Code of simplenlg.lexicon.NIHDBLexicon
//
// Related Classes of simplenlg.lexicon.NIHDBLexicon