/*
* The contents of this file are subject to the Mozilla Public License
* Version 1.1 (the "License"); you may not use this file except in
* compliance with the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS"
* basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
* License for the specific language governing rights and limitations
* under the License.
*
* The Original Code is "Simplenlg".
*
* The Initial Developer of the Original Code is Ehud Reiter, Albert Gatt and Dave Westwater.
* Portions created by Ehud Reiter, Albert Gatt and Dave Westwater are Copyright (C) 2010-11 The University of Aberdeen. All Rights Reserved.
*
* Contributor(s): Ehud Reiter, Albert Gatt, Dave Wewstwater, Roman Kutlak, Margaret Mitchell.
*/
package simplenlg.lexicon;
import java.io.File;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import simplenlg.features.Inflection;
import simplenlg.features.LexicalFeature;
import simplenlg.framework.ElementCategory;
import simplenlg.framework.LexicalCategory;
import simplenlg.framework.WordElement;
/**
* This class loads words from an XML lexicon. All features specified in the
* lexicon are loaded.
*
* @author ereiter
*
*/
public class XMLLexicon extends Lexicon {
// node names in lexicon XML files
private static final String XML_BASE = "base"; // base form of Word
private static final String XML_CATEGORY = "category"; // lexical category of Word
private static final String XML_ID = "id"; // identifier of Word
private static final String XML_WORD = "word"; // node defining a word
// lexicon
private Set<WordElement> words; // set of all words loaded from the lexicon
private Map<String, WordElement> indexByID; // map from ID to word
// map from base form to the list of words with this base form
private Map<String, List<WordElement>> indexByBase;
// map from variant (base or inflected form) to the list of words with
// this variant
private Map<String, List<WordElement>> indexByVariant;
/**********************************************************************/
// constructors
/**********************************************************************/
/**
 * Creates a lexicon from the XML file at the given path.
 *
 * @param filename
 *            path of the XML lexicon file; it is wrapped in a {@link File}
 *            and converted to a URI before loading
 */
public XMLLexicon(String filename) {
    super();
    URI lexiconLocation = new File(filename).toURI();
    createLexicon(lexiconLocation);
}
/**
* Load an XML Lexicon from a File
*
* @param file
*/
public XMLLexicon(File file) {
super();
createLexicon(file.toURI());
}
/**
 * Creates a lexicon from the XML document found at a URI.
 *
 * @param lexiconURI
 *            location of the XML lexicon; passed unchanged to the loader
 */
public XMLLexicon(URI lexiconURI) {
    super();
    this.createLexicon(lexiconURI);
}
public XMLLexicon() {
try {
createLexicon(getClass().getResource(
"/simplenlg/lexicon/default-lexicon.xml").toURI());
} catch (URISyntaxException ex) {
System.out.println(ex.toString());
}
}
/**
 * Loads the lexicon found at a URI and builds the internal indices.
 * <p>
 * The containers are always reset first. Every element child of the
 * document root is converted to a word; children that do not yield a word
 * are skipped. Any exception raised while parsing or converting is printed
 * to standard output and stops the load at that point. The special cases
 * are added afterwards in every case.
 *
 * @param lexiconURI
 *            location of the XML lexicon document
 */
private void createLexicon(URI lexiconURI) {
    // start from empty containers
    words = new HashSet<WordElement>();
    indexByID = new HashMap<String, WordElement>();
    indexByBase = new HashMap<String, List<WordElement>>();
    indexByVariant = new HashMap<String, List<WordElement>>();

    try {
        // NOTE(review): the parser is used with its default configuration;
        // confirm whether lexicons can come from untrusted sources before
        // relying on this.
        DocumentBuilder parser = DocumentBuilderFactory.newInstance()
                .newDocumentBuilder();
        Document lexiconDoc = parser.parse(lexiconURI.toString());
        if (lexiconDoc != null) {
            NodeList children = lexiconDoc.getDocumentElement()
                    .getChildNodes();
            int childCount = children.getLength();
            for (int idx = 0; idx < childCount; idx++) {
                Node child = children.item(idx);
                // text, comments etc. between word elements are skipped
                if (child.getNodeType() != Node.ELEMENT_NODE) {
                    continue;
                }
                WordElement entry = convertNodeToWord(child);
                if (entry == null) {
                    continue;
                }
                words.add(entry);
                IndexWord(entry);
            }
        }
    } catch (Exception ex) {
        System.out.println(ex.toString());
    }

    addSpecialCases();
}
/**
 * Registers hand-coded special cases in the variant index. Currently this
 * maps the forms "is", "am", "are", "was" and "were" to the verb "be",
 * provided the lexicon lookup for that verb returns a word.
 */
private void addSpecialCases() {
    WordElement beVerb = getWord("be", LexicalCategory.VERB);
    if (beVerb == null) {
        return;
    }
    String[] beForms = { "is", "am", "are", "was", "were" };
    for (String form : beForms) {
        updateIndex(beVerb, form, indexByVariant);
    }
}
/**
 * Create a simplenlg WordElement from a word node in a lexicon XML file.
 * <p>
 * Child elements named <code>base</code>, <code>category</code> and
 * <code>id</code> (compared case-insensitively) set the base form, lexical
 * category and ID of the word. Any other child element with empty text is
 * treated as an inflection code if {@link Inflection#getInflCode} knows
 * it, and as a boolean feature otherwise. Any other child element with
 * text becomes a string-valued feature.
 *
 * @param wordNode
 *            the XML node to convert
 * @return the new word, or <code>null</code> if the node is not a word
 *         node or if its category is not the name of a
 *         {@link LexicalCategory}
 */
private WordElement convertNodeToWord(Node wordNode) {
    // if this isn't a Word node, ignore it
    if (!wordNode.getNodeName().equalsIgnoreCase(XML_WORD)) {
        return null;
    }

    // create word
    WordElement word = new WordElement();
    List<Inflection> inflections = new ArrayList<Inflection>();

    // now copy features
    NodeList nodes = wordNode.getChildNodes();
    for (int i = 0; i < nodes.getLength(); i++) {
        Node featureNode = nodes.item(i);
        if (featureNode.getNodeType() != Node.ELEMENT_NODE) {
            continue;
        }

        String feature = featureNode.getNodeName().trim();
        String value = featureNode.getTextContent();
        if (value != null) {
            value = value.trim();
        }

        if (feature.equalsIgnoreCase(XML_BASE)) {
            word.setBaseForm(value);
        } else if (feature.equalsIgnoreCase(XML_CATEGORY)) {
            LexicalCategory category = null;
            if (value != null) {
                try {
                    // Locale.ROOT keeps the upper-casing independent of
                    // the default locale (e.g. Turkish maps "i" to a
                    // dotted capital I, which would never match)
                    category = LexicalCategory.valueOf(value
                            .toUpperCase(java.util.Locale.ROOT));
                } catch (IllegalArgumentException ex) {
                    category = null;
                }
            }
            if (category == null) {
                // skip only this entry rather than letting the exception
                // abort the loading of every remaining word
                System.out
                        .println("Error in XML lexicon: unknown category \""
                                + value + "\" for " + word.toString());
                return null;
            }
            word.setCategory(category);
        } else if (feature.equalsIgnoreCase(XML_ID)) {
            word.setId(value);
        } else if (value == null || value.equals("")) {
            // if this is an infl code, add it to inflections
            Inflection infl = Inflection.getInflCode(feature);
            if (infl != null) {
                inflections.add(infl);
            } else {
                // otherwise assume it's a boolean feature
                word.setFeature(feature, true);
            }
        } else {
            word.setFeature(feature, value);
        }
    }

    // if no infl specified, assume regular
    if (inflections.isEmpty()) {
        inflections.add(Inflection.REGULAR);
    }

    // default inflection code is "reg" if we have it, else the first infl
    // code listed for the word
    Inflection defaultInfl = inflections.contains(Inflection.REGULAR) ? Inflection.REGULAR
            : inflections.get(0);
    word.setFeature(LexicalFeature.DEFAULT_INFL, defaultInfl);
    word.setDefaultInflectionalVariant(defaultInfl);
    for (Inflection infl : inflections) {
        word.addInflectionalVariant(infl);
    }

    // done, return word
    return word;
}
/**
 * Enters a word into the internal indices: by base form (when it has
 * one), by ID (when it has one) and by each of its variants.
 *
 * @param word
 *            the word to index
 */
private void IndexWord(WordElement word) {
    // base-form index
    String baseForm = word.getBaseForm();
    if (baseForm != null) {
        updateIndex(word, baseForm, indexByBase);
    }

    // ID index; IDs are expected to be unique, so a clash is reported and
    // the later word replaces the earlier one
    String wordId = word.getId();
    if (wordId != null) {
        WordElement previous = indexByID.put(wordId, word);
        if (previous != null) {
            System.out.println("Lexicon error: ID " + wordId
                    + " occurs more than once");
        }
    }

    // variant index
    Set<String> variants = getVariants(word);
    for (String variant : variants) {
        updateIndex(word, variant, indexByVariant);
    }
}
/**
 * Appends a word to the list stored under a key in an index, creating the
 * list first if the key is not yet present.
 *
 * @param word
 *            the word to add
 * @param base
 *            the key to file the word under (a base form or a variant,
 *            depending on the index)
 * @param index
 *            the index to update
 */
private void updateIndex(WordElement word, String base,
        Map<String, List<WordElement>> index) {
    List<WordElement> bucket = index.get(base);
    if (bucket == null) {
        bucket = new ArrayList<WordElement>();
        index.put(base, bucket);
    }
    bucket.add(word);
}
/******************************************************************************************/
// main methods to get data from lexicon
/******************************************************************************************/
/*
 * (non-Javadoc)
 *
 * Looks the base form up in the base-form index, filtered by category.
 *
 * @see simplenlg.lexicon.Lexicon#getWords(java.lang.String,
 * simplenlg.framework.LexicalCategory)
 */
@Override
public List<WordElement> getWords(String baseForm, LexicalCategory category) {
    List<WordElement> matches = getWordsFromIndex(baseForm, category,
            indexByBase);
    return matches;
}
/**
 * Get the words stored under a key in an index, restricted to a category.
 * <p>
 * The returned list is always a new list owned by the caller. This also
 * holds for {@link LexicalCategory#ANY}, so modifying the result never
 * alters the index itself.
 *
 * @param indexKey
 *            the key to look up (a base form or variant)
 * @param category
 *            the category words must have; {@link LexicalCategory#ANY}
 *            accepts every word stored under the key
 * @param indexMap
 *            the index to search
 * @return the matching words, in index order; empty if the key is unknown
 *         or no word has the requested category
 */
private List<WordElement> getWordsFromIndex(String indexKey,
        LexicalCategory category, Map<String, List<WordElement>> indexMap) {
    List<WordElement> result = new ArrayList<WordElement>();

    // case 1: unknown, return empty list
    List<WordElement> candidates = indexMap.get(indexKey);
    if (candidates == null) {
        return result;
    }

    // case 2: category is ANY, return (a copy of) everything
    if (category == LexicalCategory.ANY) {
        result.addAll(candidates);
        return result;
    }

    // case 3: other category, search for match
    for (WordElement word : candidates) {
        if (word.getCategory() == category) {
            result.add(word);
        }
    }
    return result;
}
/*
 * (non-Javadoc)
 *
 * Returns a list holding the single word registered under the ID, or an
 * empty list when the ID is not in the index.
 *
 * @see simplenlg.lexicon.Lexicon#getWordsByID(java.lang.String)
 */
@Override
public List<WordElement> getWordsByID(String id) {
    List<WordElement> matches = new ArrayList<WordElement>();
    WordElement found = indexByID.get(id);
    if (found != null) {
        matches.add(found);
    }
    return matches;
}
/*
 * (non-Javadoc)
 *
 * Looks the variant up in the variant index, filtered by category.
 *
 * @see simplenlg.lexicon.Lexicon#getWordsFromVariant(java.lang.String,
 * simplenlg.framework.LexicalCategory)
 */
@Override
public List<WordElement> getWordsFromVariant(String variant,
        LexicalCategory category) {
    List<WordElement> matches = getWordsFromIndex(variant, category,
            indexByVariant);
    return matches;
}
/**
 * Quick-and-dirty collection of the morphological variants of a word;
 * should be replaced by something better.
 * <p>
 * The base form is always included. Nouns add a plural, adjectives add a
 * comparative and a superlative, and verbs add the third-person present,
 * past, past participle and present participle. Other categories
 * contribute only the base form.
 *
 * @param word
 *            the word whose variants are wanted
 * @return the set of variant strings
 */
private Set<String> getVariants(WordElement word) {
    Set<String> forms = new HashSet<String>();
    forms.add(word.getBaseForm());

    ElementCategory wordCategory = word.getCategory();
    if (wordCategory == LexicalCategory.NOUN) {
        forms.add(getVariant(word, LexicalFeature.PLURAL, "s"));
    } else if (wordCategory == LexicalCategory.ADJECTIVE) {
        forms.add(getVariant(word, LexicalFeature.COMPARATIVE, "er"));
        forms.add(getVariant(word, LexicalFeature.SUPERLATIVE, "est"));
    } else if (wordCategory == LexicalCategory.VERB) {
        forms.add(getVariant(word, LexicalFeature.PRESENT3S, "s"));
        forms.add(getVariant(word, LexicalFeature.PAST, "ed"));
        forms.add(getVariant(word, LexicalFeature.PAST_PARTICIPLE, "ed"));
        forms.add(getVariant(word, LexicalFeature.PRESENT_PARTICIPLE,
                "ing"));
    }
    // any other category: only the base form is needed

    return forms;
}
/**
 * Quick-and-dirty computation of one morphological form; should be
 * replaced by something better.
 * <p>
 * If the word carries the given feature, that feature's string value is
 * the form. Otherwise the form is built from the base form and the
 * suffix.
 *
 * @param word
 *            the word to inflect
 * @param feature
 *            name of the feature that may hold an explicit form
 * @param suffix
 *            suffix to attach to the base form when the feature is absent
 * @return the explicit or computed form
 */
private String getVariant(WordElement word, String feature, String suffix) {
    return word.hasFeature(feature) ? word.getFeatureAsString(feature)
            : getForm(word.getBaseForm(), suffix);
}
/**
 * Quick-and-dirty routine for standard orthographic changes when adding a
 * suffix to a base form. Should be replaced by something better!
 * <p>
 * Three rules are applied in order:
 * <ol>
 * <li>a final "y" becomes "ie" unless the suffix starts with "i" or the
 * "y" follows a vowel (cry + s = cries, but play + s = plays); "quy"
 * counts as consonant + y (soliloquy + s = soliloquies);</li>
 * <li>a final "e" is dropped before a suffix starting with "e" or "i"
 * (like + ed = liked), except that a base ending in "ee" keeps both
 * letters before "i" (agree + ing = agreeing);</li>
 * <li>an "e" is inserted when the suffix starts with "s" and the base
 * ends in s, x, z, ch or sh (watch + s = watches).</li>
 * </ol>
 *
 * @param base
 *            the base form; must not be <code>null</code>
 * @param suffix
 *            the suffix to attach; must not be <code>null</code>
 * @return the base form with the suffix attached
 */
private String getForm(String base, String suffix) {
    String stem = base;

    // rule 1 - convert final "y" to "ie" if suffix does not start with "i"
    // eg, cry + s = cries , not crys
    // but leave a "y" that follows a vowel alone: play + s = plays
    if (stem.endsWith("y") && !suffix.startsWith("i")) {
        boolean afterVowel = false;
        if (stem.length() >= 2) {
            char previous = Character.toLowerCase(stem.charAt(stem
                    .length() - 2));
            // the "u" in "quy" is part of the consonant sound "qu"
            afterVowel = "aeiou".indexOf(previous) >= 0
                    && !stem.endsWith("quy");
        }
        if (!afterVowel) {
            stem = stem.substring(0, stem.length() - 1) + "ie";
        }
    }

    // rule 2 - drop final "e" if suffix starts with "e" or "i"
    // eg, like+ed = liked, not likeed
    // but keep the "ee" of agree, see, ... before "i": agree+ing = agreeing
    if (stem.endsWith("e")) {
        boolean beforeE = suffix.startsWith("e");
        boolean beforeI = suffix.startsWith("i") && !stem.endsWith("ee");
        if (beforeE || beforeI) {
            stem = stem.substring(0, stem.length() - 1);
        }
    }

    // rule 3 - insert "e" if suffix starts with "s" and base ends in s, x,
    // z, ch, sh
    // eg, watch+s -> watches, not watchs
    if (suffix.startsWith("s")
            && (stem.endsWith("s") || stem.endsWith("x")
                    || stem.endsWith("z") || stem.endsWith("ch") || stem
                    .endsWith("sh"))) {
        stem = stem + "e";
    }

    // have made changes, now append and return
    return stem + suffix; // eg, want + s = wants
}
}