package synalp.generation.morphology;
import java.io.File;
import java.io.IOException;

import javax.xml.parsers.*;

import org.apache.log4j.Logger;
import org.xml.sax.*;
import org.xml.sax.helpers.DefaultHandler;

import synalp.commons.input.*;
import synalp.commons.unification.FeatureStructure;
import synalp.generation.morphology.javacc.MphFormatParser;
* Reads a MorphLexicon in XML or MPH format. The XML format contains lines such as <m
* l="disapprove" f="disapproved" fs="cat=v; pers=3; mode=ppart"/>, where l is the lemma, f the
* inflected form and fs the feature structure.
* @author Alexandre Denis
public class MorphLexiconReader extends DefaultHandler
public static Logger logger = Logger.getLogger(MorphLexiconReader.class);
private MorphLexicon lexicon;
* Shows how to read a lexicon.
* @param args
* @throws IOException
* @throws SAXException
public static void main(String[] args) throws SAXException, IOException
MorphLexicon lexicon = readLexicon(new File("resources/sem-xtag2/auto/morph.mph"));
for(MorphLexiconEntry entry : lexicon.values())
* Reads the given morph lexicon. This method determines the proper way to read the lexicon
* thanks to the file extension. If the extension is ".xml", it uses the XML format, if the
* extension is ".mph" it uses the MPH format. Otherwise it throws an exception.
* @param file
* @return a morph lexicon
* @throws SAXException
* @throws IOException
public static MorphLexicon readLexicon(File file) throws SAXException, IOException
if (file.getName().endsWith(".xml"))
return readLexiconXMLFormat(file);
else if (file.getName().endsWith(".mph"))
return readLexiconMphFormat(file);
else throw new IOException("Error: unable to determine lexicon format, the file must end either with '.xml' or '.mph'");
* Reads the given morph lexicon in MPH format.
* @param file
* @return a morph lexicon
* @throws IOException
public static MorphLexicon readLexiconMphFormat(File file) throws IOException
return MphFormatParser.readLexicon(file);
catch (Exception e)
throw new IOException("Error: unable to read " + file + " : " + e.getMessage());
* Reads the given lexicon in XML format.
* @param file
* @return a morphological lexicon
* @throws SAXException
* @throws IOException
public static MorphLexicon readLexiconXMLFormat(File file) throws SAXException, IOException
MorphLexiconReader reader = new MorphLexiconReader();
{"Reading morph lexicon " + file);
SAXParserFactory.newInstance().newSAXParser().parse(file, reader);
catch (ParserConfigurationException e)
return reader.lexicon;
public void startDocument() throws SAXException
lexicon = new MorphLexicon();
public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException
if (qName.equals("m"))
String lemmaStr = attributes.getValue("l");
if (lemmaStr == null)
logger.error("Missing lemma feature \"l\" for a morphological entry, skipping");
String morphStr = attributes.getValue("f");
if (morphStr == null)
logger.error("Missing inflected form feature \"f\" for a morphological entry, skipping");
String fsStr = attributes.getValue("fs");
FeatureStructure fs = new FeatureStructure();
if (fsStr != null)
fs = MphFormatParser.readFeatureStructure(fsStr);
catch (Exception e)
logger.error("Unable to parse feature structure '" + fsStr + "' : " + e.getMessage() + ", skipping fs");
lexicon.add(new MorphLexiconEntry(new Lemma(lemmaStr), new Lexem(morphStr, fs)));