Package synalp.generation.morphology

Source Code of synalp.generation.morphology.MorphLexiconReader

package synalp.generation.morphology;

import java.io.*;

import javax.xml.parsers.*;

import org.apache.log4j.Logger;
import org.xml.sax.*;
import org.xml.sax.helpers.DefaultHandler;

import synalp.commons.input.*;
import synalp.commons.unification.FeatureStructure;
import synalp.generation.morphology.javacc.MphFormatParser;


/**
* Reads a MorphLexicon in XML or MPH format. The XML format contains lines such as <m
* l="disapprove" f="disapproved" fs="cat=v; pers=3; mode=ppart"/>, where l is the lemma, f the
* inflected form and fs the feature structure.
* @author Alexandre Denis
*/
public class MorphLexiconReader extends DefaultHandler
{
  @SuppressWarnings("javadoc")
  public static Logger logger = Logger.getLogger(MorphLexiconReader.class);

  private MorphLexicon lexicon;


  /**
   * Shows how to read a lexicon.
   * @param args
   * @throws IOException
   * @throws SAXException
   */
  public static void main(String[] args) throws SAXException, IOException
  {
    MorphLexicon lexicon = readLexicon(new File("resources/sem-xtag2/auto/morph.mph"));
    for(MorphLexiconEntry entry : lexicon.values())
      System.out.println(entry);
  }


  /**
   * Reads the given morph lexicon. This method determines the proper way to read the lexicon
   * thanks to the file extension. If the extension is ".xml", it uses the XML format, if the
   * extension is ".mph" it uses the MPH format. Otherwise it throws an exception.
   * @param file
   * @return a morph lexicon
   * @throws SAXException
   * @throws IOException
   */
  public static MorphLexicon readLexicon(File file) throws SAXException, IOException
  {
    if (file.getName().endsWith(".xml"))
      return readLexiconXMLFormat(file);
    else if (file.getName().endsWith(".mph"))
      return readLexiconMphFormat(file);
    else throw new IOException("Error: unable to determine lexicon format, the file must end either with '.xml' or '.mph'");
  }


  /**
   * Reads the given morph lexicon in MPH format.
   * @param file
   * @return a morph lexicon
   * @throws IOException
   */
  public static MorphLexicon readLexiconMphFormat(File file) throws IOException
  {
    try
    {
      return MphFormatParser.readLexicon(file);
    }
    catch (Exception e)
    {
      throw new IOException("Error: unable to read " + file + " : " + e.getMessage());
    }
  }


  /**
   * Reads the given lexicon in XML format.
   * @param file
   * @return a morphological lexicon
   * @throws SAXException
   * @throws IOException
   */
  public static MorphLexicon readLexiconXMLFormat(File file) throws SAXException, IOException
  {
    MorphLexiconReader reader = new MorphLexiconReader();
    try
    {
      logger.info("Reading morph lexicon " + file);
      SAXParserFactory.newInstance().newSAXParser().parse(file, reader);
    }
    catch (ParserConfigurationException e)
    {
      e.printStackTrace();
    }
    return reader.lexicon;
  }


  @Override
  public void startDocument() throws SAXException
  {
    lexicon = new MorphLexicon();
  }


  @Override
  public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException
  {
    if (qName.equals("m"))
    {
      String lemmaStr = attributes.getValue("l");
      if (lemmaStr == null)
      {
        logger.error("Missing lemma feature \"l\" for a morphological entry, skipping");
        return;
      }

      String morphStr = attributes.getValue("f");
      if (morphStr == null)
      {
        logger.error("Missing inflected form feature \"f\" for a morphological entry, skipping");
        return;
      }

      String fsStr = attributes.getValue("fs");
      FeatureStructure fs = new FeatureStructure();
      if (fsStr != null)
        try
        {
          fs = MphFormatParser.readFeatureStructure(fsStr);
        }
        catch (Exception e)
        {
          logger.error("Unable to parse feature structure '" + fsStr + "' : " + e.getMessage() + ", skipping fs");

        }

      lexicon.add(new MorphLexiconEntry(new Lemma(lemmaStr), new Lexem(morphStr, fs)));
    }
  }
}
TOP

Related Classes of synalp.generation.morphology.MorphLexiconReader

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.