Source Code of opennlp.ccg.grammar.Grammar

///////////////////////////////////////////////////////////////////////////////
// Copyright (C) 2003-5 University of Edinburgh (Michael White) and Gunes Erkan
// 
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
// 
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU Lesser General Public License for more details.
// 
// You should have received a copy of the GNU Lesser General Public
// License along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
//////////////////////////////////////////////////////////////////////////////


package opennlp.ccg.grammar;


import opennlp.ccg.lexicon.*;
import opennlp.ccg.util.*;
import opennlp.ccg.synsem.*;
import opennlp.ccg.hylo.*;
import opennlp.ccg.parse.Parser;
import opennlp.ccg.parse.ParseException;
import opennlp.ccg.realize.Realizer;


import org.jdom.*;
import org.jdom.input.*;
import org.jdom.output.*;
import org.jdom.transform.*;
import org.xml.sax.*;


import javax.xml.parsers.*;
import javax.xml.transform.*;
import javax.xml.transform.stream.*;
import javax.xml.transform.sax.*;
import java.io.*;
import java.net.URL;
import java.util.*;


/**
 * A CCG grammar is essentially a lexicon plus a rule group.
 * A grammar may also have sequences of transformations to use in 
 * loading/saving LFs from/to XML.
 *
 * @author  Michael White
 * @author  Gunes Erkan
 * @version $Revision: 1.45 $, $Date: 2010/12/06 02:39:35 $ 
 */
public class Grammar {


    /** The lexicon. */
    public final Lexicon lexicon;
    
    /** The rule group. */
    public final RuleGroup rules;


    /** The type hierarchy. */
    public final Types types;
    
    /** The features to include in supertags. */
    public final Set<String> supertagFeatures = new HashSet<String>();
    
    /** The sequence of transformations to use when loading LFs from XML. */
    public final URL[] fromXmlTransforms;
    
    /** The sequence of transformations to use when saving LFs to XML. */
    public final URL[] toXmlTransforms;


    /** Preferences for displaying elements in this grammar. */
    public DisplayPrefs prefs = new DisplayPrefs();
   
    /** For access to the current grammar; should be generalized eventually. */
    public static Grammar theGrammar;
  
    // name of the grammar
    private String grammarName = null;
  
    // parser, for getting parsed words
    private Parser parser = null; 


    // XML factories
    private SAXParserFactory spf = null; 
    private static SAXTransformerFactory stf = null; 
    
    // transformer for loading/saving LFs from/to XML
    private Transformer transformer = null;
    
    // transformations for loading/saving LFs from/to XML
    private Templates[] fromXmlTemplates = null;
    private Templates[] toXmlTemplates = null;
    
    // transformer for saving strings to APML
    private Transformer apmlTransformer = null;
    
    /** The pitch accents recognized as underscored suffixes for translation to APML. */
    public static final String[] pitchAccents = { 
        "H*", "L*", "L+H*", "L*+H", "H*+L", "H+L*"
    };


    // set of pitch accents
    private static Set<String> pitchAccentsSet = null;    
    
    /** The boundary tones recognized as separate tokens for translation to APML. */
    public static final String[] boundaryTones = { 
        "L", "H", "LL%", "HH%", "LH%", "HL%"
    };
    
    // set of boundary tones
    private static Set<String> boundaryTonesSet = null;    


    
    /** Loads a grammar from the given filename. */
    public Grammar(String filename) throws IOException {
        this(new File(filename).toURI().toURL());
    }
    
    /** Loads a grammar from the given URL. */
  public Grammar(URL url) throws IOException {
      this(url, false);
    }
    
    /** Loads a grammar from the given URL, with the given flag for whether to ignore rule combos. */
    @SuppressWarnings("unchecked")
  public Grammar(URL url, boolean ignoreCombos) throws IOException {
        theGrammar = this;
        // read XML
        SAXBuilder builder = new SAXBuilder();
        Document doc;
        try {
            doc = builder.build(url);
        } catch (JDOMException jde) {
            throw (IOException) new IOException().initCause(jde);
        }
        Element root = doc.getRootElement();  // root corresponds to <grammar>
        grammarName = root.getAttributeValue("name");
    
        Element supertagsElt = root.getChild("supertags");
        if (supertagsElt != null) {
            String feats = supertagsElt.getAttributeValue("feats");
            if (feats != null) {
                String[] names = feats.split("\\s+");
                for (int i = 0; i < names.length; i++) {
                    supertagFeatures.add(names[i]);
                }
            }
        }
        if (supertagFeatures.isEmpty()) {
            // default is "form" and "lex"
            supertagFeatures.add("form"); supertagFeatures.add("lex"); 
        }
        
        Tokenizer tokenizer = null;
        Element tokenizerElt = root.getChild("tokenizer");
        if (tokenizerElt != null) {
            String tokenizerClass = tokenizerElt.getAttributeValue("classname");
            if (tokenizerClass != null) {
                try {
                    tokenizer = (Tokenizer) Class.forName(tokenizerClass).newInstance();
                } catch (Exception exc) {
                    throw (IOException) new IOException().initCause(exc);
                }
            }
            else tokenizer = new DefaultTokenizer();
            String replacementSemClasses = tokenizerElt.getAttributeValue("replacement-sem-classes");
            if (replacementSemClasses != null) {
                String[] semClasses = replacementSemClasses.split("\\s+");
                for (int i = 0; i < semClasses.length; i++) {
                    tokenizer.addReplacementSemClass(semClasses[i]);
                }
            }
        }
        
        Element typesElt = root.getChild("types");
        URL typesUrl;
        if (typesElt != null) {
            typesUrl = new URL(url, typesElt.getAttributeValue("file"));
        }
        else typesUrl = null;
        Element lexiconElt = root.getChild("lexicon");
        boolean openlex = "true".equals(lexiconElt.getAttributeValue("openlex"));
        URL lexiconUrl = new URL(url, lexiconElt.getAttributeValue("file")); 
        Element morphElt = root.getChild("morphology");
        URL morphUrl = new URL(url, morphElt.getAttributeValue("file"));
        Element rulesElt = root.getChild("rules");
        URL rulesUrl = new URL(url, rulesElt.getAttributeValue("file"));
        Element fromXmlElt = root.getChild("LF-from-XML");
        if (fromXmlElt != null) {
            List<Element> children = fromXmlElt.getChildren();
            fromXmlTransforms = new URL[children.size()];
            for (int i = 0; i < children.size(); i++) {
                Element transformElt = (Element) children.get(i);
                fromXmlTransforms[i] = new URL(url, transformElt.getAttributeValue("file"));
            }
        } else {
            fromXmlTransforms = new URL[0];
        }
        Element toXmlElt = root.getChild("LF-to-XML");
        if (toXmlElt != null) {
            List<Element> children = toXmlElt.getChildren();
            toXmlTransforms = new URL[children.size()];
            for (int i = 0; i < children.size(); i++) {
                Element transformElt = (Element) children.get(i);
                toXmlTransforms[i] = new URL(url, transformElt.getAttributeValue("file"));
            }
        } else {
            toXmlTransforms = new URL[0];
        }
        
        // load type hierarchy, lexicon and rules
        if (typesUrl != null) types = new Types(typesUrl, this);
        else types = new Types(this);
        if (tokenizer != null) lexicon = new Lexicon(this, tokenizer);
        else lexicon = new Lexicon(this);
        lexicon.openlex = openlex;
        lexicon.init(lexiconUrl, morphUrl); 
        rules = new RuleGroup(rulesUrl, this);
        
        // add observed supertag-rule combos for filtering, if any, unless ignoring combos
        if (!ignoreCombos) {
          String combosfile = rulesElt.getAttributeValue("combosfile");
          if (combosfile != null) {
            URL combosUrl = new URL(url, combosfile);
            rules.loadSupercatRuleCombos(combosUrl);
          }
          // set dynamic combos: defaults to true with a combosfile, otherwise defaults to false
          boolean dynamic = (combosfile != null);
          String dynamicCombos = rulesElt.getAttributeValue("dynamic-combos");
          if (dynamicCombos != null) dynamic = Boolean.parseBoolean(dynamicCombos);
          rules.setDynamicCombos(dynamic);
        }
    }


    
    /**
     * Returns a file url string relative to the user's current directory 
     * for the given filename.
     */
    public static String convertToFileUrl(String filename) {
        try {
            return new File(filename).toURI().toURL().toString();
        }
        catch (java.net.MalformedURLException exc) {
            throw (RuntimeException) new RuntimeException().initCause(exc);
        }
        // return "file:"+System.getProperty("user.dir")+"/"+filename;
    }
    
    
    // initializes factories and transformers
    private void initializeTransformers() throws TransformerConfigurationException {
        // init factories
        if (spf == null) {
            spf = SAXParserFactory.newInstance(); 
            spf.setNamespaceAware(true);
        }
        if (stf == null) {
            stf = (SAXTransformerFactory) TransformerFactory.newInstance();
            try { // try setting indent at factory level
                stf.setAttribute("indent-number", new Integer(2));
            } catch (IllegalArgumentException exc) {} // ignore
        }
        // set up transformer with indenting
        // nb: with some JVMs (eg JDK 1.4.1 on Windows), 
        //     the transformer needs to be reinitialized each time, in order to 
        //     run multiple :r FN commands in tccg 
        if (transformer == null) {
            transformer = stf.newTransformer();
            transformer.setOutputProperty(OutputKeys.INDENT, "yes");
            try { // also try setting indent as a xalan property 
                transformer.setOutputProperty("{http://xml.apache.org/xalan}indent-amount", "2");
            } catch (IllegalArgumentException exc) {} // ignore
        }
        // set up apml transformer 
        if (apmlTransformer == null) {
            InputStream toApmlStr = ClassLoader.getSystemResourceAsStream("opennlp/ccg/grammar/to-apml.xsl");
            apmlTransformer = stf.newTransformer(new StreamSource(toApmlStr));
            // nb: DOCTYPE SYSTEM also specified in to-apml.xsl; including  
            //     redundant specification here to workaround omission of DOCTYPE with Linux 1.5 JVM
            apmlTransformer.setOutputProperty(OutputKeys.DOCTYPE_SYSTEM, "apml.dtd");
        }
    }
    
    
    // does setup for LF from XML transformation, and returns a SAXSource for the given input stream
    // nb: need a new filter chain one for each use (perhaps due to an underyling bug)
    private SAXSource fromXmlSetup(InputStream istream) throws IOException {
        try {
            // initialize transformer
            initializeTransformers();
            // load transformations
            if (fromXmlTemplates == null) {
                fromXmlTemplates = new Templates[fromXmlTransforms.length];
                for (int i = 0; i < fromXmlTemplates.length; i++) {
                    String url = fromXmlTransforms[i].toString();
                    fromXmlTemplates[i] = stf.newTemplates(new StreamSource(url));
                }
            }
            // set up initial reader
            SAXParser parser = spf.newSAXParser();
            XMLReader reader = parser.getXMLReader();
            // set up chain of filters
            XMLFilter[] filters = new XMLFilter[fromXmlTransforms.length];
            for (int i = 0; i < filters.length; i++) {
                // create filter
                filters[i] = stf.newXMLFilter(fromXmlTemplates[i]);
                // set parent
                if (i == 0) { filters[0].setParent(reader); }
                else { filters[i].setParent(filters[i-1]); }
            }
            // set final reader/filter
            XMLReader finalReader = (filters.length == 0) ? reader : filters[filters.length-1];
            // set up and return LF from XML SAX source with final reader/filter
            return new SAXSource(finalReader, new InputSource(istream));
        } catch (ParserConfigurationException pce) {
            throw (IOException) new IOException().initCause(pce);
        } catch (SAXException se) {
            throw (IOException) new IOException().initCause(se);
        } catch (TransformerConfigurationException tce) {
            throw (IOException) new IOException().initCause(tce);
        }
    }
    
    /**
     * Loads a document from the XML in the given input stream, 
     * applying the configured from-XML transformations.
     */
    public synchronized Document loadFromXml(InputStream istream) throws IOException {
        try {
            // do setup and get source
            Source source = fromXmlSetup(istream);
            // do transformation
            JDOMResult result = new JDOMResult();
            transformer.transform(source, result);
            // return result doc
            return result.getDocument();
        } catch (TransformerException exc) { 
            throw (IOException) new IOException().initCause(exc);
        }
    }
    
    /**
     * Loads a document from the XML file with the given filename, 
     * applying the configured from-XML transformations.
     */
    public synchronized Document loadFromXml(String filename) throws IOException {
        BufferedInputStream bis = new BufferedInputStream(new FileInputStream(filename));
        Document retval = loadFromXml(bis);
        bis.close();
        return retval;
    }
    


    // does setup for LF to XML transformation, and returns a SAXSource for the given source
    // nb: need a new filter chain one for each use (perhaps due to an underyling bug)
    private SAXSource toXmlSetup(Source source) throws IOException {
        try {
            // initialize transformer
            initializeTransformers();
            // load transformations
            if (toXmlTemplates == null) {
                toXmlTemplates = new Templates[toXmlTransforms.length];
                for (int i = 0; i < toXmlTemplates.length; i++) {
                    // File file = new File(toXmlTransforms[i]);
                    // toXmlTemplates[i] = stf.newTemplates(new StreamSource(file));
                    String url = toXmlTransforms[i].toString();
                    toXmlTemplates[i] = stf.newTemplates(new StreamSource(url));
                }
            }
            // set up initial reader
            SAXParser parser = spf.newSAXParser();
            XMLReader reader = parser.getXMLReader();
            // set up chain of filters
            XMLFilter[] filters = new XMLFilter[toXmlTransforms.length];
            for (int i = 0; i < filters.length; i++) {
                // create filter
                filters[i] = stf.newXMLFilter(toXmlTemplates[i]);
                // set parent
                if (i == 0) { filters[0].setParent(reader); }
                else { filters[i].setParent(filters[i-1]); }
            }
            // set final reader/filter
            XMLReader finalReader = (filters.length == 0) ? reader : filters[filters.length-1];
            // set up and return LF to XML SAX source with final reader/filter
            return new SAXSource(finalReader, SAXSource.sourceToInputSource(source));
        } catch (ParserConfigurationException pce) {
            throw (IOException) new IOException().initCause(pce);
        } catch (SAXException se) {
            throw (IOException) new IOException().initCause(se);
        } catch (TransformerConfigurationException tce) {
            throw (IOException) new IOException().initCause(tce);
        }
    }


    /**
     * Saves the given LF with the given target string to an XML file 
     * with the given filename, applying the configured to-XML
     * transformations.
     */
    public synchronized void saveToXml(LF lf, String target, String filename) throws IOException { 
        // ensure dirs exist for filename
        File file = new File(filename);
        File parent = file.getParentFile();
        if (parent != null && !parent.exists()) { parent.mkdirs(); }
        FileOutputStream out = new FileOutputStream(file); 
        saveToXml(lf, target, out);
        out.close();
    }


    /**
     * Saves the given LF with the given target string as XML to the 
     * given output stream, applying the configured to-XML
     * transformations.
     */
    public synchronized void saveToXml(LF lf, String target, OutputStream out) throws IOException { 
        // make doc with XML for LF and target
        Document doc = new Document();
        Element root = new Element("xml");
        doc.setRootElement(root);
        root.addContent(HyloHelper.toXml(lf));
        Element targetElt = new Element("target");
        targetElt.addContent(target);
        root.addContent(targetElt);


        // write transformed doc to file
        try {
            // do setup and get source
            Source source = toXmlSetup(new JDOMSource(doc));
            // do transformation
            transformer.transform(source, new StreamResult(new OutputStreamWriter(out)));
        } catch (TransformerException exc) { 
            throw (IOException) new IOException().initCause(exc);
        }
    }


    
    /**
     * Transforms an LF by applying the configured to-XML and from-XML transformations, 
     * then loading the LF from the resulting doc.
     */
    public synchronized LF transformLF(LF lf) throws IOException {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        saveToXml(lf, "", out);
        ByteArrayInputStream in = new ByteArrayInputStream(out.toByteArray());
        Document doc = loadFromXml(in);
        return Realizer.getLfFromDoc(doc);
    }
    
    /**
     * Loads an LF by applying the configured from-XML transformations, 
     * then loading the LF from the resulting doc.
     */
    public synchronized LF loadLF(Document doc) throws IOException {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        serializeXml(doc, out);
        ByteArrayInputStream in = new ByteArrayInputStream(out.toByteArray());
        Document doc2 = loadFromXml(in);
        return Realizer.getLfFromDoc(doc2);
    }
    
    
    /**
     * Convenience method to serialize XML.
     */
    public synchronized void serializeXml(Document doc, OutputStream out) throws IOException {
        try {
            initializeTransformers();
            JDOMResult result = new JDOMResult(); // as suggested by Amy Isard, for better java/xml version compatibility
            transformer.transform(new JDOMSource(doc), result);
            XMLOutputter outputter = new XMLOutputter();
            outputter.setFormat(Format.getPrettyFormat());
            outputter.output(result.getDocument(), new OutputStreamWriter(out)); // end of A.I. suggestion
        } catch (TransformerException exc) { 
            throw (IOException) new IOException().initCause(exc);
        }
    }


    
    /** 
     * Makes an element for the given LF, applying the configured to-XML transformations.
     */
    public synchronized Element makeLfElt(LF lf) throws IOException { 
        // make doc with LF in it
        Document lfDoc = new Document();
        lfDoc.setRootElement(HyloHelper.toXml(lf));
        // apply to-XML transformations
        try {
            // do setup and get source
            Source source = toXmlSetup(new JDOMSource(lfDoc));
            // do transformation and get resulting doc
            JDOMResult result = new JDOMResult();
            transformer.transform(source, result);
            lfDoc = result.getDocument();
        } catch (TransformerException exc) { 
            throw (IOException) new IOException().initCause(exc);
        }
        return lfDoc.detachRootElement();
    }


    
    /** 
     * Returns whether the given string is a recognized pitch accent.
     */
    public static boolean isPitchAccent(String s) {
        if (pitchAccentsSet == null) {
            pitchAccentsSet = new HashSet<String>();
            for (int i = 0; i < pitchAccents.length; i++) {
                pitchAccentsSet.add(pitchAccents[i]);
            }
        }
        return pitchAccentsSet.contains(s);
    }
    
    /** 
     * Returns whether the given string is a recognized boundary tone. 
     */
    public static boolean isBoundaryTone(String s) {
        if (boundaryTonesSet == null) {
            boundaryTonesSet = new HashSet<String>();
            for (int i = 0; i < boundaryTones.length; i++) {
                boundaryTonesSet.add(boundaryTones[i]);
            }
        }
        return boundaryTonesSet.contains(s);
    }
    
    
    /**
     * Saves the given sign's words, pitch accents and boundary tones 
     * to an APML file with the given filename.
     */
    public synchronized void saveToApml(Sign sign, String filename) throws IOException {
        // ensure dirs exist for filename
        File file = new File(filename);
        File parent = file.getParentFile();
        if (parent != null && !parent.exists()) { parent.mkdirs(); }
        // do transformation
        FileWriter fw = new FileWriter(file);
        saveToApml(sign, fw);
        fw.close();
    }
    
    /**
     * Saves the given sign's words, pitch accents and boundary tones 
     * as APML to the given writer.
     * The orthography is first converted to XML using Sign.getWordsInXml, 
     * and then converted to APML using opennlp/ccg/grammar/to-apml.xsl.
     * The string is assumed to be a single performative.
     */
    public synchronized void saveToApml(Sign sign, Writer writer) throws IOException { 
        // convert words
        Document doc = sign.getWordsInXml();
        // write transformed doc to file
        try {
            // do setup and get source
            initializeTransformers();
            Source source = new JDOMSource(doc);
            // do transformation
            apmlTransformer.transform(source, new StreamResult(writer));
        } catch (TransformerException exc) { 
            throw (IOException) new IOException().initCause(exc);
        }
    }
    
    
    /** 
     * Returns the words for the given string, as determined by its 
     * first parse, or an empty list, if it cannot be parsed.
     */
    // NB: Could try to extend this to find the parse with the intended LF.
    public List<Word> getParsedWords(String s) {
        // ensure parser instantiated
        if (parser == null) parser = new Parser(this);
        // get parses
        try {
            parser.parse(s);
        }
        catch (ParseException pe) {
            return new ArrayList<Word>(0);
        }
        List<Sign> parses = parser.getResult();
        // return words of first parse
        Sign sign = parses.get(0);
        return sign.getWords();
    }




  /**
  * Returns the name of the loaded grammar. Null if no name given.
  */
  public final String getName() {
    return grammarName;
  }
  
  
    /**
     * Writes the list of words to a basic morph file.
     * @throws IOException 
     */
    public void toMorphXml(List<Word> words, String filename) throws IOException {
      Collections.sort(words);
      XMLOutputter xout = new XMLOutputter();
      xout.setFormat(Format.getPrettyFormat());
      PrintWriter out = new PrintWriter(new BufferedWriter(new FileWriter(filename)));
      out.println("<?xml version=\"1.0\" encoding=\"UTF-8\"?>");
      out.println("<morph name=\"" + grammarName + "\">");
      for (Word w : words) {
        Element e = new Element("entry");
        e.setAttribute("word", w.getForm());
        if (w.getForm() != w.getStem() && w.getStem() != null) e.setAttribute("stem", w.getStem());
        if (w.getPOS() != null) e.setAttribute("pos", w.getPOS());
        if (w.getSemClass() != null) e.setAttribute("class", w.getSemClass());
        xout.output(e, out); out.println();
      }
      out.println("</morph>");
      out.flush(); out.close();
    }
  
    /**
     * Writes the list of categories and associated POS tags to a basic lexicon file.
     * Note that the LFs are expected to have a [*DEFAULT*] proposition in the 
     * desired location for predicate insertion.
     * @throws IOException 
     */
    public void toLexiconXml(List<Category> cats, List<String> POSs, String filename) throws IOException {
      // create map from supertags with unique suffixes to cat/pos pairs
      Map<String,Pair<Category,String>> stagMap = new HashMap<String,Pair<Category,String>>();
      for (int i=0; i < cats.size(); i++) {
        Category cat = cats.get(i); String pos = POSs.get(i);
        String stag = cat.getSupertag();
        if (stagMap.containsKey(stag)) {
            int j = 1;
          while (stagMap.containsKey(stag+"-"+j)) j++;
          stag += "-"+j;
        }
        stagMap.put(stag, new Pair<Category,String>(cat, pos));
      }
      List<String> keys = new ArrayList<String>(stagMap.keySet());
      Collections.sort(keys);
      XMLOutputter xout = new XMLOutputter();
      xout.setFormat(Format.getPrettyFormat());
      PrintWriter out = new PrintWriter(new BufferedWriter(new FileWriter(filename)));
      out.println("<?xml version=\"1.0\" encoding=\"UTF-8\"?>");
      out.println("<lexicon name=\"" + grammarName + "\">");
      for (String key : keys) {
        Pair<Category,String> p = stagMap.get(key);
        Category cat = p.a; String pos = p.b;
        Element fam = new Element("family");
        fam.setAttribute("name", key);
        fam.setAttribute("pos", pos);
        Element ent = new Element("entry");
        ent.setAttribute("name", "1");
        fam.addContent(ent);
        ent.addContent(cat.toXml());
        xout.output(fam, out); out.println();
      }
      out.println("</lexicon>");
      out.flush(); out.close();
    }
}
Source Code of opennlp.ccg.grammar.Grammar

Related Classes of opennlp.ccg.grammar.Grammar