Package opennlp.ccgbank

Source Code of opennlp.ccgbank.CCGBankConvert$TreeWalker

///////////////////////////////////////////////////////////////////////////////
// Copyright (C) 2005-2009 Scott Martin, Rajakrishnan Rajkumar and Michael White
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
//////////////////////////////////////////////////////////////////////////////

/*
* $Id: CCGBankConvert.java,v 1.8 2011/11/10 22:18:42 mwhite14850 Exp $
*/
package opennlp.ccgbank;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.StringTokenizer;

import javax.xml.transform.Source;
import javax.xml.transform.sax.SAXSource;

import opennlp.ccgbank.convert.InfoHelper;
import opennlp.ccgbank.convert.XSLTTrueCaser;
import opennlp.ccgbank.convert.MorphLookup;
import opennlp.ccgbank.parse.CCGbankDerivation;
import opennlp.ccgbank.parse.SimpleNode;

import org.apache.tools.ant.BuildException;
import org.apache.tools.ant.DefaultLogger;
import org.apache.tools.ant.Project;
import org.apache.tools.ant.ProjectHelper;
import org.jdom.Document;
import org.jdom.Element;
import org.jdom.transform.JDOMSource;
import org.xml.sax.InputSource;


/**
* Converts the CCGBank to a modified version for grammar extraction.
* <p>
* Within this task, a series of <code>FileList</code>s is specified. These
* files are the lists of xsltProcessors that should be used to transform the
* CCGBank. These xsltProcessors are processed in the order they occur in the
* <code>FileList</code> specified within this task.
* @author <a href="http://www.ling.osu.edu/~scott/">Scott Martin</a>
* @author Rajakrishnan Rajkumar
* @version $Revision: 1.8 $
* @see CCGBankExtract
*/
public class CCGBankConvert extends CCGBankTask {
 
  /** Flag for whether to keep case-marking preps in PP categories; defaults to false. */
  boolean keepPPHeads = false;

  /** Walker that turns a parsed derivation tree into a JDOM element tree (see nextFile). */
  TreeWalker treeWalker = new TreeWalker();
 
  /** Parser instance; created in start() and re-initialised for each file in nextFile(). */
  CCGbankDerivation deriv = null;
  // auxFileDirectory, bbnAuxDirectory, wordsFile and stemsFile are supplied through
  // the setters below and consumed in start(); currentDirectory is assigned in
  // nextDirectory(). Only currentDirectory carries the explicit null initializer,
  // the others take the default field value.
  File auxFileDirectory, bbnAuxDirectory, wordsFile, stemsFile,
    currentDirectory = null;
 
  /* (non-Javadoc)
   * @see opennlp.ccgbank.CCGBankTask#start()
   */
  @Override
  protected void start() throws BuildException {
   
    InfoHelper.init(auxFileDirectory, bbnAuxDirectory);
    String trueCaseListPath=auxFileDirectory.getAbsolutePath()+"/"+"truecase-list.gz";
    XSLTTrueCaser.init(trueCaseListPath);
   
    try {
      MorphLookup.init(wordsFile, stemsFile);
     
    }
    catch(IOException io) {
      throw new BuildException("problem loading words or stems", io,
          getLocation());
    }
   
    xsltProcessor = useXMLFilter
      ? new XMLFilterProcessor(this, this)
      : new TemplatesProcessor(this);
   
    // "prime" parser
    // TODO fix this hack!!
    try {
      File tmp = File.createTempFile(getClass().getName(), "prime");
      tmp.deleteOnExit();
           
      deriv = new CCGbankDerivation(new FileReader(tmp));
    }
    catch(IOException e) {
      throw new BuildException("Problem priming parser: "
        + e.getMessage(), e, getLocation());
    }
  }

  /** Read aux files for the next WSJ section **/
  @Override
  protected void nextDirectory(File section) throws BuildException {
   
    currentDirectory = section;
   
    // only create if a numbered directory
    File d = new File(target, currentDirectory.getName());
    if(!d.exists() && !d.mkdirs()) {
      throw new BuildException("unable to create directory " + d);
    }
   
    //Read in aux files
    try {
      InfoHelper.readBBNAuxfiles(section.getName());
      InfoHelper.readQuoteAuxfiles(section.getName());
      InfoHelper.readPTBAuxfiles(section.getName());
      InfoHelper.readTreeAuxfiles(section.getName());
    }
    catch(NumberFormatException nfe) {
      // not a numbered PTB directory
    }
  }

  @Override
  protected InputSource nextFile(File file) throws BuildException {
    try {
      Reader reader = new BufferedReader(new FileReader(file));
     
      if(deriv == null) {
        deriv = new CCGbankDerivation(reader);
      }
      else {
        CCGbankDerivation.ReInit(reader);
      }
     
      SimpleNode root = CCGbankDerivation.start();
      Element result = new Element("Derivation");
     
      String fileName = file.getName();
      int start = fileName.contains(File.separator)
        ? fileName.lastIndexOf(File.separatorChar) : 0;
     
      StringBuilder sb = new StringBuilder(
            fileName.substring(start, fileName.lastIndexOf('.')));
        sb.append(".xml");
       
        File targetDir = new File(target, currentDirectory.getName());
        File targetFile = new File(targetDir, sb.toString());             
       
        xsltProcessor.resetSerializer();
        xsltProcessor.setTarget(targetFile);
               
        Document doc = new Document(treeWalker.eval(root, result));
       
        // TODO attempt to get error reporting for file / line !!
        Source s = new JDOMSource(doc);
        s.setSystemId(file.toURI().toString());
       
        return SAXSource.sourceToInputSource(s);
    }
    catch(Exception e) {
      throw new BuildException(e, getLocation());
    }
  }
 
 
  /**
   * Sets whether case-marking prepositions are kept in PP categories.
   * @param keepPPHeads the keepPPHeads value to set
   */
  public void setKeepPPHeads(boolean keepPPHeads) { this.keepPPHeads = keepPPHeads; }
 
 
  /**
   * Sets the stems file that start() hands to the morphological lookup.
   * @param stemsFile the stemsFile to set
   */
  public void setStemsFile(File stemsFile) {
    this.stemsFile = stemsFile;
  }

 
  /**
   * Sets the words file that start() hands to the morphological lookup.
   * @param wordsFile the wordsFile to set
   */
  public void setWordsFile(File wordsFile) {
    this.wordsFile = wordsFile;
  }

  /**
   * Sets the auxiliary file directory. start() passes it to the info helper
   * and resolves the true-case list (truecase-list.gz) inside it.
   * @param auxFileDirectory the auxFileDirectory to set
   */
  public void setAuxFileDirectory(File auxFileDirectory) {
    this.auxFileDirectory = auxFileDirectory;
   
  }
 
  /**
   * Sets the BBN auxiliary directory that start() passes to the info helper.
   * @param bbnAuxDirectory the bbnAuxDirectory to set
   */
  public void setBbnAuxDirectory(File bbnAuxDirectory) {
    this.bbnAuxDirectory = bbnAuxDirectory;
  }

  public static void main(String[] args) {
    File baseDir = new File(System.getProperty("user.dir"));
    File buildFile = new File(baseDir, "build.xml");
   
    Project project = new Project();
   
    project.init();
   
    project.setBaseDir(baseDir);
   
    ProjectHelper helper = ProjectHelper.getProjectHelper();
   
    project.setProjectReference(helper);
   
    helper.parse(project, buildFile);
   
    DefaultLogger logger = new DefaultLogger();
    logger.setErrorPrintStream(System.err);
    logger.setOutputPrintStream(System.out);
   
    project.addBuildListener(logger);
   
    project.executeTarget("convert-base");   
  }
 
  class TreeWalker {
    // CCGbank indices seen so far for the category currently being processed.
    // An index's position in this list (plus one) is its normalized id.
    // Cleared in ccinserter when a Treenode/Leafnode is entered and again at
    // the end of catInserter.
    public List<String> idList = new ArrayList<String>();
    // Flag for whether under a leaf node: set to true by ccinserter for a
    // Leafnode and to false for a Treenode; atomcatinserter only adds fs id's
    // while it is true.
    private boolean underLeaf = false;
   
    /**
     * Recursively converts the children of a parse-tree node into XML
     * elements and appends them to the given element.
     * <p>
     * Tree and leaf nodes (and complex categories with more than one child)
     * become nested elements that are filled by a recursive call; slashes
     * become <code>slash</code> elements; everything else becomes an
     * <code>atomcat</code> element. As a side effect the method stores
     * category strings on the visited nodes (<code>cat</code>,
     * <code>catRedundant</code>), may retype a "Redundant" child to
     * "complexcat", and forwards header ids to the following sibling.
     *
     * @param node the parse-tree node whose children are converted
     * @param root the XML element that receives the converted children
     * @return <code>root</code>, with the converted children added
     * @throws Exception propagated from the helper methods
     */
    public Element eval(SimpleNode node, Element root) throws Exception {

      // Number of children of the given node
      int numC = node.jjtGetNumChildren();

      // Loop & flag variables
      int i = 0;
      SimpleNode child;

      // Processing the child nodes of the current node.
      for (i = 0; i < numC; i++) {

        child = (SimpleNode) node.jjtGetChild(i);

        // Cat spec without co-indexation info in the leafnodes
        if (child.type.equals("Redundant")) {

          // For a leaf, just remember the bare category and skip the child;
          // catInserter later emits it as the cat0 attribute.
          if (node.type.equals("Leafnode")) {
            node.catRedundant = child.print();
            continue;
          }

          // Processes treenode categories: the child is handled as a
          // complexcat from here on
          if (node.type.equals("Treenode"))
            child.type = "complexcat";
        }

        // The header node is accessed and the CCGbankId is passed on to the
        // treenode root of the sentence which is processed next
        if (child.type.equals("Header")) {

          // NOTE(review): assumes a Header is never the last child;
          // otherwise the jjtGetChild(i) below is asked for an index
          // past the end - confirm against the grammar.
          i++;
          String temp1 = child.getHeader();
          // keep only the part of the header before the first space
          int spacePos = temp1.indexOf(' ');
          if (spacePos > 0)
            temp1 = temp1.substring(0, spacePos);

          child = (SimpleNode) node.jjtGetChild(i);
          child.setHeader(temp1);
          // System.out.println(temp1);
        }

        // Xml element which is going to be generated.
        Element leaf = new Element(child.type);

        if (child.type.equals("complexcat")
            || child.type.equals("Treenode")
            || child.type.equals("Leafnode")) {

          // Atomic categories are represented in the javacc tree as
          // catSpec-atomcat. So for such cases the catSpec child is
          // skipped and the next child is accessed.

          if (child.jjtGetNumChildren() == 1
              && child.type.equals("complexcat")) {

            child = (SimpleNode) child.jjtGetChild(0);

            // The element which is to be added to the xml
            // representation (replaced below by the slash/atomcat
            // handling, which this branch falls through to)
            leaf = new Element("atomcat");

            // Extracting the content of the node and storing it.
            node.cat = child.print();
          } else {

            // A complexcat element is created.
            leaf = ccinserter(child, leaf);

            // Extracting the content of the node and storing it.
            if (root.getName().equals("Leafnode")) {
              node.cat = child.print();
            }

            if (root.getName().equals("Treenode")
                && leaf.getName().equals("complexcat")) {
              // Leafnode and treenode cat spec elements created
              node.cat = child.print();
            }

            // Recursive processing of the children of the current node
            leaf = eval(child, leaf);

            // Sending Leaf,Tree nodes to a function which inserts
            // the family (ie normalized cat spec) of its contents.
            if (!child.type.equals("complexcat"))
              leaf = catInserter(child, leaf);

            // Adding the current element to its parent in the xml tree.
            root.addContent(leaf);

            continue;
          }

        }

        // Slash elements added to the tree
        if (child.cat.equals("/") || child.cat.equals("\\")) {
          leaf = opinserter(child);
          root.addContent(leaf);
          continue;
        }

        // Atomcat elements added to the tree
        leaf = atomcatinserter(child);
        root.addContent(leaf);

        // if(!child.type.matches("\\p{Punct}"))
      }

      // The final result of the above operations returned.
      return root;

    }
   
    public Element ccinserter(SimpleNode node, Element leaf) {

      // This function produces complexcat/treenode/leafnode elements.
      // mww: the name of this function is not very helpful

      // The node can be of any of the above types.
      String name;
      name = node.type;

      // Set treebankId,parseNo at root of the sent
      String h = node.getHeader();
      if (h != null)
        leaf.setAttribute("Header", h);

      // Treenode info ie head,daughter represented
      if (name.equals("Treenode")) {

        leaf.setAttribute("head", node.head);
        leaf.setAttribute("dtr", node.dtr);

        idList.clear();
        underLeaf = false;
      }

      // Leafnode info represented
      if (name.equals("Leafnode")) {
        leaf.setAttribute("lexeme", node.lex); // nb: may be truecased later
        leaf.setAttribute("lexeme0", node.lex);
        leaf.setAttribute("pos", node.pos);
        idList.clear();
        underLeaf = true;
      }

      // add propbank info here
      if (node.nodeRoles != null) {
        String roles = "";
        for (SimpleNode.LexSenseRole lexSenseRole : node.nodeRoles) {
          if (lexSenseRole.role.equals("rel")) {
            leaf.setAttribute("rel", lexSenseRole.lex + "." + lexSenseRole.sense);
          }
          else {
            String role = adjustRole(lexSenseRole.role);
            roles += lexSenseRole.lex + "." + lexSenseRole.sense + ":" + role + " ";
          }
        }
        if (roles.length() > 0) leaf.setAttribute("roles", roles.trim());
      }
      if (node.argRoles != null) {
        String args = "";
        for (String role : node.argRoles) {
          role = adjustRole(role);
          args += role + " ";
        }
        leaf.setAttribute("argRoles", args.trim());
      }
      // done
      return leaf;
    }

    public Element atomcatinserter(SimpleNode node) throws Exception {

      // Predicate for atomcat creation.
      // Flag signifies whether the elem is a single atomcat.
      // Relevant as if the present cat is an atomcat LF variable can be set
      // in the syntax here itself.

      StringTokenizer lex;
      Element atomcat = new Element("atomcat");
      Element fs = new Element("fs");
      Element feat = new Element("feat");
      String id = "NotGiven";
      String form;

      // Current element info extracted from the node
      String elem = node.print();

      // Index extracted by sensing undescore delimiter
      if (elem.contains("_")) {
        // elem=elem.replaceAll(":[A-Z]","");
        lex = new StringTokenizer(elem, "_");
        elem = lex.nextToken();
        id = lex.nextToken();
        String x[] = id.split(":");
        // System.out.println(id);

        if (x.length == 2) {
          // mww: moved this to atomcat
          //feat.setAttribute("attr", "dep");
          //feat.setAttribute("val", x[1]);
          //fs.addContent(feat);
          atomcat.setAttribute("dep", x[1]);
        }

        id = id.replaceAll(":[A-Z]", "");
      } else {
        // Default id 1 is alloted otherwise
        id = Integer.toString(idList.size() + 1);
        idList.add(id);
      }

      // Normalizing the id by comparing with previous indices.
      if (!idList.contains(id))
        idList.add(id);
      id = Integer.toString(idList.indexOf(id) + 1);

      // Normalized index is set (leaves only)
      if (underLeaf)
        fs.setAttribute("id", id);

      // make lowercase
      elem = elem.toLowerCase();
      // remove superfluous [nb] in np[nb]
      elem = elem.replaceAll("np\\[nb\\]", "np");
      // also strip PP heads if apropos
      elem = stripPPHeads(elem);

      // Form attrtibute detected and set.
      if (elem.contains("[")) {
        lex = new StringTokenizer(elem, "[,]");
        elem = lex.nextToken();
        form = lex.nextToken();
        feat = new Element("feat");
        feat.setAttribute("attr", "form");
        feat.setAttribute("val", form);
        fs.addContent(feat);
      }

      // add fs if non-empty
      if (underLeaf || fs.getContentSize() > 0)
        atomcat.addContent(fs);
      atomcat.setAttribute("type", elem);
      return atomcat;

    }

    public Element opinserter(SimpleNode node) {

      Element slash = new Element("slash");
      String dir;
      String op = node.cat;

      // Slash direction sensed and set.

      if (op.equals("\\"))
        dir = "<";
      else
        dir = ">";

      slash.setAttribute("dir", op);
      slash.setAttribute("mode", dir);
      return slash;

    }

    public Element catInserter(SimpleNode node, Element leaf) {
      int i;

      // The normalization process. Relevant indices replaced by 1,2,3..n

      if (idList.size() > 0) {
        for (i = 0; i < idList.size(); i++)
          node.cat = node.cat.replaceAll(idList.get(i),
              Integer.toString(i + 1));
      }

      String l = node.getLeftover();
      if (l != null)
        node.cat = node.cat + l;

      // Purging the cat spec of indices outside brackets & colons ie )_2 ,:B
      int ind = node.cat.indexOf(")_");

      while (ind != -1) {

        String str1 = node.cat.substring(0, ind + 1);
        String str2 = node.cat.substring(ind + 1, node.cat.length());

        str2 = str2.replaceFirst("_(\\p{Digit})++", "");

        // System.out.println(str1);
        // System.out.println(str2);

        node.cat = str1 + str2;

        ind = node.cat.indexOf(")_");

      }

      node.cat = node.cat.replaceAll(":[A-Z]", "");

      // Add categories with normalized indices, lowercased
      String cat = node.cat.toLowerCase();
      cat = cat.replaceAll("np\\[nb\\]", "np");
      // also strip PP heads if apropos
      cat = stripPPHeads(cat);
      leaf.setAttribute("cat", cat);

      String cat0 = "";

      // Add the same category to the treenodes
      if (node.type.equals("Treenode"))
        cat0 = node.cat;
      else
        cat0 = node.catRedundant;

      // Add the bare category to the leafnodes
      leaf.setAttribute("cat0", cat0);

      // Refresh index list.
      idList.clear();

      return leaf;
    }
  }
 
  // strips PP heads if apropos
  private String stripPPHeads(String cat) {
    if (keepPPHeads) return cat;
    return cat.replaceAll("pp\\[[a-z]+\\]", "pp");
  }
 
  // adjusts role, stripping PP head if apropos
  private String adjustRole(String role) {
    role = role.replaceFirst("ARG", "Arg");
    if (!keepPPHeads) {
      int hyph = role.indexOf('-');
      if (hyph > 0) role = role.substring(0, hyph);
    }
    return role;
  }
}
TOP

Related Classes of opennlp.ccgbank.CCGBankConvert$TreeWalker

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.