Package edu.stanford.nlp.trees

Source Code of edu.stanford.nlp.trees.BobChrisTreeNormalizer

package edu.stanford.nlp.trees;

import edu.stanford.nlp.ling.Label;

import java.io.Serializable;
import java.util.function.Predicate;


/**
* Normalizes trees in the way used in Manning and Carpenter 1997.
* NB: This implementation is still incomplete!
* The normalizations performed are: (i) terminals are interned, (ii)
* nonterminals are stripped of alternants, functional tags and
* cross-reference codes, and then interned, (iii) empty
* elements (ones with nonterminal label "-NONE-") are deleted from the
* tree, (iv) the null label at the root node is replaced with the label
* "ROOT". <br>
* 17 Apr 2001: This was fixed to work with different kinds of labels,
* by making proper use of the Label interface, after it was moved into
* the trees module.
* <p/>
* The normalizations of the original (Prolog) BobChrisNormalize were:
* 1. Remap the root node to be called 'ROOT'
* 2. Truncate all nonterminal labels before characters introducing
* annotations according to TreebankLanguagePack
* (traditionally, -, =, | or # (last for BLLIP))
* 3. Remap the representation of certain leaf symbols (brackets etc.)
* 4. Map to lowercase all leaf nodes
* 5. Delete empty/trace nodes (ones marked '-NONE-')
* 6. Recursively delete any nodes that do not dominate any words
* 7. Delete A over A nodes where the top A dominates nothing else
* 8. Remove backslashes from lexical items
* (the Treebank inserts them to escape slashes (/) and stars (*)).
* 4 is deliberately omitted, and a few things are purely aesthetic.
* <p/>
* 14 June 2002: It now deletes unary A over A if both nodes' labels are equal
* (7), and (6) was always part of the Tree.prune() functionality...
* 30 June 2005: Also splice out an EDITED node, just in case you're parsing
* the Brown corpus.
*
* @author Christopher Manning
*/
public class BobChrisTreeNormalizer extends TreeNormalizer implements TreeTransformer {

  protected final TreebankLanguagePack tlp;


  public BobChrisTreeNormalizer() {
    this(new PennTreebankLanguagePack());
  }

  public BobChrisTreeNormalizer(TreebankLanguagePack tlp) {
    this.tlp = tlp;
  }


  /**
   * Normalizes a leaf contents.
   * This implementation interns the leaf.
   */
  @Override
  public String normalizeTerminal(String leaf) {
    // We could unquote * and / with backslash \ in front of them
    return leaf.intern();
  }


  /**
   * Normalizes a nonterminal contents.
   * This implementation strips functional tags, etc. and interns the
   * nonterminal.
   */
  @Override
  public String normalizeNonterminal(String category) {
    return cleanUpLabel(category).intern();
  }


  /**
   * Remove things like hyphened functional tags and equals from the
   * end of a node label.  This version always just returns the phrase
   * structure category, or "ROOT" if the label was <code>null</code>.
   *
   * @param label The label from the treebank
   * @return The cleaned up label (phrase structure category)
   */
  protected String cleanUpLabel(final String label) {
    if (label == null || label.length() == 0) {
      return "ROOT";
      // String constants are always interned
    } else {
      return tlp.basicCategory(label);
    }
  }


  /**
   * Normalize a whole tree -- one can assume that this is the
   * root.  This implementation deletes empty elements (ones with
   * nonterminal tag label '-NONE-') from the tree, and splices out
   * unary A over A nodes.  It does work for a null tree.
   */
  @Override
  public Tree normalizeWholeTree(Tree tree, TreeFactory tf) {
    return tree.prune(emptyFilter, tf).spliceOut(aOverAFilter, tf);
  }

  @Override
  public Tree transformTree(Tree tree) {
    return normalizeWholeTree(tree, tree.treeFactory());
  }


  protected Predicate<Tree> emptyFilter = new EmptyFilter();

  protected Predicate<Tree> aOverAFilter = new AOverAFilter();

  private static final long serialVersionUID = -1005188028979810143L;


  public static class EmptyFilter implements Predicate<Tree>, Serializable {

    private static final long serialVersionUID = 8914098359495987617L;

    /** Doesn't accept nodes that only cover an empty. */
    public boolean test(Tree t) {
      Tree[] kids = t.children();
      Label l = t.label();
      // Delete (return false for) empty/trace nodes (ones marked '-NONE-')
      return ! ((l != null) && "-NONE-".equals(l.value()) && !t.isLeaf() && kids.length == 1 && kids[0].isLeaf());
    }

      //    private static final long serialVersionUID = 1L;

  } // end class EmptyFilter


  public static class AOverAFilter implements Predicate<Tree>, Serializable {

    /** Doesn't accept nodes that are A over A nodes (perhaps due to
     *  empty removal or are EDITED nodes).
     */
    public boolean test(Tree t) {
      if (t.isLeaf() || t.isPreTerminal()) {
        return true;
      }
      // The special switchboard non-terminals clause
      if ("EDITED".equals(t.label().value()) || "CODE".equals(t.label().value())) {
        return false;
      }
      if (t.numChildren() != 1) {
        return true;
      }
      return ! (t.label() != null && t.label().value() != null && t.label().value().equals(t.getChild(0).label().value()));
    }

    private static final long serialVersionUID = 1L;

  } // end class AOverAFilter

}
TOP

Related Classes of edu.stanford.nlp.trees.BobChrisTreeNormalizer

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.