Package edu.stanford.nlp.trees.international.negra

Source Code of edu.stanford.nlp.trees.international.negra.NegraPennTreeNormalizer

package edu.stanford.nlp.trees.international.negra;

import java.util.*;

import edu.stanford.nlp.ling.Label;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeFactory;
import edu.stanford.nlp.trees.TreeNormalizer;
import edu.stanford.nlp.trees.TreebankLanguagePack;
import java.util.function.Predicate;
import edu.stanford.nlp.util.Generics;

/**
* Tree normalizer for Negra Penn Treebank format.
*
* @author Roger Levy
*/
public class NegraPennTreeNormalizer extends TreeNormalizer {
  /** How to clean up node labels: 0 = do nothing, 1 = keep category and
   *  function, 2 = just category
   */
  private final int nodeCleanup;
  private static final String nonUnaryRoot = "NUR"; // non-unary root
  protected final TreebankLanguagePack tlp;
  private boolean insertNPinPP = false;

  private final Predicate<Tree> emptyFilter;
  private final Predicate<Tree> aOverAFilter;

  public NegraPennTreeNormalizer() {
    this(new NegraPennLanguagePack());
  }

  public NegraPennTreeNormalizer(TreebankLanguagePack tlp) {
    this(tlp, 0);
  }

  public NegraPennTreeNormalizer(TreebankLanguagePack tlp, int nodeCleanup) {
    this.tlp = tlp;
    this.nodeCleanup = nodeCleanup;

    emptyFilter = new Predicate<Tree>() {
      private static final long serialVersionUID = -606371737889816130L;
      public boolean test(Tree t) {
        Tree[] kids = t.children();
        Label l = t.label();
        if ((l != null) && l.value() != null && (l.value().matches("^\\*T.*$")) && !t.isLeaf() && kids.length == 1 && kids[0].isLeaf())
          return false;
        return true;
      }
    };
    aOverAFilter = new Predicate<Tree>() {
      private static final long serialVersionUID = -606371737889816130L;
      public boolean test(Tree t) {
        if (t.isLeaf() || t.isPreTerminal() || t.children().length != 1)
          return true;
        if (t.label() != null && t.label().equals(t.children()[0].label()))
          return false;
        return true;
      }
    };
  }


  public String rootSymbol() {
    return tlp.startSymbol();
  }

  public String nonUnaryRootSymbol() {
    return nonUnaryRoot;
  }

  public void setInsertNPinPP(boolean b) {
    insertNPinPP = b;
  }

  public boolean getInsertNPinPP() {
    return insertNPinPP;
  }

  /**
   * Normalizes a leaf contents.
   * This implementation interns the leaf.
   */
  @Override
  public String normalizeTerminal(String leaf) {
    return leaf.intern();
  }

  private static final String junkCPP = "---CJ";
  private static final String cpp = "CPP";

  /**
   * Normalizes a nonterminal contents.
   * This implementation strips functional tags, etc. and interns the
   * nonterminal.
   */
  @Override
  public String normalizeNonterminal(String category) {
    if (junkCPP.equals(category)) // one garbage category cleanup here.
      category = cpp;

    //Accommodate the null root nodes in Negra/Tiger trees
    category = cleanUpLabel(category);

    return (category == null) ? null : category.intern();
  }

  private Tree fixNonUnaryRoot(Tree t, TreeFactory tf) {
    List<Tree> kids = t.getChildrenAsList();
    if(kids.size() == 2 && t.firstChild().isPhrasal() && tlp.isSentenceFinalPunctuationTag(t.lastChild().value())) {
      List<Tree> grandKids = t.firstChild().getChildrenAsList();
      grandKids.add(t.lastChild());
      t.firstChild().setChildren(grandKids);
      kids.remove(kids.size() - 1);
      t.setChildren(kids);
      t.setValue(tlp.startSymbol());
     
    } else {
      t.setValue(nonUnaryRoot);
      t = tf.newTreeNode(tlp.startSymbol(), Collections.singletonList(t));
    }
    return t;
  }


  /**
   * Normalize a whole tree -- one can assume that this is the
   * root.  This implementation deletes empty elements (ones with
   * nonterminal tag label starting with '*T') from the tree.  It
   * does work for a null tree.
   */
  @Override
  public Tree normalizeWholeTree(Tree tree, TreeFactory tf) {
    // add an extra root to non-unary roots
    if(tree.value() == null)
      tree = fixNonUnaryRoot(tree, tf);
    else if(!tree.value().equals(tlp.startSymbol()))
      tree = tf.newTreeNode(tlp.startSymbol(), Collections.singletonList(tree));

    tree = tree.prune(emptyFilter, tf).spliceOut(aOverAFilter, tf);

    // insert NPs in PPs if you're supposed to do that
    if (insertNPinPP) {
      insertNPinPPall(tree);
    }

    for(Tree t : tree) {
      if(t.isLeaf() || t.isPreTerminal()) continue;
      if(t.value() == null || t.value().equals("")) t.setValue("DUMMY");

      // there's also a '--' category
      if(t.value().matches("--.*")) continue;

      // fix a bug in the ACL08 German tiger treebank
      String cat = t.value();
      if(cat == null || cat.equals("")) {
        if (t.numChildren() == 3 && t.firstChild().label().value().equals("NN") && t.getChild(1).label().value().equals("$.")) {
          System.err.println("Correcting treebank error: giving phrase label DL to " + t);
          t.label().setValue("DL");
        }
      }
    }

    return tree;
  }


  private Set<String> prepositionTags = Generics.newHashSet(Arrays.asList(new String[]{"APPR", "APPRART"}));
  private Set<String> postpositionTags = Generics.newHashSet(Arrays.asList(new String[]{"APPO", "APZR"}));


  private void insertNPinPPall(Tree t) {
    Tree[] kids = t.children();
    for (int i = 0, n = kids.length; i < n; i++) {
      insertNPinPPall(kids[i]);
    }
    insertNPinPP(t);
  }


  private void insertNPinPP(Tree t) {
    if (tlp.basicCategory(t.label().value()).equals("PP")) {
      Tree[] kids = t.children();
      int i = 0;
      int j = kids.length - 1;
      while (i < j && prepositionTags.contains(tlp.basicCategory(kids[i].label().value()))) {
        i++;
      } // i now indexes first dtr of new NP
      while (i < j && postpositionTags.contains(tlp.basicCategory(kids[j].label().value()))) {
        j--;
      } // j now indexes last dtr of new NP

      if (i > j) {
        System.err.println("##### Warning -- no NP material here!");
        return; // there is no NP material!
      }

      int npKidsLength = j - i + 1;
      Tree[] npKids = new Tree[npKidsLength];
      System.arraycopy(kids, i, npKids, 0, npKidsLength);
      Tree np = t.treeFactory().newTreeNode(t.label().labelFactory().newLabel("NP"), Arrays.asList(npKids));
      Tree[] newPPkids = new Tree[kids.length - npKidsLength + 1];
      System.arraycopy(kids, 0, newPPkids, 0, i + 1);
      newPPkids[i] = np;
      System.arraycopy(kids, j + 1, newPPkids, i + 1, kids.length - j - 1);
      t.setChildren(newPPkids);
      System.out.println("#### inserted NP in PP");
      t.pennPrint();
    }
  }


  /**
   * Remove things like hyphened functional tags and equals from the
   * end of a node label.
   */
  protected String cleanUpLabel(String label) {
    if (nodeCleanup == 1) {
      return tlp.categoryAndFunction(label);
    } else if (nodeCleanup == 2) {
      return tlp.basicCategory(label);
    }
    return label;
  }

  private static final long serialVersionUID = 8529514903815041064L;
}
TOP

Related Classes of edu.stanford.nlp.trees.international.negra.NegraPennTreeNormalizer

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.