Package edu.stanford.nlp.trees

Source Code of edu.stanford.nlp.trees.NPTmpRetainingTreeNormalizer$NPTmpAdvRetainingTreeReaderFactory

package edu.stanford.nlp.trees;

import edu.stanford.nlp.ling.Label;
import edu.stanford.nlp.ling.LabelFactory;
import java.util.function.Predicate;

import java.io.Reader;
import java.util.regex.Pattern;
import java.util.*;


/**
* Same TreeNormalizer as BobChrisTreeNormalizer, but optionally provides
* four extras.  I.e., the class name is now a misnomer.<br>
* 1) retains -TMP labels on NP with the new identification NP-TMP,
* and provides various options to percolate that option downwards
* to the head noun, and perhaps also to inherit this from a PP-TMP.<br>
* 2) Annotates S nodes which contain a gapped subject: i.e.,
* <code>S &lt; (/^NP-SBJ/ &lt; -NONE-) --> S-G</code>  <br>
* 3) Leave all functional tags on nodes. <br>
* 4) Keeps -ADV labels on NP and marks head tag with &`^ADV
* <p/>
* <i>Performance note:</i> At one point in time, PCFG labeled F1 results
* for the various TEMPORAL options in lexparser were:
* 0=86.7, 1=87.49, 2=86.87, 3=87.49, 4=87.48, 5=87.5, 6=87.07.
* So, mainly avoid values of 0, 2, and 6.
* <p/>
* At another point they were:
* 0=86.53, 1=87.1, 2=87.14, 3=87.22, 4=87.1, 5=87.13, 6=86.95, 7=87.16
*
* @author Christopher Manning
* @author Dan Klein
*/
public class NPTmpRetainingTreeNormalizer extends BobChrisTreeNormalizer {

  private static final long serialVersionUID = 7548777133196579107L;

  public static final int TEMPORAL_NONE = 0;
  public static final int TEMPORAL_ACL03PCFG = 1;
  public static final int TEMPORAL_ANY_TMP_PERCOLATED = 2;
  public static final int TEMPORAL_ALL_TERMINALS = 3;
  public static final int TEMPORAL_ALL_NP = 4;
  public static final int TEMPORAL_ALL_NP_AND_PP = 5;
  public static final int TEMPORAL_NP_AND_PP_WITH_NP_HEAD = 6;
  public static final int TEMPORAL_ALL_NP_EVEN_UNDER_PP = 7;
  public static final int TEMPORAL_ALL_NP_PP_ADVP = 8;
  public static final int TEMPORAL_9 = 9;

  private static final boolean onlyTagAnnotateNstar = true;

  private static final Pattern NPTmpPattern = Pattern.compile("NP.*-TMP.*");
  private static final Pattern PPTmpPattern = Pattern.compile("PP.*-TMP.*");
  private static final Pattern ADVPTmpPattern = Pattern.compile("ADVP.*-TMP.*");
  private static final Pattern TmpPattern = Pattern.compile(".*-TMP.*");
  private static final Pattern NPSbjPattern = Pattern.compile("NP.*-SBJ.*");
  private static final Pattern NPAdvPattern = Pattern.compile("NP.*-ADV.*");

  private final int temporalAnnotation;
  private final boolean doSGappedStuff;
  private final int leaveItAll;
  private final boolean doAdverbialNP;
  private final HeadFinder headFinder;


  public NPTmpRetainingTreeNormalizer() {
    this(TEMPORAL_ACL03PCFG, false);
  }

  public NPTmpRetainingTreeNormalizer(int temporalAnnotation, boolean doSGappedStuff) {
    this(temporalAnnotation, doSGappedStuff, 0, false);
  }

  public NPTmpRetainingTreeNormalizer(int temporalAnnotation, boolean doSGappedStuff, int leaveItAll, boolean doAdverbialNP) {
    this(temporalAnnotation, doSGappedStuff, leaveItAll, doAdverbialNP, new ModCollinsHeadFinder());
  }

  /**
   * Create a TreeNormalizer that maintains some functional annotations,
   * particularly those involving temporal annotation.
   *
   * @param temporalAnnotation One of the constants:
   *                           TEMPORAL_NONE (no temporal annotation kept on trees),
   *                           TEMPORAL_ACL03PCFG (temporal annotation on NPs, and percolated down
   *                           to head of constituent until and including POS tag),
   *                           TEMPORAL_ANY_TMP_PERCOLATED (temporal annotation on any phrase is
   *                           kept and percolated via head chain to and including POS tag),
   *                           TEMPORAL_ALL_TERMINALS (temporal annotation is kept on NPs, and
   *                           is placed on all POS tag daughters of that NP (but is not
   *                           percolated down a head chain through phrasal categories),
   *                           TEMPORAL_ALL_NP (temporal annotation on NPs, and it is percolated
   *                           down via the head chain, but only through NPs: annotation stops
   *                           at either a POS tag (which is annotated) or a non-NP head
   *                           (which isn't annotated)),
   *                           TEMPORAL_ALL_NP_AND_PP (keeps temporal annotation on NPs and PPs,
   *                           and it is percolated down via the head chain, but only through
   *                           NPs: annotation stops at either a POS tag (which is annotated)
   *                           or a non-NP head (which isn't annotated)).
   *                           TEMPORAL_NP_AND_PP_WITH_NP_HEAD (like TEMPORAL_ALL_NP_AND_PP
   *                           except an NP is regarded as the head of a PP)
   *                           TEMPORAL_ALL_NP_EVEN_UNDER_PP (like TEMPORAL_ALL_NP, but a PP-TMP
   *                           annotation above an NP is 'passed down' to annotate that NP
   *                           as temporal (but the PP itself isn't marked))
   *                           TEMPORAL_ALL_NP_PP_ADVP (keeps temporal annotation on NPs, PPs, and
   *                           ADVPs
   *                           and it is percolated down via the head chain, but only through
   *                           those categories: annotation stops at either a POS tag
   *                           (which is annotated)
   *                           or a non-NP/PP/ADVP head (which isn't annotated)),
   *                           TEMPORAL_9 (annotates like the previous one but
   *                           does all NP inside node, and their children if
   *                           pre-pre-terminal rather than only if head).
   * @param doSGappedStuff     Leave -SBJ marking on subject NP and then mark
   *                           S-G sentences with a gapped subject.
   * @param leaveItAll         0 means the usual stripping of functional tags and indices;
   *                           1 leaves all functional tags but still strips indices;
   *                           2 leaves everything
   * @param doAdverbialNP      Leave -ADV functional tag on adverbial NPs and
   *                           maybe add it to their head
   * @param headFinder         A head finder that is used with some of the
   *                           options for temporalAnnotation
   */
  public NPTmpRetainingTreeNormalizer(int temporalAnnotation, boolean doSGappedStuff, int leaveItAll, boolean doAdverbialNP, HeadFinder headFinder) {
    this.temporalAnnotation = temporalAnnotation;
    this.doSGappedStuff = doSGappedStuff;
    this.leaveItAll = leaveItAll;
    this.doAdverbialNP = doAdverbialNP;
    this.headFinder = headFinder;
  }


  /**
   * Remove things like hyphened functional tags and equals from the
   * end of a node label.
   */
  @Override
  protected String cleanUpLabel(String label) {
    if (label == null) {
      return "ROOT";
      // String constants are always interned
    } else if (leaveItAll == 1) {
      return tlp.categoryAndFunction(label);
    } else if (leaveItAll == 2) {
      return label;
    } else {
      boolean nptemp = NPTmpPattern.matcher(label).matches();
      boolean pptemp = PPTmpPattern.matcher(label).matches();
      boolean advptemp = ADVPTmpPattern.matcher(label).matches();
      boolean anytemp = TmpPattern.matcher(label).matches();
      boolean subj = NPSbjPattern.matcher(label).matches();
      boolean npadv = NPAdvPattern.matcher(label).matches();
      label = tlp.basicCategory(label);
      if (anytemp && temporalAnnotation == TEMPORAL_ANY_TMP_PERCOLATED) {
        label += "-TMP";
      } else if (pptemp && (temporalAnnotation == TEMPORAL_ALL_NP_AND_PP || temporalAnnotation == TEMPORAL_NP_AND_PP_WITH_NP_HEAD || temporalAnnotation == TEMPORAL_ALL_NP_EVEN_UNDER_PP || temporalAnnotation == TEMPORAL_ALL_NP_PP_ADVP || temporalAnnotation == TEMPORAL_9)) {
        label = label + "-TMP";
      } else if (advptemp && (temporalAnnotation == TEMPORAL_ALL_NP_PP_ADVP || temporalAnnotation == TEMPORAL_9)) {
        label = label + "-TMP";
      } else if (temporalAnnotation > 0 && nptemp) {
        label = label + "-TMP";
      }
      if (doAdverbialNP && npadv) {
        label = label + "-ADV";
      }
      if (doSGappedStuff && subj) {
        label = label + "-SBJ";
      }
      return label;
    }
  }


  private static boolean includesEmptyNPSubj(Tree t) {
    if (t == null) {
      return false;
    }
    Tree[] kids = t.children();
    if (kids == null) {
      return false;
    }
    boolean foundNullSubj = false;
    for (Tree kid : kids) {
      Tree[] kidkids = kid.children();
      if (NPSbjPattern.matcher(kid.value()).matches()) {
        kid.setValue("NP");
        if (kidkids != null && kidkids.length == 1 && kidkids[0].value().equals("-NONE-")) {
          // only set flag, since there are 2 a couple of times (errors)
          foundNullSubj = true;
        }
      }
    }
    return foundNullSubj;
  }


  /**
   * Normalize a whole tree -- one can assume that this is the root.
   * This implementation deletes empty elements (ones with nonterminal
   * tag label '-NONE-') from the tree.
   */
  @Override
  public Tree normalizeWholeTree(Tree tree, TreeFactory tf) {
    TreeTransformer transformer1 = t -> {
      if (doSGappedStuff) {
        String lab = t.label().value();
        if (lab.equals("S") && includesEmptyNPSubj(t)) {
          LabelFactory lf = t.label().labelFactory();
          // Note: this changes the tree label, rather than
          // creating a new tree node.  Beware!
          t.setLabel(lf.newLabel(t.label().value() + "-G"));
        }
      }
      return t;
    };
    Predicate<Tree> subtreeFilter = new Predicate<Tree>() {

      private static final long serialVersionUID = -7250433816896327901L;

      @Override
      public boolean test(Tree t) {
        Tree[] kids = t.children();
        Label l = t.label();
        // The special Switchboard non-terminals clause.
        // Note that it deletes IP which other Treebanks might use!
        if ("RS".equals(t.label().value()) || "RM".equals(t.label().value()) || "IP".equals(t.label().value()) || "CODE".equals(t.label().value())) {
          return false;
        }
        if ((l != null) && l.value() != null && (l.value().equals("-NONE-")) && !t.isLeaf() && kids.length == 1 && kids[0].isLeaf()) {
          // Delete empty/trace nodes (ones marked '-NONE-')
          return false;
        }
        return true;
      }
    };
    Predicate<Tree> nodeFilter = new Predicate<Tree>() {

      private static final long serialVersionUID = 9000955019205336311L;

      @Override
      public boolean test(Tree t) {
        if (t.isLeaf() || t.isPreTerminal()) {
          return true;
        }
        // The special switchboard non-terminals clause. Try keeping EDITED for now....
        // if ("EDITED".equals(t.label().value())) {
        //   return false;
        // }
        if (t.numChildren() != 1) {
          return true;
        }
        if (t.label() != null && t.label().value() != null && t.label().value().equals(t.children()[0].label().value())) {
          return false;
        }
        return true;
      }
    };
    TreeTransformer transformer2 = t -> {
      if (temporalAnnotation == TEMPORAL_ANY_TMP_PERCOLATED) {
        String lab = t.label().value();
        if (TmpPattern.matcher(lab).matches()) {
          Tree oldT = t;
          Tree ht;
          do {
            ht = headFinder.determineHead(oldT);
            // special fix for possessives! -- make noun before head
            if (ht.label().value().equals("POS")) {
              int j = oldT.objectIndexOf(ht);
              if (j > 0) {
                ht = oldT.getChild(j - 1);
              }
            }
            LabelFactory lf = ht.label().labelFactory();
            // Note: this changes the tree label, rather than
            // creating a new tree node.  Beware!
            ht.setLabel(lf.newLabel(ht.label().value() + "-TMP"));
            oldT = ht;
          } while (!ht.isPreTerminal());
          if (lab.startsWith("PP")) {
            ht = headFinder.determineHead(t);
            // look to right
            int j = t.objectIndexOf(ht);
            int sz = t.children().length;
            if (j + 1 < sz) {
              ht = t.getChild(j + 1);
            }
            if (ht.label().value().startsWith("NP")) {
              while (!ht.isLeaf()) {
                LabelFactory lf = ht.label().labelFactory();
                // Note: this changes the tree label, rather than
                // creating a new tree node.  Beware!
                ht.setLabel(lf.newLabel(ht.label().value() + "-TMP"));
                ht = headFinder.determineHead(ht);
              }
            }
          }
        }
      } else if (temporalAnnotation == TEMPORAL_ALL_TERMINALS) {
        String lab = t.label().value();
        if (NPTmpPattern.matcher(lab).matches()) {
          Tree ht;
          ht = headFinder.determineHead(t);
          if (ht.isPreTerminal()) {
            // change all tags to -TMP
            LabelFactory lf = ht.label().labelFactory();
            Tree[] kids = t.children();
            for (Tree kid : kids) {
              if (kid.isPreTerminal()) {
                // Note: this changes the tree label, rather
                // than creating a new tree node.  Beware!
                kid.setLabel(lf.newLabel(kid.value() + "-TMP"));
              }
            }
          } else {
            Tree oldT = t;
            do {
              ht = headFinder.determineHead(oldT);
              oldT = ht;
            } while (!ht.isPreTerminal());
            LabelFactory lf = ht.label().labelFactory();
            // Note: this changes the tree label, rather than
            // creating a new tree node.  Beware!
            ht.setLabel(lf.newLabel(ht.label().value() + "-TMP"));
          }
        }
      } else if (temporalAnnotation == TEMPORAL_ALL_NP) {
        String lab = t.label().value();
        if (NPTmpPattern.matcher(lab).matches()) {
          Tree oldT = t;
          Tree ht;
          do {
            ht = headFinder.determineHead(oldT);
            // special fix for possessives! -- make noun before head
            if (ht.label().value().equals("POS")) {
              int j = oldT.objectIndexOf(ht);
              if (j > 0) {
                ht = oldT.getChild(j - 1);
              }
            }
            if (ht.isPreTerminal() || ht.value().startsWith("NP")) {
              LabelFactory lf = ht.labelFactory();
              // Note: this changes the tree label, rather than
              // creating a new tree node.  Beware!
              ht.setLabel(lf.newLabel(ht.label().value() + "-TMP"));
              oldT = ht;
            }
          } while (ht.value().startsWith("NP"));
        }
      } else if (temporalAnnotation == TEMPORAL_ALL_NP_AND_PP || temporalAnnotation == TEMPORAL_NP_AND_PP_WITH_NP_HEAD || temporalAnnotation == TEMPORAL_ALL_NP_EVEN_UNDER_PP) {
        // also allow chain to start with PP
        String lab = t.value();
        if (NPTmpPattern.matcher(lab).matches() || PPTmpPattern.matcher(lab).matches()) {
          Tree oldT = t;
          do {
            Tree ht = headFinder.determineHead(oldT);
            // special fix for possessives! -- make noun before head
            if (ht.value().equals("POS")) {
              int j = oldT.objectIndexOf(ht);
              if (j > 0) {
                ht = oldT.getChild(j - 1);
              }
            } else if ((temporalAnnotation == TEMPORAL_NP_AND_PP_WITH_NP_HEAD || temporalAnnotation == TEMPORAL_ALL_NP_EVEN_UNDER_PP) && (ht.value().equals("IN") || ht.value().equals("TO"))) {
              // change the head to be NP if possible
              Tree[] kidlets = oldT.children();
              for (int k = kidlets.length - 1; k > 0; k--) {
                if (kidlets[k].value().startsWith("NP")) {
                  ht = kidlets[k];
                }
              }
            }
            LabelFactory lf = ht.labelFactory();
            // Note: this next bit changes the tree label, rather
            // than creating a new tree node.  Beware!
            if (ht.isPreTerminal() || ht.value().startsWith("NP")) {
              ht.setLabel(lf.newLabel(ht.value() + "-TMP"));
            }
            if (temporalAnnotation == TEMPORAL_ALL_NP_EVEN_UNDER_PP && oldT.value().startsWith("PP")) {
              oldT.setLabel(lf.newLabel(tlp.basicCategory(oldT.value())));
            }
            oldT = ht;
          } while (oldT.value().startsWith("NP") || oldT.value().startsWith("PP"));
        }
      } else if (temporalAnnotation == TEMPORAL_ALL_NP_PP_ADVP) {
        // also allow chain to start with PP or ADVP
        String lab = t.value();
        if (NPTmpPattern.matcher(lab).matches() || PPTmpPattern.matcher(lab).matches() || ADVPTmpPattern.matcher(lab).matches()) {
          Tree oldT = t;
          do {
            Tree ht = headFinder.determineHead(oldT);
            // special fix for possessives! -- make noun before head
            if (ht.value().equals("POS")) {
              int j = oldT.objectIndexOf(ht);
              if (j > 0) {
                ht = oldT.getChild(j - 1);
              }
            }
            // Note: this next bit changes the tree label, rather
            // than creating a new tree node.  Beware!
            if (ht.isPreTerminal() || ht.value().startsWith("NP")) {
              LabelFactory lf = ht.labelFactory();
              ht.setLabel(lf.newLabel(ht.value() + "-TMP"));
            }
            oldT = ht;
          } while (oldT.value().startsWith("NP"));
        }
      } else if (temporalAnnotation == TEMPORAL_9) {
        // also allow chain to start with PP or ADVP
        String lab = t.value();
        if (NPTmpPattern.matcher(lab).matches() || PPTmpPattern.matcher(lab).matches() || ADVPTmpPattern.matcher(lab).matches()) {
          // System.err.println("TMP: Annotating " + t);
          addTMP9(t);
        }
      } else if (temporalAnnotation == TEMPORAL_ACL03PCFG) {
        String lab = t.label().value();
        if (lab != null && NPTmpPattern.matcher(lab).matches()) {
          Tree oldT = t;
          Tree ht;
          do {
            ht = headFinder.determineHead(oldT);
            // special fix for possessives! -- make noun before head
            if (ht.label().value().equals("POS")) {
              int j = oldT.objectIndexOf(ht);
              if (j > 0) {
                ht = oldT.getChild(j - 1);
              }
            }
            oldT = ht;
          } while (!ht.isPreTerminal());
          if ( ! onlyTagAnnotateNstar || ht.label().value().startsWith("N")) {
            LabelFactory lf = ht.label().labelFactory();
            // Note: this changes the tree label, rather than
            // creating a new tree node.  Beware!
            ht.setLabel(lf.newLabel(ht.label().value() + "-TMP"));
          }
        }
      }
      if (doAdverbialNP) {
        String lab = t.value();
        if (NPAdvPattern.matcher(lab).matches()) {
          Tree oldT = t;
          Tree ht;
          do {
            ht = headFinder.determineHead(oldT);
            // special fix for possessives! -- make noun before head
            if (ht.label().value().equals("POS")) {
              int j = oldT.objectIndexOf(ht);
              if (j > 0) {
                ht = oldT.getChild(j - 1);
              }
            }
            if (ht.isPreTerminal() || ht.value().startsWith("NP")) {
              LabelFactory lf = ht.labelFactory();
              // Note: this changes the tree label, rather than
              // creating a new tree node.  Beware!
              ht.setLabel(lf.newLabel(ht.label().value() + "-ADV"));
              oldT = ht;
            }
          } while (ht.value().startsWith("NP"));
        }
      }
      return t;
    };
    // if there wasn't an empty nonterminal at the top, but an S, wrap it.
    if (tree.label().value().equals("S")) {
      tree = tf.newTreeNode("ROOT", Collections.singletonList(tree));
    }
    // repair for the phrasal VB in Switchboard (PTB version 3) that should be a VP
    for (Tree subtree : tree) {
      if (subtree.isPhrasal() && "VB".equals(subtree.label().value())) {
        subtree.setValue("VP");
      }
    }
    tree = tree.transform(transformer1);
    if (tree == null) { return null; }
    tree = tree.prune(subtreeFilter, tf);
    if (tree == null) { return null; }
    tree = tree.spliceOut(nodeFilter, tf);
    if (tree == null) { return null; }
    return tree.transform(transformer2, tf);
  }

  /**
   * Add -TMP when not present within an NP
   * @param tree The tree to add temporal info to.
   */
  private void addTMP9(final Tree tree) {
    // do the head chain under it
    Tree ht = headFinder.determineHead(tree);
    // special fix for possessives! -- make noun before head
    if (ht.value().equals("POS")) {
      int j = tree.objectIndexOf(ht);
      if (j > 0) {
        ht = tree.getChild(j - 1);
      }
    }
    // Note: this next bit changes the tree label, rather
    // than creating a new tree node.  Beware!
    if (ht.isPreTerminal() || ht.value().startsWith("NP") ||
        ht.value().startsWith("PP") || ht.value().startsWith("ADVP")) {
      if (!TmpPattern.matcher(ht.value()).matches()) {
        LabelFactory lf = ht.labelFactory();
        // System.err.println("TMP: Changing " + ht.value() + " to " +
        //                   ht.value() + "-TMP");
        ht.setLabel(lf.newLabel(ht.value() + "-TMP"));
      }
      if (ht.value().startsWith("NP") || ht.value().startsWith("PP") ||
          ht.value().startsWith("ADVP")) {
        addTMP9(ht);
      }
    }
    // do the NPs under it (which may or may not be the head chain
    Tree[] kidlets = tree.children();
    for (int k = 0; k < kidlets.length; k++) {
      ht = kidlets[k];
      LabelFactory lf;
      if (tree.isPrePreTerminal() && !TmpPattern.matcher(ht.value()).matches()) {
        // System.err.println("TMP: Changing " + ht.value() + " to " +
        //                   ht.value() + "-TMP");
        lf = ht.labelFactory();
        // Note: this next bit changes the tree label, rather
        // than creating a new tree node.  Beware!
        ht.setLabel(lf.newLabel(ht.value() + "-TMP"));
      } else if (ht.value().startsWith("NP")) {
        // don't add -TMP twice!
        if (!TmpPattern.matcher(ht.value()).matches()) {
          lf = ht.labelFactory();
          // System.err.println("TMP: Changing " + ht.value() + " to " +
          //                   ht.value() + "-TMP");
          // Note: this next bit changes the tree label, rather
          // than creating a new tree node.  Beware!
          ht.setLabel(lf.newLabel(ht.value() + "-TMP"));
        }
        addTMP9(ht);
      }
    }
  }

  /** Implementation of TreeReaderFactory, mainly for convenience of
   *  constructing by reflection.
   */
  public static class NPTmpRetainingTreeReaderFactory implements TreeReaderFactory {

    @Override
    public TreeReader newTreeReader(Reader in) {
      return new PennTreeReader(in, new LabeledScoredTreeFactory(), new NPTmpRetainingTreeNormalizer());
    }

  }

  /** Implementation of TreeReaderFactory, mainly for convenience of
   *  constructing by reflection. This one corresponds to what's currently
   *  used in englishPCFG accurate unlexicalized parser.
   */
  public static class NPTmpAdvRetainingTreeReaderFactory implements TreeReaderFactory {

    @Override
    public TreeReader newTreeReader(Reader in) {
      return new PennTreeReader(in, new LabeledScoredTreeFactory(),
              new NPTmpRetainingTreeNormalizer(NPTmpRetainingTreeNormalizer.TEMPORAL_ACL03PCFG,
                      false, 0, true));
    }

  }

}
TOP

Related Classes of edu.stanford.nlp.trees.NPTmpRetainingTreeNormalizer$NPTmpAdvRetainingTreeReaderFactory

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.