Source Code of edu.stanford.nlp.trees.international.spanish.SpanishTreeNormalizer

package edu.stanford.nlp.trees.international.spanish;


import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;


import edu.stanford.nlp.international.spanish.process.AnCoraPronounDisambiguator;
import edu.stanford.nlp.international.spanish.SpanishVerbStripper;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.HasTag;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Label;
import edu.stanford.nlp.trees.*;
import edu.stanford.nlp.trees.tregex.TregexMatcher;
import edu.stanford.nlp.trees.tregex.TregexPattern;
import edu.stanford.nlp.trees.tregex.tsurgeon.Tsurgeon;
import edu.stanford.nlp.trees.tregex.tsurgeon.TsurgeonPattern;
import java.util.function.Predicate;
import edu.stanford.nlp.util.Pair;
import edu.stanford.nlp.util.StringUtils;


/**
 * Normalize trees read from the AnCora Spanish corpus.
 */
public class SpanishTreeNormalizer extends BobChrisTreeNormalizer {


  /**
   * Tag provided to words which are extracted from a multi-word token
   * into their own independent nodes
   */
  public static final String MW_TAG = "MW?";


  /**
   * Tag provided to constituents which contain words from MW tokens
   */
  public static final String MW_PHRASE_TAG = "MW_PHRASE?";


  /**
   * Leaf value which marks an "empty" leaf. Preterminals whose first
   * child carries this value are rejected by {@code emptyFilter}.
   */
  public static final String EMPTY_LEAF_VALUE = "=NONE=";
  /** Value which {@link #normalizeTerminal} substitutes for a literal "(" terminal. */
  public static final String LEFT_PARENTHESIS = "=LRB=";
  /** Value which {@link #normalizeTerminal} substitutes for a literal ")" terminal. */
  public static final String RIGHT_PARENTHESIS = "=RRB=";


  private static final Map<String, String> spellingFixes = new HashMap<String, String>();
  static {
    spellingFixes.put("embargp", "embargo"); // 18381_20000322.tbf-4
    spellingFixes.put("jucio", "juicio"); // 4800_2000406.tbf-5
    spellingFixes.put("méxico", "México"); // 111_C-3.tbf-17
    spellingFixes.put("reirse", "reírse"); // 140_20011102.tbf-13
    spellingFixes.put("tambien", "también"); // 41_19991002.tbf-8


    spellingFixes.put("Intitute", "Institute"); // 22863_20001129.tbf-16


    // Hack: these aren't exactly spelling mistakes, but we need to
    // run a search-and-replace across the entire corpus with them, so
    // they should be treated just like spelling mistakes for our
    // purposes
    spellingFixes.put("(", LEFT_PARENTHESIS);
    spellingFixes.put(")", RIGHT_PARENTHESIS);
  }


  /**
   * A filter which rejects preterminal nodes that contain "empty" leaf
   * nodes.
   */
  private static final Predicate<Tree> emptyFilter = new Predicate<Tree>() {
    public boolean test(Tree tree) {
      if (tree.isPreTerminal()
          && tree.firstChild().value().equals(EMPTY_LEAF_VALUE))
        return false;
      return true;
    }
  };


  /**
   * Resolves some inconsistencies in constituent naming:
   *
   * - "sa" and "s.a" are equivalent -- merge to "s.a"
   */
  private static final TreeTransformer constituentRenamer = new TreeTransformer() {
    @Override
    public Tree transformTree(Tree t) {
      if (t.isLeaf())
        return t;


      String value = t.value();
      if (value == null)
        return t;


      if (value.equals("sa"))
        t.setValue("s.a");


      return t;
    }
  };


  // Table of corpus cleanup rules. Each pair holds a Tregex pattern
  // (first) and the Tsurgeon operation (second) to run on its matches.
  // The strings are compiled once into `cleanup` below and applied at
  // the start of normalizeWholeTree.
  @SuppressWarnings("unchecked")
  private static final Pair<String, String>[] cleanupStrs = new Pair[] {
    // Collapse an `sp` whose only child is an `sp` that itself has only
    // a `prep` child
    new Pair("sp < (sp=sp <: prep=prep)", "replace sp prep"),


    // Left and right parentheses should be at same depth
    new Pair("fpa > __=grandparent $++ (__=ancestor <<` fpt=fpt >` =grandparent)",
      "move fpt $- ancestor"),


    // Nominal groups where adjectival groups belong
    new Pair("/^s\\.a$/ <: (/^grup\\.nom$/=gn <: /^a/)",
      "relabel gn /grup.a/"),


    // Adverbial phrases should always have adverb group children
    // -- we see about 50 exceptions in the corpus..
    new Pair("sadv !< /^grup\\.adv$/ <: /^(rg|neg)$/=adv",
      "adjoinF (grup.adv foot@) adv"),


    // 'z' tag should be 'z0'
    new Pair("z=z <: (__ !< __)", "relabel z z0"),


    // Conjunction groups aren't necessary if they head single
    // prepositional phrases (we already see a `conj < sp` pattern;
    // replicate that)
    new Pair("/^grup\\.c/=grup > conj <: sp=sp", "replace grup sp"),


    // "Lift up" sentence-final periods which have been nested within
    // constituents (convention in AnCora is to have sentence-final
    // periods as final right children of the `sentence` constituent)
    new Pair("__=N <<` (fp|fs=fp <: (/^\\.$/ !. __)) > sentence=sentence",
             "move fp $- N"),


    // AnCora has a few weird parses of "nada que ver" and related
    // phrases. Normalize them:
    //
    //     (grup.nom (pi000000 X) (S (relatiu (pr000000 que))
    //                               (infinitiu (vmn0000 Y))))
    new Pair("(pi000000 <: __ !$+ S >` (/^grup\\.nom/=gn >` sn=sn))" +
               ". ((que >: (__=queTag $- =sn)) . (__=vb !< __ >>: (__=vbContainer $- =queTag)))",


             "[insert (S (relatiu (pr000000 que)) (infinitiu vmn0000=vbFoot)) >-1 gn]" +
               "[move vb >0 vbFoot]" +
               "[delete queTag]" +
               "[delete vbContainer]"),


    // One more bizarre "nada que ver"
    new Pair("sn=sn <: (/^grup\\.nom/=gn <<: Nada)" +
               "$+ (infinitiu=inf <<, que=que <<` (ver , =que) $+ sp=sp)",


             "[delete inf] [insert (S (relatiu (pr000000 que)) (infinitiu (vmn0000 ver))) >-1 gn]" +
               "[move sp >-1 sn]"),


    // Remove date lead-ins
    new Pair("sentence <<, (sn=sn <, (/^grup\\.w$/ $+ fp))",
             "delete sn"),


    // Shed "conj" parents of periods in the middle of trees so that
    // our splitter can identify sentence boundaries properly
    new Pair("conj=conj <: fp=fp", "replace conj fp"),


    // Fix mis-tagging of inverted question mark
    new Pair("fit=fit <: ¿", "relabel fit fia"),
  };


  // Compiled form of `cleanupStrs`, in the same order
  private static final List<Pair<TregexPattern, TsurgeonPattern>> cleanup
    = compilePatterns(cleanupStrs);


  /**
   * If one of the constituents in this set has a single child has a
   * multi-word token, it should be replaced by a node heading the
   * expanded word leaves rather than simply receive that node as a
   * child.
   *
   * Note that this is only the case for constituents with a *single*
   * child which is a multi-word token.
   */
  private static final Set<String> mergeWithConstituentWhenPossible = new HashSet<String>(
    Arrays.asList(
      "grup.adv",
      "grup.nom",
      "grup.nom.loc",
      "grup.nom.org",
      "grup.nom.otros",
      "grup.nom.pers",
      "grup.verb",
      "spec"
    ));


  // Customization
  private boolean simplifiedTagset;
  private boolean aggressiveNormalization;
  private boolean retainNER;


  public SpanishTreeNormalizer() {
    this(true, false, false);
  }


  public SpanishTreeNormalizer(boolean simplifiedTagset,
                               boolean aggressiveNormalization,
                               boolean retainNER) {
    super(new SpanishTreebankLanguagePack());


    if (retainNER && !simplifiedTagset)
      throw new IllegalArgumentException("retainNER argument only valid when " +
                                         "simplified tagset is used");


    this.simplifiedTagset = simplifiedTagset;
    this.aggressiveNormalization = aggressiveNormalization;
    this.retainNER = retainNER;
  }


  @Override
  public Tree normalizeWholeTree(Tree tree, TreeFactory tf) {
    // Begin with some basic transformations
    tree = tree.prune(emptyFilter).spliceOut(aOverAFilter)
      .transform(constituentRenamer);


    // Now start some simple cleanup
    tree = Tsurgeon.processPatternsOnTree(cleanup, tree);


    // That might've produced some more A-over-As
    tree = tree.spliceOut(aOverAFilter);


    // Find all named entities which are not multi-word tokens and nest
    // them within named entity NP groups
    if (retainNER)
      markSimpleNamedEntities(tree);


    for (Tree t : tree) {
      if (simplifiedTagset && t.isPreTerminal()) {
        // This is a part of speech tag. Remove extra morphological
        // information.
        CoreLabel label = (CoreLabel) t.label();
        String pos = label.value();


        pos = simplifyPOSTag(pos).intern();
        label.setValue(pos);
        label.setTag(pos);
      } else if (aggressiveNormalization && isMultiWordCandidate(t)) {
        // Expand multi-word token if necessary
        normalizeForMultiWord(t, tf);
      }
    }


    // More tregex-powered fixes
    tree = expandElisions(tree);
    tree = expandConmigo(tree);
    tree = expandCliticPronouns(tree);


    // Make sure the tree has a top-level unary rewrite; the root
    // should have a proper root label
    String rootLabel = tlp.startSymbol();
    if (!tree.value().equals(rootLabel))
      tree = tf.newTreeNode(rootLabel, Collections.singletonList(tree));


    return tree;
  }


  @Override
  public String normalizeTerminal(String word) {
    if (spellingFixes.containsKey(word))
      return spellingFixes.get(word);
    return word;
  }


  /**
   * Return a "simplified" version of an original AnCora part-of-speech
   * tag, with much morphological annotation information removed.
   */
  private String simplifyPOSTag(String pos) {
    if (pos.length() == 0)
      return pos;


    switch (pos.charAt(0)) {
    case 'd':
      // determinant (d)
      //   retain category, type
      //   drop person, gender, number, possessor
      return pos.substring(0, 2) + "0000";
    case 's':
      // preposition (s)
      //   retain category, type
      //   drop form, gender, number
      return pos.substring(0, 2) + "000";
    case 'p':
      // pronoun (p)
      //   retain category, type
      //   drop person, gender, number, case, possessor, politeness
      return pos.substring(0, 2) + "000000";
    case 'a':
      // adjective
      //   retain category, type, grade
      //   drop gender, number, function
      return pos.substring(0, 3) + "000";
    case 'n':
      // noun
      //   retain category, type, number, NER label
      //   drop type, gender, classification


      char ner = retainNER && pos.length() == 7 ? pos.charAt(6) : '0';
      return pos.substring(0, 2) + '0' + pos.charAt(3) + "00" + ner;
    case 'v':
      // verb
      //   retain category, type, mood, tense
      //   drop person, number, gender
      return pos.substring(0, 4) + "000";
    default:
      // adverb
      //   retain all
      // punctuation
      //   retain all
      // numerals
      //   retain all
      // date and time
      //   retain all
      // conjunction
      //   retain all
      return pos;
    }
  }


  /**
   * Matches a verb with attached pronouns; used in several following
   * Tregex expressions.
   *
   * The matched leaf is named {@code vb}. Note that this fragment is
   * deliberately incomplete: it ends inside an open parenthesis around
   * the verb tag node, and every pattern built from it must append the
   * rest of that node description and the closing parenthesis.
   */
  private static final String VERB_LEAF_WITH_PRONOUNS_TREGEX =
    // Match a leaf that looks like it has a clitic pronoun


    // Match suffixes of regular forms which may carry attached
    // pronouns (imperative, gerund, infinitive)
    "/(?:(?:[aeiáéí]r|[áé]ndo|[aeáé]n?|[aeiáéí](?:d(?!os)|(?=os)))" +
      // Match irregular imperative stems
      "|^(?:d[ií]|h[aá]z|v[eé]|p[oó]n|s[aá]l|sé|t[eé]n|v[eé]n|(?:id(?=os$))))" +
      // Match attached pronouns
      "(?:(?:(?:[mts]e|n?os|les?)(?:l[oa]s?)?)|l[oa]s?)$/=vb " +
      // It should actually be a verb (gerund, imperative or
      // infinitive)
      //
      // (Careful: other code that uses this pattern requires that this
      // node be at the end, with parens so that it can be named /
      // modified. See e.g. #verbWithCliticPronounsAndSiblings)
      "> (/^vm[gmn]0000$/";


  /**
   * Matches verbs (infinitives, gerunds and imperatives) which have
   * attached pronouns, and the clauses which contain them.
   *
   * This variant requires the verb tag to be an only child. Named
   * nodes: {@code vb} (the verb leaf), {@code target} (the verb
   * phrase / infinitive / gerund constituent just below the clause)
   * and {@code clause} (the containing clause).
   */
  private static final TregexPattern verbWithCliticPronouns =
    TregexPattern.compile(VERB_LEAF_WITH_PRONOUNS_TREGEX +
                          // Verb tag should not have siblings in verb
                          // phrase (this also closes the parenthesis
                          // left open by the shared fragment)
                          " !$ __)" +
                          // Locate the clause which contains it, and
                          // the child just below that clause
                          ">+(/^[^S]/) (/^(infinitiu|gerundi|grup\\.verb)$/=target " +
                          "> /^(sentence|S|grup\\.verb|infinitiu|gerundi)$/=clause << =vb " +
                          // Make sure we're not up too far in the tree:
                          // there should be no infinitive / gerund /
                          // verb phrase between the located ancestor
                          // and the verb
                          "!<< (/^(infinitiu|gerundi|grup\\.verb)$/ << =vb))");


  /**
   * Matches verbs (infinitives, gerunds and imperatives) which have
   * attached pronouns and siblings within their containing verb
   * phrases.
   *
   * Named nodes: {@code vb} (the verb leaf), {@code target} (here the
   * verb tag node itself, not its containing constituent) and
   * {@code clause} (the containing clause).
   */
  private static final TregexPattern verbWithCliticPronounsAndSiblings =
    TregexPattern.compile(VERB_LEAF_WITH_PRONOUNS_TREGEX +
        // Name the matched verb tag as the target for insertion;
        // require that it have siblings (this also closes the
        // parenthesis left open by the shared fragment)
        "=target $ __) " +
        // Locate the clause which contains it, and
        // the child just below that clause
        ">+(/^[^S]/) (/^(infinitiu|gerundi|grup\\.verb)$/ " +
        "> /^(sentence|S|grup\\.verb|infinitiu|gerundi)$/=clause << =vb " +
        // Make sure we're not up too far in the tree:
        // there should be no infinitive / gerund /
        // verb phrase between the located ancestor
        // and the verb
        "!<< (/^(infinitiu|gerundi|grup\\.verb)$/ << =vb))");


  /**
   * Matches verbs which really should be in a clause, but were
   * squeezed into an infinitive constituent (because the pronoun was
   * attached to the verb, we could just pretend it wasn't a clause..
   * not anymore!)
   *
   * The {@code infinitiu} constituent (directly under an {@code sp})
   * is named {@code target}.
   */
  private static final TregexPattern clauselessVerbWithCliticPronouns = TregexPattern.compile(
    VERB_LEAF_WITH_PRONOUNS_TREGEX +
      ") > (/^vmn/ > (/^infinitiu$/=target > /^sp$/))"
  );
  // Wraps the `target` matched above in a new `S` node
  private static final TsurgeonPattern clausifyVerbWithCliticPronouns =
    Tsurgeon.parseOperation("adjoinF (S foot@) target");


  // Shared instance used to split verbs from their attached pronouns
  private static final SpanishVerbStripper verbStripper = SpanishVerbStripper.getInstance();


  /**
   * Separate clitic pronouns into their own tokens in the given tree.
   * (The clitic pronouns are attached under new `grup.nom` constituents
   * which follow the verbs to which they were formerly attached.)
   */
  private static Tree expandCliticPronouns(Tree t) {
    // Perform some cleanup first -- we want to match as many
    // clitic-attached verbs as possible..
    t = Tsurgeon.processPattern(clauselessVerbWithCliticPronouns,
      clausifyVerbWithCliticPronouns, t);


    // Run two separate stages: one for only-child VPs, then another
    // for VP children which have siblings
    t = expandCliticPronounsInner(t, verbWithCliticPronouns);
    t = expandCliticPronounsInner(t, verbWithCliticPronounsAndSiblings);


    return t;
  }


  /**
   * Expand clitic pronouns on verbs matching the given pattern.
   */
  private static Tree expandCliticPronounsInner(Tree t, TregexPattern pattern) {
    TregexMatcher matcher = pattern.matcher(t);
    while (matcher.find()) {
      Tree verbNode = matcher.getNode("vb");
      String verb = verbNode.value();


      if (!SpanishVerbStripper.isStrippable(verb))
        continue;


      Pair<String, List<String>> split = verbStripper.separatePronouns(verb);
      if (split == null)
        continue;


      // Retrieve some context for the pronoun disambiguator: take the
      // matched clause and walk (at most) two constituents up
      StringBuilder clauseYieldBuilder = new StringBuilder();
      for (Label label : matcher.getNode("clause").yield())
        clauseYieldBuilder.append(label.value()).append(" ");
      String clauseYield = clauseYieldBuilder.toString();
      clauseYield = clauseYield.substring(0, clauseYield.length() - 1);


      // Insert clitic pronouns as leaves of pronominal phrases which are
      // siblings of `target`. Iterate in reverse order since pronouns are
      // attached to immediate right of `target`
      List<String> pronouns = split.second();
      for (int i = pronouns.size() - 1; i >= 0; i--) {
        String pronoun = pronouns.get(i);


        String newTreeStr = null;
        if (AnCoraPronounDisambiguator.isAmbiguous(pronoun)) {
          AnCoraPronounDisambiguator.PersonalPronounType type =
            AnCoraPronounDisambiguator.disambiguatePersonalPronoun(split, i, clauseYield);


          switch (type) {
            case OBJECT:
              newTreeStr = "(sn (grup.nom (pp000000 %s)))"; break;
            case REFLEXIVE:
              newTreeStr = "(morfema.pronominal (pp000000 %s))"; break;
            case UNKNOWN:
              // Mark for manual disambiguation
              newTreeStr = "(PRONOUN? (pp000000 %s))"; break;
          }
        } else {
          // Unambiguous clitic pronouns are all indirect / direct
          // object pronouns.. convenient!
          newTreeStr = "(sn (grup.nom (pp000000 %s)))";
        }


        String patternString = "[insert " + String.format(newTreeStr, pronoun) + " $- target]";
        TsurgeonPattern insertPattern = Tsurgeon.parseOperation(patternString);
        t = insertPattern.matcher().evaluate(t, matcher);
      }


      TsurgeonPattern relabelOperation =
        Tsurgeon.parseOperation(String.format("[relabel vb /%s/]", split.first()));
      t = relabelOperation.matcher().evaluate(t, matcher);
    }


    return t;
  }


  private static final List<Pair<TregexPattern, TsurgeonPattern>> markSimpleNEs;


  // Generate some reusable patterns for four different NE groups
  static {
    @SuppressWarnings("unchecked")
    Pair<String, String>[] patternTemplates = new Pair[] {
      // NE as only child of a `grup.nom`
      new Pair("/^grup\\.nom$/=target <: (/np0000%c/ < __)",
               "[relabel target /grup.nom.%s/]"),


      // NE as child with a right sibling in a `grup.nom`
      new Pair("/^grup\\.nom$/ < ((/np0000%c/=target < __) $+ __)",
               "[adjoinF (grup.nom.%s foot@) target]"),


      // NE as child with a left sibling in a `grup.nom`
      new Pair("/^grup\\.nom$/ < ((/np0000%c/=target < __) $- __)",
               "[adjoinF (grup.nom.%s foot@) target]")
    };


    // Pairs tagset annotation codes with the annotations used in our
    // constituents
    @SuppressWarnings("unchecked")
    Pair<Character, String>[] namedEntityTypes = new Pair[] {
      new Pair('0', "otros"), // other
      new Pair('l', "lug"), // place
      new Pair('o', "org"), // location
      new Pair('p', "pers"), // person
    };


    markSimpleNEs =
      new ArrayList<Pair<TregexPattern, TsurgeonPattern>>(patternTemplates.length * namedEntityTypes.length);
    for (Pair<String, String> template : patternTemplates) {
      for (Pair<Character, String> namedEntityType : namedEntityTypes) {
        String tregex = String.format(template.first(), namedEntityType.first());
        String tsurgeon = String.format(template.second(), namedEntityType.second());


        markSimpleNEs.add(new Pair<TregexPattern, TsurgeonPattern>(TregexPattern.compile(tregex),
                                                                   Tsurgeon.parseOperation(tsurgeon)));
      }
    }
  };


  /**
   * Find all named entities which are not multi-word tokens and nest
   * them in named entity NP groups (`grup.nom.{lug,org,pers,otros}`).
   *
   * Do this only for "simple" NEs: the multi-word NEs have to be done
   * at a later step in `MultiWordPreprocessor`.
   *
   * The tree returned by Tsurgeon is discarded, so callers only see
   * changes made to the given tree in place.
   *
   * @param t tree to annotate
   */
  void markSimpleNamedEntities(Tree t) {
    Tsurgeon.processPatternsOnTree(markSimpleNEs, t);
  }


  /**
   * Determine whether the given tree node is a multi-word token
   * expansion candidate. (True if the node has at least one grandchild
   * which is a leaf node.)
   */
  boolean isMultiWordCandidate(Tree t) {
    for (Tree child : t.children())
      for (Tree grandchild : child.children())
        if (grandchild.isLeaf())
          return true;


    return false;
  }


  /**
   * Normalize a pre-pre-terminal tree node by accounting for multi-word
   * tokens.
   *
   * Detects multi-word tokens in leaves below this pre-pre-terminal and
   * expands their constituent words into separate leaves.
   */
  void normalizeForMultiWord(Tree t, TreeFactory tf) {
    Tree[] preterminals = t.children();


    for (int i = 0; i < preterminals.length; i++) {
      // This particular child is not actually a preterminal --- skip
      if (!preterminals[i].isPreTerminal())
        continue;


      Tree leaf = preterminals[i].firstChild();
      String leafValue = ((CoreLabel) leaf.label()).value();


      String[] words = getMultiWords(leafValue);
      if (words.length == 1)
        continue;


      // Leaf is a multi-word token; build new nodes for each of its
      // constituent words
      List<Tree> newNodes = new ArrayList<Tree>(words.length);
      for (int j = 0; j < words.length; j++) {
        String word = normalizeTerminal(words[j]);


        Tree newLeaf = tf.newLeaf(word);
        if (newLeaf.label() instanceof HasWord)
          ((HasWord) newLeaf.label()).setWord(word);


        Tree newNode = tf.newTreeNode(MW_TAG, Arrays.asList(newLeaf));
        if (newNode.label() instanceof HasTag)
          ((HasTag) newNode.label()).setTag(MW_TAG);


        newNodes.add(newNode);
      }


      // Value of the phrase which should head these preterminals. Mark
      // that this was created from a multiword token, and also retain
      // the original parts of speech.
      String phraseValue = MW_PHRASE_TAG + "_"
        + simplifyPOSTag(preterminals[i].value());


      // Should we insert these new nodes as children of the parent `t`
      // (i.e., "merge" the multi-word token phrase into its parent), or
      // head them with a new node and set that as a child of the
      // parent?
      boolean shouldMerge = preterminals.length == 1
        && mergeWithConstituentWhenPossible.contains(t.value());


      if (shouldMerge) {
        t.setChildren(newNodes);
        t.setValue(phraseValue);
      } else {
        Tree newHead = tf.newTreeNode(phraseValue, newNodes);
        t.setChild(i, newHead);
      }
    }
  }


  // Matches a token which is wrapped in double quotes as a whole;
  // group 1 is the quoted content
  private static final Pattern pQuoted = Pattern.compile("\"(.+)\"");


  /**
   * Characters which may separate words in a single token. Used as the
   * delimiter set when a token is split into its words; the
   * delimiters themselves are returned by the splitter as well.
   */
  private static final String WORD_SEPARATORS = ",-_¡!¿?()/%";


  /**
   * Word separators which are dropped from a multi-word token rather
   * than kept as separate "words" of their own.
   */
  private static final String WORD_SEPARATORS_DROP = "_";


  /**
   * These bound morphemes should not be separated from the words with
   * which they are joined by hyphen.
   */
  // TODO how to handle clitics? chino-japonés
  private static final Set<String> hyphenBoundMorphemes = new HashSet<String>(Arrays.asList(
      "anti", // anti-Gil
      "co", // co-promotora
      "ex", // ex-diputado
      "meso", // meso-americano
      "neo", // neo-proteccionismo
      "pre", // pre-presidencia
      "pro", // pro-indonesias
      "quasi", // quasi-unidimensional
      "re", // re-flotamiento
      "semi", // semi-negro
      "sub" // sub-18
  ));


  /**
   * Prepare the given token for multi-word detection / extraction.
   *
   * This method makes up for some various oddities in corpus annotations.
   */
  private String prepareForMultiWordExtraction(String token) {
    return token.replaceAll("-fpa-", "(").replaceAll("-fpt-", ")");
  }


  /**
   * Return the (single or multiple) words which make up the given
   * token.
   *
   * A token which is entirely wrapped in double quotes yields three
   * words: an opening quote, the quoted content and a closing quote.
   * Any other token is split at the {@code WORD_SEPARATORS}
   * characters, with these exceptions: separators that should be
   * dropped are dropped, hyphen-bound morphemes stay joined to the
   * following word, and a comma between two numeric words is kept
   * inside a single word.
   *
   * TODO can't SpanishTokenizer handle most of this?
   *
   * @param token token to split
   * @return the words of the token, in order
   */
  private String[] getMultiWords(String token) {
    token = prepareForMultiWordExtraction(token);


    Matcher quoteMatcher = pQuoted.matcher(token);
    if (quoteMatcher.matches()) {
      String[] ret = new String[3];
      ret[0] = "\"";
      ret[1] = quoteMatcher.group(1);
      ret[2] = "\"";


      return ret;
    }


    // Confusing: we are using a tokenizer to split a token into its
    // constituent words. Delimiters are returned as tokens as well.
    StringTokenizer splitter = new StringTokenizer(token, WORD_SEPARATORS,
                                                   true);
    // Kept in step with the splitter so that we know how much
    // lookahead is available
    int remainingTokens = splitter.countTokens();


    List<String> words = new ArrayList<String>();


    while (splitter.hasMoreTokens()) {
      String word = splitter.nextToken();
      remainingTokens--;


      if (shouldDropWord(word))
        // This is a delimiter that we should drop
        continue;


      // A bound morpheme followed by at least two more tokens: try to
      // read "<morpheme> - <word>" as a single word
      if (remainingTokens >= 2 && hyphenBoundMorphemes.contains(word)) {
        String hyphen = splitter.nextToken();
        remainingTokens--;


        if (!hyphen.equals("-")) {
          // Ouch. We expected a hyphen here. Clean things up and keep
          // moving.
          words.add(word);


          if (!shouldDropWord(hyphen))
            words.add(hyphen);


          continue;
        }


        String freeMorpheme = splitter.nextToken();
        remainingTokens--;


        words.add(word + hyphen + freeMorpheme);
        continue;
      } else if (word.equals(",") && remainingTokens >= 1 && words.size() > 0) {
        // A comma after a numeric word: try to read
        // "<number> , <number>" as a single word
        int prevIndex = words.size() - 1;
        String prevWord = words.get(prevIndex);


        if (StringUtils.isNumeric(prevWord)) {
          String nextWord = splitter.nextToken();
          remainingTokens--;


          if (StringUtils.isNumeric(nextWord)) {
            words.set(prevIndex, prevWord + ',' + nextWord);
          } else {
            // Expected a number here.. clean up and move on
            words.add(word);
            words.add(nextWord);
          }


          continue;
        }
      }


      // Otherwise..
      words.add(word);
    }


    return words.toArray(new String[words.size()]);
  }


  /**
   * Determine if the given "word" which is part of a multiword token
   * should be dropped.
   */
  private boolean shouldDropWord(String word) {
    return word.length() == 1
      && WORD_SEPARATORS_DROP.indexOf(word.charAt(0)) != -1;
  }


  /**
   * Expand grandchild tokens which are elided forms of multi-word
   * expressions ('al,' 'del').
   *
   * We perform this expansion separately from multi-word expansion
   * because we follow special rules about where the expanded tokens
   * should be placed in the case of elision.
   *
   * The rules applied are those of {@code elisionExpansions}.
   *
   * @param t Tree representing an entire sentence
   * @return the tree after the elision rules have been applied
   */
  private Tree expandElisions(Tree t) {
    return Tsurgeon.processPatternsOnTree(elisionExpansions, t);
  }


  // Table of rules which expand the contractions "al" / "del". Each
  // pair holds a Tregex pattern (first) locating the contraction in a
  // particular syntactic context and the Tsurgeon operations (second)
  // which rewrite the contraction and insert the article ('el', or
  // 'lo' before participles) in the right place for that context.
  // The rules are compiled into `elisionExpansions` below.
  @SuppressWarnings("unchecked")
  private static final Pair<String, String>[] elisionExpansionStrs = new Pair[] {
    // Elided forms with a `prep` ancestor which has an `sn` phrase as a
    // right sibling


    new Pair(// Search for `sn` which is right sibling of closest `prep`
             // ancestor to the elided node; cascade down tree to lowest `sn`
             "/^(prep|sadv|conj)$/ <+(/^(prep|grup\\.(adv|cc|prep))$/) (sp000=sp < /(?i)^(del|al)$/=elided) <<` =sp " +
               "$+ (sn > (__ <+(sn) (sn=sn !< sn) << =sn) !$- sn)",


             // Insert the 'el' specifier as a constituent in adjacent
             // noun phrase
             "[relabel elided /(?i)l//] [insert (spec (da0000 el)) >0 sn]"),


    // Prepositional forms with a `prep` grandparent which has a
    // `grup.nom` phrase as a right sibling
    new Pair("prep < (sp000 < /(?i)^(del|al)$/=elided) $+ /grup\\.nom/=target",


             "[relabel elided /(?i)l//] " +
             "[adjoinF (sn (spec (da0000 el)) foot@) target]"),


    // Elided forms with a `prep` ancestor which has an adjectival
    // phrase as a right sibling ('al segundo', etc.)
    new Pair("prep < (sp000 < /(?i)^(del|al)$/=elided) $+ /s\\.a/=target",


             "[relabel elided /(?i)l//] " +
             // Turn neighboring adjectival phrase into a noun phrase,
             // adjoining original adj phrase beneath a `grup.nom`
             "[adjoinF (sn (spec (da0000 el)) (grup.nom foot@)) target]"),


    // "del que golpea:" insert 'el' as specifier into adjacent relative
    // phrase
    new Pair("sp < (prep=prep < (sp000 < /(?i)^(a|de)l$/=elided) $+ " +
      "(S=S <<, relatiu))",


             // Build a noun phrase in the neighboring relative clause
             // containing the 'el' specifier
             "[relabel elided /(?i)l//] " +
             "[adjoinF (sn (spec (da0000 el)) (grup.nom foot@)) S]"),


    // "al" + infinitive phrase
    new Pair("prep < (sp000 < /(?i)^(al|del)$/=elided) $+ " +
             // Looking for an infinitive directly to the right of the
             // "al" token, nested within one or more clause
             // constituents
             "(S=target <+(S) infinitiu=inf <<, =inf)",


             "[relabel elided /(?i)l//] " +
             "[adjoinF (sn (spec (da0000 el)) foot@) target]"),


    // "al no" + infinitive phrase
    new Pair("prep < (sp000 < /(?i)^al$/=elided) $+ (S=target <, neg <2 infinitiu)",


             "[relabel elided a] " +
             "[adjoinF (sn (spec (da0000 el)) foot@) target]"),


    // "al que quisimos tanto"
    new Pair("prep < (sp000 < /(?i)^al$/=elided) $+ relatiu=target",


             "[relabel elided a] " +
             "[adjoinF (sn (spec (da0000 el)) foot@) target]"),


    // "al de" etc.
    new Pair("prep < (sp000 < /(?i)^al$/=elided) $+ (sp=target <, prep)",


             "[relabel elided a] " +
             "[adjoinF (sn (spec (da0000 el)) (grup.nom foot@)) target]"),


    // leading adjective in sibling: "al chileno Fernando"
    new Pair("prep < (sp000 < /(?i)^(del|al)$/=elided) $+ " +
             "(/grup\\.nom/=target <, /s\\.a/ <2 /sn|nc0[sp]000/)",


             "[relabel elided /(?i)l//] " +
             "[adjoinF (sn (spec (da0000 el)) foot@) target]"),


    // "al" + phrase begun by participle -> "a lo <participle>"
    // e.g. "al conseguido" -> "a lo conseguido"
    new Pair("prep < (sp000 < /(?i)^(al|del)$/=elided) $+ (S=target < participi)",


             "[relabel elided /(?i)l//] " +
             "[adjoinF (sn (spec (da0000 lo)) foot@) target]"),


    // "del" used within specifier; e.g. "más del 30 por ciento"
    new Pair("spec < (sp000=target < /(?i)^del$/=elided) > sn $+ /grup\\.nom/",


             "[relabel elided /(?i)l//] " +
             "[insert (da0000 el) $- target]"),


    // "del," "al" in date phrases: "1 de enero del 2001"
    new Pair("sp000=kill < /(?i)^(del|al)$/ $+ w=target",


             "[delete kill] " +
             "[adjoinF (sp (prep (sp000 de)) (sn (spec (da0000 el)) foot@)) target]"),


    // "a favor del X," "en torno al Y": very common (and somewhat
    // complex) phrase structure that we can match
    new Pair("sp000 < /(?i)^(a|de)l$/=contraction >: (prep >` (/^grup\\.prep$/ " +
      ">` (prep=prep > sp $+ (sn=sn <, /^grup\\.(nom|[wz])/))))",


      "[relabel contraction /(?i)l//] [insert (spec (da0000 el)) >0 sn]"),


    // "en vez del X": same as above, except prepositional phrase
    // functions as conjunction (and is labeled as such)
    new Pair("sp000 < /(?i)^(a|de)l$/=contraction >: (prep >` (sp >: (conj $+ (sn=sn <, /^grup\\.(nom|[wz])/))))",


      "[relabel contraction /(?i)l//] [insert (spec (da0000 el)) >0 sn]"),


    // "a favor del X," "en torno al Y" where X, Y are doubly nested
    // substantives
    new Pair("sp000 < /(?i)^(a|de)l$/=contraction >: (prep >` (/^grup\\.prep$/ " +
      ">` (prep=prep > sp $+ (sn <, (sn=sn <, /^grup\\.(nom|[wz])/)))))",


      "[relabel contraction /(?i)l//] [insert (spec (da0000 el)) >0 sn]"),


    // "a favor del X," "en torno al Y" where X, Y already have
    // leading specifiers
    new Pair("sp000 < /(?i)^(a|de)l$/=contraction >: (prep >` (/^grup\\.prep$/ " +
      ">` (prep > sp $+ (sn=sn <, spec=spec))))",


      "[relabel contraction /(?i)l//] [insert (da0000 el) >0 spec]"),


    // "a favor del X," "en torno al Y" where X, Y are nominal
    // groups (not substantives)
    new Pair("sp000 < /(?i)^(a|de)l$/=contraction >: (prep >` (/^grup\\.prep$/ " +
      ">` (prep > sp $+ /^grup\\.(nom|[wz])$/=ng)))",


      "[adjoinF (sn (spec (da0000 el)) foot@) ng] [relabel contraction /(?i)l//]"),


    // "al," "del" as part of coordinating conjunction: "frente al,"
    // "además del"
    //
    // (nearby noun phrase labeled as nominal group)
    new Pair("sp000 < /(?i)^(de|a)l$/=elided >` (/^grup\\.cc$/ >: (conj $+ /^grup\\.nom/=gn))",
      "[relabel elided /(?i)l//] [adjoinF (sn (spec (da0000 el)) foot@) gn]"),


    // "al" + participle in adverbial phrase: "al contado," "al descubierto"
    new Pair("sp000=sp < /(?i)^al$/=elided $+ /^vmp/",
      "[relabel elided /(?i)l//] [insert (da0000 el) $- sp]"),


    // über-special case: 15021_20000218.tbf-5
    //
    // intentional: article should bind all of quoted phrase, even
    // though there are multiple clauses (kind of a crazy sentence)
    new Pair("prep < (sp000 < /(?i)^(al|del)$/=elided) $+ (S=S <+(S) (/^f/=punct $+ (S <+(S) (S <, infinitiu))))",
      "[relabel elided /(?i)l//] [adjoinF (sn (spec (da0000 el)) (grup.nom foot@)) S]"),


    // special case: "del todo" -> "de el todo" (flat)
    new Pair("__=sp < del=contraction >, __=parent $+ (__ < todo >` =parent)",
      "[relabel contraction de] [insert (da0000 el) $- sp]"),
  };


  // Compiled form of `elisionExpansionStrs`, in the same order
  private static final List<Pair<TregexPattern, TsurgeonPattern>> elisionExpansions =
    compilePatterns(elisionExpansionStrs);


  private static TregexPattern conmigoPattern =
    TregexPattern.compile("/(?i)^con[mst]igo$/=conmigo > (/^pp/ > (/^grup\\.nom$/ > sn=sn))");


  /**
   * ¡Venga, expand conmigo!
   */
  private static Tree expandConmigo(Tree t) {
    TregexMatcher matcher = conmigoPattern.matcher(t);


    while (matcher.find()) {
      Tree conmigoNode = matcher.getNode("conmigo");
      String word = conmigoNode.value();


      String newPronoun = null;
      if (word.equalsIgnoreCase("conmigo"))
        newPronoun = "mí";
      else if (word.equalsIgnoreCase("contigo"))
        newPronoun = "ti";
      else if (word.equalsIgnoreCase("consigo"))
        newPronoun = "sí";


      if (word.charAt(0) == 'C')
        newPronoun = newPronoun.toUpperCase();


      String tsurgeon = String.format(
        "[relabel conmigo /%s/]" +
          "[adjoinF (sp (prep (sp000 con)) foot@) sn]",
        newPronoun);
      TsurgeonPattern pattern = Tsurgeon.parseOperation(tsurgeon);
      t = pattern.matcher().evaluate(t, matcher);
    }


    return t;
  }


  private static List<Pair<TregexPattern, TsurgeonPattern>> compilePatterns(Pair<String, String>[] patterns) {
    List<Pair<TregexPattern, TsurgeonPattern>> ret = new ArrayList<Pair<TregexPattern, TsurgeonPattern>>(patterns.length);
    for (Pair<String, String> pattern : patterns)
      ret.add(new Pair<TregexPattern, TsurgeonPattern>(TregexPattern.compile(pattern.first()),
                                                       Tsurgeon.parseOperation(pattern.second())));


    return ret;
  }


}
Source Code of edu.stanford.nlp.trees.international.spanish.SpanishTreeNormalizer

Related Classes of edu.stanford.nlp.trees.international.spanish.SpanishTreeNormalizer