Package opennlp.ccgbank.extract

Source Code of opennlp.ccgbank.extract.Testbed

///////////////////////////////////////////////////////////////////////////////
// Copyright (C) 2005-2009 Scott Martin, Rajakrishan Rajkumar and Michael White
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
//////////////////////////////////////////////////////////////////////////////

// Program which creates a temp.xml file from the bareparse. temp.xml serves as the input for creating lexicon.xml & morph.xml

package opennlp.ccgbank.extract;

import java.io.*;
import java.util.*;

import opennlp.ccgbank.CCGBankTaskSources;
import opennlp.ccgbank.CCGBankTaskTestbed;
import opennlp.ccg.grammar.Grammar;
import opennlp.ccg.grammar.RuleGroup;
import opennlp.ccg.hylo.*;
import opennlp.ccg.lexicon.*;
import opennlp.ccg.parse.ParseException;
import opennlp.ccg.synsem.*;
import opennlp.ccg.test.*;
import opennlp.ccg.unify.*;

import org.jdom.Document;
import org.jdom.Element;
import org.jdom.JDOMException;
import org.jdom.input.SAXBuilder;
import org.jdom.output.Format;
import org.jdom.output.XMLOutputter;

/**
* Creates test files in the 'test' dir under the extracted grammar.
*/
public class Testbed {

  // the grammar (loaded by the constructor), plus shortcuts to its lexicon and rules
  private Grammar grammar;
  private Lexicon lexicon;
  private RuleGroup rules;

  // supertagger stand-in: installed on the lexicon before each leaf lookup and
  // set to the leaf's "stag" attribute
  private SupertaggerStandIn supertaggerStandIn = new SupertaggerStandIn();
 
  // results of following deriv (reset/filled by followDeriv)
  // the sign chosen for the current derivation
  private Sign sign = null;

  // the converted LF of the chosen sign, or null if none could be obtained
  private LF lf = null;

  // word forms of the leaves visited so far, separated by spaces
  private String str = "";

  // number of signs derived for the root of the current derivation
  private int numParses = 0;

  // value of the derivation element's "Header" attribute (may be null)
  private String header = "";

  // Store info related to treenodes in the xml deriv: one
  // "header nt_id catId" line per entry, collected only when treeInfoFlag is set
  private static ArrayList<String> treeInfo = new ArrayList<String>();

  // set to true once a tree node info file has been configured
  private static boolean treeInfoFlag = false;

  // Store details of preds: key is the nominal id (text before the first ':'),
  // value is "stag:pos:pred name" with each part escaped
  private static Hashtable<String, String> predInfo = new Hashtable<String, String>();

  // supertag-rule combos seen so far; only created when a combos file is configured
  private Set<String> combos = null;

  // the derivation files to process, grouped by source
  Set<CCGBankTaskSources> sourcesSet;

  // the task object supplying options, output file locations and logging
  CCGBankTaskTestbed ccgBankTaskTestbed;

  // grammar.xml inside targetDirectory, and the directory of the extracted grammar
  File grammarFile, targetDirectory;

  // constructor
  public Testbed(Set<CCGBankTaskSources> sourcesSet, File targetDirectory,
      CCGBankTaskTestbed testbed) throws IOException {
    grammarFile = new File(targetDirectory, "grammar.xml");

    this.grammar = new Grammar(grammarFile.toURI().toURL(), true);
    this.lexicon = grammar.lexicon;
    this.rules = grammar.rules;

    this.sourcesSet = sourcesSet;
    this.targetDirectory = targetDirectory;
    this.ccgBankTaskTestbed = testbed;
  }

  // main method for creating test files
  @SuppressWarnings("rawtypes")
  public void createTestFiles() throws IOException, JDOMException {

    ccgBankTaskTestbed.log("Creating test files:");

    // config grammar
    Tokenizer tokenizer = grammar.lexicon.tokenizer;
    grammar.prefs.showFeats = true;
    grammar.prefs.showSem = ccgBankTaskTestbed.isShowsSem();

    // ensure test dir exists
    File testDir = new File(targetDirectory, "test");
    testDir.mkdirs();
    ccgBankTaskTestbed.log("Writing test files to: " + testDir.getPath());

    // text, class-replaced text factors etc. output
    PrintWriter textPW = null;
    PrintWriter textscPW = null;
    PrintWriter factorsPW = null;
    PrintWriter combosPW = null;
    PrintWriter predsPW = null;
    PrintWriter treePW = null;
    File textFile = ccgBankTaskTestbed.getText();
    File factorsFile = ccgBankTaskTestbed.getFactors();
    File combosFile = ccgBankTaskTestbed.getCombos();
    File predsFile = ccgBankTaskTestbed.getPreds();
    File treeFile = ccgBankTaskTestbed.getTree();
   
    if (textFile != null) {
      File textscFile=new File(textFile.getParent()+"/"+textFile.getName().replaceFirst("text-","textsc-"));
      ccgBankTaskTestbed.log("Writing text to: " + textFile);
      ccgBankTaskTestbed.log("Writing class-replaced text to: " + textscFile);
            textFile.getParentFile().mkdirs();
      textPW = new PrintWriter(new BufferedWriter(new FileWriter(textFile)));
      textscPW = new PrintWriter(new BufferedWriter(new FileWriter(textscFile)));
    }
    if (factorsFile != null) {
      ccgBankTaskTestbed.log("Writing factors to: " + factorsFile);
            factorsFile.getParentFile().mkdirs();
      factorsPW = new PrintWriter(new BufferedWriter(new FileWriter(factorsFile)));
    }
    if (combosFile != null) {
      ccgBankTaskTestbed.log("Writing supertag-rule combos to: " + combosFile);
            combosFile.getParentFile().mkdirs();
      combos = new HashSet<String>();
      combosPW = new PrintWriter(new BufferedWriter(new FileWriter(combosFile)));
    }
    if (predsFile != null) {
      ccgBankTaskTestbed.log("Writing preds to: " + predsFile);
      predsFile.getParentFile().mkdirs();
      predsPW = new PrintWriter(new BufferedWriter(new FileWriter(predsFile)));
    }
    if (treeFile != null) {
      ccgBankTaskTestbed.log("Writing tree node info to: " + treeFile);
            treeFile.getParentFile().mkdirs();
      treePW = new PrintWriter(new BufferedWriter(new FileWriter(treeFile)));
      treeInfoFlag = true;
    }

    // jdom stuff
    SAXBuilder builder = new SAXBuilder();
    XMLOutputter outputter = new XMLOutputter(Format.getPrettyFormat());

    // counters
    int numWithLFs = 0;
    int numSingleRootLFs = 0;
    int numWithoutLFs = 0;

    for (CCGBankTaskSources sources : sourcesSet) {
      for (File file : sources) {
        File testSectDir = new File(testDir, file.getParentFile().getName());
        testSectDir.mkdir();

        ccgBankTaskTestbed.log("Debug Print: " + testSectDir.getAbsolutePath());

        // parse derivations
        Document inDoc = builder.build(file);
        Element inRoot = inDoc.getRootElement();

        // make test doc, sign map
        Document outDoc = new Document();
        Element outRoot = new Element("regression");
        outDoc.setRootElement(outRoot);
        Map<String,Sign> signMap = new HashMap<String,Sign>();

        // loop through derivations, making test items
        List derivElts = inRoot.getChildren();
        for (Object derivObj : derivElts) {
          Element derivElt = (Element) derivObj;
          followDeriv(derivElt);
          if (lf != null) {
            numWithLFs++;
            // check for single root
            if (lf instanceof SatOp)
              numSingleRootLFs++;
            // add test item, sign
            Element item = RegressionInfo.makeTestItem(grammar, str, numParses, lf);
            if (header == null) {
              header = "missing";
              ccgBankTaskTestbed.log("Warning: missing header in " + file);
            }
            item.setAttribute("info", header);
            if (header != null) signMap.put(header, sign);

            // Add parsed words as a separate LF element
            Element fullWordsElt = new Element("full-words");
            fullWordsElt.addContent(tokenizer.format(sign.getWords()));

            // Add info about LF lexical preds as a separate element
            Element predInfoElt = new Element("pred-info");
            String predInfoText = collectPredInfo(header);
            predInfoElt.setAttribute("data", predInfoText);

            item.addContent(fullWordsElt);
            item.addContent(predInfoElt);
            outRoot.addContent(item);

            // append to text, factors files
            if (textPW != null)
              textPW.println(str);
           
             if (textscPW != null) {
                             textscPW.flush();
                             String textsc="";
                             //Note sem class replacement works only for NE classes spec in the grammar file
                             textsc=tokenizer.getOrthography((List<Word>)sign.getWords(),true);
                             textscPW.println(textsc);
                             textscPW.flush();
             }

           
            if (factorsPW != null)
              factorsPW.println(tokenizer.format(sign.getWords()));
            // append new combos to combos file
            if (combosPW != null) {
              for (String combo : newCombos()) combosPW.println(combo);
            }
            // also to preds
            if (predsPW != null)
              predsPW.println(predInfoText);

            if (treePW != null) {
              for (String info : treeInfo) {
                treePW.println(info);
                treePW.flush();
              }
            }
            treeInfo = new ArrayList<String>();
          } else
            numWithoutLFs++;
        }

        // write test doc, saved signs
        File regressionFile = new File(testSectDir, file.getName());
        outputter.output(outDoc, new FileOutputStream(regressionFile));
        RegressionInfo.writeSerFile(signMap, regressionFile);
      }
    }

    // flush text, factors, combos, preds, tree files
    if (textPW != null) {
      textPW.flush();
      textPW.close();
    }
    if (factorsPW != null) {
      factorsPW.flush();
      factorsPW.close();
    }
    if (combosPW != null) {
      combosPW.flush();
      combosPW.close();
    }
    if (predsPW != null) {
      predsPW.flush();
      predsPW.close();
    }
    if (treePW != null) {
      treePW.flush();
      treePW.close();
    }

    // summary
    ccgBankTaskTestbed.log("numWithLFs: " + numWithLFs);
    ccgBankTaskTestbed.log("numSingleRootLFs: " + numSingleRootLFs);
    ccgBankTaskTestbed.log("numWithoutLFs: " + numWithoutLFs);
    ccgBankTaskTestbed.log("total: " + (numWithLFs + numWithoutLFs));
  }

  private void followDeriv(Element derivElt) {
   
    // reset
    sign = null;
    lf = null;
    str = "";
    header = derivElt.getAttributeValue("Header");
   
    // bookkeeping
    UnifyControl.startUnifySequence();
   
    try {

      Category cat = null;
      Nominal index = null;
      LF flatLF = null;

      // recurse through deriv
      SignHash signs = followDerivR(derivElt);
      // set results, using first available sign (ie some arbitrary one)
      if (!signs.isEmpty()) {
        Iterator<Sign> iter = signs.asSignSet().iterator();
        // System.out.println("Processing file no: "+header);
        // Count of single rooted LFs produced by the constrained parser
        int matchSRLF = 0;
        // Check whether any of the signs have a single rooted LF
        while (iter.hasNext()) {
          // System.out.println("Found LF");
          sign = iter.next();
          cat = sign.getCategory();
          index = cat.getIndexNominal();
          flatLF = cat.getLF();
          if (flatLF != null) {
            lf = HyloHelper.compactAndConvertNominals(flatLF, index, sign);
            // Break when the first single rooted LF is encountered
            if (lf instanceof SatOp) {
              matchSRLF++;
              // System.out.println("Single root LF found");
              break;
            }
          }
        }

        // If no single rooted LF is there, using first available sign
        // (ie some arbitrary one)
        if (matchSRLF == 0) {
          sign = signs.asSignSet().iterator().next();
          cat = sign.getCategory();
          index = cat.getIndexNominal();
          flatLF = cat.getLF();
          if (flatLF != null) lf = HyloHelper.compactAndConvertNominals(flatLF, index, sign);
        }

        if (flatLF != null) {
          extrPredInfo(flatLF, "");
        }

        numParses = signs.size();
        str = str.trim();
      }
     
    } catch (ParseException exc) {
      ccgBankTaskTestbed.log("Warning for " + header + ": " + exc.toString());
    }
  }

  /**
   * Recurses through a derivation element, returning the signs derived for it.
   * <p>
   * A "Treenode" element is handled by recursing into its one or two input
   * children and applying the grammar's unary or binary rules to the
   * resulting signs; a "Leafnode" element is handled by lexical lookup of
   * its lexeme, restricted to the supertag given by its "stag" attribute.
   * Leaf word forms are appended to the <code>str</code> field as a side
   * effect, and tree node info is appended to <code>treeInfo</code> when
   * <code>treeInfoFlag</code> is set.
   *
   * @param derivElt a "Treenode" or "Leafnode" element
   * @return the signs derived for the element; possibly empty for
   *         punctuation leaves whose lookup failed
   * @throws ParseException if the derivation cannot be followed
   * @throws RuntimeException if the element is neither a "Treenode" nor a
   *         "Leafnode"
   */
  @SuppressWarnings({ "rawtypes", "unchecked" })
  private SignHash followDerivR(Element derivElt) throws ParseException {
    String eltName = derivElt.getName();
    // follow deriv, applying combinatory rules
    // nb: no checks made for intended deriv!
    if (eltName.equals("Treenode")) {
      String cat = derivElt.getAttributeValue("cat");
      String ntId = derivElt.getAttributeValue("nt_id");
      String simpleCat = derivElt.getAttributeValue("stag");
      List childElts = derivElt.getChildren();
      int numChildren = childElts.size();
      if (numChildren == 0)
        throw new ParseException(header
            + ": no child elements for TreeNode for cat: " + cat);
      // if no cat element present, adjust list with an initial dummy node,
      // to avoid code changes in what follows
      // NOTE(review): JDOM documents getChildren() as a live list, so this
      // presumably also inserts the dummy element into derivElt -- confirm
      Element elt0 = (Element) childElts.get(0);
      String elt0name = elt0.getName();
      if (elt0name.equals("Treenode") || elt0name.equals("Leafnode")) {
        childElts.add(0, new Element("dummy"));
        numChildren++;
      }
      // index 0 is the cat (or dummy) element; inputs start at index 1
      if (numChildren != 2 && numChildren != 3)
        throw new ParseException(header
            + ": wrong number of child elements: " + numChildren
            + " for cat: " + cat);
      Element firstInputElt = (Element) childElts.get(1);
      SignHash firstSigns = followDerivR(firstInputElt);
      SignHash retval = new SignHash();
      // unary case
      if (numChildren == 2) {
        // apply rules
        for (Sign s : firstSigns.asSignSet()) {
          List<Sign> results = rules.applyUnaryRules(s);
          for (Sign rSign : results)
            retval.insert(rSign);
        }
        // caution/warn upon failure
        // (containsCat also filters retval down to the matching signs when it succeeds)
        if (!containsCat(retval, simpleCat)) {
          boolean noResults = retval.isEmpty();
          String inCat = firstInputElt.getAttributeValue("cat");
          String msg = "Unable to derive: " + cat + " from: " + inCat;
          if (!noResults)
            ccgBankTaskTestbed.log("Caution for " + header + ": " + msg);
          if (ccgBankTaskTestbed.isDebugDerivations()) {
            ccgBankTaskTestbed.log(header + ": derivation stymied; inputs: ");
            for (Sign s : firstSigns.asSignSet()) {
              ccgBankTaskTestbed.log(s.toString());
            }
            if (!noResults) {
              ccgBankTaskTestbed.log("Outputs: ");
              for (Sign s : retval.asSignSet())
                ccgBankTaskTestbed.log(s.toString());
            }
          }
          // results with the wrong cat are kept (with a caution);
          // having no results at all ends the derivation
          if (noResults)
            throw new ParseException("Derivation blocked: " + msg);
        }
      }
      // binary case
      else if (numChildren == 3) {
        Element secondInputElt = (Element) childElts.get(2);
        SignHash secondSigns = followDerivR(secondInputElt);
        // apply rules to every pairing of first and second input signs
        for (Sign sign1 : firstSigns.asSignSet()) {
          for (Sign sign2 : secondSigns.asSignSet()) {
            List<Sign> results = rules.applyBinaryRules(sign1, sign2);
            for (Sign rSign : results)
              retval.insert(rSign);
          }
        }
        // if no results, propagate one input if the other is
        // internal punct
        // (this returns early, so no tree info is recorded for this node)
        if (retval.isEmpty()) {
          if (isPunct(secondInputElt))
            return firstSigns;
          else if (isPunct(firstInputElt))
            return secondSigns;
        }
        // caution/warn upon failure
        // (containsCat also filters retval down to the matching signs when it succeeds)
        if (!containsCat(retval, simpleCat)) {
          boolean noResults = retval.isEmpty();
          String inCat1 = firstInputElt.getAttributeValue("cat");
          String inCat2 = secondInputElt.getAttributeValue("cat");
          String msg = "Unable to derive: " + cat + " from: " + inCat1 + " and: " + inCat2;
          if (!noResults)
            ccgBankTaskTestbed.log("Caution for " + header + ": " + msg);
          if (ccgBankTaskTestbed.isDebugDerivations()) {
            ccgBankTaskTestbed.log(header + ": derivation stymied; first inputs: ");
            for (Sign sign1 : firstSigns.asSignSet()) {
              ccgBankTaskTestbed.log(sign1.toString());
            }
            ccgBankTaskTestbed.log("Second inputs: ");
            for (Sign sign2 : secondSigns.asSignSet()) {
              ccgBankTaskTestbed.log(sign2.toString());
            }
            if (!noResults) {
              ccgBankTaskTestbed.log("Outputs: ");
              for (Sign s : retval.asSignSet())
                ccgBankTaskTestbed.log(s.toString());
            }
          }
          if (noResults)
            throw new ParseException("Derivation blocked: " + msg);
        }
      }

      // Store cat ids of tree nodes for printing to aux files
      if (treeInfoFlag) {

        for (Sign s : retval.asSignSet()) {

          // per-sign tallies filled in by recurseCat: index name -> number,
          // and number -> how many atomic cats carry it
          Hashtable<String, String> idConvTally = new Hashtable<String, String>();
          Hashtable<String, Integer> freqTally = new Hashtable<String, Integer>();
          ArrayList<String> fullCat = new ArrayList<String>();
          String catId = "";

          Category treeCat = s.getCategory();
          // System.out.println(header+" "+ntId+" "+treeCat);
          recurseCat(treeCat, fullCat, idConvTally, freqTally);
          /*
           * System.out.println(freqTally);
           * System.out.println(fullCat); System.out.println('\n');
           */

          // only cats that produced more than one entry are recorded
          if (fullCat.size() > 1) {

            for (String x : fullCat) {

              // entries without '_' (the slash strings, presumably) are copied as is
              String y[] = x.split("_");
              if (y.length == 1) {
                catId = catId + "," + y[0];
                continue;
              }

              // y[1] is the numeric id; count down its remaining occurrences
              int freq = freqTally.get(y[1]);
              freqTally.put(y[1], freq - 1);

              // drop the "_M" marker when this is the last (or only)
              // remaining occurrence of the id
              if (x.endsWith("_M") && freq <= 1)
                x = x.replaceFirst("_M", "");

              catId = catId + "," + x;
            }
            // strip the leading comma
            catId = catId.replaceFirst(",", "");
            treeInfo.add(header + " " + ntId + " " + catId);
          }
          /*
           * System.out.println(idConvTally);
           * System.out.println(fullCat); System.out.println('\n');
           */
        }

      }

      // done
      return retval;
    }
    // lex lookup
    // nb: not always insisting on right POS, b/c hashing strategy uses
    // surface words,
    // thus doesn't distinguish lex signs based solely on POS
    // nb: might make sense to warn on lex cats with missing semantics
    else if (eltName.equals("Leafnode")) {
      try {
        String lex = derivElt.getAttributeValue("lexeme");
        Word w = lexicon.tokenizer.parseToken(lex);
        // accumulate the surface string of the derivation
        str += w.getForm() + " ";
        String cat = derivElt.getAttributeValue("cat");
        String simpleCat = derivElt.getAttributeValue("stag");
        String rel = derivElt.getAttributeValue("rel");
        String indexRel = derivElt.getAttributeValue("indexRel");
        String semClass = "";
        semClass = derivElt.getAttributeValue("class");

        String roles = derivElt.getAttributeValue("argRoles");
        String pos = derivElt.getAttributeValue("pos");
        // nb: for now, need to ignore rel for non-VB pos
        // (a missing pos attribute causes an NPE here, which the
        // RuntimeException handler below turns into a ParseException)
        if (!pos.startsWith("VB"))
          rel = null;
        // lex lookup with required supertag
        // NB: there's no guarantee of getting the right arg roles if the word-cat pair is observed
        lexicon.setSupertagger(supertaggerStandIn);
        supertaggerStandIn.setTag(simpleCat);
        SignHash lexSigns = lexicon.getSignsFromWord(w);

        if (semClass == null || semClass.length() == 0)
          semClass = "NoClass";

        // add lex signs, filtered by rel, reindexed
        // also check number with matching pos, match on no class
        int matchPOS = 0;
        boolean matchNoClass = false;
        for (Iterator<Sign> it = lexSigns.asSignSet().iterator(); it.hasNext();) {
          Sign s = it.next();

          Word wTemp = s.getWords().get(0);
          String morphClass = wTemp.getSemClass();
          if (morphClass == null || morphClass.length() == 0)
            morphClass = "NoClass";

          Category lexcat = s.getCategory();
          LF lexLF = lexcat.getLF();

          // allow any class if no sem class given
          // drop the sign if its class, pred, roles or index rel do not match
          if (!(semClass.equals("NoClass") || semClass.equals(morphClass))
              || !containsPred(lexLF, rel)
              || !containsRoles(lexLF, roles)
              || !containsRel(lexLF, indexRel, s)) {
            it.remove();
          }
          else {
            UnifyControl.reindex(lexcat);
            if (wTemp.getPOS().equals(pos)) {
              matchPOS++;
              if (semClass.equals("NoClass") && morphClass.equals("NoClass"))
                matchNoClass = true;
            }
          }
        }
        // filter by pos unless none match
        if (matchPOS > 0) {
          for (Iterator<Sign> it = lexSigns.asSignSet().iterator(); it.hasNext();) {
            Sign s = it.next();
            Word wTemp = s.getWords().get(0);
            if (!wTemp.getPOS().equals(pos)) {
              it.remove(); continue;
            }
            // filter by mismatched class if apropos
            // (i.e. once a class-less match exists, drop signs that carry a class)
            if (matchNoClass) {
              String morphClass = wTemp.getSemClass();
              if (morphClass != null && morphClass.length() != 0)
                it.remove();
            }
          }
        }
        if (lexSigns.isEmpty())
          throw new LexException("No matching category " + cat + " for: " + w);
        return lexSigns;
      } catch (LexException exc) {
        // try continuing derivations without lex signs for punctuation,
        // otherwise throw parse exception
        if (isPunct(derivElt)) {
          if (ccgBankTaskTestbed.isDebugDerivations()) {
            ccgBankTaskTestbed.log(header + ": " + exc.toString());
          }
          return new SignHash();
        }
        throw new ParseException(exc.toString());
      } catch (RuntimeException exc) {
        // for other exceptions, throw parse exception
        throw new ParseException(exc.toString());
      }
    } else
      throw new RuntimeException(header + ": unrecognized element in derivation: " + eltName);
  }

  // Recurse through a CCG cat and print out the atomcats and their ids
  private static void recurseCat(Category cat, ArrayList<String> fullCat,
      Hashtable<String, String> idConvTally,
      Hashtable<String, Integer> freqTally) {

    if (cat instanceof ComplexCat) {

      ComplexCat cc = (ComplexCat) cat.copy();
      Category resCat = cc.getResult();

      recurseCat(resCat, fullCat, idConvTally, freqTally);
      int argStart = 0;

      if (resCat instanceof ComplexCat) {
        ComplexCat temp = (ComplexCat) resCat.copy();
        argStart = temp.getArgStack().size();
      }

      ArgStack argStack = cc.getArgStack(argStart);

      for (int i = 0; i < argStack.size(); i++) {
        if (argStack.get(i) instanceof BasicArg) {
          BasicArg bArg = (BasicArg) argStack.get(i);
          Category argCat = (Category) bArg.getCat();
          Slash argSlash = (Slash) bArg.getSlash();
          // System.out.println(argSlash.toString()+'\n');
          fullCat.add(argSlash.toString());
          recurseCat(argCat, fullCat, idConvTally, freqTally);
        }
      }

    } else if (cat instanceof AtomCat) {

      AtomCat ac = (AtomCat) cat.copy();
      FeatureStructure fs = ac.getFeatureStructure();

      if (fs.hasAttribute("index")) {
        String index = fs.getValue("index").toString();
        // System.out.println(index);
        String id[] = index.split(":");

        if (!idConvTally.containsKey(id[0]))
          idConvTally.put(id[0], Integer.toString(idConvTally.size() + 1));
        String numId = idConvTally.get(id[0]);
        String catId = ac.getType() + "_" + numId;

        if (!freqTally.containsKey(numId))
          freqTally.put(numId, 0);

        int freq = freqTally.get(numId);
        freqTally.put(numId, freq + 1);

        if (fs.hasAttribute("mod-index"))
          catId = catId + "_" + "M";

        // System.out.println('\n');

        fullCat.add(catId);
      }
    }
  }

  // returns whether the given LF contains the given the lexical predicate
  private static boolean containsPred(LF lf, String pred) {
    if (pred == null)
      return true;
    if (lf == null)
      return false;
    for (SatOp satOp : HyloHelper.getPreds(lf)) {
      if (HyloHelper.isLexPred(satOp)) {
        if (HyloHelper.getLexPred(satOp).equals(pred))
          return true;
      }
    }
    return false;
  }

  // roles in a given LF
  private static Set<String> rolesSet = new HashSet<String>();

  // returns whether the given LF contains the given the lexical predicate
  private static boolean containsRoles(LF lf, String roles) {
    if (roles == null)
      return true;
    if (lf == null)
      return false;
    String[] rolesArray = roles.split("\\s+");
    // get roles in LF
    rolesSet.clear();
    for (SatOp satOp : HyloHelper.getPreds(lf)) {
      if (HyloHelper.isRelPred(satOp)) {
        rolesSet.add(HyloHelper.getRel(satOp));
      }
    }
    // check presence of roles in LF
    for (String role : rolesArray) {
      if (role.equals("null") || role.equals("e"))
        continue;
      if (!rolesSet.contains(role))
        return false;
    }
    return true;
  }

  // returns whether the given LF contains the given indexRel
  private static boolean containsRel(LF lf, String indexRel, Sign sign) {

    if (indexRel == null)
      return true;
    if (lf == null)
      return false;

    indexRel = "<" + indexRel + ">";

    /*
     * System.out.println(sign.getSupertag()+" "+sign.getPOS());
     * System.out.println(indexRel); System.out.println(rolesSet);
     * System.out.println(lf); System.out.println('\n');
     */

    // check presence of that rel/feat in LF
    if (!lf.toString().contains(indexRel))
      return false;
    else
      return true;
  }

  // identifies punctuation
  private static boolean isPunct(Element elt) {
    String pos = elt.getAttributeValue("pos");
    if (pos == null)
      return false;

    return (pos.equals("|") || pos.equals(".") || pos.equals(",")
        || pos.equals(";") || pos.equals(":") || pos.equals("LRB")
        || pos.equals("RRB") || pos.equals("``") || pos.equals("''"));
  }

  // return whether signs contains cat; filter if so
  private static boolean containsCat(SignHash signs, String cat) {
    // special case: give free pass to cats with dollars
    if (!signs.isEmpty() && cat.indexOf('$') >= 0)
      return true;
    // check for cat
    boolean retval = false;
    for (Sign sign : signs.asSignSet()) {
      String supertag = sign.getCategory().getSupertag();
      // again, give free pass to cats with dollars
      if (supertag.indexOf('$') >= 0 || cat.equals(supertag)) {
        retval = true;
        break;
      }
    }
    // filter if found
    if (retval) {
      for (Iterator<Sign> it = signs.asSignSet().iterator(); it.hasNext();) {
        Sign sign = it.next();
        String supertag = sign.getCategory().getSupertag();
        if (supertag.indexOf('$') >= 0 || cat.equals(supertag))
          continue;
        else
          it.remove();
      }
    }
    return retval;
  }

  // returns new combos for current sign
  private List<String> newCombos() {
    List<String> retval = new ArrayList<String>();
    newCombos(sign, retval);
    return retval;
  }

  // recursively adds new combos for given sign
  private void newCombos(Sign s, List<String> retval) {
    Sign[] inputs = s.getDerivationHistory().getInputs();
    if (inputs != null) {
      StringBuffer sb = new StringBuffer();
      for (int i = 0; i < inputs.length; i++) {
        sb.append(inputs[i].getCategory().getSupertag()).append(' ');
      }
      sb.append(s.getDerivationHistory().getRule().name());
      String combo = sb.toString();
      if (!combos.contains(combo)) {
        retval.add(combo);
        combos.add(combo);
      }
      for (int i = 0; i < inputs.length; i++) {
        newCombos(inputs[i], retval);
      }
    }
  }

  // Extracts nom-id,pos,supertag info related to LF lexical preds
  private void extrPredInfo(LF lf, String sentId) {
    // System.out.println(sentId);
    extractPredInfo(lf, predInfo);
  }

  /**
   * Extracts the nom id, pos, and supertag info related to LF lexical preds,
   * and puts it in the given map keyed off the nom id.
   * Note that the map should be cleared for each new LF.
   */
  public static void extractPredInfo(LF lf, Map<String,String> predInfoMap) {

    String predData = "";
    List<SatOp> preds = HyloHelper.getPreds(lf);

    for (SatOp pred : preds) {

      String lexPred = HyloHelper.getLexPred(pred);
      if (lexPred == null)
        continue;

      if (!(pred.getArg() instanceof Proposition))
        continue;

      Proposition p = (Proposition) pred.getArg();

      String lex = (p.getName()).toString();

      // Get supertag & pos tag info and store that
      String stag = pred.getOrigin().getSupertag();
      String pos = pred.getOrigin().getPOS();
      Nominal nom = pred.getNominal();
      String nomInd = nom.toString();
      String nomIndParts[] = nomInd.split(":");

      if (stag == null || pos == null || lex == null)
        continue;

      predData = escape(stag) + ":" + escape(pos) + ":" + escape(lex);
      predInfoMap.put(nomIndParts[0], predData);
    }
  }

  // Collects nom-id,pos,supertag info related to LF lexical preds for this
  // particular LF
  private static String collectPredInfo(String sentId) {

    String predData = "";

    for (Enumeration<String> e = predInfo.keys(); e.hasMoreElements();) {

      String nomId = e.nextElement();
      predData = predData + " " + nomId + ":" + predInfo.get(nomId);
    }

    predInfo = new Hashtable<String, String>();

    return predData.trim();
  }

  /**
   * Returns the pred info string for the given pred info map (see extractPredInfo).
   */
  public static String getPredInfo(Map<String,String> predInfoMap) {
    String predData = "";
    for (String nomId : predInfoMap.keySet()) {
      predData = predData + " " + nomId + ":" + predInfoMap.get(nomId);
    }
    return predData.trim();
  }
 
  // escapes a string using DefaultTokenizer
  private static String escape(String s) { return DefaultTokenizer.escape(s); }
 
  // stands in for a supertagger during lex lookup
  private static class SupertaggerStandIn implements SupertaggerAdapter {
    // map for a single key
    private Map<String,Double> map = new HashMap<String,Double>(2);
    public Map<String,Double> getSupertags() { return map; }
   
    // set tag
    void setTag(String tag) { map.clear(); map.put(tag, 1.0); }
   
    // dummy implementations
    public void setIncludeGold(boolean includeGold) {}
    public void resetBeta() {}
    public void resetBetaToMax() {}
    public void nextBeta() {}
    public void previousBeta() {}
    public boolean hasMoreBetas() { return false; }
    public boolean hasLessBetas() { return false; }
    public double[] getBetas() { return new double[]{1.0}; }
    public void setBetas(double[] betas) {}
    public double getCurrentBetaValue() { return 1.0; }
  }
}
TOP

Related Classes of opennlp.ccgbank.extract.Testbed

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.