Package edu.stanford.nlp.trees.international.tuebadz

Source Code of edu.stanford.nlp.trees.international.tuebadz.TueBaDZLanguagePack

package edu.stanford.nlp.trees.international.tuebadz;

import edu.stanford.nlp.trees.AbstractTreebankLanguagePack;
import edu.stanford.nlp.trees.TreebankLanguagePack;
import edu.stanford.nlp.trees.TreeReaderFactory;
import edu.stanford.nlp.trees.HeadFinder;


/** Language pack for the Tuebingen Treebank of Written German (TueBa-D/Z).
*  http://www.sfs.nphil.uni-tuebingen.de/en_tuebadz.shtml
*  This treebank is in utf-8.
*
@author Roger Levy (rog@stanford.edu)
*/
public class TueBaDZLanguagePack extends AbstractTreebankLanguagePack {
  private boolean limitedGF = false;

  private static String[] gfToKeepArray = {"ON", "OA", "OD"};

  private static String[] tuebadzPunctTags = {"$.","$,","$-LRB"};

  private static String[] tuebadzSFPunctTags = {"$."};


  private static String[] tuebadzPunctWords = { "`", "-", ",", ";", ":", "!", "?", "/", ".", "...","'", "\"", "[", "]", "*"};

  private static String[] tuebadzSFPunctWords = {".", "!", "?"};

  /**
   * The first one is used by the TueBaDZ Treebank, and the rest are used by Klein's lexparser.
   */
  private static char[] annotationIntroducingChars = {':', '^', '~', '%', '#', '='};


  /**
   * Gives a handle to the TreebankLanguagePack
   */
  public TueBaDZLanguagePack() {
    this(false);
  }

  /**
   * Make a new language pack with grammatical functions used based on the value of leaveGF
   */
  public TueBaDZLanguagePack(boolean leaveGF) {
    this(leaveGF, AbstractTreebankLanguagePack.DEFAULT_GF_CHAR);
  }

  /**
   * Make a new language pack with grammatical functions used based on the value of leaveGF
   * and marked with the character gfChar.  gfChar should *not* be an annotation introducing character.
   */
  public TueBaDZLanguagePack(boolean leaveGF, char gfChar) {
    this(false, leaveGF, gfChar);
  }

  /**
   * Make a new language pack with grammatical functions used based on the value of leaveGF
   * and marked with the character gfChar.  gfChar should *not* be an annotation introducing character.
   */
  public TueBaDZLanguagePack(boolean useLimitedGF, boolean leaveGF, char gfChar) {
    super(gfChar);
    this.leaveGF  = leaveGF;
    this.limitedGF = useLimitedGF;
  }


  /**
   * Return an array of characters at which a String should be
   * truncated to give the basic syntactic category of a label.
   * The idea here is that Penn treebank style labels follow a syntactic
   * category with various functional and crossreferencing information
   * introduced by special characters (such as "NP-SBJ=1").  This would
   * be truncated to "NP" by the array containing '-' and "=".
   *
   * @return An array of characters that set off label name suffixes
   */
  @Override
  public char[] labelAnnotationIntroducingCharacters() {
    return annotationIntroducingChars;
  }

  @Override
  public String[] punctuationTags() {
    return tuebadzPunctTags;
  }

  @Override
  public String[] punctuationWords() {
    return tuebadzPunctWords;
  }

  @Override
  public String[] sentenceFinalPunctuationTags() {
    return tuebadzSFPunctTags;
  }

  @Override
  public String[] startSymbols() {
    return new String[] {"TOP"};
  }

  public String[] sentenceFinalPunctuationWords() {
    return tuebadzSFPunctWords;
  }

  public String treebankFileExtension() {
    return ".penn";
  }

  private boolean leaveGF = false;

  @Override
  public String basicCategory(String category) {
    String basicCat = super.basicCategory(category);
    if(!leaveGF) {
      basicCat = stripGF(basicCat);
    }
    return basicCat;
  }

  @Override
  public String stripGF(String category) {
    if(category == null) {
      return null;
    }
    int index = category.lastIndexOf(gfCharacter);
    if(index > 0) {
      if(!limitedGF || !containsKeptGF(category, index))
        category = category.substring(0, index);
    }
    return category;
  }

  /**
   * Helper method for determining if the gf in category
   * is one of those in the array gfToKeepArray.  Index is the
   * index where the gfCharacter appears.
   */
  private static boolean containsKeptGF(String category, int index) {
    for(String gf : gfToKeepArray) {
      int gfLength = gf.length();
      if(gfLength < (category.length() - index)) {
        if(category.substring(index+1).equals(gf))//category.substring(index+1, index+1+gfLength).equals(gf))
          return true;
      }
    }
    return false;
  }

  public boolean isLeaveGF() {
    return leaveGF;
  }

  public void setLeaveGF(boolean leaveGF) {
    this.leaveGF = leaveGF;
  }


  /**
   * Return the input Charset encoding for the Treebank.
   * See documentation for the <code>Charset</code> class.
   *
   * @return Name of Charset
   */
  @Override
  public String getEncoding() {
    return "iso-8859-15";
  }

  /** Prints a few aspects of the TreebankLanguagePack, just for debugging.
   */
  public static void main(String[] args) {
    TreebankLanguagePack tlp = new TueBaDZLanguagePack();
    System.out.println("Start symbol: " + tlp.startSymbol());
    String start = tlp.startSymbol();
    System.out.println("Should be true: " + (tlp.isStartSymbol(start)));
    String[] strs = new String[]{"-", "-LLB-", "NP-2", "NP=3", "NP-LGS", "NP-TMP=3", "CARD-HD"};
    for (String str : strs) {
      System.out.println("String: " + str + " basic: " + tlp.basicCategory(str) + " basicAndFunc: " + tlp.categoryAndFunction(str));
    }
  }

  private static final long serialVersionUID = 2697418320262700673L;


  public boolean isLimitedGF() {
    return limitedGF;
  }

  public void setLimitedGF(boolean limitedGF) {
    this.limitedGF = limitedGF;
  }

  @Override
  public TreeReaderFactory treeReaderFactory() {
    return new TueBaDZTreeReaderFactory(this);
  }

  /** {@inheritDoc} */
  public HeadFinder headFinder() {
    return new TueBaDZHeadFinder();
  }

  /** {@inheritDoc} */
  public HeadFinder typedDependencyHeadFinder() {
    return new TueBaDZHeadFinder();
  }

}
TOP

Related Classes of edu.stanford.nlp.trees.international.tuebadz.TueBaDZLanguagePack

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.