///////////////////////////////////////////////////////////////////////////////
// Copyright (C) 2011-3 Michael White
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
//////////////////////////////////////////////////////////////////////////////
package opennlp.ccg.hylo;
import java.util.*;
import opennlp.ccg.lexicon.Word;
import opennlp.ccg.perceptron.*;
import opennlp.ccg.synsem.*;
import opennlp.ccg.util.TrieMap;
/**
* A class for extracting lexical dependency features. This class
* implements the features in White and Rajkumar's EMNLP-12 paper
* <a href="http://aclweb.org/anthology-new/D/D12/D12-1023.pdf">
* Minimal Dependency Length in Realization Ranking</a>.
*
* Features are extracted lazily for efficiency, using a prefix of "ld"
* for "lexical dependency".
*
* Features potentially involve the word, POS tag, word class, definiteness, syntactic
* complexity, short-long order, and total dependency length, where the latter
* three can be controlled by the appropriate include* flags; of
* these, only dependency length is included by default.
*
* Word class is based on the semantic class, a check for color terms, common suffixes,
* and the presence of a hyphen or capitalization.
*
* The checks for definite NPs, color terms and common suffixes are done by methods with
* defaults for the English CCGbank, which can be overridden in subclasses or
* reconfigured in the case of color terms and suffixes.
*
* @author Michael White
* @version $Revision: 1.4 $, $Date: 2011/11/10 15:12:53 $
*/
public class LexDepFeatureExtractor implements FeatureExtractor {
/** Feature prefix constant: "ld"; it is the first key of every lazily extracted feature. */
public static final String PREFIX = "ld";
/** Head precedes dep constant: "hpd" (the head's word index is lower than the dependent's). */
public static final String HEAD_PRECEDES_DEP = "hpd";
/** Dep precedes head constant: "dph" (used whenever the head does not precede the dependent). */
public static final String DEP_PRECEDES_HEAD = "dph";
/** Left of head sibs precedence constant: "lsp" (siblings whose dependents precede the head). */
public static final String LEFT_SIBS_PRECEDENCE = "lsp";
/** Right of head sibs precedence constant: "rsp" (siblings whose dependents follow the head). */
public static final String RIGHT_SIBS_PRECEDENCE = "rsp";
/**
 * Returns the siblings precedence constant that corresponds to a head-dep order constant.
 *
 * @param depConst the head-dep order constant, normally {@link #HEAD_PRECEDES_DEP}
 *                 or {@link #DEP_PRECEDES_HEAD}
 * @return {@link #LEFT_SIBS_PRECEDENCE} if depConst equals {@link #DEP_PRECEDES_HEAD};
 *         {@link #RIGHT_SIBS_PRECEDENCE} for any other value, including null
 */
public static String sibPrecedenceForDep(String depConst) {
    // compare by value, not identity, so that non-interned strings are recognized too
    return DEP_PRECEDES_HEAD.equals(depConst) ? LEFT_SIBS_PRECEDENCE : RIGHT_SIBS_PRECEDENCE;
}
/** Definiteness constant: "def1". */
public static final String DEF = "def1";
/** Indefiniteness constant: "def0". */
public static final String INDEF = "def0";
/** Class constant PRO, the word class for pronouns. */
public static final String CLASS_PRO = "PRO";
/** Class constant COLOR, the word class for color terms. */
public static final String CLASS_COLOR = "COLOR";
/** Class constant HYPH, the word class for hyphenated words. */
public static final String CLASS_HYPH = "HYPH";
/** Class constant CAP, the word class for capitalized words. */
public static final String CLASS_CAP = "CAP";
/** Class constant NIL, the word class used when no other class applies. */
public static final String CLASS_NIL = "NIL";
/** Flag for whether to include syntactic complexity ordering features (defaults to false). */
public boolean includeComplexityFeats = false;
/** Complexity ordering constant for verb presence: "1v". */
public static final String HAS_V = "1v";
/** Complexity ordering constant for verb absence: "0v". */
public static final String NO_V = "0v";
/** Complexity ordering constant for punct presence: "1p". */
public static final String HAS_P = "1p";
/** Complexity ordering constant for punct absence: "0p". */
public static final String NO_P = "0p";
/** Flag for whether to include short-long features (defaults to false). */
public boolean includeShortLong = false;
/** Short-long order constant: "sl" (the second sibling phrase is the longer one). */
public static final String SHORT_LONG_ORDER = "sl";
/** Long-short order constant: "ls" (the first sibling phrase is the longer one). */
public static final String LONG_SHORT_ORDER = "ls";
/** Flag for whether to include global dependency length features (defaults to true). */
public boolean includeDepLen = true;
/** Global dependency length feature constant: "$deplen"; the sole key of the dependency length feature. */
public static final String DEPLEN = "$deplen";
/**
 * Conditional lazy extractor, for lazily extracting a feature subject to a test.
 * The feature is only looked up in the alphabet when {@code test()} returns true.
 */
public static abstract class ConditionalLazyExtractor {
// decides whether the feature applies to the extractor's current state
abstract boolean test();
// the key extractors that together spell out the feature, in key order
List<TrieMap.KeyExtractor<String>> lazyExtractor = new ArrayList<TrieMap.KeyExtractor<String>>(5);
}
/**
 * Conditional lazy evaluator, for lazily extracting a feature and its value, subject to a test.
 * Where a plain conditional extractor's feature is incremented, an evaluator's feature has
 * the value of {@code eval()} added to it.
 */
public static abstract class ConditionalLazyEvaluator extends ConditionalLazyExtractor {
// the amount to add to the feature's value
abstract float eval();
}
/**
 * Feature map wrapper, for unique retrieval from a sign's data objects.
 * This extractor's map is stored in and fetched from a sign under this wrapper class.
 */
public static class FeatureMapWrapper {
/** The wrapped feature map. */
public FeatureMap featureMap;
/** Wraps the given feature map. */
public FeatureMapWrapper(FeatureMap featureMap) { this.featureMap = featureMap; }
}
/** The alphabet, used to look up features from their keys. */
protected Alphabet alphabet = null;
/** Current feature map, i.e. the map under construction for the current sign. */
protected FeatureMap currentMap = null;
/** Current sign (for extracting features). */
protected Sign currentSign = null;
/** Current input signs (for extracting features). */
protected Sign[] currentInputs = null;
/** Current dependency (for extracting features). */
protected LexDependency currentDep = null;
/** Current head index, i.e. the word index of the current dependency's head in the current sign. */
protected int currentHeadIndex = -1;
/** Current dependent index, i.e. the word index of the current dependency's dependent in the current sign. */
protected int currentDepIndex = -1;
/** Current sibling dependency (for extracting features). */
protected LexDependency currentSib = null;
/** Current sibling dependent index. */
protected int currentSibIndex = -1;
/** Current head broad POS (for extracting features). */
protected String currentHeadBroadPOS = null;
/** Current head-dependent order (for extracting features). */
protected String currentHeadDepOrder = null;
/** Current siblings precedence relation (for extracting features). */
protected String currentSibsPrecedence = null;
/** Current dep precedes sib flag (for extracting features). */
protected boolean currentDepPrecedesSib = false;
/** Current dep sign (for extracting features). */
protected Sign currentDepSign = null;
/** Current sib sign (for extracting features). */
protected Sign currentSibSign = null;
/** Current dep phrase lengths (for extracting features). */
protected PhraseLengths currentDepLengths = null;
/** Current sib phrase lengths (for extracting features). */
protected PhraseLengths currentSibLengths = null;
/** Current difference in lengths between second and first siblings (for extracting features). */
protected PhraseLengths currentLengthsDiff = null;
///** Lexical feature extractors. */
//protected List<List<TrieMap.KeyExtractor<String>>> lexExtractors = new ArrayList<List<TrieMap.KeyExtractor<String>>>();
/** Dependency order feature extractors, applied once per newly filled dependency. */
protected List<List<TrieMap.KeyExtractor<String>>> depOrderExtractors = new ArrayList<List<TrieMap.KeyExtractor<String>>>();
/** Siblings order feature extractors, applied once per same-side sibling pair. */
protected List<List<TrieMap.KeyExtractor<String>>> sibsOrderExtractors = new ArrayList<List<TrieMap.KeyExtractor<String>>>();
/** Siblings order conditional feature extractors. */
protected List<ConditionalLazyExtractor> sibsOrderCondExtractors = new ArrayList<ConditionalLazyExtractor>();
/** Siblings complexity order conditional feature extractors. */
protected List<ConditionalLazyExtractor> sibsComplexityOrderCondExtractors = new ArrayList<ConditionalLazyExtractor>();
/** Short-long conditional feature evaluators. */
protected List<ConditionalLazyEvaluator> shortLongCondEvaluators = new ArrayList<ConditionalLazyEvaluator>();
/** Global dependency length keys (just DEPLEN, added by the constructor). */
protected List<String> depLenKeys = new ArrayList<String>(1);
/** Global dependency length feature, cached after its first successful lookup in the alphabet. */
protected Alphabet.Feature depLenFeat = null;
/**
 * Constructor. Registers the lazy feature extractors for head-dep order,
 * siblings order, syntactic complexity and short-long order, plus the key
 * of the global dependency length feature. The extractors are only built here;
 * which groups are actually used is decided during extraction by the include* flags.
 */
public LexDepFeatureExtractor() {
// init lazy feature extractors
// head-dep order: word/word, pos/pos, word/pos, pos/word
depOrderExtractors.add(head_dep_order_words());
depOrderExtractors.add(head_dep_order_pos());
depOrderExtractors.add(head_dep_order_word_pos());
depOrderExtractors.add(head_dep_order_pos_word());
// siblings order: unconditional extractors go in sibsOrderExtractors,
// those subject to a test go in sibsOrderCondExtractors
sibsOrderExtractors.add(sibs_precedence_words());
sibsOrderExtractors.add(sibs_precedence_word_pos());
sibsOrderExtractors.add(sibs_precedence_pos_word());
sibsOrderCondExtractors.add(sibs_precedence_pos());
sibsOrderExtractors.add(sibs_precedence_word_class());
sibsOrderExtractors.add(sibs_precedence_class_word());
sibsOrderCondExtractors.add(sibs_precedence_class());
sibsOrderCondExtractors.add(sibs_precedence_rels());
sibsOrderCondExtractors.add(sibs_precedence_defs());
// init complexity feature extractors
sibsComplexityOrderCondExtractors.add(sibs_precedence_verbs());
sibsComplexityOrderCondExtractors.add(sibs_precedence_puncts());
// init short-long feature evaluators
shortLongCondEvaluators.add(short_long_words());
// init dep len keys
depLenKeys.add(DEPLEN);
}
/**
 * Sets the alphabet used to look up features.
 * When the alphabet actually changes, the cached global dependency length
 * feature is discarded, since it was looked up in the previous alphabet.
 *
 * @param alphabet the alphabet to use from now on
 */
public void setAlphabet(Alphabet alphabet) {
    // a feature cached from another alphabet must not leak into maps built with the new one
    if (this.alphabet != alphabet) depLenFeat = null;
    this.alphabet = alphabet;
}
/**
 * Extracts the features for the given sign, computing them (and those of
 * its inputs) first if the sign does not carry a feature map yet.
 *
 * @param sign the sign to extract features from
 * @param complete the completeness flag, passed on to addFeatures
 * @return the feature map stored in the sign for this extractor
 */
public FeatureVector extractFeatures(Sign sign, boolean complete) {
    addFeatures(sign, complete);
    FeatureMap stored = getFeatureMap(sign);
    return stored;
}
/**
 * Recursively adds features to the feature map for the given sign, if not already present.
 * A lexical sign gets an empty map. A derived sign starts from the map(s) of its input
 * signs and then adds features for each dependency newly filled in the sign: head-dep
 * order features, optionally the global dependency length, and, for each sibling
 * dependency with the same head and on the same side of the head, siblings order
 * features plus optional complexity and short-long features.
 * The resulting map is stored in the sign. The extraction state is kept in the
 * current* fields, which the lazy key extractors read.
 *
 * @param sign the sign to add features for
 * @param complete the completeness flag; it is not consulted by this implementation
 */
protected void addFeatures(Sign sign, boolean complete) {
// check for existing map, otherwise make one
if (getFeatureMap(sign) != null) return;
// lex case
if (sign.isLexical()) {
currentSign = sign;
currentMap = new FeatureMap(0);
//inc(lexExtractors);
}
// non-terminal
else {
Sign[] inputs = sign.getDerivationHistory().getInputs();
// first recurse
for (Sign child : inputs) addFeatures(child, false);
// use input maps in making current map
// (the current* fields are set only after recursing, since the recursion overwrites them)
currentSign = sign;
currentInputs = inputs;
// NOTE(review): with an input count other than 1 or 2, currentMap is left as the
// recursion above set it -- presumably that case never arises; confirm.
if (inputs.length == 1) {
currentMap = new FeatureMap(getFeatureMap(inputs[0]));
}
else if (inputs.length == 2) {
currentMap = new FeatureMap(getFeatureMap(inputs[0]), getFeatureMap(inputs[1]));
}
// do each newly filled dep
for (LexDependency dep : sign.getFilledDeps()) {
currentDep = dep;
currentHeadBroadPOS = getHeadBroadPOS(dep);
setDepIndexes(dep);
currentHeadDepOrder = getHeadDepOrder();
inc(depOrderExtractors);
// do dep len
if (includeDepLen) {
Alphabet.Feature f = getDepLenFeat();
if (f != null) currentMap.add(f, (float)depLen());
}
// do order for each sib on the same side of the same head
for (LexDependency sib : sign.getSiblingFilledDeps()) {
currentSib = sib;
// skip sibs with a different head (identity comparison of the head signs)
if (dep.lexHead != sib.lexHead) continue;
setSibIndex(sib);
// skip sibs on the other side of the head; both operands are order constants
if (currentHeadDepOrder != getHeadSibOrder()) continue;
currentSibsPrecedence = sibPrecedenceForDep(currentHeadDepOrder);
currentDepPrecedesSib = depPrecedesSib();
inc(sibsOrderExtractors);
incCond(sibsOrderCondExtractors);
// do complexity, length feats
if (includeComplexityFeats || includeShortLong) {
setLengthsDiff();
// the diff is null when either the dep or the sib sign is unavailable
if (currentLengthsDiff != null) {
if (includeComplexityFeats) incCond(sibsComplexityOrderCondExtractors);
if (includeShortLong) addCond(shortLongCondEvaluators);
}
}
}
}
}
// store it
storeFeatureMap(sign);
}
/**
 * Attaches the current feature map to the given sign as a data object,
 * wrapped so that it can later be retrieved by its wrapper class.
 */
protected void storeFeatureMap(Sign sign) {
    FeatureMapWrapper wrapper = new FeatureMapWrapper(currentMap);
    sign.addData(wrapper);
}
/**
 * Looks up this extractor's feature map among the data objects of the given sign.
 *
 * @return the stored feature map, or null if the sign has none
 */
protected FeatureMap getFeatureMap(Sign sign) {
    FeatureMapWrapper wrapper = (FeatureMapWrapper) sign.getData(FeatureMapWrapper.class);
    if (wrapper == null) {
        return null;
    }
    return wrapper.featureMap;
}
/**
 * Bumps the count in the current map of each feature spelled out by the
 * given lazy extractors; extractors for which the alphabet returns no
 * feature are skipped.
 */
protected void inc(List<List<TrieMap.KeyExtractor<String>>> extractors) {
    for (List<TrieMap.KeyExtractor<String>> keys : extractors) {
        Alphabet.Feature feat = alphabet.indexLazy(keys);
        if (feat == null) continue;
        currentMap.inc(feat);
    }
}
/**
 * Bumps the count in the current map of each conditional feature whose test
 * passes; features for which the alphabet returns nothing are skipped.
 */
protected void incCond(List<ConditionalLazyExtractor> condExtractors) {
    for (ConditionalLazyExtractor candidate : condExtractors) {
        if (!candidate.test()) continue;
        Alphabet.Feature feat = alphabet.indexLazy(candidate.lazyExtractor);
        if (feat == null) continue;
        currentMap.inc(feat);
    }
}
/**
 * Adds the evaluated value of each conditional feature whose test passes
 * to the current map; features for which the alphabet returns nothing are
 * skipped without being evaluated.
 */
protected void addCond(List<ConditionalLazyEvaluator> condEvaluators) {
    for (ConditionalLazyEvaluator candidate : condEvaluators) {
        if (!candidate.test()) continue;
        Alphabet.Feature feat = alphabet.indexLazy(candidate.lazyExtractor);
        if (feat == null) continue;
        float value = candidate.eval();
        currentMap.add(feat, value);
    }
}
//------------------------------------
// utility functions
// returns the head's POS tag cut down to its first two chars (interned);
// tags of two chars or fewer are returned unchanged
private String getHeadBroadPOS(LexDependency dep) {
    String headPos = dep.lexHead.getPOS();
    if (headPos.length() <= 2) {
        return headPos;
    }
    return headPos.substring(0, 2).intern();
}
// records, for the given dep within the current sign, the word indexes of
// its head and dependent, and the sign headed by the dependent
private void setDepIndexes(LexDependency dep) {
    Sign host = currentSign;
    currentHeadIndex = host.wordIndex(dep.lexHead);
    currentDepIndex = host.wordIndex(dep.lexDep);
    currentDepSign = host.getSignHeadedByDep(dep);
}
// returns the order constant for the current head and dependent indexes
private String getHeadDepOrder() {
    if (currentHeadIndex < currentDepIndex) {
        return HEAD_PRECEDES_DEP;
    }
    return DEP_PRECEDES_HEAD;
}
// records, for the given sibling dep within the current sign, the word index
// of its dependent and the sign headed by that dependent
private void setSibIndex(LexDependency sib) {
    Sign host = currentSign;
    currentSibIndex = host.wordIndex(sib.lexDep);
    currentSibSign = host.getSignHeadedByDep(sib);
}
// returns the order constant for the current head and sibling dependent indexes
private String getHeadSibOrder() {
    if (currentHeadIndex < currentSibIndex) {
        return HEAD_PRECEDES_DEP;
    }
    return DEP_PRECEDES_HEAD;
}
// true iff the current dependent's word index is lower than the current sibling's
private boolean depPrecedesSib() {
    boolean depFirst = currentDepIndex < currentSibIndex;
    return depFirst;
}
//------------------------------------
// definiteness functions
/**
 * Holder for the definiteness status of a sign that is, or immediately
 * contains, an NP; instances are cached among a sign's data objects.
 */
public static class DefiniteNP {
    /** Definiteness value; null means not an NP (or NP parent). */
    public Boolean def;

    /** Creates a holder with the given definiteness value (may be null). */
    public DefiniteNP(Boolean def) {
        this.def = def;
    }

    /** Returns "defNP: " followed by the definiteness value. */
    public String toString() {
        StringBuilder sb = new StringBuilder("defNP: ");
        sb.append(def);
        return sb.toString();
    }
}
/**
 * Determines the definite NP status of a sign, reusing the status cached in
 * the sign when there is one and caching a newly computed status otherwise.
 * A sign that neither is nor immediately contains an NP gets a null status.
 * An NP counts as definite unless one of its semantic features or one of its
 * (sibling) filled dependencies signals indefiniteness.
 *
 * @param sign the sign to check
 * @return the (possibly cached) definite NP status
 */
public DefiniteNP getDefiniteNP(Sign sign) {
    // reuse an earlier result if the sign carries one
    DefiniteNP cached = (DefiniteNP) sign.getData(DefiniteNP.class);
    if (cached != null) return cached;
    Boolean status = null;
    Sign np = getSignOrChildSignAsNP(sign);
    if (np != null) {
        boolean indef = false;
        // first look for an indefinite semantic feature on the NP's head
        Nominal nom = np.getCategory().getIndexNominal();
        List<SatOp> feats = HyloHelper.getSemFeatsForHead(nom, np.getCategory().getLF());
        for (Iterator<SatOp> it = feats.iterator(); !indef && it.hasNext(); ) {
            indef = isIndefFeat(it.next());
        }
        // failing that, look for an indefinite determiner among all the deps
        if (!indef) {
            List<LexDependency> deps = new ArrayList<LexDependency>(5);
            deps.addAll(np.getFilledDeps());
            deps.addAll(np.getSiblingFilledDeps());
            for (Iterator<LexDependency> it = deps.iterator(); !indef && it.hasNext(); ) {
                indef = isIndefDep(it.next());
            }
        }
        status = indef ? Boolean.FALSE : Boolean.TRUE;
    }
    // cache and return
    DefiniteNP result = new DefiniteNP(status);
    sign.addData(result);
    return result;
}
/**
 * Finds the NP to use for the given sign: the sign itself if it is an NP,
 * else the first of its input signs that is an NP.
 *
 * @return the NP sign, or null if the sign is not an NP and is either
 *         lexical or has no NP among its inputs
 */
protected Sign getSignOrChildSignAsNP(Sign sign) {
    if (isNP(sign)) return sign;
    if (sign.isLexical()) return null;
    for (Sign input : sign.getDerivationHistory().getInputs()) {
        if (isNP(input)) return input;
    }
    return null;
}
/**
 * Tells whether the given sign is an NP.
 * By default, this holds when the sign's category is an atomic category of type "np".
 */
protected boolean isNP(Sign sign) {
    Category cat = sign.getCategory();
    if (cat instanceof AtomCat) {
        String type = ((AtomCat) cat).getType();
        return type.equals("np");
    }
    return false;
}
/**
 * Returns whether the given semantic feature signals indefiniteness.
 * The default implementation tests for a "det" relation with the value "nil".
 *
 * @param feat the semantic feature to test
 * @return true iff the feature's relation is "det" and its value is "nil";
 *         a feature for which no relation is returned never matches
 */
protected boolean isIndefFeat(SatOp feat) {
    // literal-first comparisons, so a missing relation or value yields false instead of an NPE
    return "det".equals(HyloHelper.getRel(feat)) && "nil".equals(HyloHelper.getVal(feat));
}
/**
 * Returns whether the given lexical dependency signals indefiniteness.
 * The default implementation checks for a "Det" relation (ignoring case)
 * whose dependent has the word form a, an, any or some.
 *
 * @param dep the lexical dependency to test
 * @return true iff the dependency is an indefinite determiner dependency
 */
protected boolean isIndefDep(LexDependency dep) {
    if (!"Det".equalsIgnoreCase(dep.rel)) return false;
    String form = dep.lexDep.getWordForm();
    // compare by value, so the check does not depend on word forms being interned
    return "a".equals(form) || "an".equals(form) || "any".equals(form) || "some".equals(form);
}
/**
 * Returns whether two signs differ in definiteness.
 *
 * @param sign1 the first sign
 * @param sign2 the second sign
 * @return true iff both signs have a definiteness value (i.e. are or
 *         immediately contain an NP) and the two values differ
 */
public boolean defDifference(Sign sign1, Sign sign2) {
    DefiniteNP defNP1 = getDefiniteNP(sign1);
    if (defNP1.def == null) return false;
    DefiniteNP defNP2 = getDefiniteNP(sign2);
    if (defNP2.def == null) return false;
    // compare the Boolean values rather than the object references, since a
    // DefiniteNP may have been built with a Boolean other than the canonical TRUE/FALSE
    return !defNP1.def.equals(defNP2.def);
}
/**
 * Maps a definite NP status to its feature constant: DEF for a definite
 * status, INDEF for an indefinite one. The status must have a non-null
 * definiteness value.
 */
public String defConstant(DefiniteNP defNP) {
    boolean definite = defNP.def;
    if (definite) {
        return DEF;
    }
    return INDEF;
}
//------------------------------------
// phrase length functions
/**
 * Holder for the length of a phrase counted in words, punctuation marks and
 * (finite) verbs; instances are cached among a sign's data objects and
 * retrieved by this class.
 */
public static class PhraseLengths {
    /** Number of words. */
    public int wordlen;
    /** Number of punctuation marks. */
    public int punctlen;
    /** Number of (finite) verbs. */
    public int verblen;

    /** Creates a holder with the given word, punctuation and verb counts. */
    public PhraseLengths(int wordlen, int punctlen, int verblen) {
        this.wordlen = wordlen;
        this.punctlen = punctlen;
        this.verblen = verblen;
    }

    /** Lists the three counts, each preceded by its label. */
    public String toString() {
        StringBuilder sb = new StringBuilder();
        sb.append(" wordlen: ").append(wordlen);
        sb.append(" punctlen: ").append(punctlen);
        sb.append(" verblen: ").append(verblen);
        return sb.toString();
    }
}
/**
 * Returns the phrase lengths for a sign, caching them in the sign.
 * For a lexical sign, the sign's words are counted directly, using isPunct
 * and isVerb for the punctuation and verb counts; for a derived sign, the
 * (recursively computed) lengths of its input signs are summed.
 *
 * @param sign the sign to measure
 * @return the (possibly cached) word, punctuation and verb counts of the sign
 */
public PhraseLengths getPhraseLengths(Sign sign) {
    // check cached
    PhraseLengths lengths = (PhraseLengths) sign.getData(PhraseLengths.class);
    if (lengths != null) return lengths;
    int wordlen = 0, punctlen = 0, verblen = 0;
    // lex case
    if (sign.isLexical()) {
        for (Word w : sign.getWords()) {
            wordlen++;
            if (isPunct(w)) punctlen++;
            if (isVerb(w)) verblen++;
        }
    }
    // non-lex: add child lengths
    else {
        for (Sign input : sign.getDerivationHistory().getInputs()) {
            PhraseLengths inputLengths = getPhraseLengths(input);
            wordlen += inputLengths.wordlen;
            punctlen += inputLengths.punctlen;
            // the verb count accumulates the inputs' verb counts (not their punctuation counts)
            verblen += inputLengths.verblen;
        }
    }
    // store result and return
    lengths = new PhraseLengths(wordlen, punctlen, verblen);
    sign.addData(lengths);
    return lengths;
}
/**
 * Returns whether a word is a punctuation mark that typically signals sentence-internal complexity.
 * The default implementation tests for commas, dashes (--), semi-colons and colons.
 *
 * @param word the word to test
 * @return true iff the word's form is one of the four punctuation marks
 */
protected boolean isPunct(Word word) {
    // NB: in principle could use POS, but sometimes punctuation marks seem to end up with IN as the POS tag
    String form = word.getForm();
    // compare by value, so the check does not depend on word forms being interned
    return ",".equals(form) || "--".equals(form) || ";".equals(form) || ":".equals(form);
}
/**
 * Returns whether a word is a verb that indicates a substantial clause.
 * The default implementation tests for the finite verb POS tags VBD, VBP and VBZ.
 *
 * @param word the word to test
 * @return true iff the word's POS tag is one of the three finite verb tags
 */
protected boolean isVerb(Word word) {
    String pos = word.getPOS();
    // compare by value, so the check does not depend on POS tags being interned
    return "VBD".equals(pos) || "VBP".equals(pos) || "VBZ".equals(pos);
}
/**
 * Computes the phrase lengths of the signs headed by the current dep and sib,
 * along with their difference, taken as the lengths of whichever sign comes
 * second minus those of the one that comes first. If either sign is missing,
 * the dep lengths, sib lengths and difference are all left null.
 */
protected void setLengthsDiff() {
    // start from a clean slate
    currentDepLengths = null;
    currentSibLengths = null;
    currentLengthsDiff = null;
    // nothing to compare unless both signs are available
    if (currentDepSign == null || currentSibSign == null) return;
    currentDepLengths = getPhraseLengths(currentDepSign);
    currentSibLengths = getPhraseLengths(currentSibSign);
    // order the two by surface position
    PhraseLengths first;
    PhraseLengths second;
    if (currentDepPrecedesSib) {
        first = currentDepLengths;
        second = currentSibLengths;
    } else {
        first = currentSibLengths;
        second = currentDepLengths;
    }
    // second minus first
    int wordDiff = second.wordlen - first.wordlen;
    int punctDiff = second.punctlen - first.punctlen;
    int verbDiff = second.verblen - first.verblen;
    currentLengthsDiff = new PhraseLengths(wordDiff, punctDiff, verbDiff);
}
//------------------------------------
// dep len functions
/**
 * Returns the global dependency length feature, looking it up in the
 * alphabet (and caching the outcome) if no feature is cached yet.
 * The result is whatever the alphabet returned, which may be null.
 */
protected Alphabet.Feature getDepLenFeat() {
    if (depLenFeat != null) {
        return depLenFeat;
    }
    depLenFeat = alphabet.index(depLenKeys);
    return depLenFeat;
}
/**
 * Measures the dependency length between the current head and dependent.
 * By default this is the number of words of the current sign that lie
 * strictly between the two and are not punctuation according to isPunct;
 * each entry of the sign's word list counts once.
 */
protected int depLen() {
    List<Word> words = currentSign.getWords();
    int left, right;
    if (currentHeadIndex <= currentDepIndex) {
        left = currentHeadIndex;
        right = currentDepIndex;
    } else {
        left = currentDepIndex;
        right = currentHeadIndex;
    }
    int len = 0;
    int pos = left + 1;
    while (pos < right) {
        if (!isPunct(words.get(pos))) len++;
        pos++;
    }
    return len;
}
//------------------------------------
// word class functions
/**
 * Returns a class for the word, or CLASS_NIL if none.
 * The default implementation returns one of the following, in this order:
 * the semantic class of the word;
 * CLASS_PRO, if a pronoun;
 * CLASS_COLOR, if a color word;
 * the suffix, if getSuffix returns a value;
 * CLASS_HYPH, if the word is hyphenated;
 * CLASS_CAP, if capitalized;
 * or CLASS_NIL, otherwise.
 * A word with an empty form is never treated as capitalized.
 * The word class is cached using cachedWordClasses.
 *
 * @param word the word to classify
 * @return the (possibly cached) word class
 */
protected String getWordClass(Word word) {
    String retval = cachedWordClasses.get(word);
    if (retval != null) return retval;
    String wClass = word.getSemClass();
    if (wClass != null) return updateCachedWordClasses(word, wClass);
    if (isPro(word)) return updateCachedWordClasses(word, CLASS_PRO);
    String form = word.getForm();
    if (colors.contains(form)) return updateCachedWordClasses(word, CLASS_COLOR);
    String suffix = getSuffix(form);
    if (suffix != null) return updateCachedWordClasses(word, suffix);
    if (form.indexOf('-') >= 0) return updateCachedWordClasses(word, CLASS_HYPH);
    // guard against an empty form, which has no first char to test
    if (form.length() > 0 && Character.isUpperCase(form.charAt(0))) return updateCachedWordClasses(word, CLASS_CAP);
    return updateCachedWordClasses(word, CLASS_NIL);
}
/**
 * Tells whether a word is a pronoun.
 * By default, a word counts as a pronoun when its POS tag begins with "PR".
 */
protected boolean isPro(Word word) {
    String pos = word.getPOS();
    return pos.startsWith("PR");
}
/**
 * The set of color words to check for in determining the word class;
 * initialized from defaultColors and replaceable via setColorWords.
 */
protected Set<String> colors = defaultColors();
/**
 * Replaces the set of color words consulted when determining word classes.
 * The given set is used as is (not copied).
 */
public void setColorWords(Set<String> colorWords) {
    this.colors = colorWords;
}
/**
 * Builds the default set of color words: 11 common English colors,
 * with gray listed under both of its spellings (gray/grey).
 */
protected Set<String> defaultColors() {
    List<String> names = Arrays.asList(
        "black", "blue", "brown", "gray", "grey", "green",
        "orange", "pink", "purple", "red", "white", "yellow"
    );
    return new HashSet<String>(names);
}
/**
 * A sequence of suffixes to check for in determining the word class, ordered by specificity;
 * getSuffix returns the first one that matches, so earlier entries take priority.
 */
protected String[] suffixClasses = defaultSuffixClasses();
/**
 * Returns the default suffix classes: 61 common English suffixes from various lists on the web.
 * The suffixes are listed longest first (four letters down to one), so that a
 * first-match search prefers the longest suffix.
 */
protected String[] defaultSuffixClasses() {
return new String[] {
// four-letter suffixes
"ancy", "aphy", "arch", "crat", "gram", "less", "logy", "ness", "nomy", "ship", "some", "sque", "tude",
// three-letter suffixes
"ade", "age", "ant", "aph", "ary", "ast", "ate", "ble", "dom", "ent", "est", "ful",
"ian", "ile", "ion", "ing", "ish", "ism", "ist", "ise", "ite", "ium", "ive", "ize",
"nce", "oid", "ory", "ose", "ote", "ous", "sig", "ure",
// two-letter suffixes
"ac", "al", "an", "cy", "ed", "en", "er", "fy", "ic", "le", "ly", "or", "se", "sy", "ty",
// one-letter suffix
"y"
};
}
/**
 * Replaces the suffix classes consulted when determining word classes.
 * The given strings are assumed to be interned; the array is used as is (not copied).
 */
public void setSuffixClasses(String[] suffixes) {
    this.suffixClasses = suffixes;
}
/**
 * Finds the first suffix class that the given form ends with and is strictly
 * longer than (a form consisting of just the suffix does not match).
 *
 * @return the matching suffix class, or null if none matches
 */
protected String getSuffix(String form) {
    for (String candidate : suffixClasses) {
        boolean longer = form.length() > candidate.length();
        if (longer && form.endsWith(candidate)) {
            return candidate;
        }
    }
    return null;
}
/**
 * Cache of word classes, using a weak hash map;
 * read by getWordClass and filled via updateCachedWordClasses.
 */
protected WeakHashMap<Word,String> cachedWordClasses = new WeakHashMap<Word,String>();
/**
 * Records the given class for the given word in the word class cache.
 *
 * @return the word class that was passed in, for convenient chaining
 */
protected String updateCachedWordClasses(Word word, String wordClass) {
    cachedWordClasses.put(word, wordClass);
    return wordClass;
}
//------------------------------------
// shared feature extractor elements
// appends the two prefix keys: the "ld" prefix, then the current head's broad POS
private void add_prefix(List<TrieMap.KeyExtractor<String>> retval) {
    TrieMap.KeyExtractor<String> prefixKey = new TrieMap.KeyExtractor<String>() {
        public String getKey() { return PREFIX; }
    };
    TrieMap.KeyExtractor<String> broadPosKey = new TrieMap.KeyExtractor<String>() {
        public String getKey() { return currentHeadBroadPOS; }
    };
    retval.add(prefixKey);
    retval.add(broadPosKey);
}
// appends a key for the current head-dep order constant
private void add_head_dep_order(List<TrieMap.KeyExtractor<String>> retval) {
    TrieMap.KeyExtractor<String> orderKey = new TrieMap.KeyExtractor<String>() {
        public String getKey() { return currentHeadDepOrder; }
    };
    retval.add(orderKey);
}
// appends a key for the (interned) relation of the current dep
private void add_rel(List<TrieMap.KeyExtractor<String>> retval) {
    TrieMap.KeyExtractor<String> relKey = new TrieMap.KeyExtractor<String>() {
        public String getKey() {
            String rel = currentDep.rel;
            return rel.intern();
        }
    };
    retval.add(relKey);
}
// appends the keys shared by all head-dep order features:
// the prefix keys, the head-dep order and the relation, in that sequence
private void add_head_dep_order_common(List<TrieMap.KeyExtractor<String>> retval) {
    add_prefix(retval);            // "ld" + head broad POS
    add_head_dep_order(retval);    // hpd / dph
    add_rel(retval);               // dependency relation
}
// appends a key for the word form of the current dep's head
private void add_head_word(List<TrieMap.KeyExtractor<String>> retval) {
    TrieMap.KeyExtractor<String> headWordKey = new TrieMap.KeyExtractor<String>() {
        public String getKey() {
            Sign head = currentDep.lexHead;
            return head.getWordForm();
        }
    };
    retval.add(headWordKey);
}
// appends a key for the POS tag of the current dep's head
private void add_head_pos(List<TrieMap.KeyExtractor<String>> retval) {
    TrieMap.KeyExtractor<String> headPosKey = new TrieMap.KeyExtractor<String>() {
        public String getKey() {
            Sign head = currentDep.lexHead;
            return head.getPOS();
        }
    };
    retval.add(headPosKey);
}
// appends a key for the word form of the current dep's dependent
private void add_dep_word(List<TrieMap.KeyExtractor<String>> retval) {
    TrieMap.KeyExtractor<String> depWordKey = new TrieMap.KeyExtractor<String>() {
        public String getKey() {
            Sign dependent = currentDep.lexDep;
            return dependent.getWordForm();
        }
    };
    retval.add(depWordKey);
}
// appends a key for the POS tag of the current dep's dependent
private void add_dep_pos(List<TrieMap.KeyExtractor<String>> retval) {
    TrieMap.KeyExtractor<String> depPosKey = new TrieMap.KeyExtractor<String>() {
        public String getKey() {
            Sign dependent = currentDep.lexDep;
            return dependent.getPOS();
        }
    };
    retval.add(depPosKey);
}
// appends a key for the current siblings precedence constant
private void add_sibs_precedence(List<TrieMap.KeyExtractor<String>> retval) {
    TrieMap.KeyExtractor<String> precedenceKey = new TrieMap.KeyExtractor<String>() {
        public String getKey() { return currentSibsPrecedence; }
    };
    retval.add(precedenceKey);
}
// appends the keys shared by all siblings precedence features:
// the prefix keys followed by the siblings precedence constant
private void add_sibs_precedence_common(List<TrieMap.KeyExtractor<String>> retval) {
    add_prefix(retval);             // "ld" + head broad POS
    add_sibs_precedence(retval);    // lsp / rsp
}
// appends a key for the word form of whichever sibling dependent comes first
private void add_sibs_word1(List<TrieMap.KeyExtractor<String>> retval) {
    retval.add(new TrieMap.KeyExtractor<String>() {
        public String getKey() {
            LexDependency first = currentDepPrecedesSib ? currentDep : currentSib;
            return first.lexDep.getWordForm();
        }
    });
}
// appends a key for the word form of whichever sibling dependent comes second
private void add_sibs_word2(List<TrieMap.KeyExtractor<String>> retval) {
    retval.add(new TrieMap.KeyExtractor<String>() {
        public String getKey() {
            LexDependency second = currentDepPrecedesSib ? currentSib : currentDep;
            return second.lexDep.getWordForm();
        }
    });
}
// appends a key for the POS tag of whichever sibling dependent comes first
private void add_sibs_pos1(List<TrieMap.KeyExtractor<String>> retval) {
    retval.add(new TrieMap.KeyExtractor<String>() {
        public String getKey() {
            LexDependency first = currentDepPrecedesSib ? currentDep : currentSib;
            return first.lexDep.getPOS();
        }
    });
}
// appends a key for the POS tag of whichever sibling dependent comes second
private void add_sibs_pos2(List<TrieMap.KeyExtractor<String>> retval) {
    retval.add(new TrieMap.KeyExtractor<String>() {
        public String getKey() {
            LexDependency second = currentDepPrecedesSib ? currentSib : currentDep;
            return second.lexDep.getPOS();
        }
    });
}
// appends a key for the word class of the first word of whichever sibling dependent comes first
private void add_sibs_class1(List<TrieMap.KeyExtractor<String>> retval) {
    retval.add(new TrieMap.KeyExtractor<String>() {
        public String getKey() {
            LexDependency firstDep = currentDepPrecedesSib ? currentDep : currentSib;
            Word firstWord = firstDep.lexDep.getWords().get(0);
            return getWordClass(firstWord);
        }
    });
}
// appends a key for the word class of the first word of whichever sibling dependent comes second
private void add_sibs_class2(List<TrieMap.KeyExtractor<String>> retval) {
    retval.add(new TrieMap.KeyExtractor<String>() {
        public String getKey() {
            LexDependency secondDep = currentDepPrecedesSib ? currentSib : currentDep;
            Word firstWord = secondDep.lexDep.getWords().get(0);
            return getWordClass(firstWord);
        }
    });
}
// appends a key for the (interned) relation of whichever sibling dependency comes first
private void add_sibs_rel1(List<TrieMap.KeyExtractor<String>> retval) {
    retval.add(new TrieMap.KeyExtractor<String>() {
        public String getKey() {
            LexDependency first = currentDepPrecedesSib ? currentDep : currentSib;
            return first.rel.intern();
        }
    });
}
// appends a key for the (interned) relation of whichever sibling dependency comes second
private void add_sibs_rel2(List<TrieMap.KeyExtractor<String>> retval) {
    retval.add(new TrieMap.KeyExtractor<String>() {
        public String getKey() {
            LexDependency second = currentDepPrecedesSib ? currentSib : currentDep;
            return second.rel.intern();
        }
    });
}
//-------------------------------
// feature extractors
// head-dep order feature keyed on the head word and the dep word
private List<TrieMap.KeyExtractor<String>> head_dep_order_words() {
    List<TrieMap.KeyExtractor<String>> keys = new ArrayList<TrieMap.KeyExtractor<String>>(5);
    add_head_dep_order_common(keys);
    add_head_word(keys);
    add_dep_word(keys);
    return keys;
}
// head-dep order feature keyed on the head POS and the dep POS
private List<TrieMap.KeyExtractor<String>> head_dep_order_pos() {
    List<TrieMap.KeyExtractor<String>> keys = new ArrayList<TrieMap.KeyExtractor<String>>(5);
    add_head_dep_order_common(keys);
    add_head_pos(keys);
    add_dep_pos(keys);
    return keys;
}
// head-dep order feature keyed on the head word and the dep POS
private List<TrieMap.KeyExtractor<String>> head_dep_order_word_pos() {
    List<TrieMap.KeyExtractor<String>> keys = new ArrayList<TrieMap.KeyExtractor<String>>(5);
    add_head_dep_order_common(keys);
    add_head_word(keys);
    add_dep_pos(keys);
    return keys;
}
// head-dep order feature keyed on the head POS and the dep word
private List<TrieMap.KeyExtractor<String>> head_dep_order_pos_word() {
    List<TrieMap.KeyExtractor<String>> keys = new ArrayList<TrieMap.KeyExtractor<String>>(5);
    add_head_dep_order_common(keys);
    add_head_pos(keys);
    add_dep_word(keys);
    return keys;
}
// siblings precedence feature keyed on the first and second sibling words
private List<TrieMap.KeyExtractor<String>> sibs_precedence_words() {
    List<TrieMap.KeyExtractor<String>> keys = new ArrayList<TrieMap.KeyExtractor<String>>(5);
    add_sibs_precedence_common(keys);
    add_sibs_word1(keys);
    add_sibs_word2(keys);
    return keys;
}
// siblings precedence feature keyed on the first sibling's word and the second sibling's POS
private List<TrieMap.KeyExtractor<String>> sibs_precedence_word_pos() {
    List<TrieMap.KeyExtractor<String>> keys = new ArrayList<TrieMap.KeyExtractor<String>>(5);
    add_sibs_precedence_common(keys);
    add_sibs_word1(keys);
    add_sibs_pos2(keys);
    return keys;
}
// siblings precedence feature keyed on the first sibling's POS and the second sibling's word
private List<TrieMap.KeyExtractor<String>> sibs_precedence_pos_word() {
    List<TrieMap.KeyExtractor<String>> keys = new ArrayList<TrieMap.KeyExtractor<String>>(5);
    add_sibs_precedence_common(keys);
    add_sibs_pos1(keys);
    add_sibs_word2(keys);
    return keys;
}
// sibs precedence pos: siblings precedence feature keyed on the first and second
// sibling POS tags, extracted only when the two siblings' POS tags differ
private ConditionalLazyExtractor sibs_precedence_pos() {
    ConditionalLazyExtractor retval = new ConditionalLazyExtractor() {
        boolean test() {
            String depPos = currentDep.lexDep.getPOS();
            String sibPos = currentSib.lexDep.getPOS();
            // compare by value, so the test does not depend on POS tags being interned
            return (depPos == null) ? (sibPos != null) : !depPos.equals(sibPos);
        }
    };
    add_sibs_precedence_common(retval.lazyExtractor);
    add_sibs_pos1(retval.lazyExtractor);
    add_sibs_pos2(retval.lazyExtractor);
    return retval;
}
// siblings precedence feature keyed on the first sibling's word and the second sibling's word class
private List<TrieMap.KeyExtractor<String>> sibs_precedence_word_class() {
    List<TrieMap.KeyExtractor<String>> keys = new ArrayList<TrieMap.KeyExtractor<String>>(5);
    add_sibs_precedence_common(keys);
    add_sibs_word1(keys);
    add_sibs_class2(keys);
    return keys;
}
// siblings precedence feature keyed on the first sibling's word class and the second sibling's word
private List<TrieMap.KeyExtractor<String>> sibs_precedence_class_word() {
    List<TrieMap.KeyExtractor<String>> keys = new ArrayList<TrieMap.KeyExtractor<String>>(5);
    add_sibs_precedence_common(keys);
    add_sibs_class1(keys);
    add_sibs_word2(keys);
    return keys;
}
// sibs precedence class: siblings precedence feature keyed on the first and second
// sibling word classes, extracted only when the two siblings' word classes differ
private ConditionalLazyExtractor sibs_precedence_class() {
    ConditionalLazyExtractor retval = new ConditionalLazyExtractor() {
        boolean test() {
            String depClass = getWordClass(currentDep.lexDep.getWords().get(0));
            String sibClass = getWordClass(currentSib.lexDep.getWords().get(0));
            // compare by value, so the test also works for semantic classes or
            // custom suffix classes that are not interned
            return (depClass == null) ? (sibClass != null) : !depClass.equals(sibClass);
        }
    };
    add_sibs_precedence_common(retval.lazyExtractor);
    add_sibs_class1(retval.lazyExtractor);
    add_sibs_class2(retval.lazyExtractor);
    return retval;
}
// siblings precedence feature keyed on the first and second sibling relations,
// extracted only when the two relations differ
private ConditionalLazyExtractor sibs_precedence_rels() {
    ConditionalLazyExtractor cond = new ConditionalLazyExtractor() {
        boolean test() {
            boolean sameRel = currentDep.rel.equals(currentSib.rel);
            return !sameRel;
        }
    };
    List<TrieMap.KeyExtractor<String>> keys = cond.lazyExtractor;
    add_sibs_precedence_common(keys);
    add_sibs_rel1(keys);
    add_sibs_rel2(keys);
    return cond;
}
// siblings precedence feature keyed on the definiteness constants of the first and
// second sibling signs, extracted only when both signs are available and differ in definiteness
private ConditionalLazyExtractor sibs_precedence_defs() {
    ConditionalLazyExtractor cond = new ConditionalLazyExtractor() {
        boolean test() {
            boolean bothPresent = currentDepSign != null && currentSibSign != null;
            return bothPresent && defDifference(currentDepSign, currentSibSign);
        }
    };
    List<TrieMap.KeyExtractor<String>> keys = cond.lazyExtractor;
    add_sibs_precedence_common(keys);
    // definiteness of whichever sign comes first
    keys.add(new TrieMap.KeyExtractor<String>() {
        public String getKey() {
            Sign first = currentDepPrecedesSib ? currentDepSign : currentSibSign;
            return defConstant(getDefiniteNP(first));
        }
    });
    // definiteness of whichever sign comes second
    keys.add(new TrieMap.KeyExtractor<String>() {
        public String getKey() {
            Sign second = currentDepPrecedesSib ? currentSibSign : currentDepSign;
            return defConstant(getDefiniteNP(second));
        }
    });
    return cond;
}
// siblings precedence feature keyed on verb presence/absence in the first and second
// sibling phrases, extracted only when their verb counts differ and one of them has no verbs
private ConditionalLazyExtractor sibs_precedence_verbs() {
    ConditionalLazyExtractor cond = new ConditionalLazyExtractor() {
        boolean test() {
            if (currentLengthsDiff.verblen == 0) return false;
            return currentDepLengths.verblen == 0 || currentSibLengths.verblen == 0;
        }
    };
    List<TrieMap.KeyExtractor<String>> keys = cond.lazyExtractor;
    add_sibs_precedence_common(keys);
    // first phrase: verbless iff the second phrase has more verbs
    keys.add(new TrieMap.KeyExtractor<String>() {
        public String getKey() {
            boolean secondHasMore = currentLengthsDiff.verblen > 0;
            if (secondHasMore) return NO_V;
            return HAS_V;
        }
    });
    // second phrase: the opposite value
    keys.add(new TrieMap.KeyExtractor<String>() {
        public String getKey() {
            boolean secondHasMore = currentLengthsDiff.verblen > 0;
            if (secondHasMore) return HAS_V;
            return NO_V;
        }
    });
    return cond;
}
// siblings precedence feature keyed on punct presence/absence in the first and second
// sibling phrases, extracted only when their punct counts differ and one of them has no puncts
private ConditionalLazyExtractor sibs_precedence_puncts() {
    ConditionalLazyExtractor cond = new ConditionalLazyExtractor() {
        boolean test() {
            if (currentLengthsDiff.punctlen == 0) return false;
            return currentDepLengths.punctlen == 0 || currentSibLengths.punctlen == 0;
        }
    };
    List<TrieMap.KeyExtractor<String>> keys = cond.lazyExtractor;
    add_sibs_precedence_common(keys);
    // first phrase: punct-free iff the second phrase has more puncts
    keys.add(new TrieMap.KeyExtractor<String>() {
        public String getKey() {
            boolean secondHasMore = currentLengthsDiff.punctlen > 0;
            if (secondHasMore) return NO_P;
            return HAS_P;
        }
    });
    // second phrase: the opposite value
    keys.add(new TrieMap.KeyExtractor<String>() {
        public String getKey() {
            boolean secondHasMore = currentLengthsDiff.punctlen > 0;
            if (secondHasMore) return HAS_P;
            return NO_P;
        }
    });
    return cond;
}
// short-long feature: applies when the two sibling phrases differ in word length;
// the key tells which order they come in and the value is the absolute length difference
private ConditionalLazyEvaluator short_long_words() {
    ConditionalLazyEvaluator cond = new ConditionalLazyEvaluator() {
        boolean test() {
            int diff = currentLengthsDiff.wordlen;
            return diff != 0;
        }
        float eval() {
            int magnitude = Math.abs(currentLengthsDiff.wordlen);
            return (float) magnitude;
        }
    };
    List<TrieMap.KeyExtractor<String>> keys = cond.lazyExtractor;
    add_sibs_precedence_common(keys);
    keys.add(new TrieMap.KeyExtractor<String>() {
        public String getKey() {
            // a positive diff means the second phrase is the longer one
            if (currentLengthsDiff.wordlen > 0) return SHORT_LONG_ORDER;
            return LONG_SHORT_ORDER;
        }
    });
    return cond;
}
}