* sentence-level contextual feature extractor. (VERY todo-ish, though.)
*/
public Collection<Pair<String, Double>> getFeatures(Map<Integer, TaggedWord> sentence, Integer wordIndex, boolean training) {
    // Extracts the contextual features for the word stored under key `wordIndex` in `sentence`.
    //
    // sentence  - words keyed by position; the window logic below looks up wordIndex-2 .. wordIndex+2
    //             and uses sentence.size() as the right boundary (presumably keys are 0..size-1 --
    //             verify against callers).
    // wordIndex - key of the word to extract features for.
    // training  - when true, the current word's POS is added as the first element of the result.
    // returns   - a new collection of (feature string, activation) pairs.
    Collection<Pair<String, Double>> result = new ArrayList<Pair<String, Double>>(30);
    final TaggedWord current = sentence.get(wordIndex);
    final int wind = wordIndex.intValue();

    // -------- The left periphery ------------
    // positions that fall before the start of the sentence are replaced by the OOB placeholder.
    final TaggedWord prev = (wind > 0) ? sentence.get(wind - 1) : Constants.OOB;
    final TaggedWord prevPrev = (wind > 1) ? sentence.get(wind - 2) : Constants.OOB;

    // -------- The right periphery -----------
    // number of words to the right of the current one.
    final int remaining = sentence.size() - (wind + 1);
    final TaggedWord next = (remaining >= 1) ? sentence.get(wind + 1) : Constants.OOB;
    final TaggedWord nextNext = (remaining >= 2) ? sentence.get(wind + 2) : Constants.OOB;

    final Double activation = Constants.one;
    if (training) {
        result.add(new Pair<String, Double>(current.getPOS(), activation));
    }
    final String form = current.getForm();

    // we do not use tag-sequence features in this model.
    // these are in a separate sequence model (n-gram model over POS sequences).
    // standard contextual features (word to the left, current word, word to the right, etc.).
    // these features are from Ratnaparkhi (1996).
    result.add(new Pair<String, Double>(curL + "=" + form, activation));
    result.add(new Pair<String, Double>(prevL + "=" + prev.getForm(), activation));
    result.add(new Pair<String, Double>(prevPrevL + "=" + prevPrev.getForm(), activation));
    result.add(new Pair<String, Double>(nextL + "=" + next.getForm(), activation));
    result.add(new Pair<String, Double>(nextNextL + "=" + nextNext.getForm(), activation));

    // features that replace the tagging dictionary.
    // add real-valued (activation = prior value) features for each of the beta-best prior
    // tags, given this word.
    if (posPrior != null) {
        List<Pair<Double, String>> priors = posPrior.getPriors(current.getWord());
        // a word without any prior entries simply contributes no prior features
        // (previously priors.get(0) threw on an empty list).
        if (priors != null && !priors.isEmpty()) {
            final double beta = 0.1;
            // presumably the list is sorted best-first: the first entry is taken as the best
            // and the loop stops at the first entry below the threshold.
            final double best = priors.get(0).a;
            // NOTE(review): the original comment calls these values log-probs. With negative
            // values, `prior.a > beta * best` rejects even the best entry -- confirm the scale
            // that posPrior.getPriors returns.
            for (Pair<Double, String> prior : priors) {
                if (prior.a > (beta * best)) {
                    // add the features PPOS=<POSTAG>:<value> and PPOS_word=<POSTAG>_<wordForm>:<value>.
                    result.add(new Pair<String, Double>(priorF + "=" + prior.b, prior.a));
                    result.add(new Pair<String, Double>(priorF + "_word" + "=" + prior.b + "_" + form, prior.a));
                } else {
                    break;
                }
            }
        }
    }

    // these are in addition to Ratnaparkhi's (1996) contextual features.
    // conjunctions of adjacent word forms over the five-word window, plus the bigram that
    // straddles the current token (w-1|w+1).
    TaggedWord[] wds = {prevPrev, prev, current, next, nextNext};
    for (int j = 1; j < wds.length; j++) {
        result.add(new Pair<String, Double>(lxfLabs[j - 1] + "|" + lxfLabs[j] + "=" + wds[j - 1].getForm() + "|" + wds[j].getForm(), activation));
        // also, if at the current word slot, add the bigram that straddles the current word.
        if (j == 2) {
            result.add(new Pair<String, Double>(lxfLabs[j - 1] + "|" + lxfLabs[j + 1] + "=" + wds[j - 1].getForm() + "|" + wds[j + 1].getForm(), activation));
        }
    }

    // affix features from Ratnaparkhi (1996).
    // if the word's length is > 4, then extract the 1-, 2-, 3- and 4-character affixes.
    if (form.length() > 4) {
        final int maxAffixLength = 4;
        // prefixes, shortest first.
        for (int len = 1; len <= maxAffixLength; len++) {
            result.add(new Pair<String, Double>(prefix + "=" + form.substring(0, len), Constants.one));
        }
        // suffixes, shortest first. (The previous loop bound produced a fifth, 5-character
        // suffix, which contradicted the 1..4 range above and the prefix loop.)
        for (int len = 1; len <= maxAffixLength; len++) {
            result.add(new Pair<String, Double>(suffix + "=" + form.substring(form.length() - len), Constants.one));
        }
    }

    // now do "contains hyphen", "contains number", "contains uppercase letter" and contains fused NE connecter (_) features.
    // also from Ratnaparkhi (1996).
    if (form.contains("-")) {
        result.add(new Pair<String, Double>(hyphen, Constants.one));
    }
    if (form.matches(".*[0-9]+.*")) {
        result.add(new Pair<String, Double>(num, Constants.one));
    }
    // the form changes under lower-casing, i.e. it contains at least one upper-case character.
    if (!form.toLowerCase().equals(form)) {
        result.add(new Pair<String, Double>(caps, Constants.one));
    }
    // if we see a NE connector, this is likely a NNP (in English, e.g.).
    if (form.contains(neConnecter)) {
        result.add(new Pair<String, Double>(neConn, Constants.one));
    }
    return result;
}