/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
http://www.cs.umass.edu/~mccallum/mallet
This software is provided under the terms of the Common Public License,
version 1.0, as published by http://www.opensource.org. For further
information, see the file `LICENSE' included with this distribution. */
/**
Create new features from features (matching a regex within a window +/- the current position).
For example,
<br><code>
FeaturesInWindow p = new FeaturesInWindow("PREV-", -1, 1, Pattern.compile("POS-.*"), true)
</code> <br>
will create a pipe that adds a feature to the current position for each
feature of the previous position whose name starts with "POS-". So if the
previous position has "POS-NN" we add "PREV-POS-NN". The last argument to the
constructor is currently ignored. The alternative constructor matches all
features, so:
<br><code>
FeaturesInWindow p = new FeaturesInWindow(s, l, r);
</code> <br>
is equivalent to
<br><code>
FeaturesInWindow p = new FeaturesInWindow(s, l, r, Pattern.compile(".*"), true);
</code> <br>
but more efficient, since we don't actually check using the Pattern.
@author Andrew McCallum <a href="mailto:mccallum@cs.umass.edu">mccallum@cs.umass.edu</a>
*/
package cc.mallet.pipe.tsf;
import java.io.*;
import java.util.regex.*;
import cc.mallet.pipe.Pipe;
import cc.mallet.types.Instance;
import cc.mallet.types.Token;
import cc.mallet.types.TokenSequence;
import cc.mallet.util.PropertyList;
/**
 * Pipe that, for every token of a {@link TokenSequence}, adds prefixed copies
 * of the features found on the tokens inside a window around it.
 *
 * <p>The window for position {@code i} covers positions
 * {@code i + leftBoundary} (inclusive) to {@code i + rightBoundary}
 * (exclusive). Positions that fall before the start or after the end of the
 * sequence contribute a single synthetic feature named {@code <STARTn>} or
 * {@code <ENDn>}. The current position itself is skipped unless
 * {@code includeCurrentToken} is set.
 */
public class FeaturesInWindow extends Pipe implements Serializable
{
  // Prefix prepended to copied feature names. namePrefixLeft, when non-null,
  // is used instead of namePrefix for window positions at or left of the
  // current token; nothing in this class assigns it.
  String namePrefix, namePrefixLeft;
  // Window start offset relative to the current token (inclusive).
  int leftBoundary;
  // Window end offset relative to the current token (exclusive).
  int rightBoundary;
  // Only features whose full name matches are copied; null copies everything.
  Pattern featureRegex;
  // Stored and serialized, but never consulted by pipe().
  boolean includeBeginEndBoundaries;
  boolean includeCurrentToken = false;

  // Number of precomputed boundary markers per side; larger distances are
  // built on demand.
  private static final int maxWindowSize = 20;
  private static final PropertyList[] startfs = new PropertyList[maxWindowSize];
  private static final PropertyList[] endfs = new PropertyList[maxWindowSize];

  static {
    initStartEndFs ();
  }

  /** Fills the shared tables of single-feature {@code <STARTn>} / {@code <ENDn>} lists. */
  private static void initStartEndFs ()
  {
    for (int i = 0; i < maxWindowSize; i++) {
      startfs[i] = PropertyList.add ("<START"+i+">", 1.0, null);
      endfs[i] = PropertyList.add ("<END"+i+">", 1.0, null);
    }
  }

  /**
   * Returns the marker feature list for a position {@code distance} steps
   * before the first token. Distances outside the precomputed table get a
   * freshly built list instead of overrunning the array.
   */
  private static PropertyList startFeatures (int distance)
  {
    if (distance < maxWindowSize)
      return startfs[distance];
    return PropertyList.add ("<START"+distance+">", 1.0, null);
  }

  /**
   * Returns the marker feature list for a position {@code distance} steps
   * past the last token (0 = the position immediately after it). Distances
   * outside the precomputed table get a freshly built list.
   */
  private static PropertyList endFeatures (int distance)
  {
    if (distance < maxWindowSize)
      return endfs[distance];
    return PropertyList.add ("<END"+distance+">", 1.0, null);
  }

  /**
   * @param namePrefix what to prepend to feature names
   * @param leftBoundaryOffset left boundary of the window, inclusive
   *        (e.g. -1 means include the previous word)
   * @param rightBoundaryOffset right boundary of the window, exclusive
   *        (e.g. 1 means the window stops at the current position and does
   *        not include the next word)
   * @param featureRegex add only features matching this (null = always match)
   * @param includeBeginEndBoundaries ignored
   */
  public FeaturesInWindow (String namePrefix, int leftBoundaryOffset, int rightBoundaryOffset,
                           Pattern featureRegex, boolean includeBeginEndBoundaries)
  {
    this.namePrefix = namePrefix;
    this.leftBoundary = leftBoundaryOffset;
    this.rightBoundary = rightBoundaryOffset;
    this.featureRegex = featureRegex;
    this.includeBeginEndBoundaries = includeBeginEndBoundaries;
  }

  /**
   * Equivalent to <br>
   * <code>
   * FeaturesInWindow(namePrefix, leftBoundaryOffset, rightBoundaryOffset, null, true);
   * </code>
   */
  public FeaturesInWindow (String namePrefix, int leftBoundaryOffset, int rightBoundaryOffset)
  {
    this (namePrefix, leftBoundaryOffset, rightBoundaryOffset, null, true);
  }

  /**
   * Adds the windowed features to every token of the carrier's data, which
   * must be a {@link TokenSequence}.
   *
   * @param carrier instance whose data is the token sequence to augment
   * @return the same instance, with each token's feature list replaced
   */
  public Instance pipe (Instance carrier)
  {
    TokenSequence ts = (TokenSequence) carrier.getData();
    int tsSize = ts.size();
    // New lists are collected here and installed only after the main loop,
    // so every window reads the tokens' original features, never features
    // added earlier in this same pass.
    PropertyList[] newFeatures = new PropertyList[tsSize];
    for (int i = 0; i < tsSize; i++) {
      newFeatures[i] = ts.get (i).getFeatures();
      for (int position = i + leftBoundary; position < i + rightBoundary; position++) {
        if (position == i && !includeCurrentToken)
          continue;
        PropertyList pl2;
        if (position < 0)
          pl2 = startFeatures (-position);
        else if (position >= tsSize)
          pl2 = endFeatures (position - tsSize);
        else
          pl2 = ts.get (position).getFeatures ();
        // null stands for the empty list in this API (it is the tail passed
        // to PropertyList.add above), so there is nothing to copy.
        if (pl2 == null)
          continue;
        // The prefix depends only on the window position, so pick it once.
        String prefix = (namePrefixLeft == null || position - i > 0) ? namePrefix : namePrefixLeft;
        PropertyList.Iterator pl2i = pl2.iterator();
        while (pl2i.hasNext()) {
          pl2i.next();
          String key = pl2i.getKey();
          if (featureRegex == null || featureRegex.matcher(key).matches()) {
            newFeatures[i] = PropertyList.add (prefix + key, pl2i.getNumericValue(), newFeatures[i]);
          }
        }
      }
    }
    for (int i = 0; i < tsSize; i++) {
      // Put the new PropertyLists in place
      ts.get (i).setFeatures (newFeatures[i]);
    }
    return carrier;
  }

  // Serialization
  //
  // The stream layout is: version, namePrefix, leftBoundary, rightBoundary,
  // featureRegex, includeBeginEndBoundaries. namePrefixLeft and
  // includeCurrentToken are not written, so a deserialized pipe has them at
  // null and false respectively.
  private static final long serialVersionUID = 1;
  private static final int CURRENT_SERIAL_VERSION = 0;

  private void writeObject (ObjectOutputStream out) throws IOException {
    out.writeInt (CURRENT_SERIAL_VERSION);
    out.writeObject (namePrefix);
    out.writeInt (leftBoundary);
    out.writeInt (rightBoundary);
    out.writeObject (featureRegex);
    out.writeBoolean (includeBeginEndBoundaries);
  }

  private void readObject (ObjectInputStream in) throws IOException, ClassNotFoundException {
    // Serial version: consumed to keep the stream aligned; only version 0
    // exists, so there is nothing to branch on.
    in.readInt ();
    namePrefix = (String) in.readObject();
    leftBoundary = in.readInt ();
    rightBoundary = in.readInt ();
    featureRegex = (Pattern) in.readObject();
    includeBeginEndBoundaries = in.readBoolean();
  }
}