/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
http://www.cs.umass.edu/~mccallum/mallet
This software is provided under the terms of the Common Public License,
version 1.0, as published by http://www.opensource.org. For further
information, see the file `LICENSE' included with this distribution. */
/**
Create new feature from the conjunction of features from given
offsets that match given regular expressions. This can be seen
as hand-coding in a few of the conjunctions that you'd get
from {@link OffsetConjunctions}.
<P>
For example, creating a pipe with
<TT>new OffsetFeatureConjunction ("TIME", new String[] { "number", "W=:", "number" }, new int[] { 0, 1, 2 })</TT>
will create a feature that is true whenever all of (a) a feature at the
current time matches "number" (b) a feature at the next time step matches "W=:"
(c) a feature 2 timesteps from now matches "number", so that you have a
simple time detector.
<P>If the conjunction passes, then either the first timestep
(that is, the one all the offsets were computed from), or all matching timesteps,
get the feature "TIME" --- depending on the value of the field tagAllTimesteps.
@author Charles Sutton <a href="mailto:casutton@cs.umass.edu">casutton@cs.umass.edu</a>
@author Andrew McCallum <a href="mailto:mccallum@cs.umass.edu">mccallum@cs.umass.edu</a>
*/
package cc.mallet.pipe.tsf;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.util.regex.Pattern;
import cc.mallet.pipe.Pipe;
import cc.mallet.types.Instance;
import cc.mallet.types.Token;
import cc.mallet.types.TokenSequence;
import cc.mallet.util.PropertyList;
/**
 * Pipe that adds a single named feature to tokens of a {@link TokenSequence}
 * wherever a conjunction of feature tests holds. Test <i>i</i> looks at the token
 * <code>offsets[i]</code> positions away from the current timestep and asks
 * whether that token has a feature whose name fully matches pattern <i>i</i>
 * with value exactly 1.0; the outcome must equal <code>isNonNegated[i]</code>.
 * <p>
 * Tags are written into the sequence while it is being scanned, so a tag set
 * at one timestep is already present on the token when later timesteps are
 * tested.
 */
public class OffsetFeatureConjunction extends Pipe implements Serializable
{
  /** Name of the feature set on tokens where the conjunction holds. */
  private String thisFeatureName;
  /** Compiled pattern for each subfeature; matched against whole feature names. */
  private Pattern[] featurePatterns;
  /** For each subfeature, the position relative to the current timestep to test. */
  private int[] offsets;
  /** For each subfeature, true if it must be present, false if it must be absent. */
  private boolean[] isNonNegated;
  /** If true, tag every offset position of a match; otherwise only the current timestep. */
  private boolean tagAllTimesteps;

  /**
   * Create a Pipe for adding conjunctions of specified features.
   * @param thisFeatureName Name of this conjunction feature.
   * @param featureNames Regular expression for each subfeature i; it must match
   *    an entire feature name.
   * @param offsets For each subfeature i, which offset from the current timestep
   *    must i appear at.
   * @param isNonNegated If element i is false, then the negation of the
   *    feature is added to the conjunction.
   * @param tagAllTimesteps If true, every timestep taking part in a passing
   *    conjunction gets the feature; if false, only the timestep the offsets
   *    were computed from.
   * @throws IllegalArgumentException if <code>offsets</code> or
   *    <code>isNonNegated</code> is null or has fewer entries than
   *    <code>featureNames</code>.
   */
  public OffsetFeatureConjunction (String thisFeatureName, String[] featureNames, int[] offsets, boolean[] isNonNegated, boolean tagAllTimesteps)
  {
    this.thisFeatureName = thisFeatureName;
    this.featurePatterns = patternify (featureNames);

    // pipe() indexes offsets and isNonNegated by pattern number, so a short or
    // missing array would otherwise only surface later as an
    // ArrayIndexOutOfBoundsException / NullPointerException in the middle of a run.
    int numFeatures = this.featurePatterns.length;
    if (offsets == null || offsets.length < numFeatures) {
      throw new IllegalArgumentException (
          "offsets must have at least " + numFeatures + " entries (one per feature pattern)");
    }
    if (isNonNegated == null || isNonNegated.length < numFeatures) {
      throw new IllegalArgumentException (
          "isNonNegated must have at least " + numFeatures + " entries (one per feature pattern)");
    }

    this.offsets = offsets;
    this.isNonNegated = isNonNegated;
    this.tagAllTimesteps = tagAllTimesteps;
  }

  /** Returns a new array of the given length with every element set to true. */
  private static boolean[] trueArray (int length) {
    boolean[] ret = new boolean[length];
    for (int i = 0; i < length; i++) {
      ret[i] = true;
    }
    return ret;
  }

  /** Compiles each regular expression in <code>regex</code>, preserving order. */
  private static Pattern[] patternify (String[] regex) {
    Pattern[] retval = new Pattern [regex.length];
    for (int i = 0; i < regex.length; i++) {
      retval [i] = Pattern.compile (regex[i]);
    }
    return retval;
  }

  /**
   * Create a Pipe in which no subfeature is negated.
   * @param thisFeatureName Name of this conjunction feature.
   * @param featureNames Regular expression for each subfeature i.
   * @param offsets For each subfeature i, its offset from the current timestep.
   * @param tagAllTimesteps Whether all matching timesteps, or only the current
   *    one, receive the feature.
   */
  public OffsetFeatureConjunction (String thisFeatureName, String[] featureNames, int[] offsets,
                                   boolean tagAllTimesteps)
  {
    this (thisFeatureName, featureNames, offsets, trueArray(featureNames.length), tagAllTimesteps);
  }

  /**
   * Create a Pipe in which no subfeature is negated and only the current
   * timestep is tagged.
   * @param thisFeatureName Name of this conjunction feature.
   * @param featureNames Regular expression for each subfeature i.
   * @param offsets For each subfeature i, its offset from the current timestep.
   */
  public OffsetFeatureConjunction (String thisFeatureName, String[] featureNames, int[] offsets)
  {
    this (thisFeatureName, featureNames, offsets, trueArray(featureNames.length), false);
  }

  /** @return true if every offset position of a match is tagged, false if only the current timestep is. */
  public boolean isTagAllTimesteps ()
  {
    return tagAllTimesteps;
  }

  /** @return the name of the feature this pipe adds. */
  public String getFeatureName ()
  {
    return thisFeatureName;
  }

  /** @return the internal array of compiled subfeature patterns (not a copy). */
  public Pattern[] getFeaturePatterns ()
  {
    return featurePatterns;
  }

  /** @return the internal array of subfeature offsets (not a copy). */
  public int[] getOffsets ()
  {
    return offsets;
  }

  /** @return the internal array of non-negation flags (not a copy). */
  public boolean[] getNonNegated ()
  {
    return isNonNegated;
  }

  /**
   * Scans the carrier's token sequence and sets the conjunction feature to 1.0
   * wherever the conjunction holds.
   * @param carrier Instance whose data is a {@link TokenSequence}; its tokens
   *    are modified in place.
   * @return the same <code>carrier</code> instance.
   */
  public Instance pipe (Instance carrier)
  {
    TokenSequence ts = (TokenSequence) carrier.getData();
    int tsSize = ts.size();
    for (int t = 0; t < tsSize; t++) {
      if (conjunctionHoldsAt (ts, tsSize, t)) {
        tagMatch (ts, t);
      }
    }
    return carrier;
  }

  /**
   * Checks whether every subfeature test is satisfied relative to timestep t.
   * An offset that falls outside the sequence fails the whole conjunction,
   * even for a negated subfeature.
   */
  private boolean conjunctionHoldsAt (TokenSequence ts, int tsSize, int t)
  {
    for (int fnum = 0; fnum < featurePatterns.length; fnum++) {
      int pos = t + offsets[fnum];
      if (pos < 0 || pos >= tsSize) {
        return false;
      }
      boolean featurePresent = hasMatchingFeature (ts.get(pos), featurePatterns [fnum]);
      if (featurePresent != isNonNegated [fnum]) {
        return false;
      }
    }
    return true;
  }

  /**
   * Sets the conjunction feature for a match anchored at timestep t: on every
   * offset position when tagAllTimesteps is set, otherwise on t alone.
   * Only called after conjunctionHoldsAt has confirmed all positions are in range.
   */
  private void tagMatch (TokenSequence ts, int t)
  {
    if (tagAllTimesteps) {
      for (int fnum = 0; fnum < featurePatterns.length; fnum++) {
        ts.get(t + offsets[fnum]).setFeatureValue (thisFeatureName, 1.0);
      }
    } else {
      ts.get(t).setFeatureValue (thisFeatureName, 1.0);
    }
  }

  /**
   * Tests whether the token has a feature whose name fully matches the pattern
   * and whose value is exactly 1.0.
   * @return false if no such feature exists, including when the token has no
   *    feature list at all.
   */
  private boolean hasMatchingFeature (Token token, Pattern pattern)
  {
    PropertyList features = token.getFeatures ();
    // NOTE(review): a token on which no feature has been set appears to return
    // a null list here (confirm against Token); treat that as "no match"
    // rather than failing with a NullPointerException.
    if (features == null) {
      return false;
    }
    PropertyList.Iterator iter = features.iterator ();
    while (iter.hasNext()) {
      iter.next();
      if (pattern.matcher (iter.getKey()). matches ()) {
        if (iter.getNumericValue() == 1.0) {
          return true;
        }
      }
    }
    return false;
  }

  // Serialization

  private static final long serialVersionUID = 1;
  private static final int CURRENT_SERIAL_VERSION = 1;
  /** Written in place of the array length when featurePatterns is null. */
  private static final int NULL_INTEGER = -1;

  /**
   * Writes, in order: format version, feature name, tagAllTimesteps, the number
   * of subfeatures (or NULL_INTEGER), then pattern/offset/negation flag for
   * each subfeature.
   */
  private void writeObject (ObjectOutputStream out) throws IOException
  {
    out.writeInt (CURRENT_SERIAL_VERSION);
    out.writeObject (thisFeatureName);
    out.writeBoolean (tagAllTimesteps);
    int size = (featurePatterns == null) ? NULL_INTEGER : featurePatterns.length;
    out.writeInt (size);
    if (size != NULL_INTEGER) {
      for (int i = 0; i < size; i++) {
        out.writeObject (featurePatterns[i]);
        out.writeInt (offsets[i]);
        out.writeBoolean (isNonNegated[i]);
      }
    }
  }

  /**
   * Reads the layout produced by writeObject. Streams with a version below 1
   * carry no tagAllTimesteps flag, so the field keeps its default of false.
   */
  private void readObject (ObjectInputStream in) throws IOException, ClassNotFoundException
  {
    int version = in.readInt ();
    thisFeatureName = (String) in.readObject();
    if (version >= 1) {
      tagAllTimesteps = in.readBoolean ();
    }
    int size = in.readInt();
    if (size == NULL_INTEGER) {
      featurePatterns = null;
      offsets = null;
      isNonNegated = null;
    } else {
      featurePatterns = new Pattern[size];
      offsets = new int[size];
      isNonNegated = new boolean[size];
      for (int i = 0; i < size; i++) {
        featurePatterns[i] = (Pattern) in.readObject();
        offsets[i] = in.readInt();
        isNonNegated[i] = in.readBoolean();
      }
    }
  }
}