/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
http://www.cs.umass.edu/~mccallum/mallet
This software is provided under the terms of the Common Public License,
version 1.0, as published by http://www.opensource.org. For further
information, see the file `LICENSE' included with this distribution. */
/**
Create new features from all possible conjunctions with other
(possibly position-offset) features.
@author Andrew McCallum <a href="mailto:mccallum@cs.umass.edu">mccallum@cs.umass.edu</a>
*/
package cc.mallet.pipe.tsf;
import java.io.*;
import java.util.regex.*;
import cc.mallet.pipe.Pipe;
import cc.mallet.types.Instance;
import cc.mallet.types.Token;
import cc.mallet.types.TokenSequence;
import cc.mallet.util.PropertyList;
public class OffsetConjunctions extends Pipe implements Serializable
{
int[][] conjunctions;
boolean includeOriginalSingletons;
// boolean includeBeginEndBoundaries;
Pattern featureRegex;
static final int maxWindowSize = 50;
static final PropertyList[] startfs = new PropertyList[maxWindowSize];
static final PropertyList[] endfs = new PropertyList[maxWindowSize];
static {
initStartEndFs ();
}
private static void initStartEndFs ()
{
for (int i = 0; i < maxWindowSize; i++) {
startfs[i] = PropertyList.add ("<START"+i+">", 1.0, null);
endfs[i] = PropertyList.add ("<END"+i+">", 1.0, null);
}
}
// To include all the old previous singleton features, pass {{0}}
// For a conjunction at the current time step, pass {{0,0}}
// For a conjunction of current and previous, pass {{0,-1}}
// For a conjunction of the current and next two, pass {{0,1,2}}
public OffsetConjunctions (boolean includeOriginalSingletons, Pattern featureRegex, int[][] conjunctions)
{
this.conjunctions = conjunctions;
this.featureRegex = featureRegex;
this.includeOriginalSingletons = includeOriginalSingletons;
}
public OffsetConjunctions (boolean includeOriginalSingletons, int[][] conjunctions)
{
this (includeOriginalSingletons, null, conjunctions);
}
public OffsetConjunctions (int[][] conjunctions)
{
this (true, conjunctions);
}
public Instance pipe (Instance carrier)
{
TokenSequence ts = (TokenSequence) carrier.getData();
int tsSize = ts.size();
PropertyList[] oldfs = null;
PropertyList[] newfs = null;
try {
oldfs = new PropertyList[ts.size()];
}
catch (Exception e) {
System.err.println("Exception allocating oldfs: " + e);
}
try {
newfs = new PropertyList[ts.size()];
}
catch (Exception e) {
System.err.println("Exception allocating newfs: " + e);
}
for (int i = 0; i < tsSize; i++)
oldfs[i] = ts.get(i).getFeatures ();
if (includeOriginalSingletons)
for (int i = 0; i < tsSize; i++)
newfs[i] = ts.get(i).getFeatures ();
for (int i = 0; i < tsSize; i++) {
for (int j = 0; j < conjunctions.length; j++) {
// allow conjunction offsets of length n - awc
PropertyList.Iterator[] iters = getOffsetIters (conjunctions, j, tsSize, i, oldfs);
if (iters == null)
continue;
int[] iterIndices = new int[iters.length];
for (int ii=0; ii < iterIndices.length; ii++)
iterIndices[ii] = -1;
newfs[i] = makeConjunctions (iters, 0, conjunctions, j, tsSize, newfs[i], i, oldfs, iterIndices);
}
}
// Put the new PropertyLists in place
for (int i = 0; i < ts.size(); i++)
ts.get(i).setFeatures (newfs[i]);
return carrier;
}
/** Recursively makes conjunctions by iterating through features at each offset
* @param iters iterate over the PropertyLists at each offset
* @param currIndex which offset we're currently on, e..g 1 in the list [0,1,2]
* @param conjunctions list of conjunctions
* @param j which offset list we're currently on, e.g. [0,1,2] in the list [[0,1],[0,1,2]]
* @param tsSize size of token sequence
* @param newfs new features
* @param tsi token sequence index
* @param oldfs old features
* @param iterIndices counter to keep track how far in each iterator in "iters"
* @return new features
*/
private PropertyList makeConjunctions (PropertyList.Iterator[] iters, int currIndex, int[][] conjunctions,
int j, int tsSize, PropertyList newfs, int tsi, PropertyList[] oldfs,
int[] iterIndices) {
if (iters.length == currIndex) { // base case: add feature for current conjunction of iters
// avoid redundant doubling of feature space; include only upper triangle
if (redundant (conjunctions, j, iterIndices)) {
return newfs;
}
String newFeature = "";
double newValue = 1.0;
for (int i=0; i < iters.length; i++) {
String s = iters[i].getKey();
if (featureRegex != null && !featureRegex.matcher(s).matches())
return newfs;
newFeature += (i==0 ? "" : "_&_") + s + (conjunctions[j][i]==0 ? "" : ("@" + conjunctions[j][i]));
newValue *= iters[i].getNumericValue();
}
//System.err.println ("Adding new feature " + newFeature);
newfs = PropertyList.add (newFeature, newValue, newfs);
}
else { // recursive step
while (iters[currIndex].hasNext()) {
iters[currIndex].next();
iterIndices[currIndex]++;
newfs = makeConjunctions (iters, currIndex+1, conjunctions, j, tsSize, newfs, tsi, oldfs, iterIndices);
}
// reset iterator at currIndex
iters[currIndex] = getOffsetIter (conjunctions, j, currIndex, tsSize, tsi, oldfs);
iterIndices[currIndex] = -1;
}
return newfs;
}
/** Is the current feature redundant? The current feature is
* determined by the current values in iterIndices, which tells us
* where we are in each PropertyList.Iterator. We do this test to
* ensure we only include the upper triange of conjunctions.
* @param conjunctions conjunction array
* @param j which offset we're on
* @param iterIndices counters for each PropertyList.Iterator
* @return true if feature is redundant
*/
private boolean redundant (int[][] conjunctions, int j, int[] iterIndices) {
for (int i=1; i < iterIndices.length; i++) {
if (conjunctions[j][i-1] == conjunctions[j][i] && iterIndices[i] <= iterIndices[i-1])
return true;
}
return false;
}
/** Get iterators for each token in this offset */
private PropertyList.Iterator[] getOffsetIters (int [][] conjunctions, int j, int tsSize, int tsi,
PropertyList[] oldfs) {
PropertyList.Iterator[] iters = new PropertyList.Iterator[conjunctions[j].length];
// get iterators for offsets
for (int iteri=0; iteri < iters.length; iteri++) {
iters[iteri] = getOffsetIter (conjunctions, j, iteri, tsSize, tsi, oldfs);
if (iters[iteri]==null)
return null;
}
return iters;
}
private PropertyList.Iterator getOffsetIter (int [][] conjunctions, int j, int iteri, int tsSize, int tsi,
PropertyList[] oldfs) {
PropertyList.Iterator iter;
if (tsi+conjunctions[j][iteri] < 0)
iter = startfs[-(tsi+conjunctions[j][iteri])-1].iterator();
else if (conjunctions[j][iteri]+tsi > tsSize-1)
iter = endfs[tsi+conjunctions[j][iteri]-tsSize].iterator();
else if (oldfs[conjunctions[j][iteri]+tsi] == null)
iter = null;
else
iter = oldfs[tsi+conjunctions[j][iteri]].iterator();
return iter;
}
// Serialization
private static final long serialVersionUID = 1;
private static final int CURRENT_SERIAL_VERSION = 0;
private static final int NULL_INTEGER = -1;
private void writeObject (ObjectOutputStream out) throws IOException {
out.writeInt (CURRENT_SERIAL_VERSION);
int size1, size2;
size1 = (conjunctions == null) ? NULL_INTEGER : conjunctions.length;
out.writeInt(size1);
if (size1 != NULL_INTEGER) {
for (int i = 0; i <size1; i++) {
size2 = (conjunctions[i] == null) ? NULL_INTEGER: conjunctions[i].length;
out.writeInt(size2);
if (size2 != NULL_INTEGER) {
for (int j = 0; j <size2; j++) {
out.writeInt(conjunctions[i][j]);
}
}
}
}
out.writeBoolean(includeOriginalSingletons);
out.writeObject(featureRegex); //add by fuchun
}
private void readObject (ObjectInputStream in) throws IOException, ClassNotFoundException {
int size1, size2;
int version = in.readInt ();
size1 = in.readInt();
// Deserialization doesn't call the unnamed class initializer, so do it here
if (startfs[0] == null)
initStartEndFs ();
if (size1 == NULL_INTEGER) {
conjunctions = null;
}
else {
conjunctions = new int[size1][];
for (int i = 0; i < size1; i++) {
size2 = in.readInt();
if (size2 == NULL_INTEGER) {
conjunctions[i] = null;
}
else {
conjunctions[i] = new int[size2];
for (int j = 0; j < size2; j++) {
conjunctions[i][j] = in.readInt();
}
}
}
}
includeOriginalSingletons = in.readBoolean();
featureRegex = (Pattern) in.readObject();//add by fuchun
}
}