/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
This software is provided under the terms of the Common Public License,
version 1.0, as published by http://www.opensource.org. For further
information, see the file `LICENSE' included with this distribution. */
/**
 * Create new features from all possible conjunctions with other
 * (possibly position-offset) features.
 *
 * @author Andrew McCallum <a href="mailto:mccallum@cs.umass.edu">mccallum@cs.umass.edu</a>
 */
package cc.mallet.pipe.tsf;
import java.io.*;
import java.util.regex.*;
import cc.mallet.pipe.Pipe;
import cc.mallet.types.Instance;
import cc.mallet.types.Token;
import cc.mallet.types.TokenSequence;
import cc.mallet.util.PropertyList;
public class OffsetConjunctions extends Pipe implements Serializable
int[][] conjunctions;
boolean includeOriginalSingletons;
// boolean includeBeginEndBoundaries;
Pattern featureRegex;
static final int maxWindowSize = 50;
static final PropertyList[] startfs = new PropertyList[maxWindowSize];
static final PropertyList[] endfs = new PropertyList[maxWindowSize];
static {
initStartEndFs ();
private static void initStartEndFs ()
for (int i = 0; i < maxWindowSize; i++) {
startfs[i] = PropertyList.add ("<START"+i+">", 1.0, null);
endfs[i] = PropertyList.add ("<END"+i+">", 1.0, null);
// To include all the old previous singleton features, pass {{0}}
// For a conjunction at the current time step, pass {{0,0}}
// For a conjunction of current and previous, pass {{0,-1}}
// For a conjunction of the current and next two, pass {{0,1,2}}
public OffsetConjunctions (boolean includeOriginalSingletons, Pattern featureRegex, int[][] conjunctions)
this.conjunctions = conjunctions;
this.featureRegex = featureRegex;
this.includeOriginalSingletons = includeOriginalSingletons;
public OffsetConjunctions (boolean includeOriginalSingletons, int[][] conjunctions)
this (includeOriginalSingletons, null, conjunctions);
public OffsetConjunctions (int[][] conjunctions)
this (true, conjunctions);
public Instance pipe (Instance carrier)
TokenSequence ts = (TokenSequence) carrier.getData();
int tsSize = ts.size();
PropertyList[] oldfs = null;
PropertyList[] newfs = null;
try {
oldfs = new PropertyList[ts.size()];
catch (Exception e) {
System.err.println("Exception allocating oldfs: " + e);
try {
newfs = new PropertyList[ts.size()];
catch (Exception e) {
System.err.println("Exception allocating newfs: " + e);
for (int i = 0; i < tsSize; i++)
oldfs[i] = ts.get(i).getFeatures ();
if (includeOriginalSingletons)
for (int i = 0; i < tsSize; i++)
newfs[i] = ts.get(i).getFeatures ();
for (int i = 0; i < tsSize; i++) {
for (int j = 0; j < conjunctions.length; j++) {
// allow conjunction offsets of length n - awc
PropertyList.Iterator[] iters = getOffsetIters (conjunctions, j, tsSize, i, oldfs);
if (iters == null)
int[] iterIndices = new int[iters.length];
for (int ii=0; ii < iterIndices.length; ii++)
iterIndices[ii] = -1;
newfs[i] = makeConjunctions (iters, 0, conjunctions, j, tsSize, newfs[i], i, oldfs, iterIndices);
// Put the new PropertyLists in place
for (int i = 0; i < ts.size(); i++)
ts.get(i).setFeatures (newfs[i]);
return carrier;
/** Recursively makes conjunctions by iterating through features at each offset
* @param iters iterate over the PropertyLists at each offset
* @param currIndex which offset we're currently on, e..g 1 in the list [0,1,2]
* @param conjunctions list of conjunctions
* @param j which offset list we're currently on, e.g. [0,1,2] in the list [[0,1],[0,1,2]]
* @param tsSize size of token sequence
* @param newfs new features
* @param tsi token sequence index
* @param oldfs old features
* @param iterIndices counter to keep track how far in each iterator in "iters"
* @return new features
private PropertyList makeConjunctions (PropertyList.Iterator[] iters, int currIndex, int[][] conjunctions,
int j, int tsSize, PropertyList newfs, int tsi, PropertyList[] oldfs,
int[] iterIndices) {
if (iters.length == currIndex) { // base case: add feature for current conjunction of iters
// avoid redundant doubling of feature space; include only upper triangle
if (redundant (conjunctions, j, iterIndices)) {
return newfs;
String newFeature = "";
double newValue = 1.0;
for (int i=0; i < iters.length; i++) {
String s = iters[i].getKey();
if (featureRegex != null && !featureRegex.matcher(s).matches())
return newfs;
newFeature += (i==0 ? "" : "_&_") + s + (conjunctions[j][i]==0 ? "" : ("@" + conjunctions[j][i]));
newValue *= iters[i].getNumericValue();
//System.err.println ("Adding new feature " + newFeature);
newfs = PropertyList.add (newFeature, newValue, newfs);
else { // recursive step
while (iters[currIndex].hasNext()) {
newfs = makeConjunctions (iters, currIndex+1, conjunctions, j, tsSize, newfs, tsi, oldfs, iterIndices);
// reset iterator at currIndex
iters[currIndex] = getOffsetIter (conjunctions, j, currIndex, tsSize, tsi, oldfs);
iterIndices[currIndex] = -1;
return newfs;
/** Is the current feature redundant? The current feature is
* determined by the current values in iterIndices, which tells us
* where we are in each PropertyList.Iterator. We do this test to
* ensure we only include the upper triange of conjunctions.
* @param conjunctions conjunction array
* @param j which offset we're on
* @param iterIndices counters for each PropertyList.Iterator
* @return true if feature is redundant
private boolean redundant (int[][] conjunctions, int j, int[] iterIndices) {
for (int i=1; i < iterIndices.length; i++) {
if (conjunctions[j][i-1] == conjunctions[j][i] && iterIndices[i] <= iterIndices[i-1])
return true;
return false;
/** Get iterators for each token in this offset */
private PropertyList.Iterator[] getOffsetIters (int [][] conjunctions, int j, int tsSize, int tsi,
PropertyList[] oldfs) {
PropertyList.Iterator[] iters = new PropertyList.Iterator[conjunctions[j].length];
// get iterators for offsets
for (int iteri=0; iteri < iters.length; iteri++) {
iters[iteri] = getOffsetIter (conjunctions, j, iteri, tsSize, tsi, oldfs);
if (iters[iteri]==null)
return null;
return iters;
private PropertyList.Iterator getOffsetIter (int [][] conjunctions, int j, int iteri, int tsSize, int tsi,
PropertyList[] oldfs) {
PropertyList.Iterator iter;
if (tsi+conjunctions[j][iteri] < 0)
iter = startfs[-(tsi+conjunctions[j][iteri])-1].iterator();
else if (conjunctions[j][iteri]+tsi > tsSize-1)
iter = endfs[tsi+conjunctions[j][iteri]-tsSize].iterator();
else if (oldfs[conjunctions[j][iteri]+tsi] == null)
iter = null;
iter = oldfs[tsi+conjunctions[j][iteri]].iterator();
return iter;
// Serialization
private static final long serialVersionUID = 1;
private static final int CURRENT_SERIAL_VERSION = 0;
private static final int NULL_INTEGER = -1;
private void writeObject (ObjectOutputStream out) throws IOException {
int size1, size2;
size1 = (conjunctions == null) ? NULL_INTEGER : conjunctions.length;
if (size1 != NULL_INTEGER) {
for (int i = 0; i <size1; i++) {
size2 = (conjunctions[i] == null) ? NULL_INTEGER: conjunctions[i].length;
if (size2 != NULL_INTEGER) {
for (int j = 0; j <size2; j++) {
out.writeObject(featureRegex); //add by fuchun
private void readObject (ObjectInputStream in) throws IOException, ClassNotFoundException {
int size1, size2;
int version = in.readInt ();
size1 = in.readInt();
// Deserialization doesn't call the unnamed class initializer, so do it here
if (startfs[0] == null)
initStartEndFs ();
if (size1 == NULL_INTEGER) {
conjunctions = null;
else {
conjunctions = new int[size1][];
for (int i = 0; i < size1; i++) {
size2 = in.readInt();
if (size2 == NULL_INTEGER) {
conjunctions[i] = null;
else {
conjunctions[i] = new int[size2];
for (int j = 0; j < size2; j++) {
conjunctions[i][j] = in.readInt();
includeOriginalSingletons = in.readBoolean();
featureRegex = (Pattern) in.readObject();//add by fuchun