Package cc.mallet.pipe.tsf

Source Code of cc.mallet.pipe.tsf.OffsetConjunctions

/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
   http://www.cs.umass.edu/~mccallum/mallet
   This software is provided under the terms of the Common Public License,
   version 1.0, as published by http://www.opensource.org.  For further
   information, see the file `LICENSE' included with this distribution. */




/**
   Create new features from all possible conjunctions with other
   (possibly position-offset) features.

   @author Andrew McCallum <a href="mailto:mccallum@cs.umass.edu">mccallum@cs.umass.edu</a>
*/

package cc.mallet.pipe.tsf;

import java.io.*;
import java.util.regex.*;

import cc.mallet.pipe.Pipe;
import cc.mallet.types.Instance;
import cc.mallet.types.Token;
import cc.mallet.types.TokenSequence;
import cc.mallet.util.PropertyList;

public class OffsetConjunctions extends Pipe implements Serializable
{
  int[][] conjunctions;
  boolean includeOriginalSingletons;
  //  boolean includeBeginEndBoundaries;
  Pattern featureRegex;

  static final int maxWindowSize = 50;
  static final PropertyList[] startfs = new PropertyList[maxWindowSize];
  static final PropertyList[] endfs = new PropertyList[maxWindowSize];

  static {
    initStartEndFs ();
  }

  private static void initStartEndFs ()
  {
    for (int i = 0; i < maxWindowSize; i++) {
      startfs[i] = PropertyList.add ("<START"+i+">", 1.0, null);
      endfs[i] = PropertyList.add ("<END"+i+">", 1.0, null);
    }
  }
 
  // To include all the old previous singleton features, pass {{0}}
  // For a conjunction at the current time step, pass {{0,0}}
  // For a conjunction of current and previous, pass {{0,-1}}
  // For a conjunction of the current and next two, pass {{0,1,2}}
  public OffsetConjunctions (boolean includeOriginalSingletons, Pattern featureRegex, int[][] conjunctions)
  {
    this.conjunctions = conjunctions;
    this.featureRegex = featureRegex;
    this.includeOriginalSingletons = includeOriginalSingletons;
  }

  public OffsetConjunctions (boolean includeOriginalSingletons, int[][] conjunctions)
  {
    this (includeOriginalSingletons, null, conjunctions);
  }
 
  public OffsetConjunctions (int[][] conjunctions)
  {
    this (true, conjunctions);
  }
 
  public Instance pipe (Instance carrier)
  {
    TokenSequence ts = (TokenSequence) carrier.getData();
    int tsSize = ts.size();
    PropertyList[] oldfs = null;
    PropertyList[] newfs = null;
    try {
      oldfs = new PropertyList[ts.size()];
    }
    catch (Exception e) {
      System.err.println("Exception allocating oldfs: " + e);
    }
    try {
      newfs = new PropertyList[ts.size()];
    }
    catch (Exception e) {
      System.err.println("Exception allocating newfs: " + e);
    }
   
    for (int i = 0; i < tsSize; i++)
      oldfs[i] = ts.get(i).getFeatures ();
    if (includeOriginalSingletons)
      for (int i = 0; i < tsSize; i++)
        newfs[i] = ts.get(i).getFeatures ();

    for (int i = 0; i < tsSize; i++) {
      for (int j = 0; j < conjunctions.length; j++) {       
        // allow conjunction offsets of length n - awc
        PropertyList.Iterator[] iters = getOffsetIters (conjunctions, j, tsSize, i, oldfs);
        if (iters == null)
          continue;
        int[] iterIndices = new int[iters.length];
        for (int ii=0; ii < iterIndices.length; ii++)
          iterIndices[ii] = -1;
        newfs[i] = makeConjunctions (iters, 0, conjunctions, j, tsSize, newfs[i], i, oldfs, iterIndices);
      }
    }
    // Put the new PropertyLists in place
    for (int i = 0; i < ts.size(); i++)
      ts.get(i).setFeatures (newfs[i]);
    return carrier;
  }   

  /** Recursively makes conjunctions by iterating through features at each offset
   *  @param iters iterate over the PropertyLists at each offset
   *  @param currIndex which offset we're currently on, e..g 1 in the list [0,1,2]
   *  @param conjunctions list of conjunctions
   *  @param j which offset list we're currently on, e.g. [0,1,2] in the list [[0,1],[0,1,2]]
   *  @param tsSize size of token sequence
   *  @param newfs new features
   *  @param tsi token sequence index
   *  @param oldfs old features
   *  @param iterIndices counter to keep track how far in each iterator in "iters"
   *  @return new features
   */
  private PropertyList makeConjunctions (PropertyList.Iterator[] iters, int currIndex, int[][] conjunctions,
                                         int j, int tsSize, PropertyList newfs, int tsi, PropertyList[] oldfs,
                                         int[] iterIndices) {
    if (iters.length == currIndex) { // base case: add feature for current conjunction of iters
      // avoid redundant doubling of feature space; include only upper triangle
      if (redundant (conjunctions, j, iterIndices)) {
        return newfs;
      }
      String newFeature = "";
      double newValue = 1.0;
      for (int i=0; i < iters.length; i++) {
        String s = iters[i].getKey();
        if (featureRegex != null && !featureRegex.matcher(s).matches())
          return newfs;
        newFeature += (i==0 ? "" : "_&_") + s + (conjunctions[j][i]==0 ? "" : ("@" + conjunctions[j][i]));
        newValue *= iters[i].getNumericValue();
      }
      //System.err.println ("Adding new feature " + newFeature);
      newfs = PropertyList.add (newFeature, newValue, newfs);
    }
    else { // recursive step
      while (iters[currIndex].hasNext()) {
        iters[currIndex].next();
        iterIndices[currIndex]++;
        newfs = makeConjunctions (iters, currIndex+1, conjunctions, j, tsSize, newfs, tsi, oldfs, iterIndices);
      }
      // reset iterator at currIndex
      iters[currIndex] = getOffsetIter (conjunctions, j, currIndex, tsSize, tsi, oldfs);
      iterIndices[currIndex] = -1;
    }
    return newfs;
  }

  /** Is the current feature redundant? The current feature is
   * determined by the current values in iterIndices, which tells us
   * where we are in each PropertyList.Iterator. We do this test to
   * ensure we only include the upper triange of conjunctions.
   * @param conjunctions conjunction array
   * @param j which offset we're on
   * @param iterIndices counters for each PropertyList.Iterator
   * @return true if feature is redundant
   */
  private boolean redundant (int[][] conjunctions, int j, int[] iterIndices) {
    for (int i=1; i < iterIndices.length; i++) {
      if (conjunctions[j][i-1] == conjunctions[j][i] && iterIndices[i] <= iterIndices[i-1])
        return true;
    }
    return false;   
  }

  /** Get iterators for each token in this offset */
  private PropertyList.Iterator[] getOffsetIters (int [][] conjunctions, int j, int tsSize, int tsi,
                                                  PropertyList[] oldfs) {   
    PropertyList.Iterator[] iters = new PropertyList.Iterator[conjunctions[j].length];
    // get iterators for offsets
    for (int iteri=0; iteri < iters.length; iteri++) {
      iters[iteri] = getOffsetIter (conjunctions, j, iteri, tsSize, tsi, oldfs);
      if (iters[iteri]==null)
        return null;
    }
    return iters;
  }

  private PropertyList.Iterator getOffsetIter (int [][] conjunctions, int j, int iteri, int tsSize, int tsi,
                                               PropertyList[] oldfs) {
    PropertyList.Iterator iter;
    if (tsi+conjunctions[j][iteri] < 0)
      iter = startfs[-(tsi+conjunctions[j][iteri])-1].iterator();
    else if (conjunctions[j][iteri]+tsi > tsSize-1)
      iter = endfs[tsi+conjunctions[j][iteri]-tsSize].iterator();
    else if (oldfs[conjunctions[j][iteri]+tsi] == null)
      iter = null;
    else
      iter = oldfs[tsi+conjunctions[j][iteri]].iterator();
    return iter;
  }
 
  // Serialization
 
  private static final long serialVersionUID = 1;
  private static final int CURRENT_SERIAL_VERSION = 0;
  private static final int NULL_INTEGER = -1;
 
  private void writeObject (ObjectOutputStream out) throws IOException {
    out.writeInt (CURRENT_SERIAL_VERSION);
    int size1, size2;
    size1 = (conjunctions == null) ? NULL_INTEGER : conjunctions.length;
    out.writeInt(size1);
    if (size1 != NULL_INTEGER) {
      for (int i = 0; i <size1; i++) {
        size2 = (conjunctions[i] == null) ? NULL_INTEGER: conjunctions[i].length;
        out.writeInt(size2);
        if (size2 != NULL_INTEGER) {
          for (int j = 0; j <size2; j++) {
            out.writeInt(conjunctions[i][j]);
          }
        }
      }
    }
    out.writeBoolean(includeOriginalSingletons);
   
    out.writeObject(featureRegex); //add by fuchun
  }
 
  private void readObject (ObjectInputStream in) throws IOException, ClassNotFoundException {
    int size1, size2;
    int version = in.readInt ();
    size1 = in.readInt();
    // Deserialization doesn't call the unnamed class initializer, so do it here
    if (startfs[0] == null)
      initStartEndFs ();
    if (size1 == NULL_INTEGER) {
      conjunctions = null;
    }
    else {
      conjunctions = new int[size1][];
      for (int i = 0; i < size1; i++) {
        size2 = in.readInt();
        if (size2 == NULL_INTEGER) {
          conjunctions[i] = null;
        }
        else {
          conjunctions[i] = new int[size2];
          for (int j = 0; j < size2; j++) {
            conjunctions[i][j] = in.readInt();
          }
        }
      }
    }
    includeOriginalSingletons = in.readBoolean();
    featureRegex = (Pattern) in.readObject();//add by fuchun
 
  }
}
TOP

Related Classes of cc.mallet.pipe.tsf.OffsetConjunctions

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.