Package cc.mallet.pipe.tsf

Source Code of cc.mallet.pipe.tsf.OffsetFeatureConjunction

/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
   http://www.cs.umass.edu/~mccallum/mallet
   This software is provided under the terms of the Common Public License,
   version 1.0, as published by http://www.opensource.org.  For further
   information, see the file `LICENSE' included with this distribution. */




/**
   Create new feature from the conjunction of features from given
    offsets that match given regular expressions.  This can be seen
    as hand-coding in a few of the conjunctions that you'd get
    from {@link OffsetConjunctions}.
<P>
   For example, creating a pipe with
     <TT>new OffsetFeatureConjunction ("TIME", new String[] { "number", "W=:" "number" }, new int[] { 0, 1, 2 })<TT>
   will create a feature that is true whenever all of (a) a feature at the
   current time matches "number" (b) a feature at the next time step matches "W=:"
   (b) a feature 2 timesteps from now match "number", so that you have a
   simple time detector.

   <P>If the conjunction passes, then either the first timestep
   (that is, the one all the offsets were computed from), or all matching timesteps,
   get the feature "TIME" --- depending on the value of the field tagAllTimesteps.

   @author Charles Sutton <a href="mailto:casutton@cs.umass.edu">casutton@cs.umass.edu</a>
   @author Andrew McCallum <a href="mailto:mccallum@cs.umass.edu">mccallum@cs.umass.edu</a>
*/

package cc.mallet.pipe.tsf;


import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.util.regex.Pattern;

import cc.mallet.pipe.Pipe;
import cc.mallet.types.Instance;
import cc.mallet.types.Token;
import cc.mallet.types.TokenSequence;
import cc.mallet.util.PropertyList;

public class OffsetFeatureConjunction extends Pipe implements Serializable
{
  private String thisFeatureName;
  private Pattern[] featurePatterns;
  private int[] offsets;
  private boolean[] isNonNegated;
  private boolean tagAllTimesteps;
 
  /**
   * Create a Pipe for adding conjunctions of specified features.
   * @param thisFeatureName Name of this conjunction feature.
   * @param featureNames String giving name for each subfeature i.
   * @param offsets For each subfeature i, which offset from the current timestep
   *   must i appear at.
   * @param isNonNegated If element i is false, then the negation of the
   *   feature is added to the conjuction.
   */
  public OffsetFeatureConjunction (String thisFeatureName, String[] featureNames, int[] offsets, boolean[] isNonNegated, boolean tagAllTimesteps)
  {
    this.thisFeatureName = thisFeatureName;
    this.featurePatterns = patternify (featureNames);
    this.offsets = offsets;
    this.isNonNegated = isNonNegated;
    this.tagAllTimesteps = tagAllTimesteps;
  }

  private static boolean[] trueArray (int length) {
    boolean[] ret = new boolean[length];
    for (int i = 0; i < length; i++)
      ret[i] = true;
    return ret;
  }

  private Pattern[] patternify (String[] regex) {
    Pattern[] retval = new Pattern [regex.length];
    for (int i = 0; i < regex.length; i++) {
      retval [i] = Pattern.compile (regex[i]);
    }
    return retval;
  }

  public OffsetFeatureConjunction (String thisFeatureName, String[] featureNames, int[] offsets,
                                   boolean tagAllTimesteps)
  {
    this (thisFeatureName, featureNames, offsets, trueArray(featureNames.length), tagAllTimesteps);
  }

  public OffsetFeatureConjunction (String thisFeatureName, String[] featureNames, int[] offsets)
  {
    this (thisFeatureName, featureNames, offsets, trueArray(featureNames.length), false);
  }

  public boolean isTagAllTimesteps ()
  {
    return tagAllTimesteps;
  }

  public String getFeatureName ()
  {
    return thisFeatureName;
  }

  public Pattern[] getFeaturePatterns ()
  {
    return featurePatterns;
  }

  public int[] getOffsets ()
  {
    return offsets;
  }

  public boolean[] getNonNegated ()
  {
    return isNonNegated;
  }

  public Instance pipe (Instance carrier)
  {
    TokenSequence ts = (TokenSequence) carrier.getData();
    int tsSize = ts.size();
    for (int t = 0; t < tsSize; t++) {
      // Check whether the conjunction is true at time step t
      boolean passes = true;
      for (int fnum = 0; fnum < featurePatterns.length; fnum++) {
        int pos = t + offsets[fnum];
        if (!(pos >= 0 && pos < tsSize)) {
          passes = false;
          break;
        }
        boolean featurePresent = hasMatchingFeature (ts.get(pos), featurePatterns [fnum]);
        if (featurePresent != isNonNegated [fnum]) {
          passes = false;
          break;
        }
      }
      if (passes) {
        if (tagAllTimesteps) {
          for (int fnum = 0; fnum < featurePatterns.length; fnum++) {
            int pos = t + offsets[fnum];
            ts.get(pos).setFeatureValue (thisFeatureName, 1.0);
          }
        } else {
          ts.get(t).setFeatureValue (thisFeatureName, 1.0);
        }
      }
    }

    return carrier;
  }

  private boolean hasMatchingFeature (Token token, Pattern pattern)
  {
    PropertyList.Iterator iter = token.getFeatures ().iterator ();
    while (iter.hasNext()) {
      iter.next();
      if (pattern.matcher (iter.getKey()). matches ()) {
        if (iter.getNumericValue() == 1.0) {
          return true;
        }
      }
    }
    return false;
  }

  // Serialization
 
  private static final long serialVersionUID = 1;
  private static final int CURRENT_SERIAL_VERSION = 1;
  private static final int NULL_INTEGER = -1;
 
  private void writeObject (ObjectOutputStream out) throws IOException
  {
    out.writeInt (CURRENT_SERIAL_VERSION);
    out.writeObject (thisFeatureName);
    out.writeBoolean (tagAllTimesteps);
    int size;
    size = (featurePatterns == null) ? NULL_INTEGER : featurePatterns.length;
    out.writeInt(size);
    if (size != NULL_INTEGER) {
      for (int i = 0; i <size; i++) {
        out.writeObject (featurePatterns[i]);
        out.writeInt (offsets[i]);
        out.writeBoolean (isNonNegated[i]);
      }
    }
  }
 
  private void readObject (ObjectInputStream in) throws IOException, ClassNotFoundException
  {
    int size;
    int version = in.readInt ();
    thisFeatureName = (String) in.readObject();
    if (version >= 1) tagAllTimesteps = in.readBoolean ();

    size = in.readInt();
    if (size == NULL_INTEGER) {
      featurePatterns = null;
      offsets = null;
      isNonNegated = null;
    else {
      featurePatterns = new Pattern[size];
      offsets = new int[size];
      isNonNegated = new boolean[size];
      for (int i = 0; i < size; i++) {
        featurePatterns[i] = (Pattern) in.readObject();
        offsets[i] = in.readInt();
        isNonNegated[i] = in.readBoolean();
      }
    }
  }

}
TOP

Related Classes of cc.mallet.pipe.tsf.OffsetFeatureConjunction

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.