Source Code of cc.mallet.pipe.TokenSequenceParseFeatureString

/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
   http://www.cs.umass.edu/~mccallum/mallet
   This software is provided under the terms of the Common Public License,
   version 1.0, as published by http://www.opensource.org.  For further
   information, see the file `LICENSE' included with this distribution. */








package cc.mallet.pipe;


import java.util.regex.Pattern;
import java.util.regex.Matcher;
import java.util.logging.*;
import java.io.*;


import cc.mallet.types.Instance;
import cc.mallet.types.Token;
import cc.mallet.types.TokenSequence;
import cc.mallet.util.MalletLogger;


/**


   Convert the string in each field <code>Token.text</code> to a list
   of Strings (space delimited). Add each string as a feature to the
   token. If <code>realValued</code> is true, then treat the position
   in the list as the feature name and the value as a
   double. Otherwise, the feature name is the string itself and the
   value is 1.0.
     <p>
   Modified to allow feature names and values to be specified, e.g.:
   <code>featureName1=featureValue1 featureName2=featureValue2 ...</code>
   The name/value separator (here '=') can be specified.
     <p>
   *  If your data consists of feature/value pairs (eg <code>height=10.7 width=3.6 length=1.7</code>),
   *   use <code>new TokenSequenceParseFeatureString(true, true, "=")</code>. This
   *   format is typically used for sparse data, in which most features are equal to 0 in
   *   any given instance.
   *  <p>
   *  If your data consists only of values, and the position determines which feature
   *   the value is for (eg <code>10.7  3.6  1.7</code>),
   *   use <code>new TokenSequenceParseFeatureString(true)</code>. 
   *   This format is typically used for data that has a small number of features
   *   that all have non-zero values most of the time.
   *  <p>
   *  If your data is in the form of named binary indicator variables
   *   (eg <code>yellow quacks has_webbed_feet</code>), use the constructor
   *   <code>new TokenSequenceParseFeatureString(false)</code>. Each token will be 
   *   interpreted as the name of a feature, whose value is 1.0.


   @author Aron Culotta <a href="mailto:culotta@cs.umass.edu">culotta@cs.umass.edu</a>
*/


public class TokenSequenceParseFeatureString extends Pipe implements Serializable
{
  boolean realValued; // are these real-valued features?
  boolean specifyFeatureNames; // are the feature names given as well?
  String nameValueSeparator; // what separates the name from the value? (CAN'T BE WHITESPACE!)
  
  /**
   *  @param _realValued interpret each data token as a double, and associate it with a 
   *     feature called "Feature#K" where K is the order of the token, starting with 0. 
   *     Note that this option is currently ignored if <code>_specifyFeatureNames</code> is true.
   *  @param _specifyFeatureNames interpret each data token as a feature name/value pair,
   *     separated by some delimiter, which is the equals sign ("=") unless otherwise specified.
   *  @param _nameValueSeparator use a string other than = to separate name/value pairs. Colon (":") is
   *     a common choice. Note that this string cannot consist of any whitespace, as the tokens stream
   *     will already have been split.
   */
  public TokenSequenceParseFeatureString (boolean _realValued, boolean _specifyFeatureNames, String _nameValueSeparator) {
    this.realValued = _realValued;
    if (_nameValueSeparator.trim().length()==0) {
      throw new IllegalArgumentException ("nameValueSeparator can't be whitespace");
    }
    nameValueSeparator = _nameValueSeparator;
    this.specifyFeatureNames = _specifyFeatureNames;
  }


  public TokenSequenceParseFeatureString (boolean _realValued, boolean _specifyFeatureNames) {
    this (_realValued, _specifyFeatureNames, "=");
  }
  
  public TokenSequenceParseFeatureString (boolean _realValued) {
    this (_realValued, false, "=");
  }


  
  public Instance pipe (Instance carrier) {
    TokenSequence ts = (TokenSequence) carrier.getData ();
    for (int i=0; i < ts.size(); i++) {
      Token t = ts.get (i);
      String[] values = t.getText().split("\\s+");
      for (int j=0; j < values.length; j++) {
        if (specifyFeatureNames) {
          String[] nameAndValue = values[j].split(nameValueSeparator);            
          if (nameAndValue.length != 2) { // no feature name. use token as feature.
            t.setFeatureValue ("Token="+values[j], 1.0);
          }
          else {
            t.setFeatureValue (nameAndValue[0], Double.parseDouble (nameAndValue[1]));            
          }
        }
        else if (realValued) {
          t.setFeatureValue ("Feature#" + j, Double.parseDouble (values[j]));
        }
        else
          t.setFeatureValue (values[j], 1.0);          
      }
    }
    carrier.setData (ts);
    return carrier;
  }


  // Serialization 
  
  private static final long serialVersionUID = 1;
  private static final int CURRENT_SERIAL_VERSION = 1;
  
  private void writeObject (ObjectOutputStream out) throws IOException {
    out.writeInt (CURRENT_SERIAL_VERSION);
    out.writeBoolean (realValued);
    out.writeBoolean (specifyFeatureNames);
    out.writeObject (nameValueSeparator);
  }
  
  private void readObject (ObjectInputStream in) throws IOException, ClassNotFoundException {
    int version = in.readInt ();
    realValued = in.readBoolean ();
    if (version >= CURRENT_SERIAL_VERSION) {
      specifyFeatureNames = in.readBoolean();
      nameValueSeparator = (String)in.readObject();
    }
  }
}
Source Code of cc.mallet.pipe.TokenSequenceParseFeatureString

Related Classes of cc.mallet.pipe.TokenSequenceParseFeatureString