Package cc.mallet.pipe

Source Code of cc.mallet.pipe.TokenSequenceParseFeatureString

/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
   http://www.cs.umass.edu/~mccallum/mallet
   This software is provided under the terms of the Common Public License,
   version 1.0, as published by http://www.opensource.org.  For further
   information, see the file `LICENSE' included with this distribution. */




package cc.mallet.pipe;

import java.util.regex.Pattern;
import java.util.regex.Matcher;
import java.util.logging.*;
import java.io.*;

import cc.mallet.types.Instance;
import cc.mallet.types.Token;
import cc.mallet.types.TokenSequence;
import cc.mallet.util.MalletLogger;

/**

   Convert the string in each field <code>Token.text</code> to a list
   of Strings (space delimited). Add each string as a feature to the
   token. If <code>realValued</code> is true, then treat the position
   in the list as the feature name and the value as a
   double. Otherwise, the feature name is the string itself and the
   value is 1.0.
     <p>
   Modified to allow feature names and values to be specified, e.g.:
   featureName1=featureValue1 featureName2=featureValue2 ...
   The name/value separator (here '=') can be specified.
     <p>
   *  If your data consists of feature/value pairs (eg <code>height=10.7 width=3.6 length=1.7</code>),
   *   use <code>new TokenSequenceParseFeatureString(true, true, "=")</code>. This
   *   format is typically used for sparse data, in which most features are equal to 0 in
   *   any given instance.
   *  <p>
   *  If your data consists only of values, and the position determines which feature
   *   the value is for (eg <code>10.7  3.6  1.7</code>),
   *   use <code>new TokenSequenceParseFeatureString(true)</code>.
   *   This format is typically used for data that has a small number of features
   *   that all have non-zero values most of the time.
   *  <p>
   *  If your data is in the form of named binary indicator variables
   *   (eg <code>yellow quacks has_webbed_feet</code>), use the constructor
   *   <code>new TokenSequenceParseFeatureString(false)</code>. Each token will be
   *   interpreted as the name of a feature, whose value is 1.0.

   @author Aron Culotta <a href="mailto:culotta@cs.umass.edu">culotta@cs.umass.edu</a>
*/

public class TokenSequenceParseFeatureString extends Pipe implements Serializable
{
  boolean realValued; // are these real-valued features?
  boolean specifyFeatureNames; // are the feature names given as well?
  String nameValueSeparator; // what separates the name from the value? (CAN'T BE WHITESPACE!)
 
  /**
   *  @param _realValued interpret each data token as a double, and associate it with a
   *     feature called "Feature#K" where K is the order of the token, starting with 0.
   *     Note that this option is currently ignored if <code>_specifyFeatureNames</code> is true.
   *  @param _specifyFeatureNames interpret each data token as a feature name/value pair,
   *     separated by some delimiter, which is the equals sign ("=") unless otherwise specified.
   *  @param _nameValueSeparator use a string other than = to separate name/value pairs. Colon (":") is
   *     a common choice. Note that this string cannot consist of any whitespace, as the tokens stream
   *     will already have been split.
   */
  public TokenSequenceParseFeatureString (boolean _realValued, boolean _specifyFeatureNames, String _nameValueSeparator) {
    this.realValued = _realValued;
    if (_nameValueSeparator.trim().length()==0) {
      throw new IllegalArgumentException ("nameValueSeparator can't be whitespace");
    }
    nameValueSeparator = _nameValueSeparator;
    this.specifyFeatureNames = _specifyFeatureNames;
  }

  public TokenSequenceParseFeatureString (boolean _realValued, boolean _specifyFeatureNames) {
    this (_realValued, _specifyFeatureNames, "=");
  }
 
  public TokenSequenceParseFeatureString (boolean _realValued) {
    this (_realValued, false, "=");
  }

 
  public Instance pipe (Instance carrier) {
    TokenSequence ts = (TokenSequence) carrier.getData ();
    for (int i=0; i < ts.size(); i++) {
      Token t = ts.get (i);
      String[] values = t.getText().split("\\s+");
      for (int j=0; j < values.length; j++) {
        if (specifyFeatureNames) {
          String[] nameAndValue = values[j].split(nameValueSeparator);           
          if (nameAndValue.length != 2) { // no feature name. use token as feature.
            t.setFeatureValue ("Token="+values[j], 1.0);
          }
          else {
            t.setFeatureValue (nameAndValue[0], Double.parseDouble (nameAndValue[1]));           
          }
        }
        else if (realValued) {
          t.setFeatureValue ("Feature#" + j, Double.parseDouble (values[j]));
        }
        else
          t.setFeatureValue (values[j], 1.0);         
      }
    }
    carrier.setData (ts);
    return carrier;
  }

  // Serialization
 
  private static final long serialVersionUID = 1;
  private static final int CURRENT_SERIAL_VERSION = 1;
 
  private void writeObject (ObjectOutputStream out) throws IOException {
    out.writeInt (CURRENT_SERIAL_VERSION);
    out.writeBoolean (realValued);
    out.writeBoolean (specifyFeatureNames);
    out.writeObject (nameValueSeparator);
  }
 
  private void readObject (ObjectInputStream in) throws IOException, ClassNotFoundException {
    int version = in.readInt ();
    realValued = in.readBoolean ();
    if (version >= CURRENT_SERIAL_VERSION) {
      specifyFeatureNames = in.readBoolean();
      nameValueSeparator = (String)in.readObject();
    }
  }
}
TOP

Related Classes of cc.mallet.pipe.TokenSequenceParseFeatureString

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.