// Source code of cc.mallet.pipe.SvmLight2FeatureVectorAndLabel

/* Copyright (C) 2010 Univ. of Massachusetts Amherst, Computer Science Dept.
   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
   http://www.cs.umass.edu/~mccallum/mallet
   This software is provided under the terms of the Common Public License,
   version 1.0, as published by http://www.opensource.org.  For further
   information, see the file `LICENSE' included with this distribution. */


package cc.mallet.pipe;


import cc.mallet.pipe.Pipe;
import cc.mallet.types.Alphabet;
import cc.mallet.types.FeatureVector;
import cc.mallet.types.Instance;
import cc.mallet.types.Label;
import cc.mallet.types.LabelAlphabet;


/**
 * This Pipe converts a line in SVMLight format to 
 * a Mallet instance with FeatureVector data and 
 * Label target.  The expected format is
 * 
 * target feature:value feature:value ...
 * 
 * targets and features can be indices, as in 
 * SVMLight, or Strings.
 * 
 * Note that if targets and features are indices,
 * their indices in the data and target Alphabets
 * may be different, though the data will be
 * equivalent.  
 * 
 * @author Gregory Druck
 *
 */
public class SvmLight2FeatureVectorAndLabel extends Pipe {


  private static final long serialVersionUID = 1L;
  
  public SvmLight2FeatureVectorAndLabel () {
    super (new Alphabet(), new LabelAlphabet());
  }
  
  // There is no guarantee that the feature indices in the text
  // file will be the same as in the pipe.  The data should be
  // exactly the same, however, just permuted.  
  @Override public Instance pipe(Instance carrier) {
    // we expect the data for each instance to be
    // a line from the SVMLight format text file    
    String dataStr = (String)carrier.getData();


    // ignore comments at the end
    if (dataStr.contains("#")) {
      dataStr = dataStr.substring(0, dataStr.indexOf('#'));
    }


    String[] terms = dataStr.split("\\s+");
    
    String classStr = terms[0];
    // In SVMLight +1 and 1 are the same label.  
    // Adding a special case to normalize...
    if (classStr.equals("+1")) {
      classStr = "1";
    }
    Label label = ((LabelAlphabet)getTargetAlphabet()).lookupLabel(classStr, true);
    carrier.setTarget(label);
    
    // the rest are feature-value pairs
    int numFeatures = terms.length - 1;
    int[] indices = new int[numFeatures];
    double[] values = new double[numFeatures];
    for (int termIndex = 1; termIndex < terms.length; termIndex++) {
      if (!terms[termIndex].equals("")) {
        String[] s = terms[termIndex].split(":");
        String feature = s[0];
        indices[termIndex-1] = getDataAlphabet().lookupIndex(feature, true);       
        values[termIndex-1] = Double.parseDouble(s[1]);
      }
    }
    
    FeatureVector fv = new FeatureVector(getDataAlphabet(), indices, values);
    carrier.setData(fv);
    return carrier;
  }
}
// Source code of cc.mallet.pipe.SvmLight2FeatureVectorAndLabel

// Related classes of cc.mallet.pipe.SvmLight2FeatureVectorAndLabel