/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
This software is provided under the terms of the Common Public License,
version 1.0, as published by http://www.opensource.org. For further
information, see the file `LICENSE' included with this distribution. */
package cc.mallet.pipe;
import java.util.logging.*;
import java.lang.reflect.Array;
import cc.mallet.pipe.Pipe;
import cc.mallet.types.Alphabet;
import cc.mallet.types.FeatureVector;
import cc.mallet.types.Instance;
import cc.mallet.types.Labeling;
import cc.mallet.util.CharSequenceLexer;
import cc.mallet.util.MalletLogger;
/**
 * Converts a string of comma-separated values to an array of doubles. To be
 * used prior to {@link Array2FeatureVector}. Note that this class assumes
 * that each position in the line corresponds to a feature index
 * (i.e. a "dense" representation), e.g.:
 * <pre>
 *   instance 1: 1,0,0,1,0,0,1   &lt;&lt; feature alphabet size = 7
 *   instance 2: 0,0,1,0,0,0,1   &lt;&lt; feature alphabet size = 7
 * </pre>
 *
 * @author Aron Culotta
 */
public class Csv2Array extends Pipe {
CharSequenceLexer lexer;
int numberFeatures = -1;
private static Logger logger = MalletLogger.getLogger(Csv2Array.class.getName());
public Csv2Array () {
this.lexer = new CharSequenceLexer ("([^,]+)");
public Csv2Array (String regex) {
this.lexer = new CharSequenceLexer (regex);
public Csv2Array (CharSequenceLexer l) {
this.lexer = l;
/** Convert the data in an <CODE>Instance</CODE> from a CharSequence
* of comma-separated-values to an array, where each index is the
* feature name.
public Instance pipe( Instance carrier ) {
CharSequence c = (CharSequence)carrier.getData();
int nf = countNumberFeatures (c);
if (numberFeatures == -1) // first instance seen
numberFeatures = nf;
else if (numberFeatures != nf)
throw new IllegalArgumentException ("Instances must have same-length feature vectors. length_i: " + numberFeatures + " length_j: " + nf);
double[] feats = new double[numberFeatures];
lexer.setCharSequence (c);
int i=0;
while (lexer.hasNext())
feats[i++] = Double.parseDouble ((String)lexer.next());
carrier.setData (feats);
return carrier;
private int countNumberFeatures (CharSequence c) {
String s = c.toString();
int ret = 0;
int pos = 0;
while ((pos = s.indexOf (",", pos) + 1) != 0)
return ret+1;