/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
http://www.cs.umass.edu/~mccallum/mallet
This software is provided under the terms of the Common Public License,
version 1.0, as published by http://www.opensource.org. For further
information, see the file `LICENSE' included with this distribution. */
package cc.mallet.pipe;
import java.util.logging.*;
import java.lang.reflect.Array;
import cc.mallet.pipe.Pipe;
import cc.mallet.types.Alphabet;
import cc.mallet.types.FeatureVector;
import cc.mallet.types.Instance;
import cc.mallet.types.Labeling;
import cc.mallet.util.CharSequenceLexer;
import cc.mallet.util.MalletLogger;
/**
Converts a string of comma separated values to an array. To be used
prior to {@link Array2FeatureVector}. Note that this class assumes
that each location of the line corresponds to a feature index
(i.e. "dense" representation) eg:
instance 1: 1,0,0,1,0,0,1 << feature alphabet size = 7
instance 2: 0,0,1,0,0,0,1 << feature alphabet size = 7
@author Aron Culotta
*/
public class Csv2Array extends Pipe {
CharSequenceLexer lexer;
int numberFeatures = -1;
private static Logger logger = MalletLogger.getLogger(Csv2Array.class.getName());
public Csv2Array () {
this.lexer = new CharSequenceLexer ("([^,]+)");
}
public Csv2Array (String regex) {
this.lexer = new CharSequenceLexer (regex);
}
public Csv2Array (CharSequenceLexer l) {
this.lexer = l;
}
/** Convert the data in an <CODE>Instance</CODE> from a CharSequence
* of comma-separated-values to an array, where each index is the
* feature name.
*/
public Instance pipe( Instance carrier ) {
CharSequence c = (CharSequence)carrier.getData();
int nf = countNumberFeatures (c);
if (numberFeatures == -1) // first instance seen
numberFeatures = nf;
else if (numberFeatures != nf)
throw new IllegalArgumentException ("Instances must have same-length feature vectors. length_i: " + numberFeatures + " length_j: " + nf);
double[] feats = new double[numberFeatures];
lexer.setCharSequence (c);
int i=0;
while (lexer.hasNext())
feats[i++] = Double.parseDouble ((String)lexer.next());
carrier.setData (feats);
return carrier;
}
private int countNumberFeatures (CharSequence c) {
String s = c.toString();
int ret = 0;
int pos = 0;
while ((pos = s.indexOf (",", pos) + 1) != 0)
ret++;
return ret+1;
}
}