output.mkdirs();
int numCats = Integer.parseInt(getOption("categories"));
int cardinality = Integer.parseInt(getOption("cardinality", "100000"));
int threadCount = Integer.parseInt(getOption("threads", "20"));
int poolSize = Integer.parseInt(getOption("poolSize", "5"));
Dictionary asfDictionary = new Dictionary();
AdaptiveLogisticRegression learningAlgorithm = new AdaptiveLogisticRegression(numCats, cardinality, new L1(), threadCount, poolSize);
learningAlgorithm.setInterval(800);
learningAlgorithm.setAveragingWindow(500);
// The input has already been run through seq2encoded and split, so the first pass below only needs to build up the dictionary.
Configuration conf = new Configuration();
// Accepts only paths whose final name component contains "training".
PathFilter trainFilter = new PathFilter() {
  @Override
  public boolean accept(Path candidate) {
    String fileName = candidate.getName();
    return fileName.contains("training");
  }
};
SequenceFileDirIterator<Text, VectorWritable> iter = new SequenceFileDirIterator<Text, VectorWritable>(new Path(base.toString()), PathType.LIST, trainFilter,
null, true, conf);
long numItems = 0;
while (iter.hasNext()) {
Pair<Text, VectorWritable> next = iter.next();
asfDictionary.intern(next.getFirst().toString());
numItems++;
}
System.out.printf("%d training files\n", numItems);
SGDInfo info = new SGDInfo();
iter = new SequenceFileDirIterator<Text, VectorWritable>(new Path(base.toString()), PathType.LIST, trainFilter,
null, true, conf);
int k = 0;
while (iter.hasNext()) {
Pair<Text, VectorWritable> next = iter.next();
String ng = next.getFirst().toString();
int actual = asfDictionary.intern(ng);
// The vector is already encoded, so it is passed to the learner as-is.
learningAlgorithm.train(actual, next.getSecond().get());
k++;
State<AdaptiveLogisticRegression.Wrapper, CrossFoldLearner> best = learningAlgorithm.getBest();