package com.manning.hip.ch12.crunch;
import com.cloudera.crunch.*;
import com.cloudera.crunch.impl.mr.MRPipeline;
import com.cloudera.crunch.type.writable.Writables;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import java.io.IOException;
public class SimpleTokenize {
public static void main(String[] args) throws IOException {
Configuration conf = new Configuration();
Path output = new Path(args[1]);
output.getFileSystem(conf).delete(output, true);
Pipeline pipeline = new MRPipeline(SimpleTokenize.class, conf);
PCollection<String> lines = pipeline.readTextFile(args[0]);
PCollection<String> words = lines.parallelDo(
"tokenize",
new DoFn<String, String>() {
@Override
public void process(String line,
Emitter<String> emitter) {
for (String word : StringUtils.split(line)) {
emitter.emit(word);
}
}
}, Writables.strings()); // Indicates the serialization format
pipeline.writeTextFile(words, args[1]);
pipeline.done();
}
}