File transformedOutput = tmpDir.getFile("transformed-output");
File tfOutput = tmpDir.getFile("tf-output");
PCollection<String> docs = pipeline.readTextFile(input);
PTypeFamily ptf = docs.getTypeFamily();
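// Derive all PTypes below from the same type family (Writable or Avro) as
// the input collection.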
/*
 * Input: String, one document per line in the form "title<TAB>text".
 *
 * Output: PTable<Pair<String, String>, Long>, i.e.
 * Pair<Pair<word, title>, count of word in that document>.
 */
PTable<Pair<String, String>, Long> tf = Aggregate.count(docs.parallelDo("term document frequency",
new DoFn<String, Pair<String, String>>() {
@Override
public void process(String doc, Emitter<Pair<String, String>> emitter) {
// Split only on the first tab so that tabs inside the body text are kept.
String[] kv = doc.split("\t", 2);
String title = kv[0];
String text = kv[1];
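// Tokenize on runs of non-word characters and emit one (word, title) pair
// per occurrence; the surrounding Aggregate.count sums these into
// per-document counts.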
for (String word : text.split("\\W+")) {
if (!word.isEmpty()) {
Pair<String, String> pair = Pair.of(word.toLowerCase(Locale.ENGLISH), title);
emitter.emit(pair);
}
}
}
}, ptf.pairs(ptf.strings(), ptf.strings())));
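// Example: the line "doc1\tThe cat sat" yields the pairs (the, doc1),
// (cat, doc1), (sat, doc1), which Aggregate.count collapses into
// ((the, doc1), 1), ((cat, doc1), 1), ((sat, doc1), 1).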
if (transformTF) {
/*
 * Input: Pair<Pair<String, String>, Long>, i.e.
 * Pair<Pair<word, title>, count of word in that document>.
 *
 * Output: PTable<String, Pair<String, Long>>, i.e.
 * PTable<word, Pair<title, count of word in that document>>.
 */
PTable<String, Pair<String, Long>> wordDocumentCountPair = tf.parallelDo("transform wordDocumentCountPair",
new MapFn<Pair<Pair<String, String>, Long>, Pair<String, Pair<String, Long>>>() {
@Override
public Pair<String, Pair<String, Long>> map(Pair<Pair<String, String>, Long> input) {
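// Re-key by word alone so that downstream stages can group every
// (title, count) entry belonging to the same word.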
Pair<String, String> wordDocumentPair = input.first();
return Pair.of(wordDocumentPair.first(), Pair.of(wordDocumentPair.second(), input.second()));
}
}, ptf.tableOf(ptf.strings(), ptf.pairs(ptf.strings(), ptf.longs())));
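// Persist the re-keyed table as text so the intermediate result can be
// inspected on disk.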
pipeline.writeTextFile(wordDocumentCountPair, transformedOutput.getAbsolutePath());
}
SourceTarget<String> st = At.textFile(tfOutput.getAbsolutePath());
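// `st` targets the term-frequency output; a minimal sketch of how the elided
// remainder of this method would presumably use it (an assumption, not shown
// in this fragment):
//
//   pipeline.write(tf, st);                          // materialize tf as text
//   PCollection<String> tfLines = pipeline.read(st); // read it back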