create SINK tap to measure token frequency, which will need to be used to adjust
stop words -- based on an R script
*/
Pipe tokenPipe = new Pipe( "token", joinPipe ); // name branch
tokenPipe = new GroupBy( tokenPipe, new Fields( "token" ) );
tokenPipe = new Every( tokenPipe, Fields.ALL, new Count(), Fields.ALL );
/*
flow part #3
generate an inverted index for ((uid1,uid2), token) to avoid having to perform
a cross-product, which would impose a bottleneck in the parallelism
*/
// Field selectors and the filter expression used by the steps below.
Fields filterArguments = new Fields( "uid1", "uid2" );
Fields ignore = new Fields( "ignore" );
String uidFilter = "uid1.compareToIgnoreCase( uid2 ) >= 0";

// Self-join the "inverted index" branch of joinPipe on "token" (one self join),
// declaring the joined tuple as (uid1, ignore, uid2, token).
Pipe invertPipe = new CoGroup(
  new Pipe( "inverted index", joinPipe ),
  new Fields( "token" ),
  1,
  new Fields( "uid1", "ignore", "uid2", "token" ) );

// ExpressionFilter removes the tuples for which the expression is true, so only
// pairs with uid1 ordered strictly before uid2 (ignoring case) are kept.
invertPipe = new Each( invertPipe, filterArguments, new ExpressionFilter( uidFilter, String.class ) );

// The "ignore" column from the self-join is no longer needed.
invertPipe = new Discard( invertPipe, ignore );
/*
flow part #4
count the number of tokens in common for each uid pair and apply a threshold
*/
// Group the inverted index by (uid1, uid2) and count the tuples in each group
// into a "common" field, i.e. the number of tokens the pair shares.
Pipe commonPipe = new GroupBy( new Pipe( "uid common", invertPipe ), new Fields( "uid1", "uid2" ) );
commonPipe = new Every( commonPipe, Fields.ALL, new Count( new Fields( "common" ) ), Fields.ALL );

// Build the filter text with plain concatenation rather than String.format:
// "%d" localizes digits for the JVM's default locale, which could turn the
// threshold into something that is not a valid integer literal in the expression.
String commonFilter = "common < " + MIN_COMMON_TOKENS;

// ExpressionFilter removes tuples for which the expression is true, so pairs
// with fewer than MIN_COMMON_TOKENS shared tokens are dropped here.
commonPipe = new Each( commonPipe, new Fields( "common" ), new ExpressionFilter( commonFilter, Integer.TYPE ) );
/*
flow part #5
count the number of tokens overall for each uid, then join to calculate
the vector length for uid1
*/
// Output field that receives the per-uid tuple count.
Fields tokenCount = new Fields( "token_count" );

// "count" branch: group joinPipe by uid and count each group's tuples into token_count.
Pipe countPipe = new Every(
  new GroupBy( "count", joinPipe, new Fields( "uid" ) ),
  Fields.ALL,
  new Count( tokenCount ),
  Fields.ALL );

// Join the per-uid counts to the uid pairs on uid == uid1, name the result
// "common", then drop the "uid" join key from the count side (uid1 remains).
joinPipe = new Discard(
  new Pipe( "common", new CoGroup( countPipe, new Fields( "uid" ), commonPipe, new Fields( "uid1" ) ) ),
  new Fields( "uid" ) );