// Select the two uid columns as the arguments handed to the filter expression below.
Fields filterArguments = new Fields( "uid1", "uid2" );
// The expression is true when uid1 sorts at or after uid2, ignoring case.
// NOTE(review): ExpressionFilter is documented to drop tuples for which the expression is
// true, which would leave only pairs with uid1 strictly before uid2 (no self-pairs and no
// mirrored duplicates) -- confirm against the Cascading version in use.
String uidFilter = "uid1.compareToIgnoreCase( uid2 ) >= 0";
invertPipe = new Each( invertPipe, filterArguments, new ExpressionFilter( uidFilter, String.class ) );
// Remove the "ignore" column from the stream. Where that field comes from is not visible in
// this section; presumably it is a placeholder created earlier in the flow -- verify.
Fields ignore = new Fields( "ignore" );
invertPipe = new Discard( invertPipe, ignore );
/*
flow part #4
count the number of tokens in common for each uid pair and apply a threshold
*/
// Start a new branch named "uid common" from invertPipe and group it by the (uid1, uid2) pair.
Pipe commonPipe = new GroupBy( new Pipe( "uid common", invertPipe ), new Fields( "uid1", "uid2" ) );
// Run Count over each pair's group; the result is declared as a field named "common".
commonPipe = new Every( commonPipe, Fields.ALL, new Count( new Fields( "common" ) ), Fields.ALL );
// Build the threshold expression from MIN_COMMON_TOKENS, which is declared outside this section.
// NOTE(review): the expression is true for pairs BELOW the threshold; ExpressionFilter is
// documented to remove tuples for which the expression is true, so pairs with at least
// MIN_COMMON_TOKENS tokens in common should survive -- confirm.
String commonFilter = String.format( "common < %d", MIN_COMMON_TOKENS );
// Only the "common" field is passed to the expression; Integer.TYPE (primitive int) is given
// as the type for it.
commonPipe = new Each( commonPipe, new Fields( "common" ), new ExpressionFilter( commonFilter, Integer.TYPE ) );
/*
flow part #5
count the number of tokens overall for each uid, then join to calculate
the vector length for uid1
*/
// Name of the per-uid count column produced below.
Fields tokenCount = new Fields( "token_count" );
// Group the existing joinPipe (assigned earlier, outside this section) by "uid" in a pipe
// named "count". This must read joinPipe before it is reassigned a few lines below.
Pipe countPipe = new GroupBy( "count", joinPipe, new Fields( "uid" ) );
// Run Count over each uid's group; the result is declared as the "token_count" field.
countPipe = new Every( countPipe, Fields.ALL, new Count( tokenCount ), Fields.ALL );
// Join the per-uid counts (keyed on "uid") with the thresholded pair stream (keyed on "uid1").
// From here on joinPipe refers to this joined stream, not to the pipe used as input above.
joinPipe = new CoGroup( countPipe, new Fields( "uid" ), commonPipe, new Fields( "uid1" ) );
// Give the joined stream the pipe name "common".
joinPipe = new Pipe( "common", joinPipe );
// "uid" was the join key matched against "uid1", so it is dropped after the join.
joinPipe = new Discard( joinPipe, new Fields( "uid" ) );
// Rename the count to "token_count1" to mark it as the count belonging to uid1.
// NOTE(review): presumably this frees the "token_count" name for a matching uid2 join in the
// next flow part -- that code is outside this section, so verify.
joinPipe = new Rename( joinPipe, tokenCount, new Fields( "token_count1" ) );
/*
flow part #6 join to be able to calculate the vector length for