// Build the "inverted index" branch, which pairs up uids through a shared token.
// The argument selectors and the filter expression are declared up front.
Fields filterArguments = new Fields( "uid1", "uid2" );
String uidFilter = "uid1.compareToIgnoreCase( uid2 ) >= 0";
Fields ignore = new Fields( "ignore" );
// self-join the stream on "token"; joined tuples are declared as ( uid1, ignore, uid2, token )
Pipe invertPipe = new CoGroup(
  new Pipe( "inverted index", joinPipe ),
  new Fields( "token" ),
  1,
  new Fields( "uid1", "ignore", "uid2", "token" ) );
// filter out the pairs where uid1 compares >= uid2 (ignoring case), i.e. self-pairs and
// mirrored duplicates, then drop the placeholder "ignore" field
invertPipe = new Discard(
  new Each( invertPipe, filterArguments, new ExpressionFilter( uidFilter, String.class ) ),
  ignore );
/*
flow part #4
count the number of tokens in common for each uid pair and apply a threshold
*/
Pipe commonPipe = new GroupBy( new Pipe( "uid common", invertPipe ), new Fields( "uid1", "uid2" ) );
// count the tuples in each ( uid1, uid2 ) group into the "common" field
commonPipe = new Every( commonPipe, Fields.ALL, new Count( new Fields( "common" ) ), Fields.ALL );
// Format with an explicit Locale.US (as the similarity filter further down does): %d is
// localized, so a default locale with non-ASCII digits would yield an expression that
// cannot be compiled.
String commonFilter = String.format( Locale.US, "common < %d", MIN_COMMON_TOKENS );
// filter out the pairs whose shared-token count is below the threshold
commonPipe = new Each( commonPipe, new Fields( "common" ), new ExpressionFilter( commonFilter, Integer.TYPE ) );
/*
flow part #5
count the tokens for each uid ("token_count"), then join those counts onto
the common-token pairs by uid1, so that the vector length for uid1 can be
calculated
*/
Fields tokenCount = new Fields( "token_count" );
// group the ( uid, token ) stream by uid and count each group
Pipe countPipe = new Every(
  new GroupBy( "count", joinPipe, new Fields( "uid" ) ),
  Fields.ALL,
  new Count( tokenCount ),
  Fields.ALL );
// join the per-uid counts to the pair stream where uid == uid1; the branch is named "common"
joinPipe = new Pipe(
  "common",
  new CoGroup( countPipe, new Fields( "uid" ), commonPipe, new Fields( "uid1" ) ) );
// "uid" is no longer needed after the join; the joined count becomes "token_count1"
joinPipe = new Rename(
  new Discard( joinPipe, new Fields( "uid" ) ),
  tokenCount,
  new Fields( "token_count1" ) );
/*
flow part #6
join the per-uid token counts a second time, now on uid2, to be able to
calculate the vector length for uid2; remove instances where one uid
merely retweets another; then calculate an Ochiai similarity metric to
find the nearest "neighbors" for each uid -- as recommended users to
"follow"
*/
Fields expressionArguments = new Fields( "token_count1", "token_count2", "common" );
Fields ochiaiArguments = new Fields( "uid1", "token_count1", "uid2", "token_count2", "common" );
Fields resultFields = new Fields( "uid", "recommend_uid", "similarity" );
// join where uid == uid2, then rename the newly joined count to "token_count2"
joinPipe = new Rename(
  new CoGroup( "similarity", countPipe, new Fields( "uid" ), joinPipe, new Fields( "uid2" ) ),
  tokenCount,
  new Fields( "token_count2" ) );
// use a DEBUG to check the values in the tuple stream; turn off in the FLOWDEF below
joinPipe = new Each( joinPipe, DebugLevel.VERBOSE, new Debug( true ) );
// filter out the pairs where the common count equals either uid's total token count
commonFilter = "( token_count1 == common ) || ( token_count2 == common )";
joinPipe = new Each(
  joinPipe,
  expressionArguments,
  new ExpressionFilter( commonFilter, Integer.TYPE ) );
// keep only the function results: ( uid, recommend_uid, similarity )
joinPipe = new Each(
  joinPipe,
  ochiaiArguments,
  new OchiaiFunction( resultFields ),
  Fields.RESULTS );
/*
flow part #7
apply thresholds to filter out poor recommendations: anything whose
similarity falls outside [MIN_SIMILARITY, MAX_SIMILARITY] is removed
*/
// the bounds are formatted with Locale.US so the decimal separator is always '.'
commonFilter = String.format(
  Locale.US,
  "similarity < %f || similarity > %f",
  MIN_SIMILARITY,
  MAX_SIMILARITY );
Fields similarityArguments = new Fields( "similarity" );
joinPipe = new Each(
  joinPipe,
  similarityArguments,
  new ExpressionFilter( commonFilter, Double.TYPE ) );
/*
connect up all the flow, generate a flow diagram, then run the flow.
results for recommended users get stored in the "similarityPath" sink tap.
*/