Package cascading.pipe.assembly

Examples of cascading.pipe.assembly.SumBy


    // one branch counts the documents (D): Unique on doc_id, a tally of 1 per
    // tuple, then the tallies are summed into n_docs
    // (tokenPipe, doc_id and tally are declared outside this excerpt)
    Fields rhs_join = new Fields( "rhs_join" );
    Fields n_docs = new Fields( "n_docs" );
    Pipe dPipe = new Unique( "D", tokenPipe, doc_id );
    dPipe = new Each( dPipe, new Insert( tally, 1 ), Fields.ALL );
    // constant key (always 1); it is the RHS key of the HashJoin below
    dPipe = new Each( dPipe, new Insert( rhs_join, 1 ), Fields.ALL );
    // every tuple carries rhs_join == 1, so the tallies are summed under a single key
    dPipe = new SumBy( dPipe, rhs_join, tally, n_docs, long.class );

    // one branch tallies the token counts for document frequency (DF)
    Pipe dfPipe = new Unique( "DF", tokenPipe, Fields.ALL );
    Fields df_count = new Fields( "df_count" );
    dfPipe = new CountBy( dfPipe, token, df_count );

    Fields df_token = new Fields( "df_token" );
    Fields lhs_join = new Fields( "lhs_join" );
    // rename token to df_token, the key this branch is CoGrouped on further down
    dfPipe = new Rename( dfPipe, token, df_token );
    // constant key (always 1); it is the LHS key of the HashJoin below
    dfPipe = new Each( dfPipe, new Insert( lhs_join, 1 ), Fields.ALL );

    // join to bring together all the components for calculating TF-IDF
    // the D side of the join is smaller, so it goes on the RHS
    // both join keys are the constant 1, so each DF tuple is paired with the D branch output
    Pipe idfPipe = new HashJoin( dfPipe, lhs_join, dPipe, rhs_join );

    // the IDF side of the join is smaller, so it goes on the RHS
    // (tfPipe and tf_token are declared outside this excerpt)
    Pipe tfidfPipe = new CoGroup( tfPipe, tf_token, idfPipe, df_token );

    // calculate the TF-IDF weights, per token, per document
    Fields tfidf = new Fields( "tfidf" );
    // tfidf = tf_count * ln( n_docs / ( 1 + df_count ) ); Math.log is the natural logarithm
    String expression = "(double) tf_count * Math.log( (double) n_docs / ( 1.0 + df_count ) )";
    ExpressionFunction tfidfExpression = new ExpressionFunction( tfidf, expression, Double.class );
    // the argument selector names exactly the three fields the expression references
    Fields tfidfArguments = new Fields( "tf_count", "df_count", "n_docs" );
    tfidfPipe = new Each( tfidfPipe, tfidfArguments, tfidfExpression, Fields.ALL );

    // keep only tf_token, doc_id and tfidf, then rename tf_token back to token
    // (fieldSelector is assigned here but declared outside this excerpt)
    fieldSelector = new Fields( "tf_token", "doc_id", "tfidf" );
    tfidfPipe = new Retain( tfidfPipe, fieldSelector );
    tfidfPipe = new Rename( tfidfPipe, tf_token, token );

    // keep track of the word counts, which are useful for QA
    // a separately named branch ("wc") that continues from tfPipe
    Pipe wcPipe = new Pipe( "wc", tfPipe );

    Fields count = new Fields( "count" );
    // sum tf_count per tf_token into count, then rename the key back to token
    wcPipe = new SumBy( wcPipe, tf_token, tf_count, count, long.class );
    wcPipe = new Rename( wcPipe, tf_token, token );

    // additionally, sort by count
    // count is passed as both the grouping fields and the sort fields
    wcPipe = new GroupBy( wcPipe, count, count );

View Full Code Here


    // Both assemblies below are grouped on the "date" field.
    Fields groupingFields = new Fields( "date" );

    // Combine four aggregations in a single AggregateBy: "size" is summed twice
    // as double (into "size" and into "size2"), and two counts are written to
    // "sizes" and "sizes2". (assembly is declared outside this excerpt.)
    assembly = new AggregateBy(
      assembly,
      groupingFields,
      new SumBy( new Fields( "size" ), new Fields( "size" ), double.class ),
      new SumBy( new Fields( "size" ), new Fields( "size2" ), double.class ),
      new CountBy( new Fields( "sizes" ) ), new CountBy( new Fields( "sizes2" ) )

    );

    // The same grouping and the same four aggregations, applied to assembly2
    // (also declared outside this excerpt).
    assembly2 = new AggregateBy(
      assembly2,
      groupingFields,
      new SumBy( new Fields( "size" ), new Fields( "size" ), double.class ),
      new SumBy( new Fields( "size" ), new Fields( "size2" ), double.class ),
      new CountBy( new Fields( "sizes" ) ), new CountBy( new Fields( "sizes2" ) )

    );

    // Map from String keys to Tap instances; created empty here, and how it is
    // populated is not shown in this excerpt.
    Map<String, Tap> sinks = new HashMap<String, Tap>();
View Full Code Here

      // Pass the current pipe through stack.addDebug before building the aggregate.
      current = stack.addDebug( this, current );

      // Dispatch on the aggregation name and add the matching *By sub-assembly.
      // In this variant each aggregate is constructed directly on `current` with
      // the group, argument and result fields; COUNT additionally passes
      // CountBy.Include.NO_NULLS.
      // NOTE(review): the chain is cut off in this excerpt; the body of the AVG
      // branch (and any branches after it) is not visible.
      if( aggregationName.equals( "COUNT" ) )
        aggregates.add( new CountBy( current, groupFields, argFields, aggResultFields, CountBy.Include.NO_NULLS ) );
      else if( aggregationName.equals( "SUM" ) )
        aggregates.add( new SumBy( current, groupFields, argFields, aggResultFields ) );
      else if( aggregationName.equals( "MIN" ) )
        aggregates.add( new MinBy( current, groupFields, argFields, aggResultFields ) );
      else if( aggregationName.equals( "MAX" ) )
        aggregates.add( new MaxBy( current, groupFields, argFields, aggResultFields ) );
      else if( aggregationName.equals( "AVG" ) )
View Full Code Here

      // Result fields for this aggregate call, produced by makeFieldsFor
      // (defined outside this excerpt).
      Fields aggResultFields = makeFieldsFor( aggCall );

      // Dispatch on the aggregation name and add the matching *By instance.
      // In this variant the aggregates are constructed from argument and result
      // fields only (no pipe or group fields); COUNT additionally passes
      // CountBy.Include.NO_NULLS.
      // NOTE(review): the chain is cut off in this excerpt; the body of the AVG
      // branch (and any branches after it) is not visible.
      if( aggregationName.equals( "COUNT" ) )
        aggregates.add( new CountBy( argFields, aggResultFields, CountBy.Include.NO_NULLS ) );
      else if( aggregationName.equals( "SUM" ) )
        aggregates.add( new SumBy( argFields, aggResultFields ) );
      else if( aggregationName.equals( "MIN" ) )
        aggregates.add( new MinBy( argFields, aggResultFields ) );
      else if( aggregationName.equals( "MAX" ) )
        aggregates.add( new MaxBy( argFields, aggResultFields ) );
      else if( aggregationName.equals( "AVG" ) )
View Full Code Here

TOP

Related Classes of cascading.pipe.assembly.SumBy

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle, Inc. Contact coftware#gmail.com.