Examples of GroupBy


Examples of cascading.pipe.GroupBy

        // Limit to MAX_DISTRIBUTED_FETCH if running in real cluster,
        // or MAX_LOCAL_FETCH if running locally. So first we sort the entries
        // from high to low by links score.
        // TODO add unit test
        urlsToFetchPipe = new GroupBy(urlsToFetchPipe, new Fields(CrawlDbDatum.LINKS_SCORE_FIELD), true);
        long maxToFetch = isLocal ? MAX_LOCAL_FETCH : MAX_DISTRIBUTED_FETCH;
        urlsToFetchPipe = new Each(urlsToFetchPipe, new CreateUrlDatumFromCrawlDbDatum(maxToFetch));

        BaseScoreGenerator scorer = new LinkScoreGenerator();
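
In isolation, the descending sort above is just GroupBy's three-argument form, where the trailing boolean reverses the order of the grouping keys. A minimal sketch (the pipe and field names are placeholders, not from this source):

        Pipe scored = new Pipe("scored");
        // true == reverseOrder: grouping keys sort high-to-low
        scored = new GroupBy(scored, new Fields("score"), true);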

Examples of cascading.pipe.GroupBy

   
    public FetchPipe(Pipe urlProvider, BaseScoreGenerator scorer, BaseFetcher fetcher, BaseFetcher robotsFetcher, BaseRobotsParser parser,
                    BaseFetchJobPolicy fetchJobPolicy, int numReducers) {
        super(urlProvider);
        Pipe robotsPipe = new Each(urlProvider, new GroupFunction(new GroupByDomain()));
        robotsPipe = new GroupBy("Grouping URLs by IP/delay", robotsPipe, GroupedUrlDatum.getGroupingField());
        robotsPipe = new Every(robotsPipe, new FilterAndScoreByUrlAndRobots(robotsFetcher, parser, scorer), Fields.RESULTS);
       
        // Split into records for URLs that are special (not fetchable) and regular
        SplitterAssembly splitter = new SplitterAssembly(robotsPipe, new SplitIntoSpecialAndRegularKeys());
       
        // Now generate sets of URLs to fetch. We'll wind up with all URLs for the same server & the same crawl delay,
        // ordered by score, getting passed per list to the PreFetchBuffer. This will generate PreFetchDatums that contain a key
        // based on the hash of the IP address (with a range of values == number of reducers), plus a list of URLs and a target
        // crawl time.
        Pipe prefetchPipe = new GroupBy("Distributing URL sets", splitter.getRHSPipe(), GroupedUrlDatum.getGroupingField(), ScoredUrlDatum.getSortingField(), true);
       
        prefetchPipe = new Every(prefetchPipe, new MakeFetchSetsBuffer(fetchJobPolicy, numReducers), Fields.RESULTS);
        Pipe fetchPipe = new GroupBy("Fetching URL sets", prefetchPipe, FetchSetDatum.getGroupingField(), FetchSetDatum.getSortingField());
        fetchPipe = new Every(fetchPipe, new FetchBuffer(fetcher), Fields.RESULTS);

        Pipe fetchedContent = new Pipe(CONTENT_PIPE_NAME, new Each(fetchPipe, new FilterErrorsFunction()));
       
        Pipe fetchedStatus = new Pipe("fetched status", new Each(fetchPipe, new MakeStatusFunction()));
       
        // We need to merge URLs from the LHS of the splitter (never fetched) so that our status pipe
        // gets status for every URL we put into this sub-assembly.
        Pipe skippedStatus = new Pipe("skipped status", new Each(splitter.getLHSPipe(), new MakeSkippedStatus()));
       
        // TODO KKr You're already setting the group name here (so that the
        // tail pipe gets the same name), so I wasn't able to pass in a
        // group name here for BaseTool.nameFlowSteps to use for the job name.
        Pipe joinedStatus = new GroupBy(STATUS_PIPE_NAME, Pipe.pipes(skippedStatus, fetchedStatus), new Fields(StatusDatum.URL_FN));

        setTails(fetchedContent, joinedStatus);
    }
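
The named, five-argument GroupBy used twice above groups on one set of fields and secondary-sorts within each group on another. A minimal sketch (pipe and field names are placeholders, not Bixo's):

        Pipe urls = new Pipe("urls");
        urls = new GroupBy("group by host, best first", urls,
                           new Fields("host"),   // grouping key
                           new Fields("score"),  // secondary sort within each group
                           true);                // reverseOrder: highest score first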

Examples of cascading.pipe.GroupBy

    /*
    flow part #2
    create SINK tap to measure token frequency, which will need to be used to adjust
    stop words -- based on an R script
    */

    Pipe tokenPipe = new Pipe( "token", joinPipe ); // name branch
    tokenPipe = new GroupBy( tokenPipe, new Fields( "token" ) );
    tokenPipe = new Every( tokenPipe, Fields.ALL, new Count(), Fields.ALL );

    /*
    flow part #3
    generate an inverted index for ((uid1,uid2), token) to avoid having to perform
    a cross-product, which would impose a bottleneck in the parallelism
    */

    Pipe invertPipe = new Pipe( "inverted index", joinPipe );
    invertPipe = new CoGroup( invertPipe, new Fields( "token" ), 1, new Fields( "uid1", "ignore", "uid2", "token" ) );

    Fields filterArguments = new Fields( "uid1", "uid2" );
    String uidFilter = "uid1.compareToIgnoreCase( uid2 ) >= 0";
    invertPipe = new Each( invertPipe, filterArguments, new ExpressionFilter( uidFilter, String.class ) );
    Fields ignore = new Fields( "ignore" );
    invertPipe = new Discard( invertPipe, ignore );

    /*
    flow part #4
    count the number of tokens in common for each uid pair and apply a threshold
    */

    Pipe commonPipe = new GroupBy( new Pipe( "uid common", invertPipe ), new Fields( "uid1", "uid2" ) );
    commonPipe = new Every( commonPipe, Fields.ALL, new Count( new Fields( "common" ) ), Fields.ALL );

    String commonFilter = String.format( "common < %d", MIN_COMMON_TOKENS );
    commonPipe = new Each( commonPipe, new Fields( "common" ), new ExpressionFilter( commonFilter, Integer.TYPE ) );

    /*
    flow part #5
    count the number of tokens overall for each uid, then join to calculate
    the vector length for uid1
    */

    Fields tokenCount = new Fields( "token_count" );
    Pipe countPipe = new GroupBy( "count", joinPipe, new Fields( "uid" ) );
    countPipe = new Every( countPipe, Fields.ALL, new Count( tokenCount ), Fields.ALL );

    joinPipe = new CoGroup( countPipe, new Fields( "uid" ), commonPipe, new Fields( "uid1" ) );
    joinPipe = new Pipe( "common", joinPipe );
    joinPipe = new Discard( joinPipe, new Fields( "uid" ) );
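
The group/count/threshold idiom from flow part #4 above, reduced to a minimal sketch (the "key" field and the threshold of 10 are placeholders):

    Pipe grouped = new GroupBy( new Pipe( "grouped" ), new Fields( "key" ) );
    grouped = new Every( grouped, Fields.ALL, new Count( new Fields( "n" ) ), Fields.ALL );
    // ExpressionFilter drops each tuple for which the expression evaluates true
    grouped = new Each( grouped, new Fields( "n" ), new ExpressionFilter( "n < 10", Integer.TYPE ) );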

Examples of cascading.pipe.GroupBy

    Fields count = new Fields( "count" );
    wcPipe = new SumBy( wcPipe, tf_token, tf_count, count, long.class );
    wcPipe = new Rename( wcPipe, tf_token, token );

    // additionally, sort by count
    wcPipe = new GroupBy( wcPipe, count, count );

    // connect the taps, pipes, etc., into a flow
    FlowDef flowDef = FlowDef.flowDef()
     .setName( "tfidf" )
     .addSource( docPipe, docTap )
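
A sketch of how such a FlowDef is typically finished and run; the wcTap sink and the properties object below are assumptions, not part of this excerpt:

    flowDef = flowDef.addTailSink( wcPipe, wcTap ); // hypothetical sink tap
    Flow flow = new HadoopFlowConnector( properties ).connect( flowDef );
    flow.complete(); // run and block until the flow finishes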

Examples of co.nubetech.crux.model.GroupBy

      alias.setId(99999);
     
      RowAlias alias1 = new RowAlias();
      alias1.setId(19999);
     
      GroupBy groupBy = createGroupBy(1, alias);
      GroupBy groupBy1 = createGroupBy(2, alias1);
      List<GroupBy> groupByList = new ArrayList<GroupBy>();
      groupByList.add(groupBy);
      groupByList.add(groupBy1);
     
      Report report = new Report();
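
The createGroupBy(...) helper itself is outside this excerpt; a plausible shape, purely for illustration (the setters below are guesses at the Crux model API, mirroring the setId call above):

      private GroupBy createGroupBy(long id, RowAlias alias) {
        GroupBy groupBy = new GroupBy();
        groupBy.setId(id);          // assumed setter, like RowAlias.setId
        groupBy.setRowAlias(alias); // assumed link to the row alias
        return groupBy;
      }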

Examples of com.odiago.flumebase.parser.GroupBy

    Expr where = s.getWhereConditions();
    if (mOldChild == where) {
      s.setWhereConditions((Expr) mNewChild);
    }

    GroupBy groupBy = s.getGroupBy();
    if (mOldChild == groupBy) {
      s.setGroupBy((GroupBy) mNewChild);
    }

    Expr aggregateOver = s.getWindowOver();
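
The groupBy.accept(this) calls in these visitors rely on standard visitor double dispatch; a generic sketch of the mechanism (illustrative only, not flumebase's actual class hierarchy):

    interface Visitor {
      void visit(GroupBy groupBy); // one overload per node type
    }

    abstract class Node {
      abstract void accept(Visitor v);
    }

    class GroupBy extends Node {
      void accept(Visitor v) {
        v.visit(this); // dispatch on the concrete node type
      }
    }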

Examples of com.odiago.flumebase.parser.GroupBy

      before(s, where);
      where.accept(this);
      after(s, where);
    }

    GroupBy groupBy = s.getGroupBy();
    if (null != groupBy) {
      before(s, groupBy);
      groupBy.accept(this);
      after(s, groupBy);
    }

    Expr aggregateOver = s.getWindowOver();
    if (null != aggregateOver) {
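
The before/after calls bracket each child visit, so a subclass can hook the traversal without rewriting the walk itself. One possible use, with approximated signatures (not flumebase's actual API):

    private int mDepth = 0;

    protected void before(SQLStatement stmt, Object child) {
      mDepth++; // entering a child subtree
    }

    protected void after(SQLStatement stmt, Object child) {
      mDepth--; // leaving the child subtree
    }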

Examples of com.odiago.flumebase.parser.GroupBy


    // Visit the rest of the SELECT statement, including its upstream
    // sources (e.g., nested SELECT statements).
    s.getSource().accept(this);
    GroupBy groupBy = s.getGroupBy();
    if (null != groupBy) {
      groupBy.accept(this);
    }

    List<WindowDef> windowDefs = s.getWindowDefs();
    if (null != windowDefs) {
      for (WindowDef def : windowDefs) {

Examples of com.odiago.flumebase.parser.GroupBy

              + whereType);
        }
      }

      // Check the GROUP BY clause for validity if it's non-null.
      GroupBy groupBy = s.getGroupBy();
      if (null != groupBy) {
        groupBy.accept(this);
      }

      // Check the OVER clause for validity if it's non-null.
      // This must evaluate to a value of type WINDOW.
      Expr windowOver = s.getWindowOver();
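
For reference, the kind of rule such a GROUP BY validity check enforces, shown as standard SQL (an illustration; flumebase's exact rules are outside this excerpt):

      // SELECT host, COUNT(*) FROM logs GROUP BY host        -- valid
      // SELECT host, user, COUNT(*) FROM logs GROUP BY host  -- invalid:
      //   "user" is neither a grouping column nor inside an aggregate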

Examples of com.salesforce.phoenix.compile.GroupByCompiler.GroupBy

        ParseNode viewNode = SQLParser.parseCondition(table.getViewExpression());
        // Push VIEW expression into select
        select = SelectStatement.create(select, viewNode);
        Integer limit = LimitCompiler.compile(context, select);

        GroupBy groupBy = GroupByCompiler.compile(context, select);
        // Optimize the HAVING clause by finding any group by expressions that can be moved
        // to the WHERE clause
        select = HavingCompiler.rewrite(context, select, groupBy);
        Expression having = HavingCompiler.compile(context, select, groupBy);
        // Don't pass groupBy when building where clause expression, because we do not want to wrap these
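
The HAVING optimization described above moves predicates that reference only GROUP BY expressions into the WHERE clause, so rows are filtered before aggregation rather than after. An illustrative rewrite (example SQL, not Phoenix API code):

        // HAVING references only the group-by key, so it can become a WHERE:
        //   SELECT host, COUNT(*) FROM logs GROUP BY host HAVING host LIKE 'a%'
        // rewrites to
        //   SELECT host, COUNT(*) FROM logs WHERE host LIKE 'a%' GROUP BY host
        // A predicate over an aggregate (e.g. HAVING COUNT(*) > 10) must stay in HAVING.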