Examples of cascading.pipe.joiner.InnerJoin

cascading.pipe.joiner.InnerJoin
Class InnerJoin will return an {@link Iterator} that will iterate over a given {@link Joiner} and return tuples that representand inner join of the CoGrouper internal grouped tuple collections.
Joins perform based on the equality of the join keys. In the case of null values, Java treats two null values as equivalent. SQL does not treat null values as equal. To produce SQL like results in a given join, a new {@link java.util.Comparator} will need to be used on the joined values to prevent null fromequaling null. As a convenience, see the {@link cascading.util.NullNotEquivalentComparator} class.

        MultiSourceTap dRM1Source = getTaps(dRM1InputPath, inFieldsDRM1);
        MultiSourceTap dRM2Source = getTaps(dRM2InputPath, inFieldsDRM2);


        Pipe lhs = new Pipe("DRM1");
        Pipe rhs = new Pipe("DRM2");
        Pipe groupByItemIDPipe = new CoGroup(lhs, common, rhs, common, grouped, new InnerJoin());
        groupByItemIDPipe = new Each(groupByItemIDPipe, new VectorsToCSVFunction(joinedOutFields));
        //the DRMs (Mahout Distributed Row Matrices) have row and items indexes the two dictionary BiHashMaps
        //pass these to the output function so the strings from the indexes can be written instead of the
        //binary values of the Keys and Vectors in the DRMs
        groupByItemIDPipe.getStepConfigDef().setProperty("itemIndexPath", itemIndexPath.toString());

View Full Code Here

    ExpressionFunction exprFunc = new ExpressionFunction( new Fields( "tree_species" ), expression, String.class );
    treePipe = new Each( treePipe, new Fields( "scrub_species" ), exprFunc, Fields.ALL );


    // join with tree metadata
    Pipe metaTreePipe = new Pipe( "meta_tree" );
    treePipe = new HashJoin( treePipe, new Fields( "tree_species" ), metaTreePipe, new Fields( "species" ), new InnerJoin() );
    treePipe = new Rename( treePipe, new Fields( "blurb" ), new Fields( "tree_name" ) );


    regex = "^(\\S+),(\\S+),(\\S+)\\s*$";
    int[] gpsGroups = { 1, 2, 3 };
    parser = new RegexParser( new Fields( "tree_lat", "tree_lng", "tree_alt" ), regex, gpsGroups );
    treePipe = new Each( treePipe, new Fields( "geo" ), parser, Fields.ALL );


    // determine a tree geohash
    Fields geohashArguments = new Fields( "tree_lat", "tree_lng" );
    treePipe = new Each( treePipe, geohashArguments, new GeoHashFunction( new Fields( "tree_geohash" ), 6 ), Fields.ALL );


    Fields fieldSelector = new Fields( "tree_name", "priv", "tree_id", "situs", "tree_site", "species", "wikipedia", "calflora", "min_height", "max_height", "tree_lat", "tree_lng", "tree_alt", "tree_geohash" );
    treePipe = new Retain( treePipe, fieldSelector );


    // parse the "road" output
    Pipe roadPipe = new Pipe( "road", tsvCheck );
    regex = "^\\s+Sequence\\:.*\\s+Year Constructed\\:\\s+(\\d+)\\s+Traffic Count\\:\\s+(\\d+)\\s+Traffic Index\\:\\s+(\\w.*\\w)\\s+Traffic Class\\:\\s+(\\w.*\\w)\\s+Traffic Date.*\\s+Paving Length\\:\\s+(\\d+)\\s+Paving Width\\:\\s+(\\d+)\\s+Paving Area\\:\\s+(\\d+)\\s+Surface Type\\:\\s+(\\w.*\\w)\\s+Surface Thickness.*\\s+Bike Lane\\:\\s+(\\w+)\\s+Bus Route\\:\\s+(\\w+)\\s+Truck Route\\:\\s+(\\w+)\\s+Remediation.*$";
    roadPipe = new Each( roadPipe, new Fields( "misc" ), new RegexFilter( regex ) );
    Fields roadFields = new Fields( "year_construct", "traffic_count", "traffic_index", "traffic_class", "paving_length", "paving_width", "paving_area", "surface_type", "bike_lane", "bus_route", "truck_route" );
    int[] roadGroups = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 };
    parser = new RegexParser( roadFields, regex, roadGroups );
    roadPipe = new Each( roadPipe, new Fields( "misc" ), parser, Fields.ALL );


    // join with road metadata
    Pipe metaRoadPipe = new Pipe( "meta_road" );
    roadPipe = new HashJoin( roadPipe, new Fields( "surface_type" ), metaRoadPipe, new Fields( "pavement_type" ), new InnerJoin() );
    roadPipe = new Rename( roadPipe, new Fields( "blurb" ), new Fields( "road_name" ) );


    // estimate albedo based on the road surface age and pavement type
    Fields albedoArguments = new Fields( "year_construct", "albedo_new", "albedo_worn" );
    roadPipe = new Each( roadPipe, albedoArguments, new AlbedoFunction( new Fields( "albedo" ), 2002 ), Fields.ALL );


    // generate road segments, with midpoint, y=mx+b, and road_geohash for each
    Fields segmentArguments = new Fields( "geo" );
    Fields segmentResults = new Fields( "lat0", "lng0", "alt0", "lat1", "lng1", "alt1", "lat_mid", "lng_mid" );
    roadPipe = new Each( roadPipe, segmentArguments, new RoadSegmentFunction( segmentResults ), Fields.ALL );


    geohashArguments = new Fields( "lat_mid", "lng_mid" );
    roadPipe = new Each( roadPipe, geohashArguments, new GeoHashFunction( new Fields( "road_geohash" ), 6 ), Fields.ALL );


    fieldSelector = new Fields( "road_name", "year_construct", "traffic_count", "traffic_index", "traffic_class", "paving_length", "paving_width", "paving_area", "surface_type", "bike_lane", "bus_route", "truck_route", "albedo", "lat0", "lng0", "alt0", "lat1", "lng1", "alt1", "road_geohash" );
    roadPipe = new Retain( roadPipe, fieldSelector );


    // join the tree and road pipes to estimate shade
    Pipe shadePipe = new Pipe( "shade", roadPipe );
    shadePipe = new CoGroup( shadePipe, new Fields( "road_geohash" ), treePipe, new Fields( "tree_geohash" ), new InnerJoin() );


    // calculate a rough estimate for distance from tree to road, then filter for "< ~1 block"
    Fields treeDistArguments = new Fields( "tree_lat", "tree_lng", "lat0", "lng0", "lat1", "lng1" );
    Fields tree_dist = new Fields( "tree_dist" );
    shadePipe = new Each( shadePipe, treeDistArguments, new TreeDistanceFunction( tree_dist ), Fields.ALL );


    ExpressionFilter distFilter = new ExpressionFilter( "tree_dist > 25.0", Double.class );
    shadePipe = new Each( shadePipe, tree_dist, distFilter );


    // checkpoint this (big) calculation too
    fieldSelector = new Fields( "road_name", "year_construct", "traffic_count", "traffic_index", "traffic_class", "paving_length", "paving_width", "paving_area", "surface_type", "bike_lane", "bus_route", "truck_route", "albedo", "lat0", "lng0", "lat1", "lng1", "tree_name", "priv", "tree_id", "situs", "tree_site", "species", "wikipedia", "calflora", "min_height", "max_height", "tree_lat", "tree_lng", "tree_alt", "tree_dist", "tree_geohash" );
    shadePipe = new Retain( shadePipe, fieldSelector );
    shadePipe = new GroupBy( shadePipe, new Fields( "tree_name" ), new Fields( "tree_dist" ) );


    Checkpoint shadeCheck = new Checkpoint( "shade", shadePipe );


    // determine the geohash for GPS tracks log events
    Pipe logsPipe = new Pipe( "logs" );
    geohashArguments = new Fields( "lat", "lng" );
    logsPipe = new Each( logsPipe, geohashArguments, new GeoHashFunction( new Fields( "gps_geohash" ), 6 ), Fields.ALL );


    // prepare data for recommendations
    // NB: RHS is large given the sample data, but in practice the logs on the LHS could be much larger
    Pipe recoPipe = new Pipe( "reco", logsPipe );
    recoPipe = new CoGroup( recoPipe, new Fields( "gps_geohash" ), shadeCheck, new Fields( "tree_geohash" ), new InnerJoin() );


    // connect the taps, pipes, etc., into a flow
    FlowDef flowDef = FlowDef.flowDef()
     .setName( "copa" )
     .addSource( gisPipe, gisTap )

View Full Code Here

    Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "cross" ), SinkMode.REPLACE );


    Pipe pipeLower = new Each( "lhs", new Fields( "line" ), new RegexSplitter( new Fields( "numLHS", "charLHS" ), " " ) );
    Pipe pipeUpper = new Each( "rhs", new Fields( "line" ), new RegexSplitter( new Fields( "numRHS", "charRHS" ), " " ) );


    Pipe cross = new CoGroup( pipeLower, new Fields( "numLHS" ), pipeUpper, new Fields( "numRHS" ), new InnerJoin() );


    Flow flow = getPlatform().getFlowConnector().connect( sources, sink, cross );


    flow.complete();

View Full Code Here

    Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );


    Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
    Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter );


    Pipe splice = new CoGroup( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), new InnerJoin( Fields.size( 4 ) ) );


    Map<Object, Object> properties = getProperties();


    // make sure hasher is getting called, but does nothing special
    FlowProps.setDefaultTupleElementComparator( properties, getPlatform().getStringComparator( false ).getClass().getCanonicalName() );

View Full Code Here

    results.add( new Tuple( "4", "d1", "4", "D1" ) );
    results.add( new Tuple( "4", "d2", "4", "D1" ) );
    results.add( new Tuple( "4", "d3", "4", "D1" ) );
    results.add( new Tuple( null, "h1", null, "H1" ) );


    handleJoins( "cogroupinner", new InnerJoin(), results, 8, false, null );
    handleJoins( "cogroupinner-resultgroup", new InnerJoin(), results, 8, true, null );
    }

View Full Code Here

    results.add( new Tuple( "2", "b1", "2", "B3" ) );
    results.add( new Tuple( "4", "d1", "4", "D1" ) );
    results.add( new Tuple( "4", "d2", "4", "D1" ) );
    results.add( new Tuple( "4", "d3", "4", "D1" ) );


    handleJoins( "cogroupinnernull", new InnerJoin(), results, 9, false, new NullNotEquivalentComparator() );
    handleJoins( "cogroupinnernull-resultgroup", new InnerJoin(), results, 9, true, new NullNotEquivalentComparator() );
    }

View Full Code Here


    fielded = new Each( fielded, new Fields( "line" ), new RegexSplitter( "\\s" ) );


//    fielded = new Each( fielded, new Debug( true ) );


    Pipe inner = new CoGroup( "inner", fielded, new Fields( 0 ), uniques, new Fields( "word" ), new InnerJoin() );
    Pipe outer = new CoGroup( "outer", fielded, new Fields( 0 ), uniques, new Fields( "word" ), new OuterJoin() );
    Pipe left = new CoGroup( "left", fielded, new Fields( 0 ), uniques, new Fields( "word" ), new LeftJoin() );
    Pipe right = new CoGroup( "right", fielded, new Fields( 0 ), uniques, new Fields( "word" ), new RightJoin() );


    Pipe[] heads = Pipe.pipes( uniques, fielded );

View Full Code Here

    Pipe join;


    if( isMerge && isGroup )
      join = new GroupBy( Pipe.pipes( pipeLower, pipeUpper ), numLHS );
    else if( !isMerge && isGroup )
      join = new CoGroup( pipeLower, numLHS, pipeUpper, numRHS, declaredFields, new InnerJoin() );
    else if( isMerge && !isGroup )
      join = new Merge( pipeLower, pipeUpper );
    else
      join = new HashJoin( pipeLower, numLHS, pipeUpper, numRHS, declaredFields, new InnerJoin() );


    Flow flow = null;
    try
      {
      flow = getPlatform().getFlowConnector().connect( sources, sink, join );

View Full Code Here

      if( sortFields != null )
        this.sortFieldsMap.put( pipe.getName(), sortFields );
      }


    this.reverseOrder = reverseOrder;
    this.joiner = new InnerJoin();
    }

View Full Code Here

    if( isJoin() && joiner instanceof BufferJoin )
      throw new IllegalArgumentException( "invalid joiner, may not use BufferJoiner in a HashJoin" );


    if( joiner == null )
      {
      joiner = new InnerJoin();
      return;
      }


    if( joiner.numJoins() == -1 )
      return;

View Full Code Here

0 1

TOP

Related Classes of cascading.pipe.joiner.InnerJoin

cascading.CoGroupFieldedPipesPlatformTest

cascading.flow.hadoop.BuildJobsHadoopPlatformTest

cascading.JoinFieldedPipesPlatformTest

cascading.lingual.optiq.CascadingAggregateRel

cascading.lingual.optiq.CascadingJoinRel

cascading.pipe.Splice

cascading.tuple.Tuple

cascading.TypedFieldedPipesPlatformTest

com.liveramp.cascading_ext.assembly.BloomAssembly

copa.Main

All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.