/*
* Copyright (c) 2007-2014 Concurrent, Inc. All Rights Reserved.
*
* Project and contact information: http://www.cascading.org/
*
* This file is part of the Cascading project.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package cascading;
import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import cascading.cascade.Cascades;
import cascading.flow.Flow;
import cascading.flow.FlowConnectorProps;
import cascading.flow.FlowProps;
import cascading.operation.Debug;
import cascading.operation.Function;
import cascading.operation.Identity;
import cascading.operation.Insert;
import cascading.operation.aggregator.Count;
import cascading.operation.aggregator.First;
import cascading.operation.filter.Sample;
import cascading.operation.regex.RegexFilter;
import cascading.operation.regex.RegexSplitGenerator;
import cascading.operation.regex.RegexSplitter;
import cascading.pipe.CoGroup;
import cascading.pipe.Each;
import cascading.pipe.Every;
import cascading.pipe.GroupBy;
import cascading.pipe.Pipe;
import cascading.pipe.assembly.Discard;
import cascading.pipe.joiner.InnerJoin;
import cascading.pipe.joiner.Joiner;
import cascading.pipe.joiner.LeftJoin;
import cascading.pipe.joiner.MixedJoin;
import cascading.pipe.joiner.OuterJoin;
import cascading.pipe.joiner.RightJoin;
import cascading.tap.SinkMode;
import cascading.tap.Tap;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import cascading.util.NullNotEquivalentComparator;
import org.junit.Test;
import static data.InputData.*;
public class CoGroupFieldedPipesPlatformTest extends PlatformTestCase
{
/**
 * Creates the test case by handing {@code true}, {@code 4} and {@code 1} to the parent constructor.
 * <p/>
 * NOTE(review): presumably the flag turns cluster mode on and the two numbers size the test
 * cluster -- confirm against the PlatformTestCase constructor.
 */
public CoGroupFieldedPipesPlatformTest()
{
super( true, 4, 1 ); // leave cluster testing enabled
}
/**
 * Splits the "lhs" and "rhs" text files into number/character pairs, joins them on the number
 * with an {@link InnerJoin}, validates a length of 37 and looks for the lines that pair
 * "1 a" with "1 A" and with "1 B".
 *
 * @throws Exception if the flow cannot be set up or run
 */
@Test
public void testCross() throws Exception
{
  getPlatform().copyFromLocal( inputFileLhs );
  getPlatform().copyFromLocal( inputFileRhs );

  Map taps = new HashMap();

  taps.put( "lhs", getPlatform().getTextFile( inputFileLhs ) );
  taps.put( "rhs", getPlatform().getTextFile( inputFileRhs ) );

  Tap output = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "cross" ), SinkMode.REPLACE );

  // each side declares its own field names so the join result has no collisions
  Function splitLhs = new RegexSplitter( new Fields( "numLHS", "charLHS" ), " " );
  Function splitRhs = new RegexSplitter( new Fields( "numRHS", "charRHS" ), " " );

  Pipe lhs = new Each( "lhs", new Fields( "line" ), splitLhs );
  Pipe rhs = new Each( "rhs", new Fields( "line" ), splitRhs );

  Pipe joined = new CoGroup( lhs, new Fields( "numLHS" ), rhs, new Fields( "numRHS" ), new InnerJoin() );

  Flow flow = getPlatform().getFlowConnector().connect( taps, output, joined );

  flow.complete();

  validateLength( flow, 37, null );

  List<Tuple> lines = getSinkAsList( flow );

  assertTrue( lines.contains( new Tuple( "1\ta\t1\tA" ) ) );
  assertTrue( lines.contains( new Tuple( "1\ta\t1\tB" ) ) );
}
/**
 * Co-groups the "lower" and "upper" files on "num" with an {@link InnerJoin} declaring four
 * result fields, while a default tuple element comparator is configured through
 * {@link FlowProps}. Validates a length of 5 and checks two of the joined lines.
 *
 * @throws Exception if the flow cannot be set up or run
 */
@Test
public void testCoGroup() throws Exception
{
  getPlatform().copyFromLocal( inputFileLower );
  getPlatform().copyFromLocal( inputFileUpper );

  Map taps = new HashMap();

  taps.put( "lower", getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower ) );
  taps.put( "upper", getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper ) );

  Tap output = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "cogroup" ), SinkMode.REPLACE );

  // the same splitter instance is applied to both branches
  Function lineSplitter = new RegexSplitter( new Fields( "num", "char" ), " " );

  Pipe lower = new Each( new Pipe( "lower" ), new Fields( "line" ), lineSplitter );
  Pipe upper = new Each( new Pipe( "upper" ), new Fields( "line" ), lineSplitter );

  Pipe joined = new CoGroup( lower, new Fields( "num" ), upper, new Fields( "num" ), new InnerJoin( Fields.size( 4 ) ) );

  Map<Object, Object> flowProperties = getProperties();

  // make sure hasher is getting called, but does nothing special
  String comparatorClassName = getPlatform().getStringComparator( false ).getClass().getCanonicalName();

  FlowProps.setDefaultTupleElementComparator( flowProperties, comparatorClassName );

  Flow flow = getPlatform().getFlowConnector( flowProperties ).connect( taps, output, joined );

  flow.complete();

  validateLength( flow, 5 );

  List<Tuple> lines = getSinkAsList( flow );

  assertTrue( lines.contains( new Tuple( "1\ta\t1\tA" ) ) );
  assertTrue( lines.contains( new Tuple( "2\tb\t2\tB" ) ) );
}
/**
 * Co-groups two branches whose head pipes ("lower", "upper") are both wrapped in a pipe named
 * "same" before being split and renamed to "left" and "right". The joined stream is renamed
 * to "splice" and then "tail". Validates a length of 5 and checks two of the joined lines.
 *
 * @throws Exception if the flow cannot be set up or run
 */
@Test
public void testCoGroupSamePipeName() throws Exception
{
  getPlatform().copyFromLocal( inputFileLower );
  getPlatform().copyFromLocal( inputFileUpper );

  Map taps = new HashMap();

  taps.put( "lower", getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower ) );
  taps.put( "upper", getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper ) );

  Tap output = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "renamedpipes" ), SinkMode.REPLACE );

  Function lineSplitter = new RegexSplitter( new Fields( "num", "char" ), " " );

  // lower branch: head -> "same" -> split -> "left"
  // the "same" pipe hides the source name, and could cause one branch to be lost
  Pipe lowerHead = new Pipe( "lower" );
  Pipe lowerSame = new Pipe( "same", lowerHead );
  Pipe lowerSplit = new Each( lowerSame, new Fields( "line" ), lineSplitter );
  Pipe left = new Pipe( "left", lowerSplit );

  // upper branch: head -> "same" -> split -> "right"
  Pipe upperHead = new Pipe( "upper" );
  Pipe upperSame = new Pipe( "same", upperHead );
  Pipe upperSplit = new Each( upperSame, new Fields( "line" ), lineSplitter );
  Pipe right = new Pipe( "right", upperSplit );

  Pipe joined = new CoGroup( left, new Fields( "num" ), right, new Fields( "num" ), Fields.size( 4 ) );
  Pipe tail = new Pipe( "tail", new Pipe( "splice", joined ) );

  Flow flow = getPlatform().getFlowConnector().connect( taps, output, tail );

  flow.complete();

  validateLength( flow, 5 );

  List<Tuple> lines = getSinkAsList( flow );

  assertTrue( lines.contains( new Tuple( "1\ta\t1\tA" ) ) );
  assertTrue( lines.contains( new Tuple( "2\tb\t2\tB" ) ) );
}
/**
 * Co-groups two branches whose splitters declare {@link Fields#UNKNOWN}, so the grouping
 * fields are selected by position ({@code new Fields( 0 )}) rather than by name.
 * Validates a length of 5 and checks two of the joined lines.
 *
 * @throws Exception if the flow cannot be set up or run
 */
@Test
public void testCoGroupWithUnknowns() throws Exception
{
  getPlatform().copyFromLocal( inputFileLower );
  getPlatform().copyFromLocal( inputFileUpper );

  Map taps = new HashMap();

  taps.put( "lower", getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower ) );
  taps.put( "upper", getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper ) );

  Tap output = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "unknown" ), SinkMode.REPLACE );

  // no field names are declared for the split values
  Function lineSplitter = new RegexSplitter( Fields.UNKNOWN, " " );

  Pipe lower = new Each( new Pipe( "lower" ), new Fields( "line" ), lineSplitter );
  Pipe upper = new Each( new Pipe( "upper" ), new Fields( "line" ), lineSplitter );

  Pipe joined = new CoGroup( lower, new Fields( 0 ), upper, new Fields( 0 ), Fields.size( 4 ) );

  Flow flow = getPlatform().getFlowConnector().connect( taps, output, joined );

  flow.complete();

  validateLength( flow, 5 );

  List<Tuple> lines = getSinkAsList( flow );

  assertTrue( lines.contains( new Tuple( "1\ta\t1\tA" ) ) );
  assertTrue( lines.contains( new Tuple( "2\tb\t2\tB" ) ) );
}
/**
 * Filters every value out of the "upper" branch on purpose, so that the grouped intermediate
 * result is empty, and then co-groups that empty branch with the "lower" branch using an
 * {@link OuterJoin}. The lower values are expected to come through padded with "null".
 *
 * @throws Exception if the flow cannot be set up or run
 */
@Test
public void testCoGroupFilteredBranch() throws Exception
{
  getPlatform().copyFromLocal( inputFileLower );
  getPlatform().copyFromLocal( inputFileUpper );

  Map taps = new HashMap();

  taps.put( "lower", getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower ) );
  taps.put( "upper", getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper ) );

  Tap output = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "cogroupfilteredbranch" ), SinkMode.REPLACE );

  Function lineSplitter = new RegexSplitter( new Fields( "num", "char" ), " " );

  Pipe lower = new Each( new Pipe( "lower" ), new Fields( "line" ), lineSplitter );

  Pipe upperSplit = new Each( new Pipe( "upper" ), new Fields( "line" ), lineSplitter );
  Pipe upperFiltered = new Each( upperSplit, new Fields( "num" ), new RegexFilter( "^fobar" ) ); // intentionally filtering all
  Pipe upper = new GroupBy( upperFiltered, new Fields( "num" ) );

  Pipe joined = new CoGroup( lower, new Fields( "num" ), upper, new Fields( "num" ), Fields.size( 4 ), new OuterJoin() );

  Flow flow = getPlatform().getFlowConnector().connect( taps, output, joined );

  flow.complete();

  validateLength( flow, 5 );

  List<Tuple> lines = getSinkAsList( flow );

  assertTrue( lines.contains( new Tuple( "1\ta\tnull\tnull" ) ) );
  assertTrue( lines.contains( new Tuple( "2\tb\tnull\tnull" ) ) );
}
/**
 * Co-groups the lower-case file with itself: both the "lower" and the "upper" source read
 * {@code inputFileLower}. Validates a length of 5 and checks that lines pair each value
 * with its own copy.
 *
 * @throws Exception if the flow cannot be set up or run
 */
@Test
public void testCoGroupSelf() throws Exception
{
  getPlatform().copyFromLocal( inputFileLower );

  Map taps = new HashMap();

  // both taps deliberately read the same file
  taps.put( "lower", getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower ) );
  taps.put( "upper", getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower ) );

  Tap output = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "cogroupself" ), SinkMode.REPLACE );

  Function lineSplitter = new RegexSplitter( new Fields( "num", "char" ), " " );

  Pipe lower = new Each( new Pipe( "lower" ), new Fields( "line" ), lineSplitter );
  Pipe upper = new Each( new Pipe( "upper" ), new Fields( "line" ), lineSplitter );

  Pipe joined = new CoGroup( lower, new Fields( "num" ), upper, new Fields( "num" ), Fields.size( 4 ) );

  Flow flow = getPlatform().getFlowConnector().connect( taps, output, joined );

  flow.complete();

  validateLength( flow, 5 );

  List<Tuple> lines = getSinkAsList( flow );

  assertTrue( lines.contains( new Tuple( "1\ta\t1\ta" ) ) );
  assertTrue( lines.contains( new Tuple( "2\tb\t2\tb" ) ) );
}
/**
 * Method testCoGroupAfterEvery co-groups two branches that each end in a GroupBy followed by an
 * Every; it is meant to test that a tmp tap is inserted after the Every in the cogroup join.
 *
 * @throws Exception if the flow cannot be set up or run
 */
@Test
public void testCoGroupAfterEvery() throws Exception
{
  getPlatform().copyFromLocal( inputFileLower );
  getPlatform().copyFromLocal( inputFileUpper );

  Map taps = new HashMap();

  taps.put( "lower", getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower ) );
  taps.put( "upper", getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper ) );

  Tap output = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "afterevery" ), SinkMode.REPLACE );

  Function lineSplitter = new RegexSplitter( new Fields( "num", "char" ), " " );

  // lower branch: split -> group on "num" -> First over "char"
  Pipe lowerSplit = new Each( new Pipe( "lower" ), new Fields( "line" ), lineSplitter );
  Pipe lowerGrouped = new GroupBy( lowerSplit, new Fields( "num" ) );
  Pipe lower = new Every( lowerGrouped, new Fields( "char" ), new First(), Fields.ALL );

  // upper branch: same shape as the lower one
  Pipe upperSplit = new Each( new Pipe( "upper" ), new Fields( "line" ), lineSplitter );
  Pipe upperGrouped = new GroupBy( upperSplit, new Fields( "num" ) );
  Pipe upper = new Every( upperGrouped, new Fields( "char" ), new First(), Fields.ALL );

  Pipe joined = new CoGroup( lower, new Fields( "num" ), upper, new Fields( "num" ), Fields.size( 4 ) );

  Flow flow = getPlatform().getFlowConnector().connect( taps, output, joined );

  flow.complete();

  validateLength( flow, 5, null );

  List<Tuple> lines = getSinkAsList( flow );

  assertTrue( lines.contains( new Tuple( "1\ta\t1\tA" ) ) );
  assertTrue( lines.contains( new Tuple( "2\tb\t2\tB" ) ) );
}
/**
 * Tests that CoGroup properly resolves fields when following an Every and no declared fields
 * are given to the CoGroup. The two branches use distinct field names (num1/char1 and
 * num2/char2), and the lower branch additionally inserts four constant fields before grouping.
 *
 * @throws Exception if the flow cannot be set up or run
 */
@Test
public void testCoGroupAfterEveryNoDeclared() throws Exception
{
  getPlatform().copyFromLocal( inputFileLower );
  getPlatform().copyFromLocal( inputFileUpper );

  Map taps = new HashMap();

  taps.put( "lower", getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower ) );
  taps.put( "upper", getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper ) );

  Tap output = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "aftereverynodeclared" ), SinkMode.REPLACE );

  // lower branch: split -> insert constants -> group on "num1" -> First over "char1"
  Function lowerSplitter = new RegexSplitter( new Fields( "num1", "char1" ), " " );
  Pipe lowerSplit = new Each( new Pipe( "lower" ), new Fields( "line" ), lowerSplitter );
  Pipe lowerPadded = new Each( lowerSplit, new Insert( new Fields( "one", "two", "three", "four" ), "one", "two", "three", "four" ), Fields.ALL );
  Pipe lowerGrouped = new GroupBy( lowerPadded, new Fields( "num1" ) );
  Pipe lower = new Every( lowerGrouped, new Fields( "char1" ), new First(), Fields.ALL );

  // upper branch: split -> group on "num2" -> First over "char2"
  Function upperSplitter = new RegexSplitter( new Fields( "num2", "char2" ), " " );
  Pipe upperSplit = new Each( new Pipe( "upper" ), new Fields( "line" ), upperSplitter );
  Pipe upperGrouped = new GroupBy( upperSplit, new Fields( "num2" ) );
  Pipe upper = new Every( upperGrouped, new Fields( "char2" ), new First(), Fields.ALL );

  Pipe joined = new CoGroup( lower, new Fields( "num1" ), upper, new Fields( "num2" ) );

  Flow flow = getPlatform().getFlowConnector().connect( taps, output, joined );

  flow.complete();

  validateLength( flow, 5, null );

  List<Tuple> lines = getSinkAsList( flow );

  assertTrue( lines.contains( new Tuple( "1\ta\t1\tA" ) ) );
  assertTrue( lines.contains( new Tuple( "2\tb\t2\tB" ) ) );
}
/**
 * Co-groups two branches that each keep only their number field ("num1" and "num2"), then
 * applies a {@link Count} to every group. Validates a length of 2 and requires the lines
 * "1\t1\t1" and "5\t5\t2" to be present in the sink.
 *
 * @throws Exception if the flow cannot be set up or run
 */
@Test
public void testCoGroupInnerSingleField() throws Exception
{
  getPlatform().copyFromLocal( inputFileLowerOffset );
  getPlatform().copyFromLocal( inputFileUpper );

  Map taps = new HashMap();

  taps.put( "lower", getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLowerOffset ) );
  taps.put( "upper", getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper ) );

  Tap output = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "cogroupinnersingle" ), SinkMode.REPLACE );

  // only the number column of each split line is passed downstream
  Pipe lower = new Each( new Pipe( "lower" ), new Fields( "line" ), new RegexSplitter( new Fields( "num1", "char" ), " " ), new Fields( "num1" ) );
  Pipe upper = new Each( new Pipe( "upper" ), new Fields( "line" ), new RegexSplitter( new Fields( "num2", "char" ), " " ), new Fields( "num2" ) );

  Pipe joined = new CoGroup( lower, new Fields( "num1" ), upper, new Fields( "num2" ) );
  Pipe counted = new Every( joined, new Count() );

  Flow flow = getPlatform().getFlowConnector().connect( taps, output, counted );

  flow.complete();

  validateLength( flow, 2, null );

  // start from the expected lines and strike out every one found in the sink
  Set<Tuple> missing = new HashSet<Tuple>();

  missing.add( new Tuple( "1\t1\t1" ) );
  missing.add( new Tuple( "5\t5\t2" ) );

  List<Tuple> sinkLines = getSinkAsList( flow );

  missing.removeAll( sinkLines );

  assertEquals( 0, missing.size() );
}
/**
 * Inner join on "num" with the default comparator, run once without and once with result
 * group fields; 8 groups are passed to {@code handleJoins}.
 * <p/>
 * Left-hand side input:
 * <pre>
 * 1 a1
 * 1 a2
 * 1 a3
 * 2 b1
 * 3 c1
 * 4 d1
 * 4 d2
 * 4 d3
 * 5 e1
 * 5 e2
 * 5 e3
 * 7 g1
 * 7 g2
 * 7 g3
 * 7 g4
 * 7 g5
 * null h1
 * </pre>
 * Right-hand side input:
 * <pre>
 * 1 A1
 * 1 A2
 * 1 A3
 * 2 B1
 * 2 B2
 * 2 B3
 * 4 D1
 * 6 F1
 * 6 F2
 * null H1
 * </pre>
 * Expected result:
 * <pre>
 * 1 a1 1 A1
 * 1 a1 1 A2
 * 1 a1 1 A3
 * 1 a2 1 A1
 * 1 a2 1 A2
 * 1 a2 1 A3
 * 1 a3 1 A1
 * 1 a3 1 A2
 * 1 a3 1 A3
 * 2 b1 2 B1
 * 2 b1 2 B2
 * 2 b1 2 B3
 * 4 d1 4 D1
 * 4 d2 4 D1
 * 4 d3 4 D1
 * null h1 null H1
 * </pre>
 *
 * @throws Exception if either flow cannot be set up or run
 */
@Test
public void testCoGroupInner() throws Exception
{
  String[][] rows = {
    {"1", "a1", "1", "A1"},
    {"1", "a1", "1", "A2"},
    {"1", "a1", "1", "A3"},
    {"1", "a2", "1", "A1"},
    {"1", "a2", "1", "A2"},
    {"1", "a2", "1", "A3"},
    {"1", "a3", "1", "A1"},
    {"1", "a3", "1", "A2"},
    {"1", "a3", "1", "A3"},
    {"2", "b1", "2", "B1"},
    {"2", "b1", "2", "B2"},
    {"2", "b1", "2", "B3"},
    {"4", "d1", "4", "D1"},
    {"4", "d2", "4", "D1"},
    {"4", "d3", "4", "D1"},
    {null, "h1", null, "H1"}
  };

  HashSet<Tuple> expected = new HashSet<Tuple>();

  // the cast spreads each row over the Tuple's elements instead of nesting the array
  for( String[] row : rows )
    expected.add( new Tuple( (Object[]) row ) );

  handleJoins( "cogroupinner", new InnerJoin(), expected, 8, false, null );
  handleJoins( "cogroupinner-resultgroup", new InnerJoin(), expected, 8, true, null );
}
/**
 * Inner join on "num" with a {@link NullNotEquivalentComparator} on the grouping field, run
 * once without and once with result group fields; 9 groups are passed to {@code handleJoins}.
 * The rows with a null number do not appear in the expected result.
 * <p/>
 * Left-hand side input:
 * <pre>
 * 1 a1
 * 1 a2
 * 1 a3
 * 2 b1
 * 3 c1
 * 4 d1
 * 4 d2
 * 4 d3
 * 5 e1
 * 5 e2
 * 5 e3
 * 7 g1
 * 7 g2
 * 7 g3
 * 7 g4
 * 7 g5
 * null h1
 * </pre>
 * Right-hand side input:
 * <pre>
 * 1 A1
 * 1 A2
 * 1 A3
 * 2 B1
 * 2 B2
 * 2 B3
 * 4 D1
 * 6 F1
 * 6 F2
 * null H1
 * </pre>
 * Expected result:
 * <pre>
 * 1 a1 1 A1
 * 1 a1 1 A2
 * 1 a1 1 A3
 * 1 a2 1 A1
 * 1 a2 1 A2
 * 1 a2 1 A3
 * 1 a3 1 A1
 * 1 a3 1 A2
 * 1 a3 1 A3
 * 2 b1 2 B1
 * 2 b1 2 B2
 * 2 b1 2 B3
 * 4 d1 4 D1
 * 4 d2 4 D1
 * 4 d3 4 D1
 * </pre>
 *
 * @throws Exception if either flow cannot be set up or run
 */
@Test
public void testCoGroupInnerNull() throws Exception
{
  String[][] rows = {
    {"1", "a1", "1", "A1"},
    {"1", "a1", "1", "A2"},
    {"1", "a1", "1", "A3"},
    {"1", "a2", "1", "A1"},
    {"1", "a2", "1", "A2"},
    {"1", "a2", "1", "A3"},
    {"1", "a3", "1", "A1"},
    {"1", "a3", "1", "A2"},
    {"1", "a3", "1", "A3"},
    {"2", "b1", "2", "B1"},
    {"2", "b1", "2", "B2"},
    {"2", "b1", "2", "B3"},
    {"4", "d1", "4", "D1"},
    {"4", "d2", "4", "D1"},
    {"4", "d3", "4", "D1"}
  };

  HashSet<Tuple> expected = new HashSet<Tuple>();

  // the cast spreads each row over the Tuple's elements instead of nesting the array
  for( String[] row : rows )
    expected.add( new Tuple( (Object[]) row ) );

  handleJoins( "cogroupinnernull", new InnerJoin(), expected, 9, false, new NullNotEquivalentComparator() );
  handleJoins( "cogroupinnernull-resultgroup", new InnerJoin(), expected, 9, true, new NullNotEquivalentComparator() );
}
/**
 * Outer join on "num" with the default comparator, run once without and once with result
 * group fields; 8 groups are passed to {@code handleJoins}.
 * <p/>
 * Left-hand side input:
 * <pre>
 * 1 a1
 * 1 a2
 * 1 a3
 * 2 b1
 * 3 c1
 * 4 d1
 * 4 d2
 * 4 d3
 * 5 e1
 * 5 e2
 * 5 e3
 * 7 g1
 * 7 g2
 * 7 g3
 * 7 g4
 * 7 g5
 * null h1
 * </pre>
 * Right-hand side input:
 * <pre>
 * 1 A1
 * 1 A2
 * 1 A3
 * 2 B1
 * 2 B2
 * 2 B3
 * 4 D1
 * 6 F1
 * 6 F2
 * null H1
 * </pre>
 * Expected result:
 * <pre>
 * 1 a1 1 A1
 * 1 a1 1 A2
 * 1 a1 1 A3
 * 1 a2 1 A1
 * 1 a2 1 A2
 * 1 a2 1 A3
 * 1 a3 1 A1
 * 1 a3 1 A2
 * 1 a3 1 A3
 * 2 b1 2 B1
 * 2 b1 2 B2
 * 2 b1 2 B3
 * 3 c1 null null
 * 4 d1 4 D1
 * 4 d2 4 D1
 * 4 d3 4 D1
 * 5 e1 null null
 * 5 e2 null null
 * 5 e3 null null
 * null null 6 F1
 * null null 6 F2
 * 7 g1 null null
 * 7 g2 null null
 * 7 g3 null null
 * 7 g4 null null
 * 7 g5 null null
 * null h1 null H1
 * </pre>
 *
 * @throws Exception if either flow cannot be set up or run
 */
@Test
public void testCoGroupOuter() throws Exception
{
  String[][] rows = {
    {"1", "a1", "1", "A1"},
    {"1", "a1", "1", "A2"},
    {"1", "a1", "1", "A3"},
    {"1", "a2", "1", "A1"},
    {"1", "a2", "1", "A2"},
    {"1", "a2", "1", "A3"},
    {"1", "a3", "1", "A1"},
    {"1", "a3", "1", "A2"},
    {"1", "a3", "1", "A3"},
    {"2", "b1", "2", "B1"},
    {"2", "b1", "2", "B2"},
    {"2", "b1", "2", "B3"},
    {"3", "c1", null, null},
    {"4", "d1", "4", "D1"},
    {"4", "d2", "4", "D1"},
    {"4", "d3", "4", "D1"},
    {"5", "e1", null, null},
    {"5", "e2", null, null},
    {"5", "e3", null, null},
    {null, null, "6", "F1"},
    {null, null, "6", "F2"},
    {"7", "g1", null, null},
    {"7", "g2", null, null},
    {"7", "g3", null, null},
    {"7", "g4", null, null},
    {"7", "g5", null, null},
    {null, "h1", null, "H1"}
  };

  Set<Tuple> expected = new HashSet<Tuple>();

  // the cast spreads each row over the Tuple's elements instead of nesting the array
  for( String[] row : rows )
    expected.add( new Tuple( (Object[]) row ) );

  handleJoins( "cogroupouter", new OuterJoin(), expected, 8, false, null );
  handleJoins( "cogroupouter-resultgroup", new OuterJoin(), expected, 8, true, null );
}
/**
 * Outer join on "num" with a {@link NullNotEquivalentComparator} on the grouping field, run
 * once without and once with result group fields; 9 groups are passed to {@code handleJoins}.
 * The two rows with a null number are expected as separate, unmatched result rows.
 * <p/>
 * Left-hand side input:
 * <pre>
 * 1 a1
 * 1 a2
 * 1 a3
 * 2 b1
 * 3 c1
 * 4 d1
 * 4 d2
 * 4 d3
 * 5 e1
 * 5 e2
 * 5 e3
 * 7 g1
 * 7 g2
 * 7 g3
 * 7 g4
 * 7 g5
 * null h1
 * </pre>
 * Right-hand side input:
 * <pre>
 * 1 A1
 * 1 A2
 * 1 A3
 * 2 B1
 * 2 B2
 * 2 B3
 * 4 D1
 * 6 F1
 * 6 F2
 * null H1
 * </pre>
 * Expected result:
 * <pre>
 * 1 a1 1 A1
 * 1 a1 1 A2
 * 1 a1 1 A3
 * 1 a2 1 A1
 * 1 a2 1 A2
 * 1 a2 1 A3
 * 1 a3 1 A1
 * 1 a3 1 A2
 * 1 a3 1 A3
 * 2 b1 2 B1
 * 2 b1 2 B2
 * 2 b1 2 B3
 * 3 c1 null null
 * 4 d1 4 D1
 * 4 d2 4 D1
 * 4 d3 4 D1
 * 5 e1 null null
 * 5 e2 null null
 * 5 e3 null null
 * null null 6 F1
 * null null 6 F2
 * 7 g1 null null
 * 7 g2 null null
 * 7 g3 null null
 * 7 g4 null null
 * 7 g5 null null
 * null h1 null null
 * null null null H1
 * </pre>
 *
 * @throws Exception if either flow cannot be set up or run
 */
@Test
public void testCoGroupOuterNull() throws Exception
{
  String[][] rows = {
    {"1", "a1", "1", "A1"},
    {"1", "a1", "1", "A2"},
    {"1", "a1", "1", "A3"},
    {"1", "a2", "1", "A1"},
    {"1", "a2", "1", "A2"},
    {"1", "a2", "1", "A3"},
    {"1", "a3", "1", "A1"},
    {"1", "a3", "1", "A2"},
    {"1", "a3", "1", "A3"},
    {"2", "b1", "2", "B1"},
    {"2", "b1", "2", "B2"},
    {"2", "b1", "2", "B3"},
    {"3", "c1", null, null},
    {"4", "d1", "4", "D1"},
    {"4", "d2", "4", "D1"},
    {"4", "d3", "4", "D1"},
    {"5", "e1", null, null},
    {"5", "e2", null, null},
    {"5", "e3", null, null},
    {null, null, "6", "F1"},
    {null, null, "6", "F2"},
    {"7", "g1", null, null},
    {"7", "g2", null, null},
    {"7", "g3", null, null},
    {"7", "g4", null, null},
    {"7", "g5", null, null},
    {null, "h1", null, null},
    {null, null, null, "H1"}
  };

  Set<Tuple> expected = new HashSet<Tuple>();

  // the cast spreads each row over the Tuple's elements instead of nesting the array
  for( String[] row : rows )
    expected.add( new Tuple( (Object[]) row ) );

  handleJoins( "cogroupouternull", new OuterJoin(), expected, 9, false, new NullNotEquivalentComparator() );
  handleJoins( "cogroupouternull-resultgroup", new OuterJoin(), expected, 9, true, new NullNotEquivalentComparator() );
}
/**
 * {@link LeftJoin} on "num" with the default comparator, run once without and once with
 * result group fields; 8 groups are passed to {@code handleJoins}.
 * <p/>
 * Left-hand side input:
 * <pre>
 * 1 a1
 * 1 a2
 * 1 a3
 * 2 b1
 * 3 c1
 * 4 d1
 * 4 d2
 * 4 d3
 * 5 e1
 * 5 e2
 * 5 e3
 * 7 g1
 * 7 g2
 * 7 g3
 * 7 g4
 * 7 g5
 * null h1
 * </pre>
 * Right-hand side input:
 * <pre>
 * 1 A1
 * 1 A2
 * 1 A3
 * 2 B1
 * 2 B2
 * 2 B3
 * 4 D1
 * 6 F1
 * 6 F2
 * null H1
 * </pre>
 * Expected result:
 * <pre>
 * 1 a1 1 A1
 * 1 a1 1 A2
 * 1 a1 1 A3
 * 1 a2 1 A1
 * 1 a2 1 A2
 * 1 a2 1 A3
 * 1 a3 1 A1
 * 1 a3 1 A2
 * 1 a3 1 A3
 * 2 b1 2 B1
 * 2 b1 2 B2
 * 2 b1 2 B3
 * 3 c1 null null
 * 4 d1 4 D1
 * 4 d2 4 D1
 * 4 d3 4 D1
 * 5 e1 null null
 * 5 e2 null null
 * 5 e3 null null
 * 7 g1 null null
 * 7 g2 null null
 * 7 g3 null null
 * 7 g4 null null
 * 7 g5 null null
 * null h1 null H1
 * </pre>
 *
 * @throws Exception if either flow cannot be set up or run
 */
@Test
public void testCoGroupInnerOuter() throws Exception
{
  String[][] rows = {
    {"1", "a1", "1", "A1"},
    {"1", "a1", "1", "A2"},
    {"1", "a1", "1", "A3"},
    {"1", "a2", "1", "A1"},
    {"1", "a2", "1", "A2"},
    {"1", "a2", "1", "A3"},
    {"1", "a3", "1", "A1"},
    {"1", "a3", "1", "A2"},
    {"1", "a3", "1", "A3"},
    {"2", "b1", "2", "B1"},
    {"2", "b1", "2", "B2"},
    {"2", "b1", "2", "B3"},
    {"3", "c1", null, null},
    {"4", "d1", "4", "D1"},
    {"4", "d2", "4", "D1"},
    {"4", "d3", "4", "D1"},
    {"5", "e1", null, null},
    {"5", "e2", null, null},
    {"5", "e3", null, null},
    {"7", "g1", null, null},
    {"7", "g2", null, null},
    {"7", "g3", null, null},
    {"7", "g4", null, null},
    {"7", "g5", null, null},
    {null, "h1", null, "H1"}
  };

  Set<Tuple> expected = new HashSet<Tuple>();

  // the cast spreads each row over the Tuple's elements instead of nesting the array
  for( String[] row : rows )
    expected.add( new Tuple( (Object[]) row ) );

  handleJoins( "cogroupinnerouter", new LeftJoin(), expected, 8, false, null );
  handleJoins( "cogroupinnerouter-resultgroup", new LeftJoin(), expected, 8, true, null );
}
/**
 * {@link LeftJoin} on "num" with a {@link NullNotEquivalentComparator} on the grouping field,
 * run once without and once with result group fields; 9 groups are passed to
 * {@code handleJoins}. The left-hand row with a null number is expected unmatched.
 * <p/>
 * Left-hand side input:
 * <pre>
 * 1 a1
 * 1 a2
 * 1 a3
 * 2 b1
 * 3 c1
 * 4 d1
 * 4 d2
 * 4 d3
 * 5 e1
 * 5 e2
 * 5 e3
 * 7 g1
 * 7 g2
 * 7 g3
 * 7 g4
 * 7 g5
 * null h1
 * </pre>
 * Right-hand side input:
 * <pre>
 * 1 A1
 * 1 A2
 * 1 A3
 * 2 B1
 * 2 B2
 * 2 B3
 * 4 D1
 * 6 F1
 * 6 F2
 * null H1
 * </pre>
 * Expected result:
 * <pre>
 * 1 a1 1 A1
 * 1 a1 1 A2
 * 1 a1 1 A3
 * 1 a2 1 A1
 * 1 a2 1 A2
 * 1 a2 1 A3
 * 1 a3 1 A1
 * 1 a3 1 A2
 * 1 a3 1 A3
 * 2 b1 2 B1
 * 2 b1 2 B2
 * 2 b1 2 B3
 * 3 c1 null null
 * 4 d1 4 D1
 * 4 d2 4 D1
 * 4 d3 4 D1
 * 5 e1 null null
 * 5 e2 null null
 * 5 e3 null null
 * 7 g1 null null
 * 7 g2 null null
 * 7 g3 null null
 * 7 g4 null null
 * 7 g5 null null
 * null h1 null null
 * </pre>
 *
 * @throws Exception if either flow cannot be set up or run
 */
@Test
public void testCoGroupInnerOuterNull() throws Exception
{
  String[][] rows = {
    {"1", "a1", "1", "A1"},
    {"1", "a1", "1", "A2"},
    {"1", "a1", "1", "A3"},
    {"1", "a2", "1", "A1"},
    {"1", "a2", "1", "A2"},
    {"1", "a2", "1", "A3"},
    {"1", "a3", "1", "A1"},
    {"1", "a3", "1", "A2"},
    {"1", "a3", "1", "A3"},
    {"2", "b1", "2", "B1"},
    {"2", "b1", "2", "B2"},
    {"2", "b1", "2", "B3"},
    {"3", "c1", null, null},
    {"4", "d1", "4", "D1"},
    {"4", "d2", "4", "D1"},
    {"4", "d3", "4", "D1"},
    {"5", "e1", null, null},
    {"5", "e2", null, null},
    {"5", "e3", null, null},
    {"7", "g1", null, null},
    {"7", "g2", null, null},
    {"7", "g3", null, null},
    {"7", "g4", null, null},
    {"7", "g5", null, null},
    {null, "h1", null, null}
  };

  Set<Tuple> expected = new HashSet<Tuple>();

  // the cast spreads each row over the Tuple's elements instead of nesting the array
  for( String[] row : rows )
    expected.add( new Tuple( (Object[]) row ) );

  handleJoins( "cogroupinnerouternull", new LeftJoin(), expected, 9, false, new NullNotEquivalentComparator() );
  handleJoins( "cogroupinnerouternull-resultgroup", new LeftJoin(), expected, 9, true, new NullNotEquivalentComparator() );
}
/**
 * {@link RightJoin} on "num" with the default comparator, run once without and once with
 * result group fields; 8 groups are passed to {@code handleJoins}.
 * <p/>
 * Left-hand side input:
 * <pre>
 * 1 a1
 * 1 a2
 * 1 a3
 * 2 b1
 * 3 c1
 * 4 d1
 * 4 d2
 * 4 d3
 * 5 e1
 * 5 e2
 * 5 e3
 * 7 g1
 * 7 g2
 * 7 g3
 * 7 g4
 * 7 g5
 * null h1
 * </pre>
 * Right-hand side input:
 * <pre>
 * 1 A1
 * 1 A2
 * 1 A3
 * 2 B1
 * 2 B2
 * 2 B3
 * 4 D1
 * 6 F1
 * 6 F2
 * null H1
 * </pre>
 * Expected result:
 * <pre>
 * 1 a1 1 A1
 * 1 a1 1 A2
 * 1 a1 1 A3
 * 1 a2 1 A1
 * 1 a2 1 A2
 * 1 a2 1 A3
 * 1 a3 1 A1
 * 1 a3 1 A2
 * 1 a3 1 A3
 * 2 b1 2 B1
 * 2 b1 2 B2
 * 2 b1 2 B3
 * 4 d1 4 D1
 * 4 d2 4 D1
 * 4 d3 4 D1
 * null null 6 F1
 * null null 6 F2
 * null h1 null H1
 * </pre>
 *
 * @throws Exception if either flow cannot be set up or run
 */
@Test
public void testCoGroupOuterInner() throws Exception
{
  String[][] rows = {
    {"1", "a1", "1", "A1"},
    {"1", "a1", "1", "A2"},
    {"1", "a1", "1", "A3"},
    {"1", "a2", "1", "A1"},
    {"1", "a2", "1", "A2"},
    {"1", "a2", "1", "A3"},
    {"1", "a3", "1", "A1"},
    {"1", "a3", "1", "A2"},
    {"1", "a3", "1", "A3"},
    {"2", "b1", "2", "B1"},
    {"2", "b1", "2", "B2"},
    {"2", "b1", "2", "B3"},
    {"4", "d1", "4", "D1"},
    {"4", "d2", "4", "D1"},
    {"4", "d3", "4", "D1"},
    {null, null, "6", "F1"},
    {null, null, "6", "F2"},
    {null, "h1", null, "H1"}
  };

  Set<Tuple> expected = new HashSet<Tuple>();

  // the cast spreads each row over the Tuple's elements instead of nesting the array
  for( String[] row : rows )
    expected.add( new Tuple( (Object[]) row ) );

  handleJoins( "cogroupouterinner", new RightJoin(), expected, 8, false, null );
  handleJoins( "cogroupouterinner-resultgroup", new RightJoin(), expected, 8, true, null );
}
/**
 * {@link RightJoin} on "num" with a {@link NullNotEquivalentComparator} on the grouping field,
 * run once without and once with result group fields; 9 groups are passed to
 * {@code handleJoins}. The right-hand row with a null number is expected unmatched.
 * <p/>
 * Left-hand side input:
 * <pre>
 * 1 a1
 * 1 a2
 * 1 a3
 * 2 b1
 * 3 c1
 * 4 d1
 * 4 d2
 * 4 d3
 * 5 e1
 * 5 e2
 * 5 e3
 * 7 g1
 * 7 g2
 * 7 g3
 * 7 g4
 * 7 g5
 * null h1
 * </pre>
 * Right-hand side input:
 * <pre>
 * 1 A1
 * 1 A2
 * 1 A3
 * 2 B1
 * 2 B2
 * 2 B3
 * 4 D1
 * 6 F1
 * 6 F2
 * null H1
 * </pre>
 * Expected result:
 * <pre>
 * 1 a1 1 A1
 * 1 a1 1 A2
 * 1 a1 1 A3
 * 1 a2 1 A1
 * 1 a2 1 A2
 * 1 a2 1 A3
 * 1 a3 1 A1
 * 1 a3 1 A2
 * 1 a3 1 A3
 * 2 b1 2 B1
 * 2 b1 2 B2
 * 2 b1 2 B3
 * 4 d1 4 D1
 * 4 d2 4 D1
 * 4 d3 4 D1
 * null null 6 F1
 * null null 6 F2
 * null null null H1
 * </pre>
 *
 * @throws Exception if either flow cannot be set up or run
 */
@Test
public void testCoGroupOuterInnerNull() throws Exception
{
  String[][] rows = {
    {"1", "a1", "1", "A1"},
    {"1", "a1", "1", "A2"},
    {"1", "a1", "1", "A3"},
    {"1", "a2", "1", "A1"},
    {"1", "a2", "1", "A2"},
    {"1", "a2", "1", "A3"},
    {"1", "a3", "1", "A1"},
    {"1", "a3", "1", "A2"},
    {"1", "a3", "1", "A3"},
    {"2", "b1", "2", "B1"},
    {"2", "b1", "2", "B2"},
    {"2", "b1", "2", "B3"},
    {"4", "d1", "4", "D1"},
    {"4", "d2", "4", "D1"},
    {"4", "d3", "4", "D1"},
    {null, null, "6", "F1"},
    {null, null, "6", "F2"},
    {null, null, null, "H1"}
  };

  Set<Tuple> expected = new HashSet<Tuple>();

  // the cast spreads each row over the Tuple's elements instead of nesting the array
  for( String[] row : rows )
    expected.add( new Tuple( (Object[]) row ) );

  handleJoins( "cogroupouterinnernull", new RightJoin(), expected, 9, false, new NullNotEquivalentComparator() );
  handleJoins( "cogroupouterinnernull-resultgroup", new RightJoin(), expected, 9, true, new NullNotEquivalentComparator() );
}
/**
 * Runs a CoGroup over the sparse lhs and rhs inputs with the given joiner and checks the sink
 * against the expected tuples.
 *
 * @param path                 name handed to {@code getOutputPath} for the sink location
 * @param joiner               joiner passed to the CoGroup
 * @param results              expected tuples; copied internally, so the caller's set is left untouched
 * @param numGroups            forwarded to {@code TestIdentityBuffer}; presumably the expected number of
 *                             groups -- verify against that class
 * @param useResultGroupFields when true the CoGroup is built with explicit result group fields
 *                             ("num", "num2")
 * @param comparator           comparator to set on the grouping field, or null to leave it unset
 * @throws Exception if the flow or any check fails
 */
private void handleJoins( String path, Joiner joiner, Set<Tuple> results, int numGroups, boolean useResultGroupFields, NullNotEquivalentComparator comparator ) throws Exception
{
  // removeAll below is destructive and callers hand the same set to consecutive runs, so use a copy
  Set<Tuple> remaining = new HashSet<Tuple>( results );

  getPlatform().copyFromLocal( inputFileLhsSparse );
  getPlatform().copyFromLocal( inputFileRhsSparse );

  Fields sourceFields = new Fields( "num", "char" ).applyTypes( Integer.class, String.class );

  Tap lowerTap = getPlatform().getDelimitedFile( sourceFields, " ", inputFileLhsSparse );
  Tap upperTap = getPlatform().getDelimitedFile( sourceFields, " ", inputFileRhsSparse );

  Map<String, Tap> sources = new HashMap<String, Tap>();
  sources.put( "lower", lowerTap );
  sources.put( "upper", upperTap );

  Tap sink = getPlatform().getDelimitedFile( Fields.size( 4, String.class ), "\t", getOutputPath( path ), SinkMode.REPLACE );

  Pipe lhs = new Pipe( "lower" );
  Pipe rhs = new Pipe( "upper" );

  Fields declared = new Fields( "num", "char", "num2", "char2" );

  // the same key Fields instance (and its comparator, when given) is used for both sides
  Fields joinKey = new Fields( "num" );

  if( comparator != null )
    joinKey.setComparator( 0, comparator );

  Pipe joined = useResultGroupFields
    ? new CoGroup( lhs, joinKey, rhs, joinKey, declared, new Fields( "num", "num2" ), joiner )
    : new CoGroup( lhs, joinKey, rhs, joinKey, declared, joiner );

  Pipe tail = new Every( joined, Fields.ALL, new TestIdentityBuffer( new Fields( "num", "num2" ), numGroups, true ), Fields.RESULTS );

  Flow flow = getPlatform().getFlowConnector().connect( sources, sink, tail );

  flow.complete();

  validateLength( flow, remaining.size() );

  // every expected tuple must show up in the sink
  List<Tuple> actual = getSinkAsList( flow );
  remaining.removeAll( actual );

  assertEquals( 0, remaining.size() );
}
/**
 * CoGroups three inputs on "num" with a {@link MixedJoin} configured as OUTER, INNER, OUTER for the
 * lower-offset, upper and lower pipes respectively.
 * <p/>
 * Lower-offset input:
 * <pre>
 * 1 a
 * 5 b
 * 6 c
 * 5 b
 * 5 e
 * </pre>
 * Upper input:
 * <pre>
 * 1 A
 * 2 B
 * 3 C
 * 4 D
 * 5 E
 * </pre>
 * Lower input:
 * <pre>
 * 1 a
 * 2 b
 * 3 c
 * 4 d
 * 5 e
 * </pre>
 * Expected output ("-" stands for null):
 * <pre>
 * 1 a 1 A 1 a
 * - - 2 B 2 b
 * - - 3 C 3 c
 * - - 4 D 4 d
 * 5 b 5 E 5 e
 * 5 e 5 E 5 e
 * </pre>
 *
 * @throws Exception if the flow or any check fails
 */
@Test
public void testCoGroupMixed() throws Exception
{
  getPlatform().copyFromLocal( inputFileLowerOffset );
  getPlatform().copyFromLocal( inputFileLower );
  getPlatform().copyFromLocal( inputFileUpper );

  Tap lowerOffsetTap = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLowerOffset );
  Tap upperTap = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper );
  Tap lowerTap = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower );

  Map<String, Tap> sources = new HashMap<String, Tap>();
  sources.put( "loweroffset", lowerOffsetTap );
  sources.put( "lower", lowerTap );
  sources.put( "upper", upperTap );

  Tap sink = getPlatform().getDelimitedFile( Fields.size( 6, String.class ), "\t", getOutputPath( "cogroupmixed" ), SinkMode.REPLACE );

  // one splitter instance is shared by all three heads
  Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );

  Pipe lowerOffsetHead = new Each( new Pipe( "loweroffset" ), new Fields( "line" ), splitter );
  Pipe lowerHead = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
  Pipe upperHead = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter );

  // join order is lower-offset, upper, lower; the join flags below line up with it
  Pipe[] joinPipes = Pipe.pipes( lowerOffsetHead, upperHead, lowerHead );
  Fields[] joinKeys = Fields.fields( new Fields( "num" ), new Fields( "num" ), new Fields( "num" ) );
  MixedJoin mixed = new MixedJoin( new boolean[]{MixedJoin.OUTER, MixedJoin.INNER, MixedJoin.OUTER} );

  Pipe tail = new CoGroup( joinPipes, joinKeys, Fields.size( 6 ), mixed );

  Flow flow = getPlatform().getFlowConnector().connect( sources, sink, tail );

  flow.complete();

  validateLength( flow, 6 );

  Set<Tuple> expected = new HashSet<Tuple>();
  expected.add( new Tuple( "1", "a", "1", "A", "1", "a" ) );

  // keys 2 to 4 have no lower-offset row, so the leading columns are null
  expected.add( new Tuple( null, null, "2", "B", "2", "b" ) );
  expected.add( new Tuple( null, null, "3", "C", "3", "c" ) );
  expected.add( new Tuple( null, null, "4", "D", "4", "d" ) );

  expected.add( new Tuple( "5", "b", "5", "E", "5", "e" ) );
  expected.add( new Tuple( "5", "e", "5", "E", "5", "e" ) );

  List<Tuple> actual = getSinkAsList( flow );
  expected.removeAll( actual );

  assertEquals( 0, expected.size() );
}
/**
 * CoGroups the lower and upper inputs on differently named key fields ("numA" and "numB"), so no
 * declared fields are passed to the CoGroup.
 *
 * @throws Exception if the flow or any check fails
 */
@Test
public void testCoGroupDiffFields() throws Exception
{
  getPlatform().copyFromLocal( inputFileLower );
  getPlatform().copyFromLocal( inputFileUpper );

  Tap lowerTap = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower );
  Tap upperTap = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper );

  Map<String, Tap> sources = new HashMap<String, Tap>();
  sources.put( "lower", lowerTap );
  sources.put( "upper", upperTap );

  Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "difffields" ), SinkMode.REPLACE );

  // each side is split into its own distinct field names
  Function lowerSplitter = new RegexSplitter( new Fields( "numA", "lower" ), " " );
  Function upperSplitter = new RegexSplitter( new Fields( "numB", "upper" ), " " );

  Pipe lhs = new Each( new Pipe( "lower" ), new Fields( "line" ), lowerSplitter );
  Pipe rhs = new Each( new Pipe( "upper" ), new Fields( "line" ), upperSplitter );

  Pipe tail = new CoGroup( lhs, new Fields( "numA" ), rhs, new Fields( "numB" ) );

  Flow flow = getPlatform().getFlowConnector().connect( sources, sink, tail );

  flow.complete();

  validateLength( flow, 5 );

  List<Tuple> lines = getSinkAsList( flow );

  assertTrue( lines.contains( new Tuple( "1\ta\t1\tA" ) ) );
  assertTrue( lines.contains( new Tuple( "2\tb\t2\tB" ) ) );
}
/**
 * CoGroups the lower and upper inputs on "numA"/"numB" and then applies a GroupBy on "numA" to the
 * joined stream.
 *
 * @throws Exception if the flow or any check fails
 */
@Test
public void testCoGroupGroupBy() throws Exception
{
  getPlatform().copyFromLocal( inputFileLower );
  getPlatform().copyFromLocal( inputFileUpper );

  Tap lowerTap = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower );
  Tap upperTap = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper );

  Map<String, Tap> sources = new HashMap<String, Tap>();
  sources.put( "lower", lowerTap );
  sources.put( "upper", upperTap );

  Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "cogroupgroupby" ), SinkMode.REPLACE );

  Function lowerSplitter = new RegexSplitter( new Fields( "numA", "lower" ), " " );
  Function upperSplitter = new RegexSplitter( new Fields( "numB", "upper" ), " " );

  Pipe lhs = new Each( new Pipe( "lower" ), new Fields( "line" ), lowerSplitter );
  Pipe rhs = new Each( new Pipe( "upper" ), new Fields( "line" ), upperSplitter );

  // join first, then regroup the joined tuples on the lhs key
  Pipe joined = new CoGroup( lhs, new Fields( "numA" ), rhs, new Fields( "numB" ) );
  Pipe tail = new GroupBy( joined, new Fields( "numA" ) );

  Flow flow = getPlatform().getFlowConnector().connect( sources, sink, tail );

  flow.complete();

  validateLength( flow, 5, null );

  List<Tuple> lines = getSinkAsList( flow );

  assertTrue( lines.contains( new Tuple( "1\ta\t1\tA" ) ) );
  assertTrue( lines.contains( new Tuple( "2\tb\t2\tB" ) ) );
}
/**
 * Self-joins the lower input on "num" using the CoGroup constructor that takes a single pipe and a
 * self-join count of 1.
 *
 * @throws Exception if the flow or any check fails
 */
@Test
public void testCoGroupSamePipe() throws Exception
{
  getPlatform().copyFromLocal( inputFileLower );

  Tap lowerTap = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower );

  Map<String, Tap> sources = new HashMap<String, Tap>();
  sources.put( "lower", lowerTap );

  Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "samepipe" ), SinkMode.REPLACE );

  Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );
  Pipe head = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );

  // declared fields rename both copies of the stream so the joined names do not collide
  Fields declared = new Fields( "num1", "char1", "num2", "char2" );
  Pipe tail = new CoGroup( head, new Fields( "num" ), 1, declared );

  Flow flow = getPlatform().getFlowConnector().connect( sources, sink, tail );

  flow.complete();

  validateLength( flow, 5, null );

  List<Tuple> lines = getSinkAsList( flow );

  assertTrue( lines.contains( new Tuple( "1\ta\t1\ta" ) ) );
  assertTrue( lines.contains( new Tuple( "2\tb\t2\tb" ) ) );
}
/**
 * Self-joins the lower input on "num" by passing the very same pipe instance as both the lhs and
 * the rhs of the CoGroup.
 *
 * @throws Exception if the flow or any check fails
 */
@Test
public void testCoGroupSamePipe2() throws Exception
{
  getPlatform().copyFromLocal( inputFileLower );

  Tap lowerTap = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower );

  Map<String, Tap> sources = new HashMap<String, Tap>();
  sources.put( "lower", lowerTap );

  Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "samepipe2" ), SinkMode.REPLACE );

  Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );
  Pipe head = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );

  // same pipe on both sides; declared fields keep the joined names distinct
  Fields declared = new Fields( "num1", "char1", "num2", "char2" );
  Pipe tail = new CoGroup( head, new Fields( "num" ), head, new Fields( "num" ), declared );

  Flow flow = getPlatform().getFlowConnector().connect( sources, sink, tail );

  flow.complete();

  validateLength( flow, 5, null );

  List<Tuple> lines = getSinkAsList( flow );

  assertTrue( lines.contains( new Tuple( "1\ta\t1\ta" ) ) );
  assertTrue( lines.contains( new Tuple( "2\tb\t2\tb" ) ) );
}
/**
 * Self-joins the lower input on "num" through two named branches ("lhs" and "rhs") that both
 * extend the same head pipe. The source here is a delimited file, so no splitter is needed.
 *
 * @throws Exception if the flow or any check fails
 */
@Test
public void testCoGroupSamePipe3() throws Exception
{
  getPlatform().copyFromLocal( inputFileLower );

  Tap lowerTap = getPlatform().getDelimitedFile( new Fields( "num", "char" ), " ", inputFileLower );

  Map<String, Tap> sources = new HashMap<String, Tap>();
  sources.put( "lower", lowerTap );

  Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "samepipe3" ), SinkMode.REPLACE );

  // a single head, branched into two differently named pipes
  Pipe head = new Pipe( "lower" );
  Pipe left = new Pipe( "lhs", head );
  Pipe right = new Pipe( "rhs", head );

  Fields declared = new Fields( "num1", "char1", "num2", "char2" );
  Pipe tail = new CoGroup( left, new Fields( "num" ), right, new Fields( "num" ), declared );

  Flow flow = getPlatform().getFlowConnector().connect( sources, sink, tail );

  flow.complete();

  validateLength( flow, 5, null );

  List<Tuple> lines = getSinkAsList( flow );

  assertTrue( lines.contains( new Tuple( "1\ta\t1\ta" ) ) );
  assertTrue( lines.contains( new Tuple( "2\tb\t2\tb" ) ) );
}
/**
 * Chains two CoGroups: lower is joined with upper, passed through an Identity, and the result is
 * joined with upper again. Both upper heads are bound to the same source tap.
 *
 * @throws Exception if the flow or any check fails
 */
@Test
public void testCoGroupAroundCoGroup() throws Exception
{
  getPlatform().copyFromLocal( inputFileLower );
  getPlatform().copyFromLocal( inputFileUpper );

  Tap lowerTap = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower );
  Tap upperTap = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper );

  Map<String, Tap> sources = new HashMap<String, Tap>();
  sources.put( "lower", lowerTap );
  // the one upper tap feeds two separately named heads
  sources.put( "upper1", upperTap );
  sources.put( "upper2", upperTap );

  Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "cogroupacogroup" ), SinkMode.REPLACE );

  Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );

  Pipe lowerHead = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
  Pipe firstUpper = new Each( new Pipe( "upper1" ), new Fields( "line" ), splitter );
  Pipe secondUpper = new Each( new Pipe( "upper2" ), new Fields( "line" ), splitter );

  // first join: lower with upper1
  Fields innerDeclared = new Fields( "num1", "char1", "num2", "char2" );
  Pipe innerJoin = new CoGroup( lowerHead, new Fields( "num" ), firstUpper, new Fields( "num" ), innerDeclared );
  Pipe passThrough = new Each( innerJoin, new Identity() );

  // second join: the first join's output with upper2
  Fields outerDeclared = new Fields( "num1", "char1", "num2", "char2", "num3", "char3" );
  Pipe tail = new CoGroup( passThrough, new Fields( "num1" ), secondUpper, new Fields( "num" ), outerDeclared );

  Flow flow = getPlatform().getFlowConnector().connect( sources, sink, tail );

  flow.complete();

  validateLength( flow, 5, null );

  List<Tuple> lines = getSinkAsList( flow );

  assertTrue( lines.contains( new Tuple( "1\ta\t1\tA\t1\tA" ) ) );
  assertTrue( lines.contains( new Tuple( "2\tb\t2\tB\t2\tB" ) ) );
}
/**
 * Runs the chained CoGroup scenario with a null intermediate scheme class.
 *
 * @throws Exception propagated from {@code runCoGroupAroundCoGroup}
 */
@Test
public void testCoGroupAroundCoGroupWithout() throws Exception
{
  // null: no scheme class is supplied for this variant
  Class schemeClass = null;

  runCoGroupAroundCoGroup( schemeClass, "cogroupacogroupopt1" );
}
/**
 * Runs the chained CoGroup scenario with the scheme class the platform uses for delimited files
 * as the intermediate scheme class.
 *
 * @throws Exception propagated from {@code runCoGroupAroundCoGroup}
 */
@Test
public void testCoGroupAroundCoGroupWith() throws Exception
{
  // hack to get classname: build a throwaway delimited tap only to read its scheme's class
  Tap probe = getPlatform().getDelimitedFile( new Fields( "num" ), "\t", inputFileNums10 );
  Class schemeClass = probe.getScheme().getClass();

  runCoGroupAroundCoGroup( schemeClass, "cogroupacogroupopt2" );
}
/**
 * Joins the nums20 input with the nums10 input, then joins that result with nums10 again, and
 * checks the output. On MapReduce platforms the given scheme class is set as the intermediate
 * scheme class and the flow is expected to plan into exactly 2 steps.
 *
 * @param schemeClass intermediate scheme class to configure on MapReduce platforms; callers may pass null
 * @param stringPath  name handed to {@code getOutputPath} for the sink location
 * @throws IOException if copying inputs or reading results fails
 */
private void runCoGroupAroundCoGroup( Class schemeClass, String stringPath ) throws IOException
{
  getPlatform().copyFromLocal( inputFileNums20 );
  getPlatform().copyFromLocal( inputFileNums10 );

  Tap nums10 = getPlatform().getDelimitedFile( new Fields( "num" ), "\t", inputFileNums10 );
  Tap nums20 = getPlatform().getDelimitedFile( new Fields( "num" ), "\t", inputFileNums20 );

  Map<String, Tap> sources = new HashMap<String, Tap>();
  sources.put( "source20", nums20 );
  // the nums10 tap is bound to two separately named heads
  sources.put( "source101", nums10 );
  sources.put( "source102", nums10 );

  Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( stringPath ), SinkMode.REPLACE );

  Pipe head20 = new Pipe( "source20" );
  Pipe head101 = new Pipe( "source101" );
  Pipe head102 = new Pipe( "source102" );

  Pipe innerJoin = new CoGroup( head20, new Fields( "num" ), head101, new Fields( "num" ), new Fields( "num1", "num2" ) );
  Pipe outerJoin = new CoGroup( innerJoin, new Fields( "num1" ), head102, new Fields( "num" ), new Fields( "num1", "num2", "num3" ) );
  Pipe tail = new Each( outerJoin, new Identity() );

  Map<Object, Object> properties = getPlatform().getProperties();

  // the intermediate scheme setting is only applied on MapReduce platforms
  if( getPlatform().isMapReduce() )
    FlowConnectorProps.setIntermediateSchemeClass( properties, schemeClass );

  Flow flow = getPlatform().getFlowConnector( properties ).connect( "cogroupopt", sources, sink, tail );

  // step count is only asserted for MapReduce plans
  if( getPlatform().isMapReduce() )
    assertEquals( "wrong number of steps", 2, flow.getFlowSteps().size() );

  flow.complete();

  validateLength( flow, 10 );

  List<Tuple> lines = getSinkAsList( flow );

  assertTrue( lines.contains( new Tuple( "1\t1\t1" ) ) );
  assertTrue( lines.contains( new Tuple( "10\t10\t10" ) ) );
}
/**
 * CoGroups the lower input with itself through two different taps on the same file: one that
 * declares ("offset", "line") and has "offset" discarded, and one that declares only "line".
 * Each side is split into differently named fields before joining on "numA"/"numB".
 * <p/>
 * Only the lower file is read by this test, so it is the only file copied to the platform.
 *
 * @throws Exception if the flow or any check fails
 */
@Test
public void testCoGroupDiffFieldsSameFile() throws Exception
{
  getPlatform().copyFromLocal( inputFileLower );

  Tap sourceOffsetLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower );
  Tap sourceLower = getPlatform().getTextFile( new Fields( "line" ), inputFileLower );

  Map<String, Tap> sources = new HashMap<String, Tap>();
  sources.put( "offsetLower", sourceOffsetLower );
  sources.put( "lower", sourceLower );

  Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "samefiledifffields" ), SinkMode.REPLACE );

  // both splitters run over the lower file; they differ only in the field names they declare
  Function splitterLeft = new RegexSplitter( new Fields( "numA", "left" ), " " );
  Function splitterRight = new RegexSplitter( new Fields( "numB", "right" ), " " );

  // lhs: drop "offset" so only "line" reaches the splitter
  Pipe offsetLower = new Pipe( "offsetLower" );
  offsetLower = new Discard( offsetLower, new Fields( "offset" ) );
  offsetLower = new Each( offsetLower, new Fields( "line" ), splitterLeft );

  Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitterRight );

  Pipe cogroup = new CoGroup( offsetLower, new Fields( "numA" ), pipeLower, new Fields( "numB" ) );

  Flow flow = getPlatform().getFlowConnector().connect( sources, sink, cogroup );

  flow.complete();

  validateLength( flow, 5 );

  List<Tuple> actual = getSinkAsList( flow );

  assertTrue( actual.contains( new Tuple( "1\ta\t1\ta" ) ) );
  assertTrue( actual.contains( new Tuple( "2\tb\t2\tb" ) ) );
}
/**
 * CoGroups the lower and upper inputs with {@code Fields.NONE} as the grouping fields on both
 * sides and expects 25 result lines, including pairs whose "num" values differ.
 *
 * @throws Exception if the flow or any check fails
 */
@Test
public void testJoinNone() throws Exception
{
  getPlatform().copyFromLocal( inputFileLower );
  getPlatform().copyFromLocal( inputFileUpper );

  Tap lowerTap = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower );
  Tap upperTap = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper );

  Map<String, Tap> sources = new HashMap<String, Tap>();
  sources.put( "lower", lowerTap );
  sources.put( "upper", upperTap );

  Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "joinnone" ), SinkMode.REPLACE );

  Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );

  Pipe lhs = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
  Pipe rhs = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter );

  // no grouping key on either side
  Pipe tail = new CoGroup( lhs, Fields.NONE, rhs, Fields.NONE, Fields.size( 4 ) );

  Map<Object, Object> properties = getProperties();

  Flow flow = getPlatform().getFlowConnector( properties ).connect( sources, sink, tail );

  flow.complete();

  validateLength( flow, 25 );

  List<Tuple> lines = getSinkAsList( flow );

  assertTrue( lines.contains( new Tuple( "1\ta\t1\tA" ) ) );
  assertTrue( lines.contains( new Tuple( "1\ta\t2\tB" ) ) );
  assertTrue( lines.contains( new Tuple( "2\tb\t2\tB" ) ) );
}
/**
 * Builds one flow with four CoGroup tails (inner, outer, left and right join) over the same two
 * heads, both reading the same source: "fielded" splits each line on whitespace, "unique" emits
 * one tuple per distinct word. Each tail writes to its own sink, whose length is then checked.
 *
 * @throws Exception if the flow or any check fails
 */
@Test
public void testMultiJoin() throws Exception
{
  getPlatform().copyFromLocal( inputFileCrossX2 );

  Tap source = getPlatform().getTextFile( new Fields( "line" ), inputFileCrossX2 );

  Tap innerOut = getPlatform().getTextFile( getOutputPath( "inner" ), SinkMode.REPLACE );
  Tap outerOut = getPlatform().getTextFile( getOutputPath( "outer" ), SinkMode.REPLACE );
  Tap leftOut = getPlatform().getTextFile( getOutputPath( "left" ), SinkMode.REPLACE );
  Tap rightOut = getPlatform().getTextFile( getOutputPath( "right" ), SinkMode.REPLACE );

  // "unique" head: one word per tuple, grouped by word, keeping the first of each group
  Pipe words = new Each( new Pipe( "unique" ), new Fields( "line" ), new RegexSplitGenerator( new Fields( "word" ), "\\s" ) );
  Pipe byWord = new GroupBy( words, new Fields( "word" ) );
  Pipe uniques = new Every( byWord, new Fields( "word" ), new First( Fields.ARGS ), Fields.REPLACE );

  // "fielded" head: each line split on whitespace into positional fields
  Pipe fielded = new Each( new Pipe( "fielded" ), new Fields( "line" ), new RegexSplitter( "\\s" ) );

  // every join keys the first positional field of "fielded" against "word" of "unique"
  Pipe inner = new CoGroup( "inner", fielded, new Fields( 0 ), uniques, new Fields( "word" ), new InnerJoin() );
  Pipe outer = new CoGroup( "outer", fielded, new Fields( 0 ), uniques, new Fields( "word" ), new OuterJoin() );
  Pipe left = new CoGroup( "left", fielded, new Fields( 0 ), uniques, new Fields( "word" ), new LeftJoin() );
  Pipe right = new CoGroup( "right", fielded, new Fields( 0 ), uniques, new Fields( "word" ), new RightJoin() );

  // both heads are bound to the same source tap
  Pipe[] heads = Pipe.pipes( uniques, fielded );
  Map<String, Tap> sources = Cascades.tapsMap( heads, Tap.taps( source, source ) );

  // tails and sinks are paired by position
  Pipe[] tails = Pipe.pipes( inner, outer, left, right );
  Map<String, Tap> sinks = Cascades.tapsMap( tails, Tap.taps( innerOut, outerOut, leftOut, rightOut ) );

  Flow flow = getPlatform().getFlowConnector().connect( "multi-joins", sources, sinks, tails );

  flow.complete();

  validateLength( flow.openTapForRead( innerOut ), 74 );
  validateLength( flow.openTapForRead( outerOut ), 84 );
  validateLength( flow.openTapForRead( leftOut ), 74 );
  validateLength( flow.openTapForRead( rightOut ), 84 );
}
}