Package cascading

Source Code of cascading.MergePipesPlatformTest

/*
* Copyright (c) 2007-2014 Concurrent, Inc. All Rights Reserved.
*
* Project and contact information: http://www.cascading.org/
*
* This file is part of the Cascading project.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package cascading;

import java.io.IOException;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;

import cascading.flow.Flow;
import cascading.operation.Function;
import cascading.operation.Identity;
import cascading.operation.regex.RegexFilter;
import cascading.operation.regex.RegexSplitter;
import cascading.pipe.CoGroup;
import cascading.pipe.Each;
import cascading.pipe.GroupBy;
import cascading.pipe.HashJoin;
import cascading.pipe.Merge;
import cascading.pipe.Pipe;
import cascading.pipe.assembly.Rename;
import cascading.pipe.assembly.Retain;
import cascading.tap.SinkMode;
import cascading.tap.Tap;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import org.junit.Test;

import static data.InputData.*;

public class MergePipesPlatformTest extends PlatformTestCase
  {
  public MergePipesPlatformTest()
    {
    super( true ); // leave cluster testing enabled
    }

  @Test
  public void testSimpleMerge() throws Exception
    {
    getPlatform().copyFromLocal( inputFileLower );
    getPlatform().copyFromLocal( inputFileUpper );

    Tap sourceLower = getPlatform().getTextFile( inputFileLower );
    Tap sourceUpper = getPlatform().getTextFile( inputFileUpper );

    Map sources = new HashMap();

    sources.put( "lower", sourceLower );
    sources.put( "upper", sourceUpper );

    Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );

    Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "simplemerge" ), SinkMode.REPLACE );

    Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
    Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter );

    Pipe splice = new Merge( "merge", pipeLower, pipeUpper );

    Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice );

    flow.complete();

    validateLength( flow, 10 );

    Collection results = getSinkAsList( flow );

    assertTrue( "missing value", results.contains( new Tuple( "1\ta" ) ) );
    assertTrue( "missing value", results.contains( new Tuple( "1\tA" ) ) );
    }

  /**
   * Specifically tests GroupBy will return the correct grouping fields to the following Every
   *
   * @throws Exception
   */
  @Test
  public void testSimpleMergeThree() throws Exception
    {
    getPlatform().copyFromLocal( inputFileLower );
    getPlatform().copyFromLocal( inputFileUpper );
    getPlatform().copyFromLocal( inputFileLowerOffset );

    Tap sourceLower = getPlatform().getTextFile( inputFileLower );
    Tap sourceUpper = getPlatform().getTextFile( inputFileUpper );
    Tap sourceLowerOffset = getPlatform().getTextFile( inputFileLowerOffset );

    Map sources = new HashMap();

    sources.put( "lower", sourceLower );
    sources.put( "upper", sourceUpper );
    sources.put( "offset", sourceLowerOffset );

    Tap sink = getPlatform().getTextFile( getOutputPath( "simplemergethree" ), SinkMode.REPLACE );

    Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );

    Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
    Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter );
    Pipe pipeOffset = new Each( new Pipe( "offset" ), new Fields( "line" ), splitter );

    Pipe splice = new Merge( "merge", pipeLower, pipeUpper, pipeOffset );

    splice = new Each( splice, new Fields( "num", "char" ), new Identity() );

    Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice );

    flow.complete();

    validateLength( flow, 14 );
    }

  @Test
  public void testSimpleMergeThreeChain() throws Exception
    {
    getPlatform().copyFromLocal( inputFileLower );
    getPlatform().copyFromLocal( inputFileUpper );
    getPlatform().copyFromLocal( inputFileLowerOffset );

    Tap sourceLower = getPlatform().getTextFile( inputFileLower );
    Tap sourceUpper = getPlatform().getTextFile( inputFileUpper );
    Tap sourceLowerOffset = getPlatform().getTextFile( inputFileLowerOffset );

    Map sources = new HashMap();

    sources.put( "lower", sourceLower );
    sources.put( "upper", sourceUpper );
    sources.put( "offset", sourceLowerOffset );

    Tap sink = getPlatform().getTextFile( getOutputPath( "simplemergethreechain" ), SinkMode.REPLACE );

    Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );

    Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
    Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter );
    Pipe pipeOffset = new Each( new Pipe( "offset" ), new Fields( "line" ), splitter );

    Pipe splice = new Merge( "merge", pipeLower, pipeUpper );

    splice = new Merge( splice, pipeOffset );

    splice = new Each( splice, new Fields( "num", "char" ), new Identity() );

    Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice );

    flow.complete();

    validateLength( flow, 14 );
    }

  @Test
  public void testSimpleMergeThreeChainGroup() throws Exception
    {
    getPlatform().copyFromLocal( inputFileLower );
    getPlatform().copyFromLocal( inputFileUpper );
    getPlatform().copyFromLocal( inputFileLowerOffset );

    Tap sourceLower = getPlatform().getTextFile( inputFileLower );
    Tap sourceUpper = getPlatform().getTextFile( inputFileUpper );
    Tap sourceLowerOffset = getPlatform().getTextFile( inputFileLowerOffset );

    Map sources = new HashMap();

    sources.put( "lower", sourceLower );
    sources.put( "upper", sourceUpper );
    sources.put( "offset", sourceLowerOffset );

    Tap sink = getPlatform().getTextFile( getOutputPath( "simplemergethreechaingroup" ), SinkMode.REPLACE );

    Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );

    Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
    Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter );
    Pipe pipeOffset = new Each( new Pipe( "offset" ), new Fields( "line" ), splitter );

    Pipe splice = new Merge( "merge", pipeLower, pipeUpper );

    splice = new Merge( splice, pipeOffset );

    splice = new GroupBy( splice, new Fields( "num" ) );

    Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice );

    if( getPlatform().isMapReduce() )
      assertEquals( "wrong num jobs", 1, flow.getFlowSteps().size() );

    flow.complete();

    validateLength( flow, 14 );
    }

  @Test
  public void testSimpleMergeThreeChainCoGroup() throws Exception
    {
    getPlatform().copyFromLocal( inputFileLower );
    getPlatform().copyFromLocal( inputFileUpper );
    getPlatform().copyFromLocal( inputFileLowerOffset );

    Tap sourceLower = getPlatform().getTextFile( inputFileLower );
    Tap sourceUpper = getPlatform().getTextFile( inputFileUpper );
    Tap sourceLowerOffset = getPlatform().getTextFile( inputFileLowerOffset );

    Map sources = new HashMap();

    sources.put( "lower", sourceLower );
    sources.put( "upper", sourceUpper );
    sources.put( "offset", sourceLowerOffset );

    Tap sink = getPlatform().getTextFile( getOutputPath( "simplemergethreechaincogroup" ), SinkMode.REPLACE );

    Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), new RegexSplitter( new Fields( "num1", "char1" ), " " ) );
    Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), new RegexSplitter( new Fields( "num1", "char1" ), " " ) );
    Pipe pipeOffset = new Each( new Pipe( "offset" ), new Fields( "line" ), new RegexSplitter( new Fields( "num2", "char2" ), " " ) );

    Pipe splice = new Merge( "merge", pipeLower, pipeUpper );

    splice = new CoGroup( splice, new Fields( "num1" ), pipeOffset, new Fields( "num2" ) );

    Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice );

    if( getPlatform().isMapReduce() )
      assertEquals( "wrong num jobs", 1, flow.getFlowSteps().size() );

    flow.complete();

    validateLength( flow, 6 );
    }

  @Test
  public void testSameSourceMergeThreeChainGroup() throws Exception
    {
    getPlatform().copyFromLocal( inputFileLower );

    Tap sourceLower = getPlatform().getTextFile( inputFileLower );

    Map sources = new HashMap();

    sources.put( "split", sourceLower );

    Tap sink = getPlatform().getTextFile( getOutputPath( "samemergethreechaingroup" ), SinkMode.REPLACE );

    Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );

    Pipe pipe = new Pipe( "split" );

    Pipe pipeLower = new Each( new Pipe( "lower", pipe ), new Fields( "line" ), splitter );
    Pipe pipeUpper = new Each( new Pipe( "upper", pipe ), new Fields( "line" ), splitter );
    Pipe pipeOffset = new Each( new Pipe( "offset", pipe ), new Fields( "line" ), splitter );

    Pipe splice = new Merge( "merge", pipeLower, pipeUpper );

    //put group before merge to test path counts
    splice = new GroupBy( splice, new Fields( "num" ) );

    splice = new Merge( splice, pipeOffset );

    // this group has its incoming paths counted, gated by the previous group
    splice = new GroupBy( splice, new Fields( "num" ) );

    Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice );

    if( getPlatform().isMapReduce() )
      assertEquals( "wrong num jobs", 2, flow.getFlowSteps().size() );

    flow.complete();

    validateLength( flow, 15 );
    }

  @Test
  public void testSplitSameSourceMerged() throws Exception
    {
    getPlatform().copyFromLocal( inputFileApache );

    Tap source = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileApache );
    Tap sink = getPlatform().getTextFile( getOutputPath( "splitsourcemerged" ), SinkMode.REPLACE );

    Pipe pipe = new Pipe( "split" );

    pipe = new Each( pipe, new Fields( "line" ), new RegexFilter( "^68.*" ) );

    Pipe left = new Each( new Pipe( "left", pipe ), new Fields( "line" ), new RegexFilter( ".*46.*" ) );
    Pipe right = new Each( new Pipe( "right", pipe ), new Fields( "line" ), new RegexFilter( ".*102.*" ) );

    Pipe merged = new Merge( "merged", left, right );

    merged = new Each( merged, new Fields( "line" ), new Identity() );

    Flow flow = getPlatform().getFlowConnector().connect( source, sink, merged );

    if( getPlatform().isMapReduce() )
      assertEquals( "wrong num jobs", 1, flow.getFlowSteps().size() );

    flow.complete();

    validateLength( flow, 3 );
    }

  @Test
  public void testSplitSameSourceMergedComplex() throws Exception
    {
    getPlatform().copyFromLocal( inputFileApache );

    Tap source = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileApache );
    Tap sink = getPlatform().getTextFile( getOutputPath( "splitsourcemergedcomplex" ), SinkMode.REPLACE );

    Pipe pipe = new Pipe( "split" );

    pipe = new Each( pipe, new Fields( "line" ), new RegexFilter( "^68.*" ) );

    Pipe left = new Each( new Pipe( "left", pipe ), new Fields( "line" ), new RegexFilter( ".*46.*" ) );
    Pipe right = new Each( new Pipe( "right", pipe ), new Fields( "line" ), new RegexFilter( ".*102.*" ) );

    Pipe merged = new Merge( "merged-first", left, right );

    merged = new Each( merged, new Fields( "line" ), new Identity() );

    left = new Each( new Pipe( "left", merged ), new Fields( "line" ), new RegexFilter( ".*46.*" ) );
    right = new Each( new Pipe( "right", merged ), new Fields( "line" ), new RegexFilter( ".*102.*" ) );

    merged = new Merge( "merged-second", left, right );

    merged = new Each( merged, new Fields( "line" ), new Identity() );

    Flow flow = getPlatform().getFlowConnector().connect( source, sink, merged );

    if( getPlatform().isMapReduce() )
      assertEquals( "wrong num jobs", 1, flow.getFlowSteps().size() );

    flow.complete();

    validateLength( flow, 3 );
    }

  @Test
  public void testSimpleMergeFail() throws Exception
    {
    getPlatform().copyFromLocal( inputFileLower );
    getPlatform().copyFromLocal( inputFileUpper );

    Tap sourceLower = getPlatform().getTextFile( inputFileLower );
    Tap sourceUpper = getPlatform().getTextFile( inputFileUpper );

    Map sources = new HashMap();

    sources.put( "lower", sourceLower );
    sources.put( "upper", sourceUpper );

    Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );

    Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "simplemergefail" ), SinkMode.REPLACE );

    Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
    pipeLower = new Rename( pipeLower, new Fields( "num" ), new Fields( "num2" ) );

    Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter );

    Pipe splice = new Merge( "merge", pipeLower, pipeUpper );

    try
      {
      Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice );
      fail();
      }
    catch( Exception exception )
      {
//      exception.printStackTrace();
      // ignore
      }
    }

  @Test
  public void testMergeIntoHashJoinStreamed() throws Exception
    {
    runMergeIntoHashJoin( true );
    }

  @Test
  public void testMergeIntoHashJoinAccumulated() throws Exception
    {
    runMergeIntoHashJoin( false );
    }

  private void runMergeIntoHashJoin( boolean streamed ) throws IOException
    {
    getPlatform().copyFromLocal( inputFileLower );
    getPlatform().copyFromLocal( inputFileUpper );
    getPlatform().copyFromLocal( inputFileLowerOffset );

    Tap sourceLower = getPlatform().getTextFile( inputFileLower );
    Tap sourceUpper = getPlatform().getTextFile( inputFileUpper );
    Tap sourceLowerOffset = getPlatform().getTextFile( inputFileLowerOffset );

    Map sources = new HashMap();

    sources.put( "lower", sourceLower );
    sources.put( "upper", sourceUpper );
    sources.put( "offset", sourceLowerOffset );

    String name = streamed ? "streamed" : "accumulated";
    String path = "mergeintohashjoin" + name;
    Tap sink = getPlatform().getTextFile( getOutputPath( path ), SinkMode.REPLACE );

    Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), new RegexSplitter( new Fields( "num1", "char1" ), " " ) );
    Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), new RegexSplitter( new Fields( "num1", "char1" ), " " ) );
    Pipe pipeOffset = new Each( new Pipe( "offset" ), new Fields( "line" ), new RegexSplitter( new Fields( "num2", "char2" ), " " ) );

    Pipe splice = new Merge( "merge", pipeLower, pipeUpper );

    if( streamed )
      splice = new HashJoin( splice, new Fields( "num1" ), pipeOffset, new Fields( "num2" ) );
    else
      splice = new HashJoin( pipeOffset, new Fields( "num2" ), splice, new Fields( "num1" ) );

    Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice );

//    flow.writeDOT( name + ".dot" );

    // two jobs, we must put a temp tap between the Merge and HashJoin
    if( getPlatform().isMapReduce() )
      assertEquals( "wrong num jobs", 1, flow.getFlowSteps().size() );

    flow.complete();

    validateLength( flow, 6 );
    }

  @Test
  public void testHashJoinMergeIntoHashJoinStreamed() throws Exception
    {
    runHashJoinIntoMergeIntoHashJoin( true );
    }

  @Test
  public void testHashJoinMergeIntoHashJoinAccumulated() throws Exception
    {
    runHashJoinIntoMergeIntoHashJoin( false );
    }

  private void runHashJoinIntoMergeIntoHashJoin( boolean streamed ) throws IOException
    {
    getPlatform().copyFromLocal( inputFileLower );
    getPlatform().copyFromLocal( inputFileUpper );
    getPlatform().copyFromLocal( inputFileLowerOffset );

    Tap sourceLower = getPlatform().getTextFile( inputFileLower );
    Tap sourceUpper = getPlatform().getTextFile( inputFileUpper );
    Tap sourceLowerOffset = getPlatform().getTextFile( inputFileLowerOffset );

    Map sources = new HashMap();

    sources.put( "lower", sourceLower );
    sources.put( "upper", sourceUpper );
    sources.put( "offset", sourceLowerOffset );

    String name = streamed ? "streamed" : "accumulated";
    String path = "hashjoinintomergeintohashjoin" + name;
    Tap sink = getPlatform().getTextFile( getOutputPath( path ), SinkMode.REPLACE );

    Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), new RegexSplitter( new Fields( "num1", "char1" ), " " ) );
    Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), new RegexSplitter( new Fields( "num1", "char1" ), " " ) );
    Pipe pipeOffset = new Each( new Pipe( "offset" ), new Fields( "line" ), new RegexSplitter( new Fields( "num2", "char2" ), " " ) );

    Pipe splice = new HashJoin( pipeLower, new Fields( "num1" ), pipeOffset, new Fields( "num2" ) );

    splice = new Retain( splice, new Fields( "num1", "char1" ) );

    splice = new Merge( "merge", splice, pipeUpper );

    if( streamed )
      splice = new HashJoin( splice, new Fields( "num1" ), pipeOffset, new Fields( "num2" ) );
    else
      splice = new HashJoin( pipeOffset, new Fields( "num2" ), splice, new Fields( "num1" ) );

    Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice );

//    flow.writeDOT( name + ".dot" );

    if( getPlatform().isMapReduce() )
      assertEquals( "wrong num jobs", streamed ? 1 : 2, flow.getFlowSteps().size() );

    flow.complete();

    validateLength( flow, 8 );
    }

  @Test
  public void testHashJoinMergeIntoHashJoinStreamedStreamedMerge() throws Exception
    {
    runMultiHashJoinIntoMergeIntoHashJoin( true, true, true, 1 );
    }

  @Test
  public void testHashJoinMergeIntoHashJoinAccumulatedAccumulatedMerge() throws Exception
    {
    runMultiHashJoinIntoMergeIntoHashJoin( false, false, true, 3 );
    }

  /**
   * This test will exercise the issue where a unconnected HashJoin could be accumulated against within
   * a node.
   */
  @Test
  public void testHashJoinMergeIntoHashJoinStreamedAccumulatedMerge() throws Exception
    {
    runMultiHashJoinIntoMergeIntoHashJoin( true, false, true, 2 );
    }

  @Test
  public void testHashJoinMergeIntoHashJoinAccumulatedStreamedMerge() throws Exception
    {
    runMultiHashJoinIntoMergeIntoHashJoin( false, true, true, 3 );
    }

  @Test
  public void testHashJoinMergeIntoHashJoinStreamedStreamed() throws Exception
    {
    runMultiHashJoinIntoMergeIntoHashJoin( true, true, false, 1 );
    }

  @Test
  public void testHashJoinMergeIntoHashJoinAccumulatedAccumulated() throws Exception
    {
    runMultiHashJoinIntoMergeIntoHashJoin( false, false, false, 3 );
    }

  @Test
  public void testHashJoinMergeIntoHashJoinStreamedAccumulated() throws Exception
    {
    runMultiHashJoinIntoMergeIntoHashJoin( true, false, false, 2 );
    }

  @Test
  public void testHashJoinMergeIntoHashJoinAccumulatedStreamed() throws Exception
    {
    runMultiHashJoinIntoMergeIntoHashJoin( false, true, false, 3 );
    }

  private void runMultiHashJoinIntoMergeIntoHashJoin( boolean firstStreamed, boolean secondStreamed, boolean interMerge, int expectedSteps ) throws IOException
    {
    getPlatform().copyFromLocal( inputFileLower );
    getPlatform().copyFromLocal( inputFileUpper );
    getPlatform().copyFromLocal( inputFileLowerOffset );

    Tap sourceLower = getPlatform().getTextFile( inputFileLower );
    Tap sourceUpper = getPlatform().getTextFile( inputFileUpper );
    Tap sourceLowerOffset = getPlatform().getTextFile( inputFileLowerOffset );

    Map sources = new HashMap();

    sources.put( "lower", sourceLower );
    sources.put( "upper", sourceUpper );
    sources.put( "offset", sourceLowerOffset );

    String name = firstStreamed ? "firstStreamed" : "firstAccumulated";
    name += secondStreamed ? "secondStreamed" : "secondAccumulated";
    name += interMerge ? "interMerge" : "noInterMerge";

    String path = "multihashjoinintomergeintohashjoin" + name;
    Tap sink = getPlatform().getTextFile( getOutputPath( path ), SinkMode.REPLACE );

    Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), new RegexSplitter( new Fields( "num1", "char1" ), " " ) );
    Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), new RegexSplitter( new Fields( "num1", "char1" ), " " ) );
    Pipe pipeOffset = new Each( new Pipe( "offset" ), new Fields( "line" ), new RegexSplitter( new Fields( "num2", "char2" ), " " ) );

    Pipe splice = new HashJoin( pipeLower, new Fields( "num1" ), pipeOffset, new Fields( "num2" ) );

    splice = new Retain( splice, new Fields( "num1", "char1" ) );

    splice = new Merge( "merge1", splice, pipeUpper );

    if( firstStreamed )
      splice = new HashJoin( splice, new Fields( "num1" ), pipeOffset, new Fields( "num2" ) );
    else
      splice = new HashJoin( pipeOffset, new Fields( "num2" ), splice, new Fields( "num1" ) );

    splice = new Retain( splice, new Fields( "num1", "char1" ) );

    if( interMerge )
      splice = new Merge( "merge2", splice, pipeUpper );

    if( secondStreamed )
      splice = new HashJoin( splice, new Fields( "num1" ), pipeOffset, new Fields( "num2" ) );
    else
      splice = new HashJoin( pipeOffset, new Fields( "num2" ), splice, new Fields( "num1" ) );

    Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice );

//    flow.writeDOT( name + ".dot" );

    if( getPlatform().isMapReduce() )
      assertEquals( "wrong num jobs", expectedSteps, flow.getFlowSteps().size() );

    flow.complete();

    validateLength( flow, interMerge ? 17 : 14 );
    }
  }
TOP

Related Classes of cascading.MergePipesPlatformTest

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.