// Source code of org.apache.tez.mapreduce.input.TestMultiMRInput

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


package org.apache.tez.mapreduce.input;


import java.nio.ByteBuffer;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
import static org.mockito.Mockito.doReturn;
import static org.mockito.Mockito.mock;


import java.io.IOException;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Random;


import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.tez.common.TezUtils;
import org.apache.tez.common.counters.TezCounters;
import org.apache.tez.dag.api.UserPayload;
import org.apache.tez.mapreduce.hadoop.MRInputHelpers;
import org.apache.tez.mapreduce.protos.MRRuntimeProtos.MRInputUserPayloadProto;
import org.apache.tez.mapreduce.protos.MRRuntimeProtos.MRSplitProto;
import org.apache.tez.runtime.api.Event;
import org.apache.tez.runtime.api.InputContext;
import org.apache.tez.runtime.api.events.InputDataInformationEvent;
import org.apache.tez.runtime.library.api.KeyValueReader;
import org.junit.Before;
import org.junit.Test;


public class TestMultiMRInput {


  private static final Log LOG = LogFactory.getLog(TestMultiMRInput.class);


  private static final JobConf defaultConf = new JobConf();
  private static final String testTmpDir;
  private static final Path TEST_ROOT_DIR;
  private static FileSystem localFs;


  static {
    defaultConf.set("fs.defaultFS", "file:///");
    try {
      localFs = FileSystem.getLocal(defaultConf);
      testTmpDir = System.getProperty("test.build.data", "/tmp");
      TEST_ROOT_DIR = new Path(testTmpDir, TestMultiMRInput.class.getSimpleName());
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }


  @Before
  public void setup() throws IOException {
    LOG.info("Setup. Using test dir: " + TEST_ROOT_DIR);
    localFs.delete(TEST_ROOT_DIR, true);
    localFs.mkdirs(TEST_ROOT_DIR);
  }


  @Test(timeout = 5000)
  public void testSingleSplit() throws Exception {


    Path workDir = new Path(TEST_ROOT_DIR, "testSingleSplit");
    JobConf jobConf = new JobConf(defaultConf);
    jobConf.setInputFormat(org.apache.hadoop.mapred.SequenceFileInputFormat.class);
    FileInputFormat.setInputPaths(jobConf, workDir);


    MRInputUserPayloadProto.Builder builder = MRInputUserPayloadProto.newBuilder();
    builder.setGroupingEnabled(false);
    builder.setConfigurationBytes(TezUtils.createByteStringFromConf(jobConf));
    byte[] payload = builder.build().toByteArray();


    InputContext inputContext = createTezInputContext(payload);


    MultiMRInput input = new MultiMRInput(inputContext, 1);
    input.initialize();
    List<Event> eventList = new ArrayList<Event>();


    String file1 = "file1";
    LinkedHashMap<LongWritable, Text> data1 = createInputData(localFs, workDir, jobConf, file1, 0,
        10);
    SequenceFileInputFormat<LongWritable, Text> format =
        new SequenceFileInputFormat<LongWritable, Text>();
    InputSplit[] splits = format.getSplits(jobConf, 1);
    assertEquals(1, splits.length);


    MRSplitProto splitProto = MRInputHelpers.createSplitProto(splits[0]);
    InputDataInformationEvent event =
        InputDataInformationEvent.createWithSerializedPayload(0,
            splitProto.toByteString().asReadOnlyByteBuffer());


    eventList.clear();
    eventList.add(event);
    input.handleEvents(eventList);


    int readerCount = 0;
    for (KeyValueReader reader : input.getKeyValueReaders()) {
      readerCount++;
      while (reader.next()) {
        if (data1.size() == 0) {
          fail("Found more records than expected");
        }
        Object key = reader.getCurrentKey();
        Object val = reader.getCurrentValue();
        assertEquals(val, data1.remove(key));
      }
    }
    assertEquals(1, readerCount);
  }


  @Test(timeout = 5000)
  public void testMultipleSplits() throws Exception {


    Path workDir = new Path(TEST_ROOT_DIR, "testMultipleSplits");
    JobConf jobConf = new JobConf(defaultConf);
    jobConf.setInputFormat(org.apache.hadoop.mapred.SequenceFileInputFormat.class);
    FileInputFormat.setInputPaths(jobConf, workDir);


    MRInputUserPayloadProto.Builder builder = MRInputUserPayloadProto.newBuilder();
    builder.setGroupingEnabled(false);
    builder.setConfigurationBytes(TezUtils.createByteStringFromConf(jobConf));
    byte[] payload = builder.build().toByteArray();


    InputContext inputContext = createTezInputContext(payload);


    MultiMRInput input = new MultiMRInput(inputContext, 2);
    input.initialize();
    List<Event> eventList = new ArrayList<Event>();


    LinkedHashMap<LongWritable, Text> data = new LinkedHashMap<LongWritable, Text>();


    String file1 = "file1";
    LinkedHashMap<LongWritable, Text> data1 = createInputData(localFs, workDir, jobConf, file1, 0,
        10);


    String file2 = "file2";
    LinkedHashMap<LongWritable, Text> data2 = createInputData(localFs, workDir, jobConf, file2, 10,
        20);


    data.putAll(data1);
    data.putAll(data2);


    SequenceFileInputFormat<LongWritable, Text> format =
        new SequenceFileInputFormat<LongWritable, Text>();
    InputSplit[] splits = format.getSplits(jobConf, 2);
    assertEquals(2, splits.length);


    MRSplitProto splitProto1 = MRInputHelpers.createSplitProto(splits[0]);
    InputDataInformationEvent event1 =
        InputDataInformationEvent.createWithSerializedPayload(0,
            splitProto1.toByteString().asReadOnlyByteBuffer());


    MRSplitProto splitProto2 = MRInputHelpers.createSplitProto(splits[1]);
    InputDataInformationEvent event2 =
        InputDataInformationEvent.createWithSerializedPayload(0,
            splitProto2.toByteString().asReadOnlyByteBuffer());


    eventList.clear();
    eventList.add(event1);
    eventList.add(event2);
    input.handleEvents(eventList);


    int readerCount = 0;
    for (KeyValueReader reader : input.getKeyValueReaders()) {
      readerCount++;
      while (reader.next()) {
        if (data.size() == 0) {
          fail("Found more records than expected");
        }
        Object key = reader.getCurrentKey();
        Object val = reader.getCurrentValue();
        assertEquals(val, data.remove(key));
      }
    }
    assertEquals(2, readerCount);
  }


  @Test(timeout = 5000)
  public void testExtraEvents() throws Exception {
    Path workDir = new Path(TEST_ROOT_DIR, "testExtraEvents");
    JobConf jobConf = new JobConf(defaultConf);
    jobConf.setInputFormat(org.apache.hadoop.mapred.SequenceFileInputFormat.class);
    FileInputFormat.setInputPaths(jobConf, workDir);


    MRInputUserPayloadProto.Builder builder = MRInputUserPayloadProto.newBuilder();
    builder.setGroupingEnabled(false);
    builder.setConfigurationBytes(TezUtils.createByteStringFromConf(jobConf));
    byte[] payload = builder.build().toByteArray();


    InputContext inputContext = createTezInputContext(payload);


    MultiMRInput input = new MultiMRInput(inputContext, 1);
    input.initialize();
    List<Event> eventList = new ArrayList<Event>();


    String file1 = "file1";
    createInputData(localFs, workDir, jobConf, file1, 0, 10);
    SequenceFileInputFormat<LongWritable, Text> format =
        new SequenceFileInputFormat<LongWritable, Text>();
    InputSplit[] splits = format.getSplits(jobConf, 1);
    assertEquals(1, splits.length);


    MRSplitProto splitProto = MRInputHelpers.createSplitProto(splits[0]);
    InputDataInformationEvent event1 =
        InputDataInformationEvent.createWithSerializedPayload(0,
            splitProto.toByteString().asReadOnlyByteBuffer());
    InputDataInformationEvent event2 =
        InputDataInformationEvent.createWithSerializedPayload(1,
            splitProto.toByteString().asReadOnlyByteBuffer());


    eventList.clear();
    eventList.add(event1);
    eventList.add(event2);
    try {
      input.handleEvents(eventList);
      fail("Expecting Exception due to too many events");
    } catch (Exception e) {
      assertTrue(e.getMessage().contains(
          "Unexpected event. All physical sources already initialized"));
    }
  }


  private InputContext createTezInputContext(byte[] payload) {
    ApplicationId applicationId = ApplicationId.newInstance(10000, 1);
    TezCounters counters = new TezCounters();


    InputContext inputContext = mock(InputContext.class);
    doReturn(applicationId).when(inputContext).getApplicationId();
    doReturn(counters).when(inputContext).getCounters();
    doReturn(1).when(inputContext).getDAGAttemptNumber();
    doReturn("dagName").when(inputContext).getDAGName();
    doReturn(1).when(inputContext).getInputIndex();
    doReturn("srcVertexName").when(inputContext).getSourceVertexName();
    doReturn(1).when(inputContext).getTaskAttemptNumber();
    doReturn(1).when(inputContext).getTaskIndex();
    doReturn(1).when(inputContext).getTaskVertexIndex();
    doReturn("taskVertexName").when(inputContext).getTaskVertexName();
    doReturn(UserPayload.create(ByteBuffer.wrap(payload))).when(inputContext).getUserPayload();
    return inputContext;
  }


  public static LinkedHashMap<LongWritable, Text> createInputData(FileSystem fs, Path workDir,
                                                                  JobConf job, String filename,
                                                                  long startKey,
                                                                  long numKeys) throws IOException {
    LinkedHashMap<LongWritable, Text> data = new LinkedHashMap<LongWritable, Text>();
    Path file = new Path(workDir, filename);
    LOG.info("Generating data at path: " + file);
    // create a file with length entries
    @SuppressWarnings("deprecation")
    SequenceFile.Writer writer = SequenceFile.createWriter(fs, job, file, LongWritable.class,
        Text.class);
    try {
      Random r = new Random(System.currentTimeMillis());
      LongWritable key = new LongWritable();
      Text value = new Text();
      for (long i = startKey; i < numKeys; i++) {
        key.set(i);
        value.set(Integer.toString(r.nextInt(10000)));
        data.put(new LongWritable(key.get()), new Text(value.toString()));
        writer.append(key, value);
        LOG.info("<k, v> : <" + key.get() + ", " + value + ">");
      }
    } finally {
      writer.close();
    }
    return data;
  }
}
// Source code of org.apache.tez.mapreduce.input.TestMultiMRInput
//
// Related classes of org.apache.tez.mapreduce.input.TestMultiMRInput