Source Code of co.nubetech.hiho.merge.TestMergeJob

/**
* Copyright 2011 Nube Technologies
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed
* under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
* CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and limitations under the License.
*/
package co.nubetech.hiho.merge;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;
import org.junit.Test;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;

import co.nubetech.hiho.common.HIHOException;
import co.nubetech.hiho.common.HihoTestCase;
import co.nubetech.hiho.testdata.Student;

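/**
 * Tests for {@link MergeJob}: validation of the mandatory configuration
 * options, followed by end-to-end merges of an "old" and a "new" data set,
 * merged either by key or by value across several input formats.
 */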
public class TestMergeJob extends HihoTestCase {
  final static Logger logger = Logger.getLogger(TestMergeJob.class);

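  // All mandatory options supplied: checkMandatoryConfs() must pass without throwing.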
  @Test
  public void testCheckMandatoryConfsValidValues() throws HIHOException {
    String[] args = new String[] {
        "-inputFormat", "org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat",
        "-mergeBy", "key",
        "-inputKeyClassName", "org.apache.hadoop.io.IntWritable",
        "-inputValueClassName", "org.apache.hadoop.io.Text",
        "-oldPath", "testData/merge/inputOld/old",
        "-newPath""testData/merge/inputNew/new",
        "-outputPath", "output" };
    MergeJob mergeJob = new MergeJob();
    mergeJob.populateConfiguration(args);
    mergeJob.checkMandatoryConfs();
  }

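  // Each of the following tests omits exactly one mandatory option
  // (-inputFormat, -mergeBy, -inputKeyClassName, -inputValueClassName,
  // -oldPath, -newPath, -outputPath) and expects checkMandatoryConfs()
  // to throw a HIHOException.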
  @Test(expected = HIHOException.class)
  public void testCheckMandatoryConfsForExpectedExceptionInInputFormat()
      throws HIHOException {
    String[] args = new String[] {
        "-mergeBy", "key",
        "-inputKeyClassName", "org.apache.hadoop.io.IntWritable",
        "-inputValueClassName", "org.apache.hadoop.io.Text",
        "-oldPath", "testData/merge/inputForSeqTest/old",
        "-newPath", "testData/merge/inputForSeqTest/new",
        "-outputPath", "output" };
    MergeJob mergeJob = new MergeJob();
    mergeJob.populateConfiguration(args);
    mergeJob.checkMandatoryConfs();
  }

  @Test(expected = HIHOException.class)
  public void testCheckMandatoryConfsForExpectedExceptionInMergeBy()
      throws HIHOException {
    String[] args = new String[] {
        "-inputFormat", "org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat",
        "-inputKeyClassName", "org.apache.hadoop.io.IntWritable",
        "-inputValueClassName", "org.apache.hadoop.io.Text",
        "-oldPath", "testData/merge/inputForSeqTest/old",
        "-newPath", "testData/merge/inputForSeqTest/new",
        "-outputPath", "output" };
    MergeJob mergeJob = new MergeJob();
    mergeJob.populateConfiguration(args);
    mergeJob.checkMandatoryConfs();
  }

  @Test(expected = HIHOException.class)
  public void testCheckMandatoryConfsForExpectedExceptionInInputKeyClassName()
      throws HIHOException {
    String[] args = new String[] {
        "-inputFormat", "org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat",
        "-mergeBy", "key",
        "-inputValueClassName", "org.apache.hadoop.io.Text",
        "-oldPath", "testData/merge/inputForSeqTest/old",
        "-newPath", "testData/merge/inputForSeqTest/new",
        "-outputPath", "output" };
    MergeJob mergeJob = new MergeJob();
    mergeJob.populateConfiguration(args);
    mergeJob.checkMandatoryConfs();
  }

  @Test(expected = HIHOException.class)
  public void testCheckMandatoryConfsForExpectedExceptionInInputValueClassName()
      throws HIHOException {
    String[] args = new String[] {
        "-inputFormat", "org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat",
        "-mergeBy", "key",
        "-inputKeyClassName", "org.apache.hadoop.io.IntWritable",
        "-oldPath", "testData/merge/inputForSeqTest/old",
        "-newPath", "testData/merge/inputForSeqTest/new",
        "-outputPath", "output" };
    MergeJob mergeJob = new MergeJob();
    mergeJob.populateConfiguration(args);
    mergeJob.checkMandatoryConfs();
  }

  @Test(expected = HIHOException.class)
  public void testCheckMandatoryConfsForExpectedExceptionInOldPath()
      throws HIHOException {
    String[] args = new String[] {
        "-inputFormat", "org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat",
        "-mergeBy", "key",
        "-inputKeyClassName", "org.apache.hadoop.io.IntWritable",
        "-inputValueClassName", "org.apache.hadoop.io.Text",
        "-newPath", "testData/merge/inputForSeqTest/new",
        "-outputPath", "output" };
    MergeJob mergeJob = new MergeJob();
    mergeJob.populateConfiguration(args);
    mergeJob.checkMandatoryConfs();
  }

  @Test(expected = HIHOException.class)
  public void testCheckMandatoryConfsForExpectedExceptionInNewPath()
      throws HIHOException {
    String[] args = new String[] {
        "-inputFormat", "org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat",
        "-mergeBy", "key",
        "-inputKeyClassName", "org.apache.hadoop.io.IntWritable",
        "-inputValueClassName", "org.apache.hadoop.io.Text",
        "-oldPath""testData/merge/inputForSeqTest/old",
        "-outputPath", "output" };
    MergeJob mergeJob = new MergeJob();
    mergeJob.populateConfiguration(args);
    mergeJob.checkMandatoryConfs();
  }

  @Test(expected = HIHOException.class)
  public void testCheckMandatoryConfsForExpectedExceptionInOutputPath()
      throws HIHOException {
    String[] args = new String[] {
        "-inputFormat", "org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat",
        "-mergeBy", "key",
        "-inputKeyClassName", "org.apache.hadoop.io.IntWritable",
        "-inputValueClassName", "org.apache.hadoop.io.Text",
        "-oldPath", "testData/merge/inputForSeqTest/old",
        "-newPath", "testData/merge/inputForSeqTest/new" };
    MergeJob mergeJob = new MergeJob();
    mergeJob.populateConfiguration(args);
    mergeJob.checkMandatoryConfs();
  }
 
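  // Builds a fully populated Student record; used by the custom-key merge test below.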
  public Student setStudent(Text name, Text address, IntWritable id,
      LongWritable mobNumber, DoubleWritable per) {
    Student student = new Student();
    student.setName(name);
    student.setAddress(address);
    student.setId(id);
    student.setMobileNumber(mobNumber);
    student.setPercentage(per);
    return student;
  }
 
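  // Merge by key with a custom Writable key (Student). The new set holds
  // students 1-3 and the old set students 2-4, so the merged output contains
  // 4 distinct students.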
  @Test
  public void testMergeByCustomObjectKeyWithSequenceFileInputFormat() throws Exception {   
    Student student1 = setStudent(new Text("Sam"), new Text("US"), new IntWritable(1),
        new LongWritable(9999999998L), new DoubleWritable(99.12));
    Student student2 = setStudent(new Text("John"), new Text("AUS"), new IntWritable(2),
        new LongWritable(9999999999L), new DoubleWritable(90.12));
    Student student3 = setStudent(new Text("Mary"), new Text("UK"), new IntWritable(3),
        new LongWritable(9999999988L), new DoubleWritable(69.12));
    Student student4 = setStudent(new Text("Kelvin"), new Text("UK"), new IntWritable(4),
        new LongWritable(9999998888L), new DoubleWritable(59.12));
 
    HashMap<Student, Text> inputData1 = new HashMap<Student, Text>();
    inputData1.put(student1, new Text("Macon Kent,6269 Aenean St.,1-247-399-1051,08253"));
    inputData1.put(student2, new Text("Dale Zamora,521-7792 Mauris Rd.,1-214-625-6970,90510"));
    inputData1.put(student3, new Text("Charles Wood,525-9709 In Rd.,1-370-528-4758,62714"));
    createSequenceFileInHdfs(inputData1, "/input1", "testFile1.seq");
   
    HashMap<Student, Text> inputData2 = new HashMap<Student, Text>();
    inputData2.put(student2, new Text("Austin Farley,4794 Donec Ave,1-230-823-8164,13508"));
    inputData2.put(student3, new Text("Macaulay Jackson,5435 Dui. Avenue,1-770-395-6446,31584"));
    inputData2.put(student4, new Text("Timon Leonard,716 Ac Ave,1-857-935-3882,62240"));
    createSequenceFileInHdfs(inputData2, "/input2", "testFile2.seq");
 

    String[] args = new String[] {
        "-newPath""/input1",
        "-oldPath""/input2",
        "-mergeBy""key",
        "-outputPath", "output",
        "-inputFormat", "org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat",
        "-inputKeyClassName", "co.nubetech.hiho.testdata.Student",
        "-inputValueClassName", "org.apache.hadoop.io.Text" };
    MergeJob job = runMergeJobs(args);
    assertEquals(3, job.getTotalRecordsNew());
    assertEquals(3, job.getTotalRecordsOld());
    assertEquals(0, job.getBadRecords());
    assertEquals(4, job.getOutput());

    FileSystem outputFS = getFileSystem();
    Path outputPath = new Path(outputFS.getHomeDirectory(), "output/part-r-00000");
    Configuration conf = new Configuration();
    SequenceFile.Reader reader = new SequenceFile.Reader(outputFS, outputPath, conf);
    Writable writableKey = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
    Writable writableValue = (Writable) ReflectionUtils.newInstance(reader.getValueClass(), conf);
    List<Student> expectedOutput = new ArrayList<Student>();
    expectedOutput.add(student1);
    expectedOutput.add(student2);
    expectedOutput.add(student3);
    expectedOutput.add(student4);
    int count = 0;
    while (reader.next(writableKey, writableValue)) {
      logger.debug("Key and value: " + writableKey + ", " + writableValue);
      assertTrue("Unexpected key " + writableKey, expectedOutput.contains(writableKey));
      count++;
    }
    IOUtils.closeStream(reader);
    assertEquals(4, count);
  }

 
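  // Merge by key with IntWritable keys: new holds {1, 2, 3} and old holds
  // {1, 2, 4}, so the union by key yields 4 output records.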
  @Test
  public void testMergeByIntWritableKeyWithSequenceFileInputFormat() throws Exception {
    HashMap<IntWritable, Text> inputData1 = new HashMap<IntWritable, Text>();
    inputData1.put(new IntWritable(1), new Text("Macon Kent,6269 Aenean St.,1-247-399-1051,08253"));
    inputData1.put(new IntWritable(2), new Text("Dale Zamora,521-7792 Mauris Rd.,1-214-625-6970,90510"));
    inputData1.put(new IntWritable(3), new Text("Charles Wood,525-9709 In Rd.,1-370-528-4758,62714"));
    createSequenceFileInHdfs(inputData1, "/input1", "testFile1.seq");
   
    HashMap<IntWritable, Text> inputData2 = new HashMap<IntWritable, Text>();
    inputData2.put(new IntWritable(1), new Text("Macaulay Jackson,5435 Dui. Avenue,1-770-395-6446,31584"));
    inputData2.put(new IntWritable(2), new Text("Timon Leonard,716 Ac Ave,1-857-935-3882,62240"));
    inputData2.put(new IntWritable(4), new Text("Charles Wood,525-9709 In Rd.,1-370-528-4758,62714"));
    createSequenceFileInHdfs(inputData2, "/input2", "testFile2.seq");
 

    String[] args = new String[] {
        "-newPath""/input1",
        "-oldPath""/input2",
        "-mergeBy""key",
        "-outputPath", "output",
        "-inputFormat", "org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat",
        "-inputKeyClassName", "org.apache.hadoop.io.IntWritable",
        "-inputValueClassName", "org.apache.hadoop.io.Text" };
    MergeJob job = runMergeJobs(args);
    assertEquals(3, job.getTotalRecordsNew());
    assertEquals(3, job.getTotalRecordsOld());
    assertEquals(0, job.getBadRecords());
    assertEquals(4, job.getOutput());

    FileSystem outputFS = getFileSystem();
    Path outputPath = new Path(outputFS.getHomeDirectory(), "output/part-r-00000");
    Configuration conf = new Configuration();
    SequenceFile.Reader reader = new SequenceFile.Reader(outputFS, outputPath, conf);
    Writable writableKey = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
    Writable writableValue = (Writable) ReflectionUtils.newInstance(reader.getValueClass(), conf);
    List<IntWritable> expectedOutput = new ArrayList<IntWritable>();
    expectedOutput.add(new IntWritable(1));
    expectedOutput.add(new IntWritable(2));
    expectedOutput.add(new IntWritable(3));
    expectedOutput.add(new IntWritable(4));
    int count = 0;
    while (reader.next(writableKey, writableValue)) {
      logger.debug("Key and value: " + writableKey + ", " + writableValue);
      assertTrue("Unexpected key " + writableKey, expectedOutput.contains(writableKey));
      count++;
    }
    IOUtils.closeStream(reader);
    assertEquals(4, count);
  }
 
  @Test
  public void testMergeByLongWritableKeyWithSequenceFileInputFormat() throws Exception {
    HashMap<LongWritable, Text> inputData1 = new HashMap<LongWritable, Text>();
    inputData1.put(new LongWritable(1), new Text("Macon Kent,6269 Aenean St.,1-247-399-1051,08253"));
    inputData1.put(new LongWritable(2), new Text("Dale Zamora,521-7792 Mauris Rd.,1-214-625-6970,90510"));
    inputData1.put(new LongWritable(3), new Text("Charles Wood,525-9709 In Rd.,1-370-528-4758,62714"));
    createSequenceFileInHdfs(inputData1, "/input1", "testFile1.seq");
   
    HashMap<LongWritable, Text> inputData2 = new HashMap<LongWritable, Text>();
    inputData2.put(new LongWritable(1), new Text("Macaulay Jackson,5435 Dui. Avenue,1-770-395-6446,31584"));
    inputData2.put(new LongWritable(2), new Text("Timon Leonard,716 Ac Ave,1-857-935-3882,62240"));
    inputData2.put(new LongWritable(4), new Text("Charles Wood,525-9709 In Rd.,1-370-528-4758,62714"));
    createSequenceFileInHdfs(inputData2, "/input2", "testFile2.seq");
 

    String[] args = new String[] {
        "-newPath""/input1",
        "-oldPath""/input2",
        "-mergeBy""key",
        "-outputPath","output",
        "-inputFormat""org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat",
        "-inputKeyClassName", "org.apache.hadoop.io.LongWritable",
        "-inputValueClassName", "org.apache.hadoop.io.Text" };
    MergeJob job = runMergeJobs(args);
    assertEquals(3, job.getTotalRecordsNew());
    assertEquals(3, job.getTotalRecordsOld());
    assertEquals(0, job.getBadRecords());
    assertEquals(4, job.getOutput());

    FileSystem outputFS = getFileSystem();
    Path outputPath = new Path(outputFS.getHomeDirectory(), "output/part-r-00000");
    Configuration conf = new Configuration();
    SequenceFile.Reader reader = new SequenceFile.Reader(outputFS, outputPath, conf);
    Writable writableKey = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
    Writable writableValue = (Writable) ReflectionUtils.newInstance(reader.getValueClass(), conf);
    List<LongWritable> expectedOutput = new ArrayList<LongWritable>();
    expectedOutput.add(new LongWritable(1));
    expectedOutput.add(new LongWritable(2));
    expectedOutput.add(new LongWritable(3));
    expectedOutput.add(new LongWritable(4));
    int count = 0;
    while (reader.next(writableKey, writableValue)) {
      logger.debug("Key and value: " + writableKey + ", " + writableValue);
      assertTrue("Unexpected key " + writableKey, expectedOutput.contains(writableKey));
      count++;
    }
    IOUtils.closeStream(reader);
    assertEquals(4, count);
  }

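  // Merge by value: the two inputs share one identical value (the
  // "Charles Wood" record), so 3 + 3 input records collapse to 5 distinct values.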
  @Test
  public void testMergeByValueWithSequenceFileInputFormat() throws Exception {
    HashMap<IntWritable, Text> inputData1 = new HashMap<IntWritable, Text>();
    inputData1.put(new IntWritable(1), new Text("Macon Kent,6269 Aenean St.,1-247-399-1051,08253"));
    inputData1.put(new IntWritable(2), new Text("Dale Zamora,521-7792 Mauris Rd.,1-214-625-6970,90510"));
    inputData1.put(new IntWritable(3), new Text("Charles Wood,525-9709 In Rd.,1-370-528-4758,62714"));
    createSequenceFileInHdfs(inputData1, "/input1", "testFile1.seq");
   
    HashMap<IntWritable, Text> inputData2 = new HashMap<IntWritable, Text>();
    inputData2.put(new IntWritable(1), new Text("Macaulay Jackson,5435 Dui. Avenue,1-770-395-6446,31584"));
    inputData2.put(new IntWritable(2), new Text("Timon Leonard,716 Ac Ave,1-857-935-3882,62240"));
    inputData2.put(new IntWritable(4), new Text("Charles Wood,525-9709 In Rd.,1-370-528-4758,62714"));
    createSequenceFileInHdfs(inputData2, "/input2", "testFile2.seq");
   

    String[] args = new String[] {
        "-newPath""/input1",
        "-oldPath""/input2",
        "-mergeBy""value",
        "-outputPath","output",
        "-inputFormat""org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat",
        "-inputKeyClassName", "org.apache.hadoop.io.IntWritable",
        "-inputValueClassName", "org.apache.hadoop.io.Text" };
    MergeJob job = runMergeJobs(args);
    assertEquals(3, job.getTotalRecordsNew());
    assertEquals(3, job.getTotalRecordsOld());
    assertEquals(0, job.getBadRecords());
    assertEquals(5, job.getOutput());

    FileSystem outputFS = getFileSystem();
    Path outputPath = new Path(outputFS.getHomeDirectory(), "output/part-r-00000");
    Configuration conf = new Configuration();
    SequenceFile.Reader reader = new SequenceFile.Reader(outputFS, outputPath, conf);
    Writable writableKey = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
    Writable writableValue = (Writable) ReflectionUtils.newInstance(reader.getValueClass(), conf);
    List<Text> expectedOutput = new ArrayList<Text>();
    expectedOutput.add(new Text("Macon Kent,6269 Aenean St.,1-247-399-1051,08253"));
    expectedOutput.add(new Text("Dale Zamora,521-7792 Mauris Rd.,1-214-625-6970,90510"));
    expectedOutput.add(new Text("Charles Wood,525-9709 In Rd.,1-370-528-4758,62714"));
    expectedOutput.add(new Text("Macaulay Jackson,5435 Dui. Avenue,1-770-395-6446,31584"));
    expectedOutput.add(new Text("Timon Leonard,716 Ac Ave,1-857-935-3882,62240"));
    int count = 0;
    while (reader.next(writableKey, writableValue)) {
      logger.debug("Key and value: " + writableKey + ", " + writableValue);
      assertTrue("Unexpected value " + writableValue, expectedOutput.contains(writableValue));
      count++;
    }
    IOUtils.closeStream(reader);
    assertEquals(5, count);
  }

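  // Merge by key over plain text files read with hiho's DelimitedTextInputFormat.
  // The inputs share the "Dale Zamora" and "Charles Wood" lines, so 4 distinct
  // records remain. (Exactly how the format derives the key from each line,
  // e.g. from the first comma-delimited column, depends on its default
  // configuration and is assumed here rather than asserted.)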
  @Test
  public void testMergeByKeyWithDelimitedTextInputFormat() throws Exception {
   
    final String inputData1 = "Macon Kent,6269 Aenean St.,1-247-399-1051,08253"
        +"\nDale Zamora,521-7792 Mauris Rd.,1-214-625-6970,90510"
        +"\nCharles Wood,525-9709 In Rd.,1-370-528-4758,62714";
    final String inputData2 = "Dale Zamora,521-7792 Mauris Rd.,1-214-625-6970,90510"
          +"\nMacaulay Jackson,5435 Dui. Avenue,1-770-395-6446,31584"
          +"\nCharles Wood,525-9709 In Rd.,1-370-528-4758,62714";
    createTextFileInHDFS(inputData1, "/input1", "testFile1.txt");
    createTextFileInHDFS(inputData2, "/input2", "testFile2.txt");
   
    String[] args = new String[] {
        "-newPath", "/input1",
        "-oldPath""/input2",
        "-mergeBy", "key",
        "-outputPath", "output",
        "-inputFormat""co.nubetech.hiho.dedup.DelimitedTextInputFormat",
        "-inputKeyClassName", "org.apache.hadoop.io.Text",
        "-inputValueClassName", "org.apache.hadoop.io.Text" };
    MergeJob job = runMergeJobs(args);
    assertEquals(3, job.getTotalRecordsNew());
    assertEquals(3, job.getTotalRecordsOld());
    assertEquals(0, job.getBadRecords());
    assertEquals(4, job.getOutput());
   
    FileSystem outputFS = getFileSystem();
    Path outputPath = new Path(outputFS.getHomeDirectory(), "output");
    assertTrue(outputFS.exists(outputPath));
    FileStatus[] status = outputFS.listStatus(outputPath, getOutputPathFilter());
    List<String> expectedOutput = new ArrayList<String>();
    expectedOutput.add("Macon Kent,6269 Aenean St.,1-247-399-1051,08253");
    expectedOutput.add("Dale Zamora,521-7792 Mauris Rd.,1-214-625-6970,90510");
    expectedOutput.add("Charles Wood,525-9709 In Rd.,1-370-528-4758,62714");
    expectedOutput.add("Macaulay Jackson,5435 Dui. Avenue,1-770-395-6446,31584");
    int count = 0;
    for (FileStatus fileStat : status) {
      logger.debug("File status is " + fileStat.getPath());
      FSDataInputStream in = outputFS.open(fileStat.getPath());
      String line = null;
      while ((line = in.readLine()) != null) {
        logger.debug("Output is " + line);
        assertTrue("Unexpected line " + line, expectedOutput.contains(line));
        expectedOutput.remove(line);
        count++;
      }
      in.close();
    }
    assertEquals(4, count);
 
  }

  @Test
  public void testMergeByValueWithDelimitedTextInputFormat() throws Exception {
   
    final String inputData1 = "Macon Kent,6269 Aenean St.,1-247-399-1051,08253" +
        "\nDale Zamora,521-7792 Mauris Rd.,1-214-625-6970,90510" +
        "\nCharles Wood,525-9709 In Rd.,1-370-528-4758,62714";
    final String inputData2 = "Macaulay Jackson,5435 Dui. Avenue,1-770-395-6446,31584" +
        "\nCharles Wood,525-9709 In Rd.,1-370-528-4758,62714" +
        "\nTimon Leonard,716 Ac Ave,1-857-935-3882,62240";
    createTextFileInHDFS(inputData1, "/input1", "testFile1.txt");
    createTextFileInHDFS(inputData2, "/input2", "testFile2.txt");
           
    String[] args = new String[] {
        "-newPath", "/input1",
        "-oldPath""/input2",
        "-mergeBy", "value",
        "-outputPath", "output",
        "-inputFormat""co.nubetech.hiho.dedup.DelimitedTextInputFormat",
        "-inputKeyClassName", "org.apache.hadoop.io.Text",
        "-inputValueClassName", "org.apache.hadoop.io.Text",
        };
    MergeJob job = runMergeJobs(args);
    assertEquals(3, job.getTotalRecordsNew());
    assertEquals(3, job.getTotalRecordsOld());
    assertEquals(0, job.getBadRecords());
    assertEquals(5, job.getOutput());
   
    FileSystem outputFS = getFileSystem();
    Path outputPath = new Path(outputFS.getHomeDirectory(), "output");
    assertTrue(outputFS.exists(outputPath));
    FileStatus[] status = outputFS.listStatus(outputPath, getOutputPathFilter());
    List<String> expectedOutput = new ArrayList<String>();
    expectedOutput.add("Macon Kent,6269 Aenean St.,1-247-399-1051,08253");
    expectedOutput.add("Dale Zamora,521-7792 Mauris Rd.,1-214-625-6970,90510");
    expectedOutput.add("Charles Wood,525-9709 In Rd.,1-370-528-4758,62714");
    expectedOutput.add("Timon Leonard,716 Ac Ave,1-857-935-3882,62240");
    expectedOutput.add("Macaulay Jackson,5435 Dui. Avenue,1-770-395-6446,31584");
    int count = 0;
    for (FileStatus fileStat : status) {
      logger.debug("File status is " + fileStat.getPath());
      FSDataInputStream in = outputFS.open(fileStat.getPath());
      String line = null;
      while ((line = in.readLine()) != null) {
        logger.debug("Output is " + line);
        assertTrue("Unexpected line " + line, expectedOutput.contains(line));
        expectedOutput.remove(line);
        count++;
      }
      in.close();
    }
    assertEquals(5, count);
 
  }
 

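  // With TextInputFormat the key is the line's byte offset, so merging by value
  // is the meaningful choice; NoKeyOnlyValueOutputFormat writes the values only.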
  @Test
  public void testMergeByValueWithTextInputFormat() throws Exception {
   
    final String inputData1 = "Macon Kent,6269 Aenean St.,1-247-399-1051,08253" +
        "\nDale Zamora,521-7792 Mauris Rd.,1-214-625-6970,90510" +
        "\nCharles Wood,525-9709 In Rd.,1-370-528-4758,62714";
    final String inputData2 = "Timon Leonard,716 Ac Ave,1-857-935-3882,62240" +
        "\nMacaulay Jackson,5435 Dui. Avenue,1-770-395-6446,31584" +
        "\nCharles Wood,525-9709 In Rd.,1-370-528-4758,62714";
    createTextFileInHDFS(inputData1, "/input1", "testFile1.txt");
    createTextFileInHDFS(inputData2, "/input2", "testFile2.txt");
   
    String[] args = new String[] {
        "-newPath", "/input1",
        "-oldPath""/input2",
        "-mergeBy", "value",
        "-outputPath", "output",
        "-inputFormat""org.apache.hadoop.mapreduce.lib.input.TextInputFormat",
        "-outputFormat", "co.nubetech.hiho.mapreduce.lib.output.NoKeyOnlyValueOutputFormat"
        };
    MergeJob job = runMergeJobs(args);
    assertEquals(3, job.getTotalRecordsNew());
    assertEquals(3, job.getTotalRecordsOld());
    assertEquals(0, job.getBadRecords());
    assertEquals(5, job.getOutput());
   
    FileSystem outputFS = getFileSystem();
    Path outputPath = new Path(outputFS.getHomeDirectory(), "output");
    assertTrue(outputFS.exists(outputPath));
    FileStatus[] status = outputFS.listStatus(outputPath, getOutputPathFilter());
    List<String> expectedOutput = new ArrayList<String>();
    expectedOutput.add("Macon Kent,6269 Aenean St.,1-247-399-1051,08253");
    expectedOutput.add("Dale Zamora,521-7792 Mauris Rd.,1-214-625-6970,90510");
    expectedOutput.add("Charles Wood,525-9709 In Rd.,1-370-528-4758,62714");
    expectedOutput.add("Timon Leonard,716 Ac Ave,1-857-935-3882,62240");
    expectedOutput.add("Macaulay Jackson,5435 Dui. Avenue,1-770-395-6446,31584");
    int count = 0;
    for (FileStatus fileStat : status) {
      logger.debug("File status is " + fileStat.getPath());
      FSDataInputStream in = outputFS.open(fileStat.getPath());
      String line = null;
      while ((line = in.readLine()) != null) {
        logger.debug("Output is " + line);
        assertTrue("Unexpected line " + line, expectedOutput.contains(line));
        expectedOutput.remove(line);
        count++;
      }
      in.close();
    }
    assertEquals(5, count);
  }
 
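  // KeyValueTextInputFormat splits each line at the first tab, so "A\tMacon Kent,..."
  // yields key "A". Keys A and B appear in both inputs and the new side's records
  // win, leaving 4 output records (A, B, C from new plus D from old).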
  @Test
  public void testMergeByKeyWithKeyValueTextInputFormat() throws Exception {
   
    final String inputData1 = "A\tMacon Kent,6269 Aenean St.,1-247-399-1051,08253" +
        "\nB\tDale Zamora,521-7792 Mauris Rd.,1-214-625-6970,90510" +
        "\nC\tCharles Wood,525-9709 In Rd.,1-370-528-4758,62714";
    final String inputData2 = "A\tTimon Leonard,716 Ac Ave,1-857-935-3882,62240" +
        "\nD\tMacaulay Jackson,5435 Dui. Avenue,1-770-395-6446,31584" +
        "\nB\tCharles Wood,525-9709 In Rd.,1-370-528-4758,62714";
    createTextFileInHDFS(inputData1, "/input1", "testFile1.txt");
    createTextFileInHDFS(inputData2, "/input2", "testFile2.txt");
   
    String[] args = new String[] {
        "-newPath", "/input1",
        "-oldPath""/input2",
        "-mergeBy", "key",
        "-outputPath", "output",
        "-inputFormat""org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat",
        "-inputKeyClassName", "org.apache.hadoop.io.Text",
        "-inputValueClassName", "org.apache.hadoop.io.Text",
        "-outputFormat", "co.nubetech.hiho.mapreduce.lib.output.NoKeyOnlyValueOutputFormat"
        };
    MergeJob job = runMergeJobs(args);
    assertEquals(3, job.getTotalRecordsNew());
    assertEquals(3, job.getTotalRecordsOld());
    assertEquals(0, job.getBadRecords());
    assertEquals(4, job.getOutput());
   
    FileSystem outputFS = getFileSystem();
    Path outputPath = new Path(outputFS.getHomeDirectory(), "output");
    assertTrue(outputFS.exists(outputPath));
    FileStatus[] status = outputFS.listStatus(outputPath, getOutputPathFilter());
    List<String> expectedOutput = new ArrayList<String>();
    expectedOutput.add("Macon Kent,6269 Aenean St.,1-247-399-1051,08253");
    expectedOutput.add("Dale Zamora,521-7792 Mauris Rd.,1-214-625-6970,90510");
    expectedOutput.add("Charles Wood,525-9709 In Rd.,1-370-528-4758,62714");
    expectedOutput.add("Macaulay Jackson,5435 Dui. Avenue,1-770-395-6446,31584");
    int count = 0;
    for (FileStatus fileStat : status) {
      logger.debug("File status is " + fileStat.getPath());
      FSDataInputStream in = outputFS.open(fileStat.getPath());
      String line = null;
      while ((line = in.readLine()) != null) {
        logger.debug("Output is " + line);
        assertTrue("Unexpected line " + line, expectedOutput.contains(line));
        expectedOutput.remove(line);
        count++;
      }
      in.close();
    }
    assertEquals(4, count);
  }
 
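  // SequenceFileAsTextInputFormat presents the sequence-file keys and values as
  // Text. Merging by value, the shared "Charles Wood" record collapses, leaving 5.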
  @Test
  public void testMergeByValueWithSequenceFileAsTextInputFormat() throws Exception {
    HashMap<IntWritable, Text> inputData1 = new HashMap<IntWritable, Text>();
    inputData1.put(new IntWritable(1), new Text("Macon Kent,6269 Aenean St.,1-247-399-1051,08253"));
    inputData1.put(new IntWritable(2), new Text("Dale Zamora,521-7792 Mauris Rd.,1-214-625-6970,90510"));
    inputData1.put(new IntWritable(3), new Text("Charles Wood,525-9709 In Rd.,1-370-528-4758,62714"));
    createSequenceFileInHdfs(inputData1, "/input1", "testFile1.seq");
   
    HashMap<IntWritable, Text> inputData2 = new HashMap<IntWritable, Text>();
    inputData2.put(new IntWritable(1), new Text("Timon Leonard,716 Ac Ave,1-857-935-3882,62240"));
    inputData2.put(new IntWritable(2), new Text("Macaulay Jackson,5435 Dui. Avenue,1-770-395-6446,31584"));
    inputData2.put(new IntWritable(4), new Text("Charles Wood,525-9709 In Rd.,1-370-528-4758,62714"));
    createSequenceFileInHdfs(inputData2, "/input2", "testFile2.seq");

    String[] args = new String[] {
        "-newPath", "/input1",
        "-oldPath""/input2",
        "-mergeBy", "value",
        "-outputPath", "output",
        "-inputFormat""org.apache.hadoop.mapreduce.lib.input.SequenceFileAsTextInputFormat",
        "-inputKeyClassName", "org.apache.hadoop.io.Text",
        "-inputValueClassName", "org.apache.hadoop.io.Text",
        "-outputFormat", "co.nubetech.hiho.mapreduce.lib.output.NoKeyOnlyValueOutputFormat"
        };
    MergeJob job = runMergeJobs(args);
    assertEquals(3, job.getTotalRecordsNew());
    assertEquals(3, job.getTotalRecordsOld());
    assertEquals(0, job.getBadRecords());
    assertEquals(5, job.getOutput());
   
    FileSystem outputFS = getFileSystem();
    Path outputPath = new Path(outputFS.getHomeDirectory(), "output");
    assertTrue(outputFS.exists(outputPath));
    FileStatus[] status = outputFS.listStatus(outputPath, getOutputPathFilter());
    List<String> expectedOutput = new ArrayList<String>();
    expectedOutput.add("Macon Kent,6269 Aenean St.,1-247-399-1051,08253");
    expectedOutput.add("Dale Zamora,521-7792 Mauris Rd.,1-214-625-6970,90510");
    expectedOutput.add("Charles Wood,525-9709 In Rd.,1-370-528-4758,62714");
    expectedOutput.add("Timon Leonard,716 Ac Ave,1-857-935-3882,62240");
    expectedOutput.add("Macaulay Jackson,5435 Dui. Avenue,1-770-395-6446,31584");
    int count = 0;
    for (FileStatus fileStat : status) {
      logger.debug("File status is " + fileStat.getPath());
      FSDataInputStream in = outputFS.open(fileStat.getPath());
      String line = null;
      while ((line = in.readLine()) != null) {
        logger.debug("Output is " + line);
        assertTrue("Unexpected line " + line, expectedOutput.contains(line));
        expectedOutput.remove(line);
        count++;
      }
      in.close();
    }
    assertEquals(5, count);
  }
 
 
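  // Runs MergeJob through ToolRunner with the given arguments and asserts a zero
  // exit status. A sketch of the equivalent command-line invocation (hypothetical;
  // it assumes MergeJob is run as the jar's entry point once the hiho jar is built):
  //
  //   hadoop jar hiho.jar co.nubetech.hiho.merge.MergeJob \
  //       -newPath /input1 -oldPath /input2 -mergeBy key -outputPath output \
  //       -inputFormat org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat \
  //       -inputKeyClassName org.apache.hadoop.io.IntWritable \
  //       -inputValueClassName org.apache.hadoop.io.Text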
  public MergeJob runMergeJobs(String[] args) throws Exception {
    MergeJob job = new MergeJob();
    int res = ToolRunner.run(createJobConf(), job, args);
    assertEquals(0, res);
    return job;
  }
}