Examples of com.m6d.filecrush.crush.Crush

com.m6d.filecrush.crush.Crush

    createFile(subsubdir, "skipped-4", 4, 25);
    createFile(subsubdir, "skipped-5", 5, 25);


    File out = new File(tmp.getRoot(), "out");


    ToolRunner.run(job, new Crush(), new String[] {
        subdir.getAbsolutePath(), out.getAbsolutePath()
    });


    /*
     * Make sure the original files are still there.

View Full Code Here

  public void noFiles() throws Exception {
    File in = tmp.newFolder("in");


    File out = new File(tmp.getRoot(), "out");


    ToolRunner.run(job, new Crush(), new String[] {
        in.getAbsolutePath(), out.getAbsolutePath()
    });


    assertThat(out.exists(), is(false));
  }

View Full Code Here

  public void restoreJavaIoTmpDir() {
    System.setProperty("java.io.tmpdir", javaIoTmpDir);
  }


  private void run(String... args) throws Exception {
    ToolRunner.run(job, new Crush(), args);
  }

View Full Code Here

    expectedBucketFiles.add(format("%s  %s", dir2_4_2.getAbsolutePath() + "-0", new File(dir2_4_2, "file1").getAbsolutePath()));
    expectedBucketFiles.add(format("%s  %s", dir2_4_2.getAbsolutePath() + "-0", new File(dir2_4_2, "file2").getAbsolutePath()));
    expectedBucketFiles.add(format("%s  %s", dir2_4_2.getAbsolutePath() + "-0", new File(dir2_4_2, "file3").getAbsolutePath()));




    Crush crush = new Crush();


    crush.setConf(job);
    crush.setFileSystem(fileSystem);


    /*
     * Call these in the same order that run() does.
     */
    crush.createJobConfAndParseArgs("--compress=none", "--max-file-blocks=1", in.getAbsolutePath(), new File(tmp.getRoot(), "out").getAbsolutePath(), "20101124171730");
    crush.writeDirs();




    /*
     * Verify bucket contents.
     */


    List<String> actualBucketFiles = new ArrayList<String>();


    Text key = new Text();
    Text value = new Text();


    Reader reader = new Reader(FileSystem.get(job), crush.getBucketFiles(), job);


    while(reader.next(key, value)) {
      actualBucketFiles.add(format("%s\t%s", key, value));
    }


    reader.close();


    Collections.sort(expectedBucketFiles);
    Collections.sort(actualBucketFiles);


    assertThat(actualBucketFiles, equalTo(expectedBucketFiles));


    /*
     * Verify the partition map.
     */
    Reader partitionMapReader = new Reader(FileSystem.get(job), crush.getPartitionMap(), job);


    IntWritable partNum = new IntWritable();


    Map<String, Integer> actualPartitions = new HashMap<String, Integer>();


    while (partitionMapReader.next(key, partNum)) {
      actualPartitions.put(key.toString(), partNum.get());
    }


    partitionMapReader.close();


    /*
     * These crush files need to be allocated into 5 partitions:
     *
     * in/2-1            55 bytes
     * in/1/1.1-0        45 bytes
     * in/1/1.1-2        40 bytes
     * in/1/1.1-1        40 bytes
     * in/1/1.2-0        30 bytes
     * in/2/2.2-1        40 bytes
     * in/2/2.4/2.4.2-0  50 bytes
     *
     *   0              1                      2                3                4
     *   in/2-1 55      in/2/2.4/2.4.2-0  50  in/1/1.1-0  45  in/1/1.1-2  40  in/1/1.1-1  40
     *                                                       in/2/2.2-1  40  in/1/1.2-0  39
     */
    Map<String, Integer> expectedPartitions = new HashMap<String, Integer>();


    //TODO: this may not be deterministic due to jvm/hashmap/filesystem
    expectedPartitions.put(dir2.getAbsolutePath() + "-1",      0);
    expectedPartitions.put(dir2_4_2.getAbsolutePath() + "-0",  1);
    expectedPartitions.put(dir1_1.getAbsolutePath() + "-0",    2);
    expectedPartitions.put(dir1_1.getAbsolutePath() + "-2",    4);
    expectedPartitions.put(dir2_2.getAbsolutePath() + "-1",    3);
    expectedPartitions.put(dir1_1.getAbsolutePath() + "-1",    3);
    expectedPartitions.put(dir1_2.getAbsolutePath() + "-0",    4);


    assertThat(actualPartitions, equalTo(expectedPartitions));




    /*
     * Verify counters.
     */
    Counters actualCounters = new Counters();


    DataInputStream countersStream = FileSystem.get(job).open(crush.getCounters());


    actualCounters.readFields(countersStream);


    countersStream.close();

View Full Code Here


  @Test
  public void execute() throws Exception {
    writeFiles(true, true, true);


    Crush crush = new Crush();


    ToolRunner.run(job, crush, new String [] {
      "--threshold=0.015",
      "--max-file-blocks=1",
      "--verbose",


      "--regex=.+/other",
      "--replacement=${crush.timestamp}-${crush.task.num}-middle-${crush.file.num}-tail",
      "--input-format=" + SequenceFileInputFormat.class.getName(),
      "--output-format=" + TextOutputFormat.class.getName(),


      "--regex=.+/dir",
      "--replacement=secondregex-${crush.timestamp}-${crush.task.num}-${crush.file.num}",
      "--input-format=" + TextInputFormat.class.getName(),
      "--output-format=" + SequenceFileOutputFormat.class.getName(),


      "--regex=.+/dir/([^/]+/)*(.+)",
      "--replacement=thirdregex-$2-${crush.timestamp}-${crush.task.num}-${crush.file.num}",
      "--input-format=" + SequenceFileInputFormat.class.getName(),
      "--output-format=" + SequenceFileOutputFormat.class.getName(),


      "--regex=.+/text",
      "--replacement=fourthregex-${crush.task.num}-${crush.timestamp}-${crush.file.num}",
      "--input-format=" + TextInputFormat.class.getName(),
      "--output-format=" + TextOutputFormat.class.getName(),


      /*
       * This is the default regex and replacement, which we add last so we can exercise the default logic.
       */
      "--regex=.+",
      "--replacement=crushed_file-${crush.timestamp}-${crush.task.num}-${crush.file.num}",
      "--input-format=" + SequenceFileInputFormat.class.getName(),
      "--output-format=" + TextOutputFormat.class.getName(),


      "--compress=" + CustomCompressionCodec.class.getName(),


      "in", "out", "20101116153015"
    });




    /*
     * Crushed files.
     */
    verifyOutput(homeDir + "/out/dir",                  "secondregex-20101116153015-*-*",            Format.TEXT,      Format.SEQUENCE,  customCodec, "file10", "file11", "file12", "file13");
    verifyOutput(homeDir + "/out/dir/subdir",            "thirdregex-subdir-20101116153015-*-*",      Format.SEQUENCE,  Format.SEQUENCE,  customCodec, "file20", "file21", "file22", "file23", "file24");
    verifyOutput(homeDir + "/out/dir/subdir/subsubdir",  "thirdregex-subsubdir-20101116153015-*-*",  Format.SEQUENCE,  Format.SEQUENCE,  customCodec, "file30", "file31", "file32", "file33", "file34");
    verifyOutput(homeDir + "/out/dir/subdir/other",      "20101116153015-*-middle-*-tail",            Format.SEQUENCE,  Format.TEXT,       customCodec, "file40", "file41", "file42", "file43");
    verifyOutput(homeDir + "/out/dir/other",            "20101116153015-*-middle-*-tail",            Format.SEQUENCE,  Format.TEXT,      customCodec, "file50", "file51", "file52", "file53", "file54", "file55");
    verifyOutput(homeDir + "/out/text",                  "fourthregex-*-20101116153015-*",            Format.TEXT,      Format.TEXT,      customCodec, "file60", "file61", "file62", "file63");
    verifyOutput(homeDir + "/out",                      "crushed_file-20101116153015-*-*",          Format.SEQUENCE,  Format.TEXT,      customCodec, "file70", "file71", "file72");




    /*
     * Skipped files should have been moved to the output dir.
     */
    verifyOutput(homeDir + "/out/dir/skipped", "file80", Format.SEQUENCE, Format.SEQUENCE, defaultCodec, "file80");
    verifyHugeFile(homeDir + "/out/huge", (long) (((float) 0.015) * 1024 * 1024 * 64) + 1);




    /*
     * Crush input files should remain in the input dir.
     */
    for (String file : new String[] { "file10", "file11", "file12", "file13" }) {
      verifyOutput(homeDir + "/in/dir", file, Format.TEXT, Format.TEXT, null, file);
    }


    for (String file : new String[] { "file20", "file21", "file22", "file23", "file24" }) {
      verifyOutput(homeDir + "/in/dir/subdir", file, Format.SEQUENCE, Format.SEQUENCE, defaultCodec, file);
    }


    for (String file : new String[] { "file30", "file31", "file32", "file33", "file34" }) {
      verifyOutput(homeDir + "/in/dir/subdir/subsubdir", file, Format.SEQUENCE, Format.SEQUENCE, defaultCodec, file);
    }


    for (String file : new String[] { "file40", "file41", "file42", "file43" }) {
      verifyOutput(homeDir + "/in/dir/subdir/other", file, Format.SEQUENCE, Format.SEQUENCE, defaultCodec, file);
    }


    for (String file : new String[] { "file50", "file51", "file52", "file53", "file54", "file55" }) {
      verifyOutput(homeDir + "/in/dir/other", file, Format.SEQUENCE, Format.SEQUENCE, defaultCodec, file);
    }


    for (String file : new String[] { "file60", "file61", "file62", "file63" }) {
      verifyOutput(homeDir + "/in/text", file, Format.TEXT, Format.TEXT, null, file);
    }


    for (String file : new String[] { "file70", "file71", "file72" }) {
      verifyOutput(homeDir + "/in", file, Format.SEQUENCE, Format.SEQUENCE, defaultCodec, file);
    }




    Counters jobCounters = crush.getJobCounters();


    assertThat(jobCounters.getCounter(MapperCounter.DIRS_FOUND),      equalTo( 8L));
    assertThat(jobCounters.getCounter(MapperCounter.DIRS_ELIGIBLE),    equalTo( 7L));
    assertThat(jobCounters.getCounter(MapperCounter.DIRS_SKIPPED),    equalTo( 1L));
    assertThat(jobCounters.getCounter(MapperCounter.FILES_FOUND),      equalTo(33L));

View Full Code Here

  public void executeHugeFilesOnly() throws Exception {
    writeHugeFile("in/huge", (long) (((float) 0.015) * 1024 * 1024 * 64) + 1);
    writeHugeFile("in/foo/huge1", (long) (((float) 0.015) * 1024 * 1024 * 64) + 1);
    writeHugeFile("in/foo/huge2", (long) (((float) 0.015) * 1024 * 1024 * 64) + 1);


    Crush crush = new Crush();


    ToolRunner.run(job, crush, new String [] {
      "--threshold=0.015",
      "--max-file-blocks=1",
      "--verbose",
      "--compress=" + CustomCompressionCodec.class.getName(),


      "in", "out", "20101116153015"
    });




    /*
     * Skipped files should have been moved to the output dir.
     */
    verifyHugeFile(homeDir + "/out/huge", (long) (((float) 0.015) * 1024 * 1024 * 64) + 1);
    verifyHugeFile(homeDir + "/out/foo/huge1", (long) (((float) 0.015) * 1024 * 1024 * 64) + 1);
    verifyHugeFile(homeDir + "/out/foo/huge2", (long) (((float) 0.015) * 1024 * 1024 * 64) + 1);


    Counters jobCounters = crush.getJobCounters();


    assertThat(jobCounters.getCounter(MapperCounter.DIRS_FOUND),      equalTo(2L));
    assertThat(jobCounters.getCounter(MapperCounter.DIRS_ELIGIBLE),    equalTo(0L));
    assertThat(jobCounters.getCounter(MapperCounter.DIRS_SKIPPED),    equalTo(2L));
    assertThat(jobCounters.getCounter(MapperCounter.FILES_FOUND),      equalTo(3L));

View Full Code Here

  @Test
  public void executeNoFiles() throws Exception {
    FileSystem.get(job).mkdirs(new Path("in/foo"));
    FileSystem.get(job).mkdirs(new Path("in/hello/world"));


    Crush crush = new Crush();


    ToolRunner.run(job, crush, new String [] {
      "--threshold=0.015",
      "--max-file-blocks=1",
      "--verbose",
      "--compress=" + CustomCompressionCodec.class.getName(),


      "in", "out", "20101116153015"
    });


    Counters jobCounters = crush.getJobCounters();


    assertThat(jobCounters.getCounter(MapperCounter.DIRS_FOUND),      equalTo(4L));
    assertThat(jobCounters.getCounter(MapperCounter.DIRS_ELIGIBLE),    equalTo(0L));
    assertThat(jobCounters.getCounter(MapperCounter.DIRS_SKIPPED),    equalTo(4L));
    assertThat(jobCounters.getCounter(MapperCounter.FILES_FOUND),      equalTo(0L));

View Full Code Here


  @Test
  public void executeClone() throws Exception {
    writeFiles(true, true, true);


    Crush crush = new Crush();


    ToolRunner.run(job, crush, new String [] {
      "--threshold=0.015",
      "--max-file-blocks=1",
      "--verbose",


      "--regex=.+/other",
      "--replacement=${crush.timestamp}-${crush.task.num}-middle-${crush.file.num}-tail",
      "--input-format=sequence",
      "--output-format=text",


      "--regex=.+/dir",
      "--replacement=secondregex-${crush.timestamp}-${crush.task.num}-${crush.file.num}",
      "--input-format=text",
      "--output-format=" + SequenceFileOutputFormat.class.getName(),


      "--regex=.+/dir/([^/]+/)*(.+)",
      "--replacement=thirdregex-$2-${crush.timestamp}-${crush.task.num}-${crush.file.num}",
      "--input-format=" + SequenceFileInputFormat.class.getName(),
      "--output-format=sequence",


      "--regex=.+/text",
      "--replacement=fourthregex-${crush.task.num}-${crush.timestamp}-${crush.file.num}",
      "--input-format=" + TextInputFormat.class.getName(),
      "--output-format=" + TextOutputFormat.class.getName(),


      /*
       * This is the default regex and replacement, which we add last so we can exercise the default logic.
       */
      "--regex=.+",
      "--replacement=crushed_file-${crush.timestamp}-${crush.task.num}-${crush.file.num}",
      "--input-format=" + SequenceFileInputFormat.class.getName(),
      "--output-format=" + TextOutputFormat.class.getName(),


      "--compress=" + CustomCompressionCodec.class.getName(),


      "--clone",


      "in", "out", "20101116153015"
    });




    /*
     * Crushed files.
     */
    verifyOutput(homeDir + "/in/dir",                    "secondregex-20101116153015-*-*",            Format.TEXT,      Format.SEQUENCE,  customCodec, "file10", "file11", "file12", "file13");
    verifyOutput(homeDir + "/in/dir/subdir",            "thirdregex-subdir-20101116153015-*-*",      Format.SEQUENCE,  Format.SEQUENCE,  customCodec, "file20", "file21", "file22", "file23", "file24");
    verifyOutput(homeDir + "/in/dir/subdir/subsubdir",  "thirdregex-subsubdir-20101116153015-*-*",  Format.SEQUENCE,  Format.SEQUENCE,  customCodec, "file30", "file31", "file32", "file33", "file34");
    verifyOutput(homeDir + "/in/dir/subdir/other",      "20101116153015-*-middle-*-tail",            Format.SEQUENCE,  Format.TEXT,       customCodec, "file40", "file41", "file42", "file43");
    verifyOutput(homeDir + "/in/dir/other",              "20101116153015-*-middle-*-tail",            Format.SEQUENCE,  Format.TEXT,      customCodec, "file50", "file51", "file52", "file53", "file54", "file55");
    verifyOutput(homeDir + "/in/text",                  "fourthregex-*-20101116153015-*",            Format.TEXT,      Format.TEXT,      customCodec, "file60", "file61", "file62", "file63");
    verifyOutput(homeDir + "/in",                        "crushed_file-20101116153015-*-*",          Format.SEQUENCE,  Format.TEXT,      customCodec, "file70", "file71", "file72");




    /*
     * Skipped files should remain in the input dir.
     */
    verifyOutput(homeDir + "/in/dir/skipped", "file80", Format.SEQUENCE, Format.SEQUENCE, defaultCodec, "file80");
    verifyHugeFile(homeDir + "/in/huge", (long) (((float) 0.015) * 1024 * 1024 * 64) + 1);




    /*
     * Crush input files should be moved to the clone dir.
     */
    for (String file : new String[] { "file10", "file11", "file12", "file13" }) {
      verifyOutput(homeDir + "/out" + homeDir + "/in/dir", file, Format.TEXT, Format.TEXT, null, file);
    }


    for (String file : new String[] { "file20", "file21", "file22", "file23", "file24" }) {
      verifyOutput(homeDir + "/out" + homeDir + "/in/dir/subdir", file, Format.SEQUENCE, Format.SEQUENCE, defaultCodec, file);
    }


    for (String file : new String[] { "file30", "file31", "file32", "file33", "file34" }) {
      verifyOutput(homeDir + "/out" + homeDir + "/in/dir/subdir/subsubdir", file, Format.SEQUENCE, Format.SEQUENCE, defaultCodec, file);
    }


    for (String file : new String[] { "file40", "file41", "file42", "file43" }) {
      verifyOutput(homeDir + "/out" + homeDir + "/in/dir/subdir/other", file, Format.SEQUENCE, Format.SEQUENCE, defaultCodec, file);
    }


    for (String file : new String[] { "file50", "file51", "file52", "file53", "file54", "file55" }) {
      verifyOutput(homeDir + "/out" + homeDir + "/in/dir/other", file, Format.SEQUENCE, Format.SEQUENCE, defaultCodec, file);
    }


    for (String file : new String[] { "file60", "file61", "file62", "file63" }) {
      verifyOutput(homeDir + "/out" + homeDir + "/in/text", file, Format.TEXT, Format.TEXT, null, file);
    }


    for (String file : new String[] { "file70", "file71", "file72" }) {
      verifyOutput(homeDir + "/out" + homeDir + "/in", file, Format.SEQUENCE, Format.SEQUENCE, defaultCodec, file);
    }




    Counters jobCounters = crush.getJobCounters();


    assertThat(jobCounters.getCounter(MapperCounter.DIRS_FOUND),      equalTo( 8L));
    assertThat(jobCounters.getCounter(MapperCounter.DIRS_ELIGIBLE),    equalTo( 7L));
    assertThat(jobCounters.getCounter(MapperCounter.DIRS_SKIPPED),    equalTo( 1L));
    assertThat(jobCounters.getCounter(MapperCounter.FILES_FOUND),      equalTo(33L));

View Full Code Here

  public void executeCloneHugeFilesOnly() throws Exception {
    writeHugeFile("in/huge", (long) (((float) 0.015) * 1024 * 1024 * 64) + 1);
    writeHugeFile("in/foo/huge1", (long) (((float) 0.015) * 1024 * 1024 * 64) + 1);
    writeHugeFile("in/foo/huge2", (long) (((float) 0.015) * 1024 * 1024 * 64) + 1);


    Crush crush = new Crush();


    ToolRunner.run(job, crush, new String [] {
      "--threshold=0.015",
      "--max-file-blocks=1",
      "--verbose",
      "--compress=" + CustomCompressionCodec.class.getName(),
      "--clone",


      "in", "out", "20101116153015"
    });




    /*
     * Skipped files should remain in the input dir.
     */
    verifyHugeFile(homeDir + "/in/huge", (long) (((float) 0.015) * 1024 * 1024 * 64) + 1);
    verifyHugeFile(homeDir + "/in/foo/huge1", (long) (((float) 0.015) * 1024 * 1024 * 64) + 1);
    verifyHugeFile(homeDir + "/in/foo/huge2", (long) (((float) 0.015) * 1024 * 1024 * 64) + 1);


    Counters jobCounters = crush.getJobCounters();


    assertThat(jobCounters.getCounter(MapperCounter.DIRS_FOUND),      equalTo(2L));
    assertThat(jobCounters.getCounter(MapperCounter.DIRS_ELIGIBLE),    equalTo(0L));
    assertThat(jobCounters.getCounter(MapperCounter.DIRS_SKIPPED),    equalTo(2L));
    assertThat(jobCounters.getCounter(MapperCounter.FILES_FOUND),      equalTo(3L));

View Full Code Here

  @Test
  public void executeCloneNoFiles() throws Exception {
    FileSystem.get(job).mkdirs(new Path("in/foo"));
    FileSystem.get(job).mkdirs(new Path("in/hello/world"));


    Crush crush = new Crush();


    ToolRunner.run(job, crush, new String [] {
      "--threshold=0.015",
      "--max-file-blocks=1",
      "--verbose",
      "--compress=" + CustomCompressionCodec.class.getName(),
      "--clone",


      "in", "out", "20101116153015"
    });


    Counters jobCounters = crush.getJobCounters();


    assertThat(jobCounters.getCounter(MapperCounter.DIRS_FOUND),      equalTo(4L));
    assertThat(jobCounters.getCounter(MapperCounter.DIRS_ELIGIBLE),    equalTo(0L));
    assertThat(jobCounters.getCounter(MapperCounter.DIRS_SKIPPED),    equalTo(4L));
    assertThat(jobCounters.getCounter(MapperCounter.FILES_FOUND),      equalTo(0L));

View Full Code Here

0 1

TOP

Related Classes of com.m6d.filecrush.crush.Crush

com.m6d.filecrush.crush.CrushOptionParsingTest

com.m6d.filecrush.crush.CrushStandAloneSequenceFileTest

com.m6d.filecrush.crush.CrushStandAloneTextTest

com.m6d.filecrush.crush.CrushTest

com.m6d.filecrush.crush.integration.CrushMapReduceTest

org.apache.commons.cli.CommandLine

org.apache.commons.cli.GnuParser

org.apache.commons.cli.Option

org.apache.commons.cli.Options

org.apache.hadoop.fs.FileStatus

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.