Package com.datasalt.pangool.tuplemr.mapred.lib.output

Source Code of com.datasalt.pangool.tuplemr.mapred.lib.output.TupleOutputFormat

/**
* Copyright [2012] [Datasalt Systems S.L.]
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*   http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.datasalt.pangool.tuplemr.mapred.lib.output;

import java.io.IOException;
import java.io.Serializable;

import org.apache.avro.file.CodecFactory;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericData.Record;
import org.apache.avro.mapred.AvroOutputFormat;
import org.apache.avro.reflect.ReflectDatumWriter;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import com.datasalt.pangool.io.ITuple;
import com.datasalt.pangool.io.Schema;
import com.datasalt.pangool.serialization.HadoopSerialization;
import com.datasalt.pangool.utils.AvroUtils;

/**
* An Avro-based output format for {@link ITuple}s
*
*/
@SuppressWarnings("serial")
public class TupleOutputFormat extends FileOutputFormat<ITuple, NullWritable> implements
    Serializable {

  public final static String FILE_PREFIX = "tuple";

  public static final String DEFLATE_CODEC = "deflate";
  public static final String SNAPPY_CODEC = "snappy";

  private static final int SYNC_SIZE = 16;
  private static final int DEFAULT_SYNC_INTERVAL = 1000 * SYNC_SIZE;

  String pangoolOutputSchema;

  public TupleOutputFormat(String pangoolOutputSchema) {
    this.pangoolOutputSchema = pangoolOutputSchema;
  }

  public static class TupleRecordWriter extends RecordWriter<ITuple, NullWritable> {

    Record record;
    DataFileWriter<Record> writer;
    Schema pangoolSchema;
    org.apache.avro.Schema avroSchema;
    private final HadoopSerialization ser;
    private final DataOutputBuffer tmpOutputBuffer = new DataOutputBuffer();

    public TupleRecordWriter(org.apache.avro.Schema schema, Schema pangoolSchema,
        DataFileWriter<Record> writer, HadoopSerialization ser) {
      record = new Record(schema);
      this.ser = ser;
      this.avroSchema = schema;
      this.writer = writer;
      this.pangoolSchema = pangoolSchema;
    }

    @Override
    public void close(TaskAttemptContext arg0) throws IOException, InterruptedException {
      writer.close();
    }

    @Override
    public void write(ITuple tuple, NullWritable ignore) throws IOException,
        InterruptedException {
      AvroUtils.toRecord(tuple, record, tmpOutputBuffer, ser);
      writer.append(record);
    }
  }

  @Override
  public RecordWriter<ITuple, NullWritable> getRecordWriter(TaskAttemptContext context)
      throws IOException, InterruptedException {

    Schema pangoolOutputSchema = Schema.parse(this.pangoolOutputSchema);
    org.apache.avro.Schema avroSchema = AvroUtils.toAvroSchema(pangoolOutputSchema);
    DataFileWriter<Record> writer = new DataFileWriter<Record>(
        new ReflectDatumWriter<Record>());

    // Compression etc - use Avro codecs

    Configuration conf = context.getConfiguration();
    if(conf.getBoolean("mapred.output.compress", false)) {
      String codec = conf.get("mapred.output.compression");
      int level = conf.getInt(AvroOutputFormat.DEFLATE_LEVEL_KEY,
          AvroOutputFormat.DEFAULT_DEFLATE_LEVEL);
      CodecFactory factory = codec.equals(DEFLATE_CODEC) ? CodecFactory
          .deflateCodec(level) : CodecFactory.fromString(codec);
      writer.setCodec(factory);
    }
    writer.setSyncInterval(conf.getInt(AvroOutputFormat.SYNC_INTERVAL_KEY,
        DEFAULT_SYNC_INTERVAL));

    Path file = getDefaultWorkFile(context, "");
    writer
        .create(avroSchema, file.getFileSystem(context.getConfiguration()).create(file));

    return new TupleRecordWriter(avroSchema, pangoolOutputSchema, writer,
        new HadoopSerialization(conf));
  }
}
TOP

Related Classes of com.datasalt.pangool.tuplemr.mapred.lib.output.TupleOutputFormat

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.