// Source code of org.apache.hadoop.hbase.index.mapreduce.IndexHFileOutputFormat$WriterLength

/**
 * Copyright 2011 The Apache Software Foundation
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.index.mapreduce;


import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.util.Map;
import java.util.TreeMap;


import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.index.Constants;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.io.encoding.DataBlockEncoding;
import org.apache.hadoop.hbase.io.hfile.Compression;
import org.apache.hadoop.hbase.io.hfile.HFile;
import org.apache.hadoop.hbase.io.hfile.HFileDataBlockEncoder;
import org.apache.hadoop.hbase.io.hfile.HFileDataBlockEncoderImpl;
import org.apache.hadoop.hbase.io.hfile.NoOpDataBlockEncoder;
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat;
import org.apache.hadoop.hbase.regionserver.Store;
import org.apache.hadoop.hbase.regionserver.StoreFile;
import org.apache.hadoop.hbase.regionserver.TimeRangeTracker;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;


public class IndexHFileOutputFormat extends FileOutputFormat<ImmutableBytesWritable, KeyValue> {


  static Log LOG = LogFactory.getLog(IndexHFileOutputFormat.class);
  static final String COMPRESSION_CONF_KEY = "hbase.hfileoutputformat.families.compression";
  private static final String DATABLOCK_ENCODING_CONF_KEY =
      "hbase.mapreduce.hfileoutputformat.datablock.encoding";
  TimeRangeTracker trt = new TimeRangeTracker();


  public static void configureIncrementalLoad(Job job, HTable table) throws IOException {
    HFileOutputFormat.configureIncrementalLoad(job, table);
    // Override OutputFormatClass
    job.setOutputFormatClass(IndexHFileOutputFormat.class);
  }


  public RecordWriter<ImmutableBytesWritable, KeyValue> getRecordWriter(
      final TaskAttemptContext context) throws IOException, InterruptedException {


    // Get the path of the temporary output file
    final Path outputPath = FileOutputFormat.getOutputPath(context);
    final Path outputdir = new FileOutputCommitter(outputPath, context).getWorkPath();


    final Configuration conf = context.getConfiguration();
    final FileSystem fs = outputdir.getFileSystem(conf);


    // These configs. are from hbase-*.xml
    final long maxsize =
        conf.getLong(HConstants.HREGION_MAX_FILESIZE, HConstants.DEFAULT_MAX_FILE_SIZE);
    final int blocksize =
        conf.getInt("hbase.mapreduce.hfileoutputformat.blocksize", HFile.DEFAULT_BLOCKSIZE);
    // Invented config. Add to hbase-*.xml if other than default compression.
    final String defaultCompression =
        conf.get("hfile.compression", Compression.Algorithm.NONE.getName());
    final boolean compactionExclude =
        conf.getBoolean("hbase.mapreduce.hfileoutputformat.compaction.exclude", false);


    final boolean indexedTable = conf.getBoolean(IndexMapReduceUtil.INDEX_IS_INDEXED_TABLE, false);


    final Path indexDir = new Path(outputdir, IndexMapReduceUtil.INDEX_DATA_DIR);


    final FileSystem indexFs = indexDir.getFileSystem(conf);


    if (indexedTable) {
      if (!indexFs.exists(indexDir)) {
        indexFs.mkdirs(indexDir);
      }
    }


    // create a map from column family to the compression algorithm
    final Map<byte[], String> compressionMap = createFamilyCompressionMap(conf);


    String dataBlockEncodingStr = conf.get(DATABLOCK_ENCODING_CONF_KEY);
    final HFileDataBlockEncoder encoder;
    if (dataBlockEncodingStr == null) {
      encoder = NoOpDataBlockEncoder.INSTANCE;
    } else {
      try {
        encoder = new HFileDataBlockEncoderImpl(DataBlockEncoding.valueOf(dataBlockEncodingStr));
      } catch (IllegalArgumentException ex) {
        throw new RuntimeException("Invalid data block encoding type configured for the param "
            + DATABLOCK_ENCODING_CONF_KEY + " : " + dataBlockEncodingStr);
      }
    }


    return new RecordWriter<ImmutableBytesWritable, KeyValue>() {
      // Map of families to writers and how much has been output on the writer.
      private final Map<byte[], WriterLength> writers = new TreeMap<byte[], WriterLength>(
          Bytes.BYTES_COMPARATOR);
      private byte[] previousRow = HConstants.EMPTY_BYTE_ARRAY;
      private final byte[] now = Bytes.toBytes(System.currentTimeMillis());
      private boolean rollRequested = false;


      public void write(ImmutableBytesWritable row, KeyValue kv) throws IOException {
        // null input == user explicitly wants to flush
        if (row == null && kv == null) {
          rollWriters();
          return;
        }
        boolean indexed = false;


        byte[] rowKey = kv.getRow();
        long length = kv.getLength();
        byte[] family = kv.getFamily();
        byte[] qualifier = kv.getQualifier();


        if (Bytes.equals(family, Constants.IDX_COL_FAMILY)
            && Bytes.equals(qualifier, Constants.IDX_COL_QUAL)) {
          indexed = true;
        }


        WriterLength wl = null;
        if (indexed) {
          wl = this.writers.get(Bytes.toBytes(IndexMapReduceUtil.INDEX_DATA_DIR));
        } else {
          wl = this.writers.get(family);
        }


        // If this is a new column family, verify that the directory exists
        if (wl == null) {
          if (indexed) {
            indexFs.mkdirs(new Path(indexDir, Bytes.toString(family)));
          } else {
            fs.mkdirs(new Path(outputdir, Bytes.toString(family)));
          }
        }


        // If any of the HFiles for the column families has reached
        // maxsize, we need to roll all the writers
        if (wl != null && wl.written + length >= maxsize) {
          this.rollRequested = true;
        }


        // This can only happen once a row is finished though
        if (rollRequested && Bytes.compareTo(this.previousRow, rowKey) != 0) {
          rollWriters();
        }


        // create a new HLog writer, if necessary
        if (wl == null || wl.writer == null) {
          wl = getNewWriter(family, conf, indexed);
        }


        // we now have the proper HLog writer. full steam ahead
        kv.updateLatestStamp(this.now);
        trt.includeTimestamp(kv);
        wl.writer.append(kv);
        wl.written += length;


        // Copy the row so we know when a row transition.
        this.previousRow = rowKey;
      }


      private void rollWriters() throws IOException {
        for (WriterLength wl : this.writers.values()) {
          if (wl.writer != null) {
            LOG.info("Writer=" + wl.writer.getPath()
                + ((wl.written == 0) ? "" : ", wrote=" + wl.written));
            close(wl.writer);
          }
          wl.writer = null;
          wl.written = 0;
        }
        this.rollRequested = false;
      }


      /*
       * Create a new HFile.Writer.
       * @param family
       * @return A WriterLength, containing a new HFile.Writer.
       * @throws IOException
       */
      private WriterLength getNewWriter(byte[] family, Configuration conf, boolean indexData)
          throws IOException {
        WriterLength wl = new WriterLength();


        Path familydir = null;


        String compression = compressionMap.get(family);
        compression = compression == null ? defaultCompression : compression;


        if (indexData) {
          familydir = new Path(indexDir, Bytes.toString(family));
          wl.writer =
              HFile.getWriterFactoryNoCache(conf)
                  .withPath(indexFs, StoreFile.getUniqueFile(indexFs, familydir))
                  .withBlockSize(blocksize).withCompression(compression)
                  .withComparator(KeyValue.KEY_COMPARATOR).withDataBlockEncoder(encoder)
                  .withChecksumType(Store.getChecksumType(conf))
                  .withBytesPerChecksum(Store.getBytesPerChecksum(conf)).create();
          this.writers.put(Bytes.toBytes(IndexMapReduceUtil.INDEX_DATA_DIR), wl);
        } else {
          familydir = new Path(outputdir, Bytes.toString(family));
          wl.writer =
              HFile.getWriterFactoryNoCache(conf)
                  .withPath(fs, StoreFile.getUniqueFile(fs, familydir)).withBlockSize(blocksize)
                  .withCompression(compression).withComparator(KeyValue.KEY_COMPARATOR)
                  .withDataBlockEncoder(encoder).withChecksumType(Store.getChecksumType(conf))
                  .withBytesPerChecksum(Store.getBytesPerChecksum(conf)).create();
          this.writers.put(family, wl);
        }


        return wl;
      }


      private void close(final HFile.Writer w) throws IOException {
        if (w != null) {
          w.appendFileInfo(StoreFile.BULKLOAD_TIME_KEY, Bytes.toBytes(System.currentTimeMillis()));
          w.appendFileInfo(StoreFile.BULKLOAD_TASK_KEY,
            Bytes.toBytes(context.getTaskAttemptID().toString()));
          w.appendFileInfo(StoreFile.MAJOR_COMPACTION_KEY, Bytes.toBytes(true));
          w.appendFileInfo(StoreFile.EXCLUDE_FROM_MINOR_COMPACTION_KEY,
            Bytes.toBytes(compactionExclude));
          w.appendFileInfo(StoreFile.TIMERANGE_KEY, WritableUtils.toByteArray(trt));
          w.close();
        }
      }


      public void close(TaskAttemptContext c) throws IOException, InterruptedException {
        for (WriterLength wl : this.writers.values()) {
          close(wl.writer);
        }
      }
    };


  }


  /**
   * Run inside the task to deserialize column family to compression algorithm map from the
   * configuration. Package-private for unit tests only.
   * @return a map from column family to the name of the configured compression algorithm
   */
  static Map<byte[], String> createFamilyCompressionMap(Configuration conf) {
    Map<byte[], String> compressionMap = new TreeMap<byte[], String>(Bytes.BYTES_COMPARATOR);
    String compressionConf = conf.get(COMPRESSION_CONF_KEY, "");
    for (String familyConf : compressionConf.split("&")) {
      String[] familySplit = familyConf.split("=");
      if (familySplit.length != 2) {
        continue;
      }


      try {
        compressionMap.put(Bytes.toBytes(URLDecoder.decode(familySplit[0], "UTF-8")),
          URLDecoder.decode(familySplit[1], "UTF-8"));
      } catch (UnsupportedEncodingException e) {
        // will not happen with UTF-8 encoding
        throw new AssertionError(e);
      }
    }
    return compressionMap;
  }


  static class WriterLength {
    long written = 0;
    HFile.Writer writer = null;
  }
}
// Source code of org.apache.hadoop.hbase.index.mapreduce.IndexHFileOutputFormat$WriterLength

// Related classes of org.apache.hadoop.hbase.index.mapreduce.IndexHFileOutputFormat$WriterLength