/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.facebook.giraph.hive.output;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.HiveMetaStoreClient;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.MetaException;
import org.apache.hadoop.hive.metastore.api.StorageDescriptor;
import org.apache.hadoop.hive.metastore.api.Table;
import org.apache.hadoop.hive.serde2.Serializer;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.log4j.Logger;
import org.apache.thrift.TException;

import com.facebook.giraph.hive.HiveRecord;
import com.facebook.giraph.hive.HiveTableSchema;
import com.facebook.giraph.hive.HiveTableSchemas;
import com.facebook.giraph.hive.impl.HiveApiTableSchema;
import com.facebook.giraph.hive.impl.common.FileSystems;
import com.facebook.giraph.hive.impl.common.HadoopUtils;
import com.facebook.giraph.hive.impl.common.HiveUtils;
import com.facebook.giraph.hive.impl.common.Inspectors;
import com.facebook.giraph.hive.impl.common.ProgressReporter;
import com.facebook.giraph.hive.impl.output.HiveApiOutputCommitter;
import com.facebook.giraph.hive.impl.output.HiveApiRecordWriter;
import com.facebook.giraph.hive.impl.output.OutputConf;
import com.facebook.giraph.hive.impl.output.OutputInfo;
import com.facebook.giraph.hive.input.HiveApiInputFormat;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;

import java.io.IOException;
import java.util.Collection;
import java.util.List;
import java.util.Map;

/**
 * Hadoop-compatible OutputFormat for writing to Hive.
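 * <p>
 * A minimal usage sketch (assuming {@code conf} is the job's Configuration;
 * the setters on {@link HiveOutputDescription} shown here are illustrative,
 * not verified against the actual API):
 * <pre>{@code
 *   HiveOutputDescription outputDesc = new HiveOutputDescription();
 *   outputDesc.setDbName("default");     // assumed setter
 *   outputDesc.setTableName("my_table"); // assumed setter
 *   HiveApiOutputFormat.initDefaultProfile(conf, outputDesc);
 *   // then set HiveApiOutputFormat as the job's OutputFormat
 * }</pre>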
*/
public class HiveApiOutputFormat
extends OutputFormat<WritableComparable, HiveRecord> {
  /** Logger */
  public static final Logger LOG = Logger.getLogger(HiveApiOutputFormat.class);

  /** Default profile if none given */
  public static final String DEFAULT_PROFILE_ID = "output-profile";

  /** Which profile to look up */
  private String myProfileId = DEFAULT_PROFILE_ID;

  /**
   * Get the profile ID used by this output format.
   * @return profile ID
   */
  public String getMyProfileId() {
    return myProfileId;
  }

  /**
   * Set the profile ID to use.
   * @param myProfileId profile ID to use
   */
  public void setMyProfileId(String myProfileId) {
    this.myProfileId = myProfileId;
  }

/**
* Get table schema for this profile in the configuration.
* @param conf Configuration to lookup in
* @return HiveTableSchema
*/
public HiveTableSchema getTableSchema(Configuration conf) {
return HiveTableSchemas.getForProfile(conf, myProfileId);
  }

/**
   * Initialize this object's profile with the given Configuration and
   * output description.
* @param conf Configuration to use
* @param outputDesc HiveOutputDescription
* @throws TException Hive Metastore issues
*/
public void init(Configuration conf, HiveOutputDescription outputDesc)
throws TException {
initProfile(conf, outputDesc, myProfileId);
  }

/**
   * Initialize the default profile with the given Configuration and
   * output description.
* @param conf Configuration to use
* @param outputDesc HiveOutputDescription
* @throws TException Hive Metastore issues
*/
public static void initDefaultProfile(Configuration conf,
HiveOutputDescription outputDesc) throws TException {
initProfile(conf, outputDesc, DEFAULT_PROFILE_ID);
  }

/**
   * Initialize the given profile with the given Configuration and output
   * description.
* @param conf Configuration to use
* @param outputDesc HiveOutputDescription
* @param profileId Profile to use
* @throws TException Hive Metastore issues
*/
public static void initProfile(Configuration conf,
HiveOutputDescription outputDesc,
String profileId)
throws TException {
String dbName = outputDesc.getDbName();
String tableName = outputDesc.getTableName();
HiveConf hiveConf = new HiveConf(conf, HiveApiInputFormat.class);
HiveMetaStoreClient client = new HiveMetaStoreClient(hiveConf);
Table table = client.getTable(dbName, tableName);
sanityCheck(table, outputDesc);
OutputInfo oti = new OutputInfo(table);
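    // Partitioned tables write directly into the computed partition path;
    // unpartitioned tables stage output in a "_temp" subdirectory, with the
    // table root as the final output location.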
String partitionPiece;
if (oti.hasPartitionInfo()) {
partitionPiece = HiveUtils.computePartitionPath(oti.getPartitionInfo(),
outputDesc.getPartitionValues());
} else {
partitionPiece = "_temp";
}
String partitionPath = oti.getTableRoot() + Path.SEPARATOR + partitionPiece;
oti.setPartitionPath(partitionPath);
HadoopUtils.setOutputDir(conf, partitionPath);
if (oti.hasPartitionInfo()) {
oti.setFinalOutputPath(oti.getPartitionPath());
} else {
oti.setFinalOutputPath(oti.getTableRoot());
}
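    // Register the table schema both by full name and by profile ID so it
    // can be looked up either way later.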
HiveTableSchema tableSchema = HiveApiTableSchema.fromTable(table);
HiveTableSchemas.putForName(conf, dbName, tableName, tableSchema);
HiveTableSchemas.putForProfile(conf, profileId, tableSchema);
OutputConf outputConf = new OutputConf(conf, profileId);
outputConf.writeOutputDescription(outputDesc);
outputConf.writeOutputTableInfo(oti);
LOG.info("initProfile '" + profileId + "' using " + outputDesc);
  }

  /**
   * Check that the table is not misconfigured for writing: it must not be
   * compressed, bucketed, or sorted, and its number of partition keys must
   * match the number of partition values supplied.
   * @param table Table to check
   * @param outputDesc HiveOutputDescription to use
   */
  private static void sanityCheck(Table table,
                                  HiveOutputDescription outputDesc) {
    StorageDescriptor sd = table.getSd();
    Preconditions.checkArgument(!sd.isCompressed(),
        "Writing to compressed tables is not supported");
    Preconditions.checkArgument(nullOrEmpty(sd.getBucketCols()),
        "Writing to bucketed tables is not supported");
    Preconditions.checkArgument(nullOrEmpty(sd.getSortCols()),
        "Writing to sorted tables is not supported");
    Preconditions.checkArgument(table.getPartitionKeysSize() ==
        outputDesc.numPartitionValues(),
        "Table has %s partition keys but %s partition values were given",
        table.getPartitionKeysSize(), outputDesc.numPartitionValues());
  }

/**
* Check if collection is null or empty
* @param <X> data type
* @param c Collection to check
* @return true if collection is null or empty
*/
private static <X> boolean nullOrEmpty(Collection<X> c) {
return c == null || c.isEmpty();
  }

/**
* Convert partition value map with ordered partition info into list of
* partition values.
* @param partitionValues Map of partition data
* @param fieldSchemas List of partition column definitions
   * @return list of partition values
*/
private List<String> listOfPartitionValues(
Map<String, String> partitionValues, List<FieldSchema> fieldSchemas) {
List<String> values = Lists.newArrayList();
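    // Metastore column names are lowercased here, so the keys of the
    // user-supplied partition value map are expected to be lowercase.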
for (FieldSchema fieldSchema : fieldSchemas) {
String value = partitionValues.get(fieldSchema.getName().toLowerCase());
values.add(value);
}
return values;
  }

@Override
public void checkOutputSpecs(JobContext jobContext)
throws IOException, InterruptedException {
Configuration conf = jobContext.getConfiguration();
OutputConf outputConf = new OutputConf(conf, myProfileId);
HiveOutputDescription description = outputConf.readOutputDescription();
OutputInfo oti = outputConf.readOutputTableInfo();
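    // A partitioned table requires partition values and the target partition
    // must not exist yet; an unpartitioned table must be empty.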
    if (oti.hasPartitionInfo()) {
      if (!description.hasPartitionValues()) {
        throw new IOException("Table " + description.getTableName() +
            " is partitioned but no partition values were given");
      }
      checkPartitionDoesntExist(conf, description, oti);
    } else {
      if (description.hasPartitionValues()) {
        throw new IOException("Table " + description.getTableName() +
            " is not partitioned but partition values were given");
      } else {
        checkTableIsEmpty(conf, description, oti);
      }
    }
  }

/**
   * Check if the given table is empty, that is, it has no files.
* @param conf Configuration to use
* @param description HiveOutputDescription
* @param oti OutputInfo
* @throws IOException Hadoop Filesystem issues
*/
private void checkTableIsEmpty(Configuration conf,
HiveOutputDescription description, OutputInfo oti)
throws IOException {
Path tablePath = new Path(oti.getTableRoot());
FileSystem fs = tablePath.getFileSystem(conf);
if (fs.exists(tablePath)) {
if (FileSystems.dirHasNonHiddenFiles(fs, tablePath)) {
throw new IOException("Table " + description.getTableName() +
" has existing data");
}
}
  }

/**
   * Check that the partition we will write to does not already exist.
* @param conf Configuration to use
* @param description HiveOutputDescription
* @param oti OutputInfo
* @throws IOException Hadoop Filesystem issues
*/
private void checkPartitionDoesntExist(Configuration conf,
HiveOutputDescription description, OutputInfo oti)
throws IOException {
HiveConf hiveConf = new HiveConf(conf, HiveApiInputFormat.class);
HiveMetaStoreClient client;
try {
client = new HiveMetaStoreClient(hiveConf);
} catch (MetaException e) {
throw new IOException(e);
}
String db = description.getDbName();
String table = description.getTableName();
if (oti.hasPartitionInfo()) {
Map<String, String> partitionSpec = description.getPartitionValues();
List<String> partitionValues = listOfPartitionValues(
partitionSpec, oti.getPartitionInfo());
if (partitionExists(client, db, table, partitionValues)) {
throw new IOException("Table " + db + ":" + table + " partition " +
partitionSpec + " already exists");
}
}
  }

/**
   * Query the Hive metastore to check whether a table's partition already
   * exists.
* @param client Hive client
* @param db Hive database name
* @param table Hive table name
* @param partitionValues list of partition values
* @return true if partition exists
*/
private boolean partitionExists(
HiveMetaStoreClient client, String db, String table,
List<String> partitionValues) {
    List<String> partitionNames;
    try {
      partitionNames = client.listPartitionNames(db, table,
          partitionValues, (short) 1);
      // CHECKSTYLE: stop IllegalCatch
    } catch (Exception e) {
      // CHECKSTYLE: resume IllegalCatch
      // Treat a failed metastore query as "partition not found" so that the
      // write can proceed; log it so the failure is not silent.
      LOG.warn("Failed listing partitions of " + db + ":" + table, e);
      return false;
    }
    return !partitionNames.isEmpty();
  }

@Override
  public HiveApiRecordWriter getRecordWriter(
      TaskAttemptContext taskAttemptContext)
    throws IOException, InterruptedException {
HadoopUtils.setWorkOutputDir(taskAttemptContext);
Configuration conf = taskAttemptContext.getConfiguration();
OutputConf outputConf = new OutputConf(conf, myProfileId);
OutputInfo oti = outputConf.readOutputTableInfo();
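    // Tell RCFile-backed tables how many columns each row will carry.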
HiveUtils.setRCileNumColumns(conf, oti.getColumnInfo().size());
HadoopUtils.setOutputKeyWritableClass(conf, NullWritable.class);
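    // Assemble the write path: the table's Hive Serializer turns each record
    // into a Writable, which the table's own OutputFormat then writes.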
Serializer serializer = oti.createSerializer(conf);
HadoopUtils.setOutputValueWritableClass(conf,
serializer.getSerializedClass());
org.apache.hadoop.mapred.OutputFormat baseOutputFormat =
ReflectionUtils.newInstance(oti.getOutputFormatClass(), conf);
// CHECKSTYLE: stop LineLength
org.apache.hadoop.mapred.RecordWriter<WritableComparable, Writable> baseWriter =
getBaseRecordWriter(taskAttemptContext, baseOutputFormat);
// CHECKSTYLE: resume LineLength
StructObjectInspector soi = Inspectors.createFor(oti.getColumnInfo());
return new HiveApiRecordWriter(baseWriter, serializer, soi);
  }

/**
* Get the base Hadoop RecordWriter.
* @param taskAttemptContext TaskAttemptContext
* @param baseOutputFormat Hadoop OutputFormat
* @return RecordWriter
* @throws IOException Hadoop issues
*/
// CHECKSTYLE: stop LineLengthCheck
private org.apache.hadoop.mapred.RecordWriter<WritableComparable, Writable> getBaseRecordWriter(
TaskAttemptContext taskAttemptContext,
org.apache.hadoop.mapred.OutputFormat baseOutputFormat) throws IOException {
// CHECKSTYLE: resume LineLengthCheck
JobConf jobConf = new JobConf(taskAttemptContext.getConfiguration());
String name = FileOutputFormat.getUniqueName(jobConf, "part");
Reporter reporter = new ProgressReporter(taskAttemptContext);
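    // The FileSystem argument in the old mapred API is ignored by
    // FileOutputFormat implementations, so null is passed.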
return baseOutputFormat.getRecordWriter(null, jobConf, name, reporter);
  }

@Override
public OutputCommitter getOutputCommitter(
TaskAttemptContext taskAttemptContext)
throws IOException, InterruptedException {
HadoopUtils.setWorkOutputDir(taskAttemptContext);
Configuration conf = taskAttemptContext.getConfiguration();
JobConf jobConf = new JobConf(conf);
OutputCommitter baseCommitter = jobConf.getOutputCommitter();
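    // Wrap the underlying Hadoop committer so Hive-specific commit work for
    // this profile can run as part of the normal commit protocol.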
return new HiveApiOutputCommitter(baseCommitter, myProfileId);
}
}