Source Code of org.apache.sqoop.mapreduce.ParquetJob

/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.sqoop.mapreduce;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.conf.Configuration;
import org.kitesdk.data.Dataset;
import org.kitesdk.data.DatasetDescriptor;
import org.kitesdk.data.DatasetNotFoundException;
import org.kitesdk.data.Datasets;
import org.kitesdk.data.Formats;
import org.kitesdk.data.mapreduce.DatasetKeyOutputFormat;
import org.kitesdk.data.spi.SchemaValidationUtil;

import java.io.IOException;

/**
* Helper class for setting up a Parquet MapReduce job.
*/
public final class ParquetJob {

  // Utility class; not meant to be instantiated.
  private ParquetJob() {
  }

  private static final String CONF_AVRO_SCHEMA = "avro.schema";

  /**
   * Recover the Avro schema that
   * {@link #configureImportJob(Configuration, Schema, String, boolean)}
   * stored in the job configuration.
   */
  public static Schema getAvroSchema(Configuration conf) {
    return new Schema.Parser().parse(conf.get(CONF_AVRO_SCHEMA));
  }

  /**
   * Configure the import job. Internally, the import process writes data
   * records through a Kite dataset in Parquet format. The input key class is
   * {@link org.apache.sqoop.lib.SqoopAvroRecord}; the output key class is
   * {@link org.apache.avro.generic.GenericRecord}.
   */
  public static void configureImportJob(Configuration conf, Schema schema,
      String uri, boolean doAppend) throws IOException {
    Dataset<GenericRecord> dataset;
    if (doAppend) {
      // Append mode: reuse the dataset at the URI if it exists,
      // otherwise create it.
      try {
        dataset = Datasets.load(uri);
      } catch (DatasetNotFoundException ex) {
        dataset = createDataset(schema, uri);
      }
      // The new schema must be able to read records already written with
      // the dataset's existing schema; otherwise the append is rejected.
      Schema writtenWith = dataset.getDescriptor().getSchema();
      if (!SchemaValidationUtil.canRead(writtenWith, schema)) {
        throw new IOException(
            String.format("Expected schema: %s%nActual schema: %s",
                writtenWith, schema));
      }
    } else {
      dataset = createDataset(schema, uri);
    }
    // Publish the Avro schema in the job configuration for the mappers,
    // and route the job's output to the Kite dataset.
    conf.set(CONF_AVRO_SCHEMA, schema.toString());
    DatasetKeyOutputFormat.configure(conf).writeTo(dataset);
  }

  private static Dataset<GenericRecord> createDataset(Schema schema,
      String uri) {
    // Describe a new dataset that stores records with the given Avro
    // schema in Parquet format.
    DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
        .schema(schema)
        .format(Formats.PARQUET)
        .build();
    return Datasets.create(uri, descriptor, GenericRecord.class);
  }

}
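
For context, here is a minimal sketch of how a caller might drive this class. The Avro schema literal and the Kite dataset URI below are illustrative assumptions for this example, not values taken from the class above; in Sqoop itself both are derived from the imported table's metadata.

import org.apache.avro.Schema;
import org.apache.hadoop.conf.Configuration;
import org.apache.sqoop.mapreduce.ParquetJob;

public class ParquetJobUsageSketch {

  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();

    // Hypothetical Avro schema for the records being imported.
    Schema schema = new Schema.Parser().parse(
        "{\"type\": \"record\", \"name\": \"Employee\", \"fields\": ["
        + "{\"name\": \"id\", \"type\": \"int\"},"
        + "{\"name\": \"name\", \"type\": \"string\"}]}");

    // Hypothetical Kite dataset URI. With doAppend=false a fresh Parquet
    // dataset is created; with doAppend=true an existing dataset is loaded
    // and the schema compatibility check in configureImportJob() applies.
    ParquetJob.configureImportJob(conf, schema,
        "dataset:hdfs:/tmp/datasets/employees", false);

    // Mappers later recover the same schema from the job configuration.
    Schema recovered = ParquetJob.getAvroSchema(conf);
    System.out.println(recovered.toString(true));
  }
}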