/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.phoenix.mapreduce;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;
import java.util.UUID;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat;
import org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.phoenix.jdbc.PhoenixDatabaseMetaData;
import org.apache.phoenix.job.JobManager;
import org.apache.phoenix.query.QueryConstants;
import org.apache.phoenix.schema.PTable;
import org.apache.phoenix.util.CSVCommonsLoader;
import org.apache.phoenix.util.ColumnInfo;
import org.apache.phoenix.util.PhoenixRuntime;
import org.apache.phoenix.util.SchemaUtil;
import org.apache.phoenix.util.StringUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Splitter;
import com.google.common.collect.Lists;
/**
* Base tool for running MapReduce-based ingests of data.
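*
* <p>A minimal command-line sketch (the jar name, paths, table name and ZooKeeper quorum below
* are illustrative assumptions, not fixed values):</p>
* <pre>
* hadoop jar phoenix-client.jar org.apache.phoenix.mapreduce.CsvBulkLoadTool \
*     --table EXAMPLE_TABLE \
*     --input /data/example.csv \
*     --zookeeper zk1.example.com:2181
* </pre>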
*/
public class CsvBulkLoadTool extends Configured implements Tool {
private static final Logger LOG = LoggerFactory.getLogger(CsvBulkLoadTool.class);
static final Option ZK_QUORUM_OPT = new Option("z", "zookeeper", true, "Zookeeper quorum to connect to (optional)");
static final Option INPUT_PATH_OPT = new Option("i", "input", true, "Input CSV path (mandatory)");
static final Option OUTPUT_PATH_OPT = new Option("o", "output", true, "Output path for temporary HFiles (optional)");
static final Option SCHEMA_NAME_OPT = new Option("s", "schema", true, "Phoenix schema name (optional)");
static final Option TABLE_NAME_OPT = new Option("t", "table", true, "Phoenix table name (mandatory)");
static final Option INDEX_TABLE_NAME_OPT = new Option("it", "index-table", true, "Phoenix index table name when just loading this particular index table");
static final Option DELIMITER_OPT = new Option("d", "delimiter", true, "Input delimiter, defaults to comma");
static final Option ARRAY_DELIMITER_OPT = new Option("a", "array-delimiter", true, "Array element delimiter (optional)");
static final Option IMPORT_COLUMNS_OPT = new Option("c", "import-columns", true, "Comma-separated list of columns to be imported");
static final Option IGNORE_ERRORS_OPT = new Option("g", "ignore-errors", false, "Ignore input errors");
static final Option HELP_OPT = new Option("h", "help", false, "Show this help and quit");
public static void main(String[] args) throws Exception {
int exitStatus = ToolRunner.run(new CsvBulkLoadTool(), args);
System.exit(exitStatus);
}
/**
* Parses the commandline arguments, throws IllegalStateException if mandatory arguments are
* missing.
*
* @param args supplied command line arguments
* @return the parsed command line
*/
CommandLine parseOptions(String[] args) {
Options options = getOptions();
CommandLineParser parser = new PosixParser();
CommandLine cmdLine = null;
try {
cmdLine = parser.parse(options, args);
} catch (ParseException e) {
printHelpAndExit("Error parsing command line options: " + e.getMessage(), options);
}
if (cmdLine.hasOption(HELP_OPT.getOpt())) {
printHelpAndExit(options, 0);
}
if (!cmdLine.hasOption(TABLE_NAME_OPT.getOpt())) {
throw new IllegalStateException(TABLE_NAME_OPT.getLongOpt() + " is a mandatory " +
"parameter");
}
if (!cmdLine.getArgList().isEmpty()) {
throw new IllegalStateException("Got unexpected extra parameters: "
+ cmdLine.getArgList());
}
if (!cmdLine.hasOption(INPUT_PATH_OPT.getOpt())) {
throw new IllegalStateException(INPUT_PATH_OPT.getLongOpt() + " is a mandatory " +
"parameter");
}
return cmdLine;
}
private Options getOptions() {
Options options = new Options();
options.addOption(INPUT_PATH_OPT);
options.addOption(TABLE_NAME_OPT);
options.addOption(INDEX_TABLE_NAME_OPT);
options.addOption(ZK_QUORUM_OPT);
options.addOption(OUTPUT_PATH_OPT);
options.addOption(SCHEMA_NAME_OPT);
options.addOption(DELIMITER_OPT);
options.addOption(ARRAY_DELIMITER_OPT);
options.addOption(IMPORT_COLUMNS_OPT);
options.addOption(IGNORE_ERRORS_OPT);
options.addOption(HELP_OPT);
return options;
}
private void printHelpAndExit(String errorMessage, Options options) {
System.err.println(errorMessage);
printHelpAndExit(options, 1);
}
private void printHelpAndExit(Options options, int exitCode) {
HelpFormatter formatter = new HelpFormatter();
formatter.printHelp("help", options);
System.exit(exitCode);
}
@Override
public int run(String[] args) throws Exception {
Configuration conf = HBaseConfiguration.addHbaseResources(getConf());
CommandLine cmdLine = null;
try {
cmdLine = parseOptions(args);
} catch (IllegalStateException e) {
printHelpAndExit(e.getMessage(), getOptions());
}
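// Referencing DriverManager forces its class initialization, which uses the JDBC 4
// ServiceLoader mechanism to discover and register any drivers (including Phoenix) found on
// the classpath before getConnection() is called.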
Class.forName(DriverManager.class.getName());
Connection conn = DriverManager.getConnection(
getJdbcUrl(cmdLine.getOptionValue(ZK_QUORUM_OPT.getOpt())));
return loadData(conf, cmdLine, conn);
}
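/**
* Drive the bulk load: resolve the data table and any associated index tables, run one
* MapReduce job per table to write HFiles, and bulk load the results into HBase.
*/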
private int loadData(Configuration conf, CommandLine cmdLine,
Connection conn) throws SQLException, InterruptedException,
ExecutionException {
String tableName = cmdLine.getOptionValue(TABLE_NAME_OPT.getOpt());
String schemaName = cmdLine.getOptionValue(SCHEMA_NAME_OPT.getOpt());
String indexTableName = cmdLine.getOptionValue(INDEX_TABLE_NAME_OPT.getOpt());
String qualifiedTableName = getQualifiedTableName(schemaName, tableName);
String qualifiedIndexTableName = null;
if (indexTableName != null) {
qualifiedIndexTableName = getQualifiedTableName(schemaName, indexTableName);
}
List<ColumnInfo> importColumns = buildImportColumns(conn, cmdLine, qualifiedTableName);
configureOptions(cmdLine, importColumns, conf);
List<String> tablesToBeLoaded = new ArrayList<String>();
try {
validateTable(conn, schemaName, tableName);
tablesToBeLoaded.add(qualifiedTableName);
// Resolve the index tables while the connection is still open
tablesToBeLoaded.addAll(getIndexTables(conn, schemaName, qualifiedTableName));
} finally {
conn.close();
}
Path inputPath = new Path(cmdLine.getOptionValue(INPUT_PATH_OPT.getOpt()));
Path outputPath = null;
if (cmdLine.hasOption(OUTPUT_PATH_OPT.getOpt())) {
outputPath = new Path(cmdLine.getOptionValue(OUTPUT_PATH_OPT.getOpt()));
} else {
outputPath = new Path("/tmp/" + UUID.randomUUID());
}
// When loading a single index table, check that the given index table name is valid
if (qualifiedIndexTableName != null) {
boolean exists = false;
for (String tmpTable : tablesToBeLoaded) {
if (tmpTable.equalsIgnoreCase(qualifiedIndexTableName)) {
exists = true;
break;
}
}
if (!exists) {
throw new IllegalStateException("CSV Bulk Loader error: index table " +
qualifiedIndexTableName + " doesn't exist");
}
tablesToBeLoaded.clear();
tablesToBeLoaded.add(qualifiedIndexTableName);
}
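// Submit one bulk load job per target table; the jobs run concurrently on the thread pool.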
List<Future<Boolean>> runningJobs = new ArrayList<Future<Boolean>>();
ExecutorService executor = JobManager.createThreadPoolExec(Integer.MAX_VALUE, 5, 20);
try{
for(String table : tablesToBeLoaded) {
Path tablePath = new Path(outputPath, table);
Configuration jobConf = new Configuration(conf);
jobConf.set(CsvToKeyValueMapper.TABLE_NAME_CONFKEY, qualifiedTableName);
if (!qualifiedTableName.equalsIgnoreCase(table)) {
jobConf.set(CsvToKeyValueMapper.INDEX_TABLE_NAME_CONFKEY, table);
}
TableLoader tableLoader = new TableLoader(jobConf, table, inputPath, tablePath);
runningJobs.add(executor.submit(tableLoader));
}
} finally {
executor.shutdown();
}
// wait for all jobs to complete
int retCode = 0;
for(Future<Boolean> task : runningJobs){
if(!task.get() && (retCode==0)){
retCode = -1;
}
}
return retCode;
}
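/**
* Build the Phoenix JDBC URL for the given ZooKeeper quorum, falling back to localhost when no
* quorum is supplied. The result looks like {@code jdbc:phoenix:zk1.example.com:2181} (the host
* name here is illustrative).
*/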
String getJdbcUrl(String zkQuorum) {
if (zkQuorum == null) {
LOG.warn("Defaulting to localhost for ZooKeeper quorum");
zkQuorum = "localhost:2181";
}
return PhoenixRuntime.JDBC_PROTOCOL + PhoenixRuntime.JDBC_PROTOCOL_SEPARATOR + zkQuorum;
}
/**
* Build up the list of columns to be imported. The list is taken from the command line if
* present, otherwise it is taken from the table description.
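*
* <p>For example, passing {@code --import-columns ID,FIRST_NAME,AGE} (column names here are
* purely illustrative) restricts the import to those three columns.</p>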
*
* @param conn connection to Phoenix
* @param cmdLine supplied command line options
* @param qualifiedTableName table name (possibly with schema) of the table to be imported
* @return the list of columns to be imported
*/
List<ColumnInfo> buildImportColumns(Connection conn, CommandLine cmdLine,
String qualifiedTableName) throws SQLException {
List<String> userSuppliedColumnNames = null;
if (cmdLine.hasOption(IMPORT_COLUMNS_OPT.getOpt())) {
userSuppliedColumnNames = Lists.newArrayList(
Splitter.on(",").trimResults().split
(cmdLine.getOptionValue(IMPORT_COLUMNS_OPT.getOpt())));
}
return CSVCommonsLoader.generateColumnInfo(
conn, qualifiedTableName, userSuppliedColumnNames, true);
}
/**
* Calculate the HBase HTable name for which the import is to be done.
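*
* <p>For example (with illustrative names), {@code getQualifiedTableName("example_schema",
* "example_table")} returns {@code EXAMPLE_SCHEMA.EXAMPLE_TABLE}, since unquoted identifiers
* are normalized to upper case.</p>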
*
* @param schemaName import schema name, can be null
* @param tableName import table name
* @return the fully qualified table name to import into
*/
@VisibleForTesting
static String getQualifiedTableName(String schemaName, String tableName) {
if (schemaName != null) {
return String.format("%s.%s", SchemaUtil.normalizeIdentifier(schemaName),
SchemaUtil.normalizeIdentifier(tableName));
} else {
return SchemaUtil.normalizeIdentifier(tableName);
}
}
/**
* Set configuration values based on parsed command line options.
*
* @param cmdLine supplied command line options
* @param importColumns descriptors of columns to be imported
* @param conf job configuration
*/
@VisibleForTesting
static void configureOptions(CommandLine cmdLine, List<ColumnInfo> importColumns,
Configuration conf) {
char delimiterChar = ',';
if (cmdLine.hasOption(DELIMITER_OPT.getOpt())) {
String delimString = cmdLine.getOptionValue(DELIMITER_OPT.getOpt());
if (delimString.length() != 1) {
throw new IllegalArgumentException("Illegal delimiter character: " + delimString);
}
delimiterChar = delimString.charAt(0);
}
if (cmdLine.hasOption(ZK_QUORUM_OPT.getOpt())) {
String zkQuorum = cmdLine.getOptionValue(ZK_QUORUM_OPT.getOpt());
LOG.info("Configuring ZK quorum to {}", zkQuorum);
conf.set(HConstants.ZOOKEEPER_QUORUM, zkQuorum);
}
CsvBulkImportUtil.initCsvImportJob(
conf,
getQualifiedTableName(
cmdLine.getOptionValue(SCHEMA_NAME_OPT.getOpt()),
cmdLine.getOptionValue(TABLE_NAME_OPT.getOpt())),
delimiterChar,
cmdLine.getOptionValue(ARRAY_DELIMITER_OPT.getOpt()),
importColumns,
cmdLine.hasOption(IGNORE_ERRORS_OPT.getOpt()));
}
/**
* Perform any required validation on the table being bulk loaded into:
* - ensure no column family names start with '_', as such families would be ignored by the HBase bulk load process, leading to silently dropped data.
* @throws java.sql.SQLException
*/
private void validateTable(Connection conn, String schemaName,
String tableName) throws SQLException {
ResultSet rs = conn.getMetaData().getColumns(
null, StringUtil.escapeLike(schemaName),
StringUtil.escapeLike(tableName), null);
while (rs.next()) {
String familyName = rs.getString(PhoenixDatabaseMetaData.COLUMN_FAMILY);
if (familyName != null && familyName.startsWith("_")) {
if (QueryConstants.DEFAULT_COLUMN_FAMILY.equals(familyName)) {
throw new IllegalStateException(
"CSV Bulk Loader error: All column names that are not part of the " +
"primary key constraint must be prefixed with a column family " +
"name (i.e. f.my_column VARCHAR)");
} else {
throw new IllegalStateException("CSV Bulk Loader error: Column family name " +
"must not start with '_': " + familyName);
}
}
}
rs.close();
}
/**
* Get the names of the index tables of the current data table.
* @throws java.sql.SQLException
*/
private List<String> getIndexTables(Connection conn, String schemaName, String tableName)
throws SQLException {
PTable table = PhoenixRuntime.getTable(conn, tableName);
List<String> indexTables = new ArrayList<String>();
for(PTable indexTable : table.getIndexes()){
indexTables.add(getQualifiedTableName(schemaName,
indexTable.getTableName().getString()));
}
return indexTables;
}
/**
* A Callable that loads data into a single table.
*/
private static class TableLoader implements Callable<Boolean> {
private Configuration conf;
private String tableName;
private Path inputPath;
private Path outputPath;
public TableLoader(Configuration conf, String qualifiedTableName, Path inputPath,
Path outputPath){
this.conf = conf;
this.tableName = qualifiedTableName;
this.inputPath = inputPath;
this.outputPath = outputPath;
}
@Override
public Boolean call() {
LOG.info("Configuring HFile output path to {}", outputPath);
try{
Job job = new Job(conf, "Phoenix MapReduce import for " + tableName);
// Allow overriding the job jar setting by using a -D system property at startup
if (job.getJar() == null) {
job.setJarByClass(CsvToKeyValueMapper.class);
}
job.setInputFormatClass(TextInputFormat.class);
FileInputFormat.addInputPath(job, inputPath);
FileOutputFormat.setOutputPath(job, outputPath);
job.setMapperClass(CsvToKeyValueMapper.class);
job.setMapOutputKeyClass(ImmutableBytesWritable.class);
job.setMapOutputValueClass(KeyValue.class);
HTable htable = new HTable(conf, tableName);
// Auto-configure the partitioner and reducer based on the main data table
HFileOutputFormat.configureIncrementalLoad(job, htable);
LOG.info("Running MapReduce import job from {} to {}", inputPath, outputPath);
boolean success = job.waitForCompletion(true);
if (!success) {
LOG.error("Import job failed, check JobTracker for details");
return false;
}
LOG.info("Loading HFiles from {}", outputPath);
LoadIncrementalHFiles loader = new LoadIncrementalHFiles(conf);
loader.doBulkLoad(outputPath, htable);
htable.close();
LOG.info("Incremental load complete for table=" + tableName);
LOG.info("Removing output directory {}", outputPath);
if (!FileSystem.get(conf).delete(outputPath, true)) {
LOG.error("Removing output directory {} failed", outputPath);
}
return true;
} catch(Exception ex) {
LOG.error("Import job on table=" + tableName + " failed due to exception:" + ex);
return false;
}
}
}
}