/*
* Copyright 2013 NGDATA nv
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.ngdata.hbaseindexer.mr;
import static com.ngdata.hbaseindexer.indexer.SolrServerFactory.createHttpSolrServers;
import static com.ngdata.hbaseindexer.util.solr.SolrConnectionParamUtil.getSolrMaxConnectionsPerRoute;
import static com.ngdata.hbaseindexer.util.solr.SolrConnectionParamUtil.getSolrMaxConnectionsTotal;
import static com.ngdata.hbaseindexer.util.solr.SolrConnectionParamUtil.getSolrMode;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.Collections;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import com.google.common.base.Charsets;
import com.ngdata.hbaseindexer.SolrConnectionParams;
import com.ngdata.hbaseindexer.conf.IndexerComponentFactory;
import com.ngdata.hbaseindexer.conf.IndexerComponentFactoryUtil;
import com.ngdata.hbaseindexer.conf.IndexerConf;
import com.ngdata.hbaseindexer.morphline.MorphlineResultToSolrMapper;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.http.client.HttpClient;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.conn.PoolingClientConnectionManager;
import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.CloudSolrServer;
import org.apache.solr.hadoop.ForkedMapReduceIndexerTool;
import org.apache.solr.hadoop.SolrInputDocumentWritable;
import org.apache.solr.hadoop.Utils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Top-level tool for running MapReduce-based indexing pipelines over HBase tables.
*/
public class HBaseMapReduceIndexerTool extends Configured implements Tool {
private static final Logger LOG = LoggerFactory.getLogger(ForkedMapReduceIndexerTool.class);
public static void main(String[] args) throws Exception {
int res = ToolRunner.run(new Configuration(), new HBaseMapReduceIndexerTool(), args);
System.exit(res);
}
@Override
public int run(String[] args) throws Exception {
return run(args, new NopJobProcessCallback());
}
public int run(String[] args, JobProcessCallback callback) throws Exception {
HBaseIndexingOptions hbaseIndexingOpts = new HBaseIndexingOptions(getConf());
Integer exitCode = new HBaseIndexerArgumentParser().parseArgs(args, getConf(), hbaseIndexingOpts);
if (exitCode != null) {
return exitCode;
}
return run(hbaseIndexingOpts, callback);
}
public int run(HBaseIndexingOptions hbaseIndexingOpts, JobProcessCallback callback) throws Exception {
if (hbaseIndexingOpts.isDryRun) {
return new IndexerDryRun(hbaseIndexingOpts, getConf(), System.out).run();
}
long programStartTime = System.currentTimeMillis();
Configuration conf = getConf();
IndexingSpecification indexingSpec = hbaseIndexingOpts.getIndexingSpecification();
conf.set(HBaseIndexerMapper.INDEX_COMPONENT_FACTORY_KEY, indexingSpec.getIndexerComponentFactory());
conf.set(HBaseIndexerMapper.INDEX_CONFIGURATION_CONF_KEY, new String(indexingSpec.getConfiguration(), Charsets.UTF_8));
conf.set(HBaseIndexerMapper.INDEX_NAME_CONF_KEY, indexingSpec.getIndexerName());
conf.set(HBaseIndexerMapper.TABLE_NAME_CONF_KEY, indexingSpec.getTableName());
HBaseIndexerMapper.configureIndexConnectionParams(conf, indexingSpec.getIndexConnectionParams());
IndexerComponentFactory factory = IndexerComponentFactoryUtil.getComponentFactory(indexingSpec.getIndexerComponentFactory(), new ByteArrayInputStream(indexingSpec.getConfiguration()), indexingSpec.getIndexConnectionParams());
IndexerConf indexerConf = factory.createIndexerConf();
Map<String, String> params = indexerConf.getGlobalParams();
String morphlineFile = params.get(MorphlineResultToSolrMapper.MORPHLINE_FILE_PARAM);
if (hbaseIndexingOpts.morphlineFile != null) {
morphlineFile = hbaseIndexingOpts.morphlineFile.getPath();
}
if (morphlineFile != null) {
conf.set(MorphlineResultToSolrMapper.MORPHLINE_FILE_PARAM, new File(morphlineFile).getName());
ForkedMapReduceIndexerTool.addDistributedCacheFile(new File(morphlineFile), conf);
}
String morphlineId = params.get(MorphlineResultToSolrMapper.MORPHLINE_ID_PARAM);
if (hbaseIndexingOpts.morphlineId != null) {
morphlineId = hbaseIndexingOpts.morphlineId;
}
if (morphlineId != null) {
conf.set(MorphlineResultToSolrMapper.MORPHLINE_ID_PARAM, morphlineId);
}
conf.setBoolean(HBaseIndexerMapper.INDEX_DIRECT_WRITE_CONF_KEY, hbaseIndexingOpts.isDirectWrite());
if (hbaseIndexingOpts.fairSchedulerPool != null) {
conf.set("mapred.fairscheduler.pool", hbaseIndexingOpts.fairSchedulerPool);
}
// switch off a false warning about allegedly not implementing Tool
// also see http://hadoop.6.n7.nabble.com/GenericOptionsParser-warning-td8103.html
// also see https://issues.apache.org/jira/browse/HADOOP-8183
getConf().setBoolean("mapred.used.genericoptionsparser", true);
if (hbaseIndexingOpts.log4jConfigFile != null) {
Utils.setLogConfigFile(hbaseIndexingOpts.log4jConfigFile, getConf());
ForkedMapReduceIndexerTool.addDistributedCacheFile(hbaseIndexingOpts.log4jConfigFile, conf);
}
Job job = Job.getInstance(getConf());
job.setJobName(getClass().getSimpleName() + "/" + HBaseIndexerMapper.class.getSimpleName());
job.setJarByClass(HBaseIndexerMapper.class);
// job.setUserClassesTakesPrecedence(true);
TableMapReduceUtil.initTableMapperJob(
hbaseIndexingOpts.getScans(),
HBaseIndexerMapper.class,
Text.class,
SolrInputDocumentWritable.class,
job);
// explicitely set hbase configuration on the job because the TableMapReduceUtil overwrites it with the hbase defaults
// (see HBASE-4297 which is not really fixed in hbase 0.94.6 on all code paths)
HBaseConfiguration.merge(job.getConfiguration(), getConf());
int mappers = new JobClient(job.getConfiguration()).getClusterStatus().getMaxMapTasks(); // MR1
//mappers = job.getCluster().getClusterStatus().getMapSlotCapacity(); // Yarn only
LOG.info("Cluster reports {} mapper slots", mappers);
LOG.info("Using these parameters: " +
"reducers: {}, shards: {}, fanout: {}, maxSegments: {}",
new Object[]{hbaseIndexingOpts.reducers, hbaseIndexingOpts.shards, hbaseIndexingOpts.fanout,
hbaseIndexingOpts.maxSegments});
if (hbaseIndexingOpts.isDirectWrite()) {
CloudSolrServer solrServer = new CloudSolrServer(hbaseIndexingOpts.zkHost);
solrServer.setDefaultCollection(hbaseIndexingOpts.collection);
if (hbaseIndexingOpts.clearIndex) {
clearSolr(indexingSpec.getIndexConnectionParams());
}
// Run a mapper-only MR job that sends index documents directly to a live Solr instance.
job.setOutputFormatClass(NullOutputFormat.class);
job.setNumReduceTasks(0);
job.submit();
callback.jobStarted(job.getJobID().toString(), job.getTrackingURL());
if (!ForkedMapReduceIndexerTool.waitForCompletion(job, hbaseIndexingOpts.isVerbose)) {
return -1; // job failed
}
commitSolr(indexingSpec.getIndexConnectionParams());
ForkedMapReduceIndexerTool.goodbye(job, programStartTime);
return 0;
} else {
FileSystem fileSystem = FileSystem.get(getConf());
if (fileSystem.exists(hbaseIndexingOpts.outputDir)) {
if (hbaseIndexingOpts.overwriteOutputDir) {
LOG.info("Removing existing output directory {}", hbaseIndexingOpts.outputDir);
if (!fileSystem.delete(hbaseIndexingOpts.outputDir, true)) {
LOG.error("Deleting output directory '{}' failed", hbaseIndexingOpts.outputDir);
return -1;
}
} else {
LOG.error("Output directory '{}' already exists. Run with --overwrite-output-dir to " +
"overwrite it, or remove it manually", hbaseIndexingOpts.outputDir);
return -1;
}
}
int exitCode = ForkedMapReduceIndexerTool.runIndexingPipeline(
job, callback, getConf(), hbaseIndexingOpts.asOptions(),
programStartTime,
fileSystem,
null, -1, // File-based parameters
-1, // num mappers, only of importance for file-based indexing
hbaseIndexingOpts.reducers
);
if (hbaseIndexingOpts.isGeneratedOutputDir()) {
LOG.info("Deleting generated output directory " + hbaseIndexingOpts.outputDir);
fileSystem.delete(hbaseIndexingOpts.outputDir, true);
}
return exitCode;
}
}
private void clearSolr(Map<String, String> indexConnectionParams) throws SolrServerException, IOException {
Set<SolrServer> servers = createSolrServers(indexConnectionParams);
for (SolrServer server : servers) {
server.deleteByQuery("*:*");
server.commit(false, false);
server.shutdown();
}
}
private void commitSolr(Map<String, String> indexConnectionParams) throws SolrServerException, IOException {
Set<SolrServer> servers = createSolrServers(indexConnectionParams);
for (SolrServer server : servers) {
server.commit(false, false);
server.shutdown();
}
}
private Set<SolrServer> createSolrServers(Map<String, String> indexConnectionParams) throws MalformedURLException {
String solrMode = getSolrMode(indexConnectionParams);
if (solrMode.equals("cloud")) {
String indexZkHost = indexConnectionParams.get(SolrConnectionParams.ZOOKEEPER);
String collectionName = indexConnectionParams.get(SolrConnectionParams.COLLECTION);
CloudSolrServer solrServer = new CloudSolrServer(indexZkHost);
solrServer.setDefaultCollection(collectionName);
return Collections.singleton((SolrServer) solrServer);
} else if (solrMode.equals("classic")) {
PoolingClientConnectionManager connectionManager = new PoolingClientConnectionManager();
connectionManager.setDefaultMaxPerRoute(getSolrMaxConnectionsPerRoute(indexConnectionParams));
connectionManager.setMaxTotal(getSolrMaxConnectionsTotal(indexConnectionParams));
HttpClient httpClient = new DefaultHttpClient(connectionManager);
return new HashSet<SolrServer>(createHttpSolrServers(indexConnectionParams, httpClient));
} else {
throw new RuntimeException("Only 'cloud' and 'classic' are valid values for solr.mode, but got " + solrMode);
}
}
}