    this.pGraphContext = pGraphContext;
  }
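
  /**
   * Checks whether the given map join can be converted into a bucket map join.
   * Every table (or pruned partition) taking part in the join must be bucketed
   * on the join keys, the number of bucket files on disk must match the
   * declared bucket count, and the small tables' bucket counts must be
   * compatible with the big table's. Returns false as soon as any of these
   * preconditions fails.
   */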
  private boolean convertBucketMapJoin(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
      Object... nodeOutputs) throws SemanticException {
    MapJoinOperator mapJoinOp = (MapJoinOperator) nd;
    BucketMapjoinOptProcCtx context = (BucketMapjoinOptProcCtx) procCtx;
    HiveConf conf = context.getConf();
    if (context.getListOfRejectedMapjoins().contains(mapJoinOp)) {
      return false;
    }
    QBJoinTree joinCxt = this.pGraphContext.getMapJoinContext().get(mapJoinOp);
    if (joinCxt == null) {
      return false;
    }
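    // Collect every alias taking part in the join; the alias that is not a
    // map-side (small-table) alias is treated as the big table.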
    List<String> joinAliases = new ArrayList<String>();
    String[] srcs = joinCxt.getBaseSrc();
    String[] left = joinCxt.getLeftAliases();
    List<String> mapAlias = joinCxt.getMapAliases();
    String baseBigAlias = null;
    for (String s : left) {
      if (s != null && !joinAliases.contains(s)) {
        joinAliases.add(s);
        if (!mapAlias.contains(s)) {
          baseBigAlias = s;
        }
      }
    }
    for (String s : srcs) {
      if (s != null && !joinAliases.contains(s)) {
        joinAliases.add(s);
        if (!mapAlias.contains(s)) {
          baseBigAlias = s;
        }
      }
    }
    MapJoinDesc mjDesc = mapJoinOp.getConf();
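    // Per small-table alias: the bucket count and the bucket file names of
    // each pruned partition.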
    LinkedHashMap<String, List<Integer>> aliasToPartitionBucketNumberMapping =
        new LinkedHashMap<String, List<Integer>>();
    LinkedHashMap<String, List<List<String>>> aliasToPartitionBucketFileNamesMapping =
        new LinkedHashMap<String, List<List<String>>>();
    Map<String, Operator<? extends OperatorDesc>> topOps =
        this.pGraphContext.getTopOps();
    Map<TableScanOperator, Table> topToTable = this.pGraphContext.getTopToTable();
    // (partition -> bucket file names) and (partition -> bucket number) for
    // the big table
    LinkedHashMap<Partition, List<String>> bigTblPartsToBucketFileNames =
        new LinkedHashMap<Partition, List<String>>();
    LinkedHashMap<Partition, Integer> bigTblPartsToBucketNumber =
        new LinkedHashMap<Partition, Integer>();
    // order in which the join columns map to the bucket columns; must be the
    // same for every table in the join
    Integer[] orders = null;
    boolean bigTablePartitioned = true;
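    // For every alias in the join, make sure the table (or each of its pruned
    // partitions) is bucketed on the join keys and collect the bucket counts
    // and bucket file names.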
    for (int index = 0; index < joinAliases.size(); index++) {
      String alias = joinAliases.get(index);
      TableScanOperator tso = (TableScanOperator) topOps.get(alias);
      if (tso == null) {
        return false;
      }
      List<String> keys = toColumns(mjDesc.getKeys().get((byte) index));
      if (keys == null || keys.isEmpty()) {
        return false;
      }
      if (orders == null) {
        orders = new Integer[keys.size()];
      }
      Table tbl = topToTable.get(tso);
      if (tbl.isPartitioned()) {
        PrunedPartitionList prunedParts;
        try {
          prunedParts = pGraphContext.getOpToPartList().get(tso);
          if (prunedParts == null) {
            prunedParts = PartitionPruner.prune(tbl, pGraphContext.getOpToPartPruner().get(tso),
                pGraphContext.getConf(), alias, pGraphContext.getPrunedPartitions());
            pGraphContext.getOpToPartList().put(tso, prunedParts);
          }
        } catch (HiveException e) {
          // Has to use the full name to make sure it does not conflict with
          // org.apache.commons.lang.StringUtils
          LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
          throw new SemanticException(e.getMessage(), e);
        }
        List<Partition> partitions = prunedParts.getNotDeniedPartns();
        // construct a mapping of (partition -> bucket file names) and (partition -> bucket number)
        if (partitions.isEmpty()) {
          if (!alias.equals(baseBigAlias)) {
            aliasToPartitionBucketNumberMapping.put(alias, Arrays.<Integer>asList());
            aliasToPartitionBucketFileNamesMapping.put(alias, new ArrayList<List<String>>());
          }
        } else {
          List<Integer> buckets = new ArrayList<Integer>();
          List<List<String>> files = new ArrayList<List<String>>();
          for (Partition p : partitions) {
            if (!checkBucketColumns(p.getBucketCols(), keys, orders)) {
              return false;
            }
            List<String> fileNames = getOnePartitionBucketFileNames(p.getDataLocation());
            // The number of files in the partition should be the same as its number of buckets.
            int bucketCount = p.getBucketCount();
            if (fileNames.size() != bucketCount) {
              String msg = "The number of buckets for table " +
                  tbl.getTableName() + " partition " + p.getName() + " is " +
                  bucketCount + ", whereas the number of files is " + fileNames.size();
              throw new SemanticException(
                  ErrorMsg.BUCKETED_TABLE_METADATA_INCORRECT.getMsg(msg));
            }
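            // big-table partitions are recorded per partition; for the small
            // tables the bucket info is accumulated per alias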
            if (alias.equals(baseBigAlias)) {
              bigTblPartsToBucketFileNames.put(p, fileNames);
              bigTblPartsToBucketNumber.put(p, bucketCount);
            } else {
              files.add(fileNames);
              buckets.add(bucketCount);
            }
          }
          if (!alias.equals(baseBigAlias)) {
            aliasToPartitionBucketNumberMapping.put(alias, buckets);
            aliasToPartitionBucketFileNamesMapping.put(alias, files);
          }
        }
      } else {
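        // non-partitioned table: validate the bucketing metadata of the table itself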
        if (!checkBucketColumns(tbl.getBucketCols(), keys, orders)) {
          return false;
        }
        List<String> fileNames = getOnePartitionBucketFileNames(tbl.getDataLocation());
        Integer num = Integer.valueOf(tbl.getNumBuckets());
        // The number of files for the table should be the same as its number of buckets.
        if (fileNames.size() != num) {
          String msg = "The number of buckets for table " +
              tbl.getTableName() + " is " + tbl.getNumBuckets() +
              ", whereas the number of files is " + fileNames.size();
          throw new SemanticException(
              ErrorMsg.BUCKETED_TABLE_METADATA_INCORRECT.getMsg(msg));
        }
        if (alias.equals(baseBigAlias)) {
          bigTblPartsToBucketFileNames.put(null, fileNames);
          bigTblPartsToBucketNumber.put(null, tbl.getNumBuckets());
          bigTablePartitioned = false;
        } else {
          aliasToPartitionBucketNumberMapping.put(alias, Arrays.asList(num));
          aliasToPartitionBucketFileNamesMapping.put(alias, Arrays.asList(fileNames));
        }
      }
    }
    // At this point every table or partition in the join is bucketed and its
    // bucket count has been collected above; check that the number of buckets
    // in the big table is divisible by the number of buckets in each small table.
    for (Integer bucketNumber : bigTblPartsToBucketNumber.values()) {
      if (!checkBucketNumberAgainstBigTable(aliasToPartitionBucketNumberMapping, bucketNumber)) {
        return false;
      }
    }
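    // Build the mapping that the bucket map join uses at run time:
    // small-table alias -> (big-table bucket file name -> matching small-table bucket files).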
    MapJoinDesc desc = mapJoinOp.getConf();
    Map<String, Map<String, List<String>>> aliasBucketFileNameMapping =
        new LinkedHashMap<String, Map<String, List<String>>>();
    // sort bucket names for the big table