baseBigAlias = s;
}
}
}
MapJoinDesc mjDesc = mapJoinOp.getConf();
LinkedHashMap<String, List<Integer>> aliasToPartitionBucketNumberMapping =
new LinkedHashMap<String, List<Integer>>();
LinkedHashMap<String, List<List<String>>> aliasToPartitionBucketFileNamesMapping =
new LinkedHashMap<String, List<List<String>>>();
Map<String, Operator<? extends OperatorDesc>> topOps =
this.pGraphContext.getTopOps();
Map<TableScanOperator, Table> topToTable = this.pGraphContext.getTopToTable();
// (partition -> bucket file names) and (partition -> bucket number) mappings
// for the big table. An unpartitioned big table is stored under a null key.
LinkedHashMap<Partition, List<String>> bigTblPartsToBucketFileNames = new LinkedHashMap<Partition, List<String>>();
LinkedHashMap<Partition, Integer> bigTblPartsToBucketNumber = new LinkedHashMap<Partition, Integer>();
Integer[] orders = null; // accessing order of join cols to bucket cols, should be same
boolean bigTablePartitioned = true;
for (int index = 0; index < joinAliases.size(); index++) {
String alias = joinAliases.get(index);
TableScanOperator tso = (TableScanOperator) topOps.get(alias);
if (tso == null) {
return false;
}
List<String> keys = toColumns(mjDesc.getKeys().get((byte) index));
if (keys == null || keys.isEmpty()) {
return false;
}
if (orders == null) {
orders = new Integer[keys.size()];
}
Table tbl = topToTable.get(tso);
if(tbl.isPartitioned()) {
PrunedPartitionList prunedParts;
try {
prunedParts = pGraphContext.getOpToPartList().get(tso);
if (prunedParts == null) {
prunedParts = PartitionPruner.prune(tbl, pGraphContext.getOpToPartPruner().get(tso), pGraphContext.getConf(), alias,
pGraphContext.getPrunedPartitions());
pGraphContext.getOpToPartList().put(tso, prunedParts);
}
} catch (HiveException e) {
// Has to use full name to make sure it does not conflict with
// org.apache.commons.lang.StringUtils
LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
throw new SemanticException(e.getMessage(), e);
}
List<Partition> partitions = prunedParts.getNotDeniedPartns();
// construct a mapping of (Partition->bucket file names) and (Partition -> bucket number)
if (partitions.isEmpty()) {
if (!alias.equals(baseBigAlias)) {
aliasToPartitionBucketNumberMapping.put(alias, Arrays.<Integer>asList());
aliasToPartitionBucketFileNamesMapping.put(alias, new ArrayList<List<String>>());
}
} else {
List<Integer> buckets = new ArrayList<Integer>();
List<List<String>> files = new ArrayList<List<String>>();
for (Partition p : partitions) {
if (!checkBucketColumns(p.getBucketCols(), keys, orders)) {
return false;
}
List<String> fileNames = getOnePartitionBucketFileNames(p.getDataLocation());
// The number of files for the table should be same as number of buckets.
int bucketCount = p.getBucketCount();
if (fileNames.size() != bucketCount) {
String msg = "The number of buckets for table " +
tbl.getTableName() + " partition " + p.getName() + " is " +
p.getBucketCount() + ", whereas the number of files is " + fileNames.size();
throw new SemanticException(
ErrorMsg.BUCKETED_TABLE_METADATA_INCORRECT.getMsg(msg));
}
if (alias.equals(baseBigAlias)) {
bigTblPartsToBucketFileNames.put(p, fileNames);
bigTblPartsToBucketNumber.put(p, bucketCount);
} else {
files.add(fileNames);
buckets.add(bucketCount);
}
}
if (!alias.equals(baseBigAlias)) {
aliasToPartitionBucketNumberMapping.put(alias, buckets);
aliasToPartitionBucketFileNamesMapping.put(alias, files);
}
}
} else {
if (!checkBucketColumns(tbl.getBucketCols(), keys, orders)) {
return false;
}
List<String> fileNames = getOnePartitionBucketFileNames(tbl.getDataLocation());
Integer num = new Integer(tbl.getNumBuckets());
// The number of files for the table should be same as number of buckets.
if (fileNames.size() != num) {
String msg = "The number of buckets for table " +
tbl.getTableName() + " is " + tbl.getNumBuckets() +
", whereas the number of files is " + fileNames.size();
throw new SemanticException(
ErrorMsg.BUCKETED_TABLE_METADATA_INCORRECT.getMsg(msg));
}
if (alias.equals(baseBigAlias)) {
bigTblPartsToBucketFileNames.put(null, fileNames);
bigTblPartsToBucketNumber.put(null, tbl.getNumBuckets());
bigTablePartitioned = false;
} else {
aliasToPartitionBucketNumberMapping.put(alias, Arrays.asList(num));
aliasToPartitionBucketFileNamesMapping.put(alias, Arrays.asList(fileNames));
}
}
}
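// All tables or partitions are bucketed. The bucket numbers of the big table
// are stored in 'bigTblPartsToBucketNumber' and those of the small tables in
// 'aliasToPartitionBucketNumberMapping'. We need to check if the number of
// buckets in the big table can be divided by the number of buckets in the
// small tables.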
for (Integer bucketNumber : bigTblPartsToBucketNumber.values()) {
if (!checkBucketNumberAgainstBigTable(aliasToPartitionBucketNumberMapping, bucketNumber)) {
return false;
}
}
MapJoinDesc desc = mapJoinOp.getConf();
Map<String, Map<String, List<String>>> aliasBucketFileNameMapping =
new LinkedHashMap<String, Map<String, List<String>>>();
//sort bucket names for the big table
for(List<String> partBucketNames : bigTblPartsToBucketFileNames.values()) {
Collections.sort(partBucketNames);
}
// go through all small tables and get the mapping from bucket file name
// in the big table to bucket file names in small tables.
for (int j = 0; j < joinAliases.size(); j++) {
String alias = joinAliases.get(j);
if (alias.equals(baseBigAlias)) {
continue;
}
for (List<String> names : aliasToPartitionBucketFileNamesMapping.get(alias)) {
Collections.sort(names);
}
List<Integer> smallTblBucketNums = aliasToPartitionBucketNumberMapping.get(alias);
List<List<String>> smallTblFilesList = aliasToPartitionBucketFileNamesMapping.get(alias);
Map<String, List<String>> mapping = new LinkedHashMap<String, List<String>>();
aliasBucketFileNameMapping.put(alias, mapping);
// For each bucket file in the big table, get the corresponding bucket file
// names in the small table.
// The big table may have more than one partition; do the mapping for each
// partition.
Iterator<Entry<Partition, List<String>>> bigTblPartToBucketNames =
bigTblPartsToBucketFileNames.entrySet().iterator();
Iterator<Entry<Partition, Integer>> bigTblPartToBucketNum = bigTblPartsToBucketNumber
.entrySet().iterator();
while (bigTblPartToBucketNames.hasNext()) {
assert bigTblPartToBucketNum.hasNext();
int bigTblBucketNum = bigTblPartToBucketNum.next().getValue();
List<String> bigTblBucketNameList = bigTblPartToBucketNames.next().getValue();
fillMapping(smallTblBucketNums, smallTblFilesList,
mapping, bigTblBucketNum, bigTblBucketNameList, desc.getBigTableBucketNumMapping());
}
}
desc.setAliasBucketFileNameMapping(aliasBucketFileNameMapping);
desc.setBigTableAlias(baseBigAlias);
if (bigTablePartitioned) {
desc.setBigTablePartSpecToFileMapping(convert(bigTblPartsToBucketFileNames));
}
return true;
}