baseBigAlias = s;
}
}
}
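// Collect bucketing metadata for every table (or partition) in the join:
// the number of buckets and the names of the bucket files.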
MapJoinDesc mjDesc = mapJoinOp.getConf();
LinkedHashMap<String, Integer> aliasToBucketNumberMapping = new LinkedHashMap<String, Integer>();
LinkedHashMap<String, List<String>> aliasToBucketFileNamesMapping = new LinkedHashMap<String, List<String>>();
// Right now this code does not work with "a join b on a.key = b.key and
// a.ds = b.ds", where ds is a partition column. It only works with joins
// where only one partition is present in each join source table.
Map<String, Operator<? extends Serializable>> topOps = this.pGraphContext.getTopOps();
Map<TableScanOperator, Table> topToTable = this.pGraphContext.getTopToTable();
// (partition -> bucket file names) and (partition -> bucket number) for
// the big table
LinkedHashMap<Partition, List<String>> bigTblPartsToBucketFileNames = new LinkedHashMap<Partition, List<String>>();
LinkedHashMap<Partition, Integer> bigTblPartsToBucketNumber = new LinkedHashMap<Partition, Integer>();
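// Walk each join alias and validate that it qualifies for a bucketed map
// join; any disqualifying layout aborts the optimization by returning null.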
for (int index = 0; index < joinAliases.size(); index++) {
String alias = joinAliases.get(index);
TableScanOperator tso = (TableScanOperator) topOps.get(alias);
if (tso == null) {
return null;
}
Table tbl = topToTable.get(tso);
if (tbl.isPartitioned()) {
PrunedPartitionList prunedParts = null;
try {
prunedParts = pGraphContext.getOpToPartList().get(tso);
if (prunedParts == null) {
prunedParts = PartitionPruner.prune(tbl, pGraphContext.getOpToPartPruner().get(tso), pGraphContext.getConf(), alias,
pGraphContext.getPrunedPartitions());
pGraphContext.getOpToPartList().put(tso, prunedParts);
}
} catch (HiveException e) {
// Has to use full name to make sure it does not conflict with
// org.apache.commons.lang.StringUtils
LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
throw new SemanticException(e.getMessage(), e);
}
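// Partitions surviving pruning are either confirmed or unknown; both kinds
// count toward the number of partitions for this alias.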
int partNumber = prunedParts.getConfirmedPartns().size()
+ prunedParts.getUnknownPartns().size();
if (partNumber > 1) {
// Only one partition is allowed for each small table.
if (!alias.equals(baseBigAlias)) {
return null;
}
// This is the big table and it has more than one partition: construct a
// mapping of (Partition -> bucket file names) and
// (Partition -> bucket number).
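// Confirmed and unknown pruned partitions are treated alike: each must use
// the join keys as its bucket columns to qualify.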
List<Partition> allParts = new ArrayList<Partition>(prunedParts.getConfirmedPartns());
allParts.addAll(prunedParts.getUnknownPartns());
for (Partition p : allParts) {
if (!checkBucketColumns(p.getBucketCols(), mjDesc, index)) {
return null;
}
List<String> fileNames = getOnePartitionBucketFileNames(p);
bigTblPartsToBucketFileNames.put(p, fileNames);
bigTblPartsToBucketNumber.put(p, p.getBucketCount());
}
// If there is more than one partition for the big table,
// aliasToBucketFileNamesMapping and aliasToBucketNumberMapping will not
// contain mappings for the big table. Instead, the mappings are kept in
// bigTblPartsToBucketFileNames and bigTblPartsToBucketNumber.
} else {
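// Exactly one pruned partition for this alias; it may be either a
// confirmed or an unknown partition.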
Partition part = null;
Iterator<Partition> iter = prunedParts.getConfirmedPartns()
.iterator();
if (iter.hasNext()) {
part = iter.next();
}
if (part == null) {
iter = prunedParts.getUnknownPartns().iterator();
if (iter.hasNext()) {
part = iter.next();
}
}
assert part != null;
Integer num = Integer.valueOf(part.getBucketCount());
aliasToBucketNumberMapping.put(alias, num);
if (!checkBucketColumns(part.getBucketCols(), mjDesc, index)) {
return null;
}
List<String> fileNames = getOnePartitionBucketFileNames(part);
aliasToBucketFileNamesMapping.put(alias, fileNames);
if (alias.equals(baseBigAlias)) {
bigTblPartsToBucketFileNames.put(part, fileNames);
bigTblPartsToBucketNumber.put(part, num);
}
}
} else {
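// Non-partitioned table: bucket metadata lives on the table itself, and
// the bucket files sit directly under the table's data location.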
if (!checkBucketColumns(tbl.getBucketCols(), mjDesc, index)) {
return null;
}
Integer num = Integer.valueOf(tbl.getNumBuckets());
aliasToBucketNumberMapping.put(alias, num);
List<String> fileNames = new ArrayList<String>();
try {
FileSystem fs = FileSystem.get(tbl.getDataLocation(), this.pGraphContext.getConf());
FileStatus[] files = fs.listStatus(new Path(tbl.getDataLocation().toString()));
if (files != null) {
for (FileStatus file : files) {
fileNames.add(file.getPath().toString());
}
}
} catch (IOException e) {
throw new SemanticException(e);
}
aliasToBucketFileNamesMapping.put(alias, fileNames);
}
}
// All tables or partitions are bucketed, and their bucket numbers are
// stored in 'aliasToBucketNumberMapping'. We need to check that the number
// of buckets in the big table can be divided by the number of buckets in
// the small tables.
if (!bigTblPartsToBucketNumber.isEmpty()) {
for (Integer bucketNumberInPart : bigTblPartsToBucketNumber.values()) {
if (!checkBucketNumberAgainstBigTable(aliasToBucketNumberMapping,
bucketNumberInPart.intValue())) {
return null;
}
}
} else {
int bucketNoInBigTbl = aliasToBucketNumberMapping.get(baseBigAlias).intValue();
if (!checkBucketNumberAgainstBigTable(aliasToBucketNumberMapping,
bucketNoInBigTbl)) {
return null;
}
}
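// Build, for every small-table alias, the mapping from each big-table
// bucket file to the small-table bucket files it joins with. As a rough
// sketch: with a 4-bucket big table and a 2-bucket small table, big-table
// bucket i is expected to pair with small-table bucket i % 2; the exact
// pairing is computed by fillMapping.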
LinkedHashMap<String, LinkedHashMap<String, ArrayList<String>>> aliasBucketFileNameMapping =
new LinkedHashMap<String, LinkedHashMap<String, ArrayList<String>>>();
// Sort bucket file names for the big table.
if (!bigTblPartsToBucketNumber.isEmpty()) {
for (List<String> partBucketNames : bigTblPartsToBucketFileNames.values()) {
Collections.sort(partBucketNames);
}
} else {
Collections.sort(aliasToBucketFileNamesMapping.get(baseBigAlias));
}
// Go through all small tables and compute the mapping from each bucket
// file name in the big table to the bucket file names in the small table.
for (int j = 0; j < joinAliases.size(); j++) {
String alias = joinAliases.get(j);
if (alias.equals(baseBigAlias)) {
continue;
}
Collections.sort(aliasToBucketFileNamesMapping.get(alias));
LinkedHashMap<String, ArrayList<String>> mapping = new LinkedHashMap<String, ArrayList<String>>();
aliasBucketFileNameMapping.put(alias, mapping);
// for each bucket file in big table, get the corresponding bucket file
// name in the small table.
if (!bigTblPartsToBucketNumber.isEmpty()) {
// More than one partition in the big table: build the mapping per partition.
for (Entry<Partition, List<String>> entry : bigTblPartsToBucketFileNames.entrySet()) {
int bigTblBucketNum = bigTblPartsToBucketNumber.get(entry.getKey()).intValue();
List<String> bigTblBucketNameList = entry.getValue();
fillMapping(baseBigAlias, aliasToBucketNumberMapping,
aliasToBucketFileNamesMapping, alias, mapping, bigTblBucketNum,
bigTblBucketNameList, mjDesc.getBucketFileNameMapping());
}
} else {
List<String> bigTblBucketNameList = aliasToBucketFileNamesMapping.get(baseBigAlias);
int bigTblBucketNum = aliasToBucketNumberMapping.get(baseBigAlias);
fillMapping(baseBigAlias, aliasToBucketNumberMapping,
aliasToBucketFileNamesMapping, alias, mapping, bigTblBucketNum,
bigTblBucketNameList, mjDesc.getBucketFileNameMapping());
}
}
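// Publish the computed mappings on the MapJoinDesc so that, at execution
// time, each mapper only loads the small-table bucket files that match
// the big-table bucket it processes.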
mjDesc.setAliasBucketFileNameMapping(aliasBucketFileNameMapping);
mjDesc.setBigTableAlias(baseBigAlias);
return null;
}