@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
Object... nodeOutputs) throws SemanticException {
MapJoinOperator mapJoinOp = (MapJoinOperator) nd;
BucketMapjoinOptProcCtx context = (BucketMapjoinOptProcCtx) procCtx;
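// Skip map joins that an earlier check has already rejected for this optimization.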
if(context.getListOfRejectedMapjoins().contains(mapJoinOp)) {
return null;
}
QBJoinTree joinCxt = this.pGraphContext.getMapJoinContext().get(mapJoinOp);
if(joinCxt == null) {
return null;
}
List<String> joinAliases = new ArrayList<String>();
String[] srcs = joinCxt.getBaseSrc();
String[] left = joinCxt.getLeftAliases();
List<String> mapAlias = joinCxt.getMapAliases();
String baseBigAlias = null;
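// Collect all join aliases; the alias that is not a map-side (small table)
// alias is treated as the big table alias.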
for(String s : left) {
if(s != null && !joinAliases.contains(s)) {
joinAliases.add(s);
if(!mapAlias.contains(s)) {
baseBigAlias = s;
}
}
}
for(String s : srcs) {
if(s != null && !joinAliases.contains(s)) {
joinAliases.add(s);
if(!mapAlias.contains(s)) {
baseBigAlias = s;
}
}
}
MapJoinDesc mjDecs = mapJoinOp.getConf();
LinkedHashMap<String, Integer> aliasToBucketNumberMapping = new LinkedHashMap<String, Integer>();
LinkedHashMap<String, List<String>> aliasToBucketFileNamesMapping = new LinkedHashMap<String, List<String>>();
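// Bucket count and bucket file names keyed by table alias. A big table with
// more than one partition is tracked separately in the maps declared below.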
// Right now this code does not work with "a join b on a.key = b.key and
// a.ds = b.ds", where ds is a partition column. It only works with joins
// in which each join source table has only one partition present.
Map<String, Operator<? extends Serializable>> topOps = this.pGraphContext.getTopOps();
Map<TableScanOperator, Table> topToTable = this.pGraphContext.getTopToTable();
// (partition -> bucket file names) and (partition -> bucket number) for
// the big table.
LinkedHashMap<Partition, List<String>> bigTblPartsToBucketFileNames = new LinkedHashMap<Partition, List<String>>();
LinkedHashMap<Partition, Integer> bigTblPartsToBucketNumber = new LinkedHashMap<Partition, Integer>();
for (int index = 0; index < joinAliases.size(); index++) {
String alias = joinAliases.get(index);
TableScanOperator tso = (TableScanOperator) topOps.get(alias);
if (tso == null) {
return null;
}
Table tbl = topToTable.get(tso);
if(tbl.isPartitioned()) {
PrunedPartitionList prunedParts = null;
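// Reuse the cached pruned partition list for this table scan if pruning
// already ran; otherwise prune now and cache the result.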
try {
prunedParts = pGraphContext.getOpToPartList().get(tso);
if (prunedParts == null) {
prunedParts = PartitionPruner.prune(tbl, pGraphContext.getOpToPartPruner().get(tso), pGraphContext.getConf(), alias,
pGraphContext.getPrunedPartitions());
pGraphContext.getOpToPartList().put(tso, prunedParts);
}
} catch (HiveException e) {
// Has to use full name to make sure it does not conflict with
// org.apache.commons.lang.StringUtils
LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
throw new SemanticException(e.getMessage(), e);
}
int partNumber = prunedParts.getConfirmedPartns().size()
+ prunedParts.getUnknownPartns().size();
if (partNumber > 1) {
// only allow one partition for small tables
if (!alias.equals(baseBigAlias)) {
return null;
}
// This is the big table and it has more than one partition.
// Construct mappings of (Partition -> bucket file names) and
// (Partition -> bucket number).
List<Partition> allBigTblPartitions = new ArrayList<Partition>();
allBigTblPartitions.addAll(prunedParts.getConfirmedPartns());
allBigTblPartitions.addAll(prunedParts.getUnknownPartns());
for (Partition p : allBigTblPartitions) {
if (!checkBucketColumns(p.getBucketCols(), mjDecs, index)) {
return null;
}
List<String> fileNames = getOnePartitionBucketFileNames(p);
bigTblPartsToBucketFileNames.put(p, fileNames);
bigTblPartsToBucketNumber.put(p, p.getBucketCount());
}
// If the big table has more than one partition,
// aliasToBucketFileNamesMapping and aliasToBucketNumberMapping will
// not contain mappings for the big table. Instead, its mappings are
// kept in bigTblPartsToBucketFileNames and bigTblPartsToBucketNumber.
} else {
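// Only one partition remains after pruning; record its bucket count and
// bucket file names under the table alias.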
Partition part = null;
Iterator<Partition> iter = prunedParts.getConfirmedPartns()
.iterator();
if (iter.hasNext()) {
part = iter.next();
}
if (part == null) {
iter = prunedParts.getUnknownPartns().iterator();
if (iter.hasNext()) {
part = iter.next();
}
}
assert part != null;
Integer num = Integer.valueOf(part.getBucketCount());
aliasToBucketNumberMapping.put(alias, num);
if (!checkBucketColumns(part.getBucketCols(), mjDecs, index)) {
return null;
}
List<String> fileNames = getOnePartitionBucketFileNames(part);
aliasToBucketFileNamesMapping.put(alias, fileNames);
if (alias.equals(baseBigAlias)) {
bigTblPartsToBucketFileNames.put(part, fileNames);
bigTblPartsToBucketNumber.put(part, num);
}
}
} else {
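// Unpartitioned table: take the bucket count from the table itself and list
// the bucket files directly under the table's data location.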
if (!checkBucketColumns(tbl.getBucketCols(), mjDecs, index)) {
return null;
}
Integer num = Integer.valueOf(tbl.getNumBuckets());
aliasToBucketNumberMapping.put(alias, num);
List<String> fileNames = new ArrayList<String>();
try {
FileSystem fs = FileSystem.get(tbl.getDataLocation(), this.pGraphContext.getConf());
FileStatus[] files = fs.listStatus(new Path(tbl.getDataLocation().toString()));
if(files != null) {
for(FileStatus file : files) {
fileNames.add(file.getPath().toString());
}
}
} catch (IOException e) {
throw new SemanticException(e);
}
aliasToBucketFileNamesMapping.put(alias, fileNames);
}
}
// All tables or partitions are bucketed, and their bucket counts are
// stored in aliasToBucketNumberMapping (or, for a multi-partition big
// table, in bigTblPartsToBucketNumber). Check that the number of buckets
// in the big table is divisible by the number of buckets in each small table.
if (bigTblPartsToBucketNumber.size() > 0) {
for (int bucketNumberInPart : bigTblPartsToBucketNumber.values()) {
if (!checkBucketNumberAgainstBigTable(aliasToBucketNumberMapping,
bucketNumberInPart)) {
return null;
}
}
} else {
int bucketNoInBigTbl = aliasToBucketNumberMapping.get(baseBigAlias).intValue();
if (!checkBucketNumberAgainstBigTable(aliasToBucketNumberMapping,
bucketNoInBigTbl)) {
return null;
}
}
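// alias -> (big table bucket file name -> corresponding small table bucket
// file names); populated in the remainder of this method and attached to
// the map join descriptor.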
MapJoinDesc desc = mapJoinOp.getConf();
LinkedHashMap<String, LinkedHashMap<String, ArrayList<String>>> aliasBucketFileNameMapping =
new LinkedHashMap<String, LinkedHashMap<String, ArrayList<String>>>();
// sort bucket names for the big table