if (lo.getSchema()==null)
{
safeToPrune = false;
return;
}
RelationalOperator rlo = (RelationalOperator)lo;
List<LogicalOperator> predecessors = (mPlan.getPredecessors(rlo) == null ? null
: new ArrayList<LogicalOperator>(mPlan.getPredecessors(rlo)));
// Now we have collected the required output fields of LOLoad (including required map keys).
// We need to push these into the loader
if (rlo instanceof LOLoad)
{
// LOLoad has only one output
RequiredFields loaderRequiredFields = requiredOutputInfo.requiredFieldsList.get(0);
prunedLoaderColumnsMap.put((LOLoad)rlo, loaderRequiredFields);
return;
}
// If the predecessor is one of LOStore/LOStream/LODistinct, we stop tracing up.
// We require all input fields. We stop processing here. The optimizer will
// pick the next ForEach and start processing from there
if (rlo instanceof LOStore || rlo instanceof LOStream || rlo instanceof LODistinct) {
return;
}
// merge requiredOutputFields and process the predecessor
if (rlo instanceof LOSplit)
{
List<RequiredFields> requiredInputFieldsList = new ArrayList<RequiredFields>();
RequiredFields requiredFields = new RequiredFields(false);
for (int i=0;i<mPlan.getSuccessors(rlo).size();i++)
{
RequiredFields rf = null;
try {
rf = requiredOutputInfo.requiredFieldsList.get(i);
} catch (Exception e) {
}
if (rf!=null)
{
rf.reIndex(0);
requiredFields.merge(rf);
} else {
// need all fields
List<Pair<Integer, Integer>> l = new ArrayList<Pair<Integer, Integer>>();
for (int j=0;j<rlo.getSchema().size();j++)
l.add(new Pair<Integer, Integer>(0, j));
rf = new RequiredFields(l);
requiredFields.merge(rf);
break;
}
}
requiredInputFieldsList.add(requiredFields);
if (predecessors.get(0) instanceof LOForEach || predecessors.get(0) instanceof LOSplit)
cachedRequiredInfo.put((RelationalOperator)predecessors.get(0), new RequiredInfo(requiredInputFieldsList));
else
processNode(predecessors.get(0), new RequiredInfo(requiredInputFieldsList));
return;
}
// Initialize requiredInputFieldsList
List<RequiredFields> requiredInputFieldsList = new ArrayList<RequiredFields>();
for (int i=0;i<predecessors.size();i++)
requiredInputFieldsList.add(null);
// Map required output columns to required input columns.
// We also collect required output map keys into input map keys.
// Since Split has already been handled above, every remaining operator
// has only one element in requiredOutputInfo.requiredFieldsList, so we take
// the first element and process it
RequiredFields requiredOutputFields = requiredOutputInfo.requiredFieldsList.get(0);
// needAllFields means we require every individual output column and all map keys of that output.
// We convert needAllFields to individual fields here to facilitate further processing
if (requiredOutputFields.needAllFields())
{
List<Pair<Integer, Integer>> outputList = new ArrayList<Pair<Integer, Integer>>();
for (int j=0;j<rlo.getSchema().size();j++)
outputList.add(new Pair<Integer, Integer>(0, j));
requiredOutputFields = new RequiredFields(outputList);
for (int i=0;i<requiredOutputFields.size();i++)
requiredOutputFields.setMapKeysInfo(i, new MapKeysInfo(true));
}
if (requiredOutputFields.getFields()==null)
{
int errCode = 2184;
String msg = "Fields list inside RequiredFields is null.";
throw new OptimizerException(msg, errCode, PigException.BUG);
}
for (int i=0;i<requiredOutputFields.size();i++)
{
Pair<Integer, Integer> requiredOutputField = requiredOutputFields.getField(i);
MapKeysInfo outputMapKeysInfo = requiredOutputFields.getMapKeysInfo(i);
List<RequiredFields> relevantFieldsList = rlo.getRelevantInputs(requiredOutputField.first, requiredOutputField.second);
// We do not have any relevant input fields for this output, continue to next output
if (relevantFieldsList==null)
continue;
for (int j=0;j<relevantFieldsList.size();j++)
{
RequiredFields relevantFields = relevantFieldsList.get(j);
if (relevantFields!=null && relevantFields.needAllFields())
{
requiredInputFieldsList.set(j, new RequiredFields(true));
continue;
}
// Mapping output map keys to input map keys
if (rlo instanceof LOCogroup)
{
if (j!=0 && relevantFields!=null && !relevantFields.needAllFields())
{
for (Pair<Integer, Integer> pair : relevantFields.getFields())
relevantFields.setMapKeysInfo(pair.first, pair.second,
new MapKeysInfo(true));
}
}
else if (rlo instanceof LOForEach)
{
// Relay map keys from output to input
LogicalPlan forEachPlan = ((LOForEach)rlo).getRelevantPlan(requiredOutputField.second);
if (relevantFields.getFields()!=null && relevantFields.getFields().size()!=0)
{
int index = ((LOForEach)rlo).getForEachPlans().indexOf(forEachPlan);
// We check whether the field gets flattened; if it does, we do not relay output map keys to input map keys.
// There are two situations:
// 1. input column is a tuple, bag, or other simple type: there is no concept of map key, so we do not relay
// 2. input column is a map: flatten does not do anything, so we can still relay
boolean nonflatten = false;
if (!((LOForEach)rlo).getFlatten().get(index))
{
nonflatten = true;
}
else
{
// Foreach plan is flattened, check if there is only one input for this foreach plan
// and the type of that input column is not a bag (the check below compares against DataType.BAG); if so, it is a dummy flatten
if (forEachPlan.getRoots().size()==1 && forEachPlan.getRoots().get(0) instanceof LOProject)
{
LOProject loProj = (LOProject)forEachPlan.getRoots().get(0);
if (loProj.getExpression().getSchema()!=null &&
loProj.getExpression().getSchema().getField(loProj.getCol()).type!=DataType.BAG)
nonflatten = true;
}
}
if (nonflatten && outputMapKeysInfo!=null && isSimpleProjectCast(forEachPlan))
{
Pair<Integer, Integer> inputColumn = relevantFields.getFields().get(0);
relevantFields.setMapKeysInfo(inputColumn.first, inputColumn.second, outputMapKeysInfo);
}
}
// Collect required map keys in foreach plan here.
// This is the only logical operator that we collect map keys
// which are introduced by the operator here.
// For all other logical operators, it is attached to required fields
// of that logical operator, will process in required fields processing
// section
for (Pair<Integer, Integer> relevantField : relevantFields.getFields())
{
MapKeysInfo mapKeysInfo = getMapKeysInPlan(forEachPlan, relevantField.second);
relevantFields.mergeMapKeysInfo(0, relevantField.second, mapKeysInfo);
}
}
else
{
// For all other logical operators, we have one output column mapping to one or more input columns.
// We copy the output map keys from the output column to the corresponding input columns
if (relevantFields!=null && relevantFields.getFields()!=null && outputMapKeysInfo!=null)
{
for (Pair<Integer, Integer> pair : relevantFields.getFields())
relevantFields.setMapKeysInfo(pair.first, pair.second,
outputMapKeysInfo);
}
}
// Now we aggregate the input columns of this output column to the required input columns
if (requiredInputFieldsList.get(j)==null)
requiredInputFieldsList.set(j, relevantFields);
else
{
requiredInputFieldsList.get(j).merge(relevantFields);
}
}
}
// Merge with required input fields of this logical operator.
// RequiredInputFields come from two sources, one is mapping from required output to input,
// the other is from the operator itself. Here we use getRequiredFields to get the second part,
// and merge with the first part
List<RequiredFields> requiredFieldsListOfLOOp;
// For LOForEach, the required fields are all flattened fields. Even if a flattened field is not needed
// downstream, flattening it may expand the number of rows in the result, so flattened fields shall not be pruned.
// LOForEach.getRequiredFields does not give the required fields. "Required" means the field
// is required by all the outputs: the pipeline does not work correctly without that field.
// LOForEach.getRequiredFields gives all the input fields referred to in the LOForEach statement, but those
// fields can still be pruned (which means they are not required).
// E.g.:
// B = foreach A generate a0, a1, a2+a3;
// LOForEach.getRequiredFields gives (a0, a1, a2, a3);
// However, a2 and a3 can be pruned if we do not need a2+a3 from the LOForEach.
// So here we do not use LOForEach.getRequiredFields; instead, any flattened fields are the required fields
if (rlo instanceof LOForEach) {
List<Pair<Integer, Integer>> flattenedInputs = new ArrayList<Pair<Integer, Integer>>();
for (int i=0;i<rlo.getSchema().size();i++) {
if (((LOForEach)rlo).isInputFlattened(i)) {
flattenedInputs.add(new Pair<Integer, Integer>(0, i));
}
}
if (!flattenedInputs.isEmpty()) {
requiredFieldsListOfLOOp = new ArrayList<RequiredFields>();
requiredFieldsListOfLOOp.add(new RequiredFields(flattenedInputs));
}
else
requiredFieldsListOfLOOp = null;
}
// For LOCross/LOUnion, actually we do not require any field here
else if (rlo instanceof LOCross || rlo instanceof LOUnion)
requiredFieldsListOfLOOp = null;
else
requiredFieldsListOfLOOp = rlo.getRequiredFields();
if (requiredFieldsListOfLOOp!=null)
{
for (int i=0;i<requiredFieldsListOfLOOp.size();i++)
{
RequiredFields requiredFieldsOfLOOp = requiredFieldsListOfLOOp.get(i);
if (requiredInputFieldsList.get(i)==null)
requiredInputFieldsList.set(i, requiredFieldsOfLOOp);
else
{
requiredInputFieldsList.get(i).merge(requiredFieldsOfLOOp);
}
}
// Collect required map keys of this operator
// Cases are:
// 1. Single predecessor: LOFilter, LOSplitOutput, LOSort
// 2. Multiple predecessors: LOJoin
// 3. LOForEach does not have operator-wise required fields; we
// have already processed it
// 4. LOCogroup requires all map keys (even if we cogroup by a0#'k1', a0 itself will be in bag a
// and we have no way to figure out which keys are referenced for a0. So we do not process it and
// simply require all map keys)
// 5. Other operators do not have required fields, so there is nothing to process
if (rlo instanceof LOFilter || rlo instanceof LOSplitOutput || rlo instanceof LOSort)
{
List<LogicalPlan> innerPlans = new ArrayList<LogicalPlan>();
if (rlo instanceof LOFilter)
{
innerPlans.add(((LOFilter)rlo).getComparisonPlan());
}
else if (rlo instanceof LOSplitOutput)
{
innerPlans.add(((LOSplitOutput)rlo).getConditionPlan());
}
else if (rlo instanceof LOSort)
{
innerPlans.addAll(((LOSort)rlo).getSortColPlans());
}
for (LogicalPlan p : innerPlans)
{
for (RequiredFields rf : requiredFieldsListOfLOOp)
{
if (rf.getFields()==null)
continue;
for (Pair<Integer, Integer> pair : rf.getFields())
{
MapKeysInfo mapKeysInfo = getMapKeysInPlan(p, pair.second);
if (mapKeysInfo!=null && !mapKeysInfo.needAllKeys() && mapKeysInfo.getKeys()!=null)
requiredInputFieldsList.get(0).mergeMapKeysInfo(0, pair.second,
mapKeysInfo);
}
}
}
}
else if (rlo instanceof LOJoin)
{
for (int i=0;i<predecessors.size();i++)
{
Collection<LogicalPlan> joinPlans = ((LOJoin)rlo).getJoinPlans().get(predecessors.get(i));
if (joinPlans==null)
continue;
for (LogicalPlan p : joinPlans)
{
RequiredFields rf = requiredFieldsListOfLOOp.get(i);
if (rf.getFields()==null)
continue;
for (Pair<Integer, Integer> pair : rf.getFields())
{
MapKeysInfo mapKeysInfo = getMapKeysInPlan(p, pair.second);
if (mapKeysInfo!=null && !mapKeysInfo.needAllKeys() && mapKeysInfo.getKeys()!=null)
requiredInputFieldsList.get(i).mergeMapKeysInfo(i, pair.second,
mapKeysInfo);
}
}
}
}
}
// Now that we have finished the current logical operator, we need to process the next logical operator. There are two cases:
// 1. If the predecessor is LOForEach or LOSplit, we put requiredOutputFieldsList into the cache and exit; the optimizer
// will invoke transform() on LOForEach or LOSplit and continue to process
// 2. Otherwise, we recursively collect required fields for the predecessor
for (int i=0;i<predecessors.size();i++)
{
RelationalOperator predecessor = (RelationalOperator)predecessors.get(i);
List<RequiredFields> newRequiredOutputFieldsList = new ArrayList<RequiredFields>();
// In this optimization, we only prune columns and do not change structure of logical plan
// So if we do not require anything from the input, we change it to require the first field