/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.pig.impl.logicalLayer.optimizer;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.pig.impl.logicalLayer.CastFinder;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.logicalLayer.LOCross;
import org.apache.pig.impl.logicalLayer.LOForEach;
import org.apache.pig.impl.logicalLayer.LOJoin;
import org.apache.pig.impl.logicalLayer.LOProject;
import org.apache.pig.impl.logicalLayer.LOSort;
import org.apache.pig.impl.logicalLayer.LogicalOperator;
import org.apache.pig.impl.logicalLayer.LogicalPlan;
import org.apache.pig.impl.logicalLayer.UDFFinder;
import org.apache.pig.impl.logicalLayer.schema.Schema;
import org.apache.pig.impl.plan.DepthFirstWalker;
import org.apache.pig.impl.plan.OperatorKey;
import org.apache.pig.impl.plan.ProjectionMap;
import org.apache.pig.impl.plan.RequiredFields;
import org.apache.pig.impl.plan.OperatorPlan.IndexHelper;
import org.apache.pig.impl.plan.optimizer.OptimizerException;
import org.apache.pig.PigException;
import org.apache.pig.impl.util.MultiMap;
import org.apache.pig.impl.util.Pair;
/**
* A visitor to discover if a foreach with flatten(s) can be pushed as low down the tree as
* possible.
*/
public class PushDownForeachFlatten extends LogicalTransformer {
// boolean to remember if the foreach has to be swapped
private boolean mSwap = false;
// boolean to remember if the foreach has to be cloned and pushed into one
// of the foreach's successor's outputs
private boolean mInsertBetween = false;
// map of flattened column to its new position in the output
Map<Integer, Integer> mFlattenedColumnReMap = null;
public PushDownForeachFlatten(LogicalPlan plan) {
super(plan, new DepthFirstWalker<LogicalOperator, LogicalPlan>(plan));
}
/**
*
* @return true if the foreach has to swapped; false otherwise
*/
public boolean getSwap() {
return mSwap;
}
/**
*
* @return true if the foreach has to be inserted after its successor; false
* otherwise
*/
public boolean getInsertBetween() {
return mInsertBetween;
}
/**
*
* @return a map of old column position in the foreach to the column
* position in foreach's successor
*/
public Map<Integer, Integer> getFlattenedColumnMap() {
return mFlattenedColumnReMap;
}
@Override
public boolean check(List<LogicalOperator> nodes) throws OptimizerException {
try {
LOForEach foreach = (LOForEach) getOperator(nodes);
Pair<Boolean, List<Integer>> flattenResult = foreach.hasFlatten();
boolean flattened = flattenResult.first;
List<Integer> flattenedColumns = flattenResult.second;
Set<Integer> flattenedColumnSet = (flattenedColumns == null? null: new HashSet<Integer>(flattenedColumns));
if(!flattened) {
return false;
}
if(flattenedColumns == null || flattenedColumns.size() == 0) {
return false;
}
ProjectionMap foreachProjectionMap = foreach.getProjectionMap();
if(foreachProjectionMap == null) {
return false;
}
List<Integer> foreachAddedFields = foreachProjectionMap.getAddedFields();
if(foreachAddedFields != null) {
Set<Integer> foreachAddedFieldsSet = new HashSet<Integer>(foreachAddedFields);
flattenedColumnSet.removeAll(foreachAddedFieldsSet);
}
if(flattenedColumnSet.size() == 0) {
return false;
}
for(LogicalPlan foreachPlan: foreach.getForEachPlans()) {
UDFFinder udfFinder = new UDFFinder(foreachPlan);
udfFinder.visit();
// if any of the foreach's inner plans contain a UDF then return false
if (udfFinder.foundAnyUDF()) {
return false;
}
CastFinder castFinder = new CastFinder(foreachPlan);
castFinder.visit();
// TODO
// if any of the foreach's inner plans contain a cast then return false
// in the future the cast should be moved appropriately
if (castFinder.foundAnyCast()) {
return false;
}
}
List<LogicalOperator> successors = (mPlan.getSuccessors(foreach) == null ? null
: new ArrayList<LogicalOperator>(mPlan
.getSuccessors(foreach)));
// if the foreach has no successors or more than one successor
// return false
if (successors == null || successors.size() == 0 || successors.size() > 1) {
return false;
}
LogicalOperator successor = successors.get(0);
List<LogicalOperator> peers = (mPlan.getPredecessors(successor) == null ? null
: new ArrayList<LogicalOperator>(mPlan.getPredecessors(successor)));
// check if any of the foreach's peers is a foreach flatten
// if so then this rule does not apply
for(LogicalOperator peer: peers) {
if(!peer.equals(foreach)) {
if(peer instanceof LOForEach) {
LOForEach peerForeach = (LOForEach)peer;
if(peerForeach.hasFlatten().first) {
return false;
}
}
}
}
IndexHelper indexHelper = new IndexHelper(peers);
Integer foreachPosition = indexHelper.getIndex(foreach);
// the foreach with flatten can be swapped with an order by
// as the order by will have lesser number of records to sort
// also the sort does not alter the records that are processed
// the foreach with flatten can be pushed down a cross or a join
// for the same reason. In this case the foreach has to be first
// unflattened and then a new foreach has to be inserted after
// the cross or join. In both cross and foreach the actual columns
// from the foreach are not altered but positions might be changed
// in the case of union the column is transformed and as a result
// the foreach flatten cannot be pushed down
// for distinct the output before flattening and the output
// after flattening might be different. For example, consider
// {(1), (1)}. Distinct of this bag is still {(1), (1)}.
// distinct(flatten({(1), (1)})) is (1). However,
// flatten(distinct({(1), (1)})) is (1), (1)
// in both cases correctness is not affected
if(successor instanceof LOSort) {
LOSort sort = (LOSort) successor;
RequiredFields sortRequiredField = sort.getRequiredFields().get(0);
if(sortRequiredField.getNeedAllFields()) {
return false;
}
List<Pair<Integer, Integer>> sortInputs = sortRequiredField.getFields();
Set<Integer> requiredInputs = new HashSet<Integer>();
for(Pair<Integer, Integer> pair: sortInputs) {
requiredInputs.add(pair.second);
}
requiredInputs.retainAll(flattenedColumnSet);
// the intersection of the sort's required inputs
// and the flattened columns in the foreach should
// be null, i.e., the size of required inputs == 0
if(requiredInputs.size() != 0) {
return false;
}
mSwap = true;
return true;
} else if (successor instanceof LOCross
|| successor instanceof LOJoin) {
List<LogicalOperator> children = mPlan.getSuccessors(successor);
if(children == null || children.size() > 1) {
return false;
}
ProjectionMap succProjectionMap = successor.getProjectionMap();
if(succProjectionMap == null) {
return false;
}
MultiMap<Integer, ProjectionMap.Column> mappedFields = succProjectionMap.getMappedFields();
if(mappedFields == null) {
return false;
}
if(mFlattenedColumnReMap == null) {
mFlattenedColumnReMap = new HashMap<Integer, Integer>();
}
// initialize the map
for(Integer key: flattenedColumnSet) {
mFlattenedColumnReMap.put(key, Integer.MAX_VALUE);
}
// for each output column find the corresponding input that matches the foreach's position
// for each input column in the foreach check if the output column is a mapping of the flattened column
// due to flattenning multiple output columns could be generated from the same input column
// find the first or the lowest column that is a result of the
for(Integer key: mappedFields.keySet()) {
List<ProjectionMap.Column> columns = (List<ProjectionMap.Column>)mappedFields.get(key);
for(ProjectionMap.Column column: columns) {
Pair<Integer, Integer> inputColumn = column.getInputColumn();
// check if the input column number is the same as the
// position of foreach in the list of predecessors
if(foreachPosition.equals(inputColumn.first)) {
if(flattenedColumnSet.contains(inputColumn.second)) {
// check if the output column, i.e., key is the
// least column number seen till date
if(key < mFlattenedColumnReMap.get(inputColumn.second)) {
mFlattenedColumnReMap.put(inputColumn.second, key);
}
}
}
}
}
// check if any of the flattened columns is not remapped
for(Integer key: mFlattenedColumnReMap.keySet()) {
if(mFlattenedColumnReMap.get(key).equals(Integer.MAX_VALUE)) {
return false;
}
}
mInsertBetween = true;
return true;
}
return false;
} catch (OptimizerException oe) {
throw oe;
} catch (Exception e) {
int errCode = 2152;
String msg = "Internal error while trying to check if foreach with flatten can be pushed down.";
throw new OptimizerException(msg, errCode, PigException.BUG, e);
}
}
private LogicalOperator getOperator(List<LogicalOperator> nodes)
throws FrontendException {
if ((nodes == null) || (nodes.size() <= 0)) {
int errCode = 2052;
String msg = "Internal error. Cannot retrieve operator from null or empty list.";
throw new OptimizerException(msg, errCode, PigException.BUG);
}
LogicalOperator lo = nodes.get(0);
if (lo == null || !(lo instanceof LOForEach)) {
// we should never be called with any other operator class name
int errCode = 2005;
String msg = "Expected " + LOForEach.class.getSimpleName()
+ ", got "
+ (lo == null ? lo : lo.getClass().getSimpleName());
throw new OptimizerException(msg, errCode, PigException.INPUT);
} else {
return lo;
}
}
@Override
public void transform(List<LogicalOperator> nodes)
throws OptimizerException {
try {
LOForEach foreach = (LOForEach) getOperator(nodes);
LogicalOperator successor = mPlan.getSuccessors(foreach).get(0);
if (mSwap) {
mPlan.swap(successor, foreach);
} else if (mInsertBetween) {
// mark the flattened columns as not flattened in the foreach
// create a new foreach operator that projects each column of the
// successor. Mark the remapped flattened columns as flattened
// in the new foreach operator
if(mFlattenedColumnReMap == null) {
int errCode = 2153;
String msg = "Internal error. The mapping for the flattened columns is empty";
throw new OptimizerException(msg, errCode, PigException.BUG);
}
// set flatten to false for all columns in the mapping
ArrayList<Boolean> flattenList = (ArrayList<Boolean>)foreach.getFlatten();
for(Integer key: mFlattenedColumnReMap.keySet()) {
flattenList.set(key, false);
}
// rebuild schemas of the foreach and the successor after the foreach modification
foreach.regenerateSchema();
successor.regenerateSchema();
Schema successorSchema = successor.getSchema();
if(successorSchema == null) {
int errCode = 2154;
String msg = "Internal error. Schema of successor cannot be null for pushing down foreach with flatten.";
throw new OptimizerException(msg, errCode, PigException.BUG);
}
flattenList = new ArrayList<Boolean>();
ArrayList<LogicalPlan> foreachInnerPlans = new ArrayList<LogicalPlan>();
for(int i = 0; i < successorSchema.size(); ++i) {
LogicalPlan innerPlan = new LogicalPlan();
LOProject project = new LOProject(innerPlan, OperatorKey
.genOpKey(foreach.getOperatorKey().scope),
successor, i);
innerPlan.add(project);
foreachInnerPlans.add(innerPlan);
flattenList.add(false);
}
// set the flattened remapped column to true
for(Integer key: mFlattenedColumnReMap.keySet()) {
Integer value = mFlattenedColumnReMap.get(key);
flattenList.set(value, true);
}
LOForEach newForeach = new LOForEach(mPlan, OperatorKey
.genOpKey(foreach.getOperatorKey().scope), foreachInnerPlans,
flattenList);
// add the new foreach to the plan
mPlan.add(newForeach);
// insert the new foreach between the successor and the successor's successor
mPlan.insertBetween(successor, newForeach, mPlan.getSuccessors(successor).get(0));
}
} catch (OptimizerException oe) {
throw oe;
} catch (Exception e) {
int errCode = 2155;
String msg = "Internal error while pushing foreach with flatten down.";
throw new OptimizerException(msg, errCode, PigException.BUG, e);
}
}
@Override
public void reset() {
mInsertBetween = false;
mSwap = false;
mFlattenedColumnReMap = null;
}
}