/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.pig.impl.logicalLayer;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.HashMap;
import java.util.Set;
import java.util.Iterator;
import org.apache.pig.PigException;
import org.apache.pig.impl.logicalLayer.schema.Schema;
import org.apache.pig.impl.logicalLayer.schema.SchemaMergeException;
import org.apache.pig.impl.logicalLayer.optimizer.SchemaRemover;
import org.apache.pig.impl.plan.OperatorKey;
import org.apache.pig.impl.plan.ProjectionMap;
import org.apache.pig.impl.plan.RequiredFields;
import org.apache.pig.impl.plan.VisitorException;
import org.apache.pig.impl.util.MultiMap;
import org.apache.pig.impl.util.Pair;
import org.apache.pig.data.DataType;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
public class LOForEach extends LogicalOperator {
// Serialization version identifier for this operator class.
private static final long serialVersionUID = 2L;
/**
* The foreach operator supports nested query plans. At this point it's one
* level of nesting. Foreach can have a list of operators that need to be
* applied over the input.
*/
private ArrayList<LogicalPlan> mForEachPlans;
// One flag per nested plan, read by index in parallel with mForEachPlans
// (see getSchema() and getProjectionMap()); true means the output of that
// plan is flattened.
private ArrayList<Boolean> mFlatten;
// Optional user-declared schemas, read by index in parallel with
// mForEachPlans in getSchema(). Stays null unless supplied through the
// five-argument constructor or setUserDefinedSchema().
private ArrayList<Schema> mUserDefinedSchema = null;
// Class-wide logger; used for the debug traces emitted by getSchema().
private static Log log = LogFactory.getLog(LOForEach.class);
/**
 * Builds a foreach operator without user-defined schemas.
 *
 * @param plan
 *            Logical plan this operator is a part of.
 * @param k
 *            Operator key to assign to this node.
 * @param foreachPlans
 *            the list of plans that are applied for each input
 * @param flattenList
 *            boolean list that tells which elements of the foreach
 *            projection should be flattened.
 */
public LOForEach(
        LogicalPlan plan,
        OperatorKey k,
        ArrayList<LogicalPlan> foreachPlans,
        ArrayList<Boolean> flattenList) {
    super(plan, k);
    // The lists are stored by reference, not copied.
    this.mFlatten = flattenList;
    this.mForEachPlans = foreachPlans;
}
/**
 * Builds a foreach operator that also carries user-defined schemas.
 *
 * @param plan
 *            Logical plan this operator is a part of.
 * @param k
 *            Operator key to assign to this node.
 * @param foreachPlans
 *            the list of plans that are applied for each input
 * @param flattenList
 *            boolean list that tells which elements of the foreach
 *            projection should be flattened.
 * @param userDefinedSchemaList
 *            user-declared schemas; {@link #getSchema()} looks them up by
 *            the index of the corresponding nested plan.
 */
public LOForEach(
        LogicalPlan plan,
        OperatorKey k,
        ArrayList<LogicalPlan> foreachPlans,
        ArrayList<Boolean> flattenList,
        ArrayList<Schema> userDefinedSchemaList) {
    super(plan, k);
    // The lists are stored by reference, not copied.
    this.mUserDefinedSchema = userDefinedSchemaList;
    this.mFlatten = flattenList;
    this.mForEachPlans = foreachPlans;
}
/**
 * @return the nested plans applied for each input; this is the internal
 *         list itself, not a copy.
 */
public ArrayList<LogicalPlan> getForEachPlans() {
    return this.mForEachPlans;
}
/**
 * Replaces the nested plans of this operator.
 *
 * @param foreachPlans
 *            the new list of plans; stored by reference.
 */
public void setForEachPlans(ArrayList<LogicalPlan> foreachPlans) {
    this.mForEachPlans = foreachPlans;
}
/**
 * @return the flatten flags, one per nested plan; this is the internal
 *         list itself, not a copy.
 */
public List<Boolean> getFlatten() {
    return this.mFlatten;
}
/**
 * Replaces the flatten flags of this operator.
 *
 * @param flattenList
 *            the new list of flags; stored by reference.
 */
public void setFlatten(ArrayList<Boolean> flattenList) {
    this.mFlatten = flattenList;
}
/**
 * @return the user-defined schemas, or null when none were supplied;
 *         this is the internal list itself, not a copy.
 */
public List<Schema> getUserDefinedSchema() {
    return this.mUserDefinedSchema;
}
/**
 * Replaces the user-defined schemas of this operator.
 *
 * @param userDefinedSchema
 *            the new list of schemas; stored by reference.
 */
public void setUserDefinedSchema(ArrayList<Schema> userDefinedSchema) {
    this.mUserDefinedSchema = userDefinedSchema;
}
/**
 * @return a display name of the form {@code "ForEach <scope>-<id>"} built
 *         from this operator's key.
 */
@Override
public String name() {
    StringBuilder label = new StringBuilder("ForEach ");
    label.append(mKey.scope);
    label.append("-");
    label.append(mKey.id);
    return label.toString();
}
/**
 * @return always false for this operator.
 */
@Override
public boolean supportsMultipleInputs() {
    return false;
}
/**
 * Dispatches this operator to the given visitor.
 *
 * @param v
 *            the visitor whose {@code visit(LOForEach)} is invoked.
 * @throws VisitorException
 *             if the visitor reports a failure.
 */
@Override
public void visit(LOVisitor v) throws VisitorException {
    v.visit(this);
}
/**
 * @return {@link DataType#BAG}, unconditionally.
 */
public byte getType() {
    return DataType.BAG;
}
/**
 * Increments the occurrence counter kept for an alias, starting at 1 the
 * first time the alias is seen. Does nothing when either argument is null.
 *
 * @param aliases
 *            alias-to-count map that is updated in place.
 * @param alias
 *            the alias to count.
 */
private void updateAliasCount(Map<String, Integer> aliases, String alias) {
    if (aliases == null || alias == null) {
        return;
    }
    Integer seenSoFar = aliases.get(alias);
    int updated = (seenSoFar == null) ? 1 : seenSoFar + 1;
    aliases.put(alias, updated);
}
/**
* Computes (and caches) the output schema of this foreach from the leaf
* expression of every nested plan, the flatten flags and the optional
* user-defined schemas.
*
* The result is cached through mIsSchemaComputed / mSchema; once computed,
* later calls return the cached value.
*
* @return the output schema, or null when it cannot be determined: a leaf
*         is a project(*) of type tuple, or a leaf has no field schema and
*         no user-defined schema was given for it.
* @throws FrontendException
*             on a malformed bag schema while flattening (error codes 1008,
*             1009), on a user-defined schema that cannot be merged (1016,
*             1017), or on duplicate column names (1007).
*/
@Override
public Schema getSchema() throws FrontendException {
log.debug("Entering getSchema");
if (!mIsSchemaComputed) {
List<Schema.FieldSchema> fss = new ArrayList<Schema.FieldSchema>(
mForEachPlans.size());
// Debug-only pass: log the leaves of every nested plan.
for (LogicalPlan plan : mForEachPlans) {
log.debug("Number of leaves in " + plan + " = " + plan.getLeaves().size());
for(int i = 0; i < plan.getLeaves().size(); ++i) {
log.debug("Leaf" + i + "= " + plan.getLeaves().get(i));
}
//LogicalOperator op = plan.getRoots().get(0);
LogicalOperator op = plan.getLeaves().get(0);
log.debug("op: " + op.getClass().getName() + " " + op);
}
log.debug("Printed the leaves of the generate plans");
// flattenAlias: field schema produced by a flatten -> its inner (unprefixed) alias
// inverseFlattenAlias: inner aliases that were produced by a flatten
// aliases: number of times each alias occurs in the output
Map<Schema.FieldSchema, String> flattenAlias = new HashMap<Schema.FieldSchema, String>();
Map<String, Boolean> inverseFlattenAlias = new HashMap<String, Boolean>();
Map<String, Integer> aliases = new HashMap<String, Integer>();
for (int planCtr = 0; planCtr < mForEachPlans.size(); ++planCtr) {
LogicalPlan plan = mForEachPlans.get(planCtr);
// Only the first leaf of each nested plan is considered.
LogicalOperator op = plan.getLeaves().get(0);
log.debug("op: " + op.getClass().getName() + " " + op);
log.debug("Flatten: " + mFlatten.get(planCtr));
Schema.FieldSchema planFs;
if(op instanceof LOProject) {
//the check for the type is required for statements like
//foreach cogroup {
// a1 = order a by *;
// generate a1;
//}
//In the above script, the generate a1, will translate to
//project(a1) -> project(*) and will not be translated to a sequence of projects
//As a result the project(*) will remain but the return type is a bag
//project(*) with a data type set to tuple indicates a project(*) from an input
//that has no schema
if( (((LOProject)op).isStar() ) && (((LOProject)op).getType() == DataType.TUPLE) ) {
mSchema = null;
mIsSchemaComputed = true;
return mSchema;
}
}
try {
planFs = ((ExpressionOperator)op).getFieldSchema();
log.debug("planFs: " + planFs);
Schema userDefinedSchema = null;
if(null != mUserDefinedSchema) {
userDefinedSchema = mUserDefinedSchema.get(planCtr);
}
if(null != planFs) {
// Prefer the operator's own alias; fall back to the alias of its field schema.
String outerCanonicalAlias = op.getAlias();
if(null == outerCanonicalAlias) {
outerCanonicalAlias = planFs.alias;
}
log.debug("Outer canonical alias: " + outerCanonicalAlias);
if(mFlatten.get(planCtr)) {
//need to extract the children and create the aliases
//assumption here is that flatten is only for one column
//i.e., flatten(A), flatten(A.x) and NOT
//flatten(B.(x,y,z))
Schema s = planFs.schema;
if(null != s && s.isTwoLevelAccessRequired()) {
// this is the case where the schema is that of
// a bag which has just one tuple fieldschema which
// in turn has a list of fieldschemas. The schema
// after flattening would consist of the fieldSchemas
// present in the tuple
// check that indeed we only have one field schema
// which is that of a tuple
if(s.getFields().size() != 1) {
int errCode = 1008;
String msg = "Expected a bag schema with a single " +
"element of type "+ DataType.findTypeName(DataType.TUPLE) +
" but got a bag schema with multiple elements.";
throw new FrontendException(msg, errCode, PigException.INPUT, false, null);
}
Schema.FieldSchema tupleFS = s.getField(0);
if(tupleFS.type != DataType.TUPLE) {
int errCode = 1009;
String msg = "Expected a bag schema with a single " +
"element of type "+ DataType.findTypeName(DataType.TUPLE) +
" but got an element of type " +
DataType.findTypeName(tupleFS.type);
throw new FrontendException(msg, errCode, PigException.INPUT, false, null);
}
s = tupleFS.schema;
}
if(null != s) {
// One output field per field of the (unwrapped) inner schema.
for(int i = 0; i < s.size(); ++i) {
Schema.FieldSchema fs;
fs = new Schema.FieldSchema(s.getField(i));
fs.setParent(s.getField(i).canonicalName, op);
log.debug("fs: " + fs);
if(null != userDefinedSchema) {
Schema.FieldSchema userDefinedFieldSchema;
try {
// Only the leading fields covered by the user-defined schema are merged.
if(i < userDefinedSchema.size()) {
userDefinedFieldSchema = userDefinedSchema.getField(i);
fs = fs.mergePrefixFieldSchema(userDefinedFieldSchema);
}
} catch (SchemaMergeException sme) {
int errCode = 1016;
String msg = "Problems in merging user defined schema";
throw new FrontendException(msg, errCode, PigException.INPUT, false, null, sme);
}
// With a user-defined schema present no "outer::inner" prefix is generated
// (the null outer alias makes the check below take the else branch).
outerCanonicalAlias = null;
}
String innerCanonicalAlias = fs.alias;
Schema.FieldSchema newFs;
if((null != outerCanonicalAlias) && (null != innerCanonicalAlias)) {
String disambiguatorAlias = outerCanonicalAlias + "::" + innerCanonicalAlias;
newFs = new Schema.FieldSchema(disambiguatorAlias, fs.schema, fs.type);
newFs.setParent(s.getField(i).canonicalName, op);
fss.add(newFs);
updateAliasCount(aliases, disambiguatorAlias);
//it's fine if there are duplicates
//we just need to record if its due to
//flattening
} else {
newFs = new Schema.FieldSchema(fs);
newFs.setParent(s.getField(i).canonicalName, op);
fss.add(newFs);
}
// The inner alias is counted as well and remembered as coming from a flatten.
updateAliasCount(aliases, innerCanonicalAlias);
flattenAlias.put(newFs, innerCanonicalAlias);
inverseFlattenAlias.put(innerCanonicalAlias, true);
}
} else {
// Flatten requested but there is no inner schema to expand.
Schema.FieldSchema newFs;
if(null != userDefinedSchema) {
if(!DataType.isSchemaType(planFs.type)) {
if(userDefinedSchema.size() > 1) {
int errCode = 1017;
String msg = "Schema mismatch. A basic type on flattening cannot have more than one column. User defined schema: " + userDefinedSchema;
throw new FrontendException(msg, errCode, PigException.INPUT, false, null);
}
newFs = new Schema.FieldSchema(null, planFs.type);
try {
newFs = newFs.mergePrefixFieldSchema(userDefinedSchema.getField(0));
} catch (SchemaMergeException sme) {
int errCode = 1016;
String msg = "Problems in merging user defined schema";
throw new FrontendException(msg, errCode, PigException.INPUT, false, null, sme);
}
updateAliasCount(aliases, newFs.alias);
fss.add(newFs);
newFs.setParent(null, op);
} else {
// Schema type without an inner schema: the user-defined fields become the
// output fields, after setFieldSchemaDefaultType is applied with BYTEARRAY.
for(Schema.FieldSchema ufs: userDefinedSchema.getFields()) {
Schema.FieldSchema.setFieldSchemaDefaultType(ufs, DataType.BYTEARRAY);
newFs = new Schema.FieldSchema(ufs);
fss.add(newFs);
newFs.setParent(null, op);
updateAliasCount(aliases, ufs.alias);
}
}
} else {
if(!DataType.isSchemaType(planFs.type)) {
newFs = new Schema.FieldSchema(planFs.alias, planFs.type);
} else {
// Schema type with unknown contents: a single unnamed bytearray column.
newFs = new Schema.FieldSchema(null, DataType.BYTEARRAY);
}
fss.add(newFs);
newFs.setParent(null, op);
}
}
} else {
//just populate the schema with the field schema of the expression operator
//check if the user has defined a schema for the operator; compare the schema
//with that of the expression operator field schema and then add it to the list
Schema.FieldSchema newFs = new Schema.FieldSchema(planFs);
if(null != userDefinedSchema) {
try {
newFs = newFs.mergePrefixFieldSchema(userDefinedSchema.getField(0));
updateAliasCount(aliases, newFs.alias);
} catch (SchemaMergeException sme) {
int errCode = 1016;
String msg = "Problems in merging user defined schema";
throw new FrontendException(msg, errCode, PigException.INPUT, false, null, sme);
}
}
newFs.setParent(planFs.canonicalName, op);
fss.add(newFs);
}
} else {
//did not get a valid list of field schemas
// NOTE(review): outerCanonicalAlias is assigned here but never read in this branch.
String outerCanonicalAlias = null;
if(null != userDefinedSchema) {
Schema.FieldSchema userDefinedFieldSchema = new Schema.FieldSchema(userDefinedSchema.getField(0));
fss.add(userDefinedFieldSchema);
userDefinedFieldSchema.setParent(null, op);
updateAliasCount(aliases, userDefinedFieldSchema.alias);
} else {
mSchema = null;
mIsSchemaComputed = true;
return mSchema;
}
}
} catch (FrontendException fee) {
// Leave the operator marked as "schema not computed" before propagating the error.
mSchema = null;
mIsSchemaComputed = false;
throw fee;
}
}
//check for duplicate column names and throw an error if there are duplicates
//ensure that flatten gets rid of duplicate column names when the checks are
//being done
log.debug(" flattenAlias: " + flattenAlias);
log.debug(" inverseFlattenAlias: " + inverseFlattenAlias);
log.debug(" aliases: " + aliases);
log.debug(" fss.size: " + fss.size());
boolean duplicates = false;
Map<String, Integer> duplicateAliases = new HashMap<String, Integer>();
for(String alias: aliases.keySet()) {
Integer count = aliases.get(alias);
if(count > 1) {//not checking for null here as counts are initialized to 1
Boolean inFlatten = false;
log.debug("inFlatten: " + inFlatten + " inverseFlattenAlias: " + inverseFlattenAlias);
inFlatten = inverseFlattenAlias.get(alias);
log.debug("inFlatten: " + inFlatten + " inverseFlattenAlias: " + inverseFlattenAlias);
// A repeated alias is an error only when it was not produced by a flatten.
if((null == inFlatten) || (!inFlatten)) {
duplicates = true;
duplicateAliases.put(alias, count);
}
}
}
if(duplicates) {
// Build "alias: N columns, alias: N columns, ..." for the error message.
String errMessage = "Found duplicates in schema. ";
if(duplicateAliases.size() > 0) {
Set<String> duplicateCols = duplicateAliases.keySet();
Iterator<String> iter = duplicateCols.iterator();
String col = iter.next();
errMessage += col + ": " + duplicateAliases.get(col) + " columns";
while(iter.hasNext()) {
col = iter.next();
errMessage += ", " + col + ": " + duplicateAliases.get(col) + " columns";
}
}
errMessage += ". Please alias the columns with unique names.";
log.debug(errMessage);
int errCode = 1007;
throw new FrontendException(errMessage, errCode, PigException.INPUT, false, null);
}
mSchema = new Schema(fss);
//add the aliases that are unique after flattening
for(Schema.FieldSchema fs: mSchema.getFields()) {
String alias = flattenAlias.get(fs);
Integer count = aliases.get(alias);
if (null == count) count = 1;
log.debug("alias: " + alias);
// Register the short (inner) alias only when it occurs exactly once.
if((null != alias) && (count == 1)) {
mSchema.addAlias(alias, fs);
}
}
mIsSchemaComputed = true;
}
log.debug("Exiting getSchema");
return mSchema;
}
/**
 * Runs a {@link SchemaRemover} over every nested plan and then delegates
 * to the superclass implementation.
 *
 * @throws VisitorException
 *             if a schema remover or the superclass call fails.
 */
public void unsetSchema() throws VisitorException {
    for (LogicalPlan nestedPlan : mForEachPlans) {
        new SchemaRemover(nestedPlan).visit();
    }
    super.unsetSchema();
}
/**
 * Creates a copy of this operator whose flatten flags, nested plans and
 * user-defined schemas are fresh lists holding copied elements.
 *
 * Do not use the clone method directly. Operators are cloned when logical
 * plans are cloned using {@link LogicalPlanCloner}.
 *
 * @see org.apache.pig.impl.plan.Operator#clone()
 * @return the cloned operator.
 * @throws CloneNotSupportedException
 *             if the superclass, a nested plan or a schema cannot be cloned.
 */
@Override
protected Object clone() throws CloneNotSupportedException {
    // Do generic LogicalOperator cloning
    LOForEach forEachClone = (LOForEach) super.clone();

    // create deep copies of attributes specific to foreach
    if (mFlatten != null) {
        forEachClone.mFlatten = new ArrayList<Boolean>(mFlatten.size());
        for (Boolean flatten : mFlatten) {
            // Boolean.valueOf reuses the canonical TRUE/FALSE instances
            // instead of allocating through the deprecated constructor.
            forEachClone.mFlatten.add(Boolean.valueOf(flatten.booleanValue()));
        }
    }

    if (mForEachPlans != null) {
        forEachClone.mForEachPlans = new ArrayList<LogicalPlan>(mForEachPlans.size());
        for (LogicalPlan nestedPlan : mForEachPlans) {
            LogicalPlanCloneHelper lpCloneHelper = new LogicalPlanCloneHelper(nestedPlan);
            forEachClone.mForEachPlans.add(lpCloneHelper.getClonedPlan());
        }
    }

    if (mUserDefinedSchema != null) {
        forEachClone.mUserDefinedSchema = new ArrayList<Schema>(mUserDefinedSchema.size());
        for (Schema s : mUserDefinedSchema) {
            // null entries (no user schema for that plan) are preserved
            forEachClone.mUserDefinedSchema.add(s != null ? s.clone() : null);
        }
    }
    return forEachClone;
}
/**
 * Describes how the output columns of this foreach relate to the columns
 * of its single input.
 *
 * Every nested plan contributes one output column, or, when it is
 * flattened and its inner schema is known, one output column per field of
 * that inner schema. A column is "mapped" to an input column when the
 * nested plan is a plain chain of projects and the input schema is known;
 * otherwise it is reported as an added column. Input columns that no
 * output column maps to are reported as removed.
 *
 * @return the projection map, or null when it cannot be computed: the
 *         output schema is null or fails to compute, the operator has no
 *         predecessor, the predecessor schema fails to compute, a nested
 *         plan does not have exactly one leaf, a leaf has no field schema,
 *         or a flattened bag schema is malformed.
 */
@Override
public ProjectionMap getProjectionMap() {
    Schema outputSchema;
    try {
        outputSchema = getSchema();
    } catch (FrontendException fee) {
        return null;
    }
    if (outputSchema == null) {
        return null;
    }

    // Cast to the interface type only; the concrete list class is irrelevant here.
    List<LogicalOperator> predecessors = (List<LogicalOperator>) mPlan.getPredecessors(this);
    if (predecessors == null || predecessors.isEmpty()) {
        return null;
    }
    LogicalOperator predecessor = predecessors.get(0);

    Schema inputSchema;
    try {
        inputSchema = predecessor.getSchema();
    } catch (FrontendException fee) {
        return null;
    }

    List<LogicalPlan> foreachPlans = getForEachPlans();
    List<Boolean> flattenList = getFlatten();

    MultiMap<Integer, Pair<Integer, Integer>> mapFields =
            new MultiMap<Integer, Pair<Integer, Integer>>();
    List<Integer> addedFields = new ArrayList<Integer>();
    int outputColumn = 0;

    for (int i = 0; i < foreachPlans.size(); ++i) {
        LogicalPlan foreachPlan = foreachPlans.get(i);
        List<LogicalOperator> leaves = foreachPlan.getLeaves();
        // Exactly one leaf is required; an empty list would otherwise make
        // leaves.get(0) throw.
        if (leaves == null || leaves.size() != 1) {
            return null;
        }
        LogicalOperator leaf = leaves.get(0);

        int inputColumn = -1;
        boolean mapped = false;
        if (leaf instanceof LOProject) {
            //find out if this project is a chain of projects
            if (LogicalPlan.chainOfProjects(foreachPlan)) {
                LOProject rootProject = (LOProject) foreachPlan.getRoots().get(0);
                inputColumn = rootProject.getCol();
                if (inputSchema != null) {
                    mapped = true;
                }
            }
        }

        Schema.FieldSchema leafFS;
        try {
            leafFS = ((ExpressionOperator) leaf).getFieldSchema();
        } catch (FrontendException fee) {
            return null;
        }
        if (leafFS == null) {
            return null;
        }

        // Number of output columns this nested plan contributes.
        int producedColumns = 1;
        if (flattenList.get(i)) {
            Schema innerSchema = leafFS.schema;
            if (innerSchema != null && innerSchema.isTwoLevelAccessRequired()) {
                // this is the case where the schema is that of
                // a bag which has just one tuple fieldschema which
                // in turn has a list of fieldschemas. The schema
                // after flattening would consist of the fieldSchemas
                // present in the tuple

                // check that indeed we only have one field schema
                // which is that of a tuple
                if (innerSchema.getFields().size() != 1) {
                    return null;
                }
                Schema.FieldSchema tupleFS;
                try {
                    tupleFS = innerSchema.getField(0);
                } catch (FrontendException fee) {
                    return null;
                }
                if (tupleFS.type != DataType.TUPLE) {
                    return null;
                }
                innerSchema = tupleFS.schema;
            }
            // A known inner schema expands to one column per field; an unknown
            // one (null, possibly after unwrapping) yields a single column.
            if (innerSchema != null) {
                producedColumns = innerSchema.size();
            }
        }

        for (int j = 0; j < producedColumns; ++j) {
            if (mapped) {
                //map each produced column to the original column
                mapFields.put(outputColumn++, new Pair<Integer, Integer>(0, inputColumn));
            } else {
                // No traceable input column: report the column as added rather
                // than recording a mapping to the placeholder column -1.
                addedFields.add(outputColumn++);
            }
        }
    }

    List<Pair<Integer, Integer>> removedFields = new ArrayList<Pair<Integer, Integer>>();

    if (inputSchema == null) {
        //if input schema is null then there are no mappedFields and removedFields
        mapFields = null;
        removedFields = null;
    } else {
        //empty collections are reported as null
        if (mapFields.size() == 0) {
            mapFields = null;
        }
        if (addedFields.size() == 0) {
            addedFields = null;
        }

        //input schema is not null. Need to compute the removedFields
        //compute the set difference between the input schema and mapped fields
        Set<Integer> removedSet = new HashSet<Integer>();
        for (int i = 0; i < inputSchema.size(); ++i) {
            removedSet.add(i);
        }

        if (mapFields != null) {
            Set<Integer> mappedSet = new HashSet<Integer>();
            for (Integer key : mapFields.keySet()) {
                Collection<Pair<Integer, Integer>> values =
                        (Collection<Pair<Integer, Integer>>) mapFields.get(key);
                for (Pair<Integer, Integer> value : values) {
                    mappedSet.add(value.second);
                }
            }
            removedSet.removeAll(mappedSet);
        }

        if (removedSet.size() == 0) {
            removedFields = null;
        } else {
            for (Integer i : removedSet) {
                removedFields.add(new Pair<Integer, Integer>(0, i));
            }
        }
    }

    return new ProjectionMap(mapFields, removedFields, addedFields);
}
/**
 * Determines which columns of the single input this foreach needs, based
 * on the top-level projections found in the nested plans.
 *
 * @return a one-element list: a "all fields" marker when any nested plan
 *         reports a project-star set, a "no fields" marker when no column
 *         is projected, otherwise the projected (input 0, column) pairs.
 *         When visiting a nested plan fails, the single element is null.
 */
@Override
public List<RequiredFields> getRequiredFields() {
    List<RequiredFields> result = new ArrayList<RequiredFields>();
    Set<LOProject> topLevelProjects = new HashSet<LOProject>();
    boolean needsEveryColumn = false;

    // All nested plans are visited, even after a star projection was seen.
    for (LogicalPlan nestedPlan : getForEachPlans()) {
        TopLevelProjectFinder finder = new TopLevelProjectFinder(nestedPlan);
        try {
            finder.visit();
        } catch (VisitorException ve) {
            result.clear();
            result.add(null);
            return result;
        }
        topLevelProjects.addAll(finder.getProjectSet());
        if (finder.getProjectStarSet() != null) {
            needsEveryColumn = true;
        }
    }

    if (needsEveryColumn) {
        result.add(new RequiredFields(true));
        return result;
    }

    // Collect the distinct input columns; the input index is always 0.
    Set<Pair<Integer, Integer>> columns = new HashSet<Pair<Integer, Integer>>();
    for (LOProject topLevelProject : topLevelProjects) {
        for (int column : topLevelProject.getProjection()) {
            columns.add(new Pair<Integer, Integer>(0, column));
        }
    }

    if (columns.isEmpty()) {
        result.add(new RequiredFields(false, true));
    } else {
        result.add(new RequiredFields(new ArrayList<Pair<Integer, Integer>>(columns)));
    }
    // Exactly one entry was added above, so the list is never empty here.
    return result;
}
}