/**
*
*/
package org.apache.pig.backend.hadoop.executionengine.util;

import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.util.Progressable;

import org.apache.pig.PigException;
import org.apache.pig.StoreConfig;
import org.apache.pig.StoreFunc;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.backend.hadoop.datastorage.ConfigurationUtil;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler;
import org.apache.pig.builtin.BinStorage;
import org.apache.pig.builtin.PigStorage;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DataType;
import org.apache.pig.data.DefaultTupleFactory;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.PigContext;
import org.apache.pig.impl.io.BufferedPositionedInputStream;
import org.apache.pig.impl.io.FileLocalizer;
import org.apache.pig.impl.util.ObjectSerializer;
import org.apache.pig.impl.util.Pair;

/**
* A class of static utility methods to be used in the Hadoop map-reduce backend
*/
public class MapRedUtil {
/**
* This method is to be called from an
* {@link org.apache.hadoop.mapred.OutputFormat#getRecordWriter(FileSystem, JobConf, String, Progressable)}
* implementation to obtain a reference to the {@link org.apache.pig.StoreFunc} object to be used by
* that OutputFormat to perform the write() operation.
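* <p>
* For illustration only, a custom OutputFormat's getRecordWriter() might obtain the store
* function roughly as sketched below (the record-writer details are hypothetical and not
* part of this class):
* <pre>{@code
* StoreFunc storer = MapRedUtil.getStoreFunc(job);
* // bind storer to the output stream for the requested part file, then have the
* // returned RecordWriter's write() call storer.putNext(tuple) for each record
* }</pre>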
* @param conf the JobConf object
* @return the StoreFunc reference
* @throws ExecException if the store function cannot be instantiated
*/
public static StoreFunc getStoreFunc(JobConf conf) throws ExecException {
StoreFunc store;
try {
String storeFunc = conf.get("pig.storeFunc", "");
if (storeFunc.length() == 0) {
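// no store function was configured for this job; default to PigStorage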
store = new PigStorage();
} else {
storeFunc = (String) ObjectSerializer.deserialize(storeFunc);
store = (StoreFunc) PigContext
.instantiateFuncFromSpec(storeFunc);
}
} catch (Exception e) {
int errCode = 2081;
String msg = "Unable to setup the store function.";
throw new ExecException(msg, errCode, PigException.BUG, e);
}
return store;
}
/**
* This method is to be called from an
* {@link org.apache.hadoop.mapred.OutputFormat#getRecordWriter(FileSystem, JobConf, String, Progressable)}
* implementation to obtain a reference to the {@link org.apache.pig.StoreConfig} object. The StoreConfig
* object contains metadata, such as the schema and location, to be used by
* that OutputFormat to perform the write() operation.
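* <p>
* A minimal usage sketch, assuming (as described above) that StoreConfig exposes the store
* location and schema:
* <pre>{@code
* StoreConfig storeConfig = MapRedUtil.getStoreConfig(job);
* String location = storeConfig.getLocation(); // where the data is to be written
* Schema schema = storeConfig.getSchema();     // schema of the data being stored
* }</pre>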
* @param conf the JobConf object
* @return StoreConfig object containing metadata information useful for
* an OutputFormat to write the data
* @throws IOException if the StoreConfig object cannot be deserialized from the configuration
*/
public static StoreConfig getStoreConfig(JobConf conf) throws IOException {
return (StoreConfig) ObjectSerializer.deserialize(conf.get(JobControlCompiler.PIG_STORE_CONFIG));
}
/**
* Loads the key distribution file produced by the sampler
*
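* A minimal usage sketch (the file name below is illustrative):
* <pre>{@code
* Integer[] totalReducers = new Integer[1];
* Map<Tuple, Pair<Integer, Integer>> reducerMap =
*     MapRedUtil.loadPartitionFile("pigsample_file", totalReducers, job, DataType.TUPLE);
* // reducerMap maps each sampled key to (first reducer index, additional reducers)
* }</pre>
*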
* @param keyDistFile the name of the key distribution file
* @param totalReducers out parameter: totalReducers[0] is set to the total number of reducers recorded in the distribution file
* @param job reference to the JobConf object; may be null, in which case the file is opened with the default configuration
* @param keyType type of the key to be stored in the returned map; Tuple keys are currently treated as a special case
* @return a map from each sampled key to a Pair of (index of the first reducer allotted to that key, number of additional reducers)
* @throws IOException if the distribution file cannot be read
*/
@SuppressWarnings("unchecked")
public static <E> Map<E, Pair<Integer, Integer> > loadPartitionFile(String keyDistFile,
Integer[] totalReducers, JobConf job, byte keyType) throws IOException {
Map<E, Pair<Integer, Integer> > reducerMap = new HashMap<E, Pair<Integer, Integer> >();
InputStream is;
if (job != null) {
is = FileLocalizer.openDFSFile(keyDistFile,ConfigurationUtil.toProperties(job));
} else {
is = FileLocalizer.openDFSFile(keyDistFile);
}
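// Read the distribution file with BinStorage; its first tuple holds a map
// containing the partition list and the total number of reducers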
BinStorage loader = new BinStorage();
DataBag partitionList;
loader.bindTo(keyDistFile, new BufferedPositionedInputStream(is), 0, Long.MAX_VALUE);
Tuple t = loader.getNext();
if (t == null) {
throw new RuntimeException("Empty samples file");
}
// The keydist file is structured as (key, min, max),
// where min and max are the indices of the reducers allotted to that key
Map<String, Object> distMap = (Map<String, Object>) t.get(0);
partitionList = (DataBag) distMap.get("partition.list");
totalReducers[0] = Integer.valueOf(""+distMap.get("totalreducers"));
Iterator<Tuple> it = partitionList.iterator();
while (it.hasNext()) {
Tuple idxTuple = it.next();
Integer maxIndex = (Integer) idxTuple.get(idxTuple.size() - 1);
Integer minIndex = (Integer) idxTuple.get(idxTuple.size() - 2);
// If the range wraps around past the last reducer, normalize maxIndex
// by adding the total number of reducers
if (maxIndex < minIndex) {
maxIndex = totalReducers[0] + maxIndex;
}
E keyT;
// if the join is on more than 1 key
if (idxTuple.size() > 3) {
// strip off the last two fields of the tuple (minIndex and maxIndex)
// and use the remaining fields as the key in the reducer map
Tuple keyTuple = DefaultTupleFactory.getInstance().newTuple();
for (int i=0; i < idxTuple.size() - 2; i++) {
keyTuple.append(idxTuple.get(i));
}
keyT = (E) keyTuple;
} else {
if (keyType == DataType.TUPLE) {
keyT = (E)DefaultTupleFactory.getInstance().newTuple(1);
((Tuple)keyT).set(0,idxTuple.get(0));
} else {
keyT = (E) idxTuple.get(0);
}
}
// number of reducers allotted to this key, minus one (the consumer adds 1 to account for the zero-based index)
Integer cnt = 0;
if (minIndex < maxIndex) {
cnt = maxIndex - minIndex;
} else {
cnt = totalReducers[0] + maxIndex - minIndex;
}
reducerMap.put(keyT, new Pair<Integer, Integer>(minIndex, cnt));
}
return reducerMap;
}
}