package com.ebay.erl.mobius.core.mapred;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.lib.MobiusDelegatingInputFormat;
import org.apache.hadoop.mapred.lib.InputSampler.Sampler;
import org.apache.hadoop.util.ReflectionUtils;
import com.ebay.erl.mobius.core.ConfigureConstants;
import com.ebay.erl.mobius.core.datajoin.DataJoinKey;
import com.ebay.erl.mobius.core.model.Column;
import com.ebay.erl.mobius.core.model.Tuple;
import com.ebay.erl.mobius.core.sort.Sorter;
import com.ebay.erl.mobius.core.sort.Sorter.Ordering;
import com.ebay.erl.mobius.util.SerializableUtil;
import com.ebay.erl.mobius.util.Util;
/**
* Performing sampling for total sort job.
*
* <p>
* This product is licensed under the Apache License, Version 2.0,
* available at http://www.apache.org/licenses/LICENSE-2.0.
*
* This product contains portions derived from Apache hadoop which is
* licensed under the Apache License, Version 2.0, available at
* http://hadoop.apache.org.
*
* © 2007 – 2012 eBay Inc., Evan Chiu, Woody Zhou, Neel Sundaresan
*/
@SuppressWarnings({ "deprecation", "unchecked" })
public class MobiusInputSampler implements Sampler {
private double freq;
private final int numSamples;
private final int maxSplitsSampled;
private static final Log LOGGER = LogFactory.getLog(MobiusInputSampler.class);
public MobiusInputSampler(double freq, int numSamples, int maxSplitsSampled)
{
this.freq = freq;
this.numSamples = numSamples;
this.maxSplitsSampled = maxSplitsSampled;
}
private AbstractMobiusMapper getMapper(InputFormat inf, InputSplit split, JobConf conf)
throws IOException
{
AbstractMobiusMapper mapper = null;
if( inf instanceof MobiusDelegatingInputFormat)
{
Class<AbstractMobiusMapper> mapperClass = ((MobiusDelegatingInputFormat)inf).getMapper(split, conf);
mapper = ReflectionUtils.newInstance(mapperClass, conf);
}
else
{
Class<? extends AbstractMobiusMapper> mapperClass = (Class<? extends AbstractMobiusMapper>) Util.getClass(conf.get(ConfigureConstants.MAPPER_CLASS));
mapper = ReflectionUtils.newInstance(mapperClass, conf);
}
return mapper;
}
@Override
public Object[] getSample(InputFormat inf, JobConf job)
throws IOException
{
// the following codes are copied from {@link InputSampler#RandomSampler},
// but require some modifications.
InputSplit[] splits = inf.getSplits(job, job.getNumMapTasks());
ArrayList<DataJoinKey> samples = new ArrayList<DataJoinKey>(this.numSamples);
int splitsToSample = Math.min(this.maxSplitsSampled, splits.length);
Random r = new Random();
long seed = r.nextLong();
r.setSeed(seed);
// get Sorters
Sorter[] sorters = null;
if( job.get(ConfigureConstants.SORTERS, null)!=null )
{
// total sort job
sorters = (Sorter[])SerializableUtil.deserializeFromBase64(job.get(ConfigureConstants.SORTERS), job);
}
else
{
// there is no sorter, should be reducer/join job
Column[] keys = (Column[])SerializableUtil.deserializeFromBase64(job.get(ConfigureConstants.ALL_GROUP_KEY_COLUMNS), job);
sorters = new Sorter[keys.length];
for( int i=0;i<keys.length;i++ )
{
sorters[i] = new Sorter(keys[i].getInputColumnName(), Ordering.ASC);
}
}
long proportion = 10L;
while( (int)(this.freq*proportion)==0 ){
proportion = proportion*10;
}
proportion = 5L*proportion;
// shuffle splits
for (int i = 0; i < splits.length; ++i)
{
InputSplit tmp = splits[i];
int j = r.nextInt(splits.length);
splits[i] = splits[j];
splits[j] = tmp;
}
SamplingOutputCollector collector = new SamplingOutputCollector();
for (int i = 0; i < splitsToSample
|| (i < splits.length && samples.size() < numSamples); i++)
{
LOGGER.info("Sampling from split #"+(i+1)+", collected samples:"+samples.size());
RecordReader<WritableComparable, WritableComparable> reader = inf.getRecordReader(splits[i], job, Reporter.NULL);
WritableComparable key = reader.createKey();
WritableComparable value = reader.createValue();
if( !(inf instanceof MobiusDelegatingInputFormat) )
{
// not mobius delegating input format, so the CURRENT_DATASET_ID
// will not be set by inf#getRecordReader, we set them here.
//
// set the current dataset id, as the AbstractMobiusMapper#configure
// method needs this property.
job.set (ConfigureConstants.CURRENT_DATASET_ID, job.get(ConfigureConstants.ALL_DATASET_IDS));
}
Byte datasetID = Byte.valueOf(job.get(ConfigureConstants.CURRENT_DATASET_ID));
LOGGER.info("Samples coming from dataset: "+datasetID.toString());
AbstractMobiusMapper mapper = this.getMapper(inf, splits[i], job);
mapper.configure(job);
// reading elements from one split
long readElement = 0;
while (reader.next(key, value))
{
collector.clear();
Tuple tuple = mapper.parse(key, value);
readElement++;
if (readElement> (((long)numSamples)*((long)proportion)) )
{
// a split might be very big (ex: a large gz file),
// so we just need to read the
break;
}
if (r.nextDouble() <= freq)
{
if (samples.size() < numSamples)
{
mapper.joinmap(key, value, collector, Reporter.NULL);
// joinmap function might generate more than one output key
// per <code>key</code> input.
for( Tuple t:collector.getOutKey() )
{
Tuple mt = Tuple.merge(tuple, t);
DataJoinKey nkey = this.getKey(mt, sorters, datasetID, mapper, job);
samples.add(nkey);
}
}
else
{
// When exceeding the maximum number of samples, replace
// a random element with this one, then adjust the
// frequency to reflect the possibility of existing
// elements being pushed out
mapper.joinmap(key, value, collector, Reporter.NULL);
for( Tuple t:collector.getOutKey() )
{
int ind = r.nextInt(numSamples);
if (ind != numSamples)
{
Tuple mt = Tuple.merge(tuple, t);
DataJoinKey nkey = this.getKey(mt, sorters, datasetID, mapper, job);
samples.set(ind, nkey);
}
}
freq *= (numSamples - collector.getOutKey().size()) / (double) numSamples;
}
key = reader.createKey();
value = reader.createValue();
}
}
reader.close();
}
LOGGER.info("Samples have been collected, return.");
return samples.toArray();
}
private DataJoinKey getKey(Tuple tuple, Sorter[] sorter, Byte datasetID, AbstractMobiusMapper mapper, Configuration conf)
{
Tuple columnsUsedToSort = new Tuple();
for(Sorter aSorter:sorter )
{
String name = aSorter.getColumn();
Object value = tuple.get(name);
columnsUsedToSort.insert(name, value);
}
DataJoinKey nkey = new DataJoinKey(datasetID, columnsUsedToSort, mapper.extractSortValueKeyword(tuple), mapper.getSortValueComparator());
nkey.setConf(conf);
return nkey;
}
private static class SamplingOutputCollector implements OutputCollector<Tuple, Tuple>
{
private List<Tuple> keys = new ArrayList<Tuple>();
@Override
public void collect(Tuple key, Tuple value) throws IOException
{
this.keys.add(key);
}
// to be called for every new key
public void clear(){
this.keys.clear();
}
public List<Tuple> getOutKey(){
return this.keys;
}
}
}