package com.ebay.erl.mobius.core.mapred;
import java.io.Closeable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.IllegalFormatException;
import java.util.List;
import java.util.Map;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import com.ebay.erl.mobius.core.ConfigureConstants;
import com.ebay.erl.mobius.core.collection.BigTupleList;
import com.ebay.erl.mobius.core.criterion.TupleCriterion;
import com.ebay.erl.mobius.core.datajoin.DataJoinMapper;
import com.ebay.erl.mobius.core.model.ComputedColumns;
import com.ebay.erl.mobius.core.model.KeyTuple;
import com.ebay.erl.mobius.core.model.Tuple;
import com.ebay.erl.mobius.util.SerializableUtil;
import com.ebay.erl.mobius.util.Util;
/**
* Base class for implementing a customized Mobius mapper.
* <p>
*
* Extend this class if the built-in mappers,
* {@link com.ebay.erl.mobius.core.mapred.TSVMapper} and
* {@link com.ebay.erl.mobius.core.mapred.SequenceFileMapper},
* do not meet your needs.
* <p>
*
* This class provides filtering (via the user-specified
* {@link #tuple_criteria}), evaluation of {@link #computedColumns},
* and counter updates.
* <p>
*
* Override the {@link #parse(Object, Object)} method to convert
* the K-V objects into a tuple, then the underlying data source
* can be processed by mobius.
* <p>
*
* <p>
* This product is licensed under the Apache License, Version 2.0,
* available at http://www.apache.org/licenses/LICENSE-2.0.
*
* This product contains portions derived from Apache hadoop which is
* licensed under the Apache License, Version 2.0, available at
* http://hadoop.apache.org.
*
* © 2007 – 2012 eBay Inc., Evan Chiu, Woody Zhou, Neel Sundaresan
*
* @param <IK> input key type.
* @param <IV> input value type.
*
*/
@SuppressWarnings("deprecation")
public abstract class AbstractMobiusMapper<IK, IV> extends DataJoinMapper<IK, IV, WritableComparable<?>, WritableComparable<?>>
{
/**
* filters
*/
protected TupleCriterion tuple_criteria;
/**
* columns to be emitted as key of this {@link Mapper}
*/
protected String[] key_columns;
/**
* columns to be emitted as value of this {@link Mapper}
*/
protected String[] value_columns;
/**
* Output column names for map only job, ex: listing.
*/
protected String[] projection_order;
/**
* The current dataset ID.
*/
protected Byte currentDatasetID = null;
/**
* The normalized name of the dataset been processed by this
* mapper currently, it is used as counter ID to update the
* corresponding Hadoop counters for this dataset.
* <p>
*
* The name is normalized from the {@link #currentDatasetID} by
* removing the serial number part.
*/
protected String dataset_display_id;
/**
* A background thread responsible for updating the
* Hadoop counters.
*/
protected CounterUpdateThread counterThread;
/**
* Counts for the number of input records.
* <p>
*
* #INPUT_RECORDS = #FILTERED_RECORDS + #OUTPUT_RECORDS.
*/
protected long _COUNTER_INPUT_RECORD;
/**
* Counts for the number of outputted records.
*/
protected long _COUNTER_OUTPUT_RECORD;
/**
* Counts for the number of filtered records,
* filtered by user specified {@link #tuple_criteria}.
*/
protected long _COUNTER_FILTERED_RECORD;
/**
* Counts for invalidate format records.
*/
protected long _COUNTER_INVALIDATE_FORMAT_RECORD;
/**
* {@link ComputedColumns} specified by user.
*/
protected List<ComputedColumns> computedColumns = null;
protected boolean _IS_MAP_ONLY_JOB = false;
public static final long _100MB = 100L*1024L*1024L;
protected boolean reporterSet = false;
private static final Log LOGGER = LogFactory.getLog(AbstractMobiusMapper.class);
/**
* Setup Mapper.
* <p>
*
* Override this method if there is extra initial
* settings need to be done.
* <p>
*
* Make sure to call <code>super.configure(JobConf)</code>
* when overriding.
*
*/
@SuppressWarnings("unchecked")
@Override
public void configure(JobConf conf)
{
super.configure(conf);
this.conf = conf;
this._IS_MAP_ONLY_JOB = this.conf.getInt("mapred.reduce.tasks", 1)==0;
// catch the current dataset ID, the {@link Configuration#get(String)}
// is costly as it compose Pattern every time.
this.currentDatasetID = Byte.valueOf(this.conf.get (ConfigureConstants.CURRENT_DATASET_ID));
String[] datasetIDstoNames = this.conf.getStrings(ConfigureConstants.DATASET_ID_TO_NAME_MAPPING);
Map<Byte, String> mapping = new HashMap<Byte, String>();
for( String aMapping:datasetIDstoNames )
{
int cut = aMapping.indexOf(";");
Byte datasetID = Byte.parseByte(aMapping.substring(0, cut));
String datasetDisplayName = aMapping.substring(cut+1);
mapping.put(datasetID, datasetDisplayName);
}
if( mapping.size()==0 )
throw new IllegalArgumentException(ConfigureConstants.DATASET_ID_TO_NAME_MAPPING+" is not set.");
this.dataset_display_id = mapping.get(this.currentDatasetID);
if( this.dataset_display_id==null )
{
throw new IllegalArgumentException("Cannot find display name for datasetID:"+this.currentDatasetID +
" from "+ConfigureConstants.DATASET_ID_TO_NAME_MAPPING+":"+
this.conf.get(ConfigureConstants.DATASET_ID_TO_NAME_MAPPING));
}
// initialize counters
this._COUNTER_INPUT_RECORD = 0L;
this._COUNTER_OUTPUT_RECORD = 0L;
this._COUNTER_FILTERED_RECORD = 0L;
this._COUNTER_INVALIDATE_FORMAT_RECORD = 0L;
try
{
this.key_columns = (String[])this.conf.getStrings(this.getDatasetID()+".key.columns", Util.ZERO_SIZE_STRING_ARRAY);
this.value_columns = (String[])this.conf.getStrings(this.getDatasetID()+".value.columns", Util.ZERO_SIZE_STRING_ARRAY);
this.tuple_criteria = (TupleCriterion)this.get("tuple.criteria");
this.computedColumns= (List<ComputedColumns>)this.get("computed.columns");
if( this._IS_MAP_ONLY_JOB )
{
this.projection_order = (String[])this.conf.getStrings(this.getDatasetID()+".columns.in.original.order", Util.ZERO_SIZE_STRING_ARRAY);
}
}catch(IOException e)
{
e.printStackTrace();
throw new RuntimeException(e);
}
}
/**
* map()
*/
@SuppressWarnings("unchecked")
@Override
public void joinmap(IK key, IV value, OutputCollector<WritableComparable<?>, WritableComparable<?>> output, Reporter reporter)
throws IOException
{
// initializing counter updating thread, to be run in the background.
if( this.counterThread==null )
{
this.counterThread = new CounterUpdateThread(reporter);
new Thread(this.counterThread).start();
}
if (!reporterSet){
if( this.computedColumns!=null ){
for( ComputedColumns c:this.computedColumns ){
c.setReporter(reporter);
}
}
reporterSet = true;
}
Tuple record = null;
try
{
record = this.parse(key, value);
}
catch(IllegalFormatException e)
{
this._COUNTER_INVALIDATE_FORMAT_RECORD++;
this.updateCounter(this.dataset_display_id, "INVALIDATE_RECORDS", this._COUNTER_INVALIDATE_FORMAT_RECORD);
return;
}
this._COUNTER_INPUT_RECORD++;
this.updateCounter(this.dataset_display_id, "INPUT_RECORDS", this._COUNTER_INPUT_RECORD);
Iterable<Tuple> rows_to_be_output = new ArrayList<Tuple>();
((List<Tuple>)rows_to_be_output).add(record);
// apply computed column if any
if( this.computedColumns!=null )
{
for( ComputedColumns aComputedColumn:this.computedColumns )
{
aComputedColumn.reset();
aComputedColumn.consume(Tuple.immutable(record));
if ( aComputedColumn.getResult()!=null && aComputedColumn.getResult().size()>0 )
{
BigTupleList computedResult = aComputedColumn.getResult();
if( computedResult.size()<5000 )
{
// use in memory cross product
Iterable<Tuple>[] allValues = new Iterable[2];
allValues[0] = rows_to_be_output;
allValues[1] = aComputedColumn.getResult();
rows_to_be_output = Util.inMemoryCrossProduct(allValues);
}
else
{
// computed result is too big, don't use in memory cross
// product.
Iterable<Tuple>[] allValues = new Iterable[2];
allValues[0] = rows_to_be_output;
allValues[1] = aComputedColumn.getResult();
rows_to_be_output = Util.crossProduct(this.conf, reporter, allValues);
}
}
}
}
// apply the criteria if any and prepare output
for( Tuple aRow:rows_to_be_output)
{
Tuple out_key = this.getKeyTuple(this.key_columns, aRow, null);
Tuple out_value = null;
if( !this._IS_MAP_ONLY_JOB )
{
// tuple will go to reducer phase, we use the sorted column
// so the reducer can set the schema back correctly.
out_value = this.getTuple(this.value_columns, aRow, Tuple.NULL);
}
else
{
out_value = this.getTuple(this.projection_order, aRow, Tuple.NULL);
}
/**
* TODO some tuple criteria can be applied earlier, if the
* columns it is evaluating are not derived. This can
* save the time to compute the derived columns, as it might
* be costly.
*/
if( this.tuple_criteria!=null )
{
// use the aRow as the criteria might use column(s)
// not within the projection columns (<code>value_columns</code>).
if ( this.tuple_criteria.accept(aRow, this.conf) )
{
outputRecords(out_key, out_value, output);
this._COUNTER_OUTPUT_RECORD++;
this.updateCounter(this.dataset_display_id, "OUTPUT_RECORDS", this._COUNTER_OUTPUT_RECORD);
}
else
{
this._COUNTER_FILTERED_RECORD++;
this.updateCounter(this.dataset_display_id, "FILTERED_RECORDS", this._COUNTER_FILTERED_RECORD);
}
}
else
{
outputRecords(out_key, out_value, output);
this._COUNTER_OUTPUT_RECORD++;
this.updateCounter(this.dataset_display_id, "OUTPUT_RECORDS", this._COUNTER_OUTPUT_RECORD);
}
}
if( rows_to_be_output instanceof Closeable )
{
((Closeable)rows_to_be_output).close();
}
}
protected void outputRecords(Tuple key, Tuple value, OutputCollector<WritableComparable<?>, WritableComparable<?>> output)
throws IOException
{
if( this._IS_MAP_ONLY_JOB )
{
// map only job, key is not needed as no join is required.
output.collect(NullWritable.get(), value);
}
else
{
if( key==null )
{
// should never happen, this is to perform join/group by, but there
// is no key
throw new IllegalArgumentException("key for dataset: "+this.getDatasetID()+
" cannot be empty when performing join/group by.");
}
output.collect(key, value);
}
}
/**
* close Mapper
*/
@Override
public void close()
throws IOException
{
this.counterThread.stop();
}
/**
* Parse the input key and input value into {@link Tuple}
*/
public abstract Tuple parse(IK inkey, IV invalue)
throws IllegalArgumentException, IOException;
/**
* update certain counter
*/
protected final void updateCounter(String group, String couter, long number)
{
this.counterThread.updateCounter(group, couter, number);
}
/**
* Get the current dataset ID.
*/
public final Byte getDatasetID()
{
return this.currentDatasetID;
}
/**
* Get object from {@link JobConf}, assuming the value
* is Base64 encoded, and can be decoded back to Java
* object.
* <p>
*
* If the value from {@link JobConf} for the given
* <code>key</code> is null or empty, null is returned.
*/
protected final Object get(String key) throws IOException
{
String value = this.conf.get(this.getDatasetID()+"."+key);
if( value==null || (value=value.trim()).isEmpty() )
return null;
return SerializableUtil.deserializeFromBase64(value, this.conf);
}
private final Tuple getKeyTuple(String[] columns, Tuple record, Tuple defaultValue)
{
if( columns==null || columns.length==0 )
return defaultValue;
else
{
Tuple t = new KeyTuple();
for( String aColumn:columns )
{
t.insert(aColumn, record.get(aColumn));
}
return t;
}
}
/**
* retrieve columns from the record tuple, and return
* a new tuple instance which contains only the specified
* columns.
*/
private final Tuple getTuple(String[] columns, Tuple record, Tuple defaultValue)
{
if( columns==null || columns.length==0 )
return defaultValue;
else
{
Tuple t = new Tuple();
for( String aColumn:columns )
{
t.insert(aColumn, record.get(aColumn));
}
return t;
}
}
}