package org.apache.hadoop.mapred.lib;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import com.ebay.erl.mobius.core.ConfigureConstants;
import com.ebay.erl.mobius.core.mapred.AbstractMobiusMapper;
import com.ebay.erl.mobius.core.mapred.MultiInputsHelpersRepository;
/**
* <p>
* This product is licensed under the Apache License, Version 2.0,
* available at http://www.apache.org/licenses/LICENSE-2.0.
*
* This product contains portions derived from Apache hadoop which is
* licensed under the Apache License, Version 2.0, available at
* http://hadoop.apache.org.
*
* © 2007 – 2012 eBay Inc., Evan Chiu, Woody Zhou, Neel Sundaresan
*
* @param <K>
* @param <V>
*/
@SuppressWarnings({ "deprecation" })
public class MobiusDelegatingInputFormat <K, V> extends DelegatingInputFormat<K, V>
{
private Map<URI, String> _URI_TO_DATASETID_MAPPING;
private List<URI> _INPUT_URIS;
private final String _INIT_KEY = "";
private static final Log LOGGER = LogFactory.getLog(MobiusDelegatingInputFormat.class);
// getting the mapper which can process the input split
public Class<AbstractMobiusMapper> getMapper(InputSplit split, JobConf conf)
throws IOException
{
TaggedInputSplit taggedSplit = (TaggedInputSplit)split;
InputSplit inputSplit = taggedSplit.getInputSplit();
URI currentFileURI = MultiInputsHelpersRepository.getInstance(conf).getURIBySplit(inputSplit, conf);
try
{
String[] pathToMapperMappings = conf.get("mapred.input.dir.mappers").split(",");
for( String aPathToMapper:pathToMapperMappings)
{
//System.out.println("aPathToMapper:"+aPathToMapper);
//System.out.println("currentFileURI:"+currentFileURI.toString());
String[] data = aPathToMapper.split(";");
URI path = new URI(data[0]);
URI relative = path.relativize(currentFileURI);
//System.out.println("relative:"+relative);
String mapperClassName = data[1];
if( currentFileURI.equals(path) || !relative.equals(currentFileURI) )
{
return (Class<AbstractMobiusMapper>)Class.forName(mapperClassName);
}
}
}catch(Exception e)
{
throw new RuntimeException(e);
}
return null;
}
@Override
public RecordReader<K, V> getRecordReader(InputSplit split, JobConf conf, Reporter reporter) throws IOException
{
this.setupLookupTables(conf);
String datasetID = getDatasetIDBySplit(split, conf);
conf.set (ConfigureConstants.CURRENT_DATASET_ID, datasetID);
RecordReader<K, V> reader = super.getRecordReader (split, conf, reporter);
return reader;
}
private String getDatasetIDBySplit(InputSplit split, JobConf conf)
throws IOException
{
// The <code>split</code> is an instance of {@link TaggedInputSplit}
// but the TaggedInputSplit is not a public class, so we need to place
// this class under the package of org.apache.hadoop.mapred.lib.
TaggedInputSplit taggedSplit = (TaggedInputSplit)split;
InputSplit inputSplit = taggedSplit.getInputSplit();
URI currentFileURI = MultiInputsHelpersRepository.getInstance(conf).getURIBySplit(inputSplit, conf);
String currentFile = currentFileURI.toString();
LOGGER.debug("Using ["+currentFile+"] to locate current Dataset");
String datasetID = null;
for( URI anInput:_INPUT_URIS )
{
if( anInput.equals(currentFileURI) )
{
datasetID = _URI_TO_DATASETID_MAPPING.get (anInput);
if ( datasetID == null || datasetID.trim ().length () == 0 )
throw new IllegalArgumentException ("Dataet ID for the input path:[" + anInput+ "] did not set.");
}
else
{
// not equal, compute the relative URI
URI relative = anInput.relativize(currentFileURI);
if( !relative.equals(currentFileURI) )
{
// found the key
datasetID = _URI_TO_DATASETID_MAPPING.get (anInput);
if ( datasetID == null || datasetID.trim ().length () == 0 )
throw new IllegalArgumentException ("Dataet ID for the input path:[" + anInput+ "] did not set.");
}
}
}
if( datasetID==null )
{
throw new IllegalArgumentException ("Cannot find dataset id using the given uri:[" + currentFile + "], " +
ConfigureConstants.INPUT_TO_DATASET_MAPPING+":" + conf.get (ConfigureConstants.INPUT_TO_DATASET_MAPPING));
}
return datasetID;
}
private void setupLookupTables(JobConf conf)
{
// due to this bug: https://issues.apache.org/jira/browse/MAPREDUCE-1743
// map.input.file is not set when using MultipleInputs, which is used in
// {@link MobiusMultiInputs}, we need to set it.
synchronized(_INIT_KEY)
{
_URI_TO_DATASETID_MAPPING = new TreeMap<URI, String>();
_INPUT_URIS = new ArrayList<URI>();
// in the format of datasetID;input_uri(,datasetID;input_uri)*
String[] mappings = conf.getStrings (ConfigureConstants.INPUT_TO_DATASET_MAPPING);
for(String aMapping:mappings )
{
String[] data = aMapping.split(";");
String datasetID = data[0];
String input_uri = data[1];
try
{
URI anInput = new URI(input_uri);
_INPUT_URIS.add(anInput);
_URI_TO_DATASETID_MAPPING.put(anInput, datasetID);
}
catch (URISyntaxException e)
{
throw new RuntimeException(e);
}
}
}
Collections.sort(_INPUT_URIS);
// reverse the order so the system can check the URI from most specific to
// less specific
Collections.reverse(_INPUT_URIS);
}
}