/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tez.mapreduce.input;
import java.io.IOException;
import java.util.List;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience.Private;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileSystem.Statistics;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.serializer.Deserializer;
import org.apache.hadoop.io.serializer.SerializationFactory;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobContext;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.TaskAttemptID;
import org.apache.hadoop.mapred.TaskID;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskType;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormatCounter;
import org.apache.hadoop.mapreduce.split.JobSplit.TaskSplitIndex;
import org.apache.hadoop.mapreduce.split.JobSplit.TaskSplitMetaInfo;
import org.apache.hadoop.mapreduce.split.SplitMetaInfoReaderTez;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.tez.common.counters.TaskCounter;
import org.apache.tez.common.counters.TezCounter;
import org.apache.tez.mapreduce.common.Utils;
import org.apache.tez.mapreduce.hadoop.MRHelpers;
import org.apache.tez.mapreduce.hadoop.MRJobConfig;
import org.apache.tez.mapreduce.hadoop.mapred.MRReporter;
import org.apache.tez.mapreduce.hadoop.mapreduce.TaskAttemptContextImpl;
import org.apache.tez.mapreduce.protos.MRRuntimeProtos.MRInputUserPayloadProto;
import org.apache.tez.mapreduce.protos.MRRuntimeProtos.MRSplitProto;
import org.apache.tez.runtime.api.Event;
import org.apache.tez.runtime.api.Input;
import org.apache.tez.runtime.api.LogicalInput;
import org.apache.tez.runtime.api.TezInputContext;
import org.apache.tez.runtime.api.events.RootInputDataInformationEvent;
import org.apache.tez.runtime.library.api.KeyValueReader;
import com.google.common.base.Preconditions;
/**
 * {@link MRInput} is an {@link Input} which provides key/value pairs
 * for the consumer.
*
* It is compatible with all standard Apache Hadoop MapReduce
* {@link InputFormat} implementations.
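 *
 * A minimal consumption sketch from a processor's perspective; the map lookup
 * and the "mrInputSource" name below are illustrative assumptions, not part of
 * this class's contract:
 * <pre>{@code
 * MRInput input = (MRInput) inputs.get("mrInputSource");
 * KeyValueReader reader = input.getReader();
 * while (reader.next()) {
 *   Object key = reader.getCurrentKey();
 *   Object value = reader.getCurrentValue();
 *   // consume the key/value pair
 * }
 * }</pre>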
*
* This class is not meant to be extended by external projects.
*/
public class MRInput implements LogicalInput {
private static final Log LOG = LogFactory.getLog(MRInput.class);
private final Lock rrLock = new ReentrantLock();
  private final Condition rrInited = rrLock.newCondition();
private TezInputContext inputContext;
private volatile boolean eventReceived = false;
private JobConf jobConf;
private Configuration incrementalConf;
private boolean readerCreated = false;
boolean useNewApi;
org.apache.hadoop.mapreduce.TaskAttemptContext taskAttemptContext;
@SuppressWarnings("rawtypes")
private org.apache.hadoop.mapreduce.InputFormat newInputFormat;
@SuppressWarnings("rawtypes")
private org.apache.hadoop.mapreduce.RecordReader newRecordReader;
protected org.apache.hadoop.mapreduce.InputSplit newInputSplit;
@SuppressWarnings("rawtypes")
private InputFormat oldInputFormat;
@SuppressWarnings("rawtypes")
protected RecordReader oldRecordReader;
protected InputSplit oldInputSplit;
protected TaskSplitIndex splitMetaInfo = new TaskSplitIndex();
private TezCounter inputRecordCounter;
private TezCounter fileInputByteCounter;
private List<Statistics> fsStats;
@Private
volatile boolean splitInfoViaEvents;
@Override
public List<Event> initialize(TezInputContext inputContext) throws IOException {
this.inputContext = inputContext;
    this.inputContext.requestInitialMemory(0L, null); // mandatory call
this.inputContext.inputIsReady();
MRInputUserPayloadProto mrUserPayload =
MRHelpers.parseMRInputPayload(inputContext.getUserPayload());
    Preconditions.checkArgument(!mrUserPayload.hasSplits(),
        "Split information not expected in MRInput");
Configuration conf =
MRHelpers.createConfFromByteString(mrUserPayload.getConfigurationBytes());
this.jobConf = new JobConf(conf);
TaskAttemptID taskAttemptId = new TaskAttemptID(
new TaskID(
Long.toString(inputContext.getApplicationId().getClusterTimestamp()),
inputContext.getApplicationId().getId(), TaskType.MAP,
inputContext.getTaskIndex()),
inputContext.getTaskAttemptNumber());
jobConf.set(MRJobConfig.TASK_ATTEMPT_ID,
taskAttemptId.toString());
jobConf.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID,
inputContext.getDAGAttemptNumber());
// TODO NEWTEZ Rename this to be specific to MRInput. This Input, in
// theory, can be used by the MapProcessor, ReduceProcessor or a custom
// processor. (The processor could provide the counter though)
this.inputRecordCounter = inputContext.getCounters().findCounter(TaskCounter.MAP_INPUT_RECORDS);
this.fileInputByteCounter = inputContext.getCounters().findCounter(FileInputFormatCounter.BYTES_READ);
useNewApi = this.jobConf.getUseNewMapper();
this.splitInfoViaEvents = jobConf.getBoolean(MRJobConfig.MR_TEZ_SPLITS_VIA_EVENTS,
MRJobConfig.MR_TEZ_SPLITS_VIA_EVENTS_DEFAULT);
LOG.info("Using New mapreduce API: " + useNewApi
+ ", split information via event: " + splitInfoViaEvents);
initializeInternal();
return null;
}
@Override
public void start() {
}
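  /**
   * Sets up the InputFormat, and the RecordReader when split information is
   * available locally. If split information arrives via events instead, only
   * the InputFormat is created here and the RecordReader is set up later in
   * {@link #initFromEventInternal(RootInputDataInformationEvent)}.
   */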
@Private
void initializeInternal() throws IOException {
// Primarily for visibility
rrLock.lock();
try {
if (splitInfoViaEvents) {
if (useNewApi) {
setupNewInputFormat();
} else {
setupOldInputFormat();
}
} else {
// Read split information.
TaskSplitMetaInfo[] allMetaInfo = readSplits(jobConf);
TaskSplitMetaInfo thisTaskMetaInfo = allMetaInfo[inputContext
.getTaskIndex()];
this.splitMetaInfo = new TaskSplitIndex(
thisTaskMetaInfo.getSplitLocation(),
thisTaskMetaInfo.getStartOffset());
if (useNewApi) {
setupNewInputFormat();
newInputSplit = getNewSplitDetailsFromDisk(splitMetaInfo);
setupNewRecordReader();
} else {
setupOldInputFormat();
oldInputSplit = getOldSplitDetailsFromDisk(splitMetaInfo);
setupOldRecordReader();
}
}
} finally {
rrLock.unlock();
}
LOG.info("Initialzed MRInput: " + inputContext.getSourceVertexName());
}
private void setupOldInputFormat() {
oldInputFormat = this.jobConf.getInputFormat();
}
private void setupOldRecordReader() throws IOException {
Preconditions.checkNotNull(oldInputSplit, "Input split hasn't yet been setup");
List<Statistics> matchedStats = null;
if (oldInputSplit instanceof FileSplit) {
matchedStats = Utils.getFsStatistics(((FileSplit) oldInputSplit).getPath(), this.jobConf);
}
fsStats = matchedStats;
long bytesInPrev = getInputBytes();
oldRecordReader = oldInputFormat.getRecordReader(oldInputSplit,
this.jobConf, new MRReporter(inputContext, oldInputSplit));
long bytesInCurr = getInputBytes();
fileInputByteCounter.increment(bytesInCurr - bytesInPrev);
setIncrementalConfigParams(oldInputSplit);
}
private void setupNewInputFormat() throws IOException {
taskAttemptContext = createTaskAttemptContext();
Class<? extends org.apache.hadoop.mapreduce.InputFormat<?, ?>> inputFormatClazz;
try {
inputFormatClazz = taskAttemptContext.getInputFormatClass();
} catch (ClassNotFoundException e) {
throw new IOException("Unable to instantiate InputFormat class", e);
}
newInputFormat = ReflectionUtils.newInstance(inputFormatClazz, this.jobConf);
}
private void setupNewRecordReader() throws IOException {
Preconditions.checkNotNull(newInputSplit, "Input split hasn't yet been setup");
List<Statistics> matchedStats = null;
if (newInputSplit instanceof org.apache.hadoop.mapreduce.lib.input.FileSplit) {
matchedStats = Utils.getFsStatistics(
((org.apache.hadoop.mapreduce.lib.input.FileSplit)
newInputSplit).getPath(), this.jobConf);
}
fsStats = matchedStats;
try {
newRecordReader = newInputFormat.createRecordReader(newInputSplit, taskAttemptContext);
newRecordReader.initialize(newInputSplit, taskAttemptContext);
} catch (InterruptedException e) {
throw new IOException("Interrupted while creating record reader", e);
}
}
@Override
public KeyValueReader getReader() throws IOException {
    Preconditions.checkState(!readerCreated,
        "Only a single instance of record reader can be created for this input.");
readerCreated = true;
rrLock.lock();
try {
      if (newRecordReader == null && oldRecordReader == null) {
        checkAndAwaitRecordReaderInitialization();
      }
} finally {
rrLock.unlock();
}
LOG.info("Creating reader for MRInput: "
+ inputContext.getSourceVertexName());
return new MRInputKVReader();
}
@Override
public void handleEvents(List<Event> inputEvents) throws Exception {
if (eventReceived || inputEvents.size() != 1) {
throw new IllegalStateException(
"MRInput expects only a single input. Received: current eventListSize: "
+ inputEvents.size() + "Received previous input: "
+ eventReceived);
}
Event event = inputEvents.iterator().next();
Preconditions.checkArgument(event instanceof RootInputDataInformationEvent,
getClass().getSimpleName()
+ " can only handle a single event of type: "
+ RootInputDataInformationEvent.class.getSimpleName());
processSplitEvent((RootInputDataInformationEvent)event);
}
@Override
public void setNumPhysicalInputs(int numInputs) {
// Not required at the moment. May be required if splits are sent via events.
}
@Override
public List<Event> close() throws IOException {
long bytesInPrev = getInputBytes();
    if (useNewApi) {
      if (newRecordReader != null) {
        newRecordReader.close();
      }
    } else {
      if (oldRecordReader != null) {
        oldRecordReader.close();
      }
    }
long bytesInCurr = getInputBytes();
fileInputByteCounter.increment(bytesInCurr - bytesInPrev);
return null;
}
/**
 * {@link MRInput} sets some additional configuration parameters, such as the
 * input file, split start offset and split length, when it sets up an old-API
 * RecordReader. This method returns those additional updates, and should be
 * used by Processors using the old MapReduce API with {@link MRInput}.
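 *
 * A minimal usage sketch; {@code processorJobConf} is an assumed JobConf owned
 * by the consuming processor, not something provided by this class:
 * <pre>{@code
 * Configuration updates = mrInput.getConfigUpdates();
 * if (updates != null) {
 *   for (Map.Entry<String, String> entry : updates) {
 *     processorJobConf.set(entry.getKey(), entry.getValue());
 *   }
 * }
 * }</pre>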
*
* @return the additional fields set by {@link MRInput}
*/
public Configuration getConfigUpdates() {
if (incrementalConf != null) {
return new Configuration(incrementalConf);
}
return null;
}
public float getProgress() throws IOException, InterruptedException {
if (useNewApi) {
return newRecordReader.getProgress();
} else {
return oldRecordReader.getProgress();
}
}
private TaskAttemptContext createTaskAttemptContext() {
return new TaskAttemptContextImpl(this.jobConf, inputContext, true, null);
}
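  /**
   * Invoked from {@link #handleEvents(List)} when split information arrives as
   * a {@link RootInputDataInformationEvent}: initializes the RecordReader
   * under rrLock and signals any reader blocked in
   * {@link #checkAndAwaitRecordReaderInitialization()}.
   */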
void processSplitEvent(RootInputDataInformationEvent event)
throws IOException {
rrLock.lock();
try {
initFromEventInternal(event);
LOG.info("Notifying on RecordReader Initialized");
rrInited.signal();
} finally {
rrLock.unlock();
}
}
  void checkAndAwaitRecordReaderInitialization() throws IOException {
    try {
      LOG.info("Awaiting RecordReader initialization");
      // Guard against spurious wakeups: wait until a reader has actually been set up.
      while (newRecordReader == null && oldRecordReader == null) {
        rrInited.await();
      }
    } catch (InterruptedException e) {
      Thread.currentThread().interrupt();
      throw new IOException(
          "Interrupted while waiting for RecordReader initialization", e);
    }
  }
@Private
void initFromEvent(RootInputDataInformationEvent initEvent)
throws IOException {
rrLock.lock();
try {
initFromEventInternal(initEvent);
} finally {
rrLock.unlock();
}
}
private void initFromEventInternal(RootInputDataInformationEvent initEvent)
throws IOException {
LOG.info("Initializing RecordReader from event");
Preconditions.checkState(initEvent != null, "InitEvent must be specified");
MRSplitProto splitProto = MRSplitProto
.parseFrom(initEvent.getUserPayload());
if (useNewApi) {
newInputSplit = getNewSplitDetailsFromEvent(splitProto, jobConf);
LOG.info("Split Details -> SplitClass: "
+ newInputSplit.getClass().getName() + ", NewSplit: " + newInputSplit);
setupNewRecordReader();
} else {
oldInputSplit = getOldSplitDetailsFromEvent(splitProto, jobConf);
LOG.info("Split Details -> SplitClass: "
+ oldInputSplit.getClass().getName() + ", OldSplit: " + oldInputSplit);
setupOldRecordReader();
}
LOG.info("Initialized RecordReader from event");
}
@Private
public static InputSplit getOldSplitDetailsFromEvent(MRSplitProto splitProto, Configuration conf)
throws IOException {
SerializationFactory serializationFactory = new SerializationFactory(conf);
return MRHelpers.createOldFormatSplitFromUserPayload(splitProto, serializationFactory);
}
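  /**
   * Reads the old-API (mapred) split for this task from the split file on the
   * local filesystem. The file stores the split class name as a Text string,
   * followed by the serialized split bytes.
   */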
@SuppressWarnings("unchecked")
private InputSplit getOldSplitDetailsFromDisk(TaskSplitIndex splitMetaInfo)
throws IOException {
Path file = new Path(splitMetaInfo.getSplitLocation());
FileSystem fs = FileSystem.getLocal(jobConf);
file = fs.makeQualified(file);
LOG.info("Reading input split file from : " + file);
long offset = splitMetaInfo.getStartOffset();
FSDataInputStream inFile = fs.open(file);
inFile.seek(offset);
String className = Text.readString(inFile);
Class<org.apache.hadoop.mapred.InputSplit> cls;
try {
cls =
(Class<org.apache.hadoop.mapred.InputSplit>)
jobConf.getClassByName(className);
} catch (ClassNotFoundException ce) {
      throw new IOException("Split class " + className + " not found", ce);
}
SerializationFactory factory = new SerializationFactory(jobConf);
Deserializer<org.apache.hadoop.mapred.InputSplit> deserializer =
(Deserializer<org.apache.hadoop.mapred.InputSplit>)
factory.getDeserializer(cls);
deserializer.open(inFile);
org.apache.hadoop.mapred.InputSplit split = deserializer.deserialize(null);
long pos = inFile.getPos();
inputContext.getCounters().findCounter(TaskCounter.SPLIT_RAW_BYTES)
.increment(pos - offset);
inFile.close();
return split;
}
@Private
public static org.apache.hadoop.mapreduce.InputSplit getNewSplitDetailsFromEvent(
MRSplitProto splitProto, Configuration conf) throws IOException {
SerializationFactory serializationFactory = new SerializationFactory(conf);
return MRHelpers.createNewFormatSplitFromUserPayload(
splitProto, serializationFactory);
}
@SuppressWarnings("unchecked")
private org.apache.hadoop.mapreduce.InputSplit getNewSplitDetailsFromDisk(
TaskSplitIndex splitMetaInfo) throws IOException {
Path file = new Path(splitMetaInfo.getSplitLocation());
long offset = splitMetaInfo.getStartOffset();
// Split information read from local filesystem.
FileSystem fs = FileSystem.getLocal(jobConf);
file = fs.makeQualified(file);
LOG.info("Reading input split file from : " + file);
FSDataInputStream inFile = fs.open(file);
inFile.seek(offset);
String className = Text.readString(inFile);
Class<org.apache.hadoop.mapreduce.InputSplit> cls;
try {
cls =
(Class<org.apache.hadoop.mapreduce.InputSplit>)
jobConf.getClassByName(className);
} catch (ClassNotFoundException ce) {
      throw new IOException("Split class " + className + " not found", ce);
}
SerializationFactory factory = new SerializationFactory(jobConf);
Deserializer<org.apache.hadoop.mapreduce.InputSplit> deserializer =
(Deserializer<org.apache.hadoop.mapreduce.InputSplit>)
factory.getDeserializer(cls);
deserializer.open(inFile);
org.apache.hadoop.mapreduce.InputSplit split =
deserializer.deserialize(null);
long pos = inFile.getPos();
inputContext.getCounters().findCounter(TaskCounter.SPLIT_RAW_BYTES)
.increment(pos - offset);
inFile.close();
return split;
}
private void setIncrementalConfigParams(InputSplit inputSplit) {
if (inputSplit instanceof FileSplit) {
FileSplit fileSplit = (FileSplit) inputSplit;
this.incrementalConf = new Configuration(false);
this.incrementalConf.set(JobContext.MAP_INPUT_FILE, fileSplit.getPath()
.toString());
this.incrementalConf.setLong(JobContext.MAP_INPUT_START,
fileSplit.getStart());
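      // Note: JobContext.MAP_INPUT_PATH resolves to the
      // "mapreduce.map.input.length" property in Hadoop, so setting the split
      // length here appears intentional (mirroring MapTask#updateJobWithSplit).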
this.incrementalConf.setLong(JobContext.MAP_INPUT_PATH,
fileSplit.getLength());
}
LOG.info("Processing split: " + inputSplit);
}
private long getInputBytes() {
if (fsStats == null) return 0;
long bytesRead = 0;
for (Statistics stat: fsStats) {
bytesRead = bytesRead + stat.getBytesRead();
}
return bytesRead;
}
protected TaskSplitMetaInfo[] readSplits(Configuration conf)
throws IOException {
TaskSplitMetaInfo[] allTaskSplitMetaInfo;
allTaskSplitMetaInfo = SplitMetaInfoReaderTez.readSplitMetaInfo(conf,
FileSystem.getLocal(conf));
return allTaskSplitMetaInfo;
}
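  /**
   * Adapts both the old (mapred) and new (mapreduce) RecordReader APIs to
   * Tez's {@link KeyValueReader}. With the old API, the key/value objects are
   * created once and reused across calls to {@link #next()}.
   */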
private class MRInputKVReader implements KeyValueReader {
Object key;
Object value;
private final boolean localNewApi;
MRInputKVReader() {
localNewApi = useNewApi;
if (!localNewApi) {
key = oldRecordReader.createKey();
        value = oldRecordReader.createValue();
}
}
    // Set up the key/value holders once (old API) and reuse them on every call
    // to next() to avoid creating large numbers of short-lived objects.
@SuppressWarnings("unchecked")
@Override
public boolean next() throws IOException {
boolean hasNext = false;
long bytesInPrev = getInputBytes();
if (localNewApi) {
try {
hasNext = newRecordReader.nextKeyValue();
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
throw new IOException("Interrupted while checking for next key-value", e);
}
} else {
hasNext = oldRecordReader.next(key, value);
}
long bytesInCurr = getInputBytes();
fileInputByteCounter.increment(bytesInCurr - bytesInPrev);
if (hasNext) {
inputRecordCounter.increment(1);
}
return hasNext;
}
@Override
public Object getCurrentKey() throws IOException {
if (localNewApi) {
try {
return newRecordReader.getCurrentKey();
} catch (InterruptedException e) {
throw new IOException("Interrupted while fetching next key", e);
}
} else {
return key;
}
}
@Override
public Object getCurrentValue() throws IOException {
if (localNewApi) {
try {
return newRecordReader.getCurrentValue();
} catch (InterruptedException e) {
throw new IOException("Interrupted while fetching next value", e);
}
} else {
return value;
}
}
}
}