Source Code of org.apache.tez.mapreduce.input.MRInput$MRInputKVReader

/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tez.mapreduce.input;

import java.io.IOException;
import java.util.List;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience.Private;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileSystem.Statistics;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.serializer.Deserializer;
import org.apache.hadoop.io.serializer.SerializationFactory;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobContext;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.TaskAttemptID;
import org.apache.hadoop.mapred.TaskID;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskType;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormatCounter;
import org.apache.hadoop.mapreduce.split.JobSplit.TaskSplitIndex;
import org.apache.hadoop.mapreduce.split.JobSplit.TaskSplitMetaInfo;
import org.apache.hadoop.mapreduce.split.SplitMetaInfoReaderTez;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.tez.common.counters.TaskCounter;
import org.apache.tez.common.counters.TezCounter;
import org.apache.tez.mapreduce.common.Utils;
import org.apache.tez.mapreduce.hadoop.MRHelpers;
import org.apache.tez.mapreduce.hadoop.MRJobConfig;
import org.apache.tez.mapreduce.hadoop.mapred.MRReporter;
import org.apache.tez.mapreduce.hadoop.mapreduce.TaskAttemptContextImpl;
import org.apache.tez.mapreduce.protos.MRRuntimeProtos.MRInputUserPayloadProto;
import org.apache.tez.mapreduce.protos.MRRuntimeProtos.MRSplitProto;
import org.apache.tez.runtime.api.Event;
import org.apache.tez.runtime.api.Input;
import org.apache.tez.runtime.api.LogicalInput;
import org.apache.tez.runtime.api.TezInputContext;
import org.apache.tez.runtime.api.events.RootInputDataInformationEvent;
import org.apache.tez.runtime.library.api.KeyValueReader;

import com.google.common.base.Preconditions;

/**
 * {@link MRInput} is an {@link Input} which provides key/value pairs
 * to the consumer.
*
* It is compatible with all standard Apache Hadoop MapReduce
* {@link InputFormat} implementations.
*
* This class is not meant to be extended by external projects.
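 *
 * A minimal usage sketch from a consuming processor (hedged: the processor
 * wiring and the vertex name "src" are assumptions, not part of this class):
 * <pre>{@code
 * // Inside a processor, with the logical inputs keyed by source vertex name.
 * MRInput mrInput = (MRInput) inputs.get("src");
 * KeyValueReader reader = mrInput.getReader();
 * while (reader.next()) {
 *   Object key = reader.getCurrentKey();
 *   Object value = reader.getCurrentValue();
 *   // process the key/value pair
 * }
 * }</pre>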
*/

public class MRInput implements LogicalInput {

  private static final Log LOG = LogFactory.getLog(MRInput.class);
 
  // rrLock/rrInited coordinate record reader initialization between the
  // event-delivery path (processSplitEvent) and the processor thread that may
  // block in getReader() via checkAndAwaitRecordReaderInitialization().
  private final Lock rrLock = new ReentrantLock();
  private final Condition rrInited = rrLock.newCondition();
  private TezInputContext inputContext;
 
  private volatile boolean eventReceived = false;
 
  private JobConf jobConf;
  private Configuration incrementalConf;
  private boolean readerCreated = false;
 
  boolean useNewApi;
 
  org.apache.hadoop.mapreduce.TaskAttemptContext taskAttemptContext;

  @SuppressWarnings("rawtypes")
  private org.apache.hadoop.mapreduce.InputFormat newInputFormat;
  @SuppressWarnings("rawtypes")
  private org.apache.hadoop.mapreduce.RecordReader newRecordReader;
  protected org.apache.hadoop.mapreduce.InputSplit newInputSplit;

  @SuppressWarnings("rawtypes")
  private InputFormat oldInputFormat;
  @SuppressWarnings("rawtypes")
  protected RecordReader oldRecordReader;
  protected InputSplit oldInputSplit;

  protected TaskSplitIndex splitMetaInfo = new TaskSplitIndex();
 
  private TezCounter inputRecordCounter;
  private TezCounter fileInputByteCounter;
  private List<Statistics> fsStats;
 
  @Private
  volatile boolean splitInfoViaEvents;
 
 
  @Override
  public List<Event> initialize(TezInputContext inputContext) throws IOException {
    this.inputContext = inputContext;
    this.inputContext.requestInitialMemory(0L, null); // mandatory call
    this.inputContext.inputIsReady();
    MRInputUserPayloadProto mrUserPayload =
      MRHelpers.parseMRInputPayload(inputContext.getUserPayload());
    Preconditions.checkArgument(!mrUserPayload.hasSplits(),
        "Split information not expected in MRInput");
    Configuration conf =
      MRHelpers.createConfFromByteString(mrUserPayload.getConfigurationBytes());
    this.jobConf = new JobConf(conf);

    TaskAttemptID taskAttemptId = new TaskAttemptID(
      new TaskID(
        Long.toString(inputContext.getApplicationId().getClusterTimestamp()),
        inputContext.getApplicationId().getId(), TaskType.MAP,
        inputContext.getTaskIndex()),
      inputContext.getTaskAttemptNumber());

    jobConf.set(MRJobConfig.TASK_ATTEMPT_ID,
      taskAttemptId.toString());
    jobConf.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID,
      inputContext.getDAGAttemptNumber());

    // TODO NEWTEZ Rename this to be specific to MRInput. This Input, in
    // theory, can be used by the MapProcessor, ReduceProcessor or a custom
    // processor. (The processor could provide the counter though)
    this.inputRecordCounter = inputContext.getCounters().findCounter(TaskCounter.MAP_INPUT_RECORDS);
    this.fileInputByteCounter = inputContext.getCounters().findCounter(FileInputFormatCounter.BYTES_READ);
   
    useNewApi = this.jobConf.getUseNewMapper();
    this.splitInfoViaEvents = jobConf.getBoolean(MRJobConfig.MR_TEZ_SPLITS_VIA_EVENTS,
        MRJobConfig.MR_TEZ_SPLITS_VIA_EVENTS_DEFAULT);
    LOG.info("Using New mapreduce API: " + useNewApi
        + ", split information via event: " + splitInfoViaEvents);

    initializeInternal();
    return null;
  }
 
  @Override
  public void start() {
  }

  // Two initialization paths: when split information arrives via an event, only
  // the InputFormat is set up here and the record reader is created later by
  // initFromEventInternal(); otherwise the split is read from the local split
  // file on disk and the record reader is created immediately.
  @Private
  void initializeInternal() throws IOException {
    // Lock held primarily for visibility of the fields initialized below.
    rrLock.lock();
    try {
      if (splitInfoViaEvents) {
        if (useNewApi) {
          setupNewInputFormat();
        } else {
          setupOldInputFormat();
        }
      } else {
        // Read split information.
        TaskSplitMetaInfo[] allMetaInfo = readSplits(jobConf);
        TaskSplitMetaInfo thisTaskMetaInfo = allMetaInfo[inputContext
            .getTaskIndex()];
        this.splitMetaInfo = new TaskSplitIndex(
            thisTaskMetaInfo.getSplitLocation(),
            thisTaskMetaInfo.getStartOffset());
        if (useNewApi) {
          setupNewInputFormat();
          newInputSplit = getNewSplitDetailsFromDisk(splitMetaInfo);
          setupNewRecordReader();
        } else {
          setupOldInputFormat();
          oldInputSplit = getOldSplitDetailsFromDisk(splitMetaInfo);
          setupOldRecordReader();
        }
      }
    } finally {
      rrLock.unlock();
    }
    LOG.info("Initialzed MRInput: " + inputContext.getSourceVertexName());
  }

  private void setupOldInputFormat() {
    oldInputFormat = this.jobConf.getInputFormat();
  }
 
  private void setupOldRecordReader() throws IOException {
    Preconditions.checkNotNull(oldInputSplit, "Input split hasn't been set up yet");
    List<Statistics> matchedStats = null;
    if (oldInputSplit instanceof FileSplit) {
      matchedStats = Utils.getFsStatistics(((FileSplit) oldInputSplit).getPath(), this.jobConf);
    }
    fsStats = matchedStats;
   
    long bytesInPrev = getInputBytes();
    oldRecordReader = oldInputFormat.getRecordReader(oldInputSplit,
        this.jobConf, new MRReporter(inputContext, oldInputSplit));
    long bytesInCurr = getInputBytes();
    fileInputByteCounter.increment(bytesInCurr - bytesInPrev);
    setIncrementalConfigParams(oldInputSplit);
  }
 
  private void setupNewInputFormat() throws IOException {
    taskAttemptContext = createTaskAttemptContext();
    Class<? extends org.apache.hadoop.mapreduce.InputFormat<?, ?>> inputFormatClazz;
    try {
      inputFormatClazz = taskAttemptContext.getInputFormatClass();
    } catch (ClassNotFoundException e) {
      throw new IOException("Unable to instantiate InputFormat class", e);
    }

    newInputFormat = ReflectionUtils.newInstance(inputFormatClazz, this.jobConf);
  }
 
  private void setupNewRecordReader() throws IOException {
    Preconditions.checkNotNull(newInputSplit, "Input split hasn't been set up yet");
    List<Statistics> matchedStats = null;
    if (newInputSplit instanceof org.apache.hadoop.mapreduce.lib.input.FileSplit) {
      matchedStats = Utils.getFsStatistics(
          ((org.apache.hadoop.mapreduce.lib.input.FileSplit)
              newInputSplit).getPath(), this.jobConf);
    }
    fsStats = matchedStats;
   
    try {
      newRecordReader = newInputFormat.createRecordReader(newInputSplit, taskAttemptContext);
      newRecordReader.initialize(newInputSplit, taskAttemptContext);
    } catch (InterruptedException e) {
      throw new IOException("Interrupted while creating record reader", e);
    }
  }

  @Override
  public KeyValueReader getReader() throws IOException {
    Preconditions.checkState(!readerCreated,
        "Only a single instance of record reader can be created for this input.");
    readerCreated = true;
    rrLock.lock();
    try {
      if (newRecordReader == null && oldRecordReader == null)
        checkAndAwaitRecordReaderInitialization();
    } finally {
      rrLock.unlock();
    }

    LOG.info("Creating reader for MRInput: "
        + inputContext.getSourceVertexName());
    return new MRInputKVReader();
  }

  @Override
  public void handleEvents(List<Event> inputEvents) throws Exception {
    if (eventReceived || inputEvents.size() != 1) {
      throw new IllegalStateException(
          "MRInput expects only a single input. Received: current eventListSize: "
              + inputEvents.size() + "Received previous input: "
              + eventReceived);
    }
    Event event = inputEvents.iterator().next();
    Preconditions.checkArgument(event instanceof RootInputDataInformationEvent,
        getClass().getSimpleName()
            + " can only handle a single event of type: "
            + RootInputDataInformationEvent.class.getSimpleName());

    processSplitEvent((RootInputDataInformationEvent)event);
  }

 

  @Override
  public void setNumPhysicalInputs(int numInputs) {
    // Not required at the moment. May be required if splits are sent via events.
  }

  @Override
  public List<Event> close() throws IOException {
    long bytesInPrev = getInputBytes();
    if (useNewApi) {
      newRecordReader.close();
    } else {
      oldRecordReader.close();
    }
    long bytesInCurr = getInputBytes();
    fileInputByteCounter.increment(bytesInCurr - bytesInPrev);
   
    return null;
  }

  /**
   * {@link MRInput} sets some additional parameters like split location when using
   * the new API. This method returns these additional updates, and
   * should be used by Processors using the old MapReduce API with {@link MRInput}.
   *
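   * A hedged sketch of how an old-API processor might apply these updates to its
   * JobConf (the mrInput and jobConf variables are assumed, not defined here):
   * <pre>{@code
   * Configuration updates = mrInput.getConfigUpdates();
   * if (updates != null) {
   *   for (java.util.Map.Entry<String, String> entry : updates) {
   *     jobConf.set(entry.getKey(), entry.getValue());
   *   }
   * }
   * }</pre>
   *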
   * @return the additional fields set by {@link MRInput}
   */
  public Configuration getConfigUpdates() {
    if (incrementalConf != null) {
      return new Configuration(incrementalConf);
    }
    return null;
  }
 
 

  public float getProgress() throws IOException, InterruptedException {
    if (useNewApi) {
      return newRecordReader.getProgress();
    } else {
      return oldRecordReader.getProgress();
    }
  }

 
  private TaskAttemptContext createTaskAttemptContext() {
    return new TaskAttemptContextImpl(this.jobConf, inputContext, true, null);
  }
 
  // Invoked from the event-handling path; initializes the record reader from the
  // split payload and signals any thread waiting in checkAndAwaitRecordReaderInitialization().
  void processSplitEvent(RootInputDataInformationEvent event)
      throws IOException {
    rrLock.lock();
    try {
      initFromEventInternal(event);
      LOG.info("Notifying on RecordReader Initialized");
      rrInited.signal();
    } finally {
      rrLock.unlock();
    }
  }
 
  // Blocks the thread that called getReader() until processSplitEvent() signals
  // that the record reader has been initialized from the split event.
  void checkAndAwaitRecordReaderInitialization() throws IOException {
    try {
      LOG.info("Awaiting RecordReader initialization");
      rrInited.await();
    } catch (Exception e) {
      throw new IOException(
          "Interrupted waiting for RecordReader initialization", e);
    }
  }

  @Private
  void initFromEvent(RootInputDataInformationEvent initEvent)
      throws IOException {
    rrLock.lock();
    try {
      initFromEventInternal(initEvent);
    } finally {
      rrLock.unlock();
    }
  }
 
  private void initFromEventInternal(RootInputDataInformationEvent initEvent)
      throws IOException {
    LOG.info("Initializing RecordReader from event");
    Preconditions.checkState(initEvent != null, "InitEvent must be specified");
    MRSplitProto splitProto = MRSplitProto
        .parseFrom(initEvent.getUserPayload());
    if (useNewApi) {
      newInputSplit = getNewSplitDetailsFromEvent(splitProto, jobConf);
      LOG.info("Split Details -> SplitClass: "
          + newInputSplit.getClass().getName() + ", NewSplit: " + newInputSplit);
      setupNewRecordReader();
    } else {
      oldInputSplit = getOldSplitDetailsFromEvent(splitProto, jobConf);
      LOG.info("Split Details -> SplitClass: "
          + oldInputSplit.getClass().getName() + ", OldSplit: " + oldInputSplit);
      setupOldRecordReader();
    }
    LOG.info("Initialized RecordReader from event");
  }

  @Private
  public static InputSplit getOldSplitDetailsFromEvent(MRSplitProto splitProto, Configuration conf)
      throws IOException {
    SerializationFactory serializationFactory = new SerializationFactory(conf);
    return MRHelpers.createOldFormatSplitFromUserPayload(splitProto, serializationFactory);
  }
 
  @SuppressWarnings("unchecked")
  private InputSplit getOldSplitDetailsFromDisk(TaskSplitIndex splitMetaInfo)
      throws IOException {
    Path file = new Path(splitMetaInfo.getSplitLocation());
    FileSystem fs = FileSystem.getLocal(jobConf);
    file = fs.makeQualified(file);
    LOG.info("Reading input split file from : " + file);
    long offset = splitMetaInfo.getStartOffset();
   
    FSDataInputStream inFile = fs.open(file);
    inFile.seek(offset);
    String className = Text.readString(inFile);
    Class<org.apache.hadoop.mapred.InputSplit> cls;
    try {
      cls =
          (Class<org.apache.hadoop.mapred.InputSplit>)
          jobConf.getClassByName(className);
    } catch (ClassNotFoundException ce) {
      IOException wrap = new IOException("Split class " + className +
          " not found");
      wrap.initCause(ce);
      throw wrap;
    }
    SerializationFactory factory = new SerializationFactory(jobConf);
    Deserializer<org.apache.hadoop.mapred.InputSplit> deserializer =
        (Deserializer<org.apache.hadoop.mapred.InputSplit>)
        factory.getDeserializer(cls);
    deserializer.open(inFile);
    org.apache.hadoop.mapred.InputSplit split = deserializer.deserialize(null);
    long pos = inFile.getPos();
    inputContext.getCounters().findCounter(TaskCounter.SPLIT_RAW_BYTES)
        .increment(pos - offset);
    inFile.close();
    return split;
  }

  @Private
  public static org.apache.hadoop.mapreduce.InputSplit getNewSplitDetailsFromEvent(
      MRSplitProto splitProto, Configuration conf) throws IOException {
    SerializationFactory serializationFactory = new SerializationFactory(conf);
    return MRHelpers.createNewFormatSplitFromUserPayload(
        splitProto, serializationFactory);
  }
 
  @SuppressWarnings("unchecked")
  private org.apache.hadoop.mapreduce.InputSplit getNewSplitDetailsFromDisk(
      TaskSplitIndex splitMetaInfo) throws IOException {
    Path file = new Path(splitMetaInfo.getSplitLocation());
    long offset = splitMetaInfo.getStartOffset();
   
    // Split information read from local filesystem.
    FileSystem fs = FileSystem.getLocal(jobConf);
    file = fs.makeQualified(file);
    LOG.info("Reading input split file from : " + file);
    FSDataInputStream inFile = fs.open(file);
    inFile.seek(offset);
    String className = Text.readString(inFile);
    Class<org.apache.hadoop.mapreduce.InputSplit> cls;
    try {
      cls =
          (Class<org.apache.hadoop.mapreduce.InputSplit>)
          jobConf.getClassByName(className);
    } catch (ClassNotFoundException ce) {
      IOException wrap = new IOException("Split class " + className +
          " not found");
      wrap.initCause(ce);
      throw wrap;
    }
    SerializationFactory factory = new SerializationFactory(jobConf);
    Deserializer<org.apache.hadoop.mapreduce.InputSplit> deserializer =
        (Deserializer<org.apache.hadoop.mapreduce.InputSplit>)
        factory.getDeserializer(cls);
    deserializer.open(inFile);
    org.apache.hadoop.mapreduce.InputSplit split =
        deserializer.deserialize(null);
    long pos = inFile.getPos();
    inputContext.getCounters().findCounter(TaskCounter.SPLIT_RAW_BYTES)
        .increment(pos - offset);
    inFile.close();
    return split;
  }

  private void setIncrementalConfigParams(InputSplit inputSplit) {
    if (inputSplit instanceof FileSplit) {
      FileSplit fileSplit = (FileSplit) inputSplit;
      this.incrementalConf = new Configuration(false);

      this.incrementalConf.set(JobContext.MAP_INPUT_FILE, fileSplit.getPath()
          .toString());
      this.incrementalConf.setLong(JobContext.MAP_INPUT_START,
          fileSplit.getStart());
      // Mirrors MapTask#updateJobWithSplit: in Hadoop's MRJobConfig the
      // MAP_INPUT_PATH constant (despite its name) is the key used for the split length.
      this.incrementalConf.setLong(JobContext.MAP_INPUT_PATH,
          fileSplit.getLength());
    }
    LOG.info("Processing split: " + inputSplit);
  }

  private long getInputBytes() {
    if (fsStats == null) return 0;
    long bytesRead = 0;
    for (Statistics stat: fsStats) {
      bytesRead = bytesRead + stat.getBytesRead();
    }
    return bytesRead;
  }

  protected TaskSplitMetaInfo[] readSplits(Configuration conf)
      throws IOException {
    TaskSplitMetaInfo[] allTaskSplitMetaInfo;
    allTaskSplitMetaInfo = SplitMetaInfoReaderTez.readSplitMetaInfo(conf,
        FileSystem.getLocal(conf));
    return allTaskSplitMetaInfo;
  }
 
  private class MRInputKVReader implements KeyValueReader {
   
    Object key;
    Object value;

    private final boolean localNewApi;
   
    MRInputKVReader() {
      localNewApi = useNewApi;
      if (!localNewApi) {
        key = oldRecordReader.createKey();
        value = oldRecordReader.createValue();
      }
    }
   
    // Setup the values iterator once, and set value on the same object each time
    // to prevent lots of objects being created.

   
    @SuppressWarnings("unchecked")
    @Override
    public boolean next() throws IOException {
      boolean hasNext = false;
      long bytesInPrev = getInputBytes();
      if (localNewApi) {
        try {
          hasNext = newRecordReader.nextKeyValue();
        } catch (InterruptedException e) {
          Thread.currentThread().interrupt();
          throw new IOException("Interrupted while checking for next key-value", e);
        }
      } else {
        hasNext = oldRecordReader.next(key, value);
      }
      long bytesInCurr = getInputBytes();
      fileInputByteCounter.increment(bytesInCurr - bytesInPrev);
     
      if (hasNext) {
        inputRecordCounter.increment(1);
      }
     
      return hasNext;
    }

    @Override
    public Object getCurrentKey() throws IOException {
      if (localNewApi) {
        try {
          return newRecordReader.getCurrentKey();
        } catch (InterruptedException e) {
          throw new IOException("Interrupted while fetching next key", e);
        }
      } else {
        return key;
      }
    }

    @Override
    public Object getCurrentValue() throws IOException {
      if (localNewApi) {
        try {
          return newRecordReader.getCurrentValue();
        } catch (InterruptedException e) {
          throw new IOException("Interrupted while fetching next value", e);
        }
      } else {
        return value;
      }
    }
  }

}