Package org.apache.hadoop.mapred

Source Code of org.apache.hadoop.mapred.CoronaJTState$RestoringClock

package org.apache.hadoop.mapred;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.corona.ResourceGrant;
import org.apache.hadoop.corona.ResourceRequest;
import org.apache.hadoop.corona.SessionDriver;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.mapred.CoronaStateUpdate.TaskLaunch;
import org.apache.hadoop.mapred.CoronaStateUpdate.TaskStatusUpdate;
import org.apache.hadoop.mapred.CoronaStateUpdate.TaskTimeout;

/**
* Holds the state updates that a remote CoronaJobTracker sends to the local
* one. A restarted remote JT uses them to restore its state after a failure.
*/
@SuppressWarnings("deprecation")
public class CoronaJTState implements Writable {
  /** Logger */
  public static final Log LOG = LogFactory.getLog(CoronaJTState.class);

  /** Updates in the same order as received */
  List<CoronaStateUpdate> updates = new ArrayList<CoronaStateUpdate>();
  /** Session id */
  private String sessionId = "";
  /** The number of remote job tracker failovers executed */
  int restartNum = 0;
 
  /**
   * Stores the number of remote job tracker failovers executed.
   * @param restartNum failover count to keep in this state
   */
  public void setRestartNum(int restartNum) {
    this.restartNum = restartNum;
  }
 
  /**
   * Stores the session id that is saved together with the updates.
   * @param sessionId session id to keep in this state
   */
  public void setSessionId(String sessionId) {
    this.sessionId = sessionId;
  }

  @Override
  public void write(DataOutput out) throws IOException {
    WritableUtils.writeVInt(out, updates.size());
    for (CoronaStateUpdate update : updates) {
      update.write(out);
    }
    Text.writeString(out, sessionId);
    WritableUtils.writeVInt(out, restartNum);
  }

  @Override
  public void readFields(DataInput in) throws IOException {
    updates.clear();
    int size = WritableUtils.readVInt(in);
    for (int i = 0; i < size; ++i) {
      CoronaStateUpdate update = new CoronaStateUpdate();
      update.readFields(in);
      updates.add(update);
    }
    sessionId = Text.readString(in);
    restartNum = WritableUtils.readVInt(in);
  }

  /**
   * Appends a state update to the end of the list of saved updates.
   * @param update update to add
   */
  public void add(CoronaStateUpdate update) {
    updates.add(update);
  }

  /**
   * Prepares saved state for new JT by sorting the saved updates in place
   * according to their natural ordering.
   * @return this same CoronaJTState, ready to be consumed by restarting JT
   */
  public CoronaJTState prepare() {
    Collections.sort(updates);
    return this;
  }

  /**
   * Creates pretty report of saved state
   * @return string with report
   */
  public String getPrettyReport(JobID jobId) {
    Map<TaskAttemptID, TaskLaunch> lastLaunch =
        new HashMap<TaskAttemptID, CoronaStateUpdate.TaskLaunch>();
    Map<TaskAttemptID, TaskStatus.State> lastKnownStatus =
        new HashMap<TaskAttemptID, TaskStatus.State>();
    JTFailoverMetrics jtFailoverMetrics = new JTFailoverMetrics();
   
    for (CoronaStateUpdate update : updates) {
      if (update.getTaskLaunch() != null) {
        TaskLaunch launch = update.getTaskLaunch();
        lastLaunch.put(launch.getTaskId(), launch);
      } else if (update.getTaskStatus() != null) {
        TaskStatus status = update.getTaskStatus();
        lastKnownStatus.put(status.getTaskID(), status.getRunState());
        jtFailoverMetrics.update(status);
      }
    }
    StringBuilder result = new StringBuilder();
    result.append("CoronaJTState report");
    if (jobId != null) {
      result.append(" for job ").append(jobId);
    }
    for (CoronaStateUpdate update : updates) {
      TaskLaunch launch = update.getTaskLaunch();
      if (launch != null) {
        result.append("\n").append(launch).append(" last known ");
        result.append(lastKnownStatus.get(launch.getTaskId()));
      }
    }
    if (sessionId != null && !sessionId.isEmpty()) {
      result.append("\n Session id ").append(sessionId);
    }
    result.append("\nThis remoteJobTracker failover totally saved: ");
    result.append("\nmappers ").append(jtFailoverMetrics.savedMappers).
      append(" map cpu ").append(jtFailoverMetrics.savedMapCPU).
      append(" map wallclock ").append(jtFailoverMetrics.savedMapWallclock);
    result.append("\nreducers ").append(jtFailoverMetrics.savedReducers).
      append(" reduce cpu ").append(jtFailoverMetrics.savedReduceCPU).
      append(" reduce wallclock ").append(jtFailoverMetrics.savedReduceWallclock);
    return result.toString();
  }

  /**
   * Returns the pretty report of this state without naming a job.
   * @return same text as {@code getPrettyReport(null)}
   */
  @Override
  public String toString() {
    return getPrettyReport(null);
  }
 
  public static class JTFailoverMetrics {
    int savedMappers = 0;
    int savedReducers = 0;
    long savedMapCPU = 0L;
    long savedReduceCPU = 0L;
    long savedMapWallclock = 0L;
    long savedReduceWallclock = 0L;
    int restartNum = 0;
    long fetchStateCost = 0L;
   
    public void update(TaskStatus status) {
      if (status.getRunState() != TaskStatus.State.COMMIT_PENDING &&
          status.getRunState() != TaskStatus.State.SUCCEEDED) {
        return;
      }
     
      if (status.getIsMap()) {
        savedMappers += 1;
        savedMapCPU +=
            status.getCounters().getCounter(Task.Counter.CPU_MILLISECONDS);
        savedMapWallclock +=
            status.getCounters().getCounter(Task.Counter.MAP_TASK_WALLCLOCK);
      } else {
        savedReducers += 1;
        savedReduceCPU +=
            status.getCounters().getCounter(Task.Counter.CPU_MILLISECONDS);
        savedReduceWallclock +=
            status.getCounters().getCounter(Task.Counter.REDUCE_TASK_WALLCLOCK);
      }
    }
  }

  /**
   * This class defines how state updates are sent to local JT from remote one.
   */
  public static class Submitter {
    /** Attempt id of the job tracker running this submitter */
    private TaskAttemptID jtAttemptId;
    /** Destination where status updates will be saved */
    InterCoronaJobTrackerProtocol localJT;
    /** Information pending processing and sending */
    private LinkedBlockingQueue<CoronaStateUpdate> pendingProcessing;
    /** Indicates whether submitting thread is running */
    private volatile boolean running = true;
    /** Submitting thread */
    private Thread submitterThread;

    /**
     * Creates submitter that discards all state updates: no destination is
     * set, so every submit call returns without doing anything and no
     * submitting thread is started.
     */
    public Submitter() {
    }

    /**
     * Creates submitter of status updates to given destination.
     * @param localJT destination of submits
     * @param jtAttemptId attempt id of job tracker running this submitter
     */
    public Submitter(InterCoronaJobTrackerProtocol localJT,
        TaskAttemptID jtAttemptId,
        JobConf conf) {
      pendingProcessing = new LinkedBlockingQueue<CoronaStateUpdate>();
      this.localJT = localJT;
      this.jtAttemptId = jtAttemptId;
      submitterThread = new Thread(new AsyncSubmitter(conf));
      submitterThread.start();
    }

    /**
     * Determines whether submitter can send updates to its destination,
     * i.e. whether a destination is currently set.
     * @return true iff updates can be sent
     */
    public boolean canSubmit() {
      return (localJT != null);
    }

    /**
     * Submits state update to destination. This call can delay sending of
     * update depending on its type.
     * @param launch task launch event
     * @throws IOException
     */
    public void submit(TaskLaunch launch) throws IOException {
      if (localJT == null || launch == null)
        return;
      try {
        // We're sending TaskLaunch updates synchronously
        localJT.pushCoronaJobTrackerStateUpdate(jtAttemptId,
            new CoronaStateUpdate[] { new CoronaStateUpdate(launch) });
        // pendingProcessing.offer(launch);
      } catch (IOException e) {
        LOG.error("Failed to push update, failing submitter", e);
        close();
      }
    }

    /**
     * Submits tracker status update. This call can delay sending of
     * update depending on its type.
     * @param tracker task tracker status to generate update from
     * @throws IOException
     */
    public void submit(TaskTrackerStatus tracker) throws IOException {
      if (localJT == null || tracker == null)
        return;
      pendingProcessing.offer(new CoronaStateUpdate(tracker));
    }

    /**
     * Submits task status update. This call can delay sending of update
     * depending on its type.
     * @param status task status to generate update from
     * @throws IOException
     */
    public void submit(TaskStatus status) throws IOException {
      if (localJT == null || status == null)
        return;
      if (TaskStatus.TERMINATING_STATES.contains(status.getRunState())
          || TaskStatus.State.COMMIT_PENDING.equals(status.getRunState())) {
        pendingProcessing.offer(new CoronaStateUpdate(status));
      }
    }

    /**
     * Submits task status update. This call can delay sending of update
     * depending on its type.
     * @param timeout TaskTimout update to save
     */
    public void submit(TaskTimeout timeout) {
      if (localJT == null || timeout == null)
        return;
      pendingProcessing.offer(new CoronaStateUpdate(timeout));
    }

    /**
     * Closes submitter
     */
    public void close() {
      running = false;
      if (submitterThread != null) {
        submitterThread.interrupt();
        try {
          submitterThread.join();
        } catch (InterruptedException e) {
        }
      }
      jtAttemptId = null;
      localJT = null;
      pendingProcessing = null;
    }

    /**
     * Thread that asynchronously process and submits state updates
     */
    private class AsyncSubmitter implements Runnable {
      /** Max processed pending updates per batch */
      private static final int MAX_BATCH_UPDATES_DEFAULT= 1000;
      /** Keeps track of the most recent tracker info */
      private Map<String, TaskTrackerInfo> trackerToInfo =
          new HashMap<String, TaskTrackerInfo>();
      /** The configure key for RJT to update the state to
       * local job tracker*/
      public static final String MAX_BATCH_UPDATES_SIZE =
          "corona.jt.state.max.batch.updates.size";
      /** The configure key for the wait timeout value when RJT updating
       * the state before getting the max batch update size in
       * millis*/
      public static final String MAX_BATCH_UPDATES_WAITTIME =
          "corona.jt.state.batch.update.waittime";
      private static final long MAX_BATCH_UPDATES_WAITTIME_DEFAULT = 1L;
     
      private long batchUpdateTimeout;
      private int maxBatchUpdateSize;
     
      public AsyncSubmitter(JobConf conf) {
        maxBatchUpdateSize = conf.getInt(
            MAX_BATCH_UPDATES_SIZE, MAX_BATCH_UPDATES_DEFAULT);
        batchUpdateTimeout = conf.getLong(
            MAX_BATCH_UPDATES_WAITTIME,
            MAX_BATCH_UPDATES_WAITTIME_DEFAULT);
      }

      /**
       * Main loop of the submitting thread. While the submitter is running
       * it collects up to {@code maxBatchUpdateSize} queued updates,
       * preprocesses them depending on their payload type and pushes the
       * collected batch to the local JT in a single call. A failed push
       * closes the submitter, which ends this loop.
       */
      @Override
      public void run() {
        List<CoronaStateUpdate> toSend = new ArrayList<CoronaStateUpdate>(
            maxBatchUpdateSize);
        while (running) {
          // Collect one batch; the counter counts taken updates, including
          // those that end up not being added to the batch
          for (int updates = 0; updates < maxBatchUpdateSize; updates++) {
            CoronaStateUpdate update;
            try {
              if (toSend.isEmpty()) {
                // Nothing collected yet: block until an update arrives
                update = pendingProcessing.take();
              } else {
                // We already have things to send, but wait a short time for
                // more. Pushing every update separately would introduce a
                // bigger lag in this thread than this wait, and more
                // updates could get lost while they are still pending
                update = pendingProcessing.poll(batchUpdateTimeout, TimeUnit.MILLISECONDS);
              }
            } catch (InterruptedException e) {
              // Interrupted (close() does this): stop collecting, but fall
              // through to the sending code so that collected updates are
              // not lost; the running flag is checked by the outer loop
              break;
            }
            if (update == null) {
              // Timed out waiting for more updates, send what we have
              break;
            }
            Object obj = update.get();
            // Classify different objects
            if (obj instanceof TaskLaunch) {
              // Launching task, no preprocessing
              toSend.add(update);
            } else if (obj instanceof TaskTrackerStatus) {
              TaskTrackerStatus tracker = (TaskTrackerStatus) obj;
              String trackerName = tracker.getTrackerName();
              // Send new TaskTrackerInfo update only if it differs from the
              // last info remembered for this tracker
              TaskTrackerInfo info = TaskTrackerInfo.fromStatus(tracker);
              TaskTrackerInfo savedInfo = trackerToInfo.get(trackerName);
              if (savedInfo == null || !savedInfo.equals(info)) {
                trackerToInfo.put(trackerName, info);
                update.set(info);
                toSend.add(update);
              }
            } else if (obj instanceof TaskStatus) {
              TaskStatus report = (TaskStatus) obj;
              // Encapsulate to provide tracker name
              update.set(new TaskStatusUpdate(report));
              toSend.add(update);
            } else if (obj instanceof TaskTimeout) {
              // Timed out running or launching task
              toSend.add(update);
            } else {
              LOG.error("Unknown type of update");
            }
          }
          // Send batch
          if (!toSend.isEmpty()) {
            try {
              localJT.pushCoronaJobTrackerStateUpdate(jtAttemptId, toSend
                  .toArray(CoronaStateUpdate.EMPTY_ARRAY));
              LOG.info("Batch of " + toSend.size() + " updates sent.");
              toSend.clear();
            } catch (IOException e) {
              // The batch is not retried: close() clears the running flag,
              // so the outer loop ends after this iteration
              LOG.error("Failed to push updates", e);
              close();
            }
          }
        }
        LOG.info("AsyncSubmitter exiting.");
      }

    }

  }

  /**
   * Fetches and serves queries for saved state, not designed for concurrent
   * access
   */
  public static class Fetcher {
    /** Id of session saved with this state */
    private String sessionId;
    /** List of updates in the same order as submitted to local JT */
    private List<CoronaStateUpdate> updates;
    /** Maps tracker name to TaskTrackerInfo */
    private Map<String, TaskTrackerInfo> trackerToInfo =
        new HashMap<String, TaskTrackerInfo>();
    /** Clock used for restoring proper timestamps in JT */
    private RestoringClock clock;
    /** The metrics to record the impact of RJT failover*/
    JTFailoverMetrics jtFailoverMetrics =
        new JTFailoverMetrics();
    /** The known trackers **/
    private Set<String> taskLaunchTrackers =
        new HashSet<String>();
   
    /**
     * Creates empty fetcher (which state can't be filled). Session id and
     * updates stay unset, so {@link #hasTasksState()} reports false.
     */
    public Fetcher() {
    }

    /**
     * When restoring JT status after restarting, it's possible that we have
     * several task attempts that were using the same grant. Only the most
     * recent task attempt is still using this grant, All finished restored
     * attempts should declare null grant. The last launched attempt for each
     * given grant is the attempt assumed to be running using this grant, rest
     * attempts must declare null grant.
     * @param parent local JT to fetch state from
     * @param jtAttemptId task attempt id of job tracker running this fetcher
     */
    public Fetcher(InterCoronaJobTrackerProtocol parent,
        TaskAttemptID jtAttemptId) {
      CoronaJTState state;
      long startFetchingTime = System.currentTimeMillis();
     
      try {
        state = parent.getCoronaJobTrackerState(jtAttemptId);
      } catch (IOException e) {
        LOG.error("Error when fetching state from parent JT. Proceeding with"
            + " cleared state. ", e);
        close();
        return;
      }
      // State parts
      this.sessionId = state.sessionId;
      this.updates = Collections.unmodifiableList(state.updates);
      this.jtFailoverMetrics.restartNum = state.restartNum;
     
      for (Iterator<CoronaStateUpdate> iter = updates.iterator();
          iter.hasNext();) {
        CoronaStateUpdate update = iter.next();
        // Process task status updates for queries,
        // preserve order for each tracker
        TaskStatus status = update.getTaskStatus();
        if (status != null) {
          jtFailoverMetrics.update(status);
          continue;
        }
        // Set non-existing grants in every attempt, prepare mapping from
        // grant to last attempt that was using this grant
        TaskLaunch launch = update.getTaskLaunch();
        if (launch != null) {
          Integer grant = launch.getGrantId();
          // assign non-existing grant, we will kill all the unfinished tasks
          launch.setGrantId(ResourceTracker.getNoneGrantId());
          taskLaunchTrackers.add(launch.getTrackerName());
          continue;
        }
        // Save tracker info for replaying task status
        TaskTrackerInfo info = update.getTrackerInfo();
        if (info != null) {
          trackerToInfo.put(info.getTrackerName(), info);
          continue;
        }
      }
      trackerToInfo = Collections.unmodifiableMap(trackerToInfo);
      taskLaunchTrackers = Collections.unmodifiableSet(taskLaunchTrackers);
     
      jtFailoverMetrics.fetchStateCost = System.currentTimeMillis() - startFetchingTime;
      LOG.info(jtFailoverMetrics.fetchStateCost + " milliseconds used to do state fetching");
    }

    /**
     * Returns saved session if any
     * @return saved session id or null
     */
    public String getSessionId() {
      if (sessionId == null || sessionId.isEmpty()) {
        return null;
      }
      return sessionId;
    }

    /**
     * Determines whether tasks state has been restored
     * @return true iff tasks state has been restored
     */
    public boolean hasTasksState() {
      return (sessionId != null && updates != null);
    }

    /**
     * Wipes out all state
     */
    public void close() {
      sessionId = null;
      updates = null;
      taskLaunchTrackers = null;
      trackerToInfo = null;
      clock = null;
    }

    /**
     * Restores fetched state updates in the same order that they were saved
     * @param remoteJT JobTracekr to restore state
     */
    public void restoreState(StateRestorer remoteJT) {
      if (!hasTasksState())
        return;
      Clock oldClock = remoteJT.getClock();
      clock = new RestoringClock();
      remoteJT.setClock(clock);
      LOG.info("Begin to restoreState");
      long restoreTime = oldClock.getTime();
     
      for (Iterator<CoronaStateUpdate> iter = updates.iterator();
          iter.hasNext();) {
        CoronaStateUpdate update = iter.next();
        clock.setTimestamp(update.getTimestamp());
        LOG.info("Current timestamp " + update.getTimestamp());
        TaskStatus status = update.getTaskStatus();
        if (status != null) {
          TaskTrackerInfo info = trackerToInfo.get(status.getTaskTracker());
          if (info != null) {
            remoteJT.restoreTaskStatus(status, info);
            LOG.info("Restoring status " + status + " @ " + info);
          } else {
            // it is safe for us to kill more uncertain tasks
            LOG.error("Skipping status " + status + " because of null TaskTracker info");
          }
          continue;
        }
        TaskLaunch launch = update.getTaskLaunch();
        if (launch != null) {
          LOG.info("Restoring launch " + launch);
          remoteJT.restoreTaskLaunch(launch);
          continue;
        }
        TaskTimeout timeout = update.getTaskTimeout();
        if (timeout != null) {
          String trackerName = timeout.getTrackerName();
          LOG.info("Restoring timeout on " + trackerName);
          remoteJT.restoreTaskTimeout(trackerName);
          continue;
        }
      }
      long restoreCost = oldClock.getTime() - restoreTime;
     
      LOG.info("End the restoreState, totally " + restoreCost + "milliseconds used.");
      remoteJT.setClock(oldClock);
    }
   
    /**
     * Returns a set of task trackers that was in use during previous remote JT
     * life, i.e. the tracker names taken from the fetched task launch
     * updates. The field is set to null by {@link #close()}, so null is
     * returned after the fetcher has been closed.
     * @return set of task tracker's names
     */
    public Set<String> getTaskLaunchTrackers() {
      return taskLaunchTrackers;
    }
  }

  /**
   * Clock that allows us to restore time as saved with status updates during
   * restarting process
   */
  public static class RestoringClock extends Clock {
    /** Current manually set timestamp */
    private volatile long timestamp;
    /** Determines whether we're using real or manually set timesamps */
    private volatile boolean useRealTimestamps = false;

    /**
     * Switches to using real timestamps
     */
    public void useRealTimestamps() {
      useRealTimestamps = true;
    }

    /**
     * Sets current timestamp
     * @param timestamp time to set
     */
    public void setTimestamp(long timestamp) {
      useRealTimestamps = false;
      this.timestamp = timestamp;
    }

    @Override
    public long getTime() {
      if (useRealTimestamps) {
        return super.getTime();
      } else {
        return timestamp;
      }
    }
  }

  /**
   * Contract between remote JT and Fetcher defining functions for restoring
   * state
   */
  public interface StateRestorer {

    /**
     * Set clock in object that restores it's state
     * @param clock clock to use
     */
    public void setClock(Clock clock);

    /**
     * Get clock being used in object that restores it's state
     * @return clock used currently by state restorer
     */
    public Clock getClock();

    /**
     * Restores task timeout event for provided task tracker
     * @param trackerName
     */
    public void restoreTaskTimeout(String trackerName);

    /**
     * Restore task launch
     * @param launch a TaskLaunch
     */
    public void restoreTaskLaunch(TaskLaunch launch);

    /**
     * Restore task status update saved from heartbeat report
     * @param status a TaskStatus
     * @param tracker a TaskStatusInfo of tracker that sent this update
     */
    public void restoreTaskStatus(TaskStatus status, TaskTrackerInfo tracker);
  }

}
TOP

Related Classes of org.apache.hadoop.mapred.CoronaJTState$RestoringClock

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.