Source Code of org.apache.hadoop.mapred.RemoteJTProxy$Caller

/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.mapred;

import java.io.IOException;
import java.net.ConnectException;
import java.net.InetSocketAddress;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.corona.ResourceGrant;
import org.apache.hadoop.corona.ResourceRequest;
import org.apache.hadoop.corona.SessionDriver;
import org.apache.hadoop.corona.Utilities;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.ipc.ProtocolSignature;
import org.apache.hadoop.ipc.RPC;
import org.apache.hadoop.mapred.CoronaCommitPermission.CommitPermissionServer;
import org.apache.hadoop.mapred.CoronaSessionInfo.InetSocketAddressWritable;

/**
* The proxy used by the CoronaJobTracker in the client to communicate with
* the CoronaJobTracker running on a TaskTracker, in the case of a remote
* CoronaJobTracker.
*/
@SuppressWarnings("deprecation")
public class RemoteJTProxy implements InterCoronaJobTrackerProtocol,
    JobSubmissionProtocol {
  /** Logger */
  public static final Log LOG = LogFactory.getLog(CoronaJobTracker.class);
  /** Amount of time to wait for remote JT to launch. */
  public static final String REMOTE_JT_TIMEOUT_SEC_CONF =
      "mapred.coronajobtracker.remotejobtracker.wait";
  /** Flag used in tests: whether to exclude the failed remote job tracker. */
  public static final String REMOTE_JT_EXCLUDE_FAILED =
      "mpared.coronajobtracker.remotejobtracker.exclude";
  /** Default amount of time to wait for remote JT to launch. */
  public static final int REMOTE_JT_TIMEOUT_SEC_DEFAULT = 60;
  /** Timeout for an RPC call to the remote JT. */
  public static final String REMOTE_JT_RPC_TIMEOUT_SEC_CONF =
      "mapred.coronajobtracker.remotejobtracker.rpc.timeout";
  /** Default timeout for an RPC call to the remote JT. */
  public static final int REMOTE_JT_RPC_TIMEOUT_SEC_DEFAULT = 3600;
  /** Boolean, determines whether remote JT restart should restore state */
  public static final String REMOTE_JT_STATE_RESTORING_CONF =
      "mapred.coronajobtracker.remote.state.restoring";
  /** By default, use the state restoring mechanism. */
  public static final boolean REMOTE_JT_STATE_RESTORING_DEFAULT = true;
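  // A minimal sketch (not from the original file) of tuning these knobs on a
  // JobConf before creating the proxy; the values shown are illustrative only:
  //
  //   JobConf jobConf = new JobConf();
  //   jobConf.setInt(REMOTE_JT_TIMEOUT_SEC_CONF, 120);           // wait up to 2 minutes for launch
  //   jobConf.setInt(REMOTE_JT_RPC_TIMEOUT_SEC_CONF, 1800);      // 30 minute per-RPC timeout
  //   jobConf.setBoolean(REMOTE_JT_STATE_RESTORING_CONF, true);  // restore state after a restart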
 
  /** The proxy object to the CoronaJobTracker running in the cluster */
  private volatile JobSubmissionProtocol client;
  /** JobSubmissionProtocol client lock. */
  // A fair lock is needed to enable early recovery after a remote JT crash.
  // TODO: giving priority to writers would improve performance.
  private ReadWriteLock clientLock = new ReentrantReadWriteLock(true);
  /** The host where the remote Job Tracker is running. */
  private String remoteJTHost;
  /** The port where the remote Job Tracker is running. */
  private int remoteJTPort;
  /** The task id for the current attempt of running CJT */
  private TaskAttemptID currentAttemptId;
  /** The number of the current attempt */
  private int attempt;
  /** Job configuration */
  private final JobConf conf;
  /** Parent JobTracker */
  private final CoronaJobTracker jt;
  /** The remote JT resource grant. */
  private ResourceGrant remoteJTGrant;
  /** The id of the job */
  private final JobID jobId;
  /** Is true iff the job has been submitted */
  private boolean isJobSubmitted = false;
  /** The session id for the job tracker running in the cluster */
  private String remoteSessionId;
  /** The number of remote JT restart attempts. */
  private volatile int numRemoteJTFailures;
  /** The limit on the number of remote JT restart attempts. */
  private final int maxRemoteJTFailures;
  /** Current job attempt id */
  private JobID attemptJobId;
  /** Address of remote JT */
  private InetSocketAddress remoteJTAddr;
  /** Holds exceptions from restarting */
  public volatile IOException restartingException = null;
  /** Saved state updates from remote JT */
  private final CoronaJTState remoteJTState;
  /** Authority that gives permission to commit */
  private final CommitPermissionServer commitPermissionServer;
 
  private enum RemoteJTStatus {
    UNINITIALIZED,
    SUCCESS,
    FAILURE
  };
  private RemoteJTStatus remoteJTStatus;
 
  // Internal control flag for remote JT failover: when failover is enabled,
  // it decides whether a failed API call should trigger a failover.
  // Calls such as killJob and killTask must not trigger an RJT failover.
  protected volatile boolean isRestartable = true;

  /**
   * Construct a proxy for the remote job tracker
   * @param jt parent job tracker
   * @param jobId id of the job the proxy is created for
   * @param conf job configuration
   * @throws IOException
   */
  RemoteJTProxy(CoronaJobTracker jt, JobID jobId, JobConf conf) throws IOException {
    this.maxRemoteJTFailures = conf.getInt(CoronaJobTracker.MAX_JT_FAILURES_CONF,
        CoronaJobTracker.MAX_JT_FAILURES_DEFAULT);
    this.conf = conf;
    this.jt = jt;
    this.jobId = jobId;
    // Prepare first attempt.
    this.attemptJobId = jobId;
    attempt = 0;
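    // Use a partition id beyond any real map task id so the remote JT's
    // synthetic task id cannot collide with an actual map task.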
    int partitionId = conf.getNumMapTasks() + 100000;
    currentAttemptId = new TaskAttemptID(new TaskID(attemptJobId, true,
        partitionId), attempt);
    remoteJTStatus = RemoteJTStatus.UNINITIALIZED;
   
    // Prepare stuff for restoring state if necessary
    if (isStateRestoringEnabled(conf)) {
      remoteJTState = new CoronaJTState();
      commitPermissionServer = new CommitPermissionServer();
    } else {
      remoteJTState = null;
      commitPermissionServer = null;
    }
  }

  /** @return the session id of the remote job tracker running in the cluster */
  public String getRemoteSessionId() {
    return remoteSessionId;
  }

  // ///////////////////////////////////////////////////////////////////////////
  // InterCoronaJobTrackerProtocol
  // ///////////////////////////////////////////////////////////////////////////
  @Override
  public void reportRemoteCoronaJobTracker(
      String attempt,
      String host,
      int port,
      String sessionId) throws IOException {
    TaskAttemptID attemptId = TaskAttemptID.forName(attempt);
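    // Record the reported address and wake up any thread blocked in
    // startRemoteJT()/checkClient() waiting for the remote JT to come up.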
    synchronized (this) {
      checkAttempt(attemptId);
      initializeClientUnprotected(host, port, sessionId);
      this.notifyAll();
    }
  }
 
  @Override
  public InetSocketAddressWritable getNewJobTrackerAddress(
      InetSocketAddressWritable failedTracker) throws IOException {
    // Die immediately if restarting is disabled
    if (maxRemoteJTFailures == 0) {
      throw new IOException("Restarting remote JT is disabled.");
    }
    assert remoteJTAddr != null : "Not started, but got request to restart.";
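    // Normal RPC calls hold the read lock, while a restart holds the write
    // lock, so failing to acquire the read lock means a restart is in progress.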
    if (clientLock.readLock().tryLock()) {
      // We're not restarting, check address
      InetSocketAddress seenAddr = remoteJTAddr;
      clientLock.readLock().unlock();
      // seenAddr is safe to use: even if a restart happens later, this is the
      // address of a JT that is either fully running or dead
      // (not currently restarting).
      if (seenAddr.equals(failedTracker.getAddress())) {
        // Not restarted yet
        return null;
      } else {
        LOG.info("Serving new job tracker address request with " + seenAddr
            + " old " + failedTracker.getAddress());
        return new InetSocketAddressWritable(seenAddr);
      }
    } else {
      // Currently restarting
      return null;
    }
  }

  @Override
  public void pushCoronaJobTrackerStateUpdate(TaskAttemptID attempt,
      CoronaStateUpdate[] updates) throws IOException {
    checkAttempt(attempt);
    if (remoteJTState == null) {
      throw new IOException(
          "Logic error: got state update but state restoring is disabled");
    } else {
      synchronized (remoteJTState) {
        for (CoronaStateUpdate update : updates) {
          remoteJTState.add(update);
        }
      }
    }
  }

  @Override
  public CoronaJTState getCoronaJobTrackerState(TaskAttemptID attemptId)
      throws IOException {
    checkAttempt(attemptId);
    if (remoteJTState == null) {
      throw new IOException("Logic error: asked for remote JT state but state "
          + "restoring is disabled");
    } else {
      synchronized (remoteJTState) {
        return remoteJTState.prepare();
      }
    }
  }

  @Override
  public TaskAttemptID[] getAndSetCommitting(TaskAttemptID attemptId,
      TaskAttemptID[] toCommit) throws IOException {
    checkAttempt(attemptId);
    if (commitPermissionServer == null) {
      throw new IOException(
          "Logic error: got getAndSet for committing attempt "
              + "but commit permission server is down");
    } else {
      return commitPermissionServer.getAndSetCommitting(toCommit);
    }
  }

  @Override
  public ProtocolSignature getProtocolSignature(String protocol,
      long clientVersion, int clientMethodsHash) throws IOException {
    return ProtocolSignature.getProtocolSignature(
        this, protocol, clientVersion, clientMethodsHash);
  }

  @Override
  public long getProtocolVersion(String protocol, long clientVersion)
    throws IOException {
    if (protocol.equals(InterCoronaJobTrackerProtocol.class.getName())) {
      return InterCoronaJobTrackerProtocol.versionID;
    } else {
      throw new IOException("Unknown protocol " + protocol);
    }
  }

  /**
   * Increment the attempt number for launching a remote corona job tracker.
   * Must be called only when holding the object lock.
   */
  private void incrementAttemptUnprotected() {
    attempt++;
    currentAttemptId = new TaskAttemptID(new TaskID(attemptJobId,
        currentAttemptId.isMap(), currentAttemptId.getTaskID().getId()),
        attempt);
  }

  /**
   * Checks that the provided remote JT attempt id matches the currently set
   * one, and throws if it does not.
   * @param attemptId attempt id to check
   * @throws IOException if the attempt ids do not match
   */
  private void checkAttempt(TaskAttemptID attemptId) throws IOException {
    if (!attemptId.equals(currentAttemptId)) {
      throw new IOException("Attempt " + attemptId
          + " does not match current attempt " + currentAttemptId);
    }
  }

  /**
   * Create the RPC client to the remote corona job tracker.
   * @param host The host running the remote corona job tracker.
   * @param port The port of the remote corona job tracker.
   * @param sessionId The session for the remote corona job tracker.
   * @throws IOException
   */
  void initializeClientUnprotected(String host, int port, String sessionId)
    throws IOException {
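    // Idempotent: if the client proxy already exists (e.g. after a repeated
    // report from the remote JT), keep it as-is.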
    if (client != null) {
      return;
    }
    LOG.info("Creating JT client to " + host + ":" + port);
    long connectTimeout = RemoteJTProxy.getRemotJTTimeout(conf);
    int rpcTimeout = RemoteJTProxy.getRemoteJTRPCTimeout(conf);
    remoteJTAddr = new InetSocketAddress(host, port);
    client = RPC.waitForProtocolProxy(
      JobSubmissionProtocol.class,
      JobSubmissionProtocol.versionID,
      remoteJTAddr,
      conf,
      connectTimeout,
      rpcTimeout
      ).getProxy();
    remoteJTStatus = RemoteJTStatus.SUCCESS;
    remoteJTHost = host;
    remoteJTPort = port;
    remoteSessionId = sessionId;
   
    if (remoteJTState != null) {
      remoteJTState.setSessionId(sessionId);
    }
  }

  private void reinitClientUnprotected() throws IOException {
    if (client != null) {
      RPC.stopProxy(client);
      client = null;
      remoteJTStatus = RemoteJTStatus.UNINITIALIZED;
    }
   
    try {
      initializeClientUnprotected(remoteJTHost, remoteJTPort, remoteSessionId);
    } finally {
      if (client == null) {
        remoteJTStatus = RemoteJTStatus.FAILURE;
      }
    }
  }

  /**
   * Waits for the remote Corona JT to be ready.
   * This involves
   *    - getting a JOBTRACKER resource from the cluster manager.
   *    - starting the remote job tracker by connecting to the corona task
   *      tracker on the machine.
   *    - waiting for the remote job tracker to report its port back to this
   *      process.
   * @param jobConf The job configuration to use.
   * @throws IOException
   */
  public void waitForJTStart(JobConf jobConf) throws IOException {
    int maxJTAttempts = jobConf.getInt(
        "mapred.coronajobtracker.remotejobtracker.attempts", 4);
    ResourceTracker resourceTracker = jt.getResourceTracker();
    SessionDriver sessionDriver = jt.getSessionDriver();
    List<ResourceGrant> excludeGrants = new ArrayList<ResourceGrant>();
    boolean toExcludeFailed = jobConf.getBoolean(REMOTE_JT_EXCLUDE_FAILED, true);
    // Release and blacklist failed JT grant.
    if (remoteJTGrant != null) {
      if (toExcludeFailed) {
        excludeGrants.add(remoteJTGrant);
      }
      resourceTracker.releaseResource(remoteJTGrant.getId());
      sessionDriver.releaseResources(resourceTracker.getResourcesToRelease());
    }
    for (int i = 0; i < maxJTAttempts; i++) {
      try {
        remoteJTGrant = waitForJTGrant(resourceTracker, sessionDriver,
            excludeGrants);
        boolean success = startRemoteJT(jobConf, remoteJTGrant);
        if (success) {
          return;
        } else {
          excludeGrants.add(remoteJTGrant);
          resourceTracker.releaseResource(remoteJTGrant.getId());
          List<ResourceRequest> released =
            resourceTracker.getResourcesToRelease();
          sessionDriver.releaseResources(released);
        }
      } catch (InterruptedException e) {
        throw new IOException(e);
      }
    }
    throw new IOException("Could not start remote JT after " + maxJTAttempts +
      " attempts");
  }

  /**
   * Wait for a JOBTRACKER grant.
   * @param resourceTracker The resource tracker object for getting the grant
   * @param sessionDriver The session driver for getting the grant
   * @param previousGrants Previous grants that could not be used successfully.
   * @return A new JOBTRACKER grant.
   * @throws IOException
   * @throws InterruptedException
   */
  private ResourceGrant waitForJTGrant(
      ResourceTracker resourceTracker,
      SessionDriver sessionDriver,
      List<ResourceGrant> previousGrants)
    throws IOException, InterruptedException {
    LOG.info("Waiting for JT grant for " + attemptJobId);
    ResourceRequest req = resourceTracker.newJobTrackerRequest();
    for (ResourceGrant prev: previousGrants) {
      LOG.info("Adding " + prev.getNodeName() + " to excluded hosts");
      req.addToExcludeHosts(prev.getAddress().getHost());
    }
    resourceTracker.recordRequest(req);
    List<ResourceRequest> newRequests = resourceTracker.getWantedResources();
    sessionDriver.requestResources(newRequests);
    final List<ResourceGrant> grants = new ArrayList<ResourceGrant>();
    ResourceTracker.ResourceProcessor proc =
      new ResourceTracker.ResourceProcessor() {
        @Override
        public boolean processAvailableResource(ResourceGrant resource) {
          grants.add(resource);
          final boolean consumed = true;
          return consumed;
        }
      };
    while (true) {
      // Try to get JT grant while periodically checking for session driver
      // exceptions.
      long timeout = 60 * 1000; // 1 min.
      resourceTracker.processAvailableGrants(proc, 1, timeout);
      IOException e = sessionDriver.getFailed();
      if (e != null) {
        throw e;
      }
      if (!grants.isEmpty()) {
        return grants.get(0);
      }
    }
  }

  /**
   * Start corona job tracker on the machine provided by using the corona
   * task tracker API.
   * @param jobConf The job configuration.
   * @param grant The grant that specifies the remote machine.
   * @return A boolean indicating success.
   * @throws InterruptedException
   */
  private boolean startRemoteJT(
    JobConf jobConf,
    ResourceGrant grant) throws InterruptedException {
    org.apache.hadoop.corona.InetAddress ttAddr =
      Utilities.appInfoToAddress(grant.appInfo);
    CoronaTaskTrackerProtocol coronaTT = null;
    try {
      coronaTT = jt.getTaskTrackerClient(ttAddr.getHost(), ttAddr.getPort());
    } catch (IOException e) {
      LOG.error("Error while trying to connect to TT at " + ttAddr.getHost() +
        ":" + ttAddr.getPort(), e);
      return false;
    }
    LOG.warn("Starting remote JT for " + attemptJobId
        + " on " + ttAddr.getHost());

    // Get a special map id for the JT task.
    Path systemDir = new Path(jt.getSystemDir());
    LOG.info("startRemoteJT:systemDir "+systemDir.toString());
    String jobFile = CoronaJobInProgress.getJobFile(systemDir, attemptJobId)
        .toString();
    LOG.info("startRemoteJT:jobFile " + jobFile);
    String splitClass = JobClient.RawSplit.class.getName();
    BytesWritable split = new BytesWritable();
    Task jobTask = new MapTask(
        jobFile, currentAttemptId, currentAttemptId.getTaskID().getId(),
        splitClass, split, 1, jobConf.getUser());
    CoronaSessionInfo info = new CoronaSessionInfo(jt.getSessionId(),
        jt.getJobTrackerAddress(), jt.getJobTrackerAddress());
    synchronized (this) {
      try {
        coronaTT.startCoronaJobTracker(jobTask, info);
      } catch (IOException e) {
        // Increment the attempt so that the older attempt will get an error
        // in reportRemoteCoronaJobTracker().
        incrementAttemptUnprotected();
        LOG.error("Error while performing RPC to TT at " + ttAddr.getHost() +
          ":" + ttAddr.getPort(), e);
        return false;
      }
    }

    // Now wait for the remote CJT to report its address.
    final long waitStart = System.currentTimeMillis();
    final long timeout = RemoteJTProxy.getRemotJTTimeout(jobConf);
    synchronized (this) {
      while (client == null) {
        LOG.warn("Waiting for remote JT to start on " + ttAddr.getHost());
        this.wait(1000);
        if (client == null &&
            System.currentTimeMillis() - waitStart > timeout) {
          // Increment the attempt so that the older attempt will get an error
          // in reportRemoteCoronaJobTracker().
          incrementAttemptUnprotected();
          LOG.warn("Could not start remote JT on " + ttAddr.getHost());
          return false;
        }
      }
    }
    return true;
  }

  /**
   * Returns the timeout in milliseconds after which we timeout the remote job
   * tracker.
   *
   * @param conf
   *          The configuration
   * @return The timeout in milliseconds.
   */
  public static long getRemotJTTimeout(Configuration conf) {
    return conf.getInt(RemoteJTProxy.REMOTE_JT_TIMEOUT_SEC_CONF,
        RemoteJTProxy.REMOTE_JT_TIMEOUT_SEC_DEFAULT) * 1000;
  }
 
  /**
   * Returns the RPC timeout in milliseconds for calls to the remote job
   * tracker.
   *
   * @param conf
   *          The configuration
   * @return The timeout in milliseconds.
   */
  public static int getRemoteJTRPCTimeout(Configuration conf) {
    return conf.getInt(RemoteJTProxy.REMOTE_JT_RPC_TIMEOUT_SEC_CONF,
        RemoteJTProxy.REMOTE_JT_RPC_TIMEOUT_SEC_DEFAULT) * 1000;
  }
 

  // ///////////////////////////////////////////////////////////////////////////
  // JobSubmissionProtocol
  // ///////////////////////////////////////////////////////////////////////////
  @Override
  public JobID getNewJobId() throws IOException {
    throw new UnsupportedOperationException(
        "getNewJobId not supported by proxy");
  }

  @Override
  public JobStatus submitJob(final JobID jobId) throws IOException {
    return (new Caller<JobStatus>() {
      @Override
      JobStatus call(JobSubmissionProtocol myClient) throws IOException {
        // This is the first job submission; it is called only once.
        isJobSubmitted = true;
        return myClient.submitJob(attemptJobId);
      }
    }).makeCall();
  }

  @Override
  public ClusterStatus getClusterStatus(boolean detailed) throws IOException {
    throw new UnsupportedOperationException(
        "getClusterStatus is not supported by proxy");
  }

  @Override
  public void killJob(final JobID jobId) throws IOException {
    (new Caller<JobID>() {
     
      // If the job tracker hosting the job has died,
      // do not perform an automatic failover.
      @Override
      protected boolean isRestartableCall() {
        return false;
      }

      @Override
      JobID call(JobSubmissionProtocol myClient) throws IOException {
        myClient.killJob(attemptJobId);
        return jobId;
      }
    }).makeCall();
  }

  @Override
  public void setJobPriority(JobID jobId, String priority) throws IOException {
    throw new UnsupportedOperationException(
        "setJobPriority is not supported by proxy");
  }

  @Override
  public boolean killTask(final TaskAttemptID taskId, final boolean shouldFail)
    throws IOException {
    return (new Caller<Boolean>() {
      // If the job tracker hosting the task has died,
      // do not perform an automatic failover.
      @Override
      protected boolean isRestartableCall() {
        return false;
      }
     
      @Override
      Boolean call(JobSubmissionProtocol myClient) throws IOException {
        return myClient.killTask(taskId, shouldFail);
      }
    }).makeCall();
  }

  @Override
  public JobProfile getJobProfile(final JobID jobId) throws IOException {
    return (new Caller<JobProfile>() {
      @Override
      JobProfile call(JobSubmissionProtocol myClient) throws IOException {
        return myClient.getJobProfile(attemptJobId);
      }
    }).makeCall();
  }

  @Override
  public JobStatus getJobStatus(final JobID jobId) throws IOException {
    return (new Caller<JobStatus>() {
      @Override
      JobStatus call(JobSubmissionProtocol myClient) throws IOException {
        return myClient.getJobStatus(attemptJobId);
      }
    }).makeCall();
  }

  @Override
  public Counters getJobCounters(final JobID jobId) throws IOException {
    return (new Caller<Counters>() {
      @Override
      Counters call(JobSubmissionProtocol myClient) throws IOException {
        return myClient.getJobCounters(attemptJobId);
      }
    }).makeCall();
  }

  @Override
  public TaskReport[] getMapTaskReports(final JobID jobId) throws IOException {
    return (new Caller<TaskReport[]>() {
      @Override
      TaskReport[] call(JobSubmissionProtocol myClient) throws IOException {
        return myClient.getMapTaskReports(attemptJobId);
      }
    }).makeCall();
  }

  @Override
  public TaskReport[] getReduceTaskReports(final JobID jobId)
    throws IOException {
    return (new Caller<TaskReport[]>() {
      @Override
      TaskReport[] call(JobSubmissionProtocol myClient) throws IOException {
        return myClient.getReduceTaskReports(attemptJobId);
      }
    }).makeCall();
  }

  @Override
  public TaskReport[] getCleanupTaskReports(final JobID jobId)
    throws IOException {
    return (new Caller<TaskReport[]>() {
      @Override
      TaskReport[] call(JobSubmissionProtocol myClient) throws IOException {
        return myClient.getCleanupTaskReports(attemptJobId);
      }
    }).makeCall();
  }

  @Override
  public TaskReport[] getSetupTaskReports(final JobID jobId)
    throws IOException {
    return (new Caller<TaskReport[]>() {
      @Override
      TaskReport[] call(JobSubmissionProtocol myClient) throws IOException {
        return myClient.getSetupTaskReports(attemptJobId);
      }
    }).makeCall();
  }

  @Override
  public String getFilesystemName() throws IOException {
    throw new UnsupportedOperationException(
        "getFilesystemName is not supported by proxy");
  }

  @Override
  public JobStatus[] jobsToComplete() {
    throw new UnsupportedOperationException(
        "jobsToComplete is not supported by proxy");
  }

  @Override
  public JobStatus[] getAllJobs() {
    throw new UnsupportedOperationException(
        "getAllJobs is not supported by proxy");
  }

  @Override
  public TaskCompletionEvent[] getTaskCompletionEvents(final JobID jobid,
      final int fromEventId, final int maxEvents) throws IOException {
    return (new Caller<TaskCompletionEvent[]>() {
      @Override
      TaskCompletionEvent[] call(JobSubmissionProtocol myClient) throws IOException {
        return myClient.getTaskCompletionEvents(attemptJobId, fromEventId, maxEvents);
      }
    }).makeCall();
  }

  @Override
  public String[] getTaskDiagnostics(final TaskAttemptID taskId)
    throws IOException {
    return (new Caller<String[]>() {
      @Override
      String[] call(JobSubmissionProtocol myClient) throws IOException {
        return myClient.getTaskDiagnostics(taskId);
      }
    }).makeCall();
  }

  @Override
  public String getSystemDir() {
    throw new UnsupportedOperationException(
        "getSystemDir not supported by proxy.");
  }

  @Override
  public JobQueueInfo[] getQueues() {
    throw new UnsupportedOperationException("getQueues method is " +
        "not supported by proxy.");
  }

  @Override
  public JobQueueInfo getQueueInfo(String queue) {
    throw new UnsupportedOperationException(
        "getQueueInfo not supported by proxy.");
  }

  @Override
  public JobStatus[] getJobsFromQueue(String queue) {
    throw new UnsupportedOperationException(
        "getJobsFromQueue not supported by proxy.");
  }

  @Override
  public QueueAclsInfo[] getQueueAclsForCurrentUser() throws IOException {
    throw new UnsupportedOperationException(
        "getQueueAclsForCurrentUser not supported by proxy.");
  }

  /**
   * Stop RPC client.
   */
  public void close() {
    clientLock.writeLock().lock();
    try {
      if (client != null) {
        RPC.stopProxy(client);
        client = null;
      }
    } finally {
      clientLock.writeLock().unlock();
    }
  }

  // ///////////////////////////////////////////////////////////////////////////
  // Remote CJT reincarnation.
  // ///////////////////////////////////////////////////////////////////////////
  /**
   * Generic caller interface.
   */
  private abstract class Caller<T> {
    /**
     * Perform the call. Must be overridden by a sub-class.
     * @param myClient the client to make the call with.
     * @return The generic return value.
     * @throws IOException
     */
    abstract T call(JobSubmissionProtocol myClient) throws IOException;
   
    /**
     * Lets the caller know whether the current call is restartable, i.e.
     * whether a failed call to the remote JT should trigger an automatic
     * failover. Subclasses override this for calls that must not restart
     * the remote JT.
     */
    protected boolean isRestartableCall() {
      return isRestartable;
    }

    /**
     * Template function to make the call.
     * @return The generic return value.
     * @throws IOException
     */
    public T makeCall() throws IOException {
      int curRestartNo;
      // If a restart fails, its exception breaks out of this loop.
      while (true) {
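        // Normal calls hold the read lock so they can proceed concurrently;
        // a remote JT restart takes the write lock and blocks them.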
        clientLock.readLock().lock();
        curRestartNo = numRemoteJTFailures;
        try {
          try {
            return makeCallWithRetries();
          } finally {
            clientLock.readLock().unlock();
          }
        } catch (IOException e) {
          LOG.error("Error on remote call with retries", e);
          if (isRestartableCall()) {
            handleRemoteJTFailure(curRestartNo);
          } else {
            throw e;
          }
        }
      }
    }
   
    /**
     * Handles a remote JT failure.
     * @param failureRestartNo the value of numRemoteJTFailures at the time the
     * failure that triggered this call occurred
     * @throws IOException if restarting fails or the restart limit is reached
     */
    private void handleRemoteJTFailure(int failureRestartNo)
        throws IOException {
      clientLock.writeLock().lock();
      try {
        if (failureRestartNo == numRemoteJTFailures) {
          try {
            LOG.warn("failureRestartNo " + failureRestartNo +
                " maxRemoteFailures " + maxRemoteJTFailures +
                " numFailure " + numRemoteJTFailures);
           
            ++numRemoteJTFailures;
            if (numRemoteJTFailures <= maxRemoteJTFailures) {
              LOG.warn("JobTracker died or is unreachable."
                  + " Restarting remote JT.");
              synchronized(RemoteJTProxy.this) {
                restartRemoteJTUnprotected();
              }
            } else {
              LOG.warn("JobTracker died or is unreachable."
                  + " Reached restart number limit."
                  + " Reporting to ClusterManager.");
              if (remoteSessionId != null) {
                // Kill remote session - it will release resources immediately
                jt.getSessionDriver().stopRemoteSession(remoteSessionId);
              }
              jt.close(false, true);
              throw new IOException("Reached remote JT restart limit.");
            }
          } catch (IOException e) {
            restartingException = e;
            throw restartingException;
          } catch (InterruptedException e) {
            restartingException = new IOException(e);
            throw restartingException;
          }
        } else {
          // Another thread restarted the remote JT; check whether it succeeded.
          if (restartingException != null) {
            throw new IOException(restartingException);
          }
        }
      } finally {
        clientLock.writeLock().unlock();
      }
    }
   
    /**
     * Restarts the remote JT and, if a job had already been submitted,
     * resubmits it.
     * @throws IOException
     */
    private void restartRemoteJTUnprotected() throws IOException {
      SessionDriver sessionDriver = jt.getSessionDriver();
      if (remoteSessionId != null) {
        // Kill remote session - new JT will acquire new session
        sessionDriver.stopRemoteSession(remoteSessionId);
      }
      if (!isStateRestoringEnabled(conf)) {
        // Change attempt id only if we're not restoring state
        attemptJobId = prepareNextAttempt(attemptJobId);
      } else {
        // Tell the remote job tracker how many times it has been restarted.
        remoteJTState.restartNum = numRemoteJTFailures;
      }
      // Stop RPC client.
      RPC.stopProxy(client);
      client = null;
      // Increment attempt to kill old JT on next connection attempt.
      incrementAttemptUnprotected();
      if (sessionDriver != null) {
        sessionDriver.setName("Launch pending for " + conf.getJobName());
      }
      // Restart remote JT, don't release client lock yet.
      waitForJTStart(conf);
      // Resubmit job directly.
      try {
        if (isJobSubmitted) {
          LOG.warn("Resubmitting job " + jobId.toString());
          client.submitJob(attemptJobId);
        }
        // Set our info server url in parent JT and CM.
        String url = getJobProfile(attemptJobId).getURL().toString();
        jt.setRemoteJTUrl(url);
        if (sessionDriver != null) {
          sessionDriver.setName("Launched session " + getRemoteSessionId());
        }
        // If reached this point assume success.
        LOG.warn("Successfully restarted remote JT.");
        if (remoteJTState != null) {
          if (LOG.isInfoEnabled()) {
            synchronized (remoteJTState) {
              LOG.warn(remoteJTState.getPrettyReport(attemptJobId));
            }
          }
        }
      } catch (IOException e) {
        // The new job tracker failed during submitJob() or getJobProfile();
        // treat this as another remote JT failure and retry.
        LOG.error("Exception during remote JT restart, retrying", e);
        handleRemoteJTFailure(numRemoteJTFailures);
      }
    }
     
    /**
     * Prepares the next attempt of the job.
     * @param oldId job id of the last submitted attempt (or the id known by the client)
     * @return job id of next attempt
     * @throws IOException
     */
    private JobID prepareNextAttempt(final JobID oldId) throws IOException {
      JobID newId = CoronaJobTracker.nextJobID(oldId);
      // TODO copy only necessary files
      Path oldJobDir = new Path(jt.getSystemDir(), oldId.toString()),
           newJobDir = new Path(jt.getSystemDir(), newId.toString());
      FileSystem fs = FileSystem.get(conf);
      LOG.info("oldJobDir " + oldJobDir.toString() + " newJobDir " + newJobDir.toString() );
      // Copy job files.
      Path localTemp = new Path("file:///tmp/" + newId.toString());
      if (fs.exists(newJobDir)) {
        LOG.info("newJobDir "+ localTemp.toString() + " exists, delete it");
        fs.delete(newJobDir, true);
      }
      if (!oldJobDir.equals(newJobDir) && fs.exists(oldJobDir)) {
        fs.copyToLocalFile(oldJobDir, localTemp);
        fs.moveFromLocalFile(localTemp, newJobDir);
      }
      LOG.info("Job files copied to " + newJobDir.toString());
      return newId;
    }

    private T makeCallWithRetries() throws IOException {
      int errorCount = 0;
      final int maxErrorCount = 10; // can make configurable later
      IOException lastException = null;
      while (errorCount < maxErrorCount) {
        try {
          JobSubmissionProtocol myClient = checkClient();
          return call(myClient);
        } catch (ConnectException e) {
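          // Fail fast on connection errors: the remote JT process is likely
          // gone, so let makeCall() decide whether to restart it.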
          throw e;
        } catch (IOException e) {
          lastException = e;
          errorCount++;
          if (errorCount == maxErrorCount) {
            break;
          } else {
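            // Back off linearly with the error count before recreating the
            // RPC proxy and retrying.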
            long backoff = errorCount * 1000;
            LOG.warn(
              "Retrying after error connecting to remote JT " +
                remoteJTHost + ":" + remoteJTPort +
                " will wait " + backoff + " msec ", e);
            try {
              Thread.sleep(backoff);
            } catch (InterruptedException ie) {
              throw new IOException(ie);
            }
            synchronized (RemoteJTProxy.this) {
              reinitClientUnprotected();
            }
          }
        }
      }
      LOG.error("Too many errors " + errorCount +
          " in connecting to remote JT " +
          remoteJTHost + ":" + remoteJTPort, lastException);
      throw lastException;
    }
  }
 
  /**
   * Check if the RPC client to the remote job tracker is ready, waiting if it
   * is not.
   * @return the RPC client once it is available
   * @throws IOException
   */
  private JobSubmissionProtocol checkClient() throws IOException {
    synchronized (this) {
      while (client == null) {
        try {
          if (remoteJTStatus == RemoteJTStatus.FAILURE) {
            throw new IOException("Remote Job Tracker is not available");
          }
          this.wait(1000);
        } catch (InterruptedException e) {
          throw new IOException(e);
        }
      }
      return client;
    }
  }
 
  /**
   * Check the job configuration to determine whether state restoring is enabled.
   * @param conf configuration of job
   * @return true iff enabled
   */
  public static boolean isStateRestoringEnabled(JobConf conf) {
    return isJTRestartingEnabled(conf)
        && conf.getBoolean(REMOTE_JT_STATE_RESTORING_CONF,
            REMOTE_JT_STATE_RESTORING_DEFAULT);
  }

  /**
   * Check the job configuration to determine whether remote JT restarting is enabled.
   * @param conf configuration of job
   * @return true iff enabled
   */
  public static boolean isJTRestartingEnabled(JobConf conf) {
    return (0 < conf.getInt(CoronaJobTracker.MAX_JT_FAILURES_CONF,
        CoronaJobTracker.MAX_JT_FAILURES_DEFAULT));
  }
}
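
// A minimal usage sketch (not part of the original file), assuming the proxy
// is driven from within CoronaJobTracker, which supplies itself, the job id
// and the job configuration:
//
//   RemoteJTProxy proxy = new RemoteJTProxy(coronaJobTracker, jobId, jobConf);
//   proxy.waitForJTStart(jobConf);             // obtain a JOBTRACKER grant and launch the remote JT
//   JobStatus status = proxy.submitJob(jobId); // forwarded to the remote CoronaJobTracker
//   ...                                        // poll getJobStatus()/getJobCounters() as needed
//   proxy.close();                             // stop the RPC proxy when done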