// Package: org.apache.hadoop.corona
// Source code of org.apache.hadoop.corona.SchedulerForType

/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.hadoop.corona;

import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.Queue;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.corona.PoolInfoMetrics.MetricName;
import org.apache.hadoop.metrics.MetricsRecord;
import org.apache.hadoop.net.Node;

/**
 * Scheduler thread which matches requests to nodes for the given resource
 * type. One instance exists per {@link ResourceType}; the loop in
 * {@link #run()} keeps going until {@link #close()} is called.
 */
public class SchedulerForType extends Thread {
  /** Maximum amount of milliseconds to wait before scheduling */
  public static final long SCHEDULE_PERIOD = 100L;
  /** Milliseconds to wait before preempting tasks */
  public static final long PREEMPTION_PERIOD = 1000L;
  /** Class logger */
  private static final Log LOG = LogFactory.getLog(SchedulerForType.class);
  /** Type of resource to schedule for */
  private final ResourceType type;
  /** Manages the pools groups for this resource type */
  private final PoolGroupManager poolGroupManager;
  /** Used to grant/revoke resources to the sessions */
  private final SessionManager sessionManager;
  /** Asynchronously notify the sessions about changes */
  private final SessionNotifier sessionNotifier;
  /** Find where to run the resources */
  private final NodeManager nodeManager;
  /** Get up-to-date dynamic configuration */
  private final ConfigManager configManager;
  /** Map of pool info to associated metrics. The map is replaced wholesale
   * each time scheduling happens, but once published it is not updated, so
   * readers may use it without locking (hence volatile). */
  private volatile Map<PoolInfo, PoolInfoMetrics> poolInfoToMetrics =
      new HashMap<PoolInfo, PoolInfoMetrics>();
  /** Map of pool info -> metrics record; only touched by the scheduler
   * thread (see collectPoolInfoMetrics). */
  private final Map<PoolInfo, MetricsRecord> poolInfoToMetricsRecord =
    new HashMap<PoolInfo, MetricsRecord>();
  /** A flag indicating that the scheduler has allocated all resources it
   * could in the last iteration; when true the run loop waits before
   * rescheduling and preemption is allowed to run. */
  private boolean fullyScheduled = true;
  /** Run until told to shutdown; set by close(), read by the run loop. */
  private volatile boolean shutdown = false;
  /** Saved last preemption time, used with PREEMPTION_PERIOD */
  private long lastPreemptionTime = -1L;
  /** Do we want preemption. */
  private final boolean enablePreemption;
  /** Required locality levels for this type. */
  private final List<LocalityLevel> neededLocalityLevels;
  /** Cluster Manager metrics. */
  private final ClusterManagerMetrics metrics;
  /** The node snapshot for node-to-session scheduling; null when that mode
   * is disabled. Refreshed at the start of every run-loop iteration. */
  private NodeSnapshot nodeSnapshot;

  /**
   * Constructor.
   *
   * @param type Type of resource to schedule
   * @param sessionManager Manager of sessions
   * @param sessionNotifier Asynchronous session notification
   * @param nodeManager Available resources on nodes
   * @param configManager Dynamic thread safe configuration
   * @param metrics Cluster Manager metrics
   * @param conf Static configuration
   */
  public SchedulerForType(
    ResourceType type,
    SessionManager sessionManager,
    SessionNotifier sessionNotifier,
    NodeManager nodeManager,
    ConfigManager configManager,
    ClusterManagerMetrics metrics,
    CoronaConf conf) {
    this.type = type;
    this.poolGroupManager = new PoolGroupManager(type, configManager, conf);
    this.sessionManager = sessionManager;
    this.sessionNotifier = sessionNotifier;
    this.nodeManager = nodeManager;
    this.configManager = configManager;
    this.enablePreemption = ResourceTypeProperties.canBePreempted(type);
    this.neededLocalityLevels =
      ResourceTypeProperties.neededLocalityLevels(type);
    this.metrics = metrics;
  }

  /**
   * Main scheduling loop: snapshot the pools, match requests to nodes,
   * dispatch the grants, optionally preempt, then publish metrics. Runs
   * until {@link #close()} sets the shutdown flag.
   */
  @Override
  public void run() {
    while (!shutdown) {
      try {
        // If the previous iteration granted everything it could, sleep for
        // up to SCHEDULE_PERIOD (or until another thread notifies us).
        if (fullyScheduled) {
          waitForNotification();
        }

        long start = System.currentTimeMillis();

        poolGroupManager.snapshot();

        // Choose the matching direction for this iteration. The node
        // snapshot is only needed when scheduling from node to session.
        boolean scheduleFromNodeToSession =
          configManager.getScheduleFromNodeToSession();
        if (scheduleFromNodeToSession) {
          nodeSnapshot = nodeManager.getNodeSnapshot(type);
        } else {
          nodeSnapshot = null;
        }

        Map<String, List<ResourceGrant>> sessionIdToGranted = scheduleTasks();

        dispatchGrantedResource(sessionIdToGranted);

        // Only preempt once everything schedulable has been scheduled.
        if (enablePreemption && fullyScheduled) {
          doPreemption();
        }
        int runtime = (int) (System.currentTimeMillis() - start);
        if (runtime > 50) {
          int scheduled = 0;
          float speed = 0;
          for (List<?> grants : sessionIdToGranted.values()) {
            scheduled += grants.size();
          }
          if (scheduled > 0) {
            // Average milliseconds spent per grant this iteration.
            speed = ((float) runtime) / scheduled;
          }
          // Log if more than 50 msec.
          LOG.info("Scheduler runtime for " + type + " " + runtime + " ms / " +
              scheduled + " grants = " + speed);
        }
        metrics.setSchedulerRunTime(type, runtime);

        collectPoolInfoMetrics();

      } catch (InterruptedException e) {
        // NOTE(review): the interrupt status is not restored here; the loop
        // deliberately keeps running unless shutdown was requested.
        if (!shutdown) {
          LOG.warn("Unexpected InterruptedException");
        }
      }
    }
  }

  /**
   * Update the pool metrics. The update is atomic at the map level.
   */
  private void collectPoolInfoMetrics() {
    Map<PoolInfo, PoolInfoMetrics> newPoolNameToMetrics = new HashMap<PoolInfo,
      PoolInfoMetrics>();
    long now = ClusterManager.clock.getTime();

    Map<PoolInfo, Long> poolInfoAverageFirstWaitMs =
        sessionManager.getTypePoolInfoAveFirstWaitMs(type);

    // The gets + puts below are OK because only one thread is doing it.
    for (PoolGroupSchedulable poolGroup : poolGroupManager.getPoolGroups()) {
      int poolGroupSessions = 0;
      for (PoolSchedulable pool : poolGroup.getPools()) {
        MetricsRecord poolRecord =
            poolInfoToMetricsRecord.get(pool.getPoolInfo());
        if (poolRecord == null) {
          poolRecord = metrics.getContext().createRecord(
              "pool-" + pool.getName());
          poolInfoToMetricsRecord.put(pool.getPoolInfo(), poolRecord);
        }

        PoolInfoMetrics poolMetrics = new PoolInfoMetrics(pool.getPoolInfo(),
            type, poolRecord);
        poolMetrics.setCounter(
            MetricName.GRANTED, pool.getGranted());
        poolMetrics.setCounter(
            MetricName.REQUESTED, pool.getRequested());
        poolMetrics.setCounter(
            MetricName.SHARE, (long) pool.getShare());
        poolMetrics.setCounter(
            MetricName.MIN, pool.getMinimum());
        poolMetrics.setCounter(
            MetricName.MAX, pool.getMaximum());
        poolMetrics.setCounter(
            MetricName.WEIGHT, (long) pool.getWeight());
        poolMetrics.setCounter(
            MetricName.SESSIONS, pool.getScheduleQueue().size());
        poolMetrics.setCounter(
            MetricName.STARVING, pool.getStarvingTime(now) / 1000);
        Long averageFirstTypeMs =
            poolInfoAverageFirstWaitMs.get(pool.getPoolInfo());
        poolMetrics.setCounter(MetricName.AVE_FIRST_WAIT_MS,
            (averageFirstTypeMs == null) ?
                0 : averageFirstTypeMs.longValue());

        newPoolNameToMetrics.put(pool.getPoolInfo(), poolMetrics);
        poolGroupSessions += pool.getScheduleQueue().size();
      }

      MetricsRecord poolGroupRecord =
          poolInfoToMetricsRecord.get(poolGroup.getName());
      if (poolGroupRecord == null) {
        poolGroupRecord = metrics.getContext().createRecord(
            "poolgroup-" + poolGroup.getName());
        poolInfoToMetricsRecord.put(poolGroup.getPoolInfo(), poolGroupRecord);
      }

      PoolInfoMetrics poolGroupMetrics =
          new PoolInfoMetrics(poolGroup.getPoolInfo(), type, poolGroupRecord);
      poolGroupMetrics.setCounter(
          MetricName.GRANTED, poolGroup.getGranted());
      poolGroupMetrics.setCounter(
          MetricName.REQUESTED, poolGroup.getRequested());
      poolGroupMetrics.setCounter(
          MetricName.SHARE, (long) poolGroup.getShare());
      poolGroupMetrics.setCounter(
          MetricName.MIN, poolGroup.getMinimum());
      poolGroupMetrics.setCounter(
          MetricName.MAX, poolGroup.getMaximum());
      poolGroupMetrics.setCounter(
          MetricName.SESSIONS, poolGroupSessions);
      newPoolNameToMetrics.put(poolGroup.getPoolInfo(), poolGroupMetrics);
    }

    poolInfoToMetrics = newPoolNameToMetrics;
  }

  /**
   * Wait for SCHEDULE_PERIOD or another thread tells this thread to do
   * something.
   *
   * @throws InterruptedException
   */
  private void waitForNotification() throws InterruptedException {
    synchronized (this) {
      this.wait(SCHEDULE_PERIOD);
    }
  }

  /**
   * Informed the sessions about their newly granted resources via
   * {@link SessionNotifier}.
   *
   * @param sessionIdToGranted Map of session ids to granted resources
   */
  private void dispatchGrantedResource(
      Map<String, List<ResourceGrant>> sessionIdToGranted) {
    for (Map.Entry<String, List<ResourceGrant>> entry :
        sessionIdToGranted.entrySet()) {
      LOG.info("Assigning " + entry.getValue().size() + " " +
          type + " requests to Session: " + entry.getKey());
      if (LOG.isDebugEnabled()) {
        LOG.debug(Arrays.toString(entry.getValue().toArray()));
      }
      sessionNotifier.notifyGrantResource(entry.getKey(), entry.getValue());
    }
  }

  /**
   * Match requests to nodes.
   *
   * @return The list of granted resources for each session
   */
  private Map<String, List<ResourceGrant>> scheduleTasks() {
    fullyScheduled = false;
    long nodeWait = configManager.getLocalityWait(type, LocalityLevel.NODE);
    long rackWait = configManager.getLocalityWait(type, LocalityLevel.RACK);
    int tasksToSchedule = configManager.getGrantsPerIteration();
    Map<String, List<ResourceGrant>> sessionIdToGranted =
        new HashMap<String, List<ResourceGrant>>();
    for (int i = 0; i < tasksToSchedule; i++) {
      ScheduledPair scheduled = scheduleOneTask(nodeWait, rackWait);
      if (scheduled == null) {
        // Cannot find matched request-node anymore. We are done.
        fullyScheduled = true;
        break;
      }
      List<ResourceGrant> granted =
        sessionIdToGranted.get(scheduled.sessionId.toString());
      if (granted == null) {
        granted = new LinkedList<ResourceGrant>();
        sessionIdToGranted.put(scheduled.sessionId.toString(), granted);
      }
      granted.add(scheduled.grant);
    }
    return sessionIdToGranted;
  }

  /**
   * Try to match one request to one node.
   *
   * Pool groups, pools and sessions are polled from priority queues and are
   * re-added only after a successful grant; a schedulable that yields no
   * grant stays out of its queue for the remainder of this call.
   *
   * @param nodeWait Time to wait for node locality
   * @param rackWait Time to wait for rack locality
   * @return The pair contains a session id and a granted resource
   *         or null when no task can be scheduled
   */
  private ScheduledPair scheduleOneTask(long nodeWait, long rackWait) {
    if (!nodeManager.existRunnableNodes(type)) {
      return null;
    }

    Queue<PoolGroupSchedulable> poolGroupQueue =
        poolGroupManager.getScheduleQueue();
    while (!poolGroupQueue.isEmpty()) {
      PoolGroupSchedulable poolGroup = poolGroupQueue.poll();
      if (poolGroup.reachedMaximum()) {
        // Over its cap; skip (and do not re-add) this pool group.
        continue;
      }
      // Get the appropriate pool from the pool group to schedule, then
      // schedule the best session
      Queue<PoolSchedulable> poolQueue = poolGroup.getScheduleQueue();
      while (!poolQueue.isEmpty()) {
        PoolSchedulable pool = poolQueue.poll();
        if (pool.reachedMaximum()) {
          continue;
        }
        Queue<SessionSchedulable> sessionQueue = pool.getScheduleQueue();
        while (!sessionQueue.isEmpty()) {
          SessionSchedulable schedulable = sessionQueue.poll();
          Session session = schedulable.getSession();
          long now = ClusterManager.clock.getTime();
          // Matching is done outside the session lock; the result is only
          // committed below once the session is known to still exist.
          MatchedPair pair = doMatch(
            schedulable, now, nodeWait, rackWait);
          synchronized (session) {
            if (session.isDeleted()) {
              continue;
            }
            if (pair != null) {
              ResourceGrant grant = commitMatchedResource(session, pair);
              if (grant != null) {
                poolGroup.incGranted(1);
                pool.incGranted(1);
                schedulable.incGranted(1);
                // Put back to the queue only if we scheduled successfully
                poolGroupQueue.add(poolGroup);
                poolQueue.add(pool);
                sessionQueue.add(schedulable);
                return new ScheduledPair(
                    session.getSessionId().toString(), grant);
              }
            }
          }
        }
      }
    }
    return null;
  }

  /**
   * Find a node to give resources to this schedulable session, trying the
   * needed locality levels in order (best first).
   *
   * @param schedulable Given a resource to this schedulable
   * @param now Current time
   * @param nodeWait Time to wait for node locality
   * @param rackWait Time to wait for rack locality
   * @return Pair of resource request and node or null if no node can be found.
   */
  private MatchedPair doMatch(
      SessionSchedulable schedulable, long now, long nodeWait, long rackWait) {
    schedulable.adjustLocalityRequirement(now, nodeWait, rackWait);
    for (LocalityLevel level : neededLocalityLevels) {
      if (level.isBetterThan(schedulable.getLastLocality())) {
        /**
         * This means that the last time we tried to schedule this session
         * we could not achieve the current LocalityLevel level.
         * Since this is the same iteration of the scheduler we do not need to
         * try this locality level.
         * The last locality level of the schedulable is getting reset on every
         * iteration of the scheduler, so we will retry the better localities
         * in the next run of the scheduler.
         */
        continue;
      }
      if (needLocalityCheck(level, nodeWait, rackWait) &&
          !schedulable.isLocalityGoodEnough(level)) {
        // Still within the locality wait window for this level; worse
        // levels are not tried either, hence the break.
        break;
      }
      Session session = schedulable.getSession();
      synchronized (session) {
        if (session.isDeleted()) {
          return null;
        }
        // Pick the cheaper matching direction: iterate the session's
        // requests when there are few of them, otherwise iterate nodes.
        int pendingRequestCount = session.getPendingRequestCountForType(type);
        MatchedPair matchedPair = null;
        if (nodeSnapshot == null ||
          pendingRequestCount < nodeSnapshot.getRunnableHostCount()) {
          matchedPair = matchNodeForSession(session, level);
        } else {
          matchedPair = matchSessionForNode(session, level);
        }
        if (matchedPair != null) {
          schedulable.setLocalityLevel(level);
          return matchedPair;
        }
      }
    }
    // No match at any allowed level: start (or continue) the locality wait.
    schedulable.startLocalityWait(now);
    if (LOG.isDebugEnabled()) {
      LOG.debug("Could not find a node for " +
        schedulable.getSession().getHandle());
    }
    return null;
  }

  /**
   * Find a matching pair of node, request by looping through the requests in
   * the session, looking at the hosts in each request and making look-ups
   * into the node manager.
   * @param session The session
   * @param level The locality level at which we are trying to schedule.
   * @return A match if found, null if not.
   */
  private MatchedPair matchNodeForSession(
    Session session, LocalityLevel level) {
    Iterator<ResourceRequestInfo> pendingRequestIterator =
        session.getPendingRequestIteratorForType(type);
    while (pendingRequestIterator.hasNext()) {
      ResourceRequestInfo req = pendingRequestIterator.next();
      Set<String> excluded = req.getExcludeHosts();
      if (req.getHosts() == null || req.getHosts().size() == 0) {
        // No locality requirement
        String host = null;
        ClusterNode node = nodeManager.getRunnableNode(
            host, LocalityLevel.ANY, type, excluded);
        if (node != null) {
          return new MatchedPair(node, req);
        }
        continue;
      }
      for (RequestedNode requestedNode : req.getRequestedNodes()) {
        ClusterNode node = nodeManager.getRunnableNode(
            requestedNode, level, type, excluded);
        if (node != null) {
          return new MatchedPair(node, req);
        }
      }
    }
    return null;
  }

  /**
   * Find a matching pair of node, request by looping through runnable nodes
   * in the node snapshot created earlier. For each node, we make lookups in the
   * session to find a suitable request.
   * @param session The session
   * @param level The locality level at which we are trying to schedule.
   * @return A match if found, null if not.
   */
  private MatchedPair matchSessionForNode(
    Session session, LocalityLevel level) {
    if (level == LocalityLevel.NODE || level == LocalityLevel.ANY) {
      Set<Map.Entry<String, NodeContainer>> hostNodesSet =
        nodeSnapshot.runnableHosts();
      for (Map.Entry<String, NodeContainer> hostNodes : hostNodesSet) {
        Iterator<ClusterNode> clusterNodeIt = hostNodes.getValue().iterator();
        while (clusterNodeIt.hasNext()) {
          ClusterNode node = clusterNodeIt.next();
          if (!nodeManager.hasEnoughResource(node)) {
            continue;
          }
          ResourceRequestInfo req = null;
          if (level == LocalityLevel.NODE) {
            req = session.getPendingRequestOnHost(node.getHost(), type);
          } else {
            req = session.getPendingRequestForAny(node.getHost(), type);
          }
          if (req != null) {
            return new MatchedPair(node, req);
          }
        }
      }
    } else if (level == LocalityLevel.RACK) {
      Set<Map.Entry<Node, NodeContainer>> rackNodesSet =
        nodeSnapshot.runnableRacks();
      for (Map.Entry<Node, NodeContainer> rackNodes: rackNodesSet) {
        Node rack = rackNodes.getKey();
        NodeContainer nodes = rackNodes.getValue();
        Iterator<ClusterNode> clusterNodeIt = nodes.iterator();
        while (clusterNodeIt.hasNext()) {
          ClusterNode node = clusterNodeIt.next();
          if (!nodeManager.hasEnoughResource(node)) {
            continue;
          }
          ResourceRequestInfo req = session.getPendingRequestOnRack(
            node.getHost(), rack, type);
          if (req != null) {
            return new MatchedPair(node, req);
          }
        }
      }
    }
    return null;
  }

  /**
   * If the locality wait time is zero, we don't need to check locality at all.
   * @param level The level
   * @param nodeWait The amount of time in msec to wait for node locality
   * @param rackWait The amount of time in msec to wait for rack locality
   * @return True if the locality check is needed, false otherwise.
   */
  private boolean needLocalityCheck(
      LocalityLevel level, long nodeWait, long rackWait) {
    if (level == LocalityLevel.NODE) {
      return nodeWait != 0;
    }
    if (level == LocalityLevel.RACK) {
      return rackWait != 0;
    }
    return false;
  }

  /**
   * Given a session and match of request-node, perform a "transaction commit":
   * register the grant with the node manager and, on success, with the
   * session manager, then bring the node snapshot back in sync.
   *
   * @param session The session
   * @param pair The pair of (request, node)
   * @return The resource grant: non-null if the "commit" was successful, null
   *         otherwise (no app info for the node, or the node was deleted).
   */
  private ResourceGrant commitMatchedResource(
      Session session, MatchedPair pair) {
    ResourceGrant grant = null;
    ResourceRequestInfo req = pair.req;
    ClusterNode node = pair.node;
    String appInfo = nodeManager.getAppInfo(node, type);
    if (appInfo != null) {
      if (nodeManager.addGrant(node, session.getSessionId(), req)) {
        // if the nodemanager can commit this grant - we are done
        // the commit can fail if the node has been deleted
        grant = new ResourceGrant(req.getId(), node.getName(),
          node.getAddress(), ClusterManager.clock.getTime(), req.getType());
        grant.setAppInfo(appInfo);
        sessionManager.grantResource(session, req, grant);
      }
    }
    // Keep the node snapshot consistent: drop the node when it has been
    // deleted or can no longer fit a unit request of this resource type.
    if (nodeSnapshot != null) {
      synchronized (node) {
        if (node.deleted) {
          nodeSnapshot.removeNode(node);
        } else if (!node.checkForGrant(Utilities.getUnitResourceRequest(type),
          nodeManager.getResourceLimit())) {
          nodeSnapshot.removeNode(node);
        }
      }
    }
    return grant;
  }

  /**
   * Update metrics.
   */
  public void submitMetrics() {
    for (PoolInfoMetrics m : poolInfoToMetrics.values()) {
      m.updateMetricsRecord();
    }
  }

  /**
   * A pair of (request, node).
   */
  private class MatchedPair {
    /** Request */
    private final ResourceRequestInfo req;
    /** Node */
    private final ClusterNode node;

    /**
     * Constructor.
     * @param node The node.
     * @param req The request.
     */
    MatchedPair(ClusterNode node, ResourceRequestInfo req) {
      this.req = req;
      this.node = node;
    }
  }

  /**
   * Represents a resource grant given to a session.
   */
  private class ScheduledPair {
    /** The grant. */
    private final ResourceGrant grant;
    /** The session ID. */
    private final String sessionId;

    /**
     * Constructor.
     * @param sessionId The session ID.
     * @param grant The grant.
     */
    ScheduledPair(String sessionId, ResourceGrant grant) {
      this.grant = grant;
      this.sessionId = sessionId;
    }
  }

  /**
   * Performs preemption if it has been long enough since the last round.
   */
  private void doPreemption() {
    long now = ClusterManager.clock.getTime();
    if (now - lastPreemptionTime > PREEMPTION_PERIOD) {
      lastPreemptionTime = now;
      doPreemptionNow();
    }
  }

  /**
   * Performs the preemption.
   */
  private void doPreemptionNow() {
    int totalShare = nodeManager.getAllocatedCpuForType(type);
    poolGroupManager.distributeShare(totalShare);
    for (PoolGroupSchedulable poolGroup : poolGroupManager.getPoolGroups()) {
      poolGroup.distributeShare();
    }
    int tasksToPreempt = countTasksShouldPreempt();
    if (tasksToPreempt > 0) {
      LOG.info("Found " + tasksToPreempt + " " + type + " tasks to preempt");
      preemptTasks(tasksToPreempt);
    }
  }

  /**
   * Kill tasks from over-scheduled sessions
   * @param tasksToPreempt The number of tasks should kill
   */
  private void preemptTasks(int tasksToPreempt) {
    LOG.info("Start preempt " + tasksToPreempt + " for type " + type);
    long maxRunningTime = configManager.getPreemptedTaskMaxRunningTime();
    int rounds = configManager.getPreemptionRounds();
    while (tasksToPreempt > 0) {
      int preempted = preemptOneSession(tasksToPreempt, maxRunningTime);
      if (preempted == 0) {
        maxRunningTime *= 2;
        // Check for enough rounds or an overflow
        if (--rounds <= 0 || maxRunningTime <= 0) {
          LOG.warn("Cannot preempt enough " + type + " tasks " +
              " rounds " + configManager.getPreemptionRounds() +
              " maxRunningTime " + maxRunningTime +
              " tasks not preempted:" + tasksToPreempt);
          return;
        }
      }
      tasksToPreempt -= preempted;
    }
  }

  /**
   * Find the most over-scheduled session in the most over-scheduled pool.
   * Kill tasks from this session.
   * @param maxToPreemt The maximum number of tasks to kill
   * @param maxRunningTime The killed task cannot be older than this time
   * @return The number of tasks actually killed
   */
  private int preemptOneSession(int maxToPreemt, long maxRunningTime) {
    Queue<PoolGroupSchedulable> poolGroupQueue =
        poolGroupManager.getPreemptQueue();
    while (!poolGroupQueue.isEmpty()) {
      PoolGroupSchedulable poolGroup = poolGroupQueue.poll();
      poolGroup.distributeShare();
      Queue<PoolSchedulable> poolQueue = poolGroup.getPreemptQueue();
      while (!poolQueue.isEmpty()) {
        PoolSchedulable pool = poolQueue.poll();
        pool.distributeShare();
        if (!pool.isPreemptable()) {
          continue;
        }
        Queue<SessionSchedulable> sessionQueue = pool.getPreemptQueue();
        while (!sessionQueue.isEmpty()) {
          SessionSchedulable schedulable = sessionQueue.poll();
          try {
            // Only preempt what this session holds beyond its fair share.
            int overScheduled =
                (int) (schedulable.getGranted() - schedulable.getShare());
            if (overScheduled <= 0) {
              // NOTE(review): this continue still runs the finally block,
              // re-adding poolGroup and pool to their queues on every
              // skipped session — possibly inserting duplicates. Confirm
              // whether the preempt queues tolerate duplicate entries.
              continue;
            }
            maxToPreemt = Math.min(maxToPreemt, overScheduled);
            LOG.info("Trying to preempt " + maxToPreemt + " " + type +
                " from " + schedulable.getSession().getHandle());
            int preempted = preemptSession(
                schedulable, maxToPreemt, maxRunningTime);
            poolGroup.incGranted(-1 * preempted);
            pool.incGranted(-1 * preempted);
            schedulable.incGranted(-1 * preempted);
            return preempted;
          } catch (InvalidSessionHandle e) {
            LOG.warn("Invalid session handle:" +
                schedulable.getSession().getHandle() +
                " Session may be removed");
          } finally {
            // Add back the queue so it can be further preempt for other
            // sessions.
            poolGroupQueue.add(poolGroup);
            poolQueue.add(pool);
          }
        }
      }
    }
    return 0;
  }

  /**
   * Preempt a session.
   * @param schedulable The session.
   * @param maxToPreemt Maximum number of resources to preempt.
   * @param maxRunningTime Running time threshold for preemption.
   * @return The number of preempted resources.
   * @throws InvalidSessionHandle
   */
  private int preemptSession(SessionSchedulable schedulable,
      int maxToPreemt, long maxRunningTime) throws InvalidSessionHandle {
    Session session = schedulable.getSession();
    List<Integer> grantIds;
    synchronized (session) {
      grantIds = session.getGrantsToPreempt(maxToPreemt, maxRunningTime, type);
    }
    List<ResourceGrant> revokedGrants =
      sessionManager.revokeResource(session.getHandle(), grantIds);
    for (ResourceGrant grant : revokedGrants) {
      nodeManager.cancelGrant(
          grant.nodeName, session.getSessionId(), grant.getId());
    }
    sessionNotifier.notifyRevokeResource(
        session.getHandle(), revokedGrants, true);
    int preempted = revokedGrants.size();
    LOG.info("Preempt " + preempted + " " + type +
        " tasks for Session:" + session.getHandle());
    return preempted;
  }

  /**
   * Count how many tasks should preempt for the starving pools
   * @return The number of tasks should kill
   */
  private int countTasksShouldPreempt() {
    int tasksToPreempt = 0;
    long now = ClusterManager.clock.getTime();
    for (PoolGroupSchedulable poolGroup : poolGroupManager.getPoolGroups()) {
      for (PoolSchedulable pool : poolGroup.getPools()) {
        if (pool.isStarving(now)) {
          tasksToPreempt +=
            Math.min(pool.getPending(), pool.getShare() - pool.getGranted());
        }
      }
    }
    return tasksToPreempt;
  }

  /**
   * Add a session to this scheduler.
   * @param id The session ID.
   * @param session The session.
   */
  public void addSession(String id, Session session) {
    poolGroupManager.addSession(id, session);
    LOG.info("Session " + id +
        " has been added to " + type + " scheduler");
  }

  /**
   * Returns the most recently published pool metrics snapshot. The map is
   * replaced wholesale by the scheduler thread and not updated after
   * publication, so callers may read it without locking but should not
   * modify it.
   *
   * @return Map from pool info to its metrics for this resource type
   */
  public Map<PoolInfo, PoolInfoMetrics> getPoolInfoMetrics() {
    return poolInfoToMetrics;
  }

  /**
   * Inform to stop scheduling.  This is not immediate, but will eventually
   * occur: the volatile shutdown flag is checked at the top of the run loop,
   * so the thread exits after finishing its current iteration.
   */
  public void close() {
    shutdown = true;
  }
}
// TOP
//
// Related classes of org.apache.hadoop.corona.SchedulerForType
//
// TOP
// Copyright © 2018 www.massapi.com. All rights reserved.
// All source code is the property of its respective owners. Java is a
// trademark of Sun Microsystems, Inc., owned by Oracle Inc.
// Contact: coftware#gmail.com.