Package org.apache.aurora.scheduler.async

Source Code of org.apache.aurora.scheduler.async.Preemptor

/**
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.aurora.scheduler.async;

import java.lang.annotation.Retention;
import java.lang.annotation.Target;
import java.util.Collections;
import java.util.EnumSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.atomic.AtomicLong;

import javax.inject.Inject;
import javax.inject.Qualifier;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Function;
import com.google.common.base.Optional;
import com.google.common.base.Predicate;
import com.google.common.base.Predicates;
import com.google.common.collect.FluentIterable;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Multimap;
import com.google.common.collect.Multimaps;
import com.google.common.collect.Ordering;
import com.google.common.collect.Sets;
import com.twitter.common.quantity.Amount;
import com.twitter.common.quantity.Time;
import com.twitter.common.stats.Stats;
import com.twitter.common.stats.StatsProvider;
import com.twitter.common.util.Clock;

import org.apache.aurora.gen.ScheduleStatus;
import org.apache.aurora.scheduler.HostOffer;
import org.apache.aurora.scheduler.ResourceSlot;
import org.apache.aurora.scheduler.base.Query;
import org.apache.aurora.scheduler.base.Tasks;
import org.apache.aurora.scheduler.filter.AttributeAggregate;
import org.apache.aurora.scheduler.filter.SchedulingFilter;
import org.apache.aurora.scheduler.state.StateManager;
import org.apache.aurora.scheduler.storage.Storage;
import org.apache.aurora.scheduler.storage.entities.IAssignedTask;
import org.apache.aurora.scheduler.storage.entities.IHostAttributes;
import org.apache.aurora.scheduler.storage.entities.IScheduledTask;

import static java.lang.annotation.ElementType.FIELD;
import static java.lang.annotation.ElementType.METHOD;
import static java.lang.annotation.ElementType.PARAMETER;
import static java.lang.annotation.RetentionPolicy.RUNTIME;
import static java.util.Objects.requireNonNull;

import static org.apache.aurora.gen.ScheduleStatus.PENDING;
import static org.apache.aurora.gen.ScheduleStatus.PREEMPTING;
import static org.apache.aurora.scheduler.base.Tasks.SCHEDULED_TO_ASSIGNED;
import static org.apache.aurora.scheduler.storage.Storage.MutableStoreProvider;
import static org.apache.aurora.scheduler.storage.Storage.MutateWork;
import static org.apache.aurora.scheduler.storage.Storage.StoreProvider;
import static org.apache.aurora.scheduler.storage.Storage.Work;

/**
* Preempts active tasks in favor of higher priority tasks.
*/
public interface Preemptor {

  /**
   * Preempts active tasks in favor of the input task.
   *
   * @param taskId ID of the preempting task.
   * @param attributeAggregate Attribute information for tasks in the job containing {@code task}.
   * @return ID of the slave where preemption occured.
   */
  Optional<String> findPreemptionSlotFor(String taskId, AttributeAggregate attributeAggregate);

  /**
   * A task preemptor that tries to find tasks that are waiting to be scheduled, which are of higher
   * priority than tasks that are currently running.
   *
   * To avoid excessive churn, the preemptor requires that a task is PENDING for a duration
   * (dictated by {@link #preemptionCandidacyDelay}) before it becomes eligible to preempt other
   * tasks.
   */
  class PreemptorImpl implements Preemptor {

    /**
     * Binding annotation for the time interval after which a pending task becomes eligible to
     * preempt other tasks.
     */
    @Qualifier
    @Target({ FIELD, PARAMETER, METHOD }) @Retention(RUNTIME)
    @interface PreemptionDelay { }

    @VisibleForTesting
    static final Query.Builder CANDIDATE_QUERY = Query.statusScoped(
        EnumSet.copyOf(Sets.difference(Tasks.SLAVE_ASSIGNED_STATES, EnumSet.of(PREEMPTING))));

    private static final Function<IAssignedTask, Integer> GET_PRIORITY =
        new Function<IAssignedTask, Integer>() {
          @Override
          public Integer apply(IAssignedTask task) {
            return task.getTask().getPriority();
          }
        };

    private final AtomicLong tasksPreempted = Stats.exportLong("preemptor_tasks_preempted");
    // Incremented every time the preemptor is invoked and finds tasks pending and preemptable tasks
    private final AtomicLong attemptedPreemptions = Stats.exportLong("preemptor_attempts");
    // Incremented every time we fail to find tasks to preempt for a pending task.
    private final AtomicLong noSlotsFound = Stats.exportLong("preemptor_no_slots_found");

    private final Predicate<IScheduledTask> isIdleTask = new Predicate<IScheduledTask>() {
      @Override
      public boolean apply(IScheduledTask task) {
        return (clock.nowMillis() - Tasks.getLatestEvent(task).getTimestamp())
            >= preemptionCandidacyDelay.as(Time.MILLISECONDS);
      }
    };

    private final Storage storage;
    private final StateManager stateManager;
    private final OfferQueue offerQueue;
    private final SchedulingFilter schedulingFilter;
    private final Amount<Long, Time> preemptionCandidacyDelay;
    private final Clock clock;
    private final AtomicLong missingAttributes;

    /**
     * Creates a new preemptor.
     *
     * @param storage Backing store for tasks.
     * @param stateManager Scheduler state controller to instruct when preempting tasks.
     * @param offerQueue Queue that contains available Mesos resource offers.
     * @param schedulingFilter Filter to identify whether tasks may reside on given slaves.
     * @param preemptionCandidacyDelay Time a task must be PENDING before it may preempt other
     *                                 tasks.
     * @param clock Clock to check current time.
     */
    @Inject
    PreemptorImpl(
        Storage storage,
        StateManager stateManager,
        OfferQueue offerQueue,
        SchedulingFilter schedulingFilter,
        @PreemptionDelay Amount<Long, Time> preemptionCandidacyDelay,
        Clock clock,
        StatsProvider statsProvider) {

      this.storage = requireNonNull(storage);
      this.stateManager = requireNonNull(stateManager);
      this.offerQueue = requireNonNull(offerQueue);
      this.schedulingFilter = requireNonNull(schedulingFilter);
      this.preemptionCandidacyDelay = requireNonNull(preemptionCandidacyDelay);
      this.clock = requireNonNull(clock);
      missingAttributes = statsProvider.makeCounter("preemptor_missing_attributes");
    }

    private List<IAssignedTask> fetch(Query.Builder query, Predicate<IScheduledTask> filter) {
      return Lists.newArrayList(Iterables.transform(Iterables.filter(
          Storage.Util.consistentFetchTasks(storage, query), filter),
          SCHEDULED_TO_ASSIGNED));
    }

    private List<IAssignedTask> fetch(Query.Builder query) {
      return fetch(query, Predicates.<IScheduledTask>alwaysTrue());
    }

    private static final Function<IAssignedTask, String> TASK_TO_SLAVE_ID =
        new Function<IAssignedTask, String>() {
          @Override
          public String apply(IAssignedTask input) {
            return input.getSlaveId();
          }
        };

    private static Predicate<IAssignedTask> canPreempt(final IAssignedTask pending) {
      return new Predicate<IAssignedTask>() {
        @Override
        public boolean apply(IAssignedTask possibleVictim) {
          return preemptionFilter(possibleVictim).apply(pending);
        }
      };
    }

    private static final Function<IAssignedTask, ResourceSlot> TASK_TO_RESOURCES =
        new Function<IAssignedTask, ResourceSlot>() {
          @Override
          public ResourceSlot apply(IAssignedTask input) {
            return ResourceSlot.from(input.getTask());
          }
        };

    private static final Function<HostOffer, ResourceSlot> OFFER_TO_RESOURCE_SLOT =
        new Function<HostOffer, ResourceSlot>() {
          @Override
          public ResourceSlot apply(HostOffer offer) {
            return ResourceSlot.from(offer.getOffer());
          }
        };

    private static final Function<HostOffer, String> OFFER_TO_HOST =
        new Function<HostOffer, String>() {
          @Override
          public String apply(HostOffer offer) {
            return offer.getOffer().getHostname();
          }
        };

    private static final Function<HostOffer, IHostAttributes> OFFER_TO_ATTRIBUTES =
        new Function<HostOffer, IHostAttributes>() {
          @Override
          public IHostAttributes apply(HostOffer offer) {
            return offer.getAttributes();
          }
        };

    // TODO(zmanji) Consider using Dominant Resource Fairness for ordering instead of the vector
    // ordering
    private static final Ordering<IAssignedTask> RESOURCE_ORDER =
        ResourceSlot.ORDER.onResultOf(TASK_TO_RESOURCES).reverse();

    /**
     * Optional.absent indicates that this slave does not have enough resources to satisfy the task.
     * The empty set indicates the offers (slack) are enough.
     * A set with elements indicates those tasks and the offers are enough.
     */
    private Optional<Set<IAssignedTask>> getTasksToPreempt(
        Iterable<IAssignedTask> possibleVictims,
        Iterable<HostOffer> offers,
        IAssignedTask pendingTask,
        AttributeAggregate attributeAggregate) {

      // This enforces the precondition that all of the resources are from the same host. We need to
      // get the host for the schedulingFilter.
      Set<String> hosts = ImmutableSet.<String>builder()
          .addAll(Iterables.transform(possibleVictims, Tasks.ASSIGNED_TO_SLAVE_HOST))
          .addAll(Iterables.transform(offers, OFFER_TO_HOST)).build();

      String host = Iterables.getOnlyElement(hosts);

      ResourceSlot slackResources =
          ResourceSlot.sum(Iterables.transform(offers, OFFER_TO_RESOURCE_SLOT));

      if (!Iterables.isEmpty(offers)) {
        if (Iterables.size(offers) > 1) {
          // There are multiple offers for the same host. Since both have maintenance information
          // we don't preempt with this information and wait for mesos to merge the two offers for
          // us.
          return Optional.absent();
        }
        IHostAttributes attributes = Iterables.getOnlyElement(
            FluentIterable.from(offers).transform(OFFER_TO_ATTRIBUTES).toSet());

        Set<SchedulingFilter.Veto> vetoes = schedulingFilter.filter(
            slackResources,
            attributes,
            pendingTask.getTask(),
            pendingTask.getTaskId(),
            attributeAggregate);

        if (vetoes.isEmpty()) {
          return Optional.<Set<IAssignedTask>>of(ImmutableSet.<IAssignedTask>of());
        }
      }

      FluentIterable<IAssignedTask> preemptableTasks =
          FluentIterable.from(possibleVictims).filter(canPreempt(pendingTask));

      if (preemptableTasks.isEmpty()) {
        return Optional.absent();
      }

      List<IAssignedTask> toPreemptTasks = Lists.newArrayList();

      Iterable<IAssignedTask> sortedVictims = RESOURCE_ORDER.immutableSortedCopy(preemptableTasks);

      for (IAssignedTask victim : sortedVictims) {
        toPreemptTasks.add(victim);

        ResourceSlot totalResource = ResourceSlot.sum(
            ResourceSlot.sum(Iterables.transform(toPreemptTasks, TASK_TO_RESOURCES)),
            slackResources);

        Optional<IHostAttributes> attributes = getHostAttributes(host);
        if (!attributes.isPresent()) {
          missingAttributes.incrementAndGet();
          continue;
        }

        Set<SchedulingFilter.Veto> vetoes = schedulingFilter.filter(
            totalResource,
            attributes.get(),
            pendingTask.getTask(),
            pendingTask.getTaskId(),
            attributeAggregate);

        if (vetoes.isEmpty()) {
          return Optional.<Set<IAssignedTask>>of(ImmutableSet.copyOf(toPreemptTasks));
        }
      }
      return Optional.absent();
    }

    private Optional<IHostAttributes> getHostAttributes(final String host) {
      return storage.weaklyConsistentRead(new Work.Quiet<Optional<IHostAttributes>>() {
        @Override
        public Optional<IHostAttributes> apply(StoreProvider storeProvider) {
          return storeProvider.getAttributeStore().getHostAttributes(host);
        }
      });
    }

    private static final Function<HostOffer, String> OFFER_TO_SLAVE_ID =
        new Function<HostOffer, String>() {
          @Override
          public String apply(HostOffer offer) {
            return offer.getOffer().getSlaveId().getValue();
          }
        };

    private Multimap<String, IAssignedTask> getSlavesToActiveTasks() {
      // Only non-pending active tasks may be preempted.
      List<IAssignedTask> activeTasks = fetch(CANDIDATE_QUERY);

      // Walk through the preemption candidates in reverse scheduling order.
      Collections.sort(activeTasks, Tasks.SCHEDULING_ORDER.reverse());

      // Group the tasks by slave id so they can be paired with offers from the same slave.
      return Multimaps.index(activeTasks, TASK_TO_SLAVE_ID);
    }

    @Override
    public synchronized Optional<String> findPreemptionSlotFor(
        String taskId,
        AttributeAggregate attributeAggregate) {

      List<IAssignedTask> pendingTasks =
          fetch(Query.statusScoped(PENDING).byId(taskId), isIdleTask);

      // Task is no longer PENDING no need to preempt
      if (pendingTasks.isEmpty()) {
        return Optional.absent();
      }

      final IAssignedTask pendingTask = Iterables.getOnlyElement(pendingTasks);

      Multimap<String, IAssignedTask> slavesToActiveTasks = getSlavesToActiveTasks();

      if (slavesToActiveTasks.isEmpty()) {
        return Optional.absent();
      }

      attemptedPreemptions.incrementAndGet();

      // Group the offers by slave id so they can be paired with active tasks from the same slave.
      Multimap<String, HostOffer> slavesToOffers =
          Multimaps.index(offerQueue.getOffers(), OFFER_TO_SLAVE_ID);

      Set<String> allSlaves = ImmutableSet.<String>builder()
          .addAll(slavesToOffers.keySet())
          .addAll(slavesToActiveTasks.keySet())
          .build();

      for (String slaveID : allSlaves) {
        final Optional<Set<IAssignedTask>> toPreemptTasks = getTasksToPreempt(
            slavesToActiveTasks.get(slaveID),
            slavesToOffers.get(slaveID),
            pendingTask,
            attributeAggregate);

        if (toPreemptTasks.isPresent()) {
          storage.write(new MutateWork.NoResult.Quiet() {
            @Override
            protected void execute(MutableStoreProvider storeProvider) {
              for (IAssignedTask toPreempt : toPreemptTasks.get()) {
                stateManager.changeState(
                    storeProvider,
                    toPreempt.getTaskId(),
                    Optional.<ScheduleStatus>absent(),
                    PREEMPTING,
                    Optional.of("Preempting in favor of " + pendingTask.getTaskId()));
                tasksPreempted.incrementAndGet();
              }
            }
          });
          return Optional.of(slaveID);
        }
      }

      noSlotsFound.incrementAndGet();
      return Optional.absent();
    }

    private static final Predicate<IAssignedTask> IS_PRODUCTION =
        Predicates.compose(Tasks.IS_PRODUCTION, Tasks.ASSIGNED_TO_INFO);

    /**
     * Creates a static filter that will identify tasks that may preempt the provided task.
     * A task may preempt another task if the following conditions hold true:
     * - The resources reserved for {@code preemptableTask} are sufficient to satisfy the task.
     * - The tasks are owned by the same user and the priority of {@code preemptableTask} is lower
     *     OR {@code preemptableTask} is non-production and the compared task is production.
     *
     * @param preemptableTask Task to possibly preempt.
     * @return A filter that will compare the priorities and resources required by other tasks
     *     with {@code preemptableTask}.
     */
    private static Predicate<IAssignedTask> preemptionFilter(IAssignedTask preemptableTask) {
      Predicate<IAssignedTask> preemptableIsProduction = preemptableTask.getTask().isProduction()
          ? Predicates.<IAssignedTask>alwaysTrue()
          : Predicates.<IAssignedTask>alwaysFalse();

      Predicate<IAssignedTask> priorityFilter =
          greaterPriorityFilter(GET_PRIORITY.apply(preemptableTask));
      return Predicates.or(
          Predicates.and(Predicates.not(preemptableIsProduction), IS_PRODUCTION),
          Predicates.and(isOwnedBy(getRole(preemptableTask)), priorityFilter)
      );
    }

    private static Predicate<IAssignedTask> isOwnedBy(final String role) {
      return new Predicate<IAssignedTask>() {
        @Override
        public boolean apply(IAssignedTask task) {
          return getRole(task).equals(role);
        }
      };
    }

    private static String getRole(IAssignedTask task) {
      return task.getTask().getJob().getRole();
    }

    private static Predicate<Integer> greaterThan(final int value) {
      return new Predicate<Integer>() {
        @Override
        public boolean apply(Integer input) {
          return input > value;
        }
      };
    }

    private static Predicate<IAssignedTask> greaterPriorityFilter(int priority) {
      return Predicates.compose(greaterThan(priority), GET_PRIORITY);
    }
  }
}
TOP

Related Classes of org.apache.aurora.scheduler.async.Preemptor

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.
ww.google-analytics.com/analytics.js','ga'); ga('create', 'UA-20639858-1', 'auto'); ga('send', 'pageview');