// NOTE(review): the three lines below are website-scraper residue, not Java source.
// Preserved verbatim as comments so the file compiles; the real package declaration
// follows the Apache license header.
// Package org.apache.hadoop.hdfs.server.balancer
// Source Code of org.apache.hadoop.hdfs.server.balancer.BalancePlan

/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs.server.balancer;

import java.util.Arrays;
import java.util.Collection;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.PriorityQueue;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.server.balancer.Balancer.BalancerDatanode;
import org.apache.hadoop.hdfs.server.balancer.Balancer.NodeTask;
import org.apache.hadoop.hdfs.server.balancer.Balancer.Source;
import org.apache.hadoop.hdfs.server.balancer.Balancer.Target;
import org.apache.hadoop.net.NetworkTopology;
import org.apache.hadoop.net.Node;

/**
 * Keeps the plan of pipelines to create between nodes and how much data must be sent
 * over each of them.
 *
 * <p>The plan is computed once, in the constructor, from a list of {@link DatanodeInfo}
 * reported by the NameNode. Nodes are split into {@link Source}s (less-than-average
 * remaining space) and {@link Target}s (at-least-average remaining space), first paired
 * within each rack and then cluster-wide. The resulting {@link NodeTask}s are attached
 * to the sources/targets for the Balancer to execute.
 *
 * <p>All "remaining" values in this class are percentages of a node's raw capacity
 * (scaled by {@link #PERCENTAGE_BASE}); a HIGHER remaining percentage means a LESS
 * utilized node.
 */
class BalancePlan {
  protected static final Log LOG = LogFactory.getLog(BalancePlan.class.getName());

  /** Number of bytes to be moved in order to make the cluster balanced. */
  public long bytesLeftToMove;
  /** Total number of bytes actually scheduled for movement by this plan. */
  public long bytesToMove;

  /** Network topology rebuilt from the datanode report; used to group nodes by rack. */
  public NetworkTopology cluster = new NetworkTopology();
  /** Maps datanode's storage ID to itself */
  public Map<String, BalancerDatanode> datanodes = new HashMap<String, BalancerDatanode>();

  /** All nodes that will participate in balancing as sources */
  public Collection<Source> sources = new HashSet<Source>();
  /** All nodes that will participate in balancing as targets */
  public Collection<Target> targets = new HashSet<Target>();

  /** If remaining < lowerRemainingThreshold then DataNode is considered overutilized */
  private double lowerRemainingThreshold;
  /** If remaining > upperRemainingThreshold then DataNode is considered underutilized */
  private double upperRemainingThreshold;
  /** Cluster-wide remaining capacity percentage */
  private double avgRemaining;

  /**
   * Compute balance plan.
   *
   * <p>Steps: (1) compute the cluster-wide average remaining percentage and the
   * [lower, upper] "balanced" band around it; (2) classify every node as a Source
   * or Target, bucketed per rack and cluster-wide; (3) pair sources with targets
   * rack-locally first, then across the whole cluster.
   *
   * @param balancer owning Balancer; supplies {@code threshold} and the Source factory
   * @param datanodes live datanode report from the NameNode; must be non-empty
   * @throws IllegalArgumentException if {@code datanodes} is null or empty
   * @throws IllegalStateException if the computed thresholds cross (sanity check;
   *         should be unreachable for threshold >= 0 and avgRemaining <= 100)
   */
  public BalancePlan(Balancer balancer, List<DatanodeInfo> datanodes) {
    if (datanodes == null || datanodes.isEmpty()) {
      throw new IllegalArgumentException("cannot prepare plan for empty cluster");
    }

    avgRemaining = computeAvgRemaining(datanodes);
    // Clamp the band: never call a node overutilized while it still has half the
    // average remaining, and never expect more than 100% remaining.
    lowerRemainingThreshold = Math.max(avgRemaining / 2, avgRemaining - balancer.threshold);
    upperRemainingThreshold = Math.min(PERCENTAGE_BASE, avgRemaining + balancer.threshold);
    if (lowerRemainingThreshold > upperRemainingThreshold) {
      throw new IllegalStateException("lowerThresh > upperThresh");
    }

    LOG.info("balanced range: [ " + lowerRemainingThreshold + ", " + upperRemainingThreshold
        + " ], average remaining: " + avgRemaining);

    // Bytes that would have to leave overutilized nodes / arrive at underutilized
    // nodes to bring each of them just inside the balanced band.
    long overLoadedBytes = 0L, underLoadedBytes = 0L;
    Bucket clusterBucket = new Bucket();
    Map<Node, Bucket> rackBuckets = new HashMap<Node, Bucket>();
    for (DatanodeInfo datanode : datanodes) {
      // Update network topology; this also assigns the node its rack parent.
      cluster.add(datanode);
      // Create the per-rack bucket if none exists yet
      assert datanode.getParent() != null : "node outside of any rack";
      Bucket bucket = rackBuckets.get(datanode.getParent());
      if (bucket == null) {
        bucket = new Bucket();
        rackBuckets.put(datanode.getParent(), bucket);
      }
      // Put DataNode into its rack bucket AND the cluster-wide bucket
      BalancerDatanode datanodeS;
      if (getRemaining(datanode) < avgRemaining) {
        // Above average utilized -> candidate sender
        datanodeS = balancer.getSource(datanode, avgRemaining);
        bucket.addSource((Source) datanodeS);
        clusterBucket.addSource((Source) datanodeS);
        if (isOverUtilized(datanodeS)) {
          // Shortfall (in percentage points) times capacity = bytes to shed
          overLoadedBytes += (long) ((lowerRemainingThreshold - datanodeS.getCurrentRemaining())
              * datanodeS.getDatanode().getCapacity() / PERCENTAGE_BASE);
        }
      } else {
        // Below average utilized -> candidate receiver
        datanodeS = new Target(datanode, avgRemaining);
        bucket.addTarget((Target) datanodeS);
        clusterBucket.addTarget((Target) datanodeS);
        if (isUnderUtilized(datanodeS)) {
          // Excess (in percentage points) times capacity = bytes it could absorb
          underLoadedBytes += (long) ((datanodeS.getCurrentRemaining() - upperRemainingThreshold)
              * datanodeS.getDatanode().getCapacity() / PERCENTAGE_BASE);
        }
      }
      // Update all DataNodes list (keyed by storage ID)
      this.datanodes.put(datanode.getStorageID(), datanodeS);
    }
    // The smaller side can be satisfied entirely by the bigger one, so the max
    // bounds the work needed to reach a balanced state.
    bytesLeftToMove = Math.max(overLoadedBytes, underLoadedBytes);

    logImbalancedNodes();

    // Balance each rack bucket separately (keeps traffic rack-local when possible)
    for (Bucket bucket : rackBuckets.values()) {
      double rackAverage = bucket.computeAvgRemaining();
      if (lowerRemainingThreshold <= rackAverage && rackAverage <= upperRemainingThreshold) {
        bucket.updatePlan();
      }
      // If perfectly balanced rack renders only over or underutilized DataNodes
      // we do not bother balancing it
    }
    // Balance cluster-wide afterwards. Rack-level scheduling mutated node state
    // behind the cluster bucket's queues, so re-heapify before planning.
    clusterBucket.externalUpdate();
    clusterBucket.updatePlan();

    bytesToMove = 0L;
    for (Source src : sources) {
      bytesToMove += src.scheduledSize;
    }

    logPlanOutcome();
  }

  /** Log the over utilized & under utilized nodes */
  private void logImbalancedNodes() {
    if (LOG.isInfoEnabled()) {
      // First pass: counts; second/third passes: names. Kept as separate loops so
      // each log line is self-contained.
      int underUtilized = 0, overUtilized = 0;
      for (BalancerDatanode node : this.datanodes.values()) {
        if (isUnderUtilized(node))
          underUtilized++;
        else if (isOverUtilized(node))
          overUtilized++;
      }
      StringBuilder msg = new StringBuilder();
      msg.append(overUtilized);
      msg.append(" over utilized nodes:");
      for (BalancerDatanode node : this.datanodes.values()) {
        if (isOverUtilized(node)) {
          msg.append(" ");
          msg.append(node.getName());
        }
      }
      LOG.info(msg);
      msg = new StringBuilder();
      msg.append(underUtilized);
      msg.append(" under utilized nodes: ");
      for (BalancerDatanode node : this.datanodes.values()) {
        if (isUnderUtilized(node)) {
          msg.append(" ");
          msg.append(node.getName());
        }
      }
      LOG.info(msg);
    }
  }

  /**
   * Log node utilization after the plan execution.
   * NOTE(review): getCurrentRemaining() here appears to reflect the remaining
   * percentage AFTER the scheduled moves (hence "Predicted plan outcome") —
   * confirm against BalancerDatanode.
   */
  private void logPlanOutcome() {
    if (LOG.isInfoEnabled()) {
      LOG.info("Predicted plan outcome: bytesLeftToMove: "
          + bytesLeftToMove + ", bytesToMove: " + bytesToMove);
      for (BalancerDatanode node : this.datanodes.values()) {
        LOG.info(node.getName() + " remaining: " + node.getCurrentRemaining());
      }
    }
  }

  /**
   * Pairs up given nodes in balancing plan: records a NodeTask of {@code size}
   * bytes on both endpoints and registers them in the plan's source/target sets
   * (sets, so repeated scheduling of the same node is idempotent).
   */
  private void scheduleTask(Source source, long size, Target target) {
    NodeTask nodeTask = new NodeTask(target, size);
    source.addNodeTask(nodeTask);
    target.addNodeTask(nodeTask);
    sources.add(source);
    targets.add(target);
    LOG.info("scheduled " + size + " bytes : " + source.getName() + " -> " + target.getName());
  }

  /** Determines if the node is overutilized (remaining below the balanced band). */
  private boolean isOverUtilized(BalancerDatanode datanode) {
    return datanode.getCurrentRemaining() < lowerRemainingThreshold;
  }

  /** Determines if the node is underutilized (remaining above the balanced band). */
  private boolean isUnderUtilized(BalancerDatanode datanode) {
    return datanode.getCurrentRemaining() > upperRemainingThreshold;
  }

  /**
   * True iff the DataNode was over or underutilized before balancing.
   * Uses the initial (pre-plan) remaining so the answer is stable while
   * scheduling mutates getCurrentRemaining(); Bucket#updatePlan's early
   * termination depends on this stability.
   */
  private boolean wasUrgent(BalancerDatanode datanode) {
    // Note that no node can become urgent during balancing if it was not before
    return datanode.initialRemaining < lowerRemainingThreshold || datanode.initialRemaining > upperRemainingThreshold;
  }

  /** Remaining ratio is expressed in percents */
  static final double PERCENTAGE_BASE = 100.0;

  /**
   * Cluster-wide remaining percentage: total remaining bytes over total capacity.
   * NOTE(review): divides by the summed capacity — yields NaN/Infinity if all
   * reported capacities are 0; callers pass a non-empty live-node report.
   */
  static double computeAvgRemaining(Iterable<DatanodeInfo> datanodes) {
    long totalCapacity = 0L, totalRemainingSpace = 0L;
    for (DatanodeInfo datanode : datanodes) {
      totalCapacity += datanode.getCapacity();
      totalRemainingSpace += datanode.getRemaining();
    }
    return (double) totalRemainingSpace / totalCapacity * PERCENTAGE_BASE;
  }

  /** Single node's remaining percentage: remaining bytes over its capacity. */
  static double getRemaining(DatanodeInfo datanode) {
    return (double) datanode.getRemaining() / datanode.getCapacity() * PERCENTAGE_BASE;
  }

  /**
   * Set of nodes which can interchange data for balancing (one per rack, plus
   * one for the whole cluster). Non-static: needs the enclosing plan's
   * thresholds and scheduleTask.
   */
  private class Bucket {
    // Priority queues order urgent (initially out-of-band) nodes first; among
    // equals, sources with the least remaining and targets with the most
    // remaining come first. 10 is just the default initial capacity.
    private PriorityQueue<Source> sources = new PriorityQueue<Source>(10, new SourceComparator());
    private PriorityQueue<Target> targets = new PriorityQueue<Target>(10, new TargetComparator());

    public void addSource(Source node) {
      this.sources.add(node);
    }

    public void addTarget(Target node) {
      this.targets.add(node);
    }

    /** Average remaining percentage over all nodes in this bucket. */
    public double computeAvgRemaining() {
      long totalCapacity = 0L, totalRemainingSpace = 0L;
      for (BalancerDatanode node : sources) {
        totalCapacity += node.getDatanode().getCapacity();
        totalRemainingSpace += node.getDatanode().getRemaining();
      }
      for (BalancerDatanode node : targets) {
        totalCapacity += node.getDatanode().getCapacity();
        totalRemainingSpace += node.getDatanode().getRemaining();
      }
      return ((double) totalRemainingSpace) / totalCapacity * PERCENTAGE_BASE;
    }

    /** Updates the plan with all pairs of nodes from this bucket which need to be connected */
    public void updatePlan() {
      while (!this.sources.isEmpty() && !this.targets.isEmpty()) {
        Source source = this.sources.poll();
        Target target = this.targets.poll();
        if (!wasUrgent(source) && !wasUrgent(target)) {
          // Due to ordering of DataNodes we can skip the rest: urgent nodes sort
          // first, so every remaining pair is also non-urgent.
          break;
        }
        long size = moveSize(source, target);
        if (size > 0) {
          scheduleTask(source, size, target);
        }
        // Re-insert endpoints that still have capacity to send/receive; polling
        // and re-adding keeps the heap ordering valid after their state changed.
        if (source.getAvailableMoveSize() > 0) {
          this.sources.add(source);
        }
        if (target.getAvailableMoveSize() > 0) {
          this.targets.add(target);
        }
        // Loop termination:
        // In each step we either scheduleTask, therefore decreasing sum (over
        // all nodes) of availableMoveSize, or decrease number of nodes in
        // sources or targets queue, all of them are bounded by 0.
      }
    }

    /** Determines how much data to move between given nodes */
    private long moveSize(Source source, BalancerDatanode target) {
      // TODO balancing concurrency
      return Math.min(source.getAvailableMoveSize(), target.getAvailableMoveSize());
    }

    /**
     * Sort internal queues again in case DataNodes was changed externally
     * (rack-level planning mutates node state behind the cluster bucket's back).
     * PriorityQueue's copy constructor keeps the original queue's comparator.
     */
    public void externalUpdate() {
      // sources and targets might no longer be a proper priority queues
      this.sources = new PriorityQueue<Source>((Collection<Source>) this.sources);
      this.targets = new PriorityQueue<Target>((Collection<Target>) this.targets);
    }

    /**
     * Orders urgent (initially out-of-band) nodes before non-urgent ones.
     * We rely on this ordering in Bucket#updatePlan loop termination condition,
     * additional priorities should be expressed in proper (source/target) comparator below.
     * Because of this condition SourceComparator and TargetComparator are not reverse of each
     * other.
     */
    private abstract class BalancerDatanodeComparator implements Comparator<BalancerDatanode> {
      @Override
      public int compare(BalancerDatanode o1, BalancerDatanode o2) {
        // true sorts before false: urgent nodes are polled first
        return Boolean.valueOf(wasUrgent(o2)).compareTo(wasUrgent(o1));
      }
    }

    /** Among equally urgent sources, the one with the LEAST remaining goes first. */
    private final class SourceComparator extends BalancerDatanodeComparator {
      @Override
      public int compare(BalancerDatanode o1, BalancerDatanode o2) {
        int ret = super.compare(o1, o2);
        if (ret == 0) {
          ret = Double.valueOf(o1.getCurrentRemaining()).compareTo(o2.getCurrentRemaining());
        }
        // TODO concurrency level can also be taken into consideration
        return ret;
      }
    }

    /** Among equally urgent targets, the one with the MOST remaining goes first. */
    private final class TargetComparator extends BalancerDatanodeComparator {
      @Override
      public int compare(BalancerDatanode o1, BalancerDatanode o2) {
        int ret = super.compare(o1, o2);
        if (ret == 0) {
          ret = Double.valueOf(o2.getCurrentRemaining()).compareTo(o1.getCurrentRemaining());
        }
        // TODO concurrency level can also be taken into consideration
        return ret;
      }
    }
  }

  /** Prints data distribution based on report from NameNode */
  public static void logDataDistribution(DatanodeInfo[] report) {
    if (LOG.isInfoEnabled()) {
      double avgRemaining = computeAvgRemaining(Arrays.asList(report));
      StringBuilder msg = new StringBuilder("Data distribution report: avgRemaining "
          + avgRemaining);
      for (DatanodeInfo node : report) {
        msg.append("\n").append(node.getName());
        msg.append(" remaining ").append(getRemaining(node));
        msg.append(" raw ").append(node.getRemaining()).append(" / ").append(node.getCapacity());
      }
      LOG.info(msg);
    }
  }
}
// NOTE(review): website-scraper footer (navigation text and third-party copyright
// notice), not part of the original Apache-licensed source. Preserved as comments
// so the file compiles.
// TOP
// Related Classes of org.apache.hadoop.hdfs.server.balancer.BalancePlan
// TOP
// Copyright (c) 2018 www.massapi.com. All rights reserved.
// All source code are property of their respective owners. Java is a trademark of
// Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.