Package org.apache.hadoop.hbase

Source Code of org.apache.hadoop.hbase.HMaster$ChangeTableState

/**
* Copyright 2007 The Apache Software Foundation
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.lang.reflect.Constructor;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Random;
import java.util.Set;
import java.util.SortedMap;
import java.util.Timer;
import java.util.TimerTask;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.Delayed;
import java.util.concurrent.DelayQueue;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicReference;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.hbase.io.BatchUpdate;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.util.FSUtils;
import org.apache.hadoop.hbase.util.InfoServer;
import org.apache.hadoop.hbase.util.Sleeper;
import org.apache.hadoop.hbase.util.Threads;
import org.apache.hadoop.hbase.util.Writables;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.ipc.RPC;
import org.apache.hadoop.ipc.RemoteException;
import org.apache.hadoop.ipc.Server;


/**
* HMaster is the "master server" for an HBase deployment.
* There is only one HMaster per HBase deployment.
*/
public class HMaster extends Thread implements HConstants, HMasterInterface,
HMasterRegionInterface {
  /** Logger for the master; the nested scanner classes in this file log through it too. */
  static final Log LOG = LogFactory.getLog(HMaster.class.getName());

  /** {@inheritDoc} */
  public long getProtocolVersion(String protocol,
      @SuppressWarnings("unused") long clientVersion)
  throws IOException {
    if (protocol.equals(HMasterInterface.class.getName())) {
      return HMasterInterface.versionID;
    } else if (protocol.equals(HMasterRegionInterface.class.getName())) {
      return HMasterRegionInterface.versionID;
    } else {
      throw new IOException("Unknown protocol to name node: " + protocol);
    }
  }

  // We start out with closed flag on.  Using AtomicBoolean rather than
  // plain boolean because want to pass a reference to supporting threads
  // started here in HMaster rather than have them have to know about the
  // hosting class.  The constructor clears the flag as its last step and
  // checkFileSystem() sets it again when the file system is unavailable.
  volatile AtomicBoolean closed = new AtomicBoolean(true);
  // True until checkFileSystem() finds the file system unavailable.
  volatile boolean fsOk;
  // Base directory under which region and log directories are resolved.
  Path dir;
  Configuration conf;
  // Obtained from FileSystem.get(conf) in the constructor.
  FileSystem fs;
  // Supplies the ids for BatchUpdates issued against meta regions.
  Random rand;
  // THREAD_WAKE_FREQUENCY setting (default 10 * 1000); used as a timeout in
  // milliseconds when polling and waiting, and as the Sleeper period.
  int threadWakeFrequency;
  // "hbase.client.retries.number" (default 2); bounds scan retries.
  int numRetries;
  // "hbase.hbasemaster.maxregionopen" (default 30 * 1000).
  // NOTE(review): presumably milliseconds; not used in this part of the file.
  long maxRegionOpenTime;

  // Both queues are created in the constructor; their consumers are not in
  // this part of the file.
  DelayQueue<PendingServerShutdown> shutdownQueue;
  BlockingQueue<PendingOperation> msgQueue;

  // "hbase.master.lease.period" (default 30 * 1000); handed to serverLeases.
  int leaseTimeout;
  private Leases serverLeases;
  // RPC server created in the constructor with this master as its instance.
  private Server server;
  // Address the RPC server is actually listening on (the port may have been
  // ephemeral); also written back into conf under MASTER_ADDRESS.
  private HServerAddress address;

  // Used by the scanners to obtain an HRegionInterface for a region server.
  HConnection connection;

  // "hbase.master.meta.thread.rescanfrequency" (default 60 * 1000); period
  // handed to the ROOT and META scanner chores.
  int metaRescanInterval;

  // Holds null until the server hosting the ROOT region is known.  The ROOT
  // scanner also uses this object as the monitor it waits on.
  final AtomicReference<HServerAddress> rootRegionLocation =
    new AtomicReference<HServerAddress>();

  // Held around HLog.splitLog() so that log splits do not run concurrently.
  Lock splitLogLock = new ReentrantLock();

  // A Sleeper that sleeps for threadWakeFrequency
  protected Sleeper sleeper;

  // Default access so accessible from unit tests. MASTER is the name of the
  // webapp and the attribute name used when stuffing this instance into the
  // web context.
  InfoServer infoServer;
  public static final String MASTER = "master";

  /**
   * Base HRegion scanner class. Holds utility common to <code>ROOT</code> and
   * <code>META</code> HRegion scanners.
   *
   * <p>How do we know if all regions are assigned? After the initial scan of
   * the <code>ROOT</code> and <code>META</code> regions, all regions known at
   * that time will have been or are in the process of being assigned.</p>
   *
   * <p>When a region is split the region server notifies the master of the
   * split and the new regions are assigned. But suppose the master loses the
   * split message? We need to periodically rescan the <code>ROOT</code> and
   * <code>META</code> regions.</p>
   * <ul>
   * <li>If we rescan, any regions that are new but not assigned will have
   * no server info. Any regions that are not being served by the same
   * server will get re-assigned.</li>
   *
   * <li>Thus a periodic rescan of the root region will find any new
   * <code>META</code> regions where we missed the <code>META</code> split
   * message or we failed to detect a server death and consequently need to
   * assign the region to a new server.</li>
   *
   * <li>If we keep track of all the known <code>META</code> regions, then
   * we can rescan them periodically. If we do this then we can detect any
   * regions for which we missed a region split message.</li>
   * </ul>
   *
   * <p>Thus just keeping track of all the <code>META</code> regions permits
   * periodic rescanning which will detect unassigned regions (new or
   * otherwise) without the need to keep track of every region.</p>
   *
   * <p>So the <code>ROOT</code> region scanner needs to wake up:</p>
   * <ol>
   * <li>when the master receives notification that the <code>ROOT</code>
   * region has been opened.</li>
   * <li>periodically after the first scan</li>
   * </ol>
   *
   * <p>The <code>META</code> scanner needs to wake up:</p>
   * <ol>
   * <li>when a <code>META</code> region comes on line</li>
   * <li>periodically to rescan the online <code>META</code> regions</li>
   * </ol>
   *
   * <p>A <code>META</code> region is not 'online' until it has been scanned
   * once.</p>
   */
  abstract class BaseScanner extends Chore {
    /** True when this scanner was constructed for <code>ROOT_TABLE_NAME</code>. */
    protected boolean rootRegion;
    /** Name of the catalog table this scanner was constructed for. */
    protected final Text tableName;

    /** Work to perform on the first run of the chore. */
    protected abstract void initialScan();
    /** Work to perform on each subsequent run of the chore. */
    protected abstract void maintenanceScan();

    /**
     * @param tableName name of the catalog table to scan
     * @param period passed through to {@link Chore} as the chore period
     * @param stop passed through to {@link Chore} as the stop flag
     */
    BaseScanner(final Text tableName, final int period,
        final AtomicBoolean stop) {
      super(period, stop);
      this.tableName = tableName;
      this.rootRegion = tableName.equals(ROOT_TABLE_NAME);
    }

    /** Delegates to {@link #initialScan()}. */
    @Override
    protected void initialChore() {
      initialScan();
    }

    /** Delegates to {@link #maintenanceScan()}. */
    @Override
    protected void chore() {
      maintenanceScan();
    }

    /**
     * Scans every row of the passed catalog region.  For each row the region
     * described by it is passed to {@link #checkAssigned}; rows describing
     * split parents are collected and, once the scanner has been closed,
     * offered to <code>cleanupSplits</code>.  When this is the ROOT scanner
     * the number of rows found is recorded in <code>numberOfMetaRegions</code>.
     *
     * @param region Region to scan
     * @throws IOException if opening or advancing the scanner fails, or if
     * assignment checking or split cleanup fails; a RemoteException is decoded
     * before being rethrown
     */
    protected void scanRegion(final MetaRegion region) throws IOException {
      HRegionInterface regionServer = null;
      long scannerId = -1L;
      LOG.info(Thread.currentThread().getName() + " scanning meta region " +
        region.toString());

      // Split parents found by the scan, mapped to the content of their row.
      // The scan adds to the map; after the scan we check whether the
      // parents can be removed.
      Map<HRegionInfo, SortedMap<Text, byte[]>> splitParents =
        new HashMap<HRegionInfo, SortedMap<Text, byte[]>>();
      try {
        regionServer = connection.getHRegionConnection(region.getServer());
        scannerId =
          regionServer.openScanner(region.getRegionName(), COLUMN_FAMILY_ARRAY,
              EMPTY_START_ROW, System.currentTimeMillis(), null);

        int numberOfRegionsFound = 0;
        while (true) {
          // One row per iteration: column name -> cell value.
          SortedMap<Text, byte[]> results = new TreeMap<Text, byte[]>();
          MapWritable values = regionServer.next(scannerId);
          if (values == null || values.size() == 0) {
            // Scanner exhausted.
            break;
          }

          for (Map.Entry<Writable, Writable> e: values.entrySet()) {
            HStoreKey key = (HStoreKey) e.getKey();
            results.put(key.getColumn(),
                ((ImmutableBytesWritable) e.getValue()).get());
          }
          HRegionInfo info = (HRegionInfo) Writables.getWritable(
              results.get(COL_REGIONINFO), new HRegionInfo());
          String serverName = Writables.bytesToString(results.get(COL_SERVER));
          long startCode = Writables.bytesToLong(results.get(COL_STARTCODE));
          if (LOG.isDebugEnabled()) {
            LOG.debug(Thread.currentThread().getName() + " scanner: " +
              Long.valueOf(scannerId) + " regioninfo: {" + info.toString() +
              "}, server: " + serverName + ", startCode: " + startCode);
          }

          // Note Region has been assigned.
          checkAssigned(info, serverName, startCode);
          if (isSplitParent(info)) {
            splitParents.put(info, results);
          }
          numberOfRegionsFound += 1;
        }
        if (this.rootRegion) {
          // Each row of ROOT counts as one META region.
          numberOfMetaRegions.set(numberOfRegionsFound);
        }
      } catch (IOException e) {
        if (e instanceof RemoteException) {
          e = RemoteExceptionHandler.decodeRemoteException((RemoteException) e);
          if (e instanceof UnknownScannerException) {
            // Reset scannerId so we do not try closing a scanner the other side
            // has lost account of: prevents duplicated stack trace out of the
            // below close in the finally.
            scannerId = -1L;
          }
        }
        throw e;
      } finally {
        try {
          if (scannerId != -1L && regionServer != null) {
            regionServer.close(scannerId);
          }
        } catch (IOException e) {
          // A failed close is logged only; it must not mask the scan outcome.
          LOG.error("Closing scanner",
            RemoteExceptionHandler.checkIOException(e));
        }
      }

      // Scan is finished.  Take a look at split parents to see if any we can
      // clean up.
      if (splitParents.size() > 0) {
        for (Map.Entry<HRegionInfo, SortedMap<Text, byte[]>> e:
            splitParents.entrySet()) {
          HRegionInfo hri = e.getKey();
          cleanupSplits(region.getRegionName(), regionServer, hri, e.getValue());
        }
      }
      LOG.info(Thread.currentThread().getName() + " scan of meta region " +
        region.toString() + " complete");
    }

    /*
     * Warns when the region is flagged as split but not as offline.
     * @param info Region to check.
     * @return True if this is a split parent.
     */
    private boolean isSplitParent(final HRegionInfo info) {
      if (!info.isSplit()) {
        return false;
      }
      if (!info.isOffline()) {
        LOG.warn("Region is split but not offline: " + info.regionName);
      }
      return true;
    }

    /*
     * If daughters no longer hold reference to the parents, delete the parent.
     * The parent's row is deleted from the meta region even when deleting
     * the parent from the filesystem fails; that failure is only logged.
     * @param metaRegionName Meta region name.
     * @param srvr HRegionInterface of meta server to talk to
     * @param parent HRegionInfo of split parent
     * @param rowContent Content of <code>parent</code> row in
     * <code>metaRegionName</code>
     * @return True if neither daughter held references and the
     * <code>parent</code> row was deleted from the meta region.
     * @throws IOException
     */
    private boolean cleanupSplits(final Text metaRegionName,
        final HRegionInterface srvr, final HRegionInfo parent,
        SortedMap<Text, byte[]> rowContent)
    throws IOException {
      boolean result = false;

      // Both daughters are always checked: hasReferences also removes the
      // split column of a daughter that no longer holds references.
      boolean hasReferencesA = hasReferences(metaRegionName, srvr,
          parent.getRegionName(), rowContent, COL_SPLITA);
      boolean hasReferencesB = hasReferences(metaRegionName, srvr,
          parent.getRegionName(), rowContent, COL_SPLITB);

      if (!hasReferencesA && !hasReferencesB) {
        LOG.info("Deleting region " + parent.getRegionName() +
        " because daughter splits no longer hold references");
        if (!HRegion.deleteRegion(fs, dir, parent.getRegionName())) {
          LOG.warn("Deletion of " + parent.getRegionName() + " failed");
        }

        // Remove the parent's row from the meta region.
        BatchUpdate b = new BatchUpdate(rand.nextLong());
        long lockid = b.startUpdate(parent.getRegionName());
        b.delete(lockid, COL_REGIONINFO);
        b.delete(lockid, COL_SERVER);
        b.delete(lockid, COL_STARTCODE);
        srvr.batchUpdate(metaRegionName, System.currentTimeMillis(), b);
        result = true;
      } else if (LOG.isDebugEnabled()) {
        // If debug, note we checked and current state of daughters.
        LOG.debug("Checked " + parent.getRegionName() +
          " for references: splitA: " + hasReferencesA + ", splitB: "+
          hasReferencesB);
      }
      return result;
    }

    /*
     * Checks if a daughter region -- either splitA or splitB -- still holds
     * references to parent.  If not, removes reference to the split from
     * the parent meta region row.
     * @param metaRegionName Name of meta region to look in.
     * @param srvr Where region resides.
     * @param parent Parent region name.
     * @param rowContent Keyed content of the parent row in meta region.
     * @param splitColumn Column name of daughter split to examine
     * @return True if still has references to parent.  False also when the
     * row holds no daughter info in <code>splitColumn</code>.
     * @throws IOException
     */
    protected boolean hasReferences(final Text metaRegionName,
      final HRegionInterface srvr, final Text parent,
      SortedMap<Text, byte[]> rowContent, final Text splitColumn)
    throws IOException {
      boolean result = false;
      HRegionInfo split =
        Writables.getHRegionInfoOrNull(rowContent.get(splitColumn));
      if (split == null) {
        // No daughter recorded in this column: nothing to check or to remove.
        return result;
      }
      for (Text family: split.getTableDesc().families().keySet()) {
        Path p = HStoreFile.getMapDir(fs.makeQualified(dir),
            split.getRegionName(), HStoreKey.extractFamily(family));

        // Look for reference files.  Call listPaths with an anonymous
        // instance of PathFilter.

        Path [] ps = fs.listPaths(p,
            new PathFilter () {
              public boolean accept(Path path) {
                return HStoreFile.isReference(path);
              }
            }
        );

        if (ps != null && ps.length > 0) {
          // One reference file in any family is enough.
          result = true;
          break;
        }
      }

      if (result) {
        return result;
      }

      if (LOG.isDebugEnabled()) {
        LOG.debug(split.getRegionName().toString()
            +" no longer has references to " + parent.toString());
      }

      // Daughter holds no references: drop its column from the parent's row.
      BatchUpdate b = new BatchUpdate(rand.nextLong());
      long lockid = b.startUpdate(parent);
      b.delete(lockid, splitColumn);
      srvr.batchUpdate(metaRegionName, System.currentTimeMillis(), b);

      return result;
    }

    /**
     * Decides whether the region described by a catalog row needs to be
     * (re)assigned and, if so, queues it in <code>unassignedRegions</code>.
     *
     * <p>Regions that are offline, or queued to go offline or be deleted, are
     * removed from the assignment bookkeeping.  Regions on the kill list of
     * the recorded server are left alone.  Otherwise, when the region is
     * neither already unassigned nor pending and the recorded server is
     * unknown to the master or has a different start code, the server's log
     * directory (if it exists) is split and the region is queued for
     * assignment.</p>
     *
     * @param info region described by the row
     * @param serverName content of the row's server column; empty when no
     * server is recorded
     * @param startCode content of the row's start code column
     * @throws IOException if splitting the region server log fails
     */
    protected void checkAssigned(final HRegionInfo info,
      final String serverName, final long startCode)
    throws IOException {
      // Skip region - if ...
      if(info.offLine                                     // offline
          || killedRegions.contains(info.regionName)      // queued for offline
          || regionsToDelete.contains(info.regionName)) { // queued for delete
        unassignedRegions.remove(info.regionName);
        assignAttempts.remove(info.regionName);
        return;
      }
      HServerInfo storedInfo = null;
      if (serverName.length() != 0) {
        Map<Text, HRegionInfo> regionsToKill = killList.get(serverName);
        if (regionsToKill != null &&
            regionsToKill.containsKey(info.regionName)) {

          // Skip if region is on kill list
          if(LOG.isDebugEnabled()) {
            LOG.debug("not assigning region (on kill list): " + info.regionName);
          }
          return;
        }
        synchronized (serversToServerInfo) {
          storedInfo = serversToServerInfo.get(serverName);
        }
      }
      if (LOG.isDebugEnabled()) {
        LOG.debug("Checking " + info.regionName + " is assigned");
      }
      if (!(unassignedRegions.containsKey(info.regionName) ||
            pendingRegions.contains(info.regionName))
          && (storedInfo == null || storedInfo.getStartCode() != startCode)) {
        // The current assignment is no good
        if (LOG.isDebugEnabled()) {
          LOG.debug("Current assignment of " + info.regionName + " is no good");
        }
        // Recover the region server's log if there is one.
        if (serverName.length() != 0) {
          // Log directory name is "log_" plus the server name with ':'
          // replaced by '_'.
          StringBuilder dirName = new StringBuilder("log_");
          dirName.append(serverName.replace(":", "_"));
          Path logDir = new Path(dir, dirName.toString());
          try {
            if (fs.exists(logDir)) {
              splitLogLock.lock();
              try {
                HLog.splitLog(dir, logDir, fs, conf);
              } finally {
                splitLogLock.unlock();
              }
            }
            // NOTE(review): this message is logged even when the log
            // directory did not exist and nothing was split.
            if (LOG.isDebugEnabled()) {
              LOG.debug("Split " + logDir.toString());
            }
          } catch (IOException e) {
            LOG.warn("unable to split region server log because: ", e);
            throw e;
          }
        }
        // Now get the region assigned
        unassignedRegions.put(info.regionName, info);
        assignAttempts.put(info.regionName, Long.valueOf(0L));
      }
    }
  }

  /**
   * Set by the ROOT scanner once its initial scan has returned, whether or
   * not that scan succeeded.
   */
  volatile boolean rootScanned;

  /** Scanner for the <code>ROOT</code> HRegion. */
  class RootScanner extends BaseScanner {
    /** Constructor */
    public RootScanner() {
      super(HConstants.ROOT_TABLE_NAME, metaRescanInterval, closed);
    }

    private void scanRoot() {
      boolean succeeded = false;
      int tries = 0;
      while (!closed.get() && tries < numRetries) {
        synchronized (rootRegionLocation) {
          while(!closed.get() && rootRegionLocation.get() == null) {
            // rootRegionLocation will be filled in when we get an 'open region'
            // regionServerReport message from the HRegionServer that has been
            // allocated the ROOT region below.
            try {
              rootRegionLocation.wait();
            } catch (InterruptedException e) {
              // continue
            }
          }
        }
        if (closed.get()) {
          continue;
        }

        try {
          // Don't interrupt us while we're working
          synchronized(rootScannerLock) {
            scanRegion(new MetaRegion(rootRegionLocation.get(),
                HGlobals.rootRegionInfo.regionName, null));
          }
          succeeded = true;
          break;
        } catch (IOException e) {
          e = RemoteExceptionHandler.checkIOException(e);
          tries += 1;
          if (tries == 1) {
            LOG.warn("Scan ROOT region", e);
          } else {
            LOG.error("Scan ROOT region", e);
          if (tries == numRetries - 1) {
              // We ran out of tries. Make sure the file system is still
              // available
              if (!checkFileSystem()) {
                continue; // Avoid sleeping.
              }
            }
          }
        } catch (Exception e) {
          // If for some reason we get some other kind of exception,
          // at least log it rather than go out silently.
          LOG.error("Unexpected exception", e);
        }
        sleeper.sleep();
      }
      if (!succeeded) {
        // We tried numretries to reach root and failed.  Is it gone.
        // Currently we just flounder.  Should we reallocate root?
        // This would be catastrophic?
        // unassignRootRegion();
      }
    }

    @Override
    protected void initialScan() {
      scanRoot();
      rootScanned = true;
    }

    @Override
    protected void maintenanceScan() {
      scanRoot();
    }
  }

  /** The ROOT scanner chore; instantiated by the constructor. */
  private RootScanner rootScannerThread;
  /** Monitor the ROOT scanner holds for the duration of a scan of ROOT. */
  Integer rootScannerLock = new Integer(0);

  @SuppressWarnings("unchecked")
  public static class MetaRegion implements Comparable {
    private HServerAddress server;
    private Text regionName;
    private Text startKey;

    MetaRegion(HServerAddress server, Text regionName, Text startKey) {
      if (server == null) {
        throw new IllegalArgumentException("server cannot be null");
      }
      this.server = server;
     
      if (regionName == null) {
        throw new IllegalArgumentException("regionName cannot be null");
      }
      this.regionName = new Text(regionName);
     
      this.startKey = new Text();
      if (startKey != null) {
        this.startKey.set(startKey);
      }
    }
   
    @Override
    public String toString() {
      return "regionname: " + this.regionName.toString() + ", startKey: <" +
        this.startKey.toString() + ">, server: " + this.server.toString() + "}";
    }

    /** @return the regionName */
    public Text getRegionName() {
      return regionName;
    }

    /** @return the server */
    public HServerAddress getServer() {
      return server;
    }

    /** @return the startKey */
    public Text getStartKey() {
      return startKey;
    }

    /** {@inheritDoc} */
    @Override
    public boolean equals(Object o) {
      return this.compareTo(o) == 0;
    }

    /** {@inheritDoc} */
    @Override
    public int hashCode() {
      int result = this.regionName.hashCode();
      result ^= this.startKey.hashCode();
      return result;
    }

    // Comparable

    /** {@inheritDoc} */
    public int compareTo(Object o) {
      MetaRegion other = (MetaRegion)o;
      int result = this.regionName.compareTo(other.getRegionName());
      if(result == 0) {
        result = this.startKey.compareTo(other.getStartKey());
        if (result == 0) {
          // Might be on different host?
          result = this.server.compareTo(other.server);
        }
      }
      return result;
    }
  }

  /**
   * Set by root scanner to indicate the number of meta regions: the number of
   * rows found by the most recent scan of ROOT that ran to completion.
   */
  final AtomicInteger numberOfMetaRegions = new AtomicInteger();

  /** Work for the meta scanner is queued up here */
  final BlockingQueue<MetaRegion> metaRegionsToScan =
    new LinkedBlockingQueue<MetaRegion>();

  /**
   * These are the online meta regions, keyed by region start key.  The meta
   * scanner adds a region here after scanning it successfully.
   */
  final SortedMap<Text, MetaRegion> onlineMetaRegions =
    Collections.synchronizedSortedMap(new TreeMap<Text, MetaRegion>());

  /** Set by meta scanner after initial scan */
  volatile boolean initialMetaScanComplete;

  /**
   * MetaScanner <code>META</code> table.
   *
   * When a <code>META</code> server comes on line, a MetaRegion object is
   * queued up by regionServerReport() and this thread wakes up.
   *
   * It's important to do this work in a separate thread, or else the blocking
   * action would prevent other work from getting done.
   */
  class MetaScanner extends BaseScanner {
    /** Constructor */
    public MetaScanner() {
      super(HConstants.META_TABLE_NAME, metaRescanInterval, closed);
    }

    private void scanOneMetaRegion(MetaRegion region) {
      int tries = 0;
      while (!closed.get() && tries < numRetries) {
        while (!closed.get() && !rootScanned &&
            rootRegionLocation.get() == null) {
          sleeper.sleep();
        }
        if (closed.get()) {
          continue;
        }

        try {
          // Don't interrupt us while we're working
          synchronized (metaScannerLock) {
            scanRegion(region);
            onlineMetaRegions.put(region.getStartKey(), region);
          }
          break;
        } catch (IOException e) {
          e = RemoteExceptionHandler.checkIOException(e);
          tries += 1;
          if (tries == 1) {
            LOG.warn("Scan one META region: " + region.toString(), e);
          } else {
            LOG.error("Scan one META region: " + region.toString(), e);
          }
          // The region may have moved (TestRegionServerAbort, etc.).  If
          // so, either it won't be in the onlineMetaRegions list or its host
          // address has changed and the containsValue will fail. If not
          // found, best thing to do here is probably break.
          if (!onlineMetaRegions.containsValue(region)) {
            LOG.debug("Scanned region is no longer in map of online " +
              "regions or its value has changed");
            break;
          }
          if (tries == numRetries - 1) {
            // We ran out of tries. Make sure the file system is still
            // available
            if (!checkFileSystem()) {
              continue; // avoid sleeping
            }
          }
        } catch (Exception e) {
          // If for some reason we get some other kind of exception,
          // at least log it rather than go out silently.
          LOG.error("Unexpected exception", e);
        }
        // Sleep before going around again.
        sleeper.sleep();
      }
    }

    @Override
    protected void initialScan() {
      MetaRegion region = null;
      while (!closed.get() && region == null && !metaRegionsScanned()) {
        try {
          region =
            metaRegionsToScan.poll(threadWakeFrequency, TimeUnit.MILLISECONDS);
        } catch (InterruptedException e) {
          // continue
        }

        if (region != null) {
          scanOneMetaRegion(region);
        }
      }
      initialMetaScanComplete = true;
    }

    @Override
    protected void maintenanceScan() {
      ArrayList<MetaRegion> regions = new ArrayList<MetaRegion>();
      regions.addAll(onlineMetaRegions.values());
      for (MetaRegion r: regions) {
        scanOneMetaRegion(r);
      }
      metaRegionsScanned();
    }

    /**
     * Called by the meta scanner when it has completed scanning all meta
     * regions. This wakes up any threads that were waiting for this to happen.
     */
    private synchronized boolean metaRegionsScanned() {
      if (!rootScanned ||
          numberOfMetaRegions.get() != onlineMetaRegions.size()) {
        return false;
      }
      LOG.info("all meta regions scanned");
      notifyAll();
      return true;
    }

    /**
     * Other threads call this method to wait until all the meta regions have
     * been scanned.
     */
    synchronized boolean waitForMetaRegionsOrClose() {
      while (!closed.get()) {
        if (rootScanned &&
            numberOfMetaRegions.get() == onlineMetaRegions.size()) {
          break;
        }

        try {
          wait(threadWakeFrequency);
        } catch (InterruptedException e) {
          // continue
        }
      }
      return closed.get();
    }
  }

  /** The META scanner chore; instantiated by the constructor. */
  MetaScanner metaScannerThread;
  /** Monitor the META scanner holds for the duration of a scan of one META region. */
  Integer metaScannerLock = new Integer(0);

  /**
   * The 'unassignedRegions' table maps from a region name to a HRegionInfo
   * record, which includes the region's table, its id, and its start/end keys.
   *
   * We fill 'unassignedRegions' by scanning ROOT and META tables, learning the
   * set of all known valid regions.
   *
   * <p>Items are removed from this list when a region server reports in that
   * the region has been deployed.
   */
  final SortedMap<Text, HRegionInfo> unassignedRegions =
    Collections.synchronizedSortedMap(new TreeMap<Text, HRegionInfo>());

  /**
   * The 'assignAttempts' table maps from regions to a timestamp that indicates
   * the last time we *tried* to assign the region to a RegionServer. If the
   * timestamp is out of date, then we can try to reassign it.  A region that
   * has just been queued for assignment is entered with a timestamp of zero.
   */
  final Map<Text, Long> assignAttempts =
    Collections.synchronizedMap(new HashMap<Text, Long>());

  /**
   * Regions that have been assigned, and the server has reported that it has
   * started serving it, but that we have not yet recorded in the meta table.
   */
  Set<Text> pendingRegions;

  /**
   * The 'killList' is a list of regions that are going to be closed, but not
   * reopened.  Keyed by server name; each value maps region name to the
   * region's HRegionInfo.
   */
  Map<String, HashMap<Text, HRegionInfo>> killList;

  /** 'killedRegions' contains regions that are in the process of being closed */
  Set<Text> killedRegions;

  /**
   * 'regionsToDelete' contains regions that need to be deleted, but cannot be
   * until the region server closes it
   */
  Set<Text> regionsToDelete;

  /**
   * The map of known server names to server info
   *
   * Access to this map and loadToServers and serversToLoad must be synchronized
   * on this object
   */
  final Map<String, HServerInfo> serversToServerInfo =
    new HashMap<String, HServerInfo>();

  /** SortedMap server load -> Set of server names */
  SortedMap<HServerLoad, Set<String>> loadToServers;

  /** Map of server names -> server load */
  Map<String, HServerLoad> serversToLoad;

  /**
   * Build the HMaster out of a raw configuration item.
   *
   * <p>Delegates to {@link #HMaster(Path, HServerAddress, Configuration)},
   * reading the base directory from <code>HBASE_DIR</code> (falling back to
   * <code>DEFAULT_HBASE_DIR</code>) and the master address from
   * <code>MASTER_ADDRESS</code> (falling back to
   * <code>DEFAULT_MASTER_ADDRESS</code>).</p>
   *
   * @param conf - Configuration object
   * @throws IOException
   */
  public HMaster(Configuration conf) throws IOException {
    this(new Path(conf.get(HBASE_DIR, DEFAULT_HBASE_DIR)),
        new HServerAddress(conf.get(MASTER_ADDRESS, DEFAULT_MASTER_ADDRESS)),
        conf);
  }

  /**
   * Build the HMaster.
   *
   * <p>Creates the base directory if it is missing and, when the ROOT region
   * directory does not exist yet, bootstraps the ROOT region and the first
   * META region.  Then reads the tuning parameters from the configuration,
   * creates the RPC server, the connection and the ROOT and META scanners,
   * marks the ROOT region as unassigned and finally clears the closed flag.
   * No call that starts the RPC server or the scanners is made here.</p>
   *
   * @param dir base directory
   * @param address server address and port number
   * @param conf configuration.  <code>MASTER_ADDRESS</code> is overwritten in
   * it with the address the RPC server actually listens on.
   *
   * @throws IOException if the file system cannot be reached or the
   * bootstrap of the ROOT and first META region fails
   */
  public HMaster(Path dir, HServerAddress address, Configuration conf)
  throws IOException {
    this.fsOk = true;
    this.dir = dir;
    this.conf = conf;
    this.fs = FileSystem.get(conf);
    this.rand = new Random();

    Path rootRegionDir =
      HRegion.getRegionDir(dir, HGlobals.rootRegionInfo.regionName);
    LOG.info("Root region dir: " + rootRegionDir.toString());

    try {
      // Make sure the root directory exists!
      if(! fs.exists(dir)) {
        fs.mkdirs(dir);
      }

      // A missing ROOT region directory is taken to mean a fresh install.
      if (!fs.exists(rootRegionDir)) {
        LOG.info("bootstrap: creating ROOT and first META regions");
        try {
          HRegion root = HRegion.createHRegion(HGlobals.rootRegionInfo, this.dir,
            this.conf, null);
          HRegion meta = HRegion.createHRegion(new HRegionInfo(1L,
            HGlobals.metaTableDesc, null, null), this.dir, this.conf, null);

          // Add first region from the META table to the ROOT region.
          HRegion.addRegionToMETA(root, meta);
          // Close both regions and delete their logs.
          // NOTE(review): if anything above throws, root (and meta) are left
          // unclosed; construction fails in that case anyway.
          root.close();
          root.getLog().closeAndDelete();
          meta.close();
          meta.getLog().closeAndDelete();
        } catch (IOException e) {
          e = RemoteExceptionHandler.checkIOException(e);
          LOG.error("bootstrap", e);
          throw e;
        }
      }
    } catch (IOException e) {
      LOG.fatal("Not starting HMaster because:", e);
      throw e;
    }

    // Tuning parameters, each with its default.
    this.threadWakeFrequency = conf.getInt(THREAD_WAKE_FREQUENCY, 10 * 1000);
    this.numRetries =  conf.getInt("hbase.client.retries.number", 2);
    this.maxRegionOpenTime =
      conf.getLong("hbase.hbasemaster.maxregionopen", 30 * 1000);

    this.shutdownQueue = new DelayQueue<PendingServerShutdown>();
    this.msgQueue = new LinkedBlockingQueue<PendingOperation>();

    this.leaseTimeout = conf.getInt("hbase.master.lease.period", 30 * 1000);
    this.serverLeases = new Leases(this.leaseTimeout,
        conf.getInt("hbase.master.lease.thread.wakefrequency", 15 * 1000));

    // RPC server with this master as the instance that serves the calls.
    this.server = RPC.getServer(this, address.getBindAddress(),
        address.getPort(), conf.getInt("hbase.regionserver.handler.count", 10),
        false, conf);

    //  The rpc-server port can be ephemeral... ensure we have the correct info
    this.address = new HServerAddress(server.getListenerAddress());
    conf.set(MASTER_ADDRESS, address.toString());

    this.connection = HConnectionManager.getConnection(conf);

    // Must be read before the scanners are constructed: both take it as
    // their period.
    this.metaRescanInterval =
      conf.getInt("hbase.master.meta.thread.rescanfrequency", 60 * 1000);

    // The root region
    this.rootScanned = false;
    this.rootScannerThread = new RootScanner();

    // Scans the meta table
    this.initialMetaScanComplete = false;

    this.metaScannerThread = new MetaScanner();

    // Only touches fields that are initialized where they are declared, so
    // it is safe to call before the collections below exist.
    unassignRootRegion();

    this.pendingRegions =
      Collections.synchronizedSet(new HashSet<Text>());

    this.killList =
      Collections.synchronizedMap(
          new HashMap<String, HashMap<Text, HRegionInfo>>());

    this.killedRegions =
      Collections.synchronizedSet(new HashSet<Text>());

    this.regionsToDelete =
      Collections.synchronizedSet(new HashSet<Text>());

    this.loadToServers = new TreeMap<HServerLoad, Set<String>>();
    this.serversToLoad = new HashMap<String, HServerLoad>();

    this.sleeper = new Sleeper(this.threadWakeFrequency, this.closed);

    // We're almost open for business
    this.closed.set(false);
    LOG.info("HMaster initialized on " + this.address.toString());
  }
 
  /*
   * Unassign the root region.
   * This method would be used in case where root region server had died
   * without reporting in.  Currently, we just flounder and never recover.  We
   * could 'notice' dead region server in root scanner -- if we failed access
   * multiple times -- but reassigning root is catastrophic.
   */
  void unassignRootRegion() {
    this.rootRegionLocation.set(null);
    this.unassignedRegions.put(HGlobals.rootRegionInfo.regionName,
        HGlobals.rootRegionInfo);
    this.assignAttempts.put(HGlobals.rootRegionInfo.regionName,
        Long.valueOf(0L));
    // TODO: If the old root region server had a log, it needs splitting.
  }

  /**
   * Checks to see if the file system is still accessible.
   * If not, sets closed
   * @return false if file system is not available
   */
  protected boolean checkFileSystem() {
    if (fsOk) {
      if (!FSUtils.isFileSystemAvailable(fs)) {
        LOG.fatal("Shutting down HBase cluster: file system not available");
        closed.set(true);
        fsOk = false;
      }
    }
    return fsOk;
  }

  /** @return HServerAddress of the master server */
  public HServerAddress getMasterAddress() {
    return address;
  }
 
  /**
   * @return Hbase root dir.
   */
  public Path getRootDir() {
    return this.dir;
  }

  /**
   * @return Read-only map of servers to serverinfo.
   */
  public Map<String, HServerInfo> getServersToServerInfo() {
    return Collections.unmodifiableMap(this.serversToServerInfo);
  }

  /**
   * @return Read-only map of servers to load.
   */
  public Map<String, HServerLoad> getServersToLoad() {
    return Collections.unmodifiableMap(this.serversToLoad);
  }

  /**
   * @return Location of the <code>-ROOT-</code> region.
   */
  public HServerAddress getRootRegionLocation() {
    return this.rootRegionLocation.get();
  }
 
  /**
   * @return Read-only map of online regions.
   */
  public Map<Text, MetaRegion> getOnlineMetaRegions() {
    return Collections.unmodifiableSortedMap(this.onlineMetaRegions);
  }

  /** Main processing loop */
  @Override
  public void run() {
    final String threadName = "HMaster";
    Thread.currentThread().setName(threadName);
    startServiceThreads();
    /*
     * Main processing loop
     */
    try {
      for (PendingOperation op = null; !closed.get(); ) {
        op = this.shutdownQueue.poll();
        if (op == null ) {
          try {
            op = msgQueue.poll(threadWakeFrequency, TimeUnit.MILLISECONDS);
          } catch (InterruptedException e) {
            // continue
          }
        }
        if (op == null || closed.get()) {
          continue;
        }
        try {
          if (LOG.isDebugEnabled()) {
            LOG.debug("Main processing loop: " + op.toString());
          }

          if (!op.process()) {
            // Operation would have blocked because not all meta regions are
            // online. This could cause a deadlock, because this thread is waiting
            // for the missing meta region(s) to come back online, but since it
            // is waiting, it cannot process the meta region online operation it
            // is waiting for. So put this operation back on the queue for now.
            if (msgQueue.size() == 0) {
              // The queue is currently empty so wait for a while to see if what
              // we need comes in first
              sleeper.sleep();
            }
            try {
              if (LOG.isDebugEnabled()) {
                LOG.debug("Put " + op.toString() + " back on queue");
              }
              msgQueue.put(op);
            } catch (InterruptedException e) {
              throw new RuntimeException("Putting into msgQueue was interrupted.", e);
            }
          }
        } catch (Exception ex) {
          if (ex instanceof RemoteException) {
            try {
              ex = RemoteExceptionHandler.decodeRemoteException(
                  (RemoteException)ex);
            } catch (IOException e) {
              ex = e;
              LOG.warn("main processing loop: " + op.toString(), e);
            }
          }
          if (!checkFileSystem()) {
            break;
          }
          LOG.warn("Processing pending operations: " + op.toString(), ex);
          try {
            msgQueue.put(op);
          } catch (InterruptedException e) {
            throw new RuntimeException("Putting into msgQueue was interrupted.", e);
          }
        }
      }
    } catch (Throwable t) {
      LOG.fatal("Unhandled exception", t);
    }
    letRegionServersShutdown();

    /*
     * Clean up and close up shop
     */
    synchronized(rootScannerLock) {
      rootScannerThread.interrupt();    // Wake root scanner
    }
    synchronized(metaScannerLock) {
      metaScannerThread.interrupt();    // Wake meta scanner
    }
    if (this.infoServer != null) {
      LOG.info("Stopping infoServer");
      try {
        this.infoServer.stop();
      } catch (InterruptedException ex) {
        ex.printStackTrace();
      }
    }
    server.stop();                      // Stop server
    serverLeases.close();               // Turn off the lease monitor

    // Join up with all threads
    try {
      rootScannerThread.join();         // Wait for the root scanner to finish.
    } catch (Exception iex) {
      LOG.warn("root scanner", iex);
    }
    try {
      metaScannerThread.join();         // Wait for meta scanner to finish.
    } catch(Exception iex) {
      LOG.warn("meta scanner", iex);
    }
    LOG.info("HMaster main thread exiting");
  }
 
  /*
   * Start up all services. If any of these threads gets an unhandled exception
   * then they just die with a logged message.  This should be fine because
   * in general, we do not expect the master to get such unhandled exceptions
   *  as OOMEs; it should be lightly loaded. See what HRegionServer does if
   *  need to install an unexpected exception handler.
   */
  private void startServiceThreads() {
    String threadName = Thread.currentThread().getName();
    try {
      Threads.setDaemonThreadRunning(this.rootScannerThread,
        threadName + ".rootScanner");
      Threads.setDaemonThreadRunning(this.metaScannerThread,
        threadName + ".metaScanner");
      // Leases are not the same as Chore threads. Set name differently.
      this.serverLeases.setName(threadName + ".leaseChecker");
      this.serverLeases.start();
      // Put up info server.
      int port = this.conf.getInt("hbase.master.info.port", 60010);
      if (port >= 0) {
        String a = this.conf.get("hbase.master.info.bindAddress", "0.0.0.0");
        this.infoServer = new InfoServer(MASTER, a, port, false);
        this.infoServer.setAttribute(MASTER, this);
        this.infoServer.start();
      }
      // Start the server so everything else is running before we start
      // receiving requests.
      this.server.start();
    } catch (IOException e) {
      if (e instanceof RemoteException) {
        try {
          e = RemoteExceptionHandler.decodeRemoteException((RemoteException) e);
        } catch (IOException ex) {
          LOG.warn("thread start", ex);
        }
      }
      // Something happened during startup. Shut things down.
      this.closed.set(true);
      LOG.error("Failed startup", e);
    }
  }

  /*
   * Wait on regionservers to report in
   * with {@link #regionServerReport(HServerInfo, HMsg[])} so they get notice
   * the master is going down.  Waits until all region servers come back with
   * a MSG_REGIONSERVER_STOP which will cancel their lease or until leases held
   * by remote region servers have expired.
   */
  private void letRegionServersShutdown() {
    if (!fsOk) {
      // Forget waiting for the region servers if the file system has gone
      // away. Just exit as quickly as possible.
      return;
    }
    synchronized (serversToServerInfo) {
      while (this.serversToServerInfo.size() > 0) {
        LOG.info("Waiting on following regionserver(s) to go down (or " +
            "region server lease expiration, whichever happens first): " +
            this.serversToServerInfo.values());
        try {
          serversToServerInfo.wait(threadWakeFrequency);
        } catch (InterruptedException e) {
          // continue
        }
      }
    }
  }

  /*
   * HMasterRegionInterface
   */

  /** {@inheritDoc} */
  @SuppressWarnings("unused")
  public MapWritable regionServerStartup(HServerInfo serverInfo)
  throws IOException {
    String s = serverInfo.getServerAddress().toString().trim();
    HServerInfo storedInfo = null;
    LOG.info("received start message from: " + s);

    // If we get the startup message but there's an old server by that
    // name, then we can timeout the old one right away and register
    // the new one.
    synchronized (serversToServerInfo) {
      storedInfo = serversToServerInfo.remove(s);
      HServerLoad load = serversToLoad.remove(s);
      if (load != null) {
        Set<String> servers = loadToServers.get(load);
        if (servers != null) {
          servers.remove(s);
          loadToServers.put(load, servers);
        }
      }
      serversToServerInfo.notifyAll();
    }
    if (storedInfo != null && !closed.get()) {
      shutdownQueue.put(new PendingServerShutdown(storedInfo));
    }

    // Either way, record the new server
    synchronized (serversToServerInfo) {
      HServerLoad load = new HServerLoad();
      serverInfo.setLoad(load);
      serversToServerInfo.put(s, serverInfo);
      serversToLoad.put(s, load);
      Set<String> servers = loadToServers.get(load);
      if (servers == null) {
        servers = new HashSet<String>();
      }
      servers.add(s);
      loadToServers.put(load, servers);
    }

    if (!closed.get()) {
      long serverLabel = getServerLabel(s);
      serverLeases.createLease(serverLabel, serverLabel, new ServerExpirer(s));
    }
   
    return createConfigurationSubset();
  }
 
  /**
   * @return Subset of configuration to pass initializing regionservers: e.g.
   * the filesystem to use and root directory to use.
   */
  protected MapWritable createConfigurationSubset() {
    MapWritable mw = addConfig(new MapWritable(), HConstants.HBASE_DIR);
    return addConfig(mw, "fs.default.name");
  }

  private MapWritable addConfig(final MapWritable mw, final String key) {
    mw.put(new Text(key), new Text(this.conf.get(key)));
    return mw;
  }

  private long getServerLabel(final String s) {
    return s.hashCode();
  }

  /** {@inheritDoc} */
  public HMsg[] regionServerReport(HServerInfo serverInfo, HMsg msgs[])
  throws IOException {
   
    String serverName = serverInfo.getServerAddress().toString().trim();
    long serverLabel = getServerLabel(serverName);

    if (msgs.length > 0 && msgs[0].getMsg() == HMsg.MSG_REPORT_EXITING) {

      // HRegionServer is shutting down. Cancel the server's lease.
      // Note that cancelling the server's lease takes care of updating
      // serversToServerInfo, etc.

      if (LOG.isDebugEnabled()) {
        LOG.debug("Region server " + serverName +
            ": MSG_REPORT_EXITING -- cancelling lease");
      }
     
      if (cancelLease(serverName, serverLabel)) {
        // Only process the exit message if the server still has a lease.
        // Otherwise we could end up processing the server exit twice.

        LOG.info("Region server " + serverName +
            ": MSG_REPORT_EXITING -- lease cancelled");
        // Get all the regions the server was serving reassigned
        // (if we are not shutting down).
        if (!closed.get()) {
          for (int i = 1; i < msgs.length; i++) {
            HRegionInfo info = msgs[i].getRegionInfo();
            if (info.tableDesc.getName().equals(ROOT_TABLE_NAME)) {
              rootRegionLocation.set(null);
            } else if (info.tableDesc.getName().equals(META_TABLE_NAME)) {
              onlineMetaRegions.remove(info.getStartKey());
            }

            this.unassignedRegions.put(info.regionName, info);
            this.assignAttempts.put(info.regionName, Long.valueOf(0L));
          }
        }
      }

      // We don't need to return anything to the server because it isn't
      // going to do any more work.
      return new HMsg[0];
    }

    if (closed.get()) {
      // Tell server to shut down if we are shutting down.  This should
      // happen after check of MSG_REPORT_EXITING above, since region server
      // will send us one of these messages after it gets MSG_REGIONSERVER_STOP
   
      return new HMsg[]{new HMsg(HMsg.MSG_REGIONSERVER_STOP)};
    }

    HServerInfo storedInfo;
    synchronized (serversToServerInfo) {
      storedInfo = serversToServerInfo.get(serverName);
    }
    if (storedInfo == null) {
      if (LOG.isDebugEnabled()) {
        LOG.debug("received server report from unknown server: " + serverName);
      }

      // The HBaseMaster may have been restarted.
      // Tell the RegionServer to start over and call regionServerStartup()

      return new HMsg[]{new HMsg(HMsg.MSG_CALL_SERVER_STARTUP)};

    } else if (storedInfo.getStartCode() != serverInfo.getStartCode()) {

      // This state is reachable if:
      //
      // 1) RegionServer A started
      // 2) RegionServer B started on the same machine, then
      //    clobbered A in regionServerStartup.
      // 3) RegionServer A returns, expecting to work as usual.
      //
      // The answer is to ask A to shut down for good.

      if (LOG.isDebugEnabled()) {
        LOG.debug("region server race condition detected: " + serverName);
      }

      cancelLease(serverName, serverLabel);
      return new HMsg[]{new HMsg(HMsg.MSG_REGIONSERVER_STOP)};

    } else {

      // All's well.  Renew the server's lease.
      // This will always succeed; otherwise, the fetch of serversToServerInfo
      // would have failed above.

      serverLeases.renewLease(serverLabel, serverLabel);

      // Refresh the info object and the load information

      synchronized (serversToServerInfo) {
        serversToServerInfo.put(serverName, serverInfo);

        HServerLoad load = serversToLoad.get(serverName);
        if (load != null && !load.equals(serverInfo.getLoad())) {
          // We have previous information about the load on this server
          // and the load on this server has changed

          Set<String> servers = loadToServers.get(load);

          // Note that servers should never be null because loadToServers
          // and serversToLoad are manipulated in pairs

          servers.remove(serverName);
          loadToServers.put(load, servers);
        }

        // Set the current load information

        load = serverInfo.getLoad();
        serversToLoad.put(serverName, load);
        Set<String> servers = loadToServers.get(load);
        if (servers == null) {
          servers = new HashSet<String>();
        }
        servers.add(serverName);
        loadToServers.put(load, servers);
      }

      // Next, process messages for this server
      return processMsgs(serverInfo, msgs);
    }
  }

  /** Cancel a server's lease and update its load information */
  private boolean cancelLease(final String serverName, final long serverLabel) {
    boolean leaseCancelled = false;
    synchronized (serversToServerInfo) {
      HServerInfo info = serversToServerInfo.remove(serverName);
      if (info != null) {
        // Only cancel lease and update load information once.
        // This method can be called a couple of times during shutdown.
        LOG.info("Cancelling lease for " + serverName);
        serverLeases.cancelLease(serverLabel, serverLabel);
        leaseCancelled = true;

        // update load information
        HServerLoad load = serversToLoad.remove(serverName);
        if (load != null) {
          Set<String> servers = loadToServers.get(load);
          if (servers != null) {
            servers.remove(serverName);
            loadToServers.put(load, servers);
          }
        }
      }
      serversToServerInfo.notifyAll();
    }
    return leaseCancelled;
  }

  /**
   * Process all the incoming messages from a server that's contacted us.
   *
   * Note that we never need to update the server's load information because
   * that has already been done in regionServerReport.
   *
   * @param info the reporting region server
   * @param incomingMsgs messages the region server sent with its report
   * @return instructions for the region server: close requests for regions
   * it should not have opened and for regions on its kill list, plus
   * whatever assignRegions adds
   * @throws IOException if a message carries a code other than
   * MSG_REPORT_OPEN, MSG_REPORT_CLOSE or MSG_REPORT_SPLIT
   */
  private HMsg[] processMsgs(HServerInfo info, HMsg incomingMsgs[])
  throws IOException {
   
    ArrayList<HMsg> returnMsgs = new ArrayList<HMsg>();
    String serverName = info.getServerAddress().toString();
    // Taking the kill list entry removes it, so it is handled only once.
    HashMap<Text, HRegionInfo> regionsToKill = killList.remove(serverName);

    // Get reports on what the RegionServer did.

    for (int i = 0; i < incomingMsgs.length; i++) {
      HRegionInfo region = incomingMsgs[i].getRegionInfo();

      switch (incomingMsgs[i].getMsg()) {

      case HMsg.MSG_REPORT_OPEN:
        HRegionInfo regionInfo = unassignedRegions.get(region.regionName);

        if (regionInfo == null) {

          if (LOG.isDebugEnabled()) {
            LOG.debug("region server " + info.getServerAddress().toString()
                + " should not have opened region " + region.regionName);
          }

          // This Region should not have been opened.
          // Ask the server to shut it down, but don't report it as closed. 
          // Otherwise the HMaster will think the Region was closed on purpose,
          // and then try to reopen it elsewhere; that's not what we want.

          returnMsgs.add(new HMsg(HMsg.MSG_REGION_CLOSE_WITHOUT_REPORT, region));

        } else {
          LOG.info(info.getServerAddress().toString() + " serving " +
              region.regionName);
          // Remove from unassigned list so we don't assign it to someone else
          this.unassignedRegions.remove(region.regionName);
          this.assignAttempts.remove(region.regionName);
          if (region.regionName.compareTo(
              HGlobals.rootRegionInfo.regionName) == 0) {
            // Store the Root Region location (in memory)
            synchronized (rootRegionLocation) {
              this.rootRegionLocation.
                set(new HServerAddress(info.getServerAddress()));
              this.rootRegionLocation.notifyAll();
            }
            // Leaves the switch: for the root region the pending-region
            // bookkeeping and the queued open report below are skipped.
            break;
          }

          // Note that the table has been assigned and is waiting for the meta
          // table to be updated.

          pendingRegions.add(region.regionName);

          // Queue up an update to note the region location.

          try {
            msgQueue.put(new PendingOpenReport(info, region));
          } catch (InterruptedException e) {
            throw new RuntimeException("Putting into msgQueue was interrupted.", e);
          }
        }
        break;

      case HMsg.MSG_REPORT_CLOSE:
        LOG.info(info.getServerAddress().toString() + " no longer serving " +
            region.regionName);

        if (region.regionName.compareTo(
            HGlobals.rootRegionInfo.regionName) == 0) {
         
          // Root region: mark it unassigned directly, no close report is queued.
         
          rootRegionLocation.set(null);
          unassignedRegions.put(region.regionName, region);
          assignAttempts.put(region.regionName, Long.valueOf(0L));

        } else {
          boolean reassignRegion = true;
          boolean deleteRegion = false;

          // A region this master asked to be closed is not reassigned.
          if (killedRegions.remove(region.regionName)) {
            reassignRegion = false;
          }

          // A region marked for deletion is not reassigned and is flagged
          // for deletion in the close report.
          if (regionsToDelete.remove(region.regionName)) {
            reassignRegion = false;
            deleteRegion = true;
          }

          // NOTE: we cannot put the region into unassignedRegions as that
          //       could create a race with the pending close if it gets
          //       reassigned before the close is processed.

          unassignedRegions.remove(region.regionName);
          assignAttempts.remove(region.regionName);

          try {
            msgQueue.put(new PendingCloseReport(region, reassignRegion,
                deleteRegion));
           
          } catch (InterruptedException e) {
            throw new RuntimeException("Putting into msgQueue was interrupted.", e);
          }
        }
        break;

      case HMsg.MSG_REPORT_SPLIT:
        // A region has split.  The two messages that follow the split
        // message are read as the new regions; each ++i consumes one so the
        // outer loop does not see them again.
        // NOTE(review): there is no bounds check here, so a split message
        // without two followers would raise ArrayIndexOutOfBoundsException.
       
        HRegionInfo newRegionA = incomingMsgs[++i].getRegionInfo();
        unassignedRegions.put(newRegionA.getRegionName(), newRegionA);
        assignAttempts.put(newRegionA.getRegionName(), Long.valueOf(0L));

        HRegionInfo newRegionB = incomingMsgs[++i].getRegionInfo();
        unassignedRegions.put(newRegionB.getRegionName(), newRegionB);
        assignAttempts.put(newRegionB.getRegionName(), Long.valueOf(0L));

        LOG.info("region " + region.regionName + " split. New regions are: "
            + newRegionA.regionName + ", " + newRegionB.regionName);

        if (region.tableDesc.getName().equals(META_TABLE_NAME)) {
          // A meta region has split.

          onlineMetaRegions.remove(region.getStartKey());
          numberOfMetaRegions.incrementAndGet();
        }
        break;

      default:
        throw new IOException(
            "Impossible state during msg processing.  Instruction: " +
            incomingMsgs[i].getMsg());
      }
    }

    // Process the kill list: ask the server to close each listed region and
    // remember the request so the matching close report is not reassigned.

    if (regionsToKill != null) {
      for (HRegionInfo i: regionsToKill.values()) {
        returnMsgs.add(new HMsg(HMsg.MSG_REGION_CLOSE, i));
        killedRegions.add(i.regionName);
      }
    }

    // Figure out what the RegionServer ought to do, and write back.
    assignRegions(info, serverName, returnMsgs);
    return returnMsgs.toArray(new HMsg[returnMsgs.size()]);
  }
 
  /*
   * Assigns regions to region servers attempting to balance the load across
   * all region servers.  Regions are taken from getRegionsToAssign(); each
   * assignment appends a MSG_REGION_OPEN message to returnMsgs and stamps the
   * region's entry in assignAttempts with the current time.
   *
   * @param info the region server being considered; its load drives the
   * balancing decision
   * @param serverName name of that region server, used in log messages
   * @param returnMsgs list that receives the open instructions
   */
  private synchronized void assignRegions(HServerInfo info, String serverName,
      ArrayList<HMsg> returnMsgs) {
   
    TreeSet<Text> regionsToAssign = getRegionsToAssign();
    int nRegionsToAssign = regionsToAssign.size();
    if (nRegionsToAssign <= 0) {
      // No regions to assign.  Return.
      return;
    }
   
    if (this.serversToServerInfo.size() == 1) {
      assignRegionsToOneServer(regionsToAssign, serverName, returnMsgs);
      // Finished.  Return.
      return;
    }

    // Multiple servers in play.
    // We need to allocate regions only to most lightly loaded servers.
    HServerLoad thisServersLoad = info.getLoad();
    synchronized (this.serversToServerInfo) {
      // Set aside the regions that regionsPerServer attributes to more
      // lightly loaded servers; only the remainder is considered here.
      int nregions = regionsPerServer(nRegionsToAssign, thisServersLoad);
      nRegionsToAssign -= nregions;
      if (nRegionsToAssign > 0) {
        // We still have more regions to assign. See how many we can assign
        // before this server becomes more heavily loaded than the next
        // most heavily loaded server.
        SortedMap<HServerLoad, Set<String>> heavyServers =
          this.loadToServers.tailMap(thisServersLoad);
        int nservers = 0;
        HServerLoad heavierLoad = null;
        for (Map.Entry<HServerLoad, Set<String>> e : heavyServers.entrySet()) {
          Set<String> servers = e.getValue();
          nservers += servers.size();
          if (e.getKey().compareTo(thisServersLoad) == 0) {
            // This is the load factor of the server we are considering,
            // so do not count this server itself.
            nservers -= 1;
            continue;
          }

          // If we get here, we are at the first load entry that is a
          // heavier load than the server we are considering
          heavierLoad = e.getKey();
          break;
        }

        nregions = 0;
        if (heavierLoad != null) {
          // There is a more heavily loaded server.  The loop body is empty:
          // it only counts, in nregions, how many regions can be added to a
          // copy of this server's load while that copy still compares at or
          // below heavierLoad, capped at nRegionsToAssign.
          for (HServerLoad load =
            new HServerLoad(thisServersLoad.getNumberOfRequests(),
              thisServersLoad.getNumberOfRegions());
            load.compareTo(heavierLoad) <= 0 &&
              nregions < nRegionsToAssign;
            load.setNumberOfRegions(load.getNumberOfRegions() + 1),
              nregions++) {
            // continue;
          }
        }

        if (nregions < nRegionsToAssign) {
          // There are some more heavily loaded servers
          // but we can't assign all the regions to this server.
          if (nservers > 0) {
            // There are other servers that can share the load.
            // Split regions that need assignment across the servers.
            nregions = (int) Math.ceil((1.0 * nRegionsToAssign)
                / (1.0 * nservers));
          } else {
            // No other servers with same load.
            // Split regions over all available servers
            nregions = (int) Math.ceil((1.0 * nRegionsToAssign)
                / (1.0 * serversToServerInfo.size()));
          }
        } else {
          // Assign all regions to this server
          nregions = nRegionsToAssign;
        }

        // Hand out regions in the set's iteration order until nregions of
        // them have been assigned to this server.
        long now = System.currentTimeMillis();
        for (Text regionName: regionsToAssign) {
          HRegionInfo regionInfo = this.unassignedRegions.get(regionName);
          LOG.info("assigning region " + regionName + " to server " +
            serverName);
          this.assignAttempts.put(regionName, Long.valueOf(now));
          returnMsgs.add(new HMsg(HMsg.MSG_REGION_OPEN, regionInfo));
          if (--nregions <= 0) {
            break;
          }
        }
      }
    }
  }
 
  /*
   * @param nRegionsToAssign
   * @param thisServersLoad
   * @return How many regions we can assign to more lightly loaded servers
   */
  private int regionsPerServer(final int nRegionsToAssign,
      final HServerLoad thisServersLoad) {
    SortedMap<HServerLoad, Set<String>> lightServers =
      this.loadToServers.headMap(thisServersLoad);

    int nRegions = 0;
    for (Map.Entry<HServerLoad, Set<String>> e : lightServers.entrySet()) {
      HServerLoad lightLoad = new HServerLoad(e.getKey()
          .getNumberOfRequests(), e.getKey().getNumberOfRegions());
      do {
        lightLoad.setNumberOfRegions(lightLoad.getNumberOfRegions() + 1);
        nRegions += 1;
      } while (lightLoad.compareTo(thisServersLoad) <= 0
          && nRegions < nRegionsToAssign);

      nRegions *= e.getValue().size();
      if (nRegions >= nRegionsToAssign) {
        break;
      }
    }
    return nRegions;
  }
 
  /*
   * Assign all to the only server. An unlikely case but still possible.
   * @param regionsToAssign
   * @param serverName
   * @param returnMsgs
   */
  private void assignRegionsToOneServer(final TreeSet<Text> regionsToAssign,
      final String serverName, final ArrayList<HMsg> returnMsgs) {
    long now = System.currentTimeMillis();
    for (Text regionName: regionsToAssign) {
      HRegionInfo regionInfo = this.unassignedRegions.get(regionName);
      LOG.info("assigning region " + regionName + " to the only server " +
        serverName);
      this.assignAttempts.put(regionName, Long.valueOf(now));
      returnMsgs.add(new HMsg(HMsg.MSG_REGION_OPEN, regionInfo));
    }
  }
 
  /*
   * @return List of regions to assign.
   */
  private TreeSet<Text> getRegionsToAssign() {
    long now = System.currentTimeMillis();
    TreeSet<Text> regionsToAssign = new TreeSet<Text>();
    for (Map.Entry<Text, Long> e: this.assignAttempts.entrySet()) {
      long diff = now - e.getValue().longValue();
      if (diff > this.maxRegionOpenTime) {
        regionsToAssign.add(e.getKey());
      }
    }
    return regionsToAssign;
  }

  /*
   * Some internal classes to manage msg-passing and client operations
   */

  private abstract class PendingOperation {
    PendingOperation() {
      super();
    }

    abstract boolean process() throws IOException;
  }

  /**
   * Instantiated when a server's lease has expired, meaning it has crashed.
   * The region server's log file needs to be split up for each region it was
   * serving, and the regions need to get reassigned.
   */
  private class PendingServerShutdown extends PendingOperation
  implements Delayed {
    private final long expire;
    private HServerAddress deadServer;
    private String deadServerName;
    private Path oldLogDir;
    private transient boolean logSplit;
    private transient boolean rootChecked;
    private transient boolean rootRescanned;

    private class ToDoEntry {
      boolean deleteRegion;
      boolean regionOffline;
      Text row;
      HRegionInfo info;

      ToDoEntry(Text row, HRegionInfo info) {
        this.deleteRegion = false;
        this.regionOffline = false;
        this.row = row;
        this.info = info;
      }
    }

    PendingServerShutdown(HServerInfo serverInfo) {
      super();
      this.deadServer = serverInfo.getServerAddress();
      this.deadServerName = this.deadServer.toString();
      this.logSplit = false;
      this.rootChecked = false;
      this.rootRescanned = false;
      StringBuilder dirName = new StringBuilder("log_");
      dirName.append(deadServer.getBindAddress());
      dirName.append("_");
      dirName.append(serverInfo.getStartCode());
      dirName.append("_");
      dirName.append(deadServer.getPort());
      this.oldLogDir = new Path(dir, dirName.toString());
      // Set the future time at which we expect to be released from the
      // DelayQueue we're inserted in on lease expiration.
      this.expire = System.currentTimeMillis() + leaseTimeout / 2;
    }

    /** {@inheritDoc} */
    public long getDelay(TimeUnit unit) {
      return unit.convert(this.expire - System.currentTimeMillis(),
        TimeUnit.MILLISECONDS);
    }
   
    /** {@inheritDoc} */
    public int compareTo(Delayed o) {
      return Long.valueOf(getDelay(TimeUnit.MILLISECONDS)
          - o.getDelay(TimeUnit.MILLISECONDS)).intValue();
    }
   
    /** {@inheritDoc} */
    @Override
    public String toString() {
      return "PendingServerShutdown of " + this.deadServer.toString();
    }

    /**
     * Finds regions that the dead region server was serving.
     *
     * Reads every row from an already-open scanner on one root/meta region.
     * Rows whose server column names the dead server (or is empty) are
     * collected; after the scan their server and start code columns are
     * deleted and the regions are either queued for reassignment or, when the
     * dead server had them on its kill list, marked offline or deleted.
     *
     * @param server connection to the region server hosting the scanned region
     * @param scannerId id of the open scanner; it is closed here unless it
     * is -1
     * @param regionName name of the root/meta region being scanned; the row
     * updates are sent to this region
     * @throws IOException if serializing a region info or sending a row
     * update fails. Errors while reading or closing the scanner are logged
     * and not rethrown.
     */
    private void scanMetaRegion(HRegionInterface server, long scannerId,
        Text regionName) throws IOException {

      // Rows to rewrite once the scan is finished.
      ArrayList<ToDoEntry> toDoList = new ArrayList<ToDoEntry>();
      // Regions to hand back for assignment once the scan is finished.
      TreeMap<Text, HRegionInfo> regions = new TreeMap<Text, HRegionInfo>();

      try {
        while (true) {
          MapWritable values = null;
          try {
            values = server.next(scannerId);
          } catch (IOException e) {
            // A failed read ends the scan; whatever was collected so far is
            // still processed below.
            LOG.error("Shutdown scanning of meta region",
              RemoteExceptionHandler.checkIOException(e));
            break;
          }

          // No more rows.
          if (values == null || values.size() == 0) {
            break;
          }

          // Re-key the returned cells by column name and remember the row.
          SortedMap<Text, byte[]> results = new TreeMap<Text, byte[]>();
          Text row = null;
          for (Map.Entry<Writable, Writable> e: values.entrySet()) {
            HStoreKey key = (HStoreKey) e.getKey();
            Text thisRow = key.getRow();
            if (row == null) {
              row = thisRow;
            } else {
              if (!row.equals(thisRow)) {
                // Only logged; the cell is still added to results below.
                LOG.error("Multiple rows in same scanner result set. firstRow="
                    + row + ", currentRow=" + thisRow);
              }
            }
            results.put(key.getColumn(),
                ((ImmutableBytesWritable) e.getValue()).get());
          }

          if (LOG.isDebugEnabled() && row != null) {
            LOG.debug("shutdown scanner looking at " + row.toString());
          }

          // Check server name.  If null, be conservative and treat as though
          // region had been on shutdown server (could be null because we
          // missed edits in hlog because hdfs does not do write-append).
          String serverName;
          try {
            serverName = Writables.bytesToString(results.get(COL_SERVER));
          } catch(UnsupportedEncodingException e) {
            LOG.error("Server name", e);
            break;
          }
          if (serverName.length() > 0 &&
              deadServerName.compareTo(serverName) != 0) {
            // This isn't the server you're looking for - move along
            if (LOG.isDebugEnabled()) {
              LOG.debug("Server name " + serverName + " is not same as " +
                  deadServerName + ": Passing");
            }
            continue;
          }

          // Bingo! Found it.
          HRegionInfo info = null;
          try {
            info = (HRegionInfo) Writables.getWritable(
                results.get(COL_REGIONINFO), new HRegionInfo());
          } catch (IOException e) {
            LOG.error("Read fields", e);
            break;
          }
          LOG.info(info.getRegionName() + " was on shutdown server <" +
              serverName + "> (or server is null). Marking unassigned if " +
          "meta and clearing pendingRegions");

          // A meta region that lived on the dead server is no longer online.
          if (info.tableDesc.getName().equals(META_TABLE_NAME)) {
            onlineMetaRegions.remove(info.getStartKey());
          }

          // Every matching row gets its server columns cleared after the scan.
          ToDoEntry todo = new ToDoEntry(row, info);
          toDoList.add(todo);

          if (killList.containsKey(deadServerName)) {
            HashMap<Text, HRegionInfo> regionsToKill =
              killList.get(deadServerName);

            // NOTE(review): when the dead server has a kill list but this
            // region is not on it, the region is neither flagged here nor
            // added to the reassignment map - confirm this is intended.
            if (regionsToKill.containsKey(info.regionName)) {
              // The region was scheduled to be closed on the dead server, so
              // take it out of the assignment bookkeeping instead of
              // reassigning it.
              regionsToKill.remove(info.regionName);
              killList.put(deadServerName, regionsToKill);
              unassignedRegions.remove(info.regionName);
              assignAttempts.remove(info.regionName);
              if (regionsToDelete.contains(info.regionName)) {
                // Delete this region
                regionsToDelete.remove(info.regionName);
                todo.deleteRegion = true;
              } else {
                // Mark region offline
                todo.regionOffline = true;
              }
            }

          } else {
            // Get region reassigned
            regions.put(info.regionName, info);

            // If it was pending, remove.
            // Otherwise will obstruct its getting reassigned.
            pendingRegions.remove(info.getRegionName());
          }
        }
      } finally {
        // Always release the scanner; a failure to close is only logged.
        if(scannerId != -1L) {
          try {
            server.close(scannerId);
          } catch (IOException e) {
            LOG.error("Closing scanner",
              RemoteExceptionHandler.checkIOException(e));
          }
        }
      }

      // Remove server from root/meta entries
      for (ToDoEntry e: toDoList) {
        BatchUpdate b = new BatchUpdate(rand.nextLong());
        long lockid = b.startUpdate(e.row);
        if (e.deleteRegion) {
          // Drop the region info so the row no longer describes a region.
          b.delete(lockid, COL_REGIONINFO);
        } else if (e.regionOffline) {
          // Rewrite the region info with the offline flag set.
          e.info.offLine = true;
          b.put(lockid, COL_REGIONINFO, Writables.getBytes(e.info));
        }
        b.delete(lockid, COL_SERVER);
        b.delete(lockid, COL_STARTCODE);
        server.batchUpdate(regionName, System.currentTimeMillis(), b);
      }

      // Get regions reassigned
      for (Map.Entry<Text, HRegionInfo> e: regions.entrySet()) {
        Text region = e.getKey();
        HRegionInfo regionInfo = e.getValue();
        unassignedRegions.put(region, regionInfo);
        assignAttempts.put(region, Long.valueOf(0L));
      }
    }

    /**
     * Processes the shutdown of a region server in up to four phases. The
     * first three are each guarded by a flag so they run once even if this
     * operation is requeued: split the old log, un-assign the root region if
     * the dead server held it, and rescan the root region. The last phase
     * scans every online meta region.
     *
     * @return false when the operation cannot make progress yet and should be
     * requeued (the split-log lock is held elsewhere, the root region is not
     * online and scanned, or not all meta regions are online); true when
     * processing is complete or the master is closed
     * @throws IOException if splitting the log fails, or if scanning still
     * fails on the last of numRetries attempts
     */
    @Override
    boolean process() throws IOException {
      LOG.info("process shutdown of server " + deadServer + ": logSplit: " +
          this.logSplit + ", rootChecked: " + this.rootChecked +
          ", rootRescanned: " + this.rootRescanned + ", numberOfMetaRegions: " +
          numberOfMetaRegions.get() + ", onlineMetaRegions.size(): " +
          onlineMetaRegions.size());

      if (!logSplit) {
        // Process the old log file
        if (fs.exists(oldLogDir)) {
          // Do not wait for the lock; ask to be requeued instead.
          if (!splitLogLock.tryLock()) {
            return false;
          }
          try {
            HLog.splitLog(dir, oldLogDir, fs, conf);
          } finally {
            splitLogLock.unlock();
          }
        }
        logSplit = true;
      }

      if (!rootChecked) {
        // If the dead server was serving the root region, forget its location
        // and queue the root region for assignment.
        if (rootRegionLocation.get() != null &&
            deadServer.equals(rootRegionLocation.get())) {

          rootRegionLocation.set(null);
          unassignedRegions.put(HGlobals.rootRegionInfo.regionName,
              HGlobals.rootRegionInfo);

          assignAttempts.put(HGlobals.rootRegionInfo.regionName,
              Long.valueOf(0L));
        }
        rootChecked = true;
      }

      if (!rootRescanned) {
        // Scan the ROOT region

        HRegionInterface server = null;
        long scannerId = -1L;
        for (int tries = 0; tries < numRetries; tries ++) {
          if (closed.get()) {
            return true;
          }
          if (rootRegionLocation.get() == null || !rootScanned) {
            // We can't proceed until the root region is online and has been scanned

            // NOTE(review): the message below names only the null-location
            // case, but this branch is also taken when rootScanned is false.
            if (LOG.isDebugEnabled()) {
              LOG.debug("process server shutdown scanning root region " +
              "cancelled because rootRegionLocation is null");
            }
            return false;
          }
          server = connection.getHRegionConnection(rootRegionLocation.get());
          scannerId = -1L;

          try {
            if (LOG.isDebugEnabled()) {
              LOG.debug("process server shutdown scanning root region on " +
                  rootRegionLocation.get().getBindAddress());
            }
            scannerId = server.openScanner(HGlobals.rootRegionInfo.regionName,
                COLUMN_FAMILY_ARRAY, EMPTY_START_ROW,
                System.currentTimeMillis(), null);

            // scanMetaRegion closes the scanner itself.
            scanMetaRegion(server, scannerId, HGlobals.rootRegionInfo.regionName);
            break;

          } catch (IOException e) {
            // Retry until the last attempt, then give up with the exception.
            if (tries == numRetries - 1) {
              throw RemoteExceptionHandler.checkIOException(e);
            }
          }
        }
        // NOTE(review): rootRegionLocation is read again here; if it can be
        // cleared concurrently this dereference could fail - confirm.
        if (LOG.isDebugEnabled()) {
          LOG.debug("process server shutdown scanning root region on " +
              rootRegionLocation.get().getBindAddress() + " finished " +
              Thread.currentThread().getName());
        }
        rootRescanned = true;
      }

      // Scan all online meta regions. A failure part way through restarts
      // the scan of all of them on the next attempt.
      for (int tries = 0; tries < numRetries; tries++) {
        try {
          if (closed.get()) {
            return true;
          }
          if (!rootScanned ||
              numberOfMetaRegions.get() != onlineMetaRegions.size()) {
            // We can't proceed because not all of the meta regions are online.
            // We can't block either because that would prevent the meta region
            // online message from being processed. So return false to have this
            // operation requeued.

            if (LOG.isDebugEnabled()) {
              LOG.debug("Requeuing shutdown because rootScanned: " +
                  rootScanned + ", numberOfMetaRegions: " +
                  numberOfMetaRegions.get() + ", onlineMetaRegions.size(): " +
                  onlineMetaRegions.size());
            }
            return false;
          }

          for (MetaRegion r: onlineMetaRegions.values()) {

            HRegionInterface server = null;
            long scannerId = -1L;

            if (LOG.isDebugEnabled()) {
              LOG.debug("process server shutdown scanning " +
                  r.getRegionName() + " on " + r.getServer() + " " +
                  Thread.currentThread().getName());
            }
            server = connection.getHRegionConnection(r.getServer());

            scannerId =
              server.openScanner(r.getRegionName(), COLUMN_FAMILY_ARRAY,
                  EMPTY_START_ROW, System.currentTimeMillis(), null);

            // scanMetaRegion closes the scanner itself.
            scanMetaRegion(server, scannerId, r.getRegionName());

            if (LOG.isDebugEnabled()) {
              LOG.debug("process server shutdown finished scanning " +
                  r.getRegionName() + " on " + r.getServer() + " " +
                  Thread.currentThread().getName());
            }
          }
          break;

        } catch (IOException e) {
          // Retry until the last attempt, then give up with the exception.
          if (tries == numRetries - 1) {
            throw RemoteExceptionHandler.checkIOException(e);
          }
        }
      }
      return true;
    }
  }

  /**
   * PendingCloseReport is instantiated when a region server reports that it
   * has closed a region.
   */
  private class PendingCloseReport extends PendingOperation {
    private HRegionInfo regionInfo;
    private boolean reassignRegion;
    private boolean deleteRegion;
    private boolean rootRegion;

    PendingCloseReport(HRegionInfo regionInfo, boolean reassignRegion,
        boolean deleteRegion) {

      super();

      this.regionInfo = regionInfo;
      this.reassignRegion = reassignRegion;
      this.deleteRegion = deleteRegion;

      // If the region closing down is a meta region then we need to update
      // the ROOT table

      if (this.regionInfo.tableDesc.getName().equals(META_TABLE_NAME)) {
        this.rootRegion = true;

      } else {
        this.rootRegion = false;
      }
    }

    /** {@inheritDoc} */
    @Override
    public String toString() {
      return "PendingCloseReport of " + this.regionInfo.getRegionName();
    }

    @Override
    boolean process() throws IOException {
      for (int tries = 0; tries < numRetries; tries++) {
        if (closed.get()) {
          return true;
        }
        LOG.info("region closed: " + regionInfo.regionName);

        // Mark the Region as unavailable in the appropriate meta table

        Text metaRegionName;
        HRegionInterface server;
        if (rootRegion) {
          if (rootRegionLocation.get() == null || !rootScanned) {
            // We can't proceed until the root region is online and has been
            // scanned
            return false;
          }
          metaRegionName = HGlobals.rootRegionInfo.regionName;
          server = connection.getHRegionConnection(rootRegionLocation.get());
          onlineMetaRegions.remove(regionInfo.getStartKey());

        } else {
          if (!rootScanned ||
              numberOfMetaRegions.get() != onlineMetaRegions.size()) {
           
            // We can't proceed because not all of the meta regions are online.
            // We can't block either because that would prevent the meta region
            // online message from being processed. So return false to have this
            // operation requeued.
           
            if (LOG.isDebugEnabled()) {
              LOG.debug("Requeuing close because rootScanned=" +
                  rootScanned + ", numberOfMetaRegions=" +
                  numberOfMetaRegions.get() + ", onlineMetaRegions.size()=" +
                  onlineMetaRegions.size());
            }
            return false;
          }

          MetaRegion r = null;
          if (onlineMetaRegions.containsKey(regionInfo.getRegionName())) {
            r = onlineMetaRegions.get(regionInfo.getRegionName());

          } else {
            r = onlineMetaRegions.get(onlineMetaRegions.headMap(
                regionInfo.getRegionName()).lastKey());
          }
          metaRegionName = r.getRegionName();
          server = connection.getHRegionConnection(r.getServer());
        }

        try {
          BatchUpdate b = new BatchUpdate(rand.nextLong());
          long lockid = b.startUpdate(regionInfo.regionName);

          if (deleteRegion) {
            b.delete(lockid, COL_REGIONINFO);

          } else if (!reassignRegion ) {
            regionInfo.offLine = true;
            b.put(lockid, COL_REGIONINFO, Writables.getBytes(regionInfo));
          }
          b.delete(lockid, COL_SERVER);
          b.delete(lockid, COL_STARTCODE);
          server.batchUpdate(metaRegionName, System.currentTimeMillis(), b);

          break;

        } catch (IOException e) {
          if (tries == numRetries - 1) {
            throw RemoteExceptionHandler.checkIOException(e);
          }
          continue;
        }
      }

      if (reassignRegion) {
        LOG.info("reassign region: " + regionInfo.regionName);

        unassignedRegions.put(regionInfo.regionName, regionInfo);
        assignAttempts.put(regionInfo.regionName, Long.valueOf(0L));

      } else if (deleteRegion) {
        try {
          HRegion.deleteRegion(fs, dir, regionInfo.regionName);
        } catch (IOException e) {
          e = RemoteExceptionHandler.checkIOException(e);
          LOG.error("failed delete region " + regionInfo.regionName, e);
          throw e;
        }
      }
      return true;
    }
  }

  /**
   * PendingOpenReport is instantiated when a region server reports that it is
   * serving a region. This applies to all meta and user regions except the
   * root region which is handled specially.
   */
  private class PendingOpenReport extends PendingOperation {
    private final boolean rootRegion;
    private final HRegionInfo region;
    private final HServerAddress serverAddress;
    private final byte [] startCode;

    PendingOpenReport(HServerInfo info, HRegionInfo region)
    throws IOException {
      // If true, the region which just came on-line is a META region.
      // We need to look in the ROOT region for its information.  Otherwise,
      // its just an ordinary region. Look for it in the META table.
      this.rootRegion = region.tableDesc.getName().equals(META_TABLE_NAME);
      this.region = region;
      this.serverAddress = info.getServerAddress();
      this.startCode = Writables.longToBytes(info.getStartCode());
    }

    @Override
    public String toString() {
      return "PendingOpenOperation from " + serverAddress.toString();
    }

    @Override
    boolean process() throws IOException {
      for (int tries = 0; tries < numRetries; tries++) {
        if (closed.get()) {
          return true;
        }
        LOG.info(region.toString() + " open on " +
            this.serverAddress.toString());

        // Register the newly-available Region's location.
        Text metaRegionName;
        HRegionInterface server;
        if (this.rootRegion) {
          if (rootRegionLocation.get() == null || !rootScanned) {
            // We can't proceed until root region is online and scanned
            if (LOG.isDebugEnabled()) {
              LOG.debug("root region: " +
                ((rootRegionLocation.get() != null)?
                  rootRegionLocation.get().toString(): "null") +
                ", rootScanned: " + rootScanned);
            }
            return false;
          }
          metaRegionName = HGlobals.rootRegionInfo.regionName;
          server = connection.getHRegionConnection(rootRegionLocation.get());
        } else {
          if (!rootScanned ||
              numberOfMetaRegions.get() != onlineMetaRegions.size()) {
            // We can't proceed because not all of the meta regions are online.
            // We can't block either because that would prevent the meta region
            // online message from being processed. So return false to have this
            // operation requeued.
            if (LOG.isDebugEnabled()) {
              LOG.debug("Requeuing open because rootScanned: " +
                  rootScanned + ", numberOfMetaRegions: " +
                  numberOfMetaRegions.get() + ", onlineMetaRegions.size(): " +
                  onlineMetaRegions.size());
            }
            return false;
          }

          MetaRegion r = onlineMetaRegions.containsKey(region.getRegionName())?
            onlineMetaRegions.get(region.getRegionName()):
            onlineMetaRegions.get(onlineMetaRegions.
              headMap(region.getRegionName()).lastKey());
          metaRegionName = r.getRegionName();
          server = connection.getHRegionConnection(r.getServer());
        }
       
        LOG.info("updating row " + region.getRegionName() + " in table " +
          metaRegionName + " with startcode " +
          Writables.bytesToLong(this.startCode) + " and server "+
          serverAddress.toString());
        try {
          BatchUpdate b = new BatchUpdate(rand.nextLong());
          long lockid = b.startUpdate(region.getRegionName());
          b.put(lockid, COL_SERVER,
            Writables.stringToBytes(serverAddress.toString()));
          b.put(lockid, COL_STARTCODE, startCode);
          server.batchUpdate(metaRegionName, System.currentTimeMillis(), b);
          if (region.tableDesc.getName().equals(META_TABLE_NAME)) {
            // It's a meta region.
            MetaRegion m = new MetaRegion(this.serverAddress,
              this.region.regionName, this.region.startKey);
            if (!initialMetaScanComplete) {
              // Put it on the queue to be scanned for the first time.
              try {
                LOG.debug("Adding " + m.toString() + " to regions to scan");
                metaRegionsToScan.put(m);
              } catch (InterruptedException e) {
                throw new RuntimeException(
                    "Putting into metaRegionsToScan was interrupted.", e);
              }
            } else {
              // Add it to the online meta regions
              LOG.debug("Adding to onlineMetaRegions: " + m.toString());
              onlineMetaRegions.put(this.region.startKey, m);
            }
          }
          // If updated successfully, remove from pending list.
          pendingRegions.remove(region.getRegionName());
          break;
        } catch (IOException e) {
          if (tries == numRetries - 1) {
            throw RemoteExceptionHandler.checkIOException(e);
          }
        }
      }
      return true;
    }
  }

  /*
   * HMasterInterface
   */

  /** {@inheritDoc} */
  public boolean isMasterRunning() {
    return !closed.get();
  }

  /** {@inheritDoc} */
  public void shutdown() {
    TimerTask tt = new TimerTask() {
      @Override
      public void run() {
        closed.set(true);
        synchronized(msgQueue) {
          msgQueue.clear();                         // Empty the queue
          msgQueue.notifyAll();                     // Wake main thread
        }
      }
    };
    Timer t = new Timer("Shutdown");
    t.schedule(tt, 10);
  }

  /** {@inheritDoc} */
  public void createTable(HTableDescriptor desc)
  throws IOException {
   
    if (!isMasterRunning()) {
      throw new MasterNotRunningException();
    }
    HRegionInfo newRegion = new HRegionInfo(rand.nextLong(), desc, null, null);

    for (int tries = 0; tries < numRetries; tries++) {
      try {
        // We can not access meta regions if they have not already been
        // assigned and scanned.  If we timeout waiting, just shutdown.
        if (this.metaScannerThread.waitForMetaRegionsOrClose()) {
          break;
        }
        createTable(newRegion);
        LOG.info("created table " + desc.getName());
        break;
     
      } catch (IOException e) {
        if (tries == numRetries - 1) {
          throw RemoteExceptionHandler.checkIOException(e);
        }
      }
    }
  }

  /* Set of tables currently in creation. Access needs to be synchronized. */
  private Set<Text> tableInCreation = new HashSet<Text>();

  private void createTable(final HRegionInfo newRegion) throws IOException {
    Text tableName = newRegion.tableDesc.getName();
    synchronized (tableInCreation) {
      if (tableInCreation.contains(tableName)) {
        throw new TableExistsException("Table " + tableName + " in process "
            + "of being created");
      }
      tableInCreation.add(tableName);
    }
    try {
      // 1. Check to see if table already exists. Get meta region where
      // table would sit should it exist. Open scanner on it. If a region
      // for the table we want to create already exists, then table already
      // created. Throw already-exists exception.
     
      MetaRegion m = (onlineMetaRegions.containsKey(newRegion.regionName) ?
          onlineMetaRegions.get(newRegion.regionName) :
            onlineMetaRegions.get(onlineMetaRegions.headMap(
                newRegion.getTableDesc().getName()).lastKey()));
         
      Text metaRegionName = m.getRegionName();
      HRegionInterface server = connection.getHRegionConnection(m.getServer());
      long scannerid = server.openScanner(metaRegionName, COL_REGIONINFO_ARRAY,
          tableName, System.currentTimeMillis(), null);
      try {
        MapWritable data = server.next(scannerid);
           
        // Test data and that the row for the data is for our table. If table
        // does not exist, scanner will return row after where our table would
        // be inserted if it exists so look for exact match on table name.
           
        if (data != null && data.size() > 0) {
          for (Writable k: data.keySet()) {
            if (HRegionInfo.getTableNameFromRegionName(
                ((HStoreKey) k).getRow()).equals(tableName)) {
         
              // Then a region for this table already exists. Ergo table exists.
                 
              throw new TableExistsException(tableName.toString());
            }
          }
        }
           
      } finally {
        server.close(scannerid);
      }

      // 2. Create the HRegion
         
      HRegion region =
        HRegion.createHRegion(newRegion, this.dir, this.conf, null);

      // 3. Insert into meta
         
      HRegionInfo info = region.getRegionInfo();
      Text regionName = region.getRegionName();
      BatchUpdate b = new BatchUpdate(rand.nextLong());
      long lockid = b.startUpdate(regionName);
      b.put(lockid, COL_REGIONINFO, Writables.getBytes(info));
      server.batchUpdate(metaRegionName, System.currentTimeMillis(), b);

      // 4. Close the new region to flush it to disk.  Close its log file too.
     
      region.close();
      region.getLog().closeAndDelete();

      // 5. Get it assigned to a server
     
      this.unassignedRegions.put(regionName, info);
      this.assignAttempts.put(regionName, Long.valueOf(0L));

    } finally {
      synchronized (tableInCreation) {
        tableInCreation.remove(newRegion.getTableDesc().getName());
      }
    }
  }

  /** {@inheritDoc} */
  public void deleteTable(Text tableName) throws IOException {
    new TableDelete(tableName).process();
    LOG.info("deleted table: " + tableName);
  }

  /** {@inheritDoc} */
  public void addColumn(Text tableName, HColumnDescriptor column)
  throws IOException {
   
    new AddColumn(tableName, column).process();
  }

  /** {@inheritDoc} */
  public void deleteColumn(Text tableName, Text columnName) throws IOException {
    new DeleteColumn(tableName, HStoreKey.extractFamily(columnName)).process();
  }

  /** {@inheritDoc} */
  public void enableTable(Text tableName) throws IOException {
    new ChangeTableState(tableName, true).process();
  }

  /** {@inheritDoc} */
  public void disableTable(Text tableName) throws IOException {
    new ChangeTableState(tableName, false).process();
  }

  /** {@inheritDoc} */
  public HServerAddress findRootRegion() {
    return rootRegionLocation.get();
  }

  /*
   * Helper classes for HMasterInterface
   */

  /**
   * Base class for operations that visit every meta row belonging to one
   * table. The constructor selects the meta regions to scan; process() scans
   * them and hands each region row, and then each meta region, to the
   * subclass through processScanItem and postProcessMeta.
   */
  private abstract class TableOperation {
    /** Meta regions to scan for rows of the table. */
    private Set<MetaRegion> metaRegions;
    /** Name of the table being operated on. */
    protected Text tableName;
    /**
     * Regions found in the meta region currently being scanned that are not
     * being served; cleared after each meta region is post-processed.
     */
    protected Set<HRegionInfo> unservedRegions;

    /**
     * Waits for the meta regions to be available and records which of them
     * have to be scanned for the table.
     *
     * @param tableName name of the table to operate on
     * @throws MasterNotRunningException if the master is not running, or
     * starts closing while waiting for the meta regions
     * @throws IOException declared for subclasses and callers
     */
    protected TableOperation(Text tableName) throws IOException {
      if (!isMasterRunning()) {
        throw new MasterNotRunningException();
      }

      this.metaRegions = new HashSet<MetaRegion>();
      this.tableName = tableName;
      this.unservedRegions = new HashSet<HRegionInfo>();

      // We can not access any meta region if they have not already been
      // assigned and scanned.

      if (metaScannerThread.waitForMetaRegionsOrClose()) {
        throw new MasterNotRunningException(); // We're shutting down. Forget it.
      }

      // Pick the key of the first meta region to scan: the only one, an
      // exact match on the table name, or the last key sorting before it.
      Text firstMetaRegion = null;
      if (onlineMetaRegions.size() == 1) {
        firstMetaRegion = onlineMetaRegions.firstKey();

      } else if (onlineMetaRegions.containsKey(tableName)) {
        firstMetaRegion = tableName;

      } else {
        firstMetaRegion = onlineMetaRegions.headMap(tableName).lastKey();
      }

      // Scan that meta region and every one after it.
      this.metaRegions.addAll(onlineMetaRegions.tailMap(firstMetaRegion).values());
    }

    /**
     * Scans the selected meta regions for rows of the table while holding
     * metaScannerLock. Each row's region info, server name and start code are
     * decoded and passed to processScanItem; after each meta region has been
     * scanned postProcessMeta is called for it. The whole pass is retried up
     * to numRetries times when an IOException occurs.
     *
     * @throws IOException if a row has no region info, the table does not
     * exist, or the pass still fails on the last attempt (checkFileSystem()
     * is called before rethrowing)
     */
    void process() throws IOException {
      for (int tries = 0; tries < numRetries; tries++) {
        boolean tableExists = false;
        try {
          synchronized(metaScannerLock) {     // Prevent meta scanner from running
            for (MetaRegion m: metaRegions) {

              // Get a connection to a meta server

              HRegionInterface server =
                connection.getHRegionConnection(m.getServer());

              // Open a scanner on the meta region

              long scannerId =
                server.openScanner(m.getRegionName(), COLUMN_FAMILY_ARRAY,
                    tableName, System.currentTimeMillis(), null);

              try {
                while (true) {
                  // Values decoded from the current row.
                  HRegionInfo info = new HRegionInfo();
                  String serverName = null;
                  long startCode = -1L;

                  MapWritable values = server.next(scannerId);
                  if(values == null || values.size() == 0) {
                    break;
                  }
                  boolean haveRegionInfo = false;
                  for (Map.Entry<Writable, Writable> e: values.entrySet()) {

                    byte[] value = ((ImmutableBytesWritable) e.getValue()).get();
                    // A null or empty value ends the processing of this row's
                    // columns; any remaining columns are not looked at.
                    if (value == null || value.length == 0) {
                      break;
                    }
                    HStoreKey key = (HStoreKey) e.getKey();
                    Text column = key.getColumn();
                    if (column.equals(COL_REGIONINFO)) {
                      haveRegionInfo = true;
                      info = (HRegionInfo) Writables.getWritable(value, info);

                    } else if (column.equals(COL_SERVER)) {
                      try {
                        serverName =
                          Writables.bytesToString(value);

                      } catch (UnsupportedEncodingException ex) {
                        // Only checked when assertions are enabled; otherwise
                        // serverName keeps its previous value.
                        assert(false);
                      }

                    } else if (column.equals(COL_STARTCODE)) {
                      try {
                        startCode = Writables.bytesToLong(value);

                      } catch (UnsupportedEncodingException ex) {
                        // Only checked when assertions are enabled; otherwise
                        // startCode keeps its previous value.
                        assert(false);
                      }
                    }
                  }

                  if (!haveRegionInfo) {
                    throw new IOException(COL_REGIONINFO + " not found");
                  }

                  if (info.tableDesc.getName().compareTo(tableName) > 0) {
                    break;               // Beyond any more entries for this table
                  }

                  tableExists = true;
                  if (!isBeingServed(serverName, startCode)) {
                    unservedRegions.add(info);
                  }
                  processScanItem(serverName, startCode, info);

                } // while(true)

              } finally {
                // Always release the scanner; a failure to close is only
                // logged.
                if (scannerId != -1L) {
                  try {
                    server.close(scannerId);
                  } catch (IOException e) {
                    e = RemoteExceptionHandler.checkIOException(e);
                    LOG.error("closing scanner", e);
                  }
                }
                scannerId = -1L;
              }

              if (!tableExists) {
                throw new IOException(tableName + " does not exist");
              }

              // Let the subclass act on what was found in this meta region,
              // then start the next meta region with an empty unserved set.
              postProcessMeta(m, server);
              unservedRegions.clear();

            } // for(MetaRegion m:)
          } // synchronized(metaScannerLock)

        } catch (IOException e) {
          if (tries == numRetries - 1) {
            // No retries left
            checkFileSystem();
            throw RemoteExceptionHandler.checkIOException(e);
          }
          continue;
        }
        break;
      } // for(tries...)
    }

    /**
     * Tells whether a region row points at a server the master currently
     * knows about.
     *
     * @param serverName server name read from the row, may be null or empty
     * @param startCode start code read from the row, -1 if absent
     * @return true if serversToServerInfo has an entry for serverName whose
     * start code equals startCode
     */
    protected boolean isBeingServed(String serverName, long startCode) {
      boolean result = false;
      if (serverName != null && serverName.length() > 0 && startCode != -1L) {
        HServerInfo s;
        synchronized (serversToServerInfo) {
          s = serversToServerInfo.get(serverName);
        }
        result = s != null && s.getStartCode() == startCode;
      }
      return result;
    }

    /**
     * @param info region to test
     * @return true if the region is not flagged offline
     */
    protected boolean isEnabled(HRegionInfo info) {
      return !info.offLine;
    }

    /**
     * Called by process() for every region row of the table found during the
     * scan.
     *
     * @param serverName server name read from the row, may be null
     * @param startCode start code read from the row, -1 if absent
     * @param info region info read from the row
     * @throws IOException if the subclass fails to handle the row
     */
    protected abstract void processScanItem(String serverName, long startCode,
        HRegionInfo info) throws IOException;

    /**
     * Called by process() after a meta region has been scanned and its
     * scanner closed.
     *
     * @param m the meta region that was scanned
     * @param server connection to the server hosting that meta region
     * @throws IOException if the subclass fails to apply its changes
     */
    protected abstract void postProcessMeta(MetaRegion m,
        HRegionInterface server) throws IOException;
  }

  /** Instantiated to enable or disable a table */
  private class ChangeTableState extends TableOperation {
    private boolean online;

    protected Map<String, HashSet<HRegionInfo>> servedRegions =
      new HashMap<String, HashSet<HRegionInfo>>();
   
    protected long lockid;

    ChangeTableState(Text tableName, boolean onLine) throws IOException {
      super(tableName);
      this.online = onLine;
    }

    @Override
    protected void processScanItem(String serverName, long startCode,
        HRegionInfo info) {
   
      if (isBeingServed(serverName, startCode)) {
        HashSet<HRegionInfo> regions = servedRegions.get(serverName);
        if (regions == null) {
          regions = new HashSet<HRegionInfo>();
        }
        regions.add(info);
        servedRegions.put(serverName, regions);
      }
    }

    @Override
    protected void postProcessMeta(MetaRegion m, HRegionInterface server)
      throws IOException {
     
      // Process regions not being served
     
      if (LOG.isDebugEnabled()) {
        LOG.debug("processing unserved regions");
      }
      for (HRegionInfo i: unservedRegions) {
        if (i.offLine && i.isSplit()) {
          if (LOG.isDebugEnabled()) {
            LOG.debug("Skipping region " + i.toString() + " because it is " +
                "offline because it has been split");
          }
          continue;
        }
       
        // Update meta table
       
        if (LOG.isDebugEnabled()) {
          LOG.debug("updating columns in row: " + i.regionName);
        }

        BatchUpdate b = new BatchUpdate(rand.nextLong());
        lockid = b.startUpdate(i.regionName);
        updateRegionInfo(b, i);
        b.delete(lockid, COL_SERVER);
        b.delete(lockid, COL_STARTCODE);
        server.batchUpdate(m.getRegionName(), System.currentTimeMillis(), b);
        if (LOG.isDebugEnabled()) {
          LOG.debug("updated columns in row: " + i.regionName);
        }

        if (online) {                           // Bring offline regions on-line
          if (!unassignedRegions.containsKey(i.regionName)) {
            unassignedRegions.put(i.regionName, i);
            assignAttempts.put(i.regionName, Long.valueOf(0L));
          }

        } else {                                // Prevent region from getting assigned.
          unassignedRegions.remove(i.regionName);
          assignAttempts.remove(i.regionName);
        }
      }

      // Process regions currently being served

      if (LOG.isDebugEnabled()) {
        LOG.debug("processing regions currently being served");
      }
      for (Map.Entry<String, HashSet<HRegionInfo>> e: servedRegions.entrySet()) {
        String serverName = e.getKey();
        if (online) {
          LOG.debug("Already online");
          continue;                             // Already being served
        }

        // Cause regions being served to be taken off-line and disabled

        HashMap<Text, HRegionInfo> localKillList = killList.get(serverName);
        if (localKillList == null) {
          localKillList = new HashMap<Text, HRegionInfo>();
        }
        for (HRegionInfo i: e.getValue()) {
          if (LOG.isDebugEnabled()) {
            LOG.debug("adding region " + i.regionName + " to local kill list");
          }
          localKillList.put(i.regionName, i);
        }
        if (localKillList.size() > 0) {
          if (LOG.isDebugEnabled()) {
            LOG.debug("inserted local kill list into kill list for server " +
                serverName);
          }
          killList.put(serverName, localKillList);
        }
      }
      servedRegions.clear();
    }

    protected void updateRegionInfo(final BatchUpdate b, final HRegionInfo i)
      throws IOException {
     
      i.offLine = !online;
      b.put(lockid, COL_REGIONINFO, Writables.getBytes(i));
    }
  }

  /**
   * Instantiated to delete a table
   * Note that it extends ChangeTableState, which takes care of disabling
   * the table.
   */
  private class TableDelete extends ChangeTableState {

    TableDelete(Text tableName) throws IOException {
      super(tableName, false);
    }

    @Override
    protected void postProcessMeta(MetaRegion m, HRegionInterface server)
      throws IOException {

      // For regions that are being served, mark them for deletion     
     
      for (HashSet<HRegionInfo> s: servedRegions.values()) {
        for (HRegionInfo i: s) {
          regionsToDelete.add(i.regionName);
        }
      }

      // Unserved regions we can delete now
     
      for (HRegionInfo i: unservedRegions) {
        // Delete the region
     
        try {
          HRegion.deleteRegion(fs, dir, i.regionName);
       
        } catch (IOException e) {
          LOG.error("failed to delete region " + i.regionName,
            RemoteExceptionHandler.checkIOException(e));
        }
      }
      super.postProcessMeta(m, server);
    }

    @Override
    protected void updateRegionInfo(BatchUpdate b,
        @SuppressWarnings("unused") HRegionInfo i) {
     
      b.delete(lockid, COL_REGIONINFO);
    }
  }

  private abstract class ColumnOperation extends TableOperation {
   
    protected ColumnOperation(Text tableName) throws IOException {
      super(tableName);
    }

    @Override
    protected void processScanItem(
        @SuppressWarnings("unused") String serverName,
        @SuppressWarnings("unused") long startCode,
        final HRegionInfo info) throws IOException {
     
      if (isEnabled(info)) {
        throw new TableNotDisabledException(tableName.toString());
      }
    }

    protected void updateRegionInfo(HRegionInterface server, Text regionName,
        HRegionInfo i) throws IOException {

      BatchUpdate b = new BatchUpdate(rand.nextLong());
      long lockid = b.startUpdate(i.regionName);
      b.put(lockid, COL_REGIONINFO, Writables.getBytes(i));
      server.batchUpdate(regionName, System.currentTimeMillis(), b);
      if (LOG.isDebugEnabled()) {
        LOG.debug("updated columns in row: " + i.regionName);
      }
    }
  }

  /** Instantiated to remove a column family from a table */
  private class DeleteColumn extends ColumnOperation {
    private Text columnName;

    DeleteColumn(Text tableName, Text columnName) throws IOException {
      super(tableName);
      this.columnName = columnName;
    }

    @Override
    protected void postProcessMeta(MetaRegion m, HRegionInterface server)
      throws IOException {

      for (HRegionInfo i: unservedRegions) {
        i.tableDesc.families().remove(columnName);
        updateRegionInfo(server, m.getRegionName(), i);

        // Delete the directories used by the column

        fs.delete(HStoreFile.getMapDir(dir, i.regionName, columnName));
        fs.delete(HStoreFile.getInfoDir(dir, i.regionName, columnName));
      }
    }
  }

  /** Instantiated to add a column family to a table */
  private class AddColumn extends ColumnOperation {
    private HColumnDescriptor newColumn;

    AddColumn(Text tableName, HColumnDescriptor newColumn) throws IOException {
      super(tableName);
      this.newColumn = newColumn;
    }

    @Override
    protected void postProcessMeta(MetaRegion m, HRegionInterface server)
      throws IOException {

      for (HRegionInfo i: unservedRegions) {

        // All we need to do to add a column is add it to the table descriptor.
        // When the region is brought on-line, it will find the column missing
        // and create it.

        i.tableDesc.addFamily(newColumn);
        updateRegionInfo(server, m.getRegionName(), i);
      }
    }
  }

  /*
   * Managing leases
   */

  /** Instantiated to monitor the health of a region server */
  private class ServerExpirer implements LeaseListener {
    @SuppressWarnings("hiding")
    private String server;

    ServerExpirer(String server) {
      this.server = server;
    }

    /** {@inheritDoc} */
    public void leaseExpired() {
      LOG.info(server + " lease expired");
      // Remove the server from the known servers list and update load info
      HServerInfo info;
      synchronized (serversToServerInfo) {
        info = serversToServerInfo.remove(server);
        if (info != null) {
          String serverName = info.getServerAddress().toString();
          HServerLoad load = serversToLoad.remove(serverName);
          if (load != null) {
            Set<String> servers = loadToServers.get(load);
            if (servers != null) {
              servers.remove(serverName);
              loadToServers.put(load, servers);
            }
          }
        }
        serversToServerInfo.notifyAll();
      }

      // NOTE: If the server was serving the root region, we cannot reassign it
      // here because the new server will start serving the root region before
      // the PendingServerShutdown operation has a chance to split the log file.
      if (info != null) {
        shutdownQueue.put(new PendingServerShutdown(info));
      }
    }
  }

  /*
   * Main program
   */

  private static void printUsageAndExit() {
    System.err.println("Usage: java org.apache.hbase.HMaster " +
    "[--bind=hostname:port] start|stop");
    System.exit(0);
  }

  protected static void doMain(String [] args,
      Class<? extends HMaster> masterClass) {
    if (args.length < 1) {
      printUsageAndExit();
    }

    Configuration conf = new HBaseConfiguration();

    // Process command-line args. TODO: Better cmd-line processing
    // (but hopefully something not as painful as cli options).

    final String addressArgKey = "--bind=";
    for (String cmd: args) {
      if (cmd.startsWith(addressArgKey)) {
        conf.set(MASTER_ADDRESS, cmd.substring(addressArgKey.length()));
        continue;
      }

      if (cmd.equals("start")) {
        try {
          Constructor<? extends HMaster> c =
            masterClass.getConstructor(Configuration.class);
          HMaster master = c.newInstance(conf);
          master.start();
        } catch (Throwable t) {
          LOG.error( "Can not start master", t);
          System.exit(-1);
        }
        break;
      }

      if (cmd.equals("stop")) {
        try {
          HBaseAdmin adm = new HBaseAdmin(conf);
          adm.shutdown();
        } catch (Throwable t) {
          LOG.error( "Can not stop master", t);
          System.exit(-1);
        }
        break;
      }

      // Print out usage if we get to here.
      printUsageAndExit();
    }
  }
 
  /**
   * Main program
   * @param args
   */
  public static void main(String [] args) {
    doMain(args, HMaster.class);
  }
}
TOP

Related Classes of org.apache.hadoop.hbase.HMaster$ChangeTableState

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.