Package org.apache.hadoop.dfs

Source Code of org.apache.hadoop.dfs.FSNamesystem

/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.dfs;

import org.apache.commons.logging.*;

import org.apache.hadoop.io.*;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.util.*;
import org.apache.hadoop.mapred.StatusHttpServer;
import org.apache.hadoop.net.NetworkTopology;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.ipc.Server;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.*;
import java.util.Map.Entry;

/***************************************************
* FSNamesystem does the actual bookkeeping work for the
* DataNode.
*
* It tracks several important tables.
*
* 1)  valid fsname --> blocklist  (kept on disk, logged)
* 2)  Set of all valid blocks (inverted #1)
* 3)  block --> machinelist (kept in memory, rebuilt dynamically from reports)
* 4)  machine --> blocklist (inverted #2)
* 5)  LRU cache of updated-heartbeat machines
***************************************************/
class FSNamesystem implements FSConstants {
  public static final Log LOG = LogFactory.getLog("org.apache.hadoop.fs.FSNamesystem");

  //
  // Stores the correct file name hierarchy
  //
  FSDirectory dir;

  //
  // Stores the block-->datanode(s) map.  Updated only in response
  // to client-sent information.
  // Mapping: Block -> { INode, datanodes, self ref }
  //
  BlocksMap blocksMap = new BlocksMap();
   
  /**
   * Stores the datanode -> block map. 
   * <p>
   * Done by storing a set of {@link DatanodeDescriptor} objects, sorted by
   * storage id. In order to keep the storage map consistent it tracks
   * all storages ever registered with the namenode.
   * A descriptor corresponding to a specific storage id can be
   * <ul>
   * <li>added to the map if it is a new storage id;</li>
   * <li>updated with a new datanode started as a replacement for the old one
   * with the same storage id; and </li>
   * <li>removed if and only if an existing datanode is restarted to serve a
   * different storage id.</li>
   * </ul> <br>
   * The list of the {@link DatanodeDescriptor}s in the map is checkpointed
   * in the namespace image file. Only the {@link DatanodeInfo} part is
   * persistent, the list of blocks is restored from the datanode block
   * reports.
   * <p>
   * Mapping: StorageID -> DatanodeDescriptor
   */
  Map<String, DatanodeDescriptor> datanodeMap =
    new TreeMap<String, DatanodeDescriptor>();
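
  // Illustrative sketch (not part of the original source): during a
  // heartbeat or block report the descriptor is looked up by storage id,
  // e.g.  DatanodeDescriptor dn = datanodeMap.get(nodeReg.getStorageID());
  // a null result means that storage was never registered, or that it was
  // removed because its datanode re-registered under a different storage id.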

  //
  // Keeps a Collection for every named machine containing
  // blocks that have recently been invalidated and are thought to live
  // on the machine in question.
  // Mapping: StorageID -> ArrayList<Block>
  //
  private Map<String, Collection<Block>> recentInvalidateSets =
    new TreeMap<String, Collection<Block>>();

  //
  // Keeps a TreeSet for every named node.  Each treeset contains
  // a list of the blocks that are "extra" at that location.  We'll
  // eventually remove these extras.
  // Mapping: StorageID -> TreeSet<Block>
  //
  private Map<String, Collection<Block>> excessReplicateMap =
    new TreeMap<String, Collection<Block>>();

  //
  // Keeps track of files that are being created, plus the
  // blocks that make them up.
  //
  PendingCreates pendingCreates = new PendingCreates();

  //
  // Stats on overall usage
  //
  long totalCapacity = 0, totalRemaining = 0;

  // total number of connections per live datanode
  int totalLoad = 0;


  //
  // For the HTTP browsing interface
  //
  StatusHttpServer infoServer;
  int infoPort;
  String infoBindAddress;
  Date startTime;
   
  //
  Random r = new Random();

  /**
   * Stores a set of DatanodeDescriptor objects.
   * This is a subset of {@link #datanodeMap}, containing nodes that are
   * considered alive.
   * The {@link HeartbeatMonitor} periodically checks for outdated entries,
   * and removes them from the list.
   */
  ArrayList<DatanodeDescriptor> heartbeats = new ArrayList<DatanodeDescriptor>();

  //
  // Store set of Blocks that need to be replicated 1 or more times.
  // We also store pending replication-orders.
  // Set of: Block
  //
  private UnderReplicatedBlocks neededReplications = new UnderReplicatedBlocks();
  private PendingReplicationBlocks pendingReplications;

  //
  // Used for handling lock-leases
  // Mapping: leaseHolder -> Lease
  //
  private Map<StringBytesWritable, Lease> leases = new TreeMap<StringBytesWritable, Lease>();
  // Set of: Lease
  private SortedSet<Lease> sortedLeases = new TreeSet<Lease>();

  //
  // Threaded object that checks to see if we have been
  // getting heartbeats from all clients.
  //
  Daemon hbthread = null;   // HeartbeatMonitor thread
  Daemon lmthread = null;   // LeaseMonitor thread
  Daemon smmthread = null;   // SafeModeMonitor thread
  Daemon replthread = null;  // Replication thread
  volatile boolean fsRunning = true;
  long systemStart = 0;

  //  The maximum number of replicas we should allow for a single block
  private int maxReplication;
  //  How many outgoing replication streams a given node should have at one time
  private int maxReplicationStreams;
  // MIN_REPLICATION is how many copies we need in place or else we disallow the write
  private int minReplication;
  // Default replication
  private int defaultReplication;
  // heartbeatRecheckInterval is how often namenode checks for expired datanodes
  private long heartbeatRecheckInterval;
  // heartbeatExpireInterval is how long namenode waits for datanode to report
  // heartbeat
  private long heartbeatExpireInterval;
  //replicationRecheckInterval is how often namenode checks for new replication work
  private long replicationRecheckInterval;
  //decommissionRecheckInterval is how often namenode checks if a node has finished decommission
  private long decommissionRecheckInterval;
  private int replIndex = 0; // last datanode used for replication work
  static int REPL_WORK_PER_ITERATION = 32; // max percent datanodes per iteration

  public static FSNamesystem fsNamesystemObject;
  private String localMachine;
  private int port;
  private SafeModeInfo safeMode;  // safe mode information
  private Host2NodesMap host2DataNodeMap = new Host2NodesMap();
   
  // datanode network topology
  NetworkTopology clusterMap = new NetworkTopology();
  // for block replicas placement
  ReplicationTargetChooser replicator;

  private HostsFileReader hostsReader;
  private Daemon dnthread = null;

  /**
   * dirs is a list of directories where the filesystem directory state
   * is stored
   */
  public FSNamesystem(String hostname,
                      int port,
                      NameNode nn, Configuration conf) throws IOException {
    fsNamesystemObject = this;
    setConfigurationParameters(conf);

    this.localMachine = hostname;
    this.port = port;
    this.dir = new FSDirectory(this, conf);
    StartupOption startOpt = NameNode.getStartupOption(conf);
    this.dir.loadFSImage(getNamespaceDirs(conf), startOpt);
    this.safeMode = new SafeModeInfo(conf);
    setBlockTotal();
    pendingReplications = new PendingReplicationBlocks(LOG,
                          conf.getInt("dfs.replication.pending.timeout.sec",
                                      -1) * 1000);
    this.hbthread = new Daemon(new HeartbeatMonitor());
    this.lmthread = new Daemon(new LeaseMonitor());
    this.replthread = new Daemon(new ReplicationMonitor());
    hbthread.start();
    lmthread.start();
    replthread.start();
    this.systemStart = now();
    this.startTime = new Date(systemStart);
       
    this.hostsReader = new HostsFileReader(conf.get("dfs.hosts",""),
                                           conf.get("dfs.hosts.exclude",""));
    this.dnthread = new Daemon(new DecommissionedMonitor());
    dnthread.start();

    this.infoPort = conf.getInt("dfs.info.port", 50070);
    this.infoBindAddress = conf.get("dfs.info.bindAddress", "0.0.0.0");
    this.infoServer = new StatusHttpServer("dfs", infoBindAddress, infoPort, false);
    this.infoServer.setAttribute("name.system", this);
    this.infoServer.setAttribute("name.node", nn);
    this.infoServer.setAttribute("name.conf", conf);
    this.infoServer.addServlet("fsck", "/fsck", FsckServlet.class);
    this.infoServer.addServlet("getimage", "/getimage", GetImageServlet.class);
    this.infoServer.addServlet("listPaths", "/listPaths/*", ListPathsServlet.class);
    this.infoServer.addServlet("data", "/data/*", FileDataServlet.class);
    this.infoServer.start();
       
    // The web-server port can be ephemeral... ensure we have the correct info
    this.infoPort = this.infoServer.getPort();
    conf.setInt("dfs.info.port", this.infoPort);
    LOG.info("Web-server up at: " + conf.get("dfs.info.port"));
  }

  static Collection<File> getNamespaceDirs(Configuration conf) {
    String[] dirNames = conf.getStrings("dfs.name.dir");
    if (dirNames == null)
      dirNames = new String[] {"/tmp/hadoop/dfs/name"};
    Collection<File> dirs = new ArrayList<File>(dirNames.length);
    for(int idx = 0; idx < dirNames.length; idx++) {
      dirs.add(new File(dirNames[idx]));
    }
    return dirs;
  }
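
  // Illustrative configuration (assumed example, not from the original
  // source): setting dfs.name.dir to a comma-separated list such as
  //   <value>/data/1/dfs/name,/data/2/dfs/name</value>
  // makes getNamespaceDirs() return one File per directory; if the
  // property is unset, the single default /tmp/hadoop/dfs/name is used.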

  /**
   * dirs is a list of directories where the filesystem directory state
   * is stored
   */
  FSNamesystem(FSImage fsImage, Configuration conf) throws IOException {
    fsNamesystemObject = this;
    setConfigurationParameters(conf);
    this.dir = new FSDirectory(fsImage, this, conf);
  }

  /**
   * Initializes some of the members from configuration
   */
  private void setConfigurationParameters(Configuration conf)
                                          throws IOException {
    this.replicator = new ReplicationTargetChooser(
                                                   conf.getBoolean("dfs.replication.considerLoad", true),
                                                   this,
                                                   clusterMap,
                                                   LOG);
    this.defaultReplication = conf.getInt("dfs.replication", 3);
    this.maxReplication = conf.getInt("dfs.replication.max", 512);
    this.minReplication = conf.getInt("dfs.replication.min", 1);
    if (minReplication <= 0)
      throw new IOException(
                            "Unexpected configuration parameters: dfs.replication.min = "
                            + minReplication
                            + " must be greater than 0");
    if (maxReplication >= (int)Short.MAX_VALUE)
      throw new IOException(
                            "Unexpected configuration parameters: dfs.replication.max = "
                            + maxReplication + " must be less than " + (Short.MAX_VALUE));
    if (maxReplication < minReplication)
      throw new IOException(
                            "Unexpected configuration parameters: dfs.replication.min = "
                            + minReplication
                            + " must be less than dfs.replication.max = "
                            + maxReplication);
    this.maxReplicationStreams = conf.getInt("dfs.max-repl-streams", 2);
    long heartbeatInterval = conf.getLong("dfs.heartbeat.interval", 3) * 1000;
    this.heartbeatRecheckInterval = 5 * 60 * 1000; // 5 minutes
    this.heartbeatExpireInterval = 2 * heartbeatRecheckInterval +
      10 * heartbeatInterval;
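    // With the defaults above (heartbeatRecheckInterval = 5 min,
    // dfs.heartbeat.interval = 3 s) this works out to
    // 2 * 300,000 + 10 * 3,000 = 630,000 ms, i.e. a datanode is treated
    // as expired roughly 10.5 minutes after its last heartbeat.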
    this.replicationRecheckInterval = 3 * 1000; // 3 seconds
    this.decommissionRecheckInterval = conf.getInt(
                                                   "dfs.namenode.decommission.interval",
                                                   5 * 60 * 1000);   
  }

  /** Return the FSNamesystem object
   *
   */
  public static FSNamesystem getFSNamesystem() {
    return fsNamesystemObject;
  }
   
  NamespaceInfo getNamespaceInfo() {
    return new NamespaceInfo(dir.fsImage.getNamespaceID(),
                             dir.fsImage.getCTime(),
                             getDistributedUpgradeVersion());
  }

  /** Close down this filesystem manager.
   * Causes heartbeat and lease daemons to stop; waits briefly for
   * them to finish, but a short timeout returns control back to caller.
   */
  public void close() {
    fsRunning = false;
    try {
      if (pendingReplications != null) pendingReplications.stop();
      if (infoServer != null) infoServer.stop();
      if (hbthread != null) hbthread.interrupt();
      if (replthread != null) replthread.interrupt();
      if (dnthread != null) dnthread.interrupt();
      if (smmthread != null) smmthread.interrupt();
    } catch (InterruptedException ie) {
    } finally {
      // using finally to ensure we also wait for lease daemon
      try {
        if (lmthread != null) {
          lmthread.interrupt();
          lmthread.join(3000);
        }
      } catch (InterruptedException ie) {
      } finally {
        try {
          dir.close();
        } catch (IOException ex) {
          // do nothing
        }
      }
    }
  }

  /**
   * Dump all metadata into specified file
   */
  void metaSave(String filename) throws IOException {
    File file = new File(System.getProperty("hadoop.log.dir"),
                         filename);
    PrintWriter out = new PrintWriter(new BufferedWriter(
                                                         new FileWriter(file, true)));

    //
    // Dump contents of neededReplication
    //
    synchronized (neededReplications) {
      out.println("Metasave: Blocks waiting for replication: " +
                  neededReplications.size());
      if (neededReplications.size() > 0) {
        for (Iterator<Block> it = neededReplications.iterator();
             it.hasNext();) {
          Block block = it.next();
          out.print(block);
          for (Iterator<DatanodeDescriptor> jt = blocksMap.nodeIterator(block);
               jt.hasNext();) {
            DatanodeDescriptor node = jt.next();
            out.print(" " + node + " : ");
          }
          out.println("");
        }
      }
    }

    //
    // Dump blocks from pendingReplication
    //
    pendingReplications.metaSave(out);

    //
    // Dump blocks that are waiting to be deleted
    //
    dumpRecentInvalidateSets(out);

    //
    // Dump all datanodes
    //
    datanodeDump(out);

    out.flush();
    out.close();
  }
   
  /* get replication factor of a block */
  private int getReplication(Block block) {
    FSDirectory.INode fileINode = blocksMap.getINode(block);
    if (fileINode == null) { // block does not belong to any file
      return 0;
    } else {
      return fileINode.getReplication();
    }
  }

  /* updates a block in under replication queue */
  synchronized void updateNeededReplications(Block block,
                        int curReplicasDelta, int expectedReplicasDelta) {
    NumberReplicas repl = countNodes(block);
    int curExpectedReplicas = getReplication(block);
    neededReplications.update(block,
                              repl.liveReplicas(),
                              repl.decommissionedReplicas(),
                              curExpectedReplicas,
                              curReplicasDelta, expectedReplicasDelta);
  }

  /**
   * Used only during DFS upgrade for block level CRCs (HADOOP-1134).
   * This returns information for a given block that includes:
   * <li> full path name for the file that contains the block.
   * <li> offset of first byte of the block.
   * <li> file length and length of the block.
   * <li> all block locations for the crc file (".file.crc").
   * <li> replication for crc file.
   * When replicas is true, the block's replica locations are included
   * instead of the crc file's blocks.
   */
  public synchronized BlockCrcInfo blockCrcInfo(Block block,
                                                boolean replicas) {
    BlockCrcInfo crcInfo = new BlockCrcInfo();
    crcInfo.status = BlockCrcInfo.STATUS_ERROR;
   
    FSDirectory.INode fileINode = blocksMap.getINode(block);
    if ( fileINode == null || fileINode.isDir() ) {
      // The most probable reason is that this block does not exist
      if (blocksMap.getStoredBlock(block) == null) {
        crcInfo.status = BlockCrcInfo.STATUS_UNKNOWN_BLOCK;
      } else {
        LOG.warn("getBlockCrcInfo(): Could not find file for " + block);
      }
      return crcInfo;
    }

    crcInfo.fileName = fileINode.getAbsoluteName();
   
    // Find the offset and length for this block.
    Block[] fileBlocks = fileINode.getBlocks();
    crcInfo.blockLen = -1;
    if ( fileBlocks != null ) {
      for ( Block b:fileBlocks ) {
        if ( block.equals(b) ) {
          crcInfo.blockLen = b.getNumBytes();
        }
        if ( crcInfo.blockLen < 0 ) {
          crcInfo.startOffset += b.getNumBytes();
        }
        crcInfo.fileSize += b.getNumBytes();
      }
    }

    if ( crcInfo.blockLen < 0 ) {
      LOG.warn("blockCrcInfo(): " + block +
               " could not be found in blocks for " + crcInfo.fileName);
      return crcInfo;
    }
   
    String fileName = fileINode.getLocalName();   
    if ( fileName.startsWith(".") && fileName.endsWith(".crc") ) {
      crcInfo.status = BlockCrcInfo.STATUS_CRC_BLOCK;
      return crcInfo;
    }

    if (replicas) {
      // include block replica locations, instead of crcBlocks
      crcInfo.blockLocationsIncluded = true;
     
      DatanodeInfo[] dnInfo = new DatanodeInfo[blocksMap.numNodes(block)];
      Iterator<DatanodeDescriptor> it = blocksMap.nodeIterator(block);
      for (int i=0; it != null && it.hasNext(); i++ ) {
        dnInfo[i] = new DatanodeInfo(it.next());
      }
      crcInfo.blockLocations = new LocatedBlock(block, dnInfo,
                                                crcInfo.startOffset);
    } else {

      //Find CRC file
      String crcName = "." + fileName + ".crc";
      FSDirectory.INode crcINode = fileINode.getParent().getChild(crcName);

      if ( crcINode == null ) {
        // Should we log this?
        crcInfo.status = BlockCrcInfo.STATUS_NO_CRC_DATA;
        return crcInfo;
      }

      Block[] blocks = crcINode.getBlocks();
      if ( blocks == null )  {
        LOG.warn("getBlockCrcInfo(): could not find blocks for crc file for " +
                 crcInfo.fileName);
        return crcInfo;
      }

      crcInfo.crcBlocks = new LocatedBlock[ blocks.length ];
      for (int i=0; i<blocks.length; i++) {
        DatanodeInfo[] dnArr = new DatanodeInfo[ blocksMap.numNodes(blocks[i]) ];
        int idx = 0;
        for (Iterator<DatanodeDescriptor> it = blocksMap.nodeIterator(blocks[i]);
        it.hasNext();) {
          dnArr[ idx++ ] = it.next();
        }
        crcInfo.crcBlocks[i] = new LocatedBlock(blocks[i], dnArr);
      }

      crcInfo.crcReplication = crcINode.getReplication();
    }
   
    crcInfo.status = BlockCrcInfo.STATUS_DATA_BLOCK;
    return crcInfo;
  }
 
  /////////////////////////////////////////////////////////
  //
  // These methods are called by HadoopFS clients
  //
  /////////////////////////////////////////////////////////
  /**
   * Get block locations within the specified range.
   *
   * @see ClientProtocol#open(String, long, long)
   * @see ClientProtocol#getBlockLocations(String, long, long)
   */
  LocatedBlocks getBlockLocations(String clientMachine,
                                  String src,
                                  long offset,
                                  long length
                                  ) throws IOException {
    if (offset < 0) {
      throw new IOException("Negative offset is not supported. File: " + src );
    }
    if (length < 0) {
      throw new IOException("Negative length is not supported. File: " + src );
    }

    DatanodeDescriptor client = null;
    LocatedBlocks blocks =  getBlockLocations(dir.getFileINode(src),
                                              offset, length,
                                              Integer.MAX_VALUE);
    if (blocks == null) {
      return null;
    }
    client = host2DataNodeMap.getDatanodeByHost(clientMachine);
    for (Iterator<LocatedBlock> it = blocks.getLocatedBlocks().iterator();
         it.hasNext();) {
      LocatedBlock block = (LocatedBlock) it.next();
      clusterMap.pseudoSortByDistance(client,
                                (DatanodeDescriptor[])(block.getLocations()));
    }
    return blocks;
  }
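
  // Illustrative (assumed usage, not part of the original source): a client
  // opening a file reaches this method through the ClientProtocol calls
  // referenced in the javadoc above, typically with offset 0 and the full
  // file length; the locations of each returned block are ordered so that
  // datanodes closer to clientMachine (per pseudoSortByDistance) come first.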
 
  private synchronized LocatedBlocks getBlockLocations(FSDirectory.INode inode,
                                                       long offset,
                                                       long length,
                                                       int nrBlocksToReturn) {
    if(inode == null || inode.isDir()) {
      return null;
    }
    Block[] blocks = inode.getBlocks();
    if (blocks == null) {
      return null;
    }
    assert blocks.length > 0 : "Array of blocks is empty.";
    List<LocatedBlock> results;
    results = new ArrayList<LocatedBlock>(blocks.length);

    int curBlk = 0;
    long curPos = 0, blkSize = 0;
    int nrBlocks = (blocks[0].getNumBytes() == 0) ? 0 : blocks.length;
    for (curBlk = 0; curBlk < nrBlocks; curBlk++) {
      blkSize = blocks[curBlk].getNumBytes();
      assert blkSize > 0 : "Block of size 0";
      if (curPos + blkSize > offset) {
        break;
      }
      curPos += blkSize;
    }
   
    if (nrBlocks > 0 && curBlk == nrBlocks)   // offset >= end of file
      return null;
   
    long endOff = offset + length;
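    // Worked example (illustrative): for a file made of two 64 MB blocks,
    // a request with offset = 70 MB and length = 10 MB skips block 0 in the
    // loop above (0 + 64 MB is not > 70 MB), so curBlk = 1 and
    // curPos = 64 MB here; the loop below then returns only block 1, since
    // after adding it curPos (128 MB) already exceeds endOff (80 MB).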
   
    do {
      // get block locations
      int numNodes = blocksMap.numNodes(blocks[curBlk]);
      DatanodeDescriptor[] machineSet = new DatanodeDescriptor[numNodes];
      if (numNodes > 0) {
        numNodes = 0;
        for(Iterator<DatanodeDescriptor> it =
            blocksMap.nodeIterator(blocks[curBlk]); it.hasNext();) {
          machineSet[numNodes++] = it.next();
        }
      }
      results.add(new LocatedBlock(blocks[curBlk], machineSet, curPos));
      curPos += blocks[curBlk].getNumBytes();
      curBlk++;
    } while (curPos < endOff
          && curBlk < blocks.length
          && results.size() < nrBlocksToReturn);
   
    return new LocatedBlocks(inode, results);
  }

  /**
   * Set replication for an existing file.
   *
   * The NameNode sets new replication and schedules either replication of
   * under-replicated data blocks or removal of the excessive block copies
   * if the blocks are over-replicated.
   *
   * @see ClientProtocol#setReplication(String, short)
   * @param src file name
   * @param replication new replication
   * @return true if successful;
   *         false if file does not exist or is a directory
   */
  public boolean setReplication(String src, short replication)
                                throws IOException {
    boolean status = setReplicationInternal(src, replication);
    getEditLog().logSync();
    return status;
  }
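
  // Illustrative (assumed client-side path, not part of the original
  // source): a call such as fs.setReplication(new Path("/user/foo/data"),
  // (short) 5) arrives here through ClientProtocol.setReplication and
  // either queues replication work or schedules excess replicas for
  // removal, depending on the previous replication factor.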

  private synchronized boolean setReplicationInternal(String src,
                                             short replication
                                             ) throws IOException {
    if (isInSafeMode())
      throw new SafeModeException("Cannot set replication for " + src, safeMode);
    verifyReplication(src, replication, null);

    int[] oldReplication = new int[1];
    Block[] fileBlocks;
    fileBlocks = dir.setReplication(src, replication, oldReplication);
    if (fileBlocks == null) // file not found or is a directory
      return false;
    int oldRepl = oldReplication[0];
    if (oldRepl == replication) // the same replication
      return true;

    // update needReplication priority queues
    LOG.info("Increasing replication for file " + src
             + ". New replication is " + replication);
    for(int idx = 0; idx < fileBlocks.length; idx++)
      updateNeededReplications(fileBlocks[idx], 0, replication-oldRepl);
     
    if (oldRepl > replication) { 
      // old replication > the new one; need to remove copies
      LOG.info("Reducing replication for file " + src
               + ". New replication is " + replication);
      for(int idx = 0; idx < fileBlocks.length; idx++)
        proccessOverReplicatedBlock(fileBlocks[idx], replication);
    }
    return true;
  }
   
  public long getBlockSize(String filename) throws IOException {
    return dir.getBlockSize(filename);
  }
   
  /**
   * Check whether the replication parameter is within the range
   * determined by system configuration.
   */
  private void verifyReplication(String src,
                                 short replication,
                                 String clientName
                                 ) throws IOException {
    String text = "file " + src
      + ((clientName != null) ? " on client " + clientName : "")
      + ".\n"
      + "Requested replication " + replication;

    if (replication > maxReplication)
      throw new IOException(text + " exceeds maximum " + maxReplication);
     
    if (replication < minReplication)
      throw new IOException(
                            text + " is less than the required minimum " + minReplication);
  }
   
  /**
   * The client would like to create a new block for the indicated
   * filename.  Return an array that consists of the block, plus a set
   * of machines.  The first on this list should be where the client
   * writes data.  Subsequent items in the list must be provided in
   * the connection to the first datanode.
   * @return Return an array that consists of the block, plus a set
   * of machines
   * @throws IOException if the filename is invalid
   *         {@link FSDirectory#isValidToCreate(String)}.
   */
  public LocatedBlock startFile(String src,
                                String holder,
                                String clientMachine,
                                boolean overwrite,
                                short replication,
                                long blockSize
                                ) throws IOException {

    //
    // Create file into pendingCreates and get the first blockId
    //
    Block newBlock = startFileInternal(src, holder, clientMachine,
                                       overwrite, replication,
                                       blockSize);

    //
    // Get the array of replication targets
    //
    try {
      DatanodeDescriptor clientNode =
        host2DataNodeMap.getDatanodeByHost(clientMachine);
      DatanodeDescriptor targets[] = replicator.chooseTarget(replication,
                                                             clientNode, null, blockSize);
      if (targets.length < this.minReplication) {
        if (clusterMap.getNumOfLeaves() == 0) {
          throw new IOException("Failed to create file " + src
                                + " on client " + clientMachine
                                + " because this cluster has no datanodes.");
        }
        throw new IOException("Failed to create file " + src
                              + " on client " + clientMachine
                              + " because there were not enough datanodes available. "
                              + "Found " + targets.length
                              + " datanodes but MIN_REPLICATION for the cluster is "
                              + "configured to be "
                              + this.minReplication
                              + ".");
      }
      return new LocatedBlock(newBlock, targets, 0L);

    } catch (IOException ie) {
      NameNode.stateChangeLog.warn("DIR* NameSystem.startFile: "
                                   + ie.getMessage());
      throw ie;
    }
  }

  public synchronized Block startFileInternal(String src,
                                              String holder,
                                              String clientMachine,
                                              boolean overwrite,
                                              short replication,
                                              long blockSize
                                               ) throws IOException {
    NameNode.stateChangeLog.debug("DIR* NameSystem.startFile: file "
                                  +src+" for "+holder+" at "+clientMachine);
    if (isInSafeMode())
      throw new SafeModeException("Cannot create file" + src, safeMode);
    if (!isValidName(src)) {
      throw new IOException("Invalid file name: " + src);         
    }
    try {
      FileUnderConstruction pendingFile = pendingCreates.get(src);
      if (pendingFile != null) {
        //
        // If the file exists in pendingCreate, then it must be in our
        // leases. Find the appropriate lease record.
        //
        Lease lease = getLease(holder);
        //
        // We found the lease for this file. And surprisingly the original
        // holder is trying to recreate this file. This should never occur.
        //
        if (lease != null) {
          throw new AlreadyBeingCreatedException(
                                                 "failed to create file " + src + " for " + holder +
                                                 " on client " + clientMachine +
                                                 " because current leaseholder is trying to recreate file.");
        }
        //
        // Find the original holder.
        //
        lease = getLease(pendingFile.getClientName());
        if (lease == null) {
          throw new AlreadyBeingCreatedException(
                                                 "failed to create file " + src + " for " + holder +
                                                 " on client " + clientMachine +
                                                 " because pendingCreates is non-null but no leases found.");
        }
        //
        // If the original holder has not renewed in the last SOFTLIMIT
        // period, then reclaim all resources and allow this request
        // to proceed. Otherwise, prevent this request from creating file.
        //
        if (lease.expiredSoftLimit()) {
          lease.releaseLocks();
          removeLease(lease.getHolder());
          LOG.info("Removing lease " + lease + " ");
          if (!sortedLeases.remove(lease)) {
            LOG.error("Unknown failure trying to remove " + lease +
                      " from lease set.");
          }
        } else {
          throw new AlreadyBeingCreatedException(
                                                 "failed to create file " + src + " for " + holder +
                                                 " on client " + clientMachine +
                                                 ", because this file is already being created by " +
                                                 pendingFile.getClientName() +
                                                 " on " + pendingFile.getClientMachine());
        }
      }

      try {
        verifyReplication(src, replication, clientMachine);
      } catch(IOException e) {
        throw new IOException("failed to create "+e.getMessage());
      }
      if (!dir.isValidToCreate(src)) {
        if (overwrite) {
          delete(src);
        } else {
          throw new IOException("failed to create file " + src
                                +" on client " + clientMachine
                                +" either because the filename is invalid or the file exists");
        }
      }

      DatanodeDescriptor clientNode =
        host2DataNodeMap.getDatanodeByHost(clientMachine);

      // Reserve space for this pending file
      pendingCreates.put(src,
                         new FileUnderConstruction(replication,
                                                   blockSize,
                                                   holder,
                                                   clientMachine,
                                                   clientNode));
      NameNode.stateChangeLog.debug("DIR* NameSystem.startFile: "
                                    +"add "+src+" to pendingCreates for "+holder);
      synchronized (leases) {
        Lease lease = getLease(holder);
        if (lease == null) {
          lease = new Lease(holder);
          putLease(holder, lease);
          sortedLeases.add(lease);
        } else {
          sortedLeases.remove(lease);
          lease.renew();
          sortedLeases.add(lease);
        }
        lease.startedCreate(src);
      }
     
      // Create first block
      return allocateBlock(src);
    } catch (IOException ie) {
      NameNode.stateChangeLog.warn("DIR* NameSystem.startFile: "
                                   +ie.getMessage());
      throw ie;
    }
  }

  /**
   * The client would like to obtain an additional block for the indicated
   * filename (which is being written-to).  Return an array that consists
   * of the block, plus a set of machines.  The first on this list should
   * be where the client writes data.  Subsequent items in the list must
   * be provided in the connection to the first datanode.
   *
   * Make sure the previous blocks have been reported by datanodes and
   * are replicated.  If they are not, a NotReplicatedYetException is
   * thrown so that the client can try again later.
   */
  public LocatedBlock getAdditionalBlock(String src,
                                         String clientName
                                         ) throws IOException {
    long fileLength, blockSize;
    int replication;
    DatanodeDescriptor clientNode = null;
    Block newBlock = null;

    NameNode.stateChangeLog.debug("BLOCK* NameSystem.getAdditionalBlock: file "
                                  +src+" for "+clientName);

    synchronized (this) {
      if (isInSafeMode()) {
        throw new SafeModeException("Cannot add block to " + src, safeMode);
      }

      //
      // make sure that we still have the lease on this file
      //
      FileUnderConstruction pendingFile = pendingCreates.get(src);
      if (pendingFile == null) {
        throw new LeaseExpiredException("No lease on " + src);
      }
      if (!pendingFile.getClientName().equals(clientName)) {
        throw new LeaseExpiredException("Lease mismatch on " + src + " owned by "
                                        + pendingFile.getClientName()
                                        + " and appended by " + clientName);
      }

      //
      // If we fail this, bad things happen!
      //
      if (!checkFileProgress(pendingFile, false)) {
        throw new NotReplicatedYetException("Not replicated yet:" + src);
      }
      fileLength = pendingFile.computeFileLength();
      blockSize = pendingFile.getBlockSize();
      clientNode = pendingFile.getClientNode();
      replication = (int)pendingFile.getReplication();
      newBlock = allocateBlock(src);
    }

    DatanodeDescriptor targets[] = replicator.chooseTarget(replication,
                                                           clientNode,
                                                           null,
                                                           blockSize);
    if (targets.length < this.minReplication) {
      throw new IOException("File " + src + " could only be replicated to " +
                            targets.length + " nodes, instead of " +
                            minReplication);
    }
       
    // Create next block
    return new LocatedBlock(newBlock, targets, fileLength);
  }

  /**
   * The client would like to let go of the given block
   */
  public synchronized boolean abandonBlock(Block b, String src) throws IOException {
    //
    // Remove the block from the pending creates list
    //
    NameNode.stateChangeLog.debug("BLOCK* NameSystem.abandonBlock: "
                                  +b.getBlockName()+" of file "+src);
    boolean status = pendingCreates.removeBlock(src, b);
    if (status) {
      NameNode.stateChangeLog.debug("BLOCK* NameSystem.abandonBlock: "
                                    + b.getBlockName()
                                    + " is removed from pendingCreates");
    }
    return status;
  }

  /**
   * Abandon the entire file in progress
   */
  public synchronized void abandonFileInProgress(String src,
                                                 String holder
                                                 ) throws IOException {
    NameNode.stateChangeLog.debug("DIR* NameSystem.abandonFileInProgress:" + src);
    synchronized (leases) {
      // find the lease
      Lease lease = getLease(holder);
      if (lease != null) {
        // remove the file from the lease
        if (lease.completedCreate(src)) {
          // if we found the file in the lease, remove it from pendingCreates
          internalReleaseCreate(src, holder);
        } else {
          LOG.info("Attempt by " + holder +
                   " to release someone else's create lock on " + src);
        }
      } else {
        LOG.info("Attempt to release a lock from an unknown lease holder "
                 + holder + " for " + src);
      }
    }
  }

  /**
   * Finalize the created file and make it world-accessible.  The
   * FSNamesystem will already know the blocks that make up the file.
   * Before we return, we make sure that all the file's blocks have
   * been reported by datanodes and are replicated correctly.
   */
  public int completeFile(String src, String holder) throws IOException {
    int status = completeFileInternal(src, holder);
    getEditLog().logSync();
    return status;
  }

  private synchronized int completeFileInternal(String src,
                                                String holder) throws IOException {
    NameNode.stateChangeLog.debug("DIR* NameSystem.completeFile: " + src + " for " + holder);
    if (isInSafeMode())
      throw new SafeModeException("Cannot complete file " + src, safeMode);
    FileUnderConstruction pendingFile = pendingCreates.get(src);

    Block[] fileBlocks =  dir.getFileBlocks(src);
    if (fileBlocks!= null || pendingFile == null) {   
      NameNode.stateChangeLog.warn("DIR* NameSystem.completeFile: "
                                   + "failed to complete " + src
                                   + " because dir.getFileBlocks() is " +
                                   ((fileBlocks == null) ? "null":"non-null") +
                                   " and pendingFile is " +
                                   ((pendingFile == null) ? "null" :
                                     ("from " + pendingFile.getClientMachine()))
                                  );                     
      return OPERATION_FAILED;
    } else if (!checkFileProgress(pendingFile, true)) {
      return STILL_WAITING;
    }
       
    Collection<Block> blocks = pendingFile.getBlocks();
    int nrBlocks = blocks.size();
    Block pendingBlocks[] = blocks.toArray(new Block[nrBlocks]);

    //
    // We have the pending blocks, but they won't have
    // length info in them (as they were allocated before
    // data-write took place). Find the block stored in
    // node descriptor.
    //
    for (int i = 0; i < nrBlocks; i++) {
      Block b = pendingBlocks[i];
      Block storedBlock = blocksMap.getStoredBlock(b);
      if (storedBlock != null) {
        pendingBlocks[i] = storedBlock;
      }
    }
       
    //
    // Now we can add the (name,blocks) tuple to the filesystem
    //
    if (!dir.addFile(src, pendingBlocks, pendingFile.getReplication())) {
      return OPERATION_FAILED;
    }

    // The file is no longer pending
    pendingCreates.remove(src);
    NameNode.stateChangeLog.debug(
                                  "DIR* NameSystem.completeFile: " + src
                                  + " is removed from pendingCreates");

    synchronized (leases) {
      Lease lease = getLease(holder);
      if (lease != null) {
        lease.completedCreate(src);
        if (!lease.hasLocks()) {
          removeLease(holder);
          sortedLeases.remove(lease);
        }
      }
    }

    //
    // REMIND - mjc - this should be done only after we wait a few secs.
    // The namenode isn't giving datanodes enough time to report the
    // replicated blocks that are automatically done as part of a client
    // write.
    //

    // Now that the file is real, we need to be sure to replicate
    // the blocks.
    int numExpectedReplicas = pendingFile.getReplication();
    for (int i = 0; i < nrBlocks; i++) {
      // filter out containingNodes that are marked for decommission.
      NumberReplicas number = countNodes(pendingBlocks[i]);
      if (number.liveReplicas() < numExpectedReplicas) {
        neededReplications.add(pendingBlocks[i],
                               number.liveReplicas(),
                               number.decommissionedReplicas,
                               numExpectedReplicas);
      }
    }
    return COMPLETE_SUCCESS;
  }

  static Random randBlockId = new Random();
   
  /**
   * Allocate a block at the given pending filename
   */
  private Block allocateBlock(String src) throws IOException {
    Block b = null;
    do {
      b = new Block(FSNamesystem.randBlockId.nextLong(), 0);
    } while (isValidBlock(b));
    pendingCreates.addBlock(src, b);
    NameNode.stateChangeLog.info("BLOCK* NameSystem.allocateBlock: "
                                 +src+ ". "+b.getBlockName()+
                                 " is created and added to pendingCreates and pendingCreateBlocks");     
    return b;
  }

  /**
   * Check that the indicated file's blocks are present and
   * replicated.  If not, return false. If checkall is true, then check
   * all blocks, otherwise check only penultimate block.
   */
  synchronized boolean checkFileProgress(FileUnderConstruction v, boolean checkall) {
    if (checkall) {
      //
      // check all blocks of the file.
      //
      for (Iterator<Block> it = v.getBlocks().iterator(); it.hasNext();) {
        if (blocksMap.numNodes(it.next()) < this.minReplication) {
          return false;
        }
      }
    } else {
      //
      // check the penultimate block of this file
      //
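      // (The last block is typically still being written by the client, so
      // only the next-to-last block is expected to have reached minimum
      // replication here -- an assumption about intent, not stated in the
      // original source.)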
      Block b = v.getPenultimateBlock();
      if (b != null) {
        if (blocksMap.numNodes(b) < this.minReplication) {
          return false;
        }
      }
    }
    return true;
  }

  /**
   * Adds block to list of blocks which will be invalidated on
   * specified datanode.
   */
  private void addToInvalidates(Block b, DatanodeInfo n) {
    Collection<Block> invalidateSet = recentInvalidateSets.get(n.getStorageID());
    if (invalidateSet == null) {
      invalidateSet = new ArrayList<Block>();
      recentInvalidateSets.put(n.getStorageID(), invalidateSet);
    }
    invalidateSet.add(b);
  }

  /**
   * dumps the contents of recentInvalidateSets
   */
  private synchronized void dumpRecentInvalidateSets(PrintWriter out) {
    Collection<Collection<Block>> values = recentInvalidateSets.values();
    Iterator<Map.Entry<String,Collection<Block>>> it =
      recentInvalidateSets.entrySet().iterator();
    if (values.size() == 0) {
      out.println("Metasave: Blocks waiting deletion: 0");
      return;
    }
    out.println("Metasave: Blocks waiting deletion from " +
                values.size() + " datanodes.");
    while (it.hasNext()) {
      Map.Entry<String,Collection<Block>> entry = it.next();
      String storageId = (String) entry.getKey();
      DatanodeDescriptor node = datanodeMap.get(storageId);
      Collection<Block> blklist = entry.getValue();
      if (blklist.size() > 0) {
        out.print(node.getName());
        for (Iterator jt = blklist.iterator(); jt.hasNext();) {
          Block block = (Block) jt.next();
          out.print(" " + block);
        }
        out.println("");
      }
    }
  }

  /**
   * Invalidates the given block on the given datanode.
   */
  public synchronized void invalidateBlock(Block blk, DatanodeInfo dn)
    throws IOException {
    NameNode.stateChangeLog.info("DIR* NameSystem.invalidateBlock: "
                                 + blk.getBlockName() + " on "
                                 + dn.getName());
    if (isInSafeMode()) {
      throw new SafeModeException("Cannot invalidate block " + blk.getBlockName(), safeMode);
    }

    // Check how many copies we have of the block.  If we have at least one
    // copy on a live node, then we can delete it.
    int count = countNodes(blk).liveReplicas();
    if (count > 1) {
      addToInvalidates(blk, dn);
      removeStoredBlock(blk, getDatanode(dn));
      NameNode.stateChangeLog.debug("BLOCK* NameSystem.invalidateBlocks: "
                                   + blk.getBlockName() + " on "
                                   + dn.getName() + " listed for deletion.");
    } else {
      NameNode.stateChangeLog.info("BLOCK* NameSystem.invalidateBlocks: "
                                   + blk.getBlockName() + " on "
                                   + dn.getName() + " is the only copy and was not deleted.");
    }
  }

  ////////////////////////////////////////////////////////////////
  // Here's how to handle block-copy failure during client write:
  // -- As usual, the client's write should result in a streaming
  // backup write to a k-machine sequence.
  // -- If one of the backup machines fails, no worries.  Fail silently.
  // -- Before client is allowed to close and finalize file, make sure
  // that the blocks are backed up.  Namenode may have to issue specific backup
  // commands to make up for earlier datanode failures.  Once all copies
  // are made, edit namespace and return to client.
  ////////////////////////////////////////////////////////////////

  public boolean renameTo(String src, String dst) throws IOException {
    boolean status = renameToInternal(src, dst);
    getEditLog().logSync();
    return status;
  }

  /**
   * Change the indicated filename.
   */
  public synchronized boolean renameToInternal(String src, String dst) throws IOException {
    NameNode.stateChangeLog.debug("DIR* NameSystem.renameTo: " + src + " to " + dst);
    if (isInSafeMode())
      throw new SafeModeException("Cannot rename " + src, safeMode);
    if (!isValidName(dst)) {
      throw new IOException("Invalid name: " + dst);
    }
    return dir.renameTo(src, dst);
  }

  /**
   * Remove the indicated filename from the namespace.  This may
   * invalidate some blocks that make up the file.
   */
  public boolean delete(String src) throws IOException {
    boolean status = deleteInternal(src, true);
    getEditLog().logSync();
    return status;
  }

  /**
   * An internal delete function that does not enforce safe mode
   */
  boolean deleteInSafeMode(String src) throws IOException {
    boolean status = deleteInternal(src, false);
    getEditLog().logSync();
    return status;
  }
  /**
   * Remove the indicated filename from the namespace.  This may
   * invalidate some blocks that make up the file.
   */
  private synchronized boolean deleteInternal(String src,
                                              boolean enforceSafeMode)
                                              throws IOException {
    NameNode.stateChangeLog.debug("DIR* NameSystem.delete: " + src);
    if (enforceSafeMode && isInSafeMode())
      throw new SafeModeException("Cannot delete " + src, safeMode);
    Block deletedBlocks[] = dir.delete(src);
    if (deletedBlocks != null) {
      for (int i = 0; i < deletedBlocks.length; i++) {
        Block b = deletedBlocks[i];
               
        for (Iterator<DatanodeDescriptor> it =
               blocksMap.nodeIterator(b); it.hasNext();) {
          DatanodeDescriptor node = it.next();
          addToInvalidates(b, node);
          NameNode.stateChangeLog.info("BLOCK* NameSystem.delete: "
                                        + b.getBlockName() + " is added to invalidSet of "
                                        + node.getName());
        }
      }
    }

    return (deletedBlocks != null);
  }

  /**
   * Return whether the given filename exists
   */
  public boolean exists(String src) {
    if (dir.getFileBlocks(src) != null || dir.isDir(src)) {
      return true;
    } else {
      return false;
    }
  }

  /**
   * Whether the given name is a directory
   */
  public boolean isDir(String src) {
    return dir.isDir(src);
  }

  /** Get the file info for a specific file.
   * @param src The string representation of the path to the file
   * @throws IOException if file does not exist
   * @return object containing information regarding the file
   */
  DFSFileInfo getFileInfo(String src) throws IOException {
    return dir.getFileInfo(src);
  }

  /**
   * Whether the pathname is valid.  Currently prohibits relative paths,
   * the path components "." and "..", and names which contain a ":" or "/".
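   * <p>
   * For example, "/user/data/part-0" is accepted, while "data/part-0"
   * (relative), "/user/../data", "/user/./data" and "/user/a:b" are all
   * rejected.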
   */
  static boolean isValidName(String src) {
     
    // Path must be absolute.
    if (!src.startsWith(Path.SEPARATOR)) {
      return false;
    }
     
    // Check for ".." "." ":" "/"
    StringTokenizer tokens = new StringTokenizer(src, Path.SEPARATOR);
    while(tokens.hasMoreTokens()) {
      String element = tokens.nextToken();
      if (element.equals("..") ||
          element.equals("."||
          (element.indexOf(":") >= 0||
          (element.indexOf("/") >= 0)) {
        return false;
      }
    }
    return true;
  }
  /**
   * Create all the necessary directories
   */
  public boolean mkdirs(String src) throws IOException {
    boolean status = mkdirsInternal(src);
    getEditLog().logSync();
    return status;
  }
   
  /**
   * Create all the necessary directories
   */
  private synchronized boolean mkdirsInternal(String src) throws IOException {
    boolean    success;
    NameNode.stateChangeLog.debug("DIR* NameSystem.mkdirs: " + src);
    if (isInSafeMode())
      throw new SafeModeException("Cannot create directory " + src, safeMode);
    if (!isValidName(src)) {
      throw new IOException("Invalid directory name: " + src);
    }
    success = dir.mkdirs(src, now());
    if (!success) {
      throw new IOException("Invalid directory name: " + src);
    }
    return success;
  }

  /************************************************************
   * A Lease governs all the locks held by a single client.
   * For each client there's a corresponding lease, whose
   * timestamp is updated when the client periodically
   * checks in.  If the client dies and allows its lease to
   * expire, all the corresponding locks can be released.
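   *
   * Two timeouts apply (see startFileInternal and LeaseMonitor below):
   * once the soft limit expires, another client may reclaim a file being
   * created; once the hard limit expires, the LeaseMonitor releases all of
   * the lease's locks and pending creates.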
   *************************************************************/
  class Lease implements Comparable<Lease> {
    private StringBytesWritable holder;
    private long lastUpdate;
    private Collection<StringBytesWritable> locks = new TreeSet<StringBytesWritable>();
    private Collection<StringBytesWritable> creates = new TreeSet<StringBytesWritable>();

    public Lease(String holder) throws IOException {
      this.holder = new StringBytesWritable(holder);
      renew();
    }
    public void renew() {
      this.lastUpdate = now();
    }
    /**
     * Returns true if the Hard Limit Timer has expired
     */
    public boolean expiredHardLimit() {
      if (now() - lastUpdate > LEASE_HARDLIMIT_PERIOD) {
        return true;
      }
      return false;
    }
    /**
     * Returns true if the Soft Limit Timer has expired
     */
    public boolean expiredSoftLimit() {
      if (now() - lastUpdate > LEASE_SOFTLIMIT_PERIOD) {
        return true;
      }
      return false;
    }
    public void obtained(String src) throws IOException {
      locks.add(new StringBytesWritable(src));
    }
    public void released(String src) throws IOException {
      locks.remove(new StringBytesWritable(src));
    }
    public void startedCreate(String src) throws IOException {
      creates.add(new StringBytesWritable(src));
    }
    public boolean completedCreate(String src) throws IOException {
      return creates.remove(new StringBytesWritable(src));
    }
    public boolean hasLocks() {
      return (locks.size() + creates.size()) > 0;
    }
    public void releaseLocks() throws IOException {
      String holderStr = holder.getString();
      for (Iterator<StringBytesWritable> it = locks.iterator(); it.hasNext();)
        internalReleaseLock(it.next().getString(), holderStr);
      locks.clear();
      for (Iterator<StringBytesWritable> it = creates.iterator(); it.hasNext();)
        internalReleaseCreate(it.next().getString(), holderStr);
      creates.clear();
    }

    /**
     */
    public String toString() {
      return "[Lease.  Holder: " + holder.toString() + ", heldlocks: " +
        locks.size() + ", pendingcreates: " + creates.size() + "]";
    }

    /**
     */
    public int compareTo(Lease o) {
      Lease l1 = this;
      Lease l2 = o;
      long lu1 = l1.lastUpdate;
      long lu2 = l2.lastUpdate;
      if (lu1 < lu2) {
        return -1;
      } else if (lu1 > lu2) {
        return 1;
      } else {
        return l1.holder.compareTo(l2.holder);
      }
    }

    public boolean equals(Object o) {
      if (!(o instanceof Lease)) {
        return false;
      }
      Lease obj = (Lease) o;
      if (lastUpdate == obj.lastUpdate &&
          holder.equals(obj.holder)) {
        return true;
      }
      return false;
    }

    public int hashCode() {
      return holder.hashCode();
    }
   
    String getHolder() throws IOException {
      return holder.getString();
    }
  }
 
  /******************************************************
   * LeaseMonitor checks for leases that have expired,
   * and disposes of them.
   ******************************************************/
  class LeaseMonitor implements Runnable {
    public void run() {
      try {
        while (fsRunning) {
          synchronized (FSNamesystem.this) {
            synchronized (leases) {
              Lease top;
              while ((sortedLeases.size() > 0) &&
                     ((top = sortedLeases.first()) != null)) {
                if (top.expiredHardLimit()) {
                  top.releaseLocks();
                  leases.remove(top.holder);
                  LOG.info("Removing lease " + top + ", leases remaining: " + sortedLeases.size());
                  if (!sortedLeases.remove(top)) {
                    LOG.info("Unknown failure trying to remove " + top + " from lease set.");
                  }
                } else {
                  break;
                }
              }
            }
          }
          try {
            Thread.sleep(2000);
          } catch (InterruptedException ie) {
          }
        }
      } catch (Exception e) {
        FSNamesystem.LOG.error(StringUtils.stringifyException(e));
      }
    }
  }
 
  private Lease getLease(String holder) throws IOException {
    return leases.get(new StringBytesWritable(holder));
  }
 
  private void putLease(String holder, Lease lease) throws IOException {
    leases.put(new StringBytesWritable(holder), lease);
  }
 
  private void removeLease(String holder) throws IOException {
    leases.remove(new StringBytesWritable(holder));
  }

  /**
   * Get a lock (perhaps exclusive) on the given file
   */
  /** @deprecated */
  @Deprecated
  public synchronized int obtainLock(UTF8 src,
                                     UTF8 holder,
                                     boolean exclusive) throws IOException {
    if (isInSafeMode())
      throw new SafeModeException("Cannot lock file " + src, safeMode);
    return OPERATION_FAILED;
  }

  /**
   * Release the lock on the given file
   */
  /** @deprecated */
  @Deprecated
  public synchronized int releaseLock(UTF8 src, UTF8 holder) {
    return OPERATION_FAILED;
  }
  private int internalReleaseLock(String src, String holder) throws IOException {
    return dir.releaseLock(src, holder);
  }

  /**
   * Release a pending file creation lock.
   * @param src The filename
   * @param holder The datanode that was creating the file
   */
  private void internalReleaseCreate(String src, String holder) throws IOException {
    boolean status =  pendingCreates.remove(src);
    if (status) {
      NameNode.stateChangeLog.debug("DIR* NameSystem.internalReleaseCreate: " + src
                                    + " is removed from pendingCreates for "
                                    + holder + " (failure)");
    } else {
      NameNode.stateChangeLog.warn("DIR* NameSystem.internalReleaseCreate: "
                                   + "attempt to release a create lock on "+ src
                                   + " that was not in pedingCreates");
    }
  }

  /**
   * Renew the lease(s) held by the given client
   */
  public void renewLease(String holder) throws IOException {
    synchronized (leases) {
      if (isInSafeMode())
        throw new SafeModeException("Cannot renew lease for " + holder, safeMode);
      Lease lease = getLease(holder);
      if (lease != null) {
        sortedLeases.remove(lease);
        lease.renew();
        sortedLeases.add(lease);
      }
    }
  }
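  // Note: renewLease() removes the lease from sortedLeases before calling
  // renew() and re-inserts it afterwards because the sorted set orders leases
  // by lastUpdate (and then by holder, see the compare logic in Lease above);
  // mutating the ordering key in place would corrupt the set. An illustrative
  // sketch of a client-side loop that keeps a lease alive (names are
  // hypothetical, not part of this class) might look like:
  //
  //   while (clientRunning) {
  //     namenode.renewLease(clientName);          // assumed RPC wrapper
  //     Thread.sleep(renewalIntervalMillis);      // well under the soft limit
  //   }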

  /**
   * Get a listing of all files at 'src'.  The returned DFSFileInfo[] array
   * carries the file attributes for each entry.
   */
  public DFSFileInfo[] getListing(String src) {
    return dir.getListing(src);
  }

  /////////////////////////////////////////////////////////
  //
  // These methods are called by datanodes
  //
  /////////////////////////////////////////////////////////
  /**
   * Register Datanode.
   * <p>
   * The purpose of registration is to determine whether the new datanode
   * serves a new data storage, and will therefore report data block copies
   * the namenode was not yet aware of, or whether it is a replacement node
   * for a data storage previously served by a different (or the same, in
   * terms of host:port) datanode.
   * The data storages are distinguished by their storageIDs. When a new
   * data storage is reported the namenode issues a new unique storageID.
   * <p>
   * Finally, the namenode returns its namespaceID as the registrationID
   * for the datanodes.
   * namespaceID is a persistent attribute of the name space.
   * The registrationID is checked every time the datanode is communicating
   * with the namenode.
   * Datanodes with inappropriate registrationID are rejected.
   * If the namenode stops and then restarts, it can restore its
   * namespaceID and continue serving the datanodes that have previously
   * registered with it, without restarting the whole cluster.
   *
   * @see DataNode#register()
   */
  public void registerDatanode(DatanodeRegistration nodeReg,
                                            String networkLocation
                                            ) throws IOException {
    registerDatanodeInternal(nodeReg, networkLocation);
    getEditLog().logSync();
  }

  private synchronized void registerDatanodeInternal(
                                            DatanodeRegistration nodeReg,
                                            String networkLocation
                                            ) throws IOException {

    if (!verifyNodeRegistration(nodeReg)) {
      throw new DisallowedDatanodeException(nodeReg);
    }

    String dnAddress = Server.getRemoteAddress();
    if (dnAddress == null) {
      // getRemoteAddress() returns null when this is not called inside an
      // RPC; reject such registrations.
      throw new IOException("Could not find remote address for " +
                            "registration from " + nodeReg.getName());
    }     

    String hostName = nodeReg.getHost();
     
    // update the datanode's name with ip:port
    DatanodeID dnReg = new DatanodeID(dnAddress + ":" + nodeReg.getPort(),
                                      nodeReg.getStorageID(),
                                      nodeReg.getInfoPort());
    nodeReg.updateRegInfo(dnReg);
     
    NameNode.stateChangeLog.info(
                                 "BLOCK* NameSystem.registerDatanode: "
                                 + "node registration from " + nodeReg.getName()
                                 + " storage " + nodeReg.getStorageID());

    DatanodeDescriptor nodeS = datanodeMap.get(nodeReg.getStorageID());
    DatanodeDescriptor nodeN = host2DataNodeMap.getDatanodeByName(nodeReg.getName());
     
    if (nodeN != null && nodeN != nodeS) {
      NameNode.LOG.info("BLOCK* NameSystem.registerDatanode: "
                        + "node from name: " + nodeN.getName());
      // nodeN previously served a different data storage,
      // which is not served by anybody anymore.
      removeDatanode(nodeN);
      // physically remove node from datanodeMap
      wipeDatanode(nodeN);
      // and log removal
      getEditLog().logRemoveDatanode(nodeN);
      nodeN = null;
    }

    if (nodeS != null) {
      if (nodeN == nodeS) {
        // The same datanode has been just restarted to serve the same data
        // storage. We do not need to remove old data blocks, the delta will
        // be calculated on the next block report from the datanode
        NameNode.stateChangeLog.debug("BLOCK* NameSystem.registerDatanode: "
                                      + "node restarted.");
      } else {
        // nodeS is found
        // The registering datanode is a replacement node for the existing
        // data storage, which from now on will be served by a new node.
        NameNode.stateChangeLog.debug(
                                      "BLOCK* NameSystem.registerDatanode: "
                                      + "node " + nodeS.getName()
                                      + " is replaced by " + nodeReg.getName() + ".");
        getEditLog().logRemoveDatanode(nodeS);
      }
      // update cluster map
      clusterMap.remove(nodeS);
      nodeS.updateRegInfo(nodeReg);
      nodeS.setNetworkLocation(networkLocation);
      clusterMap.add(nodeS);
      nodeS.setHostName(hostName);
      if ( nodeS != nodeN ) {
        getEditLog().logAddDatanode( nodeS );
      }
       
      // also treat the registration message as a heartbeat
      synchronized(heartbeats) {
        if( !heartbeats.contains(nodeS)) {
          heartbeats.add(nodeS);
          //update its timestamp
          nodeS.updateHeartbeat(0L, 0L, 0);
          nodeS.isAlive = true;
        }
      }
      return;
    }

    // this is a new datanode serving a new data storage
    if (nodeReg.getStorageID().equals("")) {
      // this data storage has never been registered
      // it is either empty or was created by pre-storageID version of DFS
      nodeReg.storageID = newStorageID();
      NameNode.stateChangeLog.debug(
                                    "BLOCK* NameSystem.registerDatanode: "
                                    + "new storageID " + nodeReg.getStorageID() + " assigned.");
    }
    // register new datanode
    DatanodeDescriptor nodeDescr
      = new DatanodeDescriptor(nodeReg, networkLocation, hostName);
    unprotectedAddDatanode(nodeDescr);
    clusterMap.add(nodeDescr);
    getEditLog().logAddDatanode(nodeDescr);
     
    // also treat the registration message as a heartbeat
    synchronized(heartbeats) {
      heartbeats.add(nodeDescr);
      nodeDescr.isAlive = true;
      // no need to update its timestamp
      // because it is done when the descriptor is created
    }
    return;
  }
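  // Summary of the registration cases handled above, keyed by the two lookups
  // nodeS (by storageID) and nodeN (by host:port):
  //  - nodeS == null, nodeN == null: a brand new data storage; a storageID is
  //    assigned if the registration carried an empty one, and a fresh
  //    descriptor is added.
  //  - nodeS != null, nodeN == nodeS: the same datanode restarted with the
  //    same storage; the registration is treated as a heartbeat and the block
  //    delta is reconciled at the next block report.
  //  - nodeS != null, nodeN != nodeS: the registering datanode replaces the
  //    node that previously served this storage; the existing descriptor is
  //    updated in place (and any stale descriptor under the same host:port is
  //    wiped first).
  //  - nodeS == null, nodeN != null: this host:port previously served a
  //    different storage that is now abandoned; the stale descriptor is wiped
  //    and the registration then proceeds as a new data storage.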
   
  /**
   * Get registrationID for datanodes based on the namespaceID.
   *
   * @see #registerDatanode(DatanodeRegistration,String)
   * @see FSImage#newNamespaceID()
   * @return registration ID
   */
  public String getRegistrationID() {
    return Storage.getRegistrationID(dir.fsImage);
  }
   
  /**
   * Generate new storage ID.
   *
   * @return unique storage ID
   *
   * Note that collisions are still possible if somebody tries
   * to bring in a data storage from a different cluster.
   */
  private String newStorageID() {
    String newID = null;
    while(newID == null) {
      newID = "DS" + Integer.toString(r.nextInt());
      if (datanodeMap.get(newID) != null)
        newID = null;
    }
    return newID;
  }
   
  private boolean isDatanodeDead(DatanodeDescriptor node) {
    return (node.getLastUpdate() <
            (now() - heartbeatExpireInterval));
  }
   
  void setDatanodeDead(DatanodeID nodeID) throws IOException {
    DatanodeDescriptor node = getDatanode(nodeID);
    node.setLastUpdate(0);
  }
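  // setDatanodeDead() forces isDatanodeDead() to return true on the next
  // check by resetting lastUpdate to 0, which is necessarily older than
  // now() - heartbeatExpireInterval.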

  /**
   * The given node has reported in.  This method should:
   * 1) Record the heartbeat, so the datanode isn't timed out
   * 2) Adjust usage stats for future block allocation
   *
   * If a substantial amount of time passed since the last datanode
   * heartbeat then request an immediate block report. 
   *
   * @return true if registration is required or false otherwise.
   * @throws IOException
   */
  public boolean gotHeartbeat(DatanodeID nodeID,
                              long capacity,
                              long remaining,
                              int xceiverCount,
                              int xmitsInProgress,
                              Object[] xferResults,
                              Object deleteList[]
                              ) throws IOException {
    synchronized (heartbeats) {
      synchronized (datanodeMap) {
        DatanodeDescriptor nodeinfo;
        try {
          nodeinfo = getDatanode(nodeID);
          if (nodeinfo == null) {
            return true;
          }
        } catch(UnregisteredDatanodeException e) {
          return true;
        }
         
        // Check if this datanode should actually be shutdown instead.
        if (shouldNodeShutdown(nodeinfo)) {
          setDatanodeDead(nodeinfo);
          throw new DisallowedDatanodeException(nodeinfo);
        }

        if (!nodeinfo.isAlive) {
          return true;
        } else {
          updateStats(nodeinfo, false);
          nodeinfo.updateHeartbeat(capacity, remaining, xceiverCount);
          updateStats(nodeinfo, true);
          //
          // Extract pending replication work or block invalidation
          // work from the datanode descriptor
          //
          nodeinfo.getReplicationSets(this.maxReplicationStreams -
                                      xmitsInProgress, xferResults);
          if (xferResults[0] == null) {
            nodeinfo.getInvalidateBlocks(FSConstants.BLOCK_INVALIDATE_CHUNK,
                                         deleteList);
          }
          return false;
        }
      }
    }
  }
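  // Illustrative sketch (hypothetical caller, not part of this class) of how
  // the namenode RPC layer might consume gotHeartbeat(): xferResults receives
  // the blocks and replication targets, deleteList[0] receives the blocks to
  // invalidate, and a true return value means the datanode must re-register.
  //
  //   Object[] xferResults = new Object[2];
  //   Object[] deleteList = new Object[1];
  //   boolean needsRegistration = namesystem.gotHeartbeat(nodeReg, capacity,
  //       remaining, xceiverCount, xmitsInProgress, xferResults, deleteList);
  //   if (needsRegistration) {
  //     // ask the datanode to register again before handing it any work
  //   }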

  private void updateStats(DatanodeDescriptor node, boolean isAdded) {
    //
    // The statistics are protected by the heartbeat lock
    //
    assert(Thread.holdsLock(heartbeats));
    if (isAdded) {
      totalCapacity += node.getCapacity();
      totalRemaining += node.getRemaining();
      totalLoad += node.getXceiverCount();
    } else {
      totalCapacity -= node.getCapacity();
      totalRemaining -= node.getRemaining();
      totalLoad -= node.getXceiverCount();
    }
  }
  /**
   * Periodically calls heartbeatCheck().
   */
  class HeartbeatMonitor implements Runnable {
    /**
     */
    public void run() {
      while (fsRunning) {
        try {
          heartbeatCheck();
        } catch (Exception e) {
          FSNamesystem.LOG.error(StringUtils.stringifyException(e));
        }
        try {
          Thread.sleep(heartbeatRecheckInterval);
        } catch (InterruptedException ie) {
        }
      }
    }
  }

  /**
   * Periodically calls computeReplicationWork().
   */
  class ReplicationMonitor implements Runnable {
    public void run() {
      while (fsRunning) {
        try {
          computeDatanodeWork();
          processPendingReplications();
          Thread.sleep(replicationRecheckInterval);
        } catch (InterruptedException ie) {
        } catch (IOException ie) {
          LOG.warn("ReplicationMonitor thread received exception. " + ie);
        } catch (Throwable t) {
          LOG.warn("ReplicationMonitor thread received Runtime exception. " + t);
          Runtime.getRuntime().exit(-1);
        }
      }
    }
  }

  /**
   * Look at a few datanodes and compute any replication work that
   * can be scheduled on them. The datanode will be informed of this
   * work at the next heartbeat.
   */
  void computeDatanodeWork() throws IOException {
    int numiter = 0;
    int foundwork = 0;
    int hsize = 0;
    int lastReplIndex = -1;

    while (true) {
      DatanodeDescriptor node = null;

      //
      // pick the datanode that was the last one in the
      // previous invocation of this method.
      //
      synchronized (heartbeats) {
        hsize = heartbeats.size();
        if (numiter++ >= hsize) {
          // no change in replIndex.
          if (lastReplIndex >= 0) {
            //next time, start after where the last replication was scheduled
            replIndex = lastReplIndex;
          }
          break;
        }
        if (replIndex >= hsize) {
          replIndex = 0;
        }
        node = heartbeats.get(replIndex);
        replIndex++;
      }

      //
      // Is there replication work to be computed for this datanode?
      //
      int precomputed = node.getNumberOfBlocksToBeReplicated();
      int needed = this.maxReplicationStreams - precomputed;
      boolean doReplication = false;
      boolean doInvalidation = false;
      if (needed > 0) {
        //
        // Compute replication work and store work into the datanode
        //
        Object replsets[] = pendingTransfers(node, needed);
        if (replsets != null) {
          doReplication = true;
          addBlocksToBeReplicated(node, (Block[])replsets[0],
                                  (DatanodeDescriptor[][])replsets[1]);
          lastReplIndex = replIndex;
        }
      }
      if (!doReplication) {
        //
        // Determine if block deletion is pending for this datanode
        //
        Block blocklist[] = blocksToInvalidate(node);
        if (blocklist != null) {
          doInvalidation = true;
          addBlocksToBeInvalidated(node, blocklist);
        }
      }
      if (doReplication || doInvalidation) {
        //
        // If we have already computed work for a predefined
        // number of datanodes in this iteration, then relax
        //
        if (foundwork > ((hsize * REPL_WORK_PER_ITERATION)/100)) {
          break;
        }
        foundwork++;
      }
    }
  }
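  // Worked example (values assumed for illustration): with 100 datanodes in
  // 'heartbeats' and REPL_WORK_PER_ITERATION = 32, the loop above scans nodes
  // round-robin starting at replIndex, stops once replication or invalidation
  // work has been found for roughly 32% of them (about 32 nodes here), and
  // remembers in lastReplIndex where the last replication was scheduled so the
  // next invocation resumes from that position.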

  /**
   * If there were any replication requests that timed out, reap them
   * and put them back into the neededReplication queue
   */
  void processPendingReplications() {
    Block[] timedOutItems = pendingReplications.getTimedOutBlocks();
    if (timedOutItems != null) {
      synchronized (this) {
        for (int i = 0; i < timedOutItems.length; i++) {
          NumberReplicas num = countNodes(timedOutItems[i]);
          neededReplications.add(timedOutItems[i],
                                 num.liveReplicas(),
                                 num.decommissionedReplicas(),
                                 getReplication(timedOutItems[i]));
        }
      }
    }
  }

  /**
   * Add more replication work for this datanode.
   */
  synchronized void addBlocksToBeReplicated(DatanodeDescriptor node,
                                            Block[] blocklist,
                                            DatanodeDescriptor[][] targets)
    throws IOException {
    //
    // Find the datanode with the FSNamesystem lock held.
    //
    DatanodeDescriptor n = getDatanode(node);
    if (n != null) {
      n.addBlocksToBeReplicated(blocklist, targets);
    }
  }

  /**
   * Add more block invalidation work for this datanode.
   */
  synchronized void addBlocksToBeInvalidated(DatanodeDescriptor node,
                                             Block[] blocklist) throws IOException {
    //
    // Find the datanode with the FSNamesystem lock held.
    //
    DatanodeDescriptor n = getDatanode(node);
    if (n != null) {
      n.addBlocksToBeInvalidated(blocklist);
    }
  }

  /**
   * remove a datanode descriptor
   * @param nodeID datanode ID
   */
  synchronized public void removeDatanode(DatanodeID nodeID)
    throws IOException {
    DatanodeDescriptor nodeInfo = getDatanode(nodeID);
    if (nodeInfo != null) {
      removeDatanode(nodeInfo);
    } else {
      NameNode.stateChangeLog.warn("BLOCK* NameSystem.removeDatanode: "
                                   + nodeID.getName() + " does not exist");
    }
  }
 
  /**
   * remove a datanode descriptor
   * @param nodeInfo datanode descriptor
   */
  private void removeDatanode(DatanodeDescriptor nodeInfo) {
    synchronized (heartbeats) {
      if (nodeInfo.isAlive) {
        updateStats(nodeInfo, false);
        heartbeats.remove(nodeInfo);
        nodeInfo.isAlive = false;
      }
    }

    for (Iterator<Block> it = nodeInfo.getBlockIterator(); it.hasNext();) {
      removeStoredBlock(it.next(), nodeInfo);
    }
    unprotectedRemoveDatanode(nodeInfo);
    clusterMap.remove(nodeInfo);
  }

  void unprotectedRemoveDatanode(DatanodeDescriptor nodeDescr) {
    // datanodeMap.remove(nodeDescr.getStorageID());
    // deaddatanodeMap.put(nodeDescr.getName(), nodeDescr);
    nodeDescr.resetBlocks();
    NameNode.stateChangeLog.debug(
                                  "BLOCK* NameSystem.unprotectedRemoveDatanode: "
                                  + nodeDescr.getName() + " is out of service now.");
  }
   
  void unprotectedAddDatanode(DatanodeDescriptor nodeDescr) {
    /* To keep host2DataNodeMap consistent with datanodeMap,
       remove  from host2DataNodeMap the datanodeDescriptor removed
       from datanodeMap before adding nodeDescr to host2DataNodeMap.
    */
    host2DataNodeMap.remove(
                            datanodeMap.put(nodeDescr.getStorageID(), nodeDescr));
    host2DataNodeMap.add(nodeDescr);
     
    NameNode.stateChangeLog.debug(
                                  "BLOCK* NameSystem.unprotectedAddDatanode: "
                                  + "node " + nodeDescr.getName() + " is added to datanodeMap.");
  }

   
  /**
   * Physically remove node from datanodeMap.
   *
   * @param nodeID node
   */
  void wipeDatanode(DatanodeID nodeID) throws IOException {
    String key = nodeID.getStorageID();
    host2DataNodeMap.remove(datanodeMap.remove(key));
    NameNode.stateChangeLog.debug(
                                  "BLOCK* NameSystem.wipeDatanode: "
                                  + nodeID.getName() + " storage " + key
                                  + " is removed from datanodeMap.");
  }

  FSImage getFSImage() {
    return dir.fsImage;
  }

  FSEditLog getEditLog() {
    return getFSImage().getEditLog();
  }

  /**
   * Check if there are any expired heartbeats, and if so,
   * whether any blocks have to be re-replicated.
   * While removing dead datanodes, make sure that only one datanode is marked
   * dead at a time within the synchronized section. Otherwise, a cascading
   * effect causes more datanodes to be declared dead.
   */
  void heartbeatCheck() {
    boolean allAlive = false;
    while (!allAlive) {
      boolean foundDead = false;
      DatanodeID nodeID = null;

      // locate the first dead node.
      synchronized(heartbeats) {
        for (Iterator<DatanodeDescriptor> it = heartbeats.iterator();
             it.hasNext();) {
          DatanodeDescriptor nodeInfo = it.next();
          if (isDatanodeDead(nodeInfo)) {
            foundDead = true;
            nodeID = nodeInfo;
            break;
          }
        }
      }

      // acquire the fsnamesystem lock, and then remove the dead node.
      if (foundDead) {
        synchronized (this) {
          synchronized(heartbeats) {
            synchronized (datanodeMap) {
              DatanodeDescriptor nodeInfo = null;
              try {
                nodeInfo = getDatanode(nodeID);
              } catch (IOException e) {
                nodeInfo = null;
              }
              if (nodeInfo != null && isDatanodeDead(nodeInfo)) {
                NameNode.stateChangeLog.info("BLOCK* NameSystem.heartbeatCheck: "
                                             + "lost heartbeat from " + nodeInfo.getName());
                removeDatanode(nodeInfo);
              }
            }
          }
        }
      }
      allAlive = !foundDead;
    }
  }
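  // Note on lock ordering in heartbeatCheck(): the candidate dead node is
  // located while holding only the 'heartbeats' lock, then removed while
  // holding the FSNamesystem, 'heartbeats' and 'datanodeMap' locks in that
  // order, re-checking isDatanodeDead() under the full locks before removal.
  // Removing at most one node per pass keeps each synchronized section short,
  // which is what the method comment means by avoiding a cascading effect.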
   
  /**
   * The given node is reporting all its blocks.  Use this info to
   * update the (machine-->blocklist) and (block-->machinelist) tables.
   */
  public synchronized Block[] processReport(DatanodeID nodeID,
                                            Block newReport[]
                                            ) throws IOException {
    if (NameNode.stateChangeLog.isDebugEnabled()) {
      NameNode.stateChangeLog.debug("BLOCK* NameSystem.processReport: "
                                    +"from "+nodeID.getName()+" "+newReport.length+" blocks");
    }
    DatanodeDescriptor node = getDatanode(nodeID);
    if (node == null) {
      throw new IOException("ProcessReport from unregisterted node: "
                            + nodeID.getName());
    }

    // Check if this datanode should actually be shutdown instead.
    if (shouldNodeShutdown(node)) {
      setDatanodeDead(node);
      throw new DisallowedDatanodeException(node);
    }

    //
    // Modify the (block-->datanode) map, according to the difference
    // between the old and new block report.
    //
    int newPos = 0;
    Iterator<Block> iter = node.getBlockIterator();
    Block oldblk = iter.hasNext() ? iter.next() : null;
    Block newblk = (newReport != null && newReport.length > 0) ?
      newReport[0] : null;

    // common case is that most of the blocks from the datanode
    // match blocks in the datanode descriptor.
    Collection<Block> toRemove = new LinkedList<Block>();
    Collection<Block> toAdd = new LinkedList<Block>();
       
    while (oldblk != null || newblk != null) {
          
      int cmp = (oldblk == null) ? 1 :
        ((newblk == null) ? -1 : oldblk.compareTo(newblk));

      if (cmp == 0) {
        // Do nothing, blocks are the same
        newPos++;
        oldblk = iter.hasNext() ? iter.next() : null;
        newblk = (newPos < newReport.length)
          ? newReport[newPos] : null;
      } else if (cmp < 0) {
        // The old report has a block the new one does not
        toRemove.add(oldblk);
        oldblk = iter.hasNext() ? iter.next() : null;
      } else {
        // The new report has a block the old one does not
        toAdd.add(newblk);
        newPos++;
        newblk = (newPos < newReport.length)
          ? newReport[newPos] : null;
      }
    }
       
    for (Iterator<Block> i = toRemove.iterator(); i.hasNext();) {
      Block b = i.next();
      removeStoredBlock(b, node);
      node.removeBlock(b);
    }
    for (Iterator<Block> i = toAdd.iterator(); i.hasNext();) {
      Block b = i.next();
      node.addBlock(addStoredBlock(b, node));
    }
       
    //
    // We've now completely updated the node's block report profile.
    // We now go through all its blocks and find which ones are invalid,
    // no longer pending, or over-replicated.
    //
    // (Note it's not enough to just invalidate blocks at lease expiry
    // time; datanodes can go down before the client's lease on
    // the failed file expires and miss the "expire" event.)
    //
    // This function considers every block on a datanode, and thus
    // should only be invoked infrequently.
    //
    Collection<Block> obsolete = new ArrayList<Block>();
    for (Iterator<Block> it = node.getBlockIterator(); it.hasNext();) {
      Block b = it.next();

      //
      // A block report can only send BLOCK_INVALIDATE_CHUNK number of
      // blocks to be deleted. If there are more blocks to be deleted,
      // they are added to recentInvalidateSets and will be sent out
      // through succeeding heartbeat responses.
      //
      if (!isValidBlock(b)) {
        if (obsolete.size() > FSConstants.BLOCK_INVALIDATE_CHUNK) {
          addToInvalidates(b, node);
        } else {
          obsolete.add(b);
        }
        NameNode.stateChangeLog.debug("BLOCK* NameSystem.processReport: "
                                      +"ask "+nodeID.getName()+" to delete "+b.getBlockName());
      }
    }
    return (Block[]) obsolete.toArray(new Block[obsolete.size()]);
  }
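  // Worked example of the report diff above (illustrative block IDs): if the
  // descriptor currently lists blocks {1, 3, 5} (sorted) and the new report is
  // {1, 4, 5}, the merge walks both sorted sequences and produces
  //   toRemove = {3}   (present before, missing from the report)
  //   toAdd    = {4}   (present in the report, unknown before)
  // while blocks 1 and 5 are left untouched.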

  /**
   * Modify (block-->datanode) map.  Remove block from set of
   * needed replications if this takes care of the problem.
   * @return the block that is stored in blockMap.
   */
  synchronized Block addStoredBlock(Block block, DatanodeDescriptor node) {
       
    FSDirectory.INode fileINode = blocksMap.getINode(block);
    int replication = (fileINode != null) ?  fileINode.getReplication() :
      defaultReplication;
    boolean added = blocksMap.addNode(block, node, replication);
       
    Block storedBlock = blocksMap.getStoredBlock(block); //extra look up!
    if (storedBlock != null && block != storedBlock) {
      if (block.getNumBytes() > 0) {
        storedBlock.setNumBytes(block.getNumBytes());
      }
      block = storedBlock;
    }
       
    int curReplicaDelta = 0;
       
    if (added) {
      curReplicaDelta = 1;
      //
      // At startup time, because too many new blocks come in
      // they take up lots of space in the log file.
      // So, we log only when namenode is out of safemode.
      //
      if (!isInSafeMode()) {
        NameNode.stateChangeLog.info("BLOCK* NameSystem.addStoredBlock: "
                                      +"blockMap updated: "+node.getName()+" is added to "+block.getBlockName());
      }
    } else {
      NameNode.stateChangeLog.warn("BLOCK* NameSystem.addStoredBlock: "
                                   + "Redundant addStoredBlock request received for "
                                   + block.getBlockName() + " on " + node.getName());
    }

    if (fileINode == null)  // block does not belong to any file
      return block;
       
    // filter out containingNodes that are marked for decommission.
    NumberReplicas num = countNodes(block);
    int numCurrentReplica = num.liveReplicas()
      + pendingReplications.getNumReplicas(block);
       
    // check whether safe replication is reached for the block
    // only if it is a part of a file
    incrementSafeBlockCount(numCurrentReplica);
    // handle underReplication/overReplication
    short fileReplication = fileINode.getReplication();
    if (numCurrentReplica >= fileReplication) {
      neededReplications.remove(block, numCurrentReplica,
                                num.decommissionedReplicas, fileReplication);
    } else {
      updateNeededReplications(block, curReplicaDelta, 0);
    }
    if (numCurrentReplica > fileReplication) {
      proccessOverReplicatedBlock(block, fileReplication);
    }
    return block;
  }
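  // Example of the replica accounting above (illustrative numbers): for a file
  // with replication factor 3, if countNodes() finds 2 live replicas and
  // pendingReplications reports 1 in flight, numCurrentReplica is 3, so the
  // block is dropped from neededReplications; had it been 4, the block would
  // additionally be handed to proccessOverReplicatedBlock().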
   
  /**
   * Find how many of the containing nodes are "extra", if any.
   * If there are any extras, call chooseExcessReplicates() to
   * mark them in the excessReplicateMap.
   */
  private void proccessOverReplicatedBlock(Block block, short replication) {
    Collection<DatanodeDescriptor> nonExcess = new ArrayList<DatanodeDescriptor>();
    for (Iterator<DatanodeDescriptor> it = blocksMap.nodeIterator(block);
         it.hasNext();) {
      DatanodeDescriptor cur = it.next();
      Collection<Block> excessBlocks = excessReplicateMap.get(cur.getStorageID());
      if (excessBlocks == null || !excessBlocks.contains(block)) {
        if (!cur.isDecommissionInProgress() && !cur.isDecommissioned()) {
          nonExcess.add(cur);
        }
      }
    }
    chooseExcessReplicates(nonExcess, block, replication);   
  }

  /**
   * We want "replication" replicas of the block, but we now have too many.
   * In this method, pick excess replicas out of 'nonExcess' until
   *
   * nonExcess.size() == replication
   *
   * We pick nodes so that the remaining replicas stay spread across racks,
   * and among the candidates we prefer the one with the least free space.
   * The algorithm first picks a node with the least free space from the nodes
   * that are on a rack holding more than one replica of the block,
   * so removing such a replica won't remove a rack.
   * If no such node is available,
   * it picks the node with the least free space overall.
   */
  void chooseExcessReplicates(Collection<DatanodeDescriptor> nonExcess,
                              Block b, short replication) {
    // first form a rack to datanodes map and
    HashMap<String, ArrayList<DatanodeDescriptor>> rackMap =
      new HashMap<String, ArrayList<DatanodeDescriptor>>();
    for (Iterator<DatanodeDescriptor> iter = nonExcess.iterator();
         iter.hasNext();) {
      DatanodeDescriptor node = iter.next();
      String rackName = node.getNetworkLocation();
      ArrayList<DatanodeDescriptor> datanodeList = rackMap.get(rackName);
      if(datanodeList==null) {
        datanodeList = new ArrayList<DatanodeDescriptor>();
      }
      datanodeList.add(node);
      rackMap.put(rackName, datanodeList);
    }
   
    // split nodes into two sets
    // priSet contains nodes on rack with more than one replica
    // remains contains the remaining nodes
    ArrayList<DatanodeDescriptor> priSet = new ArrayList<DatanodeDescriptor>();
    ArrayList<DatanodeDescriptor> remains = new ArrayList<DatanodeDescriptor>();
    for( Iterator<Entry<String, ArrayList<DatanodeDescriptor>>> iter =
      rackMap.entrySet().iterator(); iter.hasNext(); ) {
      Entry<String, ArrayList<DatanodeDescriptor>> rackEntry = iter.next();
      ArrayList<DatanodeDescriptor> datanodeList = rackEntry.getValue();
      if( datanodeList.size() == 1 ) {
        remains.add(datanodeList.get(0));
      } else {
        priSet.addAll(datanodeList);
      }
    }
   
    // pick one node with least space from priSet if it is not empty
    // otherwise one node with least space from remains
    while (nonExcess.size() - replication > 0) {
      DatanodeInfo cur = null;
      long minSpace = Long.MAX_VALUE;

      Iterator<DatanodeDescriptor> iter =
        priSet.isEmpty() ? remains.iterator() : priSet.iterator();
      while( iter.hasNext() ) {
        DatanodeDescriptor node = iter.next();
        long free = node.getRemaining();
               
        if (minSpace > free) {
          minSpace = free;
          cur = node;
        }
      }

      // adjust rackmap, priSet, and remains
      String rack = cur.getNetworkLocation();
      ArrayList<DatanodeDescriptor> datanodes = rackMap.get(rack);
      datanodes.remove(cur);
      if(datanodes.isEmpty()) {
        rackMap.remove(rack);
      }
      if (priSet.isEmpty()) {
        remains.remove(cur);
      } else {
        priSet.remove(cur);
        if (datanodes.size() == 1) {
          priSet.remove(datanodes.get(0));
          remains.add(datanodes.get(0));
        }
      }

      nonExcess.remove(cur);

      Collection<Block> excessBlocks = excessReplicateMap.get(cur.getStorageID());
      if (excessBlocks == null) {
        excessBlocks = new TreeSet<Block>();
        excessReplicateMap.put(cur.getStorageID(), excessBlocks);
      }
      excessBlocks.add(b);
      NameNode.stateChangeLog.debug("BLOCK* NameSystem.chooseExcessReplicates: "
                                    +"("+cur.getName()+", "+b.getBlockName()+") is added to excessReplicateMap");

      //
      // The 'excessBlocks' set tracks blocks until we get confirmation
      // that the datanode has deleted them; the only way we remove them
      // is when we get a "removeBlock" message.
      //
      // The 'invalidate' list is used to inform the datanode that the block
      // should be deleted.  Items are removed from the invalidate list
      // upon giving the delete instructions to the datanode.
      //
      Collection<Block> invalidateSet = recentInvalidateSets.get(cur.getStorageID());
      if (invalidateSet == null) {
        invalidateSet = new ArrayList<Block>();
        recentInvalidateSets.put(cur.getStorageID(), invalidateSet);
      }
      invalidateSet.add(b);
      NameNode.stateChangeLog.debug("BLOCK* NameSystem.chooseExcessReplicates: "
                                    +"("+cur.getName()+", "+b.getBlockName()+") is added to recentInvalidateSets");
    }
  }
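  // Worked example (hypothetical topology): a block with replication factor 3
  // is found on four datanodes, two of them on rack /r1 and one each on /r2
  // and /r3. Only the /r1 entry of rackMap holds more than one node, so both
  // /r1 nodes form priSet while the /r2 and /r3 nodes form 'remains'. One
  // excess replica must go, and it is picked from priSet as the /r1 node with
  // the least remaining space, so the block still spans three racks afterwards.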

  /**
   * Modify (block-->datanode) map.  Possibly generate
   * replication tasks, if the removed block is still valid.
   */
  synchronized void removeStoredBlock(Block block, DatanodeDescriptor node) {
    NameNode.stateChangeLog.debug("BLOCK* NameSystem.removeStoredBlock: "
                                  +block.getBlockName() + " from "+node.getName());
    if (!blocksMap.removeNode(block, node)) {
      NameNode.stateChangeLog.debug("BLOCK* NameSystem.removeStoredBlock: "
                                    +block.getBlockName()+" has already been removed from node "+node);
      return;
    }
       
    decrementSafeBlockCount(block);
    //
    // It's possible that the block was removed because of a datanode
    // failure.  If the block is still valid, check if replication is
    // necessary.  In that case, put block on a possibly-will-
    // be-replicated list.
    //
    FSDirectory.INode fileINode = blocksMap.getINode(block);
    if (fileINode != null) {
      updateNeededReplications(block, -1, 0);
    }

    //
    // We've removed a block from a node, so it's definitely no longer
    // in "excess" there.
    //
    Collection<Block> excessBlocks = excessReplicateMap.get(node.getStorageID());
    if (excessBlocks != null) {
      excessBlocks.remove(block);
      NameNode.stateChangeLog.debug("BLOCK* NameSystem.removeStoredBlock: "
                                    +block.getBlockName()+" is removed from excessBlocks");
      if (excessBlocks.size() == 0) {
        excessReplicateMap.remove(node.getStorageID());
      }
    }
  }

  /**
   * The given node is reporting that it received a certain block.
   */
  public synchronized void blockReceived(DatanodeID nodeID, 
                                         Block block
                                         ) throws IOException {
    DatanodeDescriptor node = getDatanode(nodeID);
    if (node == null) {
      NameNode.stateChangeLog.warn("BLOCK* NameSystem.blockReceived: "
                                   + block.getBlockName() + " is received from an unrecorded node "
                                   + nodeID.getName());
      throw new IllegalArgumentException(
                                         "Unexpected exception.  Got blockReceived message from node "
                                         + block.getBlockName() + ", but there is no info for it");
    }
       
    if (NameNode.stateChangeLog.isDebugEnabled()) {
      NameNode.stateChangeLog.debug("BLOCK* NameSystem.blockReceived: "
                                    +block.getBlockName()+" is received from " + nodeID.getName());
    }

    // Check if this datanode should actually be shutdown instead.
    if (shouldNodeShutdown(node)) {
      setDatanodeDead(node);
      throw new DisallowedDatanodeException(node);
    }

    //
    // Modify the blocks->datanode map and node's map.
    //
    node.addBlock(addStoredBlock(block, node));
    pendingReplications.remove(block);
  }

  /**
   * Total raw bytes.
   */
  public long totalCapacity() {

    synchronized (heartbeats) {
      return totalCapacity;
    }
  }

  /**
   * Total non-used raw bytes.
   */
  public long totalRemaining() {
    synchronized (heartbeats) {
      return totalRemaining;
    }
  }

  /**
   * Total number of connections.
   */
  public int totalLoad() {
    synchronized (heartbeats) {
      return totalLoad;
    }
  }

  public synchronized DatanodeInfo[] datanodeReport() {
    DatanodeInfo results[] = null;
    synchronized (datanodeMap) {
      results = new DatanodeInfo[datanodeMap.size()];
      int i = 0;
      for(Iterator<DatanodeDescriptor> it = datanodeMap.values().iterator(); it.hasNext();)
        results[i++] = new DatanodeInfo(it.next());
    }
    return results;
  }
   
  /**
   */
  public synchronized void DFSNodesStatus(ArrayList<DatanodeDescriptor> live,
                                          ArrayList<DatanodeDescriptor> dead) {
    synchronized (datanodeMap) {
      for(Iterator<DatanodeDescriptor> it = datanodeMap.values().iterator(); it.hasNext();) {
        DatanodeDescriptor node = it.next();
        if (isDatanodeDead(node))
          dead.add(node);
        else
          live.add(node);
      }
    }
  }

  /**
   * Prints information about all datanodes.
   */
  private synchronized void datanodeDump(PrintWriter out) {
    synchronized (datanodeMap) {
      out.println("Metasave: Number of datanodes: " + datanodeMap.size());
      for(Iterator<DatanodeDescriptor> it = datanodeMap.values().iterator(); it.hasNext();) {
        DatanodeDescriptor node = it.next();
        out.println(node.dumpDatanode());
      }
    }
  }

  /**
   * Start decommissioning the specified datanode.
   */
  private void startDecommission (DatanodeDescriptor node)
    throws IOException {

    if (!node.isDecommissionInProgress() && !node.isDecommissioned()) {
      LOG.info("Start Decommissioning node " + node.name);
      node.startDecommission();
      //
      // all the blocks that reside on this node have to be
      // replicated.
      Block decommissionBlocks[] = node.getBlocks();
      for (int j = 0; j < decommissionBlocks.length; j++) {
        updateNeededReplications(decommissionBlocks[j], -1, 0);
      }
    }
  }

  /**
   * Stop decommissioning the specified datanodes.
   */
  public void stopDecommission (DatanodeDescriptor node)
    throws IOException {
    LOG.info("Stop Decommissioning node " + node.name);
    node.stopDecommission();
  }

  /**
   */
  public DatanodeInfo getDataNodeInfo(String name) {
    return datanodeMap.get(name);
  }
  /**
   */
  public String getDFSNameNodeMachine() {
    return localMachine;
  }
  /**
   */
  public int getDFSNameNodePort() {
    return port;
  }
  /**
   */
  public Date getStartTime() {
    return startTime;
  }
   
  short getMaxReplication()     { return (short)maxReplication; }
  short getMinReplication()     { return (short)minReplication; }
  short getDefaultReplication() { return (short)defaultReplication; }
   
  /////////////////////////////////////////////////////////
  //
  // These methods are called by the Namenode system, to see
  // if there is any work for a given datanode.
  //
  /////////////////////////////////////////////////////////

  /**
   * Check if there are any recently-deleted blocks a datanode should remove.
   */
  public synchronized Block[] blocksToInvalidate(DatanodeID nodeID) {
    // Ask datanodes to perform block delete 
    // only if safe mode is off.
    if (isInSafeMode())
      return null;
      
    Collection<Block> invalidateSet = recentInvalidateSets.remove(
                                                                  nodeID.getStorageID());
    if (invalidateSet == null) {
      return null;
    }

    Iterator<Block> it = null;
    int sendNum = invalidateSet.size();
    int origSize = sendNum;
    ArrayList<Block> sendBlock = new ArrayList<Block>(sendNum);

    //
    // calculate the number of blocks that we send in one message
    //
    if (sendNum > FSConstants.BLOCK_INVALIDATE_CHUNK) {
      sendNum =  FSConstants.BLOCK_INVALIDATE_CHUNK;
    }
    //
    // Copy the first chunk into sendBlock
    //
    for (it = invalidateSet.iterator(); sendNum > 0; sendNum--) {
      assert(it.hasNext());
      sendBlock.add(it.next());
      it.remove();
    }

    //
    // If we could not send everything in this message, reinsert this item
    // into the collection.
    //
    if (it.hasNext()) {
      assert(origSize > FSConstants.BLOCK_INVALIDATE_CHUNK);
      recentInvalidateSets.put(nodeID.getStorageID(), invalidateSet);
    }
       
    if (NameNode.stateChangeLog.isInfoEnabled()) {
      StringBuffer blockList = new StringBuffer();
      for (int i = 0; i < sendBlock.size(); i++) {
        blockList.append(' ');
        Block block = sendBlock.get(i);
        blockList.append(block.getBlockName());
      }
      NameNode.stateChangeLog.info("BLOCK* NameSystem.blockToInvalidate: "
                                   +"ask "+nodeID.getName()+" to delete " + blockList);
    }
    return sendBlock.toArray(new Block[sendBlock.size()]);
  }
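  // Example of the chunking above (BLOCK_INVALIDATE_CHUNK value assumed): if a
  // datanode has 250 pending invalidations and the chunk size is 100, only the
  // first 100 blocks are handed out with this response; the remaining 150 are
  // put back into recentInvalidateSets and sent with later heartbeat responses.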


  /**
   * An immutable object that stores the number of live replicas and
   * the number of decommissioned replicas.
   */
  static class NumberReplicas {
    private int liveReplicas;
    private int decommissionedReplicas;

    NumberReplicas(int live, int decommissioned) {
      liveReplicas = live;
      decommissionedReplicas = decommissioned;
    }

    int liveReplicas() {
      return liveReplicas;
    }
    int decommissionedReplicas() {
      return decommissionedReplicas;
    }
  }

  /*
   * Counts the nodes produced by the given iterator into live and
   * decommissioned counters.
   */
  private NumberReplicas countNodes(Iterator<DatanodeDescriptor> nodeIter) {
    int count = 0;
    int live = 0;
    while ( nodeIter.hasNext() ) {
      DatanodeDescriptor node = nodeIter.next();
      if (node.isDecommissionInProgress() || node.isDecommissioned()) {
        count++;
      }
      else {
        live++;
      }
    }
    return new NumberReplicas(live, count);
  }

  /** return the number of nodes that are live and decommissioned. */
  private NumberReplicas countNodes(Block b) {
    return countNodes(blocksMap.nodeIterator(b));
  }

  /** Returns a newly allocated list of all nodes that contain the block,
  * and reports the count of live and decommissioned nodes via numReplicas[0]. */
  ArrayList<DatanodeDescriptor> containingNodeList(Block b, NumberReplicas[] numReplicas) {
    ArrayList<DatanodeDescriptor> nodeList =
      new ArrayList<DatanodeDescriptor>();
    int count = 0;
    int live = 0;
    for(Iterator<DatanodeDescriptor> it = blocksMap.nodeIterator(b);
        it.hasNext();) {
      DatanodeDescriptor node = it.next();
      if (!node.isDecommissionInProgress() && !node.isDecommissioned()) {
        live++;
      }
      else {
        count++;
      }
      nodeList.add(node);
    }
    if (numReplicas != null) {
      numReplicas[0] = new NumberReplicas(live, count);
    }
    return nodeList;
  }
  /*
   * Return true if there are any blocks on this node that have not
   * yet reached their replication factor. Otherwise returns false.
   */
  private boolean isReplicationInProgress(DatanodeDescriptor srcNode) {
    Block decommissionBlocks[] = srcNode.getBlocks();
    boolean status = false;
    for (int i = 0; i < decommissionBlocks.length; i++) {
      Block block = decommissionBlocks[i];
      FSDirectory.INode fileINode = blocksMap.getINode(block);

      if (fileINode != null) {
        NumberReplicas num = countNodes(block);
        int curReplicas = num.liveReplicas();
        int curExpectedReplicas = getReplication(block);
        if (curExpectedReplicas > curReplicas) {
          status = true;
          if (!neededReplications.contains(block) &&
            pendingReplications.getNumReplicas(block) == 0) {
            //
            // These blocks have been reported from the datanode
            // after the startDecommission method has been executed. These
            // blocks were in flight when the decommission was started.
            //
            neededReplications.update(block,
                                      curReplicas,
                                      num.decommissionedReplicas(),
                                      curExpectedReplicas,
                                      -1, 0);
          }
        }
      }
    }
    return status;
  }

  /**
   * Change, if appropriate, the admin state of a datanode to
   * decommission completed. Return true if decommission is complete.
   */
  private boolean checkDecommissionStateInternal(DatanodeDescriptor node) {
    //
    // Check to see if all blocks on this decommissioning
    // node have reached their target replication factor.
    //
    if (node.isDecommissionInProgress()) {
      if (!isReplicationInProgress(node)) {
        node.setDecommissioned();
        LOG.info("Decommission complete for node " + node.name);
      }
    }
    if (node.isDecommissioned()) {
      return true;
    }
    return false;
  }

  /**
   * Return with a list of Block/DataNodeInfo sets, indicating
   * where various Blocks should be copied, ASAP.
   *
   * The Array that we return consists of two objects:
   * The 1st elt is an array of Blocks.
   * The 2nd elt is a 2D array of DatanodeDescriptor objs, identifying the
   *     target sequence for the Block at the appropriate index.
   *
   */
  public synchronized Object[] pendingTransfers(DatanodeID srcNode,
                                                int needed) {
    // Ask datanodes to perform block replication 
    // only if safe mode is off.
    if (isInSafeMode())
      return null;
   
    synchronized (neededReplications) {
      Object results[] = null;

      if (neededReplications.size() > 0) {
        //
        // Go through all blocks that need replications. See if any
        // are present at the current node. If so, ask the node to
        // replicate them.
        //
        List<Block> replicateBlocks = new ArrayList<Block>();
        List<NumberReplicas> numCurrentReplicas = new ArrayList<NumberReplicas>();
        List<DatanodeDescriptor[]> replicateTargetSets;
        replicateTargetSets = new ArrayList<DatanodeDescriptor[]>();
        NumberReplicas[] allReplicas = new NumberReplicas[1];
        for (Iterator<Block> it = neededReplications.iterator(); it.hasNext();) {
          if (needed <= 0) {
            break;
          }
          Block block = it.next();
          long blockSize = block.getNumBytes();
          FSDirectory.INode fileINode = blocksMap.getINode(block);
          if (fileINode == null) { // block does not belong to any file
            it.remove();
          } else {
            List<DatanodeDescriptor> containingNodes =
              containingNodeList(block, allReplicas);
            Collection<Block> excessBlocks = excessReplicateMap.get(
                                                                    srcNode.getStorageID());

            // srcNode must contain the block, and the block must
            // not be scheduled for removal on that node
            if (containingNodes.contains(srcNode)
                && (excessBlocks == null || !excessBlocks.contains(block))) {
              int numCurrentReplica = allReplicas[0].liveReplicas() +
                pendingReplications.getNumReplicas(block);
              NumberReplicas repl = new NumberReplicas(numCurrentReplica,
                                        allReplicas[0].decommissionedReplicas());
              if (numCurrentReplica >= fileINode.getReplication()) {
                it.remove();
              } else {
                DatanodeDescriptor targets[] = replicator.chooseTarget(
                                                                       Math.min(fileINode.getReplication() - numCurrentReplica,
                                                                                needed),
                                                                       datanodeMap.get(srcNode.getStorageID()),
                                                                       containingNodes, null, blockSize);
                if (targets.length > 0) {
                  // Build items to return
                  replicateBlocks.add(block);
                  numCurrentReplicas.add(repl);
                  replicateTargetSets.add(targets);
                  needed -= targets.length;
                }
              }
            }
          }
        }

        //
        // Move the block-replication into a "pending" state.
        // The reason we use 'pending' is so we can retry
        // replications that fail after an appropriate amount of time.
        // (REMIND - mjc - this timer is not yet implemented.)
        //
        if (replicateBlocks.size() > 0) {
          int i = 0;
          for (Iterator<Block> it = replicateBlocks.iterator(); it.hasNext(); i++) {
            Block block = it.next();
            DatanodeDescriptor targets[] =
              (DatanodeDescriptor[]) replicateTargetSets.get(i);
            int numCurrentReplica = numCurrentReplicas.get(i).liveReplicas();
            int numExpectedReplica = blocksMap.getINode(block).getReplication();
            if (numCurrentReplica + targets.length >= numExpectedReplica) {
              neededReplications.remove(
                                        block,
                                        numCurrentReplica,
                                        numCurrentReplicas.get(i).decommissionedReplicas(),
                                        numExpectedReplica);
              pendingReplications.add(block, targets.length);
              NameNode.stateChangeLog.debug(
                                            "BLOCK* NameSystem.pendingTransfer: "
                                            + block.getBlockName()
                                            + " is removed from neededReplications to pendingReplications");
            }

            if (NameNode.stateChangeLog.isInfoEnabled()) {
              StringBuffer targetList = new StringBuffer("datanode(s)");
              for (int k = 0; k < targets.length; k++) {
                targetList.append(' ');
                targetList.append(targets[k].getName());
              }
              NameNode.stateChangeLog.info(
                                           "BLOCK* NameSystem.pendingTransfer: " + "ask "
                                           + srcNode.getName() + " to replicate "
                                           + block.getBlockName() + " to " + targetList);
              NameNode.stateChangeLog.debug(
                                            "BLOCK* neededReplications = " + neededReplications.size()
                                            + " pendingReplications = " + pendingReplications.size());
            }
          }

          //
          // Build returned objects from above lists
          //
          DatanodeDescriptor targetMatrix[][] =
            new DatanodeDescriptor[replicateTargetSets.size()][];
          for (i = 0; i < targetMatrix.length; i++) {
            targetMatrix[i] = replicateTargetSets.get(i);
          }

          results = new Object[2];
          results[0] = replicateBlocks.toArray(new Block[replicateBlocks.size()]);
          results[1] = targetMatrix;
        }
      }
      return results;
    }
  }
 
  // Keeps track of which datanodes are allowed to connect to the namenode.
  private boolean inHostsList(DatanodeID node) {
    Set<String> hostsList = hostsReader.getHosts();
    return (hostsList.isEmpty() ||
            hostsList.contains(node.getName()) ||
            hostsList.contains(node.getHost()) ||
            ((node instanceof DatanodeInfo) &&
             hostsList.contains(((DatanodeInfo)node).getHostName())));
  }


  private boolean inExcludedHostsList(DatanodeID node) {
    Set<String> excludeList = hostsReader.getExcludedHosts();
    return (excludeList.contains(node.getName()) ||
            excludeList.contains(node.getHost()) ||
            ((node instanceof DatanodeInfo) &&
             excludeList.contains(((DatanodeInfo)node).getHostName())));
  }

  /**
   * Rereads the files to update the hosts and exclude lists.  It
   * checks if any of the hosts have changed states:
   * 1. Added to hosts  --> no further work needed here.
   * 2. Removed from hosts --> mark AdminState as decommissioned.
   * 3. Added to exclude --> start decommission.
   * 4. Removed from exclude --> stop decommission.
   */
  void refreshNodes() throws IOException {
    hostsReader.refresh();
    synchronized (this) {
      for (Iterator<DatanodeDescriptor> it = datanodeMap.values().iterator();
           it.hasNext();) {
        DatanodeDescriptor node = it.next();
        // Check if not include.
        if (!inHostsList(node)) {
          node.setDecommissioned();  // case 2.
        } else {
          if (inExcludedHostsList(node)) {
            if (!node.isDecommissionInProgress() &&
                !node.isDecommissioned()) {
              startDecommission(node);   // case 3.
            }
          } else {
            if (node.isDecommissionInProgress() ||
                node.isDecommissioned()) {
              stopDecommission(node);   // case 4.
            }
          }
        }
      }
    }
     
  }
   

  /**
   * Checks if the node is not on the hosts list.  If it is not, then
   * it will be ignored.  If the node is in the hosts list, but is also
   * on the exclude list, then it will be decommissioned.
   * Returns FALSE if node is rejected for registration.
   * Returns TRUE if node is registered (including when it is on the
   * exclude list and is being decommissioned).
   */
  public synchronized boolean verifyNodeRegistration(DatanodeRegistration nodeReg)
    throws IOException {
    if (!inHostsList(nodeReg)) {
      return false;   
    }
    if (inExcludedHostsList(nodeReg)) {
      DatanodeDescriptor node = getDatanode(nodeReg);
      if (!checkDecommissionStateInternal(node)) {
        startDecommission(node);
      }
    }
    return true;
  }
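  // Decision sketch for registration admission (derived from the checks above):
  //  - not in the hosts list: registration rejected (returns false).
  //  - in the hosts list, not in the exclude list: registered normally
  //    (returns true).
  //  - in the hosts list and in the exclude list: registered (returns true),
  //    but decommissioning is started unless it has already completed.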
   
  /**
   * Checks if the Admin state bit is DECOMMISSIONED.  If so, then
   * we should shut it down.
   *
   * Returns true if the node should be shutdown.
   */
  private boolean shouldNodeShutdown(DatanodeDescriptor node) {
    return (node.isDecommissioned());
  }

  /**
   * Check if any of the nodes being decommissioned have finished
   * moving all their data blocks to other replicas. This is a loose
   * heuristic to determine when a decommission is really over.
   */
  public synchronized void decommissionedDatanodeCheck() {
    for (Iterator<DatanodeDescriptor> it = datanodeMap.values().iterator();
         it.hasNext();) {
      DatanodeDescriptor node = it.next();
      checkDecommissionStateInternal(node);
    }
  }
   
  /**
   * Periodically calls decommissionedDatanodeCheck().
   */
  class DecommissionedMonitor implements Runnable {
       
    public void run() {
      while (fsRunning) {
        try {
          decommissionedDatanodeCheck();
        } catch (Exception e) {
          FSNamesystem.LOG.info(StringUtils.stringifyException(e));
        }
        try {
          Thread.sleep(decommissionRecheckInterval);
        } catch (InterruptedException ie) {
        }
      }
    }
  }
   
  /**
   * Get data node by storage ID.
   *
   * @param nodeID
   * @return DatanodeDescriptor or null if the node is not found.
   * @throws IOException
   */
  public DatanodeDescriptor getDatanode(DatanodeID nodeID) throws IOException {
    UnregisteredDatanodeException e = null;
    DatanodeDescriptor node = datanodeMap.get(nodeID.getStorageID());
    if (node == null)
      return null;
    if (!node.getName().equals(nodeID.getName())) {
      e = new UnregisteredDatanodeException(nodeID, node);
      NameNode.stateChangeLog.fatal("BLOCK* NameSystem.getDatanode: "
                                    + e.getLocalizedMessage());
      throw e;
    }
    return node;
  }
   
  /** Return the datanode at the given index (used for content browsing). */
  private DatanodeDescriptor getDatanodeByIndex(int index) {
    int i = 0;
    for (DatanodeDescriptor node : datanodeMap.values()) {
      if (i == index) {
        return node;
      }
      i++;
    }
    return null;
  }
   
  public String randomDataNode() {
    int size = datanodeMap.size();
    int index = 0;
    if (size != 0) {
      index = r.nextInt(size);
      for(int i=0; i<size; i++) {
        DatanodeDescriptor d = getDatanodeByIndex(index);
        if (d != null && !d.isDecommissioned() && !isDatanodeDead(d) &&
            !d.isDecommissionInProgress()) {
          return d.getHost() + ":" + d.getInfoPort();
        }
        index = (index + 1) % size;
      }
    }
    return null;
  }
   
  public int getNameNodeInfoPort() {
    return infoPort;
  }

  /**
   * SafeModeInfo contains information related to the safe mode.
   * <p>
   * An instance of {@link SafeModeInfo} is created when the name node
   * enters safe mode.
   * <p>
   * During name node startup {@link SafeModeInfo} counts the number of
   * <em>safe blocks</em>, those that have at least the minimal number of
   * replicas, and calculates the ratio of safe blocks to the total number
   * of blocks in the system, which is the size of
   * {@link FSNamesystem#blocksMap}. When the ratio reaches the
   * {@link #threshold} it starts the {@link SafeModeMonitor} daemon in order
   * to monitor whether the safe mode extension is passed. Then it leaves safe
   * mode and destroys itself.
   * <p>
   * If safe mode is turned on manually then the number of safe blocks is
   * not tracked because the name node is not intended to leave safe mode
   * automatically in that case.
   *
   * @see ClientProtocol#setSafeMode(FSConstants.SafeModeAction)
   * @see SafeModeMonitor
   */
  class SafeModeInfo {
    // configuration fields
    /** Safe mode threshold condition %.*/
    private double threshold;
    /** Safe mode extension after the threshold. */
    private int extension;
    /** Min replication required by safe mode. */
    private int safeReplication;
     
    // internal fields
    /** Time when threshold was reached.
     *
     * <br>-1 safe mode is off
     * <br> 0 safe mode is on, but threshold is not reached yet
     */
    private long reached = -1;
    /** Total number of blocks. */
    int blockTotal;
    /** Number of safe blocks. */
    private int blockSafe;
     
    /**
     * Creates SafeModeInfo when the name node enters
     * automatic safe mode at startup.
     * 
     * @param conf configuration
     */
    SafeModeInfo(Configuration conf) {
      this.threshold = conf.getFloat("dfs.safemode.threshold.pct", 0.95f);
      this.extension = conf.getInt("dfs.safemode.extension", 0);
      this.safeReplication = conf.getInt("dfs.replication.min", 1);
      this.blockTotal = 0;
      this.blockSafe = 0;
    }

    /**
     * Creates SafeModeInfo when safe mode is entered manually.
     *
     * The {@link #threshold} is set to 1.5 so that it could never be reached.
     * {@link #blockTotal} is set to -1 to indicate that safe mode is manual.
     *
     * @see SafeModeInfo
     */
    private SafeModeInfo() {
      this.threshold = 1.5f; // this threshold can never be reached
      this.extension = 0;
      this.safeReplication = Short.MAX_VALUE + 1; // more than maxReplication
      this.blockTotal = -1;
      this.blockSafe = -1;
      this.reached = -1;
      enter();
    }
     
    /**
     * Check if safe mode is on.
     * @return true if in safe mode
     */
    synchronized boolean isOn() {
      try {
        assert isConsistent() : " SafeMode: Inconsistent filesystem state: "
          + "Total num of blocks, active blocks, or "
          + "total safe blocks don't match.";
      } catch(IOException e) {
        System.err.print(StringUtils.stringifyException(e));
      }
      return this.reached >= 0;
    }
     
    /**
     * Enter safe mode.
     */
    void enter() {
      if (reached != 0)
        NameNode.stateChangeLog.info(
                                     "STATE* SafeModeInfo.enter: " + "Safe mode is ON.\n"
                                     + getTurnOffTip());
      this.reached = 0;
    }
     
    /**
     * Leave safe mode.
     * Switch to manual safe mode if distributed upgrade is required.
     */
    synchronized void leave(boolean checkForUpgrades) {
      if(checkForUpgrades) {
        // verify whether a distributed upgrade needs to be started
        boolean needUpgrade = false;
        try {
          needUpgrade = startDistributedUpgradeIfNeeded();
        } catch(IOException e) {
          FSNamesystem.LOG.error(StringUtils.stringifyException(e));
        }
        if(needUpgrade) {
          // switch to manual safe mode
          safeMode = new SafeModeInfo();
          NameNode.stateChangeLog.info("STATE* SafeModeInfo.leave: "
                                      + "Safe mode is ON.\n" + getTurnOffTip());
          return;
        }
      }
      if (reached >= 0)
        NameNode.stateChangeLog.info(
                                     "STATE* SafeModeInfo.leave: " + "Safe mode is OFF.");
      reached = -1;
      safeMode = null;
      NameNode.stateChangeLog.info("STATE* Network topology has "
                                   +clusterMap.getNumOfRacks()+" racks and "
                                   +clusterMap.getNumOfLeaves()+ " datanodes");
      NameNode.stateChangeLog.info("STATE* UnderReplicatedBlocks has "
                                   +neededReplications.size()+" blocks");
    }
     
    /**
     * Safe mode can be turned off iff
     * the threshold is reached and
     * the extension time has passed.
     * @return true if safe mode can be left, false otherwise.
     */
    synchronized boolean canLeave() {
      if (reached == 0)
        return false;
      if (now() - reached < extension)
        return false;
      return !needEnter();
    }
     
    /**
     * There is no need to enter safe mode
     * if DFS is empty or {@link #threshold} == 0
     */
    boolean needEnter() {
      return getSafeBlockRatio() < threshold;
    }
     
    /**
     * Ratio of the number of safe blocks to the total number of blocks
     * to be compared with the threshold.
     */
    private float getSafeBlockRatio() {
      return (blockTotal == 0 ? 1 : (float)blockSafe/blockTotal);
    }
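    // Worked example (illustrative, not part of the original source): with the
    // default dfs.safemode.threshold.pct of 0.95f and blockTotal == 1000, the
    // threshold is reached once blockSafe reaches 950, since
    // getSafeBlockRatio() == 950/1000f == 0.95f and needEnter() then
    // returns false.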
     
    /**
     * Check and trigger safe mode if needed.
     */
    private void checkMode() {
      if (needEnter()) {
        enter();
        return;
      }
      // the threshold is reached
      if (!isOn() ||                           // safe mode is off
          extension <= 0 || threshold <= 0) {  // don't need to wait
        this.leave(true); // leave safe mode
        return;
      }
      if (reached > 0)  // threshold has already been reached before
        return;
      // start monitor
      reached = now();
      smmthread = new Daemon(new SafeModeMonitor());
      smmthread.start();
    }
     
    /**
     * Set total number of blocks.
     */
    synchronized void setBlockTotal(int total) {
      this.blockTotal = total;
      checkMode();
    }
     
    /**
     * Increment number of safe blocks if current block has
     * reached minimal replication.
     * @param replication current replication
     */
    synchronized void incrementSafeBlockCount(short replication) {
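      // Count each block exactly once: only the transition onto exactly
      // safeReplication replicas increments blockSafe; additional replicas of
      // an already-safe block do not.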
      if ((int)replication == safeReplication)
        this.blockSafe++;
      checkMode();
    }
     
    /**
     * Decrement number of safe blocks if current block has
     * fallen below minimal replication.
     * @param replication current replication
     */
    synchronized void decrementSafeBlockCount(short replication) {
      if (replication == safeReplication-1)
        this.blockSafe--;
      checkMode();
    }
     
    /**
     * Check if safe mode was entered manually or at startup.
     */
    boolean isManual() {
      return blockTotal == -1;
    }
     
    /**
     * A tip on how safe mode is to be turned off: manually or automatically.
     */
    String getTurnOffTip() {
      return (isManual() ?
        (getDistributedUpgradeState() ?
          "Safe mode will be turned off automatically upon completion of " +
          "the distributed upgrade: upgrade progress = " +
          getDistributedUpgradeStatus() + "%" :
          "Use \"hadoop dfs -safemode leave\" to turn safe mode off.") :
        "Safe mode will be turned off automatically.");
    }
     
    /**
     * Returns printable state of the class.
     */
    public String toString() {
      String resText = "Current safe block ratio = "
        + getSafeBlockRatio()
        + ". Target threshold = " + threshold
        + ". Minimal replication = " + safeReplication + ".";
      if (reached > 0)
        resText += " Threshold was reached " + new Date(reached) + ".";
      return resText;
    }
     
    /**
     * Checks consistency of the class state.
     * This is costly and currently called only in assert.
     */
    boolean isConsistent() throws IOException {
      if (blockTotal == -1 && blockSafe == -1) {
        return true; // manual safe mode
      }
      int activeBlocks = blocksMap.size();
      for(Iterator<Collection<Block>> it =
            recentInvalidateSets.values().iterator(); it.hasNext();) {
        activeBlocks -= it.next().size();
      }
      return (blockTotal == activeBlocks) ||
        (blockSafe >= 0 && blockSafe <= blockTotal);
    }
  }
   
  /**
   * Periodically check whether it is time to leave safe mode.
   * This thread starts when the threshold level is reached.
   *
   */
  class SafeModeMonitor implements Runnable {
    /** interval in msec for checking safe mode: {@value} */
    private static final long recheckInterval = 1000;
     
    /**
     * Poll {@link SafeModeInfo#canLeave()} and leave safe mode when possible.
     */
    public void run() {
      while (fsRunning && !safeMode.canLeave()) {
        try {
          Thread.sleep(recheckInterval);
        } catch (InterruptedException ie) {
        }
      }
      // leave safe mode and stop the monitor
      safeMode.leave(true);
      smmthread = null;
    }
  }
   
  /**
   * Current system time.
   * @return current time in msec.
   */
  static long now() {
    return System.currentTimeMillis();
  }
   
  /**
   * Check whether the name node is in safe mode.
   * @return true if safe mode is ON, false otherwise
   */
  boolean isInSafeMode() {
    if (safeMode == null)
      return false;
    return safeMode.isOn();
  }
   
  /**
   * Increment number of blocks that reached minimal replication.
   * @param replication current replication
   */
  void incrementSafeBlockCount(int replication) {
    if (safeMode == null)
      return;
    safeMode.incrementSafeBlockCount((short)replication);
  }

  /**
   * Decrement number of blocks that reached minimal replication.
   */
  void decrementSafeBlockCount(Block b) {
    if (safeMode == null) // mostly true
      return;
    safeMode.decrementSafeBlockCount((short)countNodes(b).liveReplicas());
  }

  /**
   * Set the total number of blocks in the system.
   */
  void setBlockTotal() {
    if (safeMode == null)
      return;
    safeMode.setBlockTotal(blocksMap.size());
  }

  /**
   * Enter safe mode manually.
   * @throws IOException
   */
  synchronized void enterSafeMode() throws IOException {
    if (isInSafeMode()) {
      NameNode.stateChangeLog.info(
                                   "STATE* FSNamesystem.enterSafeMode: " + "Safe mode is already ON.");
      return;
    }
    safeMode = new SafeModeInfo();
  }
   
  /**
   * Leave safe mode.
   * @throws IOException
   */
  synchronized void leaveSafeMode(boolean checkForUpgrades) throws IOException {
    if (!isInSafeMode()) {
      NameNode.stateChangeLog.info(
                                   "STATE* FSNamesystem.leaveSafeMode: " + "Safe mode is already OFF.");
      return;
    }
    if(getDistributedUpgradeState())
      throw new SafeModeException("Distributed upgrade is in progress",
                                  safeMode);
    safeMode.leave(checkForUpgrades);
  }
   
  String getSafeModeTip() {
    if (!isInSafeMode())
      return "";
    return safeMode.getTurnOffTip();
  }
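  // Illustrative sketch, not part of the original source: these safe mode
  // entry points are normally reached through ClientProtocol#setSafeMode
  // (see the SafeModeInfo javadoc above), e.g. from the shell command that
  // getTurnOffTip() refers to:
  //
  //   hadoop dfs -safemode enter   ->  enterSafeMode()
  //   hadoop dfs -safemode leave   ->  leaveSafeMode(...)
  //   hadoop dfs -safemode get     ->  isInSafeMode()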

  long getEditLogSize() throws IOException {
    return getEditLog().getEditLogSize();
  }

  synchronized void rollEditLog() throws IOException {
    if (isInSafeMode()) {
      throw new SafeModeException("Checkpoint not created",
                                  safeMode);
    }
    LOG.info("Roll Edit Log from " + Server.getRemoteAddress());
    getEditLog().rollEditLog();
  }

  synchronized void rollFSImage() throws IOException {
    LOG.info("Roll FSImage from " + Server.getRemoteAddress());
    if (isInSafeMode()) {
      throw new SafeModeException("Checkpoint not created",
                                  safeMode);
    }
    dir.fsImage.rollFSImage();
  }
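  // Illustrative sketch, not part of the original source: a checkpoint driven
  // by the secondary name node roughly follows this sequence:
  //
  //   rollEditLog();   // close the current edits file and start a new one
  //   // the secondary fetches the image and edits, merges them off-line,
  //   // and uploads the merged image back to the name node
  //   rollFSImage();   // adopt the uploaded image and discard the old edits
  //
  // Note that both calls throw SafeModeException while the name node is in
  // safe mode, so no checkpoint can complete until safe mode is left.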

  File getFsEditName() throws IOException {
    return getEditLog().getFsEditName();
  }
   
  /**
   * Returns whether the given block is one pointed-to by a file.
   */
  private boolean isValidBlock(Block b) {
    return (blocksMap.getINode(b) != null ||
            pendingCreates.contains(b));
  }

  // Distributed upgrade manager
  UpgradeManagerNamenode upgradeManager = new UpgradeManagerNamenode();

  UpgradeStatusReport distributedUpgradeProgress(UpgradeAction action
                                                 ) throws IOException {
    return upgradeManager.distributedUpgradeProgress(action);
  }

  UpgradeCommand processDistributedUpgradeCommand(UpgradeCommand comm) throws IOException {
    return upgradeManager.processUpgradeCommand(comm);
  }

  int getDistributedUpgradeVersion() {
    return upgradeManager.getUpgradeVersion();
  }

  UpgradeCommand getDistributedUpgradeCommand() throws IOException {
    return upgradeManager.getBroadcastCommand();
  }

  boolean getDistributedUpgradeState() {
    return upgradeManager.getUpgradeState();
  }

  short getDistributedUpgradeStatus() {
    return upgradeManager.getUpgradeStatus();
  }

  boolean startDistributedUpgradeIfNeeded() throws IOException {
    return upgradeManager.startUpgrade();
  }
}