/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs.server.datanode;

import java.io.BufferedOutputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.OutputStream;
import java.lang.reflect.Proxy;
import java.net.InetSocketAddress;
import java.net.ServerSocket;
import java.net.Socket;
import java.net.SocketTimeoutException;
import java.net.URI;
import java.net.UnknownHostException;
import java.nio.channels.ClosedByInterruptException;
import java.nio.channels.ServerSocketChannel;
import java.nio.channels.SocketChannel;
import java.security.NoSuchAlgorithmException;
import java.security.SecureRandom;
import java.util.AbstractList;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.concurrent.Callable;
import java.util.concurrent.Executors;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.locks.ReentrantReadWriteLock;

import javax.management.ObjectName;
import javax.security.auth.login.LoginException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.ReconfigurableBase;
import org.apache.hadoop.conf.ReconfigurationException;
import org.apache.hadoop.conf.ReconfigurationServlet;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.HDFSPolicyProvider;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.BlockPathInfo;
import org.apache.hadoop.hdfs.protocol.BlockListAsLongs;
import org.apache.hadoop.hdfs.protocol.ClientDatanodeProtocol;
import org.apache.hadoop.hdfs.protocol.DataTransferProtocol;
import org.apache.hadoop.hdfs.protocol.DatanodeID;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.FSConstants;
import org.apache.hadoop.hdfs.protocol.LocatedBlock;
import org.apache.hadoop.hdfs.protocol.WriteBlockHeader;
import org.apache.hadoop.hdfs.protocol.UnregisteredDatanodeException;
import org.apache.hadoop.hdfs.protocol.ProtocolCompatible;
import org.apache.hadoop.hdfs.server.common.HdfsConstants.StartupOption;
import org.apache.hadoop.hdfs.server.common.HdfsConstants;
import org.apache.hadoop.hdfs.server.common.GenerationStamp;
import org.apache.hadoop.hdfs.server.common.IncorrectVersionException;
import org.apache.hadoop.hdfs.server.common.Storage;
import org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory;
import org.apache.hadoop.hdfs.server.common.Util;
import org.apache.hadoop.hdfs.server.datanode.FSDataset.FSVolume;
import org.apache.hadoop.hdfs.server.datanode.metrics.DataNodeMetrics;
import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
import org.apache.hadoop.hdfs.server.namenode.FileChecksumServlets;
import org.apache.hadoop.hdfs.server.namenode.NameNode;
import org.apache.hadoop.hdfs.server.namenode.StreamFile;
import org.apache.hadoop.hdfs.server.protocol.BlockAlreadyCommittedException;
import org.apache.hadoop.hdfs.server.protocol.BlockCommand;
import org.apache.hadoop.hdfs.server.protocol.BlockMetaDataInfo;
import org.apache.hadoop.hdfs.server.protocol.BlockRecoveryInfo;
import org.apache.hadoop.hdfs.server.protocol.BlockReport;
import org.apache.hadoop.hdfs.server.protocol.DatanodeCommand;
import org.apache.hadoop.hdfs.server.protocol.DatanodeProtocol;
import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration;
import org.apache.hadoop.hdfs.server.protocol.DisallowedDatanodeException;
import org.apache.hadoop.hdfs.server.protocol.InterDatanodeProtocol;
import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
import org.apache.hadoop.hdfs.server.protocol.ReceivedBlockInfo;
import org.apache.hadoop.hdfs.server.protocol.UpgradeCommand;
import org.apache.hadoop.http.HttpServer;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.ipc.ProtocolSignature;
import org.apache.hadoop.ipc.RPC;
import org.apache.hadoop.ipc.RemoteException;
import org.apache.hadoop.ipc.Server;
import org.apache.hadoop.metrics.util.MBeanUtil;
import org.apache.hadoop.net.DNS;
import org.apache.hadoop.net.NetUtils;
import org.apache.hadoop.security.SecurityUtil;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.security.authorize.ConfiguredPolicy;
import org.apache.hadoop.security.authorize.PolicyProvider;
import org.apache.hadoop.security.authorize.ServiceAuthorizationManager;
import org.apache.hadoop.util.Daemon;
import org.apache.hadoop.util.DiskChecker;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.DiskChecker.DiskErrorException;
import org.apache.hadoop.util.DiskChecker.DiskOutOfSpaceException;
import org.apache.hadoop.util.PulseChecker;
import org.apache.hadoop.util.PulseCheckable;
import org.apache.hadoop.util.VersionInfo;
import org.mortbay.util.ajax.JSON;

/**********************************************************
* DataNode is a class (and program) that stores a set of
* blocks for a DFS deployment.  A single deployment can
* have one or many DataNodes.  Each DataNode communicates
* regularly with a single NameNode.  It also communicates
* with client code and other DataNodes from time to time.
*
* DataNodes store a series of named blocks.  The DataNode
* allows client code to read these blocks, or to write new
* block data.  The DataNode may also, in response to instructions
* from its NameNode, delete blocks or copy blocks to/from other
* DataNodes.
*
* The DataNode maintains just one critical table:
*   block-> stream of bytes (of BLOCK_SIZE or less)
*
* This info is stored on a local disk.  The DataNode
* reports the table's contents to the NameNode upon startup
* and every so often afterwards.
*
* DataNodes spend their lives in an endless loop of asking
* the NameNode for something to do.  A NameNode cannot connect
* to a DataNode directly; a NameNode simply returns values from
* functions invoked by a DataNode.
*
* DataNodes maintain an open server socket so that client code
* or other DataNodes can read/write data.  The host/port for
* this server is reported to the NameNode, which then sends that
* information to clients or other DataNodes that might be interested.
*
**********************************************************/
public class DataNode extends ReconfigurableBase
    implements InterDatanodeProtocol, ClientDatanodeProtocol, FSConstants, PulseCheckable,
    DataNodeMXBean {
  public static final Log LOG = LogFactory.getLog(DataNode.class);
 
  static{
    Configuration.addDefaultResource("hdfs-default.xml");
    Configuration.addDefaultResource("hdfs-site.xml");
  }

  public static final String DN_CLIENTTRACE_FORMAT =
        "src: %s" +      // src IP
        ", dest: %s" +   // dst IP
        ", bytes: %s" +  // byte count
        ", op: %s" +     // operation
        ", cliID: %s" +  // DFSClient id
        ", offset: %s" + // offset
        ", srvID: %s" +  // DatanodeRegistration
        ", blockid: %s" + // block id
        ", duration: %s"; // duration time

  public static final Log ClientTraceLog =
    LogFactory.getLog(DataNode.class.getName() + ".clienttrace");

  /**
   * Use {@link NetUtils#createSocketAddr(String)} instead.
   */
  @Deprecated
  public static InetSocketAddress createSocketAddr(String target
                                                   ) throws IOException {
    return NetUtils.createSocketAddr(target);
  }
 
  public FSDatasetInterface data = null;
 
  //TODO this should not be used at all
  private static InetSocketAddress nameNodeAddr;
  public static int NAMESPACE_ID = 12345678;

  volatile boolean shouldRun = true;
  boolean isAlive = false;
 
  protected NamespaceManager namespaceManager;
 
  /** list of blocks being recovered */
  private final Map<Block, Block> ongoingRecovery = new HashMap<Block, Block>();
  AtomicInteger xmitsInProgress = new AtomicInteger();
  AtomicBoolean shuttingDown = new AtomicBoolean(false);
  AtomicBoolean checkingDisk = new AtomicBoolean(false);
  volatile long timeLastCheckDisk = 0;
  long minDiskCheckIntervalMsec;
  Daemon dataXceiverServer = null;
  ThreadGroup threadGroup = null;
  long blockReportInterval;
  long deletedReportInterval;
  long initialBlockReportDelay = BLOCKREPORT_INITIAL_DELAY * 1000L;
  long heartBeatInterval;
  DataStorage storage = null;
  HttpServer infoServer = null;
  DataNodeMetrics myMetrics;
 
  protected InetSocketAddress selfAddr;
  String machineName;
  static String dnThreadName;
  int socketTimeout;
  int socketReadExtentionTimeout;
  int socketWriteTimeout = 0;
  int socketWriteExtentionTimeout = 0;
  boolean transferToAllowed = true;
  boolean ignoreChecksumWhenRead = false;
  int writePacketSize = 0;
  boolean syncOnClose;
  boolean supportAppends;
  long heartbeatExpireInterval;
  // heartbeatExpireInterval is how long namenode waits for datanode to report
 
  /**
   * Testing hook that allows tests to delay the sending of blockReceived
   * RPCs to the namenode. This can help find bugs in append.
   */
  int artificialBlockReceivedDelay = 0;

  public DataBlockScannerSet blockScanner = null;
 
  private static final String CONF_SERVLET_PATH = "/dnconf";
 
  private static final Random R = new Random();
 
  // For InterDataNodeProtocol
  public Server ipcServer;

  private final ExecutorService blockCopyExecutor;
  public static final int BLOCK_COPY_THREAD_POOL_SIZE = 10;

  private final int blockCopyRPCWaitTime;
  AbstractList<File> dataDirs;
  Configuration conf;
  private PulseChecker pulseChecker;

  /**
   * Current system time.
   * @return current time in msec.
   */
  static long now() {
    return System.currentTimeMillis();
  }

  /**
   * Create the DataNode given a configuration and an array of dataDirs.
   * 'dataDirs' is where the blocks are stored.
   */
  DataNode(Configuration conf,
           AbstractList<File> dataDirs) throws IOException {
    super(conf);
    supportAppends = conf.getBoolean("dfs.support.append", false);
    // TODO(pritam): Integrate this into a threadpool for all operations of the
    // datanode.
    blockCopyExecutor = Executors.newCachedThreadPool();

    // Time that the blocking version of RPC for copying block between
    // datanodes should wait for. Default is 5 minutes.
    blockCopyRPCWaitTime = conf.getInt("dfs.datanode.blkcopy.wait_time",
        5 * 60);
    try {
      startDataNode(this.getConf(), dataDirs);
    } catch (IOException ie) {
      LOG.info("Failed to start datanode " + StringUtils.stringifyException(ie));
      shutdown();
      throw ie;
    }
  }

  /**
   * Initialize global settings for DN
   */
  protected void initGlobalSetting(Configuration conf,
      AbstractList<File> dataDirs) throws IOException {
    this.dataDirs = dataDirs;
    this.conf = conf;
    storage = new DataStorage(this);
   
    // global DN settings
    initConfig(conf);
    registerMXBean();
    initDataXceiver(conf);
    startInfoServer(conf);
    initIpcServer(conf);

    myMetrics = new DataNodeMetrics(conf, storage.getStorageID());
  }
 
  /**
   * Initialize dataset and block scanner
   *
   * @param conf  Configuration
   * @param dataDirs data directories
   * @param numOfNameSpaces number of name spaces
   * @throws IOException
   */
  protected void initDataSetAndScanner(Configuration conf,
      AbstractList<File> dataDirs, int numOfNameSpaces) throws IOException {
    initFsDataSet(conf, dataDirs, numOfNameSpaces);
    initDataBlockScanner(conf);
  }
 
  /**
   * This method starts the data node with the specified conf.
   *
   * @param conf - the configuration
   *  if conf's CONFIG_PROPERTY_SIMULATED property is set
   *  then a simulated storage based data node is created.
   *
   * @param dataDirs - only for a non-simulated storage data node
   * @throws IOException
   */
  void startDataNode(Configuration conf,
                     AbstractList<File> dataDirs
                     ) throws IOException {
    initGlobalSetting(conf, dataDirs);
   
    /* Initialize namespace manager */
    List<InetSocketAddress> nameNodeAddrs = DFSUtil.getNNServiceRpcAddresses(conf);
   
    //TODO this will be no longer valid, since we will have multiple namenodes
    // We might want to keep it and assign the first NN to it.
    DataNode.nameNodeAddr = nameNodeAddrs.get(0);
    namespaceManager = new NamespaceManager(conf, nameNodeAddrs);
 
    initDataSetAndScanner(conf, dataDirs, nameNodeAddrs.size());
  }
 
  private void initConfig(Configuration conf) throws IOException {
    if (conf.get("slave.host.name") != null) {
      machineName = conf.get("slave.host.name");  
    }
    if (machineName == null) {
      machineName = DNS.getDefaultHost(
                                     conf.get("dfs.datanode.dns.interface","default"),
                                     conf.get("dfs.datanode.dns.nameserver","default"));
    }
    // Allow configuration to delay block reports to find bugs
    artificialBlockReceivedDelay = conf.getInt(
      "dfs.datanode.artificialBlockReceivedDelay", 0);
    if (conf.getBoolean(
        ServiceAuthorizationManager.SERVICE_AUTHORIZATION_CONFIG, false)) {
      PolicyProvider policyProvider = (PolicyProvider) (ReflectionUtils
          .newInstance(conf.getClass(PolicyProvider.POLICY_PROVIDER_CONFIG,
              HDFSPolicyProvider.class, PolicyProvider.class), conf));
      SecurityUtil.setPolicy(new ConfiguredPolicy(conf, policyProvider));
    }
    this.socketTimeout = conf.getInt("dfs.socket.timeout",
        HdfsConstants.READ_TIMEOUT);
    this.socketReadExtentionTimeout = conf.getInt(
        HdfsConstants.DFS_DATANODE_READ_EXTENSION,
        HdfsConstants.READ_TIMEOUT_EXTENSION);
    this.socketWriteTimeout = conf.getInt("dfs.datanode.socket.write.timeout",
        HdfsConstants.WRITE_TIMEOUT);
    this.socketWriteExtentionTimeout = conf.getInt(
        HdfsConstants.DFS_DATANODE_WRITE_EXTENTSION,
        HdfsConstants.WRITE_TIMEOUT_EXTENSION);
   
    /* Based on results on different platforms, we might need to set the default
     * to false on some of them. */
    this.transferToAllowed = conf.getBoolean("dfs.datanode.transferTo.allowed",
                                             true);

    // TODO: remove the global setting and change data protocol to support
    // per session setting for this value.
    this.ignoreChecksumWhenRead = conf.getBoolean("dfs.datanode.read.ignore.checksum",
        false);

    this.writePacketSize = conf.getInt("dfs.write.packet.size", 64*1024);
   
    this.deletedReportInterval =
      conf.getLong("dfs.blockreport.intervalMsec", BLOCKREPORT_INTERVAL);
    // Calculate the full block report interval
    int fullReportMagnifier = conf.getInt("dfs.fullblockreport.magnifier", 2);
    this.blockReportInterval = fullReportMagnifier * deletedReportInterval;
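    // For example, assuming BLOCKREPORT_INTERVAL defaults to one hour and the
    // magnifier keeps its default of 2, incremental (received/deleted) reports
    // go out hourly while a full block report is sent every two hours.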
    this.heartBeatInterval = conf.getLong("dfs.heartbeat.interval", HEARTBEAT_INTERVAL) * 1000L;
    long heartbeatRecheckInterval = conf.getInt(
        "heartbeat.recheck.interval", 5 * 60 * 1000); // 5 minutes
    this.heartbeatExpireInterval = 2 * heartbeatRecheckInterval +
        10 * heartBeatInterval;
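    // Worked example, assuming the shipped defaults: heartbeat.recheck.interval
    // = 300000 msec (5 min) and dfs.heartbeat.interval = 3 sec gives
    // heartbeatExpireInterval = 2 * 300000 + 10 * 3000 = 630000 msec, i.e. the
    // namenode considers this datanode dead after ~10.5 minutes of silence.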
   
    this.initialBlockReportDelay = conf.getLong("dfs.blockreport.initialDelay",
        BLOCKREPORT_INITIAL_DELAY) * 1000L;
    if (this.initialBlockReportDelay >= blockReportInterval) {
      this.initialBlockReportDelay = 0;
      LOG.info("dfs.blockreport.initialDelay is greater than or equal to "
          + "dfs.blockreport.intervalMsec."
          + " Setting initial delay to 0 msec.");
    }

    // do we need to sync block file contents to disk when blockfile is closed?
    this.syncOnClose = conf.getBoolean("dfs.datanode.synconclose", false);
   
    this.minDiskCheckIntervalMsec = conf.getLong(
        "dfs.datnode.checkdisk.mininterval",
        FSConstants.MIN_INTERVAL_CHECK_DIR_MSEC);
  }
 
  /**
   * Used only for testing.
   *
   * @param name
   *          the new name for datanode registration.
   */
  public void setRegistrationName(String name) {
    NamespaceService[] nsos = namespaceManager.getAllNamenodeThreads();
    for (NamespaceService ns : nsos) {
      ((NSOfferService) ns).setRegistrationName(name);
    }
  }

 
  private void initDataXceiver(Configuration conf) throws IOException {
    String address =
      NetUtils.getServerAddress(conf,
                        "dfs.datanode.bindAddress",
                        "dfs.datanode.port",
                        "dfs.datanode.address");
    InetSocketAddress socAddr = NetUtils.createSocketAddr(address);
    // find free port
    ServerSocket ss = (socketWriteTimeout > 0) ?
          ServerSocketChannel.open().socket() : new ServerSocket();
    Server.bind(ss, socAddr,
        conf.getInt("dfs.datanode.xceiver.listen.queue.size", 128));
    ss.setReceiveBufferSize(DEFAULT_DATA_SOCKET_SIZE);
    // adjust machine name with the actual port
    int tmpPort = ss.getLocalPort();
    selfAddr = new InetSocketAddress(ss.getInetAddress().getHostAddress(),
                                     tmpPort);
    LOG.info("Opened info server at " + tmpPort);
     
    this.threadGroup = new ThreadGroup("dataXceiverServer");
    this.dataXceiverServer = new Daemon(threadGroup,
        new DataXceiverServer(ss, conf, this));
    this.threadGroup.setDaemon(true); // auto destroy when empty
  }
 
  private void startInfoServer(Configuration conf) throws IOException {
    String infoAddr =
      NetUtils.getServerAddress(conf,
                              "dfs.datanode.info.bindAddress",
                              "dfs.datanode.info.port",
                              "dfs.datanode.http.address");
    InetSocketAddress infoSocAddr = NetUtils.createSocketAddr(infoAddr);
    String infoHost = infoSocAddr.getHostName();
    int tmpInfoPort = infoSocAddr.getPort();
    this.infoServer = new HttpServer("datanode", infoHost, tmpInfoPort,
        tmpInfoPort == 0, conf);
    if (conf.getBoolean("dfs.https.enable", false)) {
      boolean needClientAuth = conf.getBoolean("dfs.https.need.client.auth", false);
      InetSocketAddress secInfoSocAddr = NetUtils.createSocketAddr(conf.get(
          "dfs.datanode.https.address", infoHost + ":" + 0));
      Configuration sslConf = new Configuration(false);
      sslConf.addResource(conf.get("dfs.https.server.keystore.resource",
          "ssl-server.xml"));
      this.infoServer.addSslListener(secInfoSocAddr, sslConf, needClientAuth);
      // assume same ssl port for all datanodes
      InetSocketAddress datanodeSslPort = NetUtils.createSocketAddr(conf.get(
          "dfs.datanode.https.address", infoHost + ":" + 50475));
      this.infoServer.setAttribute("datanode.https.port", datanodeSslPort
          .getPort());
    }
    this.infoServer.addInternalServlet(null, "/streamFile/*", StreamFile.class);
    this.infoServer.addInternalServlet(null, "/getFileChecksum/*",
        FileChecksumServlets.GetServlet.class);
    this.infoServer.setAttribute("datanode", this);
    this.infoServer.addServlet(null, "/blockScannerReport",
                               DataBlockScannerSet.Servlet.class);

    this.infoServer.setAttribute(ReconfigurationServlet.CONF_SERVLET_RECONFIGURABLE_PREFIX +
        CONF_SERVLET_PATH, DataNode.this);
    this.infoServer.addServlet("dnConf", CONF_SERVLET_PATH, ReconfigurationServlet.class);
    this.infoServer.start();
  }
 
  private void initIpcServer(Configuration conf) throws IOException {
    //init ipc server
    InetSocketAddress ipcAddr = NetUtils.createSocketAddr(
        conf.get("dfs.datanode.ipc.address"));
    ipcServer = RPC.getServer(this, ipcAddr.getHostName(), ipcAddr.getPort(),
        conf.getInt("dfs.datanode.handler.count", 3), false, conf);
    ipcServer.start();
  }

  /**
   * Creates either an NIO socket or a regular socket, depending on
   * whether socketWriteTimeout is set.
   */
  protected Socket newSocket() throws IOException {
    return (socketWriteTimeout > 0) ?
           SocketChannel.open().socket() : new Socket();
  }
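  // A channel-backed socket is what allows write timeouts: NetUtils can only
  // enforce socketWriteTimeout over SocketChannel-based streams. A
  // hypothetical caller sketch ("target" assumed to be an InetSocketAddress
  // in scope):
  //
  //   Socket s = newSocket();
  //   NetUtils.connect(s, target, socketTimeout);
  //   OutputStream out = NetUtils.getOutputStream(s, socketWriteTimeout);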

  public boolean isSupportAppends() {
    return supportAppends;
  }

  public static InterDatanodeProtocol createInterDataNodeProtocolProxy(
      DatanodeID datanodeid, Configuration conf, final int socketTimeout)
    throws IOException {
    InetSocketAddress addr = NetUtils.createSocketAddr(
        datanodeid.getHost() + ":" + datanodeid.getIpcPort());
    if (InterDatanodeProtocol.LOG.isDebugEnabled()) {
      InterDatanodeProtocol.LOG.debug("InterDatanodeProtocol addr=" + addr);
    }
    UserGroupInformation ugi;
    try {
      ugi = UserGroupInformation.login(conf);
    } catch (LoginException le) {
      throw new RuntimeException("Couldn't login!", le);
    }
    return (InterDatanodeProtocol)RPC.getProxy(InterDatanodeProtocol.class,
        InterDatanodeProtocol.versionID, addr,
        ugi, conf,
        NetUtils.getDefaultSocketFactory(conf), socketTimeout);
  }
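  // Hypothetical usage sketch: roughly how block-recovery code obtains a proxy
  // to a peer datanode and releases it. The InterDatanodeProtocol call shown is
  // illustrative; "datanodeid", "namespaceId" and "block" are assumed to be in
  // scope.
  //
  //   InterDatanodeProtocol peer =
  //       createInterDataNodeProtocolProxy(datanodeid, conf, socketTimeout);
  //   try {
  //     BlockMetaDataInfo info = peer.getBlockMetaDataInfo(namespaceId, block);
  //   } finally {
  //     RPC.stopProxy(peer);
  //   }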

  /**
   * This method returns the address the namenode uses to communicate with
   * datanodes. If this address is not configured, the default NameNode
   * address is used; that is correct when the namenode runs only one RPC
   * server. If the namenode runs multiple RPC servers, this default address
   * must not be used by clients!
   * @param conf the configuration to read the address from
   * @return the namenode address used for datanode communication
   */
  public static InetSocketAddress getNameNodeAddress(Configuration conf) {
    InetSocketAddress addr = null;
    addr = NameNode.getDNProtocolAddress(conf);
    if (addr != null) {
      return addr;
    }
    return NameNode.getAddress(conf);
  }

  //TODO this should not be there -> it affects StreamFile class
  public InetSocketAddress getNameNodeAddr() {
    return nameNodeAddr;
  }
 
  /**
   * Get namenode corresponding to a namespace
   * @param namespaceId
   * @return Namenode corresponding to the namespace
   * @throws IOException
   */
  public DatanodeProtocol getNSNamenode(int namespaceId) throws IOException {
    NamespaceService nsos = namespaceManager.get(namespaceId);
    if(nsos == null || nsos.getDatanodeProtocol() == null) {
      throw new IOException("cannot find a namnode proxy for namespaceId=" + namespaceId);
    }
    return nsos.getDatanodeProtocol();
  }

  public InetSocketAddress getSelfAddr() {
    return selfAddr;
  }
 
  public int getPort() {
    return selfAddr.getPort();
  }

  DataNodeMetrics getMetrics() {
    return myMetrics;
  }
 
  /**
   * get datanode registration by namespace id
   * @param namespaceId
   * @return datanode registration object
   * @throws IOException
   */
  public DatanodeRegistration getDNRegistrationForNS(int namespaceId)
  throws IOException {
    NamespaceService nsos = namespaceManager.get(namespaceId);
    if(nsos==null || nsos.getNsRegistration()==null) {
      throw new IOException("cannot find NSOfferService for namespaceId="+namespaceId);
    }
    return nsos.getNsRegistration();
  }

  /**
   * Return the namenode's identifier
   */
  public String getNamenode() {
    //return namenode.toString();
    return "<namenode>";
  }
 
  public static void setNewStorageID(DatanodeRegistration dnReg) {
    LOG.info("Datanode is " + dnReg);
    dnReg.storageID = createNewStorageId(dnReg.getPort());
  }

  public static String createNewStorageId(int port) {
    /* Return
     * "DS-randInt-ipaddr-port-currentTimeMillis"
     * It is considered extremely rare for all these numbers to match
     * on a different machine accidentally for the following reasons:
     * a) SecureRandom(INT_MAX) is pretty much random (1 in 2 billion), and
     * b) Good chance ip address would be different, and
     * c) Even on the same machine, Datanode is designed to use different ports.
     * d) Good chance that these are started at different times.
     * For a conflict to occur all the 4 above have to match!
     * The format of this string can be changed anytime in the future without
     * affecting its functionality.
     */
    String ip = "unknownIP";
    try {
      ip = DNS.getDefaultIP("default");
    } catch (UnknownHostException ignored) {
      LOG.warn("Could not find ip address of \"default\" inteface.");
    }

    int rand = 0;
    try {
      rand = SecureRandom.getInstance("SHA1PRNG").nextInt(Integer.MAX_VALUE);
    } catch (NoSuchAlgorithmException e) {
      LOG.warn("Could not use SecureRandom");
      rand = R.nextInt(Integer.MAX_VALUE);
    }
    return "DS-" + rand + "-"+ ip + "-" + port + "-" +
                      System.currentTimeMillis();
  }
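  // For illustration, a generated ID might look like (hypothetical values):
  //   "DS-1867663106-10.1.2.3-50010-1297067421000"
  // i.e. "DS-" + rand + "-" + ip + "-" + port + "-" + currentTimeMillis.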

  /**
   * Shut down this instance of the datanode.
   * Returns only after shutdown is complete.
   * This method can only be called by the offerService thread.
   * Otherwise, deadlock might occur.
   */
  public void shutdown() {
    if (this.shuttingDown.getAndSet(true)) {
      // Already being shut down
      LOG.warn("DataNode.shutdown() was called while shutting down.");
      return;
    }
    if (infoServer != null) {
      try {
        infoServer.stop();
      } catch (Exception e) {
        LOG.warn("Exception shutting down DataNode", e);
      }
    }
    if (ipcServer != null) {
      ipcServer.stop();
    }
    this.shouldRun = false;
    if (dataXceiverServer != null) {
      ((DataXceiverServer) this.dataXceiverServer.getRunnable()).kill();
      this.dataXceiverServer.interrupt();

      // wait for all data receiver threads to exit
      if (this.threadGroup != null) {
        int retries = 0;
        while (true) {
          this.threadGroup.interrupt();
          LOG.info("Waiting for threadgroup to exit, active threads is " +
                   this.threadGroup.activeCount());
          if (this.threadGroup.activeCount() == 0) {
            break;
          }
          try {
            if (++retries > 600) {
              Thread[] activeThreads = new Thread[this.threadGroup.activeCount()];
              this.threadGroup.enumerate(activeThreads, true);
              LOG.info("Active Threads: " + Arrays.toString(activeThreads));
              LOG.warn("Waited for ThreadGroup to be empty for 10 minutes." +
                        " SHUTTING DOWN NOW");
              break;
            }
            Thread.sleep(1000);
          } catch (InterruptedException e) {}
        }
      }
      // wait for dataXceiveServer to terminate
      try {
        this.dataXceiverServer.join();
      } catch (InterruptedException ie) {
      }
    }
   
    if (blockCopyExecutor != null && !blockCopyExecutor.isShutdown()) {
      blockCopyExecutor.shutdownNow();
    }
   
    if (namespaceManager != null) {
      namespaceManager.shutDownAll();
    }
   
    if (blockScanner != null) {
      blockScanner.shutdown();
    }
    if (storage != null) {
      try {
        this.storage.unlockAll();
      } catch (IOException ie) {
      }
    }
    if (data != null) {
      data.shutdown();
    }
    if (myMetrics != null) {
      myMetrics.shutdown();
    }
    this.shutdownMXBean();
  }
 

  /** Check whether the exception indicates that the disk is out of space;
   *  otherwise trigger a full disk check.
   *  @param e the exception that caused this checkDiskError call
   **/
  protected void checkDiskError(Exception e ) throws IOException {
    if (e instanceof ClosedByInterruptException
        || e instanceof java.io.InterruptedIOException) {
      return;
    }
    LOG.warn("checkDiskError: exception: ", e);
   
    if (e.getMessage() != null &&
        e.getMessage().startsWith("No space left on device")) {
      throw new DiskOutOfSpaceException("No space left on device");
    } else {
      checkDiskError();
    }
  }
 
  /**
   *  Check if there is a disk failure and if so, handle the error
   *
   **/
  protected void checkDiskError( ) throws IOException{
    // We disallow concurrent disk checks as it doesn't help
    // but can significantly impact performance and reliability of
    // the system.
    //
    boolean setSuccess = checkingDisk.compareAndSet(false, true);
    if (!setSuccess) {
      LOG.info("checkDiskError is already running.");
      return;
    }

    try {
      // Skip the check if the last one completed recently.
      //
      long curTime = System.currentTimeMillis();
      if (curTime - timeLastCheckDisk < minDiskCheckIntervalMsec) {
        LOG.info("checkDiskError finished within the last "
            + minDiskCheckIntervalMsec + " msec. Skipping this one.");
        return;
      }
      data.checkDataDir();
      timeLastCheckDisk = System.currentTimeMillis();
    } catch(DiskErrorException de) {
      handleDiskError(de.getMessage());
    } finally {
      checkingDisk.set(false);
    }
  }
 
  private void handleDiskError(String errMsgr) throws IOException{
    boolean hasEnoughResource = data.hasEnoughResource();
    myMetrics.volumeFailures.inc();
    for(Integer namespaceId : namespaceManager.getAllNamespaces()){
      DatanodeProtocol nn = getNSNamenode(namespaceId);
      LOG.warn("DataNode.handleDiskError: Keep Running: " + hasEnoughResource);
     
      //if hasEnoughResource == true - more volumes are available, so we don't want
      // to shutdown DN completely and don't want NN to remove it.
      int dp_error = DatanodeProtocol.DISK_ERROR;
      if (!hasEnoughResource) {
        // DN will be shutdown and NN should remove it
        dp_error = DatanodeProtocol.FATAL_DISK_ERROR;
      }
      //inform NameNode
      try {
        nn.errorReport(getDNRegistrationForNS(namespaceId), dp_error, errMsgr);
      } catch(IOException ignored) {
      }
     
     
      if(hasEnoughResource) {
        for (NamespaceService nsos : namespaceManager.getAllNamenodeThreads()) {
          nsos.scheduleBlockReport(0);
        }
        return; // do not shutdown
      }
    }
   
    LOG.warn("DataNode is shutting down.\n" + errMsgr);
    shouldRun = false;
  }
 
  private void refreshVolumes(String confVolumes) throws Exception {
    if (!(data instanceof FSDataset)) {
      throw new UnsupportedOperationException(
          "Only FSDataset supports the refresh volumes operation");
    }

    // Dirs described by conf file
    Configuration conf = getConf();

    // temporarily set dfs.data.dir so that getStorageDirs() sees the new volumes
    String oldVolumes = conf.get("dfs.data.dir");
    conf.set("dfs.data.dir", confVolumes);
    Collection<URI> dataDirs = getStorageDirs(conf);
    conf.set("dfs.data.dir", oldVolumes);
   
    ArrayList<File> newDirs = getDataDirsFromURIs(dataDirs);
    ArrayList<File> decomDirs = new ArrayList<File>();
 
    for (Iterator<StorageDirectory> storageIter = this.storage.dirIterator();
        storageIter.hasNext();) {
      StorageDirectory dir = storageIter.next();
     
      // Delete volumes not in service from DataStorage
      if (!((FSDataset)data).isValidVolume(dir.getCurrentDir())) {
        LOG.info("This dir is listed in conf, but not in service " + dir.getRoot());
        storageIter.remove();
        continue;
      }
 
      if (newDirs.contains(dir.getRoot())) {
        // remove dirs that are already in service from the newDirs list
        LOG.info("This conf dir is already in service: " + dir.getRoot());
        newDirs.remove(dir.getRoot());
      } else {
        // add the dirs not described in conf files to decomDirs
        LOG.warn("The configuration does not contain serving dir " +
          dir.getRoot() + ", but we cannot remove it from serving volumes in current version." );
        decomDirs.add(dir.getRoot());
      }
    }
 
    if (newDirs.isEmpty()) {
      LOG.info("All configured dirs are already in service; no refresh needed.");
      return;
    }

    for (int namespaceId: namespaceManager.getAllNamespaces()) {
      // Load new volumes via DataStorage
      NamespaceInfo nsInfo = getNSNamenode(namespaceId).versionRequest();
      String nameserviceId = this.namespaceManager.get(namespaceId).getNameserviceId();
      Collection<StorageDirectory> newStorageDirectories =
        storage.recoverTransitionAdditionalRead(nsInfo, newDirs, getStartupOption(conf));
      storage.recoverTransitionRead(this, namespaceId, nsInfo, newDirs,
        getStartupOption(conf), nameserviceId);
     
      // add new volumes in FSDataSet
      ((FSDataset)data).addVolumes(conf, namespaceId,
        storage.getNameSpaceDataDir(namespaceId), newStorageDirectories);
    }
  }
   
  /** Number of concurrent xceivers per node. */
  int getXceiverCount() {
    return threadGroup == null ? 0 : threadGroup.activeCount();
  }

  static Collection<URI> getStorageDirs(Configuration conf) {
    Collection<String> dirNames =
      conf.getStringCollection("dfs.data.dir");
    return Util.stringCollectionAsURIs(dirNames);
  }

  static ArrayList<File> getDataDirsFromURIs(Collection<URI> dataDirs) {
    ArrayList<File> dirs = new ArrayList<File>();
    for (URI dirURI : dataDirs) {
      if (!"file".equalsIgnoreCase(dirURI.getScheme())) {
        LOG.warn("Unsupported URI schema in " + dirURI + ". Ignoring ...");
        continue;
      }
      // drop any (illegal) authority in the URI for backwards compatibility
      File data = new File(dirURI.getPath());
      try {
        DiskChecker.checkDir(data);
        dirs.add(data);
      } catch (IOException e) {
        LOG.warn("Invalid directory in dfs.data.dir: "
                 + e.getMessage());
      }
    }
    return dirs;
  }
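  // Example behavior (hypothetical paths): given the URIs
  //   [ file:///data/1, hdfs://nn:8020/x, file:///data/2 ]
  // this returns [ /data/1, /data/2 ] (assuming both pass DiskChecker) and
  // logs a warning for the unsupported hdfs:// entry.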

  /**
   * A thread per namenode to perform:
   * <ul>
   * <li> Pre-registration handshake with namenode</li>
   * <li> Registration with namenode</li>
   * <li> Send periodic heartbeats to the namenode</li>
   * <li> Handle commands received from the namenode</li>
   * </ul>
   */
  class NSOfferService extends NamespaceService {
    final InetSocketAddress nnAddr;
    DatanodeRegistration nsRegistration;
    NamespaceInfo nsInfo;
    long lastBlockReport = 0;
    private Thread nsThread;
    private DatanodeProtocol nsNamenode;
    int namespaceId;
    String nameserviceId;
    private long lastHeartbeat = 0;
    private long lastDeletedReport = 0;
    boolean resetBlockReportTime = true;
    private volatile boolean initialized = false;
    private final LinkedList<Block> receivedAndDeletedBlockList
      = new LinkedList<Block>();
    private int pendingReceivedRequests = 0;
    private volatile boolean shouldServiceRun = true;
    UpgradeManagerDatanode upgradeManager = null;
    private ScheduledFuture keepAliveRun = null;
    private ScheduledExecutorService keepAliveSender = null;
    private boolean firstBlockReportSent = false;
    volatile long lastBeingAlive = now();

    NSOfferService(InetSocketAddress isa, String nameserviceId) {
      this.nsRegistration = new DatanodeRegistration(getMachineName());
      this.nnAddr = isa;
      this.nameserviceId = nameserviceId;
    }
   
    public DatanodeProtocol getDatanodeProtocol() {
      return nsNamenode;
    }

    /**
     * Used only for testing.
     *
     * @param name
     *          the new registration name for the datanode
     */
    public void setRegistrationName(String name) {
      this.nsRegistration.setName(name);
    }

    /**
     * Main loop for each NS thread. Run until shutdown,
     * forever calling remote NameNode functions.
     */
    private void offerService() throws Exception {

      LOG.info("using BLOCKREPORT_INTERVAL of " + blockReportInterval + "msec" +
          " Initial delay: " + initialBlockReportDelay + "msec");
      LOG.info("using DELETEREPORT_INTERVAL of " + deletedReportInterval + "msec");
      LOG.info("using HEARTBEAT_INTERVAL of " + heartBeatInterval + "msec");
      LOG.info("using HEARTBEAT_EXPIRE_INTERVAL of " + heartbeatExpireInterval + "msec");

      //
      // Now loop for a long time....
      //

      while (shouldRun && shouldServiceRun) {
        try {
          long startTime = now();

          //
          // Every so often, send heartbeat or block-report
          //

          if (startTime - lastHeartbeat > heartBeatInterval) {
            //
            // All heartbeat messages include following info:
            // -- Datanode name
            // -- data transfer port
            // -- Total capacity
            // -- Bytes remaining
            //
            lastHeartbeat = startTime;
            DatanodeCommand[] cmds = nsNamenode.sendHeartbeat(nsRegistration,
                                                         data.getCapacity(),
                                                         data.getDfsUsed(),
                                                         data.getRemaining(),
                                                         data.getNSUsed(namespaceId),
                                                         xmitsInProgress.get(),
                                                         getXceiverCount());
            this.lastBeingAlive = now();
            LOG.debug("Sent heartbeat at " + this.lastBeingAlive);
            myMetrics.heartbeats.inc(now() - startTime);
            //LOG.info("Just sent heartbeat, with name " + localName);
            if (!processCommand(cmds))
              continue;
          }

          // check if there are newly received blocks (pendingReceivedRequests > 0)
          // or if the deletedReportInterval has passed.
          if (firstBlockReportSent && (pendingReceivedRequests > 0
              || (startTime - lastDeletedReport > deletedReportInterval))) {
            Block[] receivedAndDeletedBlockArray = null;
            int currentReceivedRequestsCounter = pendingReceivedRequests;
            synchronized (receivedAndDeletedBlockList) {
                lastDeletedReport = startTime;

                int numBlocksReceivedAndDeleted = receivedAndDeletedBlockList
                    .size();
                if (numBlocksReceivedAndDeleted > 0) {
                  receivedAndDeletedBlockArray = receivedAndDeletedBlockList
                      .toArray(new Block[numBlocksReceivedAndDeleted]);
                }
            }
            if (receivedAndDeletedBlockArray != null) {
              long rpcStartTime = 0;
              if (LOG.isDebugEnabled()) {
                rpcStartTime = System.nanoTime();
                LOG.debug("sending blockReceivedAndDeleted "
                    + receivedAndDeletedBlockArray.length + " blocks to " + nnAddr);
              }
              nsNamenode.blockReceivedAndDeleted(nsRegistration, receivedAndDeletedBlockArray);
              if (LOG.isDebugEnabled()) {
                LOG.debug("finshed blockReceivedAndDeleted to " + nnAddr
                    + " time: " + (System.nanoTime() - rpcStartTime) + " ns");
              }
              synchronized (receivedAndDeletedBlockList) {
                for (int i = 0; i < receivedAndDeletedBlockArray.length; i++) {
                  receivedAndDeletedBlockList
                      .remove(receivedAndDeletedBlockArray[i]);
                }
                pendingReceivedRequests -= currentReceivedRequestsCounter;
              }
            }
          }


          // send block report
          if (startTime - lastBlockReport > blockReportInterval) {
            //
            // Send latest blockinfo report if timer has expired.
            // Get back a list of local block(s) that are obsolete
            // and can be safely GC'ed.
            //
            long brStartTime = now();
            Block[] bReport = data.getBlockReport(namespaceId);

            DatanodeCommand cmd = nsNamenode.blockReport(nsRegistration,
                    new BlockReport(BlockListAsLongs.convertToArrayLongs(bReport)));
            firstBlockReportSent = true;
            long brTime = now() - brStartTime;
            myMetrics.blockReports.inc(brTime);
            LOG.info("BlockReport of " + bReport.length +
                " blocks got processed in " + brTime + " msecs");
            //
            // If we have sent the first block report, then wait a random
            // time before we start the periodic block reports.
            //
            if (resetBlockReportTime) {
              lastBlockReport = startTime - R.nextInt((int)(blockReportInterval));
              resetBlockReportTime = false;
            } else {
               /* say the last block report was at 8:20:14. The current report
               * should have started around 9:20:14 (default 1 hour interval).
               * If current time is :
               *   1) normal like 9:20:18, next report should be at 10:20:14
               *   2) unexpected like 11:35:43, next report should be at 12:20:14
               */
              lastBlockReport += (now() - lastBlockReport) /
                                 blockReportInterval * blockReportInterval;
            }
            processCommand(cmd);
          }

          //
          // There is no work to do; sleep until the heartbeat timer elapses,
          // or work arrives, and then iterate again.
          //
          long waitTime = heartBeatInterval - (System.currentTimeMillis() - lastHeartbeat);
          synchronized(receivedAndDeletedBlockList) {
            if (waitTime > 0 && pendingReceivedRequests == 0) {
              try {
                receivedAndDeletedBlockList.wait(waitTime);
              } catch (InterruptedException ie) {
              }
              delayBeforeBlockReceived();
            }
          } // synchronized
        } catch(RemoteException re) {
          String reClass = re.getClassName();
          if (UnregisteredDatanodeException.class.getName().equals(reClass) ||
              DisallowedDatanodeException.class.getName().equals(reClass) ||
              IncorrectVersionException.class.getName().equals(reClass)) {
            LOG.warn("DataNode is shutting down: " +
                     StringUtils.stringifyException(re));
            shouldRun = false;
            shutdown();
            return;
          }
          try {
            Thread.sleep(1000);
          } catch (InterruptedException ie) {
            // NOTE: common case should be doing this instead of ignoring ie
            Thread.currentThread().interrupt();
          }
          LOG.warn(StringUtils.stringifyException(re));
        } catch (IOException e) {
          LOG.warn(StringUtils.stringifyException(e));
        }
      } // while (shouldRun)
    } // offerService
   
    /**
     * When a block has been received, we can delay some period of time before
     * reporting it to the NN, for the purpose of testing. This simulates
     * the actual latency of blockReceived on a real network (where the client
     * may be closer to the NN than the DNs).
     */
    private void delayBeforeBlockReceived() {
      if (artificialBlockReceivedDelay > 0 && !receivedAndDeletedBlockList.isEmpty()) {
        try {
          long sleepFor = (long)R.nextInt(artificialBlockReceivedDelay);
          LOG.debug("DataNode " + nsRegistration + " sleeping for " +
                    "artificial delay: " + sleepFor + " ms");
          Thread.sleep(sleepFor);
        } catch (InterruptedException ie) {
          Thread.currentThread().interrupt();
        }
      }
    }
   
    /**
     * Process an array of datanode commands
     *
     * @param cmds an array of datanode commands
     * @return true if further processing may be required or false otherwise.
     */
    private boolean processCommand(DatanodeCommand[] cmds) {
      if (cmds != null) {
        for (DatanodeCommand cmd : cmds) {
          try {
            if (!processCommand(cmd)) {
              return false;
            }
          } catch (IOException ioe) {
            LOG.warn("Error processing datanode Command", ioe);
          }
        }
      }
      return true;
    }
   
    /**
    *
    * @param cmd
    * @return true if further processing may be required or false otherwise.
    * @throws IOException
    */
   private boolean processCommand(DatanodeCommand cmd) throws IOException {
     if (cmd == null)
       return true;
     final BlockCommand bcmd = cmd instanceof BlockCommand? (BlockCommand)cmd: null;

     boolean retValue = true;
     long startTime = System.currentTimeMillis();

     switch(cmd.getAction()) {
     case DatanodeProtocol.DNA_TRANSFER:
       // Send a copy of a block to another datanode
       transferBlocks(namespaceId,
           bcmd.getBlocks(), bcmd.getTargets());
       myMetrics.blocksReplicated.inc(bcmd.getBlocks().length);
       break;
     case DatanodeProtocol.DNA_INVALIDATE:
       //
       // Some local block(s) are obsolete and can be
       // safely garbage-collected.
       //
       Block toDelete[] = bcmd.getBlocks();
       try {
         if (blockScanner != null) {
           blockScanner.deleteBlocks(namespaceId, toDelete);
         }       
         data.invalidate(namespaceId, toDelete);
       } catch(IOException e) {
         checkDiskError();
         throw e;
       }
       myMetrics.blocksRemoved.inc(toDelete.length);
       break;
     case DatanodeProtocol.DNA_SHUTDOWN:
       // shut down the data node
       shouldServiceRun = false;
       retValue = false;
       break;
     case DatanodeProtocol.DNA_REGISTER:
       // namenode requested a registration - at start or if NN lost contact
       LOG.info("DatanodeCommand action: DNA_REGISTER");
       if (shouldRun) {
         register();
         firstBlockReportSent = false;
       }
       break;
     case DatanodeProtocol.DNA_FINALIZE:
        storage.finalizedUpgrade(namespaceId);
       break;
     case UpgradeCommand.UC_ACTION_START_UPGRADE:
       // start distributed upgrade here
       processDistributedUpgradeCommand((UpgradeCommand)cmd);
       break;
     case DatanodeProtocol.DNA_RECOVERBLOCK:
       recoverBlocks(namespaceId, bcmd.getBlocks(), bcmd.getTargets());
       break;
     default:
       LOG.warn("Unknown DatanodeCommand action: " + cmd.getAction());
     }
     long endTime = System.currentTimeMillis();
     if (endTime - startTime > 1000) {
       LOG.info("processCommand() took " + (endTime - startTime)
           + " msec to process command " + cmd.getAction() + " from " + nnAddr);
     } else if (LOG.isDebugEnabled()) {
       LOG.debug("processCommand() took " + (endTime - startTime)
           + " msec to process command " + cmd.getAction() + " from " + nnAddr);
     }
     return retValue;
   }

    /**
     * returns true if NS thread has completed initialization of storage
     * and has registered with the corresponding namenode
     * @return true if initialized
     */
   @Override
    public boolean initialized() {
      return initialized;
    }
   
   @Override
    public boolean isAlive() {
      return shouldServiceRun && nsThread.isAlive();
    }
   
   @Override
    public int getNamespaceId() {
      return namespaceId;
    }
  
   @Override
   public String getNameserviceId() {
     return this.nameserviceId;
   }
   
   @Override
    public InetSocketAddress getNNSocketAddress() {
      return nnAddr;
    }
    void setNamespaceInfo(NamespaceInfo nsinfo) {
      this.nsInfo = nsinfo;
      this.namespaceId = nsinfo.getNamespaceID();
      namespaceManager.addNamespace(this);
    }

    void setNameNode(DatanodeProtocol dnProtocol) {
      nsNamenode = dnProtocol;
    }

    private NamespaceInfo handshake() throws IOException {
      NamespaceInfo nsInfo = new NamespaceInfo();
      while (shouldRun && shouldServiceRun) {
        try {
          nsInfo = nsNamenode.versionRequest();
          break;
        } catch(SocketTimeoutException e) {  // namenode is busy
          LOG.info("Problem connecting to server: " + nnAddr);
          try {
            Thread.sleep(1000);
          } catch (InterruptedException ie) {}
        }
      }
      String errorMsg = null;
      // verify build version
      if( ! nsInfo.getBuildVersion().equals( Storage.getBuildVersion() )) {
        errorMsg = "Incompatible build versions: namenode BV = "
          + nsInfo.getBuildVersion() + "; datanode BV = "
          + Storage.getBuildVersion();
        LOG.warn( errorMsg );
        try {
          nsNamenode.errorReport( nsRegistration,
                                DatanodeProtocol.NOTIFY, errorMsg );
        } catch( SocketTimeoutException e ) {  // namenode is busy
          LOG.info("Problem connecting to server: " + nnAddr.toString());
        }
      }
      assert FSConstants.LAYOUT_VERSION == nsInfo.getLayoutVersion() :
        "Data-node and name-node layout versions must be the same."
        + " Expected: " + FSConstants.LAYOUT_VERSION + ", actual: " + nsInfo.getLayoutVersion();
      return nsInfo;
    }

    void setupNS(Configuration conf, AbstractList<File> dataDirs)
    throws IOException {
      // get NN proxy
      DatanodeProtocol dnp =
        (DatanodeProtocol)RPC.waitForProxy(DatanodeProtocol.class,
            DatanodeProtocol.versionID, nnAddr, conf);
      setNameNode(dnp);

      // handshake with NN
      NamespaceInfo nsInfo = handshake();
      setNamespaceInfo(nsInfo);
      synchronized(DataNode.this){
        setupNSStorage();
      }
     
      nsRegistration.setIpcPort(ipcServer.getListenerAddress().getPort());
      nsRegistration.setInfoPort(infoServer.getPort());
    }
   
    void setupNSStorage() throws IOException {
      StartupOption startOpt = getStartupOption(conf);
      assert startOpt != null : "Startup option must be set.";

      boolean simulatedFSDataset =
        conf.getBoolean("dfs.datanode.simulateddatastorage", false);
     
      if (simulatedFSDataset) {
        nsRegistration.setStorageID(storage.getStorageID()); //same as DN
        nsRegistration.storageInfo.layoutVersion = FSConstants.LAYOUT_VERSION;
        nsRegistration.storageInfo.namespaceID = nsInfo.namespaceID;
      } else {
        // read storage info, lock data dirs and transition fs state if necessary     
        // first do it at the top level dataDirs
        // This is done only once when among all namespaces
        storage
            .recoverTransitionRead(DataNode.this, nsInfo, dataDirs, startOpt);
        // Then do it for this namespace's directory
        storage.recoverTransitionRead(DataNode.this, nsInfo.namespaceID,
            nsInfo, dataDirs, startOpt, nameserviceId);
       
        LOG.info("setting up storage: namespaceId="
            + namespaceId + ";lv=" + storage.layoutVersion + ";nsInfo="
            + nsInfo);

        nsRegistration.setStorageInfo(
            storage.getNStorage(nsInfo.namespaceID), storage.getStorageID());
        data.initialize(storage);
       
      }
      data.addNamespace(namespaceId, storage.getNameSpaceDataDir(namespaceId), conf);
      if (blockScanner != null) {
        blockScanner.start();
        blockScanner.addNamespace(namespaceId);
      }
    }
   

    /**
     * This method arranges for the data node to send a block report at
     * the next heartbeat.
     */
    @Override
    public void scheduleBlockReport(long delay) {
      if (delay > 0) { // send BR after random delay
        lastBlockReport = System.currentTimeMillis()
            - (blockReportInterval - R.nextInt((int)(delay)));
      } else { // send at next heartbeat
        lastBlockReport = lastHeartbeat - blockReportInterval;
      }
      resetBlockReportTime = true; // reset future BRs for randomness
    }
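    // Arithmetic of the delayed case above, with hypothetical numbers: for
    // blockReportInterval = 3600000 msec and delay = 60000, lastBlockReport is
    // backdated so that between 0 and 60 seconds remain before the next report;
    // with delay = 0 the next loop iteration sees the interval already expired.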
   
    /**
     * This method controls when blockReceivedAndDeleted reports are sent.
     * Used only for testing.
     */
    @Override
    public void scheduleBlockReceivedAndDeleted(long delay) {
      if (delay > 0) {
        lastDeletedReport = System.currentTimeMillis()
            - deletedReportInterval + delay;
      } else {
        lastDeletedReport = 0;
      }
    }

    @Override
    public void reportBadBlocks(LocatedBlock[] blocks) throws IOException{
      try {
        nsNamenode.reportBadBlocks(blocks);
      } catch (IOException e){
        /* One common reason is that NameNode could be in safe mode.
         * Should we keep on retrying in that case?
         */
        LOG.warn("Failed to report bad block to namenode : " +
                 " Exception : " + StringUtils.stringifyException(e));
        throw e;
      }
     
    }

    /*
     * Informing the name node could take a long long time! Should we wait
     * till namenode is informed before responding with success to the
     * client? For now we don't.
     */
    @Override
    public void notifyNamenodeReceivedBlock(Block block, String delHint) {
      if (block == null ) {
        throw new IllegalArgumentException("Block is null");
      }

      if (delHint != null && !delHint.isEmpty()) {
        block = new ReceivedBlockInfo(block, delHint);
      }

      synchronized (receivedAndDeletedBlockList) {
        receivedAndDeletedBlockList.add(block);
        pendingReceivedRequests++;
        receivedAndDeletedBlockList.notifyAll();
      }
    }

    @Override
    public void notifyNamenodeDeletedBlock(Block block) {
      if (block == null) {
        throw new IllegalArgumentException("Block is null");
      }

      // mark it as a deleted block
      DFSUtil.markAsDeleted(block);

      synchronized (receivedAndDeletedBlockList) {
        receivedAndDeletedBlockList.add(block);
      }
    }
   
    //This must be called only by namespaceManager
    @Override
    public void start() {
      if ((nsThread != null) && (nsThread.isAlive())) {
        //Thread is started already
        return;
      }
      nsThread = new Thread(this, dnThreadName);
      nsThread.setDaemon(true); // needed for JUnit testing
      nsThread.start();
    }
   
    @Override
    //This must be called only by namespaceManager.
    public void stop() {
      shouldServiceRun = false;
      if (keepAliveRun != null) {
        keepAliveRun.cancel(false);
      }
      if (keepAliveSender != null) {
        keepAliveSender.shutdownNow();
      }
      if (nsThread != null) {
        nsThread.interrupt();
      }
    }
   
    //This must be called only by namespaceManager
    @Override
    public void join() {
      try {
        if (nsThread != null) {
          nsThread.join();
        }
      } catch (InterruptedException ie) { }
    }
   
    //Cleanup method to be called by current thread before exiting.
    private void cleanUp() {
     
      if(upgradeManager != null)
        upgradeManager.shutdownUpgrade();
     
      namespaceManager.remove(this);
      if (keepAliveRun != null) {
        keepAliveRun.cancel(false);
      }
      if (keepAliveSender != null) {
        keepAliveSender.shutdownNow();
      }
      shouldServiceRun = false;
      RPC.stopProxy(nsNamenode);
      if (blockScanner != null) {
        blockScanner.removeNamespace(this.getNamespaceId());
      }
      if (data != null) {
        data.removeNamespace(this.getNamespaceId());
      }
      if (storage != null) {
        storage.removeNamespaceStorage(this.getNamespaceId());
      }
    }



    /**
     * Register one namespace with the corresponding NameNode
     * <p>
     * The nsDatanode needs to register with the namenode on startup in order
     * 1) to report which storage it is serving now and
     * 2) to receive a registrationID issued by the namenode to recognize
     *    registered datanodes.
     *
     * @see FSNamesystem#registerDatanode(DatanodeRegistration)
     * @throws IOException
     */
    void register() throws IOException {
      if (nsRegistration.getStorageID().equals("")) {
        nsRegistration.storageID = createNewStorageId(nsRegistration.getPort());
      }
      while(shouldRun && shouldServiceRun) {
        try {
          // reset name to machineName. Mainly for web interface.
          nsRegistration.setName(machineName + ":" + nsRegistration.getPort());       
          nsRegistration = nsNamenode.register(nsRegistration,
              DataTransferProtocol.DATA_TRANSFER_VERSION);
          break;
        } catch(RemoteException re) {
          String reClass = re.getClassName();
          if (UnregisteredDatanodeException.class.getName().equals(reClass) ||
              DisallowedDatanodeException.class.getName().equals(reClass) ||
              IncorrectVersionException.class.getName().equals(reClass)) {
            LOG.warn("DataNode is shutting down: " +
                     StringUtils.stringifyException(re));
            break;
          }
        } catch(Exception e) {  // namenode cannot be contacted
          LOG.info("Problem connecting to server: " + nnAddr.toString() +
                    StringUtils.stringifyException(e));
        }
        try {
          Thread.sleep(1000);
        } catch (InterruptedException ie) {}
      }
      assert ("".equals(storage.getStorageID())
              && !"".equals(nsRegistration.getStorageID()))
              || storage.getStorageID().equals(nsRegistration.getStorageID()) :
              "New storageID can be assigned only if data-node is not formatted";
             
      if (storage.getStorageID().equals("")) {
        storage.setStorageID(nsRegistration.getStorageID());
        storage.writeAll();
        LOG.info("New storage id " + nsRegistration.getStorageID()
            + " is assigned to data-node " + nsRegistration.getName());
      }
      if(! storage.getStorageID().equals(nsRegistration.getStorageID())) {
        throw new IOException("Inconsistent storage IDs. Name-node returned "
            + nsRegistration.getStorageID()
            + ". Expecting " + storage.getStorageID());
      }
     
      sendBlocksBeingWrittenReport(nsNamenode, namespaceId, nsRegistration);
      // random short delay - helps scatter the BR from all DNs
      scheduleBlockReport(initialBlockReportDelay);
    }


    /**
     * No matter what kind of exception we get, keep retrying offerService().
     * That's the loop that connects to the NameNode and provides basic DataNode
     * functionality.
     *
     * Only stop when "shouldRun" or "shouldServiceRun" is turned off, which can
     * happen either at shutdown or due to refreshNamenodes.
     */
    @Override
    public void run() {
      LOG.info(nsRegistration + "In NSOfferService.run, data = " + data
          + ";ns=" + namespaceId);
      try {
        // init stuff
        try {
          // setup storage
          setupNS(conf, dataDirs);
          register();
         
          KeepAliveHeartbeater keepAliveTask =
              new KeepAliveHeartbeater(nsNamenode, nsRegistration, this);
          keepAliveSender = Executors.newSingleThreadScheduledExecutor();
          keepAliveRun = keepAliveSender.scheduleAtFixedRate(keepAliveTask, 0,
                                                             heartBeatInterval,
                                                             TimeUnit.MILLISECONDS);
         
        } catch (IOException ioe) {
          // Initial handshake, storage recovery or registration failed
          // End NSOfferService thread
          LOG.info("--------- " + StringUtils.stringifyException(ioe));
          LOG.fatal(nsRegistration + " initialization failed for namespaceId "
              + namespaceId, ioe);
          return;
        }

        initialized = true;
        while (shouldRun && shouldServiceRun) {
          try {
            startDistributedUpgradeIfNeeded();
            offerService();
          } catch (Exception ex) {
            LOG.error("Exception: " + StringUtils.stringifyException(ex));
            if (shouldRun && shouldServiceRun) {
              try {
                Thread.sleep(5000);
              } catch (InterruptedException ie) {
                LOG.warn("Received exception: ", ie);
              }
            }
          }
        }
      } catch (Throwable ex) {
        LOG.warn("Unexpected exception " + StringUtils.stringifyException(ex));
      } finally {
        LOG.warn(nsRegistration + " ending namespace service for: "
            + namespaceId);
        cleanUp();
      }
    } 
   
    private void processDistributedUpgradeCommand(UpgradeCommand comm
                                                 ) throws IOException {
      assert upgradeManager != null : "DataNode.upgradeManager is null.";
      upgradeManager.processUpgradeCommand(comm);
    }

    @Override
    public synchronized UpgradeManagerDatanode getUpgradeManager() {
      if(upgradeManager == null)
        upgradeManager =
          new UpgradeManagerDatanode(DataNode.this, namespaceId);
     
      return upgradeManager;
    }
   
    /**
     * Start distributed upgrade if it should be initiated by the data-node.
     */
    private void startDistributedUpgradeIfNeeded() throws IOException {
      UpgradeManagerDatanode um = getUpgradeManager();
     
      if(!um.getUpgradeState())
        return;
      um.setUpgradeState(false, um.getUpgradeVersion());
      um.startUpgrade();
      return;
    }
   
    /** Block synchronization */
    @Override
    public LocatedBlock syncBlock(
      Block block, List<BlockRecord> syncList,
      boolean closeFile, List<InterDatanodeProtocol> datanodeProxies,
      long deadline
    )
      throws IOException {
      if (LOG.isDebugEnabled()) {
        LOG.debug("block=" + block + ", (length=" + block.getNumBytes()
            + "), syncList=" + syncList + ", closeFile=" + closeFile);
      }

      // syncList.isEmpty() means that none of the datanodes have the block,
      // so the block can be deleted.
      if (syncList.isEmpty()) {
        nsNamenode.commitBlockSynchronization(block, 0, 0, closeFile, true,
            DatanodeID.EMPTY_ARRAY);
        return null;
      }

      List<DatanodeID> successList = new ArrayList<DatanodeID>();

      throwIfAfterTime(deadline);
      long generationstamp = -1;
      try {
        generationstamp = nsNamenode.nextGenerationStamp(block, closeFile);
      } catch (RemoteException e) {
        if (e.unwrapRemoteException() instanceof BlockAlreadyCommittedException) {
          throw new BlockAlreadyCommittedException(e);
        } else {
          throw e;
        }
      }
      Block newblock = new Block(block.getBlockId(), block.getNumBytes(), generationstamp);

      for(BlockRecord r : syncList) {
        try {
          throwIfAfterTime(deadline);
          LOG.info("Updating block " + r + " to " + newblock);
          r.datanode.updateBlock(namespaceId, r.info.getBlock(), newblock, closeFile);
          successList.add(r.id);
        } catch (BlockRecoveryTimeoutException e) {
          throw e;
        } catch (IOException e) {
          InterDatanodeProtocol.LOG.warn("Failed to updateBlock (newblock="
              + newblock + ", datanode=" + r.id + ")", e);
        }
      }

      LOG.info("Updated blocks on syncList for block " + block + " to " + newblock);

      stopAllProxies(datanodeProxies);

      if (!successList.isEmpty()) {
        DatanodeID[] nlist = successList.toArray(new DatanodeID[successList.size()]);

        throwIfAfterTime(deadline);
        nsNamenode.commitBlockSynchronization(block,
            newblock.getGenerationStamp(), newblock.getNumBytes(), closeFile, false,
            nlist);
        DatanodeInfo[] info = new DatanodeInfo[nlist.length];
        for (int i = 0; i < nlist.length; i++) {
          info[i] = new DatanodeInfo(nlist[i]);
        }
        return new LocatedBlock(newblock, info); // success
      }

      //failed
      StringBuilder b = new StringBuilder();
      for(BlockRecord r : syncList) {
        b.append("\n  " + r.id);
      }
      throw new IOException("Cannot recover " + block + ", none of these "
          + syncList.size() + " datanodes success {" + b + "\n}");
    }

    @Override
    public DatanodeRegistration getNsRegistration() {
      return nsRegistration;
    }
   

  }
 
  /**
   * Manages the NSOfferService objects for the data node.
   * Creation, removal, starting, stopping, shutdown on NSOfferService
   * objects must be done via APIs in this class.
   */
  class NamespaceManager {
    private final Map<Integer, NamespaceService> nsMapping =
      new HashMap<Integer, NamespaceService>();
    protected final Map<InetSocketAddress, NamespaceService> nameNodeThreads =
      new HashMap<InetSocketAddress, NamespaceService>();
    //This lock is only used for refreshNamenodes method
    private final Object refreshNamenodesLock = new Object();
    NamespaceManager() {
    }
   
    NamespaceManager(Configuration conf, List<InetSocketAddress> nameNodeAddrs)
        throws IOException {
      Collection<String> nameserviceIds = DFSUtil.getNameServiceIds(conf);
      Iterator<String> it = nameserviceIds.iterator();
      for(InetSocketAddress nnAddr : nameNodeAddrs){
        String nameserviceId = it.hasNext() ? it.next() : null;
        NSOfferService nsos = new NSOfferService(nnAddr, nameserviceId);
        nameNodeThreads.put(nsos.getNNSocketAddress(), nsos);
      }
    }
   
    public boolean initialized() {
      for(NamespaceService nsos : nameNodeThreads.values()){
        if(!nsos.initialized()){
          return false;
        }
      }
      return true;
    }
   
    public boolean isAlive(int namespaceId) {
      NamespaceService nsos = nsMapping.get(namespaceId);
      if(nsos == null){
        return false;
      }
      return nsos.isAlive();
    }
   
    synchronized void addNamespace(NamespaceService t) {
      if (nameNodeThreads.get(t.getNNSocketAddress()) == null) {
        throw new IllegalArgumentException(
            "Unknown NSOfferService thread for namenode address:"
                + t.getNNSocketAddress());
      }
      nsMapping.put(t.getNamespaceId(), t);
    }
   
    /**
     * Returns the array of NSOfferService objects.
     * Caution: The NSOfferService returned could be shutdown any time.
     */
    synchronized NamespaceService[] getAllNamenodeThreads() {
      NamespaceService[] nsosArray = new NamespaceService[nameNodeThreads.values()
          .size()];
      return nameNodeThreads.values().toArray(nsosArray);
    }
   
    synchronized NamespaceService get(int namespaceId) {
      return nsMapping.get(namespaceId);
    }
   
    synchronized NamespaceService get(InetSocketAddress nameNodeAddr) {
      return nameNodeThreads.get(nameNodeAddr);
    }
   
    public synchronized void remove(NamespaceService t) {
      nameNodeThreads.remove(t.getNNSocketAddress());
      nsMapping.remove(t.getNamespaceId());
    }
   
    synchronized Integer[] getAllNamespaces(){
      return nsMapping.keySet().toArray(
          new Integer[nsMapping.keySet().size()]);
    }
   
    void shutDownAll() {

      NamespaceService[] nsosArray = this.getAllNamenodeThreads();
       
      for (NamespaceService nsos : nsosArray) {
        nsos.stop(); //interrupts the threads
      }
      //now join
      for (NamespaceService nsos : nsosArray) {
        nsos.join();
      }
    }
   
    void startAll() throws IOException {
      for (NamespaceService nsos : getAllNamenodeThreads()) {
        nsos.start();
      }
      isAlive = true;
    }
   
    void stopAll() {
      for (NamespaceService nsos : getAllNamenodeThreads()) {
        nsos.stop();
      }     
    }
   
    void joinAll() throws InterruptedException {
      for (NamespaceService nsos : getAllNamenodeThreads()) {
        nsos.join();
      }
    }
   
    void refreshNamenodes(List<InetSocketAddress> nameNodeAddrs, Configuration conf)
        throws IOException, InterruptedException{
      List<InetSocketAddress> toStart = new ArrayList<InetSocketAddress>();
      List<NamespaceService> toStop = new ArrayList<NamespaceService>();
      Collection<String> nameserviceIds = DFSUtil.getNameServiceIds(conf);
      List<String> toStartServiceIds = new ArrayList<String>();
      synchronized (refreshNamenodesLock) {
        synchronized (this) {
          for (InetSocketAddress nnAddr : nameNodeThreads.keySet()) {
            if (!nameNodeAddrs.contains(nnAddr)){
              toStop.add(nameNodeThreads.get(nnAddr));
            }
          }
          Iterator<String> it = nameserviceIds.iterator();
          for (InetSocketAddress nnAddr : nameNodeAddrs) {
            String nameserviceId = it.hasNext()? it.next(): null;
            if (!nameNodeThreads.containsKey(nnAddr)) {
              toStart.add(nnAddr);
              toStartServiceIds.add(nameserviceId);
            }
          }
         
          it = toStartServiceIds.iterator();
          for (InetSocketAddress nnAddr : toStart) {
            NSOfferService nsos = new NSOfferService(nnAddr, it.next());
            nameNodeThreads.put(nsos.getNNSocketAddress(), nsos);
          }
          for (NamespaceService nsos : toStop) {
            remove(nsos);
          }
        }
      }
      for (NamespaceService nsos : toStop) {
        nsos.stop();
      }
      startAll();
    }
  }
 
  /* ********************************************************************
  Protocol when a client reads data from Datanode (Cur Ver: 9):
 
  Client's Request :
  =================
  
     Processed in DataXceiver:
     +----------------------------------------------+
     | Common Header   | 1 byte OP == OP_READ_BLOCK |
     +----------------------------------------------+
    
     Processed in readBlock() :
     +-------------------------------------------------------------------------+
     | 8 byte Block ID | 8 byte genstamp | 8 byte start offset | 8 byte length |
     +-------------------------------------------------------------------------+
     |   vInt length   |  <DFSClient id> |
     +-----------------------------------+
    
     Client sends optional response only at the end of receiving data.
      
  DataNode Response :
  ===================
  
    In readBlock() :
    If there is an error while initializing BlockSender :
       +---------------------------+
       | 2 byte OP_STATUS_ERROR    | and connection will be closed.
       +---------------------------+
    Otherwise
       +---------------------------+
       | 2 byte OP_STATUS_SUCCESS  |
       +---------------------------+

    Actual data, sent by BlockSender.sendBlock() :

      ChecksumHeader :
      +--------------------------------------------------+
      | 1 byte CHECKSUM_TYPE | 4 byte BYTES_PER_CHECKSUM |
      +--------------------------------------------------+
      Followed by actual data in the form of PACKETS:
      +------------------------------------+
      | Sequence of data PACKETs ....      |
      +------------------------------------+

    A "PACKET" is defined further below.

    The client reads data until it receives a packet with
    "LastPacketInBlock" set to true or with a zero length. If there is
    no checksum error, it replies to DataNode with OP_STATUS_CHECKSUM_OK:

    Client optional response at the end of data transmission :
      +------------------------------+
      | 2 byte OP_STATUS_CHECKSUM_OK |
      +------------------------------+

    PACKET : Contains a packet header, checksum and data. Amount of data
    ======== carried is set by BUFFER_SIZE.

      +-----------------------------------------------------+
      | 4 byte packet length (excluding packet header)      |
      +-----------------------------------------------------+
      | 8 byte offset in the block | 8 byte sequence number |
      +-----------------------------------------------------+
      | 1 byte boolean set: isLastPacketInBlock | forceSync |
      +-----------------------------------------------------+
      | 4 byte Length of actual data                        |
      +-----------------------------------------------------+
      | x byte checksum data. x is defined below            |
      +-----------------------------------------------------+
      | actual data ......                                  |
      +-----------------------------------------------------+
     
      x = (length of data + BYTES_PER_CHECKSUM - 1)/BYTES_PER_CHECKSUM *
          CHECKSUM_SIZE
         
      CHECKSUM_SIZE depends on CHECKSUM_TYPE (usually, 4 for CRC32)
     
      The above packet format is used while writing data to DFS also.
      Not all the fields might be used while reading.
   
   ************************************************************************ */
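 
  /* Illustrative client-side sketch (not part of this class): framing the
   * read request documented above. It assumes a DataOutputStream to the
   * datanode, the OP_READ_BLOCK opcode from DataTransferProtocol, and that
   * the <DFSClient id> is written as a vInt-length-prefixed string, e.g.
   * via org.apache.hadoop.io.Text.writeString().
   *
   *   out.writeShort(DataTransferProtocol.DATA_TRANSFER_VERSION);
   *   out.writeByte(DataTransferProtocol.OP_READ_BLOCK);
   *   out.writeLong(block.getBlockId());          // 8 byte Block ID
   *   out.writeLong(block.getGenerationStamp());  // 8 byte genstamp
   *   out.writeLong(startOffset);                 // 8 byte start offset
   *   out.writeLong(length);                      // 8 byte length
   *   Text.writeString(out, clientName);          // vInt length + <DFSClient id>
   */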
 
  /** Header size for a packet */
  public static int PKT_HEADER_LEN = ( 4 + /* Packet payload length */
                                       8 + /* offset in block */
                                       8 + /* seqno */
                                       1   /* up to 8 boolean values field */ );
  public static byte isLastPacketInBlockMask = 0x01;
  public static byte forceSyncMask = 0x02;
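 
  /* A minimal illustrative sketch (not part of the original code): computing
   * the checksum-section size "x" and the packed 1-byte boolean field from
   * the packet layout documented above. bytesPerChecksum and checksumSize
   * are assumed to come from the negotiated ChecksumHeader (checksumSize is
   * usually 4, for CRC32). */
  static int packetChecksumLength(int dataLen, int bytesPerChecksum,
                                  int checksumSize) {
    // x = ceil(dataLen / bytesPerChecksum) * CHECKSUM_SIZE
    return ((dataLen + bytesPerChecksum - 1) / bytesPerChecksum) * checksumSize;
  }

  static byte packetBooleanField(boolean isLastPacketInBlock, boolean forceSync) {
    byte flags = 0;
    if (isLastPacketInBlock) {
      flags |= isLastPacketInBlockMask; // bit 0x01
    }
    if (forceSync) {
      flags |= forceSyncMask;           // bit 0x02
    }
    return flags;
  }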

 
  /**
   * Used for transferring a block of data.  This class
   * sends a piece of data to another DataNode.
   */
  class DataTransfer implements Callable<Boolean> {
    DatanodeInfo targets[];
    Block b;
    Block destinationBlock;
    DataNode datanode;
    private int srcNamespaceId;
    private int dstNamespaceId;

    /**
     * Connect to the first item in the target list.  Pass along the
     * entire target list, the block, and the data.
     */
    public DataTransfer(int namespaceId, DatanodeInfo targets[], Block b, DataNode datanode) throws IOException {
      // the source and destination blocks are the same for block replication
      this(targets, namespaceId, b, namespaceId, b, datanode);
    }

    /**
     * Connect to the first item in the target list.  Pass along the
     * entire target list, the block, and the data.
     */
    public DataTransfer(DatanodeInfo targets[], int srcNamespaceId, Block b,
                        int dstNamespaceId, Block destinationBlock,
                        DataNode datanode) throws IOException {
      this.targets = targets;
      this.b = b;
      this.destinationBlock = destinationBlock;
      this.datanode = datanode;
      this.srcNamespaceId = srcNamespaceId;
      this.dstNamespaceId = dstNamespaceId;
    }

    /**
     * Do the deed, write the bytes
     */
    public Boolean call() throws Exception {
      xmitsInProgress.getAndIncrement();
      Socket sock = null;
      DataOutputStream out = null;
      BlockSender blockSender = null;

      try {
        InetSocketAddress curTarget =
          NetUtils.createSocketAddr(targets[0].getName());
        sock = newSocket();
        NetUtils.connect(sock, curTarget, socketTimeout);
        sock.setSoTimeout(targets.length * socketTimeout);

        long writeTimeout = socketWriteTimeout + socketWriteExtentionTimeout
            * (targets.length - 1);
        OutputStream baseStream = NetUtils.getOutputStream(sock, writeTimeout);
        out = new DataOutputStream(new BufferedOutputStream(baseStream,
                                                            SMALL_BUFFER_SIZE));

        blockSender = new BlockSender(srcNamespaceId, b, 0, b.getNumBytes(),
            false, false, false, datanode);
        DatanodeInfo srcNode = new DatanodeInfo(getDNRegistrationForNS(srcNamespaceId));

        //
        // Header info
        //
        WriteBlockHeader header = new WriteBlockHeader(
            DataTransferProtocol.DATA_TRANSFER_VERSION, dstNamespaceId,
            destinationBlock.getBlockId(),
            destinationBlock.getGenerationStamp(), 0, false, true, srcNode,
            targets.length - 1, targets, "");
        header.writeVersionAndOpCode(out);
        header.write(out);

        // send data & checksum
        blockSender.sendBlock(out, baseStream, null);

        // no response necessary
        LOG.info(getDatanodeInfo() + ":Transmitted block " + b + " at " + srcNamespaceId + " to " + curTarget);

      } catch (IOException ie) {
        LOG.warn(getDatanodeInfo() + ":Failed to transfer " + b + " at " + srcNamespaceId + " to " + targets[0].getName()
            + " got " + StringUtils.stringifyException(ie));
        // check if there are any disk problem
        try{
          datanode.checkDiskError();
        } catch (IOException e) {
          LOG.warn("Error when checking disks : " + StringUtils.stringifyException(e));
          throw e;
        }
        throw ie;
      } finally {
        xmitsInProgress.getAndDecrement();
        IOUtils.closeStream(blockSender);
        IOUtils.closeStream(out);
        IOUtils.closeSocket(sock);
      }
      return true;
    }
  }

  /**
   * Initializes the {@link #data}. The initialization is done only once, when
   * handshake with the first namenode is completed.
   */
  private synchronized void initFsDataSet(Configuration conf,
      AbstractList<File> dataDirs, int numNamespaces) throws IOException {
    if (data != null) { // Already initialized
      return;
    }

    // get version and id info from the name-node
    boolean simulatedFSDataset =
      conf.getBoolean("dfs.datanode.simulateddatastorage", false);

    if (simulatedFSDataset) {
      storage.createStorageID(selfAddr.getPort());
      // it would have been better to pass storage as a parameter to
      // constructor below - need to augment ReflectionUtils used below.
      conf.set("dfs.datanode.StorageId", storage.getStorageID());
      try {
        data = (FSDatasetInterface) ReflectionUtils.newInstance(
            Class.forName(
            "org.apache.hadoop.hdfs.server.datanode.SimulatedFSDataset"),
            conf);
      } catch (ClassNotFoundException e) {
        throw new IOException(StringUtils.stringifyException(e));
      }
    } else {
      data = new FSDataset(this, conf, numNamespaces);
    }
  }
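 
  /* Illustrative test-configuration sketch (not part of this class): the
   * simulated dataset above is selected purely through configuration, e.g.
   *
   *   Configuration conf = new Configuration();
   *   conf.setBoolean("dfs.datanode.simulateddatastorage", true);
   *   DataNode dn = DataNode.createDataNode(new String[]{}, conf);
   */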

  public static class KeepAliveHeartbeater implements Runnable {

   private DatanodeProtocol namenode;
   private DatanodeRegistration dnRegistration;
   private NamespaceService ns;

   public KeepAliveHeartbeater(DatanodeProtocol namenode,
       DatanodeRegistration dnRegistration,
       NamespaceService ns) {
     this.namenode = namenode;
     this.dnRegistration = dnRegistration;
     this.ns = ns;
   }
  
   public void run() {
     try {
       namenode.keepAlive(dnRegistration);
       ns.lastBeingAlive = now();
       LOG.debug("Sent heartbeat at " + ns.lastBeingAlive);
     } catch (Throwable ex) {
       LOG.error("Error sending keepAlive to the namenode", ex);
     }
   }
  }
 
  /** Start the datanode daemon: all namespace services, the dataXceiver
   *  server and the IPC server. This method does not block.
   */
  public void runDatanodeDaemon() throws IOException {
    namespaceManager.startAll();
    // start dataXceiveServer
    dataXceiverServer.start();
    ipcServer.start();
  }

  public static boolean isDatanodeUp(DataNode dn) {
    return dn.isDatanodeUp();
  }
 
  /**
   * @return true if any namespace thread is alive
   */
  public boolean isDatanodeUp() {
    for (NamespaceService nsos: namespaceManager.getAllNamenodeThreads()) {
      if (nsos != null && nsos.isAlive()) {
        return true;
      }
    }
    return false;
  }
 
  /**
   * @return true if any namespace thread has heartbeat with namenode recently
   */
  public boolean isDataNodeBeingAlive() {
    for (NamespaceService nsos: namespaceManager.getAllNamenodeThreads()) {
      if (nsos != null &&
          nsos.lastBeingAlive >= now() - heartbeatExpireInterval) {
        return true;
      }
    }
    return false;
  }
 
  /**
  * @return true - if the data node is initialized
  */
  public boolean isInitialized() {
    for (NamespaceService nsos : namespaceManager.getAllNamenodeThreads()) {
      if (!nsos.initialized() || !nsos.isAlive()) {
        return false;
      }
    }
    return true;
  }
 
  /**
   * @param nameNodeAddr the namenode address
   * @return true if the NSOfferService thread for the given namenode address is initialized
   * @throws IOException when the NSOfferService is dead
   */
  public synchronized boolean initialized(InetSocketAddress nameNodeAddr) throws IOException{
    NamespaceService nsos = namespaceManager.get(nameNodeAddr);
    if (nsos == null) {
      throw new IOException("NSOfferService for namenode " +
          nameNodeAddr.getAddress() + " is dead.");
    }
    return nsos.initialized();
  }
 
  /** Instantiate a single datanode object. This must be followed by a call
   *  to {@link DataNode#runDatanodeDaemon()} to start it.
   */
  public static DataNode instantiateDataNode(String args[],
                                      Configuration conf) throws IOException {
    if (conf == null)
      conf = new Configuration();
    if (!parseArguments(args, conf)) {
      printUsage();
      return null;
    }
    if (conf.get("dfs.network.script") != null) {
      LOG.error("This configuration for rack identification is not supported" +
          " anymore. RackID resolution is handled by the NameNode.");
      System.exit(-1);
    }
    String[] dataDirs = conf.getStrings("dfs.data.dir");
    dnThreadName = "DataNode: [" +
                        StringUtils.arrayToString(dataDirs) + "]";
    return makeInstance(dataDirs, conf);
  }

  /** Instantiate & Start a single datanode daemon and wait for it to finish.
   *  If this thread is specifically interrupted, it will stop waiting.
   */
  public static DataNode createDataNode(String args[], Configuration conf)
    throws IOException {
    DataNode dn = instantiateDataNode(args, conf);
    if (dn != null) {
      dn.runDatanodeDaemon();
    }
    return dn;
  }

  void join() {
    while (shouldRun) {
      try {
        namespaceManager.joinAll();
        NamespaceService[] namespaceServices = namespaceManager.getAllNamenodeThreads();
        if (namespaceServices == null || namespaceServices.length == 0) {
          shouldRun = false;
          isAlive = false;
        }
        Thread.sleep(2000);
      } catch (InterruptedException ex) {
        LOG.warn("Received exception in Datanode#join: " + ex);
      }
    }
  }

  /**
   * Make an instance of DataNode after ensuring that at least one of the
   * given data directories (and their parent directories, if necessary)
   * can be created.
   * @param dataDirs List of directories, where the new DataNode instance should
   * keep its files.
   * @param conf Configuration instance to use.
   * @return DataNode instance for given list of data dirs and conf, or null if
   * no directory from this directory list can be created.
   * @throws IOException
   */
  public static DataNode makeInstance(String[] dataDirs, Configuration conf)
    throws IOException {
    ArrayList<File> dirs = new ArrayList<File>();
    for (int i = 0; i < dataDirs.length; i++) {
      File data = new File(dataDirs[i]);
      try {
        DiskChecker.checkDir(data);
        dirs.add(data);
      } catch(DiskErrorException e) {
        LOG.warn("Invalid directory in dfs.data.dir: " + e.getMessage());
      }
    }
    if (dirs.size() > 0)
      return new DataNode(conf, dirs);
    LOG.error("All directories in dfs.data.dir are invalid.");
    return null;
  }

  @Override
  public String toString() {
    return "DataNode{" +
      "data=" + data +
      ", localName='" + getDatanodeInfo() + "'" +
      ", xmitsInProgress=" + xmitsInProgress.get() +
      "}";
  }

  private static void printUsage() {
    System.err.println("Usage: java DataNode");
    System.err.println("           [-rollback]");
  }

  /**
   * Parse and verify command line arguments and set configuration parameters.
   *
   * @return false if the passed arguments are incorrect
   */
  private static boolean parseArguments(String args[],
                                        Configuration conf) {
    int argsLen = (args == null) ? 0 : args.length;
    StartupOption startOpt = StartupOption.REGULAR;
    for(int i=0; i < argsLen; i++) {
      String cmd = args[i];
      if ("-r".equalsIgnoreCase(cmd) || "--rack".equalsIgnoreCase(cmd)) {
        LOG.error("-r, --rack arguments are not supported anymore. RackID " +
            "resolution is handled by the NameNode.");
        System.exit(-1);
      } else if ("-rollback".equalsIgnoreCase(cmd)) {
        startOpt = StartupOption.ROLLBACK;
      } else if ("-regular".equalsIgnoreCase(cmd)) {
        startOpt = StartupOption.REGULAR;
      } else if ("-d".equalsIgnoreCase(cmd)) {
        ++i;
        if(i >= argsLen) {
          LOG.error("-D option requires following argument.");
          System.exit(-1);
        }
        String[] keyval = args[i].split("=", 2);
        if (keyval.length == 2) {
          conf.set(keyval[0], keyval[1]);
        } else {
          LOG.error("-D option invalid (expected =): " + args[i]);
          System.exit(-1);
        }
      } else
        return false;
    }
    setStartupOption(conf, startOpt);
    return true;
  }
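 
  /* Example invocations accepted by parseArguments() above (illustrative):
   *
   *   java DataNode -rollback
   *   java DataNode -D dfs.data.dir=/data/1,/data/2
   *
   * Note that -D and its key=value pair are two separate arguments; any
   * unrecognized argument makes the method return false, which triggers
   * printUsage() in instantiateDataNode().
   */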

  private static void setStartupOption(Configuration conf, StartupOption opt) {
    conf.set("dfs.datanode.startup", opt.toString());
  }

  static StartupOption getStartupOption(Configuration conf) {
    return StartupOption.valueOf(conf.get("dfs.datanode.startup",
                                          StartupOption.REGULAR.toString()));
  }


  /**
   * This method is used for testing.
   * Examples are adding and deleting blocks directly.
   * The most common usage will be when the data node's storage is simulated.
   *
   * @return the fsdataset that stores the blocks
   */
  public FSDatasetInterface getFSDataset() {
    return data;
  }

  /** Wait for the datanode to exit and clean up all its resources */
  public void waitAndShutdown() {
    join();
    // make sure all other threads have exited even if
    // offerservice thread died abnormally
    shutdown();
  }

  /**
   */
  public static void main(String args[]) {
    try {
      StringUtils.startupShutdownMessage(DataNode.class, args, LOG);
      DataNode datanode = createDataNode(args, null);
      if (datanode != null) {
        datanode.waitAndShutdown();
      }
    } catch (Throwable e) {
      LOG.error(StringUtils.stringifyException(e));
      System.exit(-1);
    }
  }
 
  private void transferBlock(int namespaceId, Block block,
      DatanodeInfo xferTargets[]) throws IOException {
    DatanodeProtocol nn = getNSNamenode(namespaceId);
    DatanodeRegistration nsReg = getDNRegistrationForNS(namespaceId);

    if (!data.isValidBlock(namespaceId, block, true)) {
      // block does not exist or is under-construction
      String errStr = "Can't send invalid block " + block;
      LOG.info(errStr);
      nn.errorReport(nsReg, DatanodeProtocol.INVALID_BLOCK, errStr);
      return;
    }

    // Check if NN recorded length matches on-disk length
    long onDiskLength = data.getFinalizedBlockLength(namespaceId, block);
    if (block.getNumBytes() > onDiskLength) {
      // Shorter on-disk len indicates corruption so report NN the corrupt block
      nn.reportBadBlocks(new LocatedBlock[] { new LocatedBlock(block,
          new DatanodeInfo[] { new DatanodeInfo(nsReg) }) });
      LOG.info("Can't replicate block " + block + " because on-disk length "
          + onDiskLength + " is shorter than NameNode recorded length "
          + block.getNumBytes());
      return;
    }

    int numTargets = xferTargets.length;
    if (numTargets > 0) {
      if (LOG.isInfoEnabled()) {
        StringBuilder xfersBuilder = new StringBuilder();
        for (int i = 0; i < numTargets; i++) {
          xfersBuilder.append(xferTargets[i].getName());
          xfersBuilder.append(" ");
        }
        LOG.info(nsReg + " Starting thread to transfer block " + block + " to "
            + xfersBuilder);
      }

      blockCopyExecutor.submit(new DataTransfer(namespaceId, xferTargets, block, this));
    }
  }

  void transferBlocks(int namespaceId, Block blocks[],
      DatanodeInfo xferTargets[][]) {
    for (int i = 0; i < blocks.length; i++) {
      try {
        transferBlock(namespaceId, blocks[i], xferTargets[i]);
      } catch (IOException ie) {
        LOG.warn("Failed to transfer block " + blocks[i], ie);
      }
    }
  }

  protected void notifyNamenodeReceivedBlock(int namespaceId, Block block,
      String delHint) throws IOException {
    if (block == null) {
      throw new IllegalArgumentException("Block is null");
    }
    NamespaceService nsos = namespaceManager.get(namespaceId);
    if (nsos == null || nsos.getDatanodeProtocol() == null) {
      throw new IOException("Cannot locate OfferService thread for namespace="
          + namespaceId);
    }
    nsos.notifyNamenodeReceivedBlock(block, delHint);
  }

  protected void notifyNamenodeDeletedBlock(int namespaceId, Block block)
      throws IOException {
    if (block == null) {
      throw new IllegalArgumentException("Block is null");
    }
    NamespaceService nsos = namespaceManager.get(namespaceId);
    if (nsos == null || nsos.getDatanodeProtocol() == null) {
      throw new IOException("Cannot locate OfferService thread for namespace="
          + namespaceId);
    }
    nsos.notifyNamenodeDeletedBlock(block);
  }

  // InterDataNodeProtocol implementation
  // THIS METHOD IS ONLY USED FOR UNIT TESTS
  /** {@inheritDoc} */
  public BlockMetaDataInfo getBlockMetaDataInfo(int namespaceId, Block block
      ) throws IOException {
    if (LOG.isDebugEnabled()) {
      LOG.debug("block=" + block);
    }

    Block stored = data.getStoredBlock(namespaceId, block.getBlockId());

    if (stored == null) {
      return null;
    }
    BlockMetaDataInfo info = new BlockMetaDataInfo(stored,
                                 blockScanner.getLastScanTime(namespaceId, stored));
    if (LOG.isDebugEnabled()) {
      LOG.debug("getBlockMetaDataInfo successful block=" + stored +
                " length " + stored.getNumBytes() +
                " genstamp " + stored.getGenerationStamp());
    }

    // paranoia! verify that the contents of the stored block
    // match the block file on disk.
   
    data.validateBlockMetadata(namespaceId, stored);
    return info;
  }

  @Override
  public BlockRecoveryInfo startBlockRecovery(int namespaceId, Block block) throws IOException {
    return data.startBlockRecovery(namespaceId, block.getBlockId());
  }

  public Daemon recoverBlocks(final int namespaceId, final Block[] blocks, final DatanodeInfo[][] targets) {
    Daemon d = new Daemon(threadGroup, new Runnable() {
      /** Recover a list of blocks. It is run by the primary datanode. */
      public void run() {
        for(int i = 0; i < blocks.length; i++) {
          try {
            logRecoverBlock("NameNode", namespaceId, blocks[i], targets[i]);
            recoverBlock(namespaceId, blocks[i], false, targets[i], true, 0);
          } catch (IOException e) {
            LOG.warn("recoverBlocks FAILED, blocks[" + i + "]=" + blocks[i], e);
          }
        }
      }
    });
    d.start();
    return d;
  }

  /** {@inheritDoc} */
  public void updateBlock(int namespaceId, Block oldblock, Block newblock, boolean finalize) throws IOException {
    LOG.info("namespaceId: " + namespaceId
        + ", oldblock=" + oldblock + "(length=" + oldblock.getNumBytes()
        + "), newblock=" + newblock + "(length=" + newblock.getNumBytes()
        + "), datanode=" + getDatanodeInfo());
    data.updateBlock(namespaceId, oldblock, newblock);
    if (finalize) {
      data.finalizeBlockIfNeeded(namespaceId, newblock);
      myMetrics.blocksWritten.inc();
      notifyNamenodeReceivedBlock(namespaceId, newblock, null);
      LOG.info("Received block " + newblock +
                " of size " + newblock.getNumBytes() +
                " as part of lease recovery.");
    }
  }

  /** {@inheritDoc} */
  public long getProtocolVersion(String protocol, long clientVersion
      ) throws IOException {
    if (protocol.equals(InterDatanodeProtocol.class.getName())) {
      return InterDatanodeProtocol.versionID;
    } else if (protocol.equals(ClientDatanodeProtocol.class.getName())) {
      checkVersion(protocol, clientVersion, ClientDatanodeProtocol.versionID);
      return ClientDatanodeProtocol.versionID;
    }
    throw new IOException("Unknown protocol to " + getClass().getSimpleName()
        + ": " + protocol);
  }

  /** {@inheritDoc} */
  public BlockPathInfo getBlockPathInfo(Block block) throws IOException {
    return getBlockPathInfo(getAllNamespaces()[0], block);
  }
 
  @Override
  public BlockPathInfo getBlockPathInfo(int namespaceId, Block block) throws IOException {
    File datafile = data.getBlockFile(namespaceId, block);
    File metafile = FSDataset.getMetaFile(datafile, block);
    BlockPathInfo info = new BlockPathInfo(block, datafile.getAbsolutePath(),
                                           metafile.getAbsolutePath());
    if (LOG.isDebugEnabled()) {
      LOG.debug("getBlockPathInfo successful block=" + block +
                " blockfile " + datafile.getAbsolutePath() +
                " metafile " + metafile.getAbsolutePath());
    }
    return info;
  }

  public ProtocolSignature getProtocolSignature(String protocol,
      long clientVersion, int clientMethodsHash) throws IOException {
    return ProtocolSignature.getProtocolSignature(
        this, protocol, clientVersion, clientMethodsHash);
  }

  private void checkVersion(String protocol, long clientVersion,
    long serverVersion) throws IOException {
    if (serverVersion > clientVersion &&
       !ProtocolCompatible.isCompatibleClientDatanodeProtocol(
              clientVersion, serverVersion)) {
      throw new RPC.VersionIncompatible(protocol, clientVersion, serverVersion);
    }
  }

  /** A convenient class used in lease recovery */
  static class BlockRecord {
    final DatanodeID id;
    final InterDatanodeProtocol datanode;
    final BlockRecoveryInfo info;
   
    BlockRecord(DatanodeID id, InterDatanodeProtocol datanode,
        BlockRecoveryInfo info) {
      this.id = id;
      this.datanode = datanode;
      this.info = info;
    }

    /** {@inheritDoc} */
    public String toString() {
      return "BlockRecord(info=" + info + " node=" + id + ")";
    }
  }
 
  static public class BlockRecoveryTimeoutException extends IOException {
    /**
     *
     */
    private static final long serialVersionUID = 7887035511587861524L;

    public BlockRecoveryTimeoutException (String msg) {
      super (msg);
    }
  }

  static public void throwIfAfterTime(long timeoutTime) throws IOException {
    if (timeoutTime > 0 && System.currentTimeMillis() > timeoutTime) {
      throw new BlockRecoveryTimeoutException("The client has timed out.");
    }
  }
 
  /** Recover a block.
   * @param keepLength if true, only recover replicas that have the same length
   * as the block passed in. Otherwise, calculate the minimum length of the
   * replicas and truncate the rest to that length (e.g. replicas of lengths
   * 4096, 3584 and 4096 are all truncated to 3584).
   **/
  private LocatedBlock recoverBlock(int namespaceId, Block block, boolean keepLength,
      DatanodeID[] datanodeids, boolean closeFile, long deadline) throws IOException {
    // If the block is already being recovered, then skip recovering it.
    // This can happen if the namenode and client start recovering the same
    // file at the same time.
    synchronized (ongoingRecovery) {
      Block tmp = new Block();
      tmp.set(block.getBlockId(), block.getNumBytes(), GenerationStamp.WILDCARD_STAMP);
      if (ongoingRecovery.get(tmp) != null) {
        String msg = "Block " + block + " is already being recovered, " +
                     " ignoring this request to recover it.";
        LOG.info(msg);
        throw new IOException(msg);
      }
      ongoingRecovery.put(block, block);
    }
    try {
      int errorCount = 0;

      // Number of "replicasBeingWritten" in 0.21 parlance - these are replicas
      // on DNs that are still alive from when the write was happening
      int rbwCount = 0;
      // Number of "replicasWaitingRecovery" in 0.21 parlance - these replicas
      // have survived a DN restart, and thus might be truncated (eg if the
      // DN died because of a machine power failure, and when the ext3 journal
      // replayed, it truncated the file).
      int rwrCount = 0;

      List<BlockRecord> blockRecords = new ArrayList<BlockRecord>();
      List<InterDatanodeProtocol> datanodeProxies =
        new ArrayList<InterDatanodeProtocol>();
      //check generation stamps
      for(DatanodeID id : datanodeids) {
        try {
          InterDatanodeProtocol datanode;
          if (getDNRegistrationForNS(namespaceId).equals(id)) {
            LOG.info("Skipping IDNPP creation for local id " + id
                + " when recovering " + block);
            datanode = this;
          } else {
            LOG.info("Creating IDNPP for non-local id " + id + " (dnReg="
                + getDNRegistrationForNS(namespaceId) + ") when recovering "
                + block);
            datanode = DataNode.createInterDataNodeProtocolProxy(
                id, getConf(), socketTimeout);
            datanodeProxies.add(datanode);
          }
          throwIfAfterTime(deadline);
          BlockRecoveryInfo info = datanode.startBlockRecovery(namespaceId, block);
          if (info == null) {
            LOG.info("No block metadata found for block " + block + " on datanode "
                + id);
            continue;
          }
          if (info.getBlock().getGenerationStamp() < block.getGenerationStamp()) {
            LOG.info("Only old generation stamp " + info.getBlock().getGenerationStamp()
                + " found on datanode " + id + " (needed block=" +
                block + ")");
            continue;
          }
          blockRecords.add(new BlockRecord(id, datanode, info));

          if (info.wasRecoveredOnStartup()) {
            rwrCount++;
          } else {
            rbwCount++;
          }
        } catch (BlockRecoveryTimeoutException e) {
          throw e;
        } catch (IOException e) {
          ++errorCount;
          InterDatanodeProtocol.LOG.warn(
              "Failed to getBlockMetaDataInfo for block (=" + block
              + ") from datanode (=" + id + ")", e);
        }
      }

      // If we *only* have replicas from post-DN-restart, then we should
      // include them in determining length. Otherwise they might cause us
      // to truncate too short.
      boolean shouldRecoverRwrs = (rbwCount == 0);

      List<BlockRecord> syncList = new ArrayList<BlockRecord>();
      long minlength = Long.MAX_VALUE;

      for (BlockRecord record : blockRecords) {
        BlockRecoveryInfo info = record.info;
        assert (info != null && info.getBlock().getGenerationStamp() >= block.getGenerationStamp());
        if (!shouldRecoverRwrs && info.wasRecoveredOnStartup()) {
          LOG.info("Not recovering replica " + record + " since it was recovered on "
              + "startup and we have better replicas");
          continue;
        }
        if (keepLength) {
          if (info.getBlock().getNumBytes() == block.getNumBytes()) {
            syncList.add(record);
          }
        } else {
          syncList.add(record);
          if (info.getBlock().getNumBytes() < minlength) {
            minlength = info.getBlock().getNumBytes();
          }
        }
      }

      if (syncList.isEmpty() && errorCount > 0) {
        stopAllProxies(datanodeProxies);
        throw new IOException("All datanodes failed: block=" + block
            + ", datanodeids=" + Arrays.asList(datanodeids));
      }
      if (!keepLength) {
        block.setNumBytes(minlength);
      }
      return syncBlock(namespaceId, block, syncList, closeFile,
          datanodeProxies, deadline);
    } finally {
      synchronized (ongoingRecovery) {
        ongoingRecovery.remove(block);
      }
    }
  }

  protected void stopAllProxies(List<InterDatanodeProtocol> datanodeProxies) {
    // safe to stop proxies now
    for (InterDatanodeProtocol proxy : datanodeProxies) {
      stopDatanodeProxy(proxy);
    }
  }
 
  private void stopDatanodeProxy(InterDatanodeProtocol datanode) {
    // if this is a proxy instance, close it
    if (Proxy.isProxyClass(datanode.getClass())) {
      RPC.stopProxy(datanode);
    }
  }

  /** Block synchronization */
  private LocatedBlock syncBlock(int namespaceId, Block block,
      List<BlockRecord> syncList, boolean closeFile,
      List<InterDatanodeProtocol> datanodeProxies, long deadline)
      throws IOException {
    return namespaceManager.get(namespaceId).syncBlock(block, syncList,
        closeFile, datanodeProxies, deadline);
  }

  // ClientDataNodeProtocol implementation
  /** {@inheritDoc} */
  public LocatedBlock recoverBlock(Block block, boolean keepLength,
      DatanodeInfo[] targets) throws IOException {
    // old client: use default namespace
    return recoverBlock(getAllNamespaces()[0], block, keepLength, targets);
  }

  @Override
  public LocatedBlock recoverBlock(int namespaceId, Block block,
      boolean keepLength, DatanodeInfo[] targets, long deadline)
      throws IOException {
    logRecoverBlock("Client", namespaceId, block, targets);
    return recoverBlock(namespaceId, block, keepLength, targets, false,
        deadline);
  }

  @Override
  public LocatedBlock recoverBlock(int namespaceId, Block block,
      boolean keepLength, DatanodeInfo[] targets) throws IOException {
    logRecoverBlock("Client", namespaceId, block, targets);
    return recoverBlock(namespaceId, block, keepLength, targets, false, 0);
  }

  /** {@inheritDoc} */
  public Block getBlockInfo(Block block) throws IOException {
    // old client: use default namespace
    return getBlockInfo(getAllNamespaces()[0], block);
  }
 
  @Override
  public Block getBlockInfo(int namespaceId, Block block) throws IOException {
   
    return data.getStoredBlock(namespaceId, block.getBlockId());
  }

  @Override
  public void copyBlockLocal(String srcFileSystem,
      int srcNamespaceId, Block srcBlock,
      int dstNamespaceId, Block dstBlock, String srcBlockFilePath)
      throws IOException {
    File srcBlockFile = new File(srcBlockFilePath);
    if (!srcBlockFile.exists()) {
      throw new FileNotFoundException("File " + srcBlockFilePath
          + " could not be found");
    }
    blockCopyExecutor.submit(new LocalBlockCopy(srcFileSystem,
        srcNamespaceId, srcBlock,
        dstNamespaceId, dstBlock, true, srcBlockFile));
  }

  @Override
  public void copyBlock(Block srcBlock, Block destinationBlock,
      DatanodeInfo target) throws IOException {
    copyBlock(srcBlock, destinationBlock, target, true);
  }

  @Override
  public void copyBlock(Block srcBlock, Block destinationBlock,
      DatanodeInfo target, boolean async) throws IOException {
    throw new IOException(
        "Please upgrade your fastcopy tool to work with federated " +
        "HDFS clusters.");
  }
 
  @Override
  public void copyBlock(int srcNamespaceId, Block srcBlock, int dstNamespaceId,
      Block destinationBlock, DatanodeInfo target)
    throws IOException {
    copyBlock(srcNamespaceId, srcBlock,
        dstNamespaceId, destinationBlock, target, true);
  }
 
  @Override
  public void copyBlock(int srcNamespaceId, Block srcBlock, int dstNamespaceId,
      Block destinationBlock, DatanodeInfo target, boolean async)
      throws IOException {

    if (!data.isValidBlock(srcNamespaceId, srcBlock, true)) {
      // block does not exist or is under-construction
      String errStr = "copyBlock: Can't send invalid block " + srcBlock
                    + " at " + srcNamespaceId;
      LOG.info(errStr);
      throw new IOException(errStr);
    }

    // Check if specified length matches on-disk length
    long onDiskLength = data.getFinalizedBlockLength(srcNamespaceId, srcBlock);
    if (srcBlock.getNumBytes() > onDiskLength) {
      // Shorter on-disk len indicates corruption so report NN the corrupt block
      String msg = "copyBlock: Can't replicate block " + srcBlock
          + " at " + srcNamespaceId
          + " because on-disk length " + onDiskLength
          + " is shorter than provided length " + srcBlock.getNumBytes();
      LOG.info(msg);
      throw new IOException(msg);
    }

    LOG.info(getDatanodeInfo() + " copyBlock: Starting thread to transfer: " +
             "srcNamespaceId: " + srcNamespaceId + " block: " +
             srcBlock + " to " + target.getName());
    DatanodeInfo[] targets = new DatanodeInfo[1];
    targets[0] = target;

    // Use host and port number to determine locality, relying on the
    // DatanodeID of both the target machine and the local machine.
    // This guarantees uniformity in the comparison.
    String targetMachine = target.getHost();
    int targetPort = target.getPort();
    DatanodeRegistration dnRegistration = getDNRegistrationForNS(srcNamespaceId);
    int localPort = dnRegistration.getPort();
    String localMachine = dnRegistration.getHost();

    Future<Boolean> result;
    // If the target datanode is our datanode itself, then perform local copy.
    if (targetMachine.equals(localMachine) && targetPort == localPort) {
      LOG.info("Performing local block copy since source and "
          + "destination datanodes are same for  block "
          + srcBlock.getBlockName());
      result = blockCopyExecutor.submit(new LocalBlockCopy(srcNamespaceId,
          srcBlock, dstNamespaceId, destinationBlock));
    } else if (targetMachine.equals(localMachine)) {
      LOG.info("Performing cross datanode local block copy since source " +
          "and destination hosts are same for block "
          + srcBlock.getBlockName());
      result = blockCopyExecutor.submit(new CrossDatanodeLocalBlockCopy(
          srcNamespaceId, srcBlock, dstNamespaceId, destinationBlock, target));
    } else {
      result = blockCopyExecutor.submit(new DataTransfer(targets, srcNamespaceId, srcBlock,
          dstNamespaceId, destinationBlock, this));
    }

    // If this is not an async request, wait for the task to complete; if the
    // task fails, this will throw an exception that is propagated to the
    // client.
    if (!async) {
      try {
        // Wait for up to blockCopyRPCWaitTime seconds.
        result.get(this.blockCopyRPCWaitTime, TimeUnit.SECONDS);
      } catch (Exception e) {
        LOG.error(e);
        throw new IOException(e);
      }
    }
  }

  private static void logRecoverBlock(String who, int namespaceId,
      Block block, DatanodeID[] targets) {
    StringBuilder msg = new StringBuilder(targets[0].getName());
    for (int i = 1; i < targets.length; i++) {
      msg.append(", " + targets[i].getName());
    }
    LOG.info(who + " calls recoverBlock(namespace_id =" + namespaceId +
        " block=" + block
        + ", targets=[" + msg + "])");
  }

  class CrossDatanodeLocalBlockCopy implements Callable<Boolean> {
    private final int srcNamespaceId;
    private final Block srcBlock;
    private final int dstNamespaceId;
    private Block dstBlock;
    private final DatanodeInfo target;
    private final String srcFileSystem;

    public CrossDatanodeLocalBlockCopy(int srcNamespaceId, Block srcBlock,
        int dstNamespaceId, Block dstBlock,
        DatanodeInfo target) throws IOException {
      this.srcNamespaceId = srcNamespaceId;
      this.srcBlock = srcBlock;
      this.dstNamespaceId = dstNamespaceId;
      this.dstBlock = dstBlock;
      this.target = target;
      this.srcFileSystem = data.getFileSystemForBlock(srcNamespaceId, srcBlock);
    }

    public Boolean call() throws Exception {
      InterDatanodeProtocol remoteDatanode = null;
      try {
        File srcBlockFile = data.getBlockFile(srcNamespaceId, srcBlock);
        remoteDatanode = DataNode
            .createInterDataNodeProtocolProxy(target, getConf(), socketTimeout);
        remoteDatanode.copyBlockLocal(srcFileSystem, srcNamespaceId, srcBlock,
            dstNamespaceId, dstBlock,
            srcBlockFile.getAbsolutePath());
      } catch (IOException e) {
        LOG.warn("Cross datanode local block copy failed", e);
        throw e;
      } finally {
        if (remoteDatanode != null) {
          stopDatanodeProxy(remoteDatanode);
        }
      }
      return true;
    }
  }

  class LocalBlockCopy implements Callable<Boolean> {
    private final Block srcBlock;
    private final Block dstBlock;
    private final int srcNamespaceId;
    private final int dstNamespaceId;
    // Whether or not this copy is a copy across two datanodes on the same host.
    private final boolean crossDatanode;
    private final File srcBlockFile;
    private final String srcFileSystem;

    public LocalBlockCopy(int srcNamespaceId, Block srcBlock,
        int dstNamespaceId, Block dstBlock) throws IOException {
      this(null, srcNamespaceId, srcBlock, dstNamespaceId, dstBlock, false, null);
    }

    public LocalBlockCopy(String srcFileSystem,
        int srcNamespaceId, Block srcBlock,
        int dstNamespaceId, Block dstBlock,
        boolean crossDatanode, File srcBlockFile) throws IOException {
      this.srcBlock = srcBlock;
      this.dstBlock = dstBlock;
      this.srcNamespaceId = srcNamespaceId;
      this.dstNamespaceId = dstNamespaceId;
      this.crossDatanode = crossDatanode;
      this.srcBlockFile = srcBlockFile;
      this.srcFileSystem = (srcFileSystem != null) ? srcFileSystem :
        data.getFileSystemForBlock(srcNamespaceId, srcBlock);
    }

    public Boolean call() throws Exception {
      try {
        if (crossDatanode) {
          data.copyBlockLocal(srcFileSystem, srcBlockFile,
              srcNamespaceId, srcBlock, dstNamespaceId, dstBlock);
        } else {
          data.copyBlockLocal(srcFileSystem,
              data.getBlockFile(srcNamespaceId, srcBlock),
              srcNamespaceId, srcBlock, dstNamespaceId, dstBlock);
        }
        dstBlock.setNumBytes(srcBlock.getNumBytes());
        notifyNamenodeReceivedBlock(dstNamespaceId, dstBlock, null);
        blockScanner.addBlock(dstNamespaceId, dstBlock);
      } catch (Exception e) {
        LOG.warn("Local block copy for src : " + srcBlock.getBlockName()
            + ", dst : " + dstBlock.getBlockName() + " failed", e);
        throw e;
      }
      return true;
    }
  }

  public void reportBadBlocks(int namespaceId, LocatedBlock[] blocks)
      throws IOException {
    NamespaceService nsos = namespaceManager.get(namespaceId);
    if(nsos == null) {
      throw new IOException("cannot locate OfferService thread for namespace=" + namespaceId);
    }
    nsos.reportBadBlocks(blocks);
  }
 
  public UpgradeManagerDatanode getUpgradeManager(int namespaceId) {
    NamespaceService nsos = namespaceManager.get(namespaceId);
    return nsos == null ? null : nsos.getUpgradeManager();
  }
 
  public void completeUpgrade() throws IOException{
    for(int namespaceId : namespaceManager.getAllNamespaces()){
      UpgradeManagerDatanode manager = namespaceManager.get(namespaceId).getUpgradeManager();
      manager.completeUpgrade();
    }
  }
 
  /**
   * See {@link DataBlockScanner}
   */
  private synchronized void initDataBlockScanner(Configuration conf) {
    if (blockScanner != null) {
      return;
    }
    //initialize periodic block scanner
    String reason = null;
    if (conf.getInt("dfs.datanode.scan.period.hours", 0) < 0) {
      reason = "verification is turned off by configuration";
    } else if ( !(data instanceof FSDataset) ) {
      reason = "verifcation is supported only with FSDataset";
    }

    if ( reason == null ) {
      blockScanner = new DataBlockScannerSet(this, (FSDataset)data, conf);     
    } else {
      LOG.info("Periodic Block Verification is disabled because " +
               reason + ".");
    }
  }
 
  /**
   * Get host:port with host set to Datanode host and port set to the
   * port {@link DataXceiver} is serving.
   * @return host:port string
   */
  public String getMachineName() {
    return machineName + ":" + selfAddr.getPort();
  }

  public long getCTime(int namespaceId) {
    return storage.getNStorage(namespaceId).getCTime();
  }
 
  public String getStorageID() {
    return storage.getStorageID();
  }

  /**
   * Get DataNode info - used primarily for logging
   */
  public String getDatanodeInfo() {
    return machineName + ":" + selfAddr.getPort()
      + "; storageID= " + storage.getStorageID();
  }
 
  /**
   * Return true if the given namespace is alive.
   * @param namespaceId
   * @return true if the namespace is alive, false otherwise
   */
  public boolean isNamespaceAlive(int namespaceId) {
    return namespaceManager.isAlive(namespaceId);
  }
 
  /**
   * Return true if the given namespace is alive.
   * @param addr
   * @return true if the namespace is alive, false otherwise
   */
  public boolean isNamespaceAlive(InetSocketAddress addr) {
    return namespaceManager.get(addr).isAlive();
  }
 
  public Integer[] getAllNamespaces(){
    return namespaceManager.getAllNamespaces();
  }
 
  public NamespaceService[] getAllNamespaceServices() {
    return namespaceManager.getAllNamenodeThreads();
  }

  /**
   * Schedule a block report to every active namenode after the given delay.
   */
  public void scheduleNSBlockReport(long delay) {
    for (NamespaceService nsos : namespaceManager.getAllNamenodeThreads()) {
      nsos.scheduleBlockReport(delay);
    }
  }
 
  /**
   * Schedule a blockReceivedAndDeleted report to every active namenode
   * after the given delay.
   */
  public void scheduleNSBlockReceivedAndDeleted(long delay) {
    for (NamespaceService nsos : namespaceManager.getAllNamenodeThreads()) {
      nsos.scheduleBlockReceivedAndDeleted(delay);
    }
  }

  public void refreshNamenodes(Configuration conf) throws IOException {
    LOG.info("refresh namenodes");
    try {
      List<InetSocketAddress> nameNodeAddrs = DFSUtil.getNNServiceRpcAddresses(conf);
      namespaceManager.refreshNamenodes(nameNodeAddrs, conf);
    } catch (InterruptedException e) {
      throw new IOException(e);
    }
  }

  /**
   * {@inheritDoc}
   *
   * ClientDatanodeProtocol implementation: reloads the configuration and
   * refreshes every namenode served by this datanode.
   */
  public void refreshNamenodes() throws IOException {
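    // Build a fresh Configuration so changes written to the configuration
    // files on disk (e.g. newly added namenodes) are picked up, instead of
    // reusing the values this datanode started with.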
    conf = new Configuration();
    refreshNamenodes(conf);
  }
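
  // Illustration (not part of the original source): a hypothetical sketch of
  // configuration entries that DFSUtil.getNNServiceRpcAddresses() could
  // resolve during the refresh above. The key names and addresses here are
  // assumptions for a federated setup, not taken from this file.
  private static Configuration exampleFederatedConf() {
    Configuration conf = new Configuration();
    conf.set("dfs.federation.nameservices", "ns1,ns2");               // assumed key
    conf.set("dfs.namenode.rpc-address.ns1", "nn1.example.com:8020"); // assumed key
    conf.set("dfs.namenode.rpc-address.ns2", "nn2.example.com:8020"); // assumed key
    return conf;
  }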
 
  /**
   * {@inheritDoc}
   */
  @Override
  public void reconfigurePropertyImpl(String property, String newVal)
    throws ReconfigurationException {
    if (property.equals("dfs.data.dir")) {
      try {
        LOG.info("Reconfigure " + property + " to " + newVal);
        this.refreshVolumes(newVal);
      } catch (Exception e) {
        throw new ReconfigurationException(property, newVal,
            getConf().get(property), e);
      }
    } else {
      throw new ReconfigurationException(property, newVal,
                                        getConf().get(property));
    }
  }
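
  // Illustration (not part of the original source): a minimal sketch of how a
  // caller could drive the reconfiguration hook above through the
  // ReconfigurableBase API, assuming a running DataNode instance. Only
  // "dfs.data.dir" is accepted; any other property makes
  // reconfigurePropertyImpl throw ReconfigurationException.
  private static void exampleAddDataDir(DataNode dn)
      throws ReconfigurationException {
    // The directory list here is purely illustrative.
    dn.reconfigureProperty("dfs.data.dir", "/data/disk1/dfs,/data/disk2/dfs");
  }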

  /**
   * {@inheritDoc}
   */
  @Override
  public List<String> getReconfigurableProperties() {
    List<String> changeable =
      Arrays.asList("dfs.data.dir");
    return changeable;
  }
 
  //@Override PulseCheckable
  public Boolean isAlive() {
    return isDatanodeUp() && isDataNodeBeingAlive();
  }
 
  private ObjectName datanodeMXBeanName;
 
  /**
   * Register DataNodeMXBean
   */
  private void registerMXBean() {  
    this.pulseChecker = PulseChecker.create(this, "DataNode");
    datanodeMXBeanName = MBeanUtil.registerMBean("DataNode", "DataNodeInfo", this);
  }
 
  private void shutdownMXBean() {
    if (datanodeMXBeanName != null) {
      MBeanUtil.unregisterMBean(datanodeMXBeanName);
    }
    if (pulseChecker != null) {
      pulseChecker.shutdown();
    }
  }
 
  @Override // DataNodeMXBean
  public String getVersion() {
    return VersionInfo.getVersion();
  }

  @Override // DataNodeMXBean
  public String getRpcPort() {
    return Integer.toString(this.ipcServer.getListenerAddress().getPort());
  }

  @Override // DataNodeMXBean
  public String getHttpPort() {
    return Integer.toString(this.infoServer.getPort());
  }

  /**
   * Returned information is a JSON representation of a map with
   * namenode host name as the key and namespace id as the value
   */
  @Override // DataNodeMXBean
  public String getNamenodeAddresses() {
    final Map<String, Integer> info = new HashMap<String, Integer>();
    for (NamespaceService ns : namespaceManager.getAllNamenodeThreads()) {
      if (ns != null && ns.initialized()) {
        info.put(ns.getNNSocketAddress().getHostName(), ns.getNamespaceId());
      }
    }
    return JSON.toString(info);
  }

  /**
   * Returned information is a JSON representation of a map with
   * volume name as the key and value is a map of volume attribute
   * keys to its values
   */
  @Override // DataNodeMXBean
  public String getVolumeInfo() {
    final Map<String, Object> info = new HashMap<String, Object>();
    try {
      FSVolume[] volumes = ((FSDataset)this.data).volumes.getVolumes();
      for (FSVolume v : volumes) {
        final Map<String, Object> innerInfo = new HashMap<String, Object>();
        innerInfo.put("usedSpace", v.getDfsUsed());
        innerInfo.put("freeSpace", v.getAvailable());
        innerInfo.put("reservedSpace", v.getReserved());
        info.put(v.getDir().toString(), innerInfo);
      }
      return JSON.toString(info);
    } catch (IOException e) {
      LOG.info("Cannot get volume info.", e);
      return "ERROR";
    }
  }
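
  // Illustration (not part of the original source): a sketch of consuming the
  // JSON produced above, assuming the JSON utility used by this class exposes
  // a parse() counterpart that maps JSON objects to Maps. The outer key is
  // the volume directory; the inner map carries "usedSpace", "freeSpace" and
  // "reservedSpace" in bytes. Note that getVolumeInfo() returns the literal
  // string "ERROR" on failure, which would not parse as an object.
  @SuppressWarnings("unchecked")
  private static void exampleLogVolumeInfo(DataNode dn) {
    Map<String, Object> volumes =
        (Map<String, Object>) JSON.parse(dn.getVolumeInfo());
    for (Map.Entry<String, Object> entry : volumes.entrySet()) {
      Map<String, Object> attrs = (Map<String, Object>) entry.getValue();
      LOG.info(entry.getKey() + ": free=" + attrs.get("freeSpace") + " bytes");
    }
  }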

  @Override // DataNodeMXBean
  public String getServiceIds() {
    StringBuilder nameserviceIdList = new StringBuilder();
    for (NamespaceService ns : namespaceManager.getAllNamenodeThreads()) {
      if (ns != null && ns.initialized()) {
        String nameserviceId = ns.getNameserviceId();
        if (nameserviceIdList.length() > 0) {
          nameserviceIdList.append(",");
        }
        if (nameserviceId == null) {
          // Non-federation deployment: there should be only one namespace
          nameserviceId = "NONFEDERATION";
        }
        nameserviceIdList.append(nameserviceId);
      }
    }
    return nameserviceIdList.toString();
  }

  /**
   * Sends a 'Blocks Being Written' report to the given node.
   *
   * @param node the node to send the report to
   * @param namespaceId the namespace the report covers
   * @param nsRegistration this datanode's registration for that namespace
   * @throws IOException
   */
  public void sendBlocksBeingWrittenReport(DatanodeProtocol node,
      int namespaceId, DatanodeRegistration nsRegistration) throws IOException {
    Block[] blocks = data.getBlocksBeingWrittenReport(namespaceId);
    if (blocks != null && blocks.length != 0) {
      long[] blocksAsLong =
        BlockListAsLongs.convertToArrayLongs(blocks);
      BlockReport bbwReport = new BlockReport(blocksAsLong);
      node.blocksBeingWrittenReport(nsRegistration, bbwReport);
    }
  }
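
  // Illustration (not part of the original source): a minimal sketch of the
  // encoding used by the report above. BlockListAsLongs flattens each block's
  // (blockId, numBytes, generationStamp) triple into a primitive long[] so
  // the report crosses the RPC boundary as a flat array rather than as N
  // Block objects. The block values below are purely illustrative.
  private static long[] exampleEncodeBlocks() {
    Block[] blocks = { new Block(42L, 1024L, 100L) }; // id, length, genstamp
    return BlockListAsLongs.convertToArrayLongs(blocks);
  }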
}