Source Code of org.apache.hadoop.dfs.DataNode

/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.dfs;

import org.apache.commons.logging.*;

import org.apache.hadoop.fs.ChecksumException;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.ipc.*;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.net.DNS;
import org.apache.hadoop.net.NetUtils;
import org.apache.hadoop.net.NodeBase;
import org.apache.hadoop.util.*;
import org.apache.hadoop.util.DiskChecker.DiskErrorException;
import org.apache.hadoop.util.DiskChecker.DiskOutOfSpaceException;
import org.apache.hadoop.mapred.StatusHttpServer;
import org.apache.hadoop.net.NetworkTopology;
import org.apache.hadoop.dfs.BlockCommand;
import org.apache.hadoop.dfs.DatanodeProtocol;
import org.apache.hadoop.dfs.FSDatasetInterface.MetaDataInputStream;
import org.apache.hadoop.dfs.datanode.metrics.DataNodeMetrics;

import java.io.*;
import java.net.*;
import java.util.*;
import java.util.concurrent.Semaphore;
import java.security.NoSuchAlgorithmException;
import java.security.SecureRandom;

/**********************************************************
* DataNode is a class (and program) that stores a set of
* blocks for a DFS deployment.  A single deployment can
* have one or many DataNodes.  Each DataNode communicates
* regularly with a single NameNode.  It also communicates
* with client code and other DataNodes from time to time.
*
* DataNodes store a series of named blocks.  The DataNode
* allows client code to read these blocks, or to write new
* block data.  The DataNode may also, in response to instructions
* from its NameNode, delete blocks or copy blocks to/from other
* DataNodes.
*
* The DataNode maintains just one critical table:
*   block-> stream of bytes (of BLOCK_SIZE or less)
*
* This info is stored on a local disk.  The DataNode
* reports the table's contents to the NameNode upon startup
* and every so often afterwards.
*
* DataNodes spend their lives in an endless loop of asking
* the NameNode for something to do.  A NameNode cannot connect
* to a DataNode directly; a NameNode simply returns values from
* functions invoked by a DataNode.
*
* DataNodes maintain an open server socket so that client code
* or other DataNodes can read/write data.  The host/port for
* this server is reported to the NameNode, which then sends that
* information to clients or other DataNodes that might be interested.
*
**********************************************************/
public class DataNode implements FSConstants, Runnable {
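
  /* The poll-based protocol described in the class comment, in outline.
   * This annotates the loop implemented by offerService() below; it adds
   * no behavior of its own:
   *
   *   while (shouldRun) {
   *     cmd = namenode.sendHeartbeat(...);  // every heartBeatInterval
   *     processCommand(cmd);                // transfer / invalidate / shutdown ...
   *     namenode.blockReceived(...);        // flush newly received blocks
   *     namenode.blockReport(...);          // every blockReportInterval
   *   }
   */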
  public static final Log LOG = LogFactory.getLog("org.apache.hadoop.dfs.DataNode");

  /**
   * Use {@link NetUtils#createSocketAddr(String)} instead.
   */
  @Deprecated
  public static InetSocketAddress createSocketAddr(String target
                                                   ) throws IOException {
    return NetUtils.createSocketAddr(target);
  }

  DatanodeProtocol namenode = null;
  FSDatasetInterface data = null;
  DatanodeRegistration dnRegistration = null;
  private String networkLoc;
  volatile boolean shouldRun = true;
  private LinkedList<Block> receivedBlockList = new LinkedList<Block>();
  private LinkedList<String> delHints = new LinkedList<String>();
  final private static String EMPTY_DEL_HINT = "";
  int xmitsInProgress = 0;
  Daemon dataXceiveServer = null;
  ThreadGroup threadGroup = null;
  long blockReportInterval;
  long lastBlockReport = 0;
  boolean resetBlockReportTime = true;
  long initialBlockReportDelay = BLOCKREPORT_INITIAL_DELAY * 1000L;
  long lastHeartbeat = 0;
  long heartBeatInterval;
  private DataStorage storage = null;
  private StatusHttpServer infoServer = null;
  private DataNodeMetrics myMetrics;
  private static InetSocketAddress nameNodeAddr;
  private static DataNode datanodeObject = null;
  private Thread dataNodeThread = null;
  String machineName;
  int defaultBytesPerChecksum = 512;
  private int socketTimeout;
 
  private DataBlockScanner blockScanner;
  private Daemon blockScannerThread;

  /**
   * We need an estimate for block size to check if the disk partition has
   * enough space. For now we set it to be the default block size set
   * in the server-side configuration, which is not ideal because the
   * default block size should be a client-side configuration.
   * A better solution is to include in the header the estimated block size,
   * i.e. either the actual block size or the default block size.
   */
  private long estimateBlockSize;
 
  // The following three fields are to support balancing
  final static short MAX_BALANCING_THREADS = 5;
  private Semaphore balancingSem = new Semaphore(MAX_BALANCING_THREADS);
  long balanceBandwidth;
  private Throttler balancingThrottler;

  // Record all sockets opened for data transfer
  Map<Socket, Socket> childSockets = Collections.synchronizedMap(
                                       new HashMap<Socket, Socket>());
 
  /**
   * Current system time.
   * @return current time in msec.
   */
  static long now() {
    return System.currentTimeMillis();
  }

  /**
   * Create the DataNode given a configuration and an array of dataDirs.
   * 'dataDirs' is where the blocks are stored.
   */
  DataNode(Configuration conf,
           AbstractList<File> dataDirs) throws IOException {
     
    datanodeObject = this;

    try {
      startDataNode(conf, dataDirs);
    } catch (IOException ie) {
      shutdown();
      throw ie;
    }
  }
   
 
  /**
   * This method starts the data node with the specified conf.
   *
   * @param conf the configuration; if conf's CONFIG_PROPERTY_SIMULATED
   *     property is set, then a simulated storage-based data node is created
   *
   * @param dataDirs only for a non-simulated storage data node
   * @throws IOException
   */
  void startDataNode(Configuration conf,
                     AbstractList<File> dataDirs
                     ) throws IOException {
    // use configured nameserver & interface to get local hostname
    machineName = DNS.getDefaultHost(
                                     conf.get("dfs.datanode.dns.interface","default"),
                                     conf.get("dfs.datanode.dns.nameserver","default"));
    InetSocketAddress nameNodeAddr = NetUtils.createSocketAddr(
                                     conf.get("fs.default.name", "local"));
   
    this.defaultBytesPerChecksum =
       Math.max(conf.getInt("io.bytes.per.checksum", 512), 1);
    this.estimateBlockSize = conf.getLong("dfs.block.size", DEFAULT_BLOCK_SIZE);
    this.socketTimeout =  conf.getInt("dfs.socket.timeout",
                                      FSConstants.READ_TIMEOUT);
    String address =
      NetUtils.getServerAddress(conf,
                                "dfs.datanode.bindAddress",
                                "dfs.datanode.port",
                                "dfs.datanode.address");
    InetSocketAddress socAddr = NetUtils.createSocketAddr(address);
    int tmpPort = socAddr.getPort();
    storage = new DataStorage();
    // construct registration
    this.dnRegistration = new DatanodeRegistration(machineName + ":" + tmpPort);

    // connect to name node
    this.namenode = (DatanodeProtocol)
      RPC.waitForProxy(DatanodeProtocol.class,
                       DatanodeProtocol.versionID,
                       nameNodeAddr,
                       conf);
    // get version and id info from the name-node
    NamespaceInfo nsInfo = handshake();
    StartupOption startOpt = getStartupOption(conf);
    assert startOpt != null : "Startup option must be set.";
   
    boolean simulatedFSDataset =
        conf.getBoolean("dfs.datanode.simulateddatastorage", false);
    if (simulatedFSDataset) {
        setNewStorageID(dnRegistration);
        dnRegistration.storageInfo.layoutVersion = FSConstants.LAYOUT_VERSION;
        dnRegistration.storageInfo.namespaceID = nsInfo.namespaceID;
        // it would have been better to pass storage as a parameter to
        // constructor below - need to augment ReflectionUtils used below.
        conf.set("StorageId", dnRegistration.getStorageID());
        try {
          //Equivalent of following (can't do because Simulated is in test dir)
          //  this.data = new SimulatedFSDataset(conf);
          this.data = (FSDatasetInterface) ReflectionUtils.newInstance(
              Class.forName("org.apache.hadoop.dfs.SimulatedFSDataset"), conf);
        } catch (ClassNotFoundException e) {
          throw new IOException(StringUtils.stringifyException(e));
        }
    } else { // real storage
      // read storage info, lock data dirs and transition fs state if necessary
      storage.recoverTransitionRead(nsInfo, dataDirs, startOpt);
      // adjust registration with the storage info just read from disk
      this.dnRegistration.setStorageInfo(storage);
      // initialize data node internal structure
      this.data = new FSDataset(storage, conf);
    }

     
    // find free port
    ServerSocket ss = new ServerSocket(tmpPort, 0, socAddr.getAddress());
    ss.setReceiveBufferSize(DEFAULT_DATA_SOCKET_SIZE);
    // adjust machine name with the actual port
    tmpPort = ss.getLocalPort();
    this.dnRegistration.setName(machineName + ":" + tmpPort);
    LOG.info("Opened server at " + tmpPort);
     
    this.threadGroup = new ThreadGroup("dataXceiveServer");
    this.dataXceiveServer = new Daemon(threadGroup, new DataXceiveServer(ss));
    this.threadGroup.setDaemon(true); // auto destroy when empty

    long blockReportIntervalBasis =
      conf.getLong("dfs.blockreport.intervalMsec", BLOCKREPORT_INTERVAL);
    this.blockReportInterval =
      blockReportIntervalBasis - new Random().nextInt((int)(blockReportIntervalBasis/10));
    this.initialBlockReportDelay = conf.getLong("dfs.blockreport.initialDelay",
                                            BLOCKREPORT_INITIAL_DELAY)* 1000L;
    if (this.initialBlockReportDelay >= blockReportIntervalBasis) {
      this.initialBlockReportDelay = 0;
      LOG.info("dfs.blockreport.initialDelay is greater than " +
        "dfs.blockreport.intervalMsec." + " Setting initial delay to 0 msec:");
    }
    this.heartBeatInterval = conf.getLong("dfs.heartbeat.interval", HEARTBEAT_INTERVAL) * 1000L;
    DataNode.nameNodeAddr = nameNodeAddr;

    //set up parameter for cluster balancing
    this.balanceBandwidth = conf.getLong("dfs.balance.bandwidthPerSec", 1024L*1024);
    LOG.info("Balancing bandwith is "+balanceBandwidth + " bytes/s");
    this.balancingThrottler = new Throttler(balanceBandwidth);

    //initialize periodic block scanner
    String reason = null;
    if (conf.getInt("dfs.datanode.scan.period.hours", 0) < 0) {
      reason = "verification is turned off by configuration";
    } else if ( !(data instanceof FSDataset) ) {
      reason = "verifcation is supported only with FSDataset";
    }
    if ( reason == null ) {
      blockScanner = new DataBlockScanner(this, (FSDataset)data, conf);
      blockScannerThread = new Daemon(blockScanner);
    } else {
      LOG.info("Periodic Block Verification is disabled because " +
               reason + ".");
    }
   
    //create a servlet to serve full-file content
    String infoAddr =
      NetUtils.getServerAddress(conf,
                              "dfs.datanode.info.bindAddress",
                              "dfs.datanode.info.port",
                              "dfs.datanode.http.address");
    InetSocketAddress infoSocAddr = NetUtils.createSocketAddr(infoAddr);
    String infoHost = infoSocAddr.getHostName();
    int tmpInfoPort = infoSocAddr.getPort();
    this.infoServer = new StatusHttpServer("datanode", infoHost, tmpInfoPort, tmpInfoPort == 0);
    this.infoServer.addServlet(null, "/streamFile/*", StreamFile.class);
    this.infoServer.setAttribute("datanode.blockScanner", blockScanner);
    this.infoServer.addServlet(null, "/blockScannerReport",
                               DataBlockScanner.Servlet.class);
    this.infoServer.start();
    // adjust info port
    this.dnRegistration.setInfoPort(this.infoServer.getPort());
    // get network location
    this.networkLoc = conf.get("dfs.datanode.rack");
    if (networkLoc == null) // exec network script or set the default rack
      networkLoc = getNetworkLoc(conf);
    // register datanode
    register();
    myMetrics = new DataNodeMetrics(conf, dnRegistration.getStorageID());
  }
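
  /* A minimal sketch of the configuration knobs consumed by startDataNode()
   * above. The values are purely illustrative, not defaults mandated by this
   * code:
   *
   *   Configuration conf = new Configuration();
   *   conf.set("dfs.datanode.address", "0.0.0.0:50010");       // data transfer
   *   conf.set("dfs.datanode.http.address", "0.0.0.0:50075");  // info server
   *   conf.setInt("dfs.socket.timeout", FSConstants.READ_TIMEOUT);
   *   conf.setLong("dfs.block.size", 64L * 1024 * 1024);
   *   conf.setLong("dfs.blockreport.intervalMsec", 60L * 60 * 1000);
   *   conf.setLong("dfs.heartbeat.interval", 3);                // seconds
   *   conf.setLong("dfs.balance.bandwidthPerSec", 1024L * 1024);
   *   conf.setInt("dfs.datanode.scan.period.hours", -1);        // negative disables the scanner
   *   new DataNode(conf, dataDirs);  // dataDirs: AbstractList<File> of storage dirs
   */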

  private NamespaceInfo handshake() throws IOException {
    NamespaceInfo nsInfo = new NamespaceInfo();
    while (shouldRun) {
      try {
        nsInfo = namenode.versionRequest();
        break;
      } catch(SocketTimeoutException e) {  // namenode is busy
        LOG.info("Problem connecting to server: " + getNameNodeAddr());
        try {
          Thread.sleep(1000);
        } catch (InterruptedException ie) {}
      }
    }
    String errorMsg = null;
    // verify build version
    if( ! nsInfo.getBuildVersion().equals( Storage.getBuildVersion() )) {
      errorMsg = "Incompatible build versions: namenode BV = "
        + nsInfo.getBuildVersion() + "; datanode BV = "
        + Storage.getBuildVersion();
      LOG.fatal( errorMsg );
      try {
        namenode.errorReport( dnRegistration,
                              DatanodeProtocol.NOTIFY, errorMsg );
      } catch( SocketTimeoutException e ) {  // namenode is busy
        LOG.info("Problem connecting to server: " + getNameNodeAddr());
      }
      throw new IOException( errorMsg );
    }
    assert FSConstants.LAYOUT_VERSION == nsInfo.getLayoutVersion() :
      "Data-node and name-node layout versions must be the same.";
    return nsInfo;
  }

  /** Return the DataNode object
   *
   */
  public static DataNode getDataNode() {
    return datanodeObject;
  }

  public InetSocketAddress getNameNodeAddr() {
    return nameNodeAddr;
  }
   
  DataNodeMetrics getMetrics() {
    return myMetrics;
  }
 
  /**
   * Return the namenode's identifier
   */
  public String getNamenode() {
    //return namenode.toString();
    return "<namenode>";
  }

  static void setNewStorageID(DatanodeRegistration dnReg) {
    /* Return
     * "DS-randInt-ipaddr-currentTimeMillis"
     * It is considered extermely rare for all these numbers to match
     * on a different machine accidentally for the following
     * a) SecureRandom(INT_MAX) is pretty much random (1 in 2 billion), and
     * b) Good chance ip address would be different, and
     * c) Even on the same machine, Datanode is designed to use different ports.
     * d) Good chance that these are started at different times.
     * For a confict to occur all the 4 above have to match!.
     * The format of this string can be changed anytime in future without
     * affecting its functionality.
     */
    String ip = "unknownIP";
    try {
      ip = DNS.getDefaultIP("default");
    } catch (UnknownHostException ignored) {
      LOG.warn("Could not find ip address of \"default\" inteface.");
    }
   
    int rand = 0;
    try {
      rand = SecureRandom.getInstance("SHA1PRNG").nextInt(Integer.MAX_VALUE);
    } catch (NoSuchAlgorithmException e) {
      LOG.warn("Could not use SecureRandom");
      rand = (new Random()).nextInt(Integer.MAX_VALUE);
    }
    dnReg.storageID = "DS-" + rand + "-"+ ip + "-" + dnReg.getPort() + "-" +
                      System.currentTimeMillis();
  }
  /**
   * Register datanode
   * <p>
   * The datanode needs to register with the namenode on startup in order
   * 1) to report which storage it is serving now and
   * 2) to receive a registrationID
   * issued by the namenode to recognize registered datanodes.
   *
   * @see FSNamesystem#registerDatanode(DatanodeRegistration,String)
   * @throws IOException
   */
  private void register() throws IOException {
    if (dnRegistration.getStorageID().equals("")) {
      setNewStorageID(dnRegistration);
    }
    while(shouldRun) {
      try {
        // reset name to machineName. Mainly for web interface.
        dnRegistration.name = machineName + ":" + dnRegistration.getPort();
        dnRegistration = namenode.register(dnRegistration, networkLoc);
        break;
      } catch(SocketTimeoutException e) {  // namenode is busy
        LOG.info("Problem connecting to server: " + getNameNodeAddr());
        try {
          Thread.sleep(1000);
        } catch (InterruptedException ie) {}
      }
    }
    assert ("".equals(storage.getStorageID())
            && !"".equals(dnRegistration.getStorageID()))
            || storage.getStorageID().equals(dnRegistration.getStorageID()) :
            "New storageID can be assigned only if data-node is not formatted";
    if (storage.getStorageID().equals("")) {
      storage.setStorageID(dnRegistration.getStorageID());
      storage.writeAll();
      LOG.info("New storage id " + dnRegistration.getStorageID()
          + " is assigned to data-node " + dnRegistration.getName());
    }
    if(! storage.getStorageID().equals(dnRegistration.getStorageID())) {
      throw new IOException("Inconsistent storage IDs. Name-node returned "
          + dnRegistration.getStorageID()
          + ". Expecting " + storage.getStorageID());
    }
  }

  private void enumerateThreadGroup(ThreadGroup tg) {
    int count = tg.activeCount();
    Thread[] info = new Thread[count];
    int num = tg.enumerate(info);
    for (int i = 0; i < num; i++) {
      System.out.print(info[i].getName() + " ");
    }
    System.out.println("");
  }

  /**
   * Shut down this instance of the datanode.
   * Returns only after shutdown is complete.
   */
  public void shutdown() {
    if (infoServer != null) {
      try {
        infoServer.stop();
      } catch (Exception e) {
      }
    }
    this.shouldRun = false;
    if (dataXceiveServer != null) {
      ((DataXceiveServer) this.dataXceiveServer.getRunnable()).kill();
      this.dataXceiveServer.interrupt();

      // wait for all data receiver threads to exit
      if (this.threadGroup != null) {
        while (true) {
          this.threadGroup.interrupt();
          LOG.info("Waiting for threadgroup to exit, active threads is " +
                   this.threadGroup.activeCount());
          if (this.threadGroup.isDestroyed() ||
              this.threadGroup.activeCount() == 0) {
            break;
          }
          try {
            Thread.sleep(1000);
          } catch (InterruptedException e) {}
        }
      }
    }
    if(upgradeManager != null)
      upgradeManager.shutdownUpgrade();
    if (blockScannerThread != null) {
      blockScanner.shutdown();
      blockScannerThread.interrupt();
    }
    if (storage != null) {
      try {
        this.storage.unlockAll();
      } catch (IOException ie) {
      }
    }
    if (dataNodeThread != null) {
      dataNodeThread.interrupt();
      try {
        dataNodeThread.join();
      } catch (InterruptedException ie) {
      }
    }
    if (data != null) {
      data.shutdown();
    }
    if (myMetrics != null) {
      myMetrics.shutdown();
    }
  }
 
 
  /* Check whether the disk is out of space or read-only when an
   * IOException occurs.
   * If so, handle the error */
  private void checkDiskError( IOException e ) throws IOException {
    if (e.getMessage().startsWith("No space left on device")) {
      throw new DiskOutOfSpaceException("No space left on device");
    } else {
      checkDiskError();
    }
  }
 
  /* Check the data directories for disk errors and handle any that are found. */
  private void checkDiskError( ) throws IOException {
    try {
      data.checkDataDir();
    } catch(DiskErrorException de) {
      handleDiskError(de.getMessage());
    }
  }
 
  private void handleDiskError(String errMsgr) {
    LOG.warn("DataNode is shutting down.\n" + errMsgr);
    try {
      namenode.errorReport(
                           dnRegistration, DatanodeProtocol.DISK_ERROR, errMsgr);
    } catch(IOException ignored) {             
    }
    shutdown();
  }
   
  private static class Count {
    int value = 0;
    Count(int init) { value = init; }
    synchronized void incr() { value++; }
    synchronized void decr() { value--; }
    @Override
    public String toString() { return Integer.toString(value); }
    public int getValue() { return value; }
  }
   
  Count xceiverCount = new Count(0);
   
  /**
   * Main loop for the DataNode.  Runs until shutdown,
   * forever calling remote NameNode functions.
   */
  public void offerService() throws Exception {
    
    LOG.info("using BLOCKREPORT_INTERVAL of " + blockReportInterval + "msec" +
       " Initial delay: " + initialBlockReportDelay + "msec");

    //
    // Now loop for a long time....
    //

    while (shouldRun) {
      try {
        long startTime = now();

        //
        // Every so often, send heartbeat or block-report
        //
       
        if (startTime - lastHeartbeat > heartBeatInterval) {
          //
          // All heartbeat messages include following info:
          // -- Datanode name
          // -- data transfer port
          // -- Total capacity
          // -- Bytes remaining
          //
          DatanodeCommand cmd = namenode.sendHeartbeat(dnRegistration,
                                                       data.getCapacity(),
                                                       data.getDfsUsed(),
                                                       data.getRemaining(),
                                                       xmitsInProgress,
                                                       xceiverCount.getValue());
          myMetrics.heartbeats.inc(now() - startTime);
          //LOG.info("Just sent heartbeat, with name " + localName);
          lastHeartbeat = startTime;
          if (!processCommand(cmd))
            continue;
        }
           
        // check if there are newly received blocks
        Block [] blockArray=null;
        String [] delHintArray=null;
        synchronized(receivedBlockList) {
          synchronized(delHints) {
            int numBlocks = receivedBlockList.size();
            if (numBlocks > 0) {
              if(numBlocks!=delHints.size()) {
                LOG.warn("Panic: receiveBlockList and delHints are not of the same length" );
              }
              //
              // Send newly-received blockids to namenode
              //
              blockArray = receivedBlockList.toArray(new Block[numBlocks]);
              delHintArray = delHints.toArray(new String[numBlocks]);
            }
          }
        }
        if (blockArray != null) {
          if(delHintArray == null || delHintArray.length != blockArray.length ) {
            LOG.warn("Panic: block array & delHintArray are not the same" );
          }
          namenode.blockReceived(dnRegistration, blockArray, delHintArray);
          synchronized (receivedBlockList) {
            synchronized (delHints) {
              for(int i=0; i<blockArray.length; i++) {
                receivedBlockList.remove(blockArray[i]);
                delHints.remove(delHintArray[i]);
              }
            }
          }
        }

        // send block report
        if (startTime - lastBlockReport > blockReportInterval) {
          //
          // Send latest blockinfo report if timer has expired.
          // Get back a list of local block(s) that are obsolete
          // and can be safely GC'ed.
          //
          long brStartTime = now();
          Block[] bReport = data.getBlockReport();
          DatanodeCommand cmd = namenode.blockReport(dnRegistration,
                  BlockListAsLongs.convertToArrayLongs(bReport));
          long brTime = now() - brStartTime;
          myMetrics.blockReports.inc(brTime);
          LOG.info("BlockReport of " + bReport.length +
              " blocks got processed in " + brTime + " msecs");
          //
          // If we have sent the first block report, then wait a random
          // time before we start the periodic block reports.
          //
          if (resetBlockReportTime) {
            lastBlockReport = startTime - new Random().nextInt((int)(blockReportInterval));
            resetBlockReportTime = false;
          } else {
            lastBlockReport = startTime;
          }
          processCommand(cmd);
        }
           
        //
        // There is no work to do; sleep until the heartbeat timer elapses,
        // or work arrives, and then iterate again.
        //
        long waitTime = heartBeatInterval - (System.currentTimeMillis() - lastHeartbeat);
        synchronized(receivedBlockList) {
          if (waitTime > 0 && receivedBlockList.size() == 0) {
            try {
              receivedBlockList.wait(waitTime);
            } catch (InterruptedException ie) {
            }
          }
        } // synchronized
      } catch(RemoteException re) {
        String reClass = re.getClassName();
        if (UnregisteredDatanodeException.class.getName().equals(reClass) ||
            DisallowedDatanodeException.class.getName().equals(reClass)) {
          LOG.warn("DataNode is shutting down: " +
                   StringUtils.stringifyException(re));
          shutdown();
          return;
        }
        LOG.warn(StringUtils.stringifyException(re));
      } catch (IOException e) {
        LOG.warn(StringUtils.stringifyException(e));
      }
    } // while (shouldRun)
  } // offerService

  /**
   * Process a command received from the namenode.
   *
   * @param cmd the command to process
   * @return true if further processing may be required or false otherwise.
   * @throws IOException
   */
  private boolean processCommand(DatanodeCommand cmd) throws IOException {
    if (cmd == null)
      return true;
    switch(cmd.getAction()) {
    case DatanodeProtocol.DNA_TRANSFER:
      //
      // Send a copy of a block to another datanode
      //
      BlockCommand bcmd = (BlockCommand)cmd;
      transferBlocks(bcmd.getBlocks(), bcmd.getTargets());
      break;
    case DatanodeProtocol.DNA_INVALIDATE:
      //
      // Some local block(s) are obsolete and can be
      // safely garbage-collected.
      //
      Block toDelete[] = ((BlockCommand)cmd).getBlocks();
      try {
        if (blockScanner != null) {
          blockScanner.deleteBlocks(toDelete);
        }
        data.invalidate(toDelete);
      } catch(IOException e) {
        checkDiskError();
        throw e;
      }
      myMetrics.blocksRemoved.inc(toDelete.length);
      break;
    case DatanodeProtocol.DNA_SHUTDOWN:
      // shut down the data node
      this.shutdown();
      return false;
    case DatanodeProtocol.DNA_REGISTER:
      // namenode requested a registration - at start or if NN lost contact
      register();
      // random short delay - helps scatter the BR from all DNs
      scheduleBlockReport(initialBlockReportDelay);
      break;
    case DatanodeProtocol.DNA_FINALIZE:
      storage.finalizeUpgrade();
      break;
    case UpgradeCommand.UC_ACTION_START_UPGRADE:
      // start distributed upgrade here
      processDistributedUpgradeCommand((UpgradeCommand)cmd);
      break;
    default:
      LOG.warn("Unknown DatanodeCommand action: " + cmd.getAction());
    }
    return true;
  }

  // Distributed upgrade manager
  UpgradeManagerDatanode upgradeManager = new UpgradeManagerDatanode(this);

  private void processDistributedUpgradeCommand(UpgradeCommand comm
                                               ) throws IOException {
    assert upgradeManager != null : "DataNode.upgradeManager is null.";
    upgradeManager.processUpgradeCommand(comm);
  }


  /**
   * Start distributed upgrade if it should be initiated by the data-node.
   */
  private void startDistributedUpgradeIfNeeded() throws IOException {
    UpgradeManagerDatanode um = DataNode.getDataNode().upgradeManager;
    assert um != null : "DataNode.upgradeManager is null.";
    if(!um.getUpgradeState())
      return;
    um.setUpgradeState(false, um.getUpgradeVersion());
    um.startUpgrade();
    return;
  }
  private void transferBlocks( Block blocks[],
                               DatanodeInfo xferTargets[][]
                               ) throws IOException {
    for (int i = 0; i < blocks.length; i++) {
      if (!data.isValidBlock(blocks[i])) {
        String errStr = "Can't send invalid block " + blocks[i];
        LOG.info(errStr);
        namenode.errorReport(dnRegistration,
                             DatanodeProtocol.INVALID_BLOCK,
                             errStr);
        break;
      }
      int numTargets = xferTargets[i].length;
      if (numTargets > 0) {
        if (LOG.isInfoEnabled()) {
          StringBuilder xfersBuilder = new StringBuilder();
          for (int j = 0; j < numTargets; j++) {
            DatanodeInfo nodeInfo = xferTargets[i][j];
            xfersBuilder.append(nodeInfo.getName());
            if (j < (numTargets - 1)) {
              xfersBuilder.append(", ");
            }
          }
          String xfersTo = xfersBuilder.toString();
          LOG.info(dnRegistration + " Starting thread to transfer block " +
                   blocks[i] + " to " + xfersTo);                      
        }
        new Daemon(new DataTransfer(xferTargets[i], blocks[i])).start();
      }
    }
  }

  /* utility function for receiving a response */
  private static void receiveResponse(Socket s, int numTargets) throws IOException {
    // check the response
    DataInputStream reply = new DataInputStream(new BufferedInputStream(
        s.getInputStream(), BUFFER_SIZE));
    try {
      for (int i = 0; i < numTargets; i++) {
        short opStatus = reply.readShort();
        if(opStatus != OP_STATUS_SUCCESS) {
          throw new IOException("operation failed at "+
              s.getInetAddress());
        }
      }
    } finally {
      IOUtils.closeStream(reply);
    }
  }

  /* utility function for sending a response */
  private static void sendResponse(Socket s, short opStatus) throws IOException {
    DataOutputStream reply = new DataOutputStream(s.getOutputStream());
    try {
      reply.writeShort(opStatus);
      reply.flush();
    } finally {
      IOUtils.closeStream(reply);
    }
  }

  /*
   * Informing the name node could take a long long time! Should we wait
   * till namenode is informed before responding with success to the
   * client? For now we don't.
   */
  private void notifyNamenodeReceivedBlock(Block block, String delHint) {
    if(block==null || delHint==null) {
      throw new IllegalArgumentException(block==null?"Block is null":"delHint is null");
    }
    synchronized (receivedBlockList) {
      synchronized (delHints) {
        receivedBlockList.add(block);
        delHints.add(delHint);
        receivedBlockList.notifyAll();
      }
    }
  }
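
  // Note: receivedBlockList is always locked before delHints, both here and in
  // offerService(), which keeps the two lists in step and the lock order consistent.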

  /**
   * Server used for receiving/sending a block of data.
   * This is created to listen for requests from clients or
   * other DataNodes.  This small server does not use the
   * Hadoop IPC mechanism.
   */
  class DataXceiveServer implements Runnable {
    ServerSocket ss;
    public DataXceiveServer(ServerSocket ss) {
      this.ss = ss;
    }

    /**
     */
    public void run() {
      try {
        while (shouldRun) {
          Socket s = ss.accept();
          s.setTcpNoDelay(true);
          new Daemon(threadGroup, new DataXceiver(s)).start();
        }
        ss.close();
      } catch (IOException ie) {
        LOG.info(dnRegistration + ":Exiting DataXceiveServer due to " + ie.toString());
      }
    }
    public void kill() {
      assert shouldRun == false :
        "shouldRun should be set to false before killing";
      try {
        this.ss.close();
      } catch (IOException iex) {
      }

      // close all the sockets that were accepted earlier
      synchronized (childSockets) {
        for (Socket thissock : childSockets.values()) {
          try {
            thissock.close();
          } catch (IOException e) {
          }
        }
      }
    }
  }

  /**
   * Thread for processing incoming/outgoing data stream
   */
  class DataXceiver implements Runnable {
    Socket s;
    String remoteAddress; // address of remote side
    String localAddress;  // local address of this daemon
    public DataXceiver(Socket s) {
      this.s = s;
      childSockets.put(s, s);
      InetSocketAddress isock = (InetSocketAddress)s.getRemoteSocketAddress();
      remoteAddress = isock.toString();
      localAddress = s.getInetAddress() + ":" + s.getLocalPort();
      LOG.debug("Number of active connections is: "+xceiverCount);
    }
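
    // Wire format (as parsed in run() below): each request starts with a
    // two-byte DATA_TRANFER_VERSION followed by a one-byte opcode
    // (OP_READ_BLOCK, OP_WRITE_BLOCK, OP_READ_METADATA, OP_REPLACE_BLOCK or
    // OP_COPY_BLOCK); everything after that is opcode-specific.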

    /**
     * Read/write data from/to the DataXceiveServer.
     */
    public void run() {
      DataInputStream in=null;
      try {
        in = new DataInputStream(
            new BufferedInputStream(s.getInputStream(), BUFFER_SIZE));
        short version = in.readShort();
        if ( version != DATA_TRANFER_VERSION ) {
          throw new IOException( "Version Mismatch" );
        }
        boolean local = s.getInetAddress().equals(s.getLocalAddress());
        byte op = in.readByte();
        long startTime = now();
        switch ( op ) {
        case OP_READ_BLOCK:
          readBlock( in );
          myMetrics.readBlockOp.inc(now() - startTime);
          if (local)
            myMetrics.readsFromLocalClient.inc();
          else
            myMetrics.readsFromRemoteClient.inc();
          break;
        case OP_WRITE_BLOCK:
          writeBlock( in );
          myMetrics.writeBlockOp.inc(now() - startTime);
          if (local)
            myMetrics.writesFromLocalClient.inc();
          else
            myMetrics.writesFromRemoteClient.inc();
          break;
        case OP_READ_METADATA:
          readMetadata( in );
          myMetrics.readMetadataOp.inc(now() - startTime);
          break;
        case OP_REPLACE_BLOCK: // for balancing purpose; send to a destination
          replaceBlock(in);
          myMetrics.replaceBlockOp.inc(now() - startTime);
          break;
        case OP_COPY_BLOCK: // for balancing purpose; send to a proxy source
          copyBlock(in);
          myMetrics.copyBlockOp.inc(now() - startTime);
          break;
        default:
          throw new IOException("Unknown opcode " + op + " in data stream");
        }
      } catch (Throwable t) {
        LOG.error(dnRegistration + ":DataXceiver: " + StringUtils.stringifyException(t));
      } finally {
        LOG.debug(dnRegistration + ":Number of active connections is: "+xceiverCount);
        IOUtils.closeStream(in);
        IOUtils.closeSocket(s);
        childSockets.remove(s);
      }
    }

    /**
     * Read a block from the disk
     * @param in The stream to read from
     * @throws IOException
     */
    private void readBlock(DataInputStream in) throws IOException {
      xceiverCount.incr();
      //
      // Read in the header
      //
      long blockId = in.readLong();         
      Block block = new Block( blockId, 0 );

      long startOffset = in.readLong();
      long length = in.readLong();

      // send the block
      DataOutputStream out = new DataOutputStream(
          new BufferedOutputStream(s.getOutputStream(), BUFFER_SIZE));
      BlockSender blockSender = null;
      try {
        try {
          blockSender = new BlockSender(block, startOffset, length,
                                        true, true, false);
        } catch(IOException e) {
          out.writeShort(OP_STATUS_ERROR);
          throw e;
        }

        out.writeShort(DataNode.OP_STATUS_SUCCESS); // send op status
        long read = blockSender.sendBlock(out, null); // send data

        if (blockSender.isBlockReadFully()) {
          // See if client verification succeeded.
          // This is an optional response from client.
          try {
            if (in.readShort() == OP_STATUS_CHECKSUM_OK  &&
                blockScanner != null) {
              blockScanner.verifiedByClient(block);
            }
          } catch (IOException ignored) {}
        }
       
        myMetrics.bytesRead.inc((int) read);
        myMetrics.blocksRead.inc();
        LOG.info(dnRegistration + " Served block " + block + " to " + s.getInetAddress());
      } catch ( SocketException ignored ) {
        // It's ok for the remote side to close the connection anytime.
        myMetrics.blocksRead.inc();
      } catch ( IOException ioe ) {
        /* What exactly should we do here?
         * Earlier version shutdown() datanode if there is disk error.
         */
        LOG.warn(dnRegistration +  ":Got exception while serving " + block + " to " +
                  s.getInetAddress() + ":\n" +
                  StringUtils.stringifyException(ioe) );
        throw ioe;
      } finally {
        xceiverCount.decr();
        IOUtils.closeStream(out);
        IOUtils.closeStream(blockSender);
      }
    }

    /**
     * Write a block to disk.
     *
     * @param in The stream to read from
     * @throws IOException
     */
    private void writeBlock(DataInputStream in) throws IOException {
      xceiverCount.incr();
      LOG.debug("writeBlock receive buf size " + s.getReceiveBufferSize() +
                " tcp no delay " + s.getTcpNoDelay());
      //
      // Read in the header
      //
      Block block = new Block(in.readLong(), estimateBlockSize);
      LOG.info("Receiving block " + block +
               " src: " + remoteAddress +
               " dest: " + localAddress);
      int pipelineSize = in.readInt(); // num of datanodes in entire pipeline
      boolean isRecovery = in.readBoolean(); // is this part of recovery?
      String client = Text.readString(in); // working on behalf of this client
      int numTargets = in.readInt();
      if (numTargets < 0) {
        throw new IOException("Mislabelled incoming datastream.");
      }
      DatanodeInfo targets[] = new DatanodeInfo[numTargets];
      for (int i = 0; i < targets.length; i++) {
        DatanodeInfo tmp = new DatanodeInfo();
        tmp.readFields(in);
        targets[i] = tmp;
      }

      DataOutputStream mirrorOut = null;  // stream to next target
      DataInputStream mirrorIn = null;    // reply from next target
      DataOutputStream replyOut = null;   // stream to prev target
      Socket mirrorSock = null;           // socket to next target
      BlockReceiver blockReceiver = null; // responsible for data handling
      String mirrorNode = null;           // the name:port of next target
      String firstBadLink = "";           // first datanode that failed in connection setup
      try {
        // open a block receiver and check if the block does not exist
        blockReceiver = new BlockReceiver(block, in,
            s.getInetAddress().toString(), isRecovery, client);

        // get a connection back to the previous target
        replyOut = new DataOutputStream(s.getOutputStream());

        //
        // Open network conn to backup machine, if
        // appropriate
        //
        if (targets.length > 0) {
          InetSocketAddress mirrorTarget = null;
          // Connect to backup machine
          mirrorNode = targets[0].getName();
          mirrorTarget = NetUtils.createSocketAddr(mirrorNode);
          mirrorSock = new Socket();
          try {
            int timeoutValue = numTargets * socketTimeout;
            mirrorSock.connect(mirrorTarget, timeoutValue);
            mirrorSock.setSoTimeout(timeoutValue);
            mirrorSock.setSendBufferSize(DEFAULT_DATA_SOCKET_SIZE);
            mirrorOut = new DataOutputStream(
                          new BufferedOutputStream(mirrorSock.getOutputStream(),
                                                   BUFFER_SIZE));
            mirrorIn = new DataInputStream(mirrorSock.getInputStream());

            // Write header: Copied from DFSClient.java!
            mirrorOut.writeShort( DATA_TRANFER_VERSION );
            mirrorOut.write( OP_WRITE_BLOCK );
            mirrorOut.writeLong( block.getBlockId() );
            mirrorOut.writeInt( pipelineSize );
            mirrorOut.writeBoolean( isRecovery );
            Text.writeString( mirrorOut, client );
            mirrorOut.writeInt( targets.length - 1 );
            for ( int i = 1; i < targets.length; i++ ) {
              targets[i].write( mirrorOut );
            }

            blockReceiver.writeChecksumHeader(mirrorOut);
            mirrorOut.flush();

            // read connect ack (only for clients, not for replication req)
            if (client.length() != 0) {
              firstBadLink = Text.readString(mirrorIn);
              LOG.info("Datanode " + targets.length +
                       " got response for connect ack " +
                       " from downstream datanode with firstbadlink as " +
                       firstBadLink);
            }

          } catch (IOException e) {
            if (client.length() != 0) {
              Text.writeString(replyOut, mirrorNode);
              replyOut.flush();
            }
            IOUtils.closeStream(mirrorOut);
            mirrorOut = null;
            IOUtils.closeStream(mirrorIn);
            mirrorIn = null;
            IOUtils.closeSocket(mirrorSock);
            mirrorSock = null;
            if (client.length() > 0) {
              throw e;
            } else {
              LOG.info(dnRegistration + ":Exception transfering block " +
                       block + " to mirror " + mirrorNode +
                       ". continuing without the mirror.\n" +
                       StringUtils.stringifyException(e));
            }
          }
        }

        // send connect ack back to source (only for clients)
        if (client.length() != 0) {
          LOG.info("Datanode " + targets.length +
                   " forwarding connect ack to upstream firstbadlink is " +
                   firstBadLink);
          Text.writeString(replyOut, firstBadLink);
          replyOut.flush();
        }

        // receive the block and mirror to the next target
        String mirrorAddr = (mirrorSock == null) ? null : mirrorNode;
        blockReceiver.receiveBlock(mirrorOut, mirrorIn, replyOut,
                                   mirrorAddr, null, targets.length);

        // if this write is for a replication request (and not
        // from a client), then confirm block. For client-writes,
        // the block is finalized in the PacketResponder.
        if (client.length() == 0) {
          notifyNamenodeReceivedBlock(block, EMPTY_DEL_HINT);
          LOG.info("Received block " + block +
                   " src: " + remoteAddress +
                   " dest: " + localAddress +
                   " of size " + block.getNumBytes());
        }

        if (blockScanner != null) {
          blockScanner.addBlock(block);
        }
       
      } catch (IOException ioe) {
        LOG.info("writeBlock " + block + " received exception " + ioe);
        throw ioe;
      } finally {
        // close all opened streams
        IOUtils.closeStream(mirrorOut);
        IOUtils.closeStream(mirrorIn);
        IOUtils.closeStream(replyOut);
        IOUtils.closeSocket(mirrorSock);
        IOUtils.closeStream(blockReceiver);
        // decrement counter
        xceiverCount.decr();
      }
    }

    /**
     * Reads the metadata and sends the data in one 'DATA_CHUNK'
     * @param in
     */
    void readMetadata(DataInputStream in) throws IOException {
      xceiverCount.incr();

      Block block = new Block( in.readLong(), 0 );
      MetaDataInputStream checksumIn = null;
      DataOutputStream out = null;
     
      try {

        checksumIn = data.getMetaDataInputStream(block);
       
        long fileSize = checksumIn.getLength();

        if (fileSize >= 1L<<31 || fileSize <= 0) {
            throw new IOException("Unexpected size for checksumFile of block" +
                    block);
        }

        byte [] buf = new byte[(int)fileSize];
        IOUtils.readFully(checksumIn, buf, 0, buf.length);
       
        out = new DataOutputStream(s.getOutputStream());
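
        // Reply layout (annotating the writes below): a status byte, the
        // metadata length as an int, the metadata bytes themselves, and a
        // zero int marking the last (empty) DATA_CHUNK.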
       
        out.writeByte(OP_STATUS_SUCCESS);
        out.writeInt(buf.length);
        out.write(buf);
       
        //last DATA_CHUNK
        out.writeInt(0);
      } finally {
        xceiverCount.decr();
        IOUtils.closeStream(checksumIn);
      }
    }
   
    /**
     * Reads a block from the disk and then sends it to a destination.
     *
     * @param in
     *          The stream to read from
     * @throws IOException
     */
    private void copyBlock(DataInputStream in) throws IOException {
      // Read in the header
      long blockId = in.readLong(); // read block id
      Block block = new Block(blockId, 0);
      String source = Text.readString(in); // read del hint
      DatanodeInfo target = new DatanodeInfo(); // read target
      target.readFields(in);

      Socket targetSock = null;
      short opStatus = OP_STATUS_SUCCESS;
      BlockSender blockSender = null;
      DataOutputStream targetOut = null;
      try {
        balancingSem.acquireUninterruptibly();
       
        // check if the block exists or not
        blockSender = new BlockSender(block, 0, -1, false, false, false);

        // get the output stream to the target
        InetSocketAddress targetAddr = NetUtils.createSocketAddr(target.getName());
        targetSock = new Socket();
        targetSock.connect(targetAddr, socketTimeout);
        targetSock.setSoTimeout(socketTimeout);

        targetOut = new DataOutputStream(new BufferedOutputStream(
            targetSock.getOutputStream(), BUFFER_SIZE));

        /* send request to the target */
        // first write header info
        targetOut.writeShort(DATA_TRANFER_VERSION); // transfer version
        targetOut.writeByte(OP_REPLACE_BLOCK); // op code
        targetOut.writeLong(block.getBlockId()); // block id
        Text.writeString( targetOut, source); // del hint

        // then send data
        long read = blockSender.sendBlock(targetOut, balancingThrottler);

        myMetrics.bytesRead.inc((int) read);
        myMetrics.blocksRead.inc();
       
        // check the response from target
        receiveResponse(targetSock, 1);

        LOG.info("Copied block " + block + " to " + targetAddr);
      } catch (IOException ioe) {
        opStatus = OP_STATUS_ERROR;
        LOG.warn("Got exception while serving " + block + " to "
            + target.getName() + ": " + StringUtils.stringifyException(ioe));
        throw ioe;
      } finally {
        /* send response to the requester */
        try {
          sendResponse(s, opStatus);
        } catch (IOException replyE) {
          LOG.warn("Error writing the response back to "+
              s.getRemoteSocketAddress() + "\n" +
              StringUtils.stringifyException(replyE) );
        }
        IOUtils.closeStream(targetOut);
        IOUtils.closeStream(blockSender);
        balancingSem.release();
      }
    }
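
    // copyBlock() above runs on the proxy source and sends OP_REPLACE_BLOCK to
    // the destination, which lands in replaceBlock() below; the source's ID
    // travels along as a deletion hint so the namenode can later remove the
    // extra replica from the source.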

    /**
     * Receives a block and writes it to disk; it then notifies the namenode
     * to remove the copy from the source.
     *
     * @param in
     *          The stream to read from
     * @throws IOException
     */
    private void replaceBlock(DataInputStream in) throws IOException {
      balancingSem.acquireUninterruptibly();

      /* read header */
      Block block = new Block(in.readLong(), estimateBlockSize); // block id & len
      String sourceID = Text.readString(in);

      short opStatus = OP_STATUS_SUCCESS;
      BlockReceiver blockReceiver = null;
      try {
        // open a block receiver and check if the block does not exist
         blockReceiver = new BlockReceiver(
            block, in, s.getRemoteSocketAddress().toString(), false, "");

        // receive a block
        blockReceiver.receiveBlock(null, null, null, null, balancingThrottler, -1);
                     
        // notify name node
        notifyNamenodeReceivedBlock(block, sourceID);

        LOG.info("Moved block " + block +
            " from " + s.getRemoteSocketAddress());
      } catch (IOException ioe) {
        opStatus = OP_STATUS_ERROR;
        throw ioe;
      } finally {
        // send response back
        try {
          sendResponse(s, opStatus);
        } catch (IOException ioe) {
          LOG.warn("Error writing reply back to " + s.getRemoteSocketAddress());
        }
        IOUtils.closeStream(blockReceiver);
        balancingSem.release();
      }
    }
  }
   
  /** A class to throttle the block transfers.
   * This class is thread safe. It can be shared by multiple threads.
   * The parameter bandwidthPerSec specifies the total bandwidth shared by all threads.
   */
  static class Throttler {
    private long period;          // period over which bw is imposed
    private long periodExtension; // Max period over which bw accumulates.
    private long bytesPerPeriod; // total number of bytes that can be sent in each period
    private long curPeriodStart; // current period starting time
    private long curReserve;     // remaining bytes that can be sent in the period
    private long bytesAlreadyUsed;

    /** Constructor
     * @param bandwidthPerSec bandwidth allowed in bytes per second.
     */
    Throttler(long bandwidthPerSec) {
      this(500, bandwidthPerSec);  // by default throttling period is 500ms
    }

    /**
     * Constructor
     * @param period in milliseconds. Bandwidth is enforced over this
     *        period.
     * @param bandwidthPerSec bandwidth allowed in bytes per second.
     */
    Throttler(long period, long bandwidthPerSec) {
      this.curPeriodStart = System.currentTimeMillis();
      this.period = period;
      this.curReserve = this.bytesPerPeriod = bandwidthPerSec*period/1000;
      this.periodExtension = period*3;
    }

    /**
     * @return current throttle bandwidth in bytes per second.
     */
    public synchronized long getBandwidth() {
      return bytesPerPeriod*1000/period;
    }
   
    /**
     * Sets throttle bandwidth. This takes effect at the latest by the end of
     * the current period.
     *
     * @param bytesPerSecond
     */
    public synchronized void setBandwidth(long bytesPerSecond) {
      if ( bytesPerSecond <= 0 ) {
        throw new IllegalArgumentException("" + bytesPerSecond);
      }
      bytesPerPeriod = bytesPerSecond*period/1000;
    }
   
    /** Given the numOfBytes sent/received since last time throttle was called,
     * make the current thread sleep if I/O rate is too fast
     * compared to the given bandwidth
     *
     * @param numOfBytes
     *     number of bytes sent/received since last time throttle was called
     */
    public synchronized void throttle(long numOfBytes) {
      if ( numOfBytes <= 0 ) {
        return;
      }

      curReserve -= numOfBytes;
      bytesAlreadyUsed += numOfBytes;

      while (curReserve <= 0) {
        long now = System.currentTimeMillis();
        long curPeriodEnd = curPeriodStart + period;

        if ( now < curPeriodEnd ) {
          // Wait for next period so that curReserve can be increased.
          try {
            wait( curPeriodEnd - now );
          } catch (InterruptedException ignored) {}
        } else if ( now <  (curPeriodStart + periodExtension)) {
          curPeriodStart = curPeriodEnd;
          curReserve += bytesPerPeriod;
        } else {
          // discard the prev period. Throttler might not have
          // been used for a long time.
          curPeriodStart = now;
          curReserve = bytesPerPeriod - bytesAlreadyUsed;
        }
      }

      bytesAlreadyUsed -= numOfBytes;
    }
  }
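
  /* Illustrative use of Throttler (a sketch, not code from this file): the
   * sending thread reports bytes as it writes them, and throttle() sleeps
   * whenever the running rate would exceed the configured bandwidth.
   *
   *   Throttler throttler = new Throttler(1024L * 1024); // 1 MB/s
   *   // default 500 ms period => 1048576 * 500 / 1000 = 524288 bytes/period
   *   byte[] chunk = new byte[4096];
   *   int n;
   *   while ((n = in.read(chunk)) > 0) {
   *     out.write(chunk, 0, n);
   *     throttler.throttle(n); // may sleep to stay under the limit
   *   }
   */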

  class BlockSender implements java.io.Closeable {
    private Block block; // the block to read from
    private DataInputStream blockIn; // data stream
    private DataInputStream checksumIn; // checksum datastream
    private DataChecksum checksum; // checksum stream
    private long offset; // starting position to read
    private long endOffset; // ending position
    private long blockLength;
    private byte buf[]; // buffer to store data read from the block file & crc
    private int bytesPerChecksum; // chunk size
    private int checksumSize; // checksum size
    private boolean corruptChecksumOk; // ok to send data even if checksum is corrupt or missing
    private boolean chunkOffsetOK; // if need to send chunk offset
    private long seqno; // sequence number of packet

    private boolean blockReadFully; //set when the whole block is read
    private boolean verifyChecksum; //if true, checksum is verified while reading
    private Throttler throttler;
    private DataOutputStream out;

    BlockSender(Block block, long startOffset, long length,
                boolean corruptChecksumOk, boolean chunkOffsetOK,
                boolean verifyChecksum) throws IOException {

      try {
        this.block = block;
        this.chunkOffsetOK = chunkOffsetOK;
        this.corruptChecksumOk = corruptChecksumOk;
        this.verifyChecksum = verifyChecksum;
        this.blockLength = data.getLength(block);

        if ( !corruptChecksumOk || data.metaFileExists(block) ) {
          checksumIn = new DataInputStream(
                  new BufferedInputStream(data.getMetaDataInputStream(block),
                                          BUFFER_SIZE));

          // read and handle the common header here. For now just a version
          short version = checksumIn.readShort();
          if (version != FSDataset.METADATA_VERSION) {
            LOG.warn("Wrong version (" + version + ") for metadata file for "
                + block + " ignoring ...");
          }
          checksum = DataChecksum.newDataChecksum(checksumIn);
        } else {
          LOG.warn("Could not find metadata file for " + block);
          // This only decides the buffer size. Use BUFFER_SIZE?
          checksum = DataChecksum.newDataChecksum(DataChecksum.CHECKSUM_NULL,
              16 * 1024);
        }

        bytesPerChecksum = checksum.getBytesPerChecksum();
        checksumSize = checksum.getChecksumSize();

        if (length < 0) {
          length = blockLength;
        }

        endOffset = blockLength;
        if (startOffset < 0 || startOffset > endOffset
            || (length + startOffset) > endOffset) {
          String msg = " Offset " + startOffset + " and length " + length
          + " don't match block " + block + " ( blockLen " + endOffset + " )";
          LOG.warn(dnRegistration + ":sendBlock() : " + msg);
          throw new IOException(msg);
        }
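
        // Align the transfer to checksum-chunk boundaries. For example, with
        // bytesPerChecksum = 512, startOffset = 700 and length = 100:
        // offset = 700 - (700 % 512) = 512, and tmpLen = 700 + 100 + 188 = 988,
        // which rounds up to 1024, so only whole checksummed chunks are sent.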

        buf = new byte[bytesPerChecksum + checksumSize];
        offset = (startOffset - (startOffset % bytesPerChecksum));
        if (length >= 0) {
          // Make sure endOffset points to end of a checksumed chunk.
          long tmpLen = startOffset + length + (startOffset - offset);
          if (tmpLen % bytesPerChecksum != 0) {
            tmpLen += (bytesPerChecksum - tmpLen % bytesPerChecksum);
          }
          if (tmpLen < endOffset) {
            endOffset = tmpLen;
          }
        }

        // seek to the right offsets
        if (offset > 0) {
          long checksumSkip = (offset / bytesPerChecksum) * checksumSize;
          // note blockInStream is seeked when created below
          if (checksumSkip > 0) {
            // Should we use seek() for checksum file as well?
            IOUtils.skipFully(checksumIn, checksumSkip);
          }
        }
        seqno = 0;

        InputStream blockInStream = data.getBlockInputStream(block, offset); // seek to offset
        blockIn = new DataInputStream(new BufferedInputStream(blockInStream, BUFFER_SIZE));
      } catch (IOException ioe) {
        IOUtils.closeStream(this);
        IOUtils.closeStream(blockIn);
        throw ioe;
      }
    }

    // close opened files
    public void close() throws IOException {
      IOException ioe = null;
      // close checksum file
      if(checksumIn!=null) {
        try {
          checksumIn.close();
        } catch (IOException e) {
          ioe = e;
        }
        checksumIn = null;
      }
      // close data file
      if(blockIn!=null) {
        try {
          blockIn.close();
        } catch (IOException e) {
          ioe = e;
        }
        blockIn = null;
      }
      // throw IOException if there is any
      if(ioe!= null) {
        throw ioe;
      }
    }

   
    private int sendChunk()
        throws IOException {
      int len = (int) Math.min(endOffset - offset, bytesPerChecksum);
      if (len == 0) {
        return 0;
      }
      blockIn.readFully(buf, 0, len);

      if (checksumSize > 0 && checksumIn != null) {
        try {
          checksumIn.readFully(buf, len, checksumSize);
         
          if (verifyChecksum) {
            checksum.reset();
            checksum.update(buf, 0, len);
            if (!checksum.compare(buf, len)) {
              throw new ChecksumException("Checksum failed at " + offset, len);
            }
          }
        } catch (IOException e) {
          LOG.warn(" Could not read or failed to veirfy checksum for data" +
                   " at offset " + offset + " for block " + block + " got : "
                   + StringUtils.stringifyException(e));
          IOUtils.closeStream(checksumIn);
          checksumIn = null;
          if (corruptChecksumOk) {
            // Just fill the array with zeros.
            Arrays.fill(buf, len, len + checksumSize, (byte) 0);
          } else {
            throw e;
          }
        }
      }
      boolean lastPacketInBlock = false;
      if (offset + len >= endOffset) {
        lastPacketInBlock = true;
      }

      // write packet header
      out.writeInt(len + checksumSize + 4);
      out.writeLong(offset);
      out.writeLong(seqno);
      out.writeBoolean(lastPacketInBlock);
     
      out.writeInt(len);
      out.write(buf, 0, len + checksumSize);

      if (throttler != null) { // rebalancing so throttle
        throttler.throttle(len + checksumSize + 4);
      }

      return len;
    }
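
    /* For reference, a sketch of how a receiver could parse the framing that
     * sendChunk() emits above (stream and variable names are illustrative,
     * not part of this class):
     *
     *   DataInputStream in = ...;          // stream fed by a BlockSender
     *   int packetLen = in.readInt();      // len + checksumSize + 4
     *   // a packetLen of 0 marks the end of the block (see sendBlock())
     *   long offset = in.readLong();       // offset of the chunk in the block
     *   long seqno = in.readLong();        // packet sequence number
     *   boolean last = in.readBoolean();   // last packet in block?
     *   int len = in.readInt();            // bytes of chunk data
     *   byte[] buf = new byte[packetLen - 4];  // chunk data + checksum
     *   in.readFully(buf);
     *
     * receivePacket() below implements the real receiving side.
     */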

    /**
     * sendBlock() is used to read a block and its metadata and stream the
     * data to either a client or to another datanode.
     *
     * @param out  stream to which the block is written
     * @return total bytes read, including crc.
     */
    long sendBlock(DataOutputStream out, Throttler throttler)
        throws IOException {
      if( out == null ) {
        throw new IOException( "out stream is null" );
      }
      this.out = out;
      this.throttler = throttler;

      long initialOffset = offset;
      long totalRead = 0;
      try {
        checksum.writeHeader(out);
        if ( chunkOffsetOK ) {
          out.writeLong( offset );
        }

        while (endOffset > offset) {
          // Write one data chunk per loop.
          long len = sendChunk();
          offset += len;
          totalRead += len + checksumSize;
          seqno++;
        }
        out.writeInt(0); // mark the end of block
        out.flush();
      } finally {
        close();
      }

      blockReadFully = (initialOffset == 0 && offset >= blockLength);

      return totalRead;
    }
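
    /* Typical use, sketched after the call in DataTransfer.run() below
     * (the socket stream setup is abbreviated):
     *
     *   BlockSender sender = new BlockSender(b, 0, -1, false, false, false);
     *   DataOutputStream out = new DataOutputStream(new BufferedOutputStream(
     *       sock.getOutputStream(), BUFFER_SIZE));
     *   long totalRead = sender.sendBlock(out, null);  // null = no throttling
     */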
   
    boolean isBlockReadFully() {
      return blockReadFully;
    }
  }

  // This information is cached by the Datanode in the ackQueue
  static private class Packet {
    long seqno;
    boolean lastPacketInBlock;

    Packet(long seqno, boolean lastPacketInBlock) {
      this.seqno = seqno;
      this.lastPacketInBlock = lastPacketInBlock;
    }
  }

  /**
   * Processes responses from downstream datanodes in the pipeline
   * and sends back replies to the originator.
   */
  class PacketResponder implements Runnable {
    private LinkedList<Packet> ackQueue = new LinkedList<Packet>(); // packets waiting for ack
    private volatile boolean running = true;
    private Block block;
    DataInputStream mirrorIn;   // input from downstream datanode
    DataOutputStream replyOut;  // output to upstream datanode
    private int numTargets;     // number of downstream datanodes
    private String clientName;  // The name of the client (if any)
    private BlockReceiver receiver; // The owner of this responder.

    public String toString() {
      return "PacketResponder " + numTargets + " for Block " + this.block;
    }

    PacketResponder(BlockReceiver receiver, Block b, DataInputStream in,
                    DataOutputStream out, int numTargets, String clientName) {
      this.receiver = receiver;
      this.block = b;
      mirrorIn = in;
      replyOut = out;
      this.numTargets = numTargets;
      this.clientName = clientName;
    }

    // enqueue the seqno that is still to be acked by the downstream datanode
    synchronized void enqueue(long seqno, boolean lastPacketInBlock) {
      if (running) {
        LOG.debug("PacketResponder " + numTargets + " adding seqno " + seqno +
                  " to ack queue.");
        ackQueue.addLast(new Packet(seqno, lastPacketInBlock));
        notifyAll();
      }
    }

    // wait for all pending packets to be acked. Then shutdown thread.
    synchronized void close() {
      while (running && ackQueue.size() != 0 && shouldRun) {
        try {
          wait();
        } catch (InterruptedException e) {
          running = false;
        }
      }
      LOG.debug("PacketResponder " + numTargets +
               " for block " + block + " Closing down.");
      running = false;
      notifyAll();
    }
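
    /* enqueue()/close() and the consumers in run()/lastDataNodeRun() form a
     * monitor-based producer-consumer pair. The same pattern in isolation
     * (names hypothetical):
     *
     *   synchronized void put(Packet p) { queue.addLast(p); notifyAll(); }
     *   synchronized Packet take() throws InterruptedException {
     *     while (queue.isEmpty()) { wait(); }  // consumer blocks here
     *     Packet p = queue.removeFirst();
     *     notifyAll();                         // wakes a close() waiting for drain
     *     return p;
     *   }
     */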

    private synchronized void lastDataNodeRun() {
      long lastHeartbeat = System.currentTimeMillis();
      boolean lastPacket = false;

      while (running && shouldRun && !lastPacket) {
        long now = System.currentTimeMillis();
        try {

            // wait for a packet to show up in the ack queue
            while (running && shouldRun && ackQueue.size() == 0) {
              long idle = now - lastHeartbeat;
              long timeout = (socketTimeout/2) - idle;
              if (timeout <= 0) {
                timeout = 1000;
              }
              try {
                wait(timeout);
              } catch (InterruptedException e) {
                if (running) {
                  LOG.info("PacketResponder " + numTargets +
                           " for block " + block + " Interrupted.");
                  running = false;
                }
                break;
              }
         
              // send a heartbeat if it is time.
              now = System.currentTimeMillis();
              if (now - lastHeartbeat > socketTimeout/2) {
                replyOut.writeLong(-1); // send heartbeat
                replyOut.flush();
                lastHeartbeat = now;
              }
            }

            if (!running || !shouldRun) {
              break;
            }
            Packet pkt = ackQueue.removeFirst();
            long expected = pkt.seqno;
            notifyAll();
            LOG.debug("PacketResponder " + numTargets +
                      " for block " + block +
                      " acking for packet " + expected);

            // If this is the last packet in block, then close block
            // file and finalize the block before responding success
            if (pkt.lastPacketInBlock) {
              if (!receiver.finalized) {
                receiver.close();
                block.setNumBytes(receiver.offsetInBlock);
                data.finalizeBlock(block);
                myMetrics.blocksWritten.inc();
                notifyNamenodeReceivedBlock(block, EMPTY_DEL_HINT);
                LOG.info("Received block " + block +
                         " of size " + block.getNumBytes() +
                         " from " + receiver.inAddr);
              }
              lastPacket = true;
            }

            replyOut.writeLong(expected);
            replyOut.writeShort(OP_STATUS_SUCCESS);
            replyOut.flush();
        } catch (Exception e) {
          if (running) {
            LOG.info("PacketResponder " + block + " " + numTargets +
                     " Exception " + StringUtils.stringifyException(e));
            running = false;
          }
        }
      }
      LOG.info("PacketResponder " + numTargets +
               " for block " + block + " terminating");
    }

    // Thread to process incoming acks
    public void run() {

      // If this is the last datanode in pipeline, then handle differently
      if (numTargets == 0) {
        lastDataNodeRun();
        return;
      }

      boolean lastPacketInBlock = false;
      while (running && shouldRun && !lastPacketInBlock) {

        try {
            short op = OP_STATUS_SUCCESS;
            boolean didRead = false;
            long expected = -2;
            try {
              // read seqno from downstream datanode
              long seqno = mirrorIn.readLong();
              didRead = true;
              if (seqno == -1) {
                replyOut.writeLong(-1); // send keepalive
                replyOut.flush();
                LOG.debug("PacketResponder " + numTargets + " got -1");
                continue;
              } else if (seqno == -2) {
                LOG.debug("PacketResponder " + numTargets + " got -2");
              } else {
                LOG.debug("PacketResponder " + numTargets + " got seqno = " + seqno);
                Packet pkt = null;
                synchronized (this) {
                  while (running && shouldRun && ackQueue.size() == 0) {
                    if (LOG.isDebugEnabled()) {
                      LOG.debug("PacketResponder " + numTargets +
                                " seqno = " + seqno +
                                " for block " + block +
                                " waiting for local datanode to finish write.");
                    }
                    wait();
                  }
                  pkt = ackQueue.removeFirst();
                  expected = pkt.seqno;
                  notifyAll();
                  LOG.debug("PacketResponder " + numTargets + " seqno = " + seqno);
                  if (seqno != expected) {
                    throw new IOException("PacketResponder " + numTargets +
                                          " for block " + block +
                                          " expected seqno:" + expected +
                                          " received:" + seqno);
                  }
                  lastPacketInBlock = pkt.lastPacketInBlock;
                }
              }
            } catch (Throwable e) {
              if (running) {
                LOG.info("PacketResponder " + block + " " + numTargets +
                         " Exception " + StringUtils.stringifyException(e));
                running = false;
                if (!didRead) {
                  op = OP_STATUS_ERROR;
                }
              }
            }

            // If this is the last packet in block, then close block
            // file and finalize the block before responding success
            if (lastPacketInBlock && !receiver.finalized) {
              receiver.close();
              block.setNumBytes(receiver.offsetInBlock);
              data.finalizeBlock(block);
              myMetrics.blocksWritten.inc();
              notifyNamenodeReceivedBlock(block, EMPTY_DEL_HINT);
              LOG.info("Received block " + block +
                       " of size " + block.getNumBytes() +
                       " from " + receiver.inAddr);
            }

            // send my status back to upstream datanode
            replyOut.writeLong(expected); // send seqno upstream
            replyOut.writeShort(OP_STATUS_SUCCESS);

            LOG.debug("PacketResponder " + numTargets +
                      " for block " + block +
                      " responded my status " +
                      " for seqno " + expected);

            // forward responses from downstream datanodes.
            for (int i = 0; i < numTargets && shouldRun; i++) {
              try {
                if (op == OP_STATUS_SUCCESS) {
                  op = mirrorIn.readShort();
                  if (op != OP_STATUS_SUCCESS) {
                    LOG.debug("PacketResponder for block " + block +
                              ": error code received from downstream " +
                              " datanode[" + i + "] " + op);
                  }
                }
              } catch (Throwable e) {
                op = OP_STATUS_ERROR;
              }
              replyOut.writeShort(op);
            }
            replyOut.flush();
            LOG.debug("PacketResponder " + block + " " + numTargets +
                      " responded other status " + " for seqno " + expected);

            // If we were unable to read the seqno from downstream, then stop.
            if (expected == -2) {
              running = false;
            }
            // If we forwarded an error response from a downstream datanode
            // and we are acting on behalf of a client, then we quit. The
            // client will drive the recovery mechanism.
            if (op == OP_STATUS_ERROR && clientName.length() > 0) {
              running = false;
            }
        } catch (IOException e) {
          if (running) {
            LOG.info("PacketResponder " + block + " " + numTargets +
                     " Exception " + StringUtils.stringifyException(e));
            running = false;
          }
        } catch (RuntimeException e) {
          if (running) {
            LOG.info("PacketResponder " + block + " " + numTargets +
                     " Exception " + StringUtils.stringifyException(e));
            running = false;
          }
        }
      }
      LOG.info("PacketResponder " + numTargets +
               " for block " + block + " terminating");
    }
  }
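
  /* The ack stream a PacketResponder writes upstream is, per packet, one long
   * (the seqno being acked, or -1 for a heartbeat) followed by one status
   * short per datanode in the rest of the pipeline: this node's status first,
   * then the statuses forwarded from downstream. A reader at the head of the
   * pipeline could consume it roughly as follows (sketch only; numTargets is
   * the number of datanodes downstream of the responder):
   *
   *   long acked = replyIn.readLong();
   *   if (acked != -1) {                       // -1 is just a heartbeat
   *     for (int i = 0; i <= numTargets; i++) {
   *       if (replyIn.readShort() != OP_STATUS_SUCCESS) {
   *         // datanode i in the pipeline reported an error
   *       }
   *     }
   *   }
   */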

  // This class is a BufferedOutputStream that exposes the number of
  // bytes currently in the buffer.
  static private class DFSBufferedOutputStream extends BufferedOutputStream {
    DFSBufferedOutputStream(OutputStream out, int capacity) {
      super(out, capacity);
    }

    int count() {
      return count;
    }
  }
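
  // Exposing the protected BufferedOutputStream.count field lets
  // setBlockPosition() compute the logical write offset without forcing a
  // flush: bytes already written to the channel plus bytes still buffered.
  // Sketch of that check, as it appears below:
  //
  //   long logicalPos = data.getChannelPosition(block, streams)
  //                     + bufStream.count();
  //   if (logicalPos == offsetInBlock) { /* nothing to seek */ }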

  /* A class that receives a block and writes it to its own disk, and
   * meanwhile may copy it to another node. If a throttler is provided,
   * streaming throttling is also supported.
   */
  private class BlockReceiver implements java.io.Closeable {
    private Block block; // the block to receive
    private boolean finalized;
    private DataInputStream in = null; // from where data are read
    private DataChecksum checksum; // checksum used to verify incoming chunks
    private DataOutputStream out = null; // to block file at local disk
    private DataOutputStream checksumOut = null; // to crc file at local disk
    private DFSBufferedOutputStream bufStream = null;
    private int bytesPerChecksum;
    private int checksumSize;
    private byte buf[];
    private long offsetInBlock;
    final private String inAddr;
    private String mirrorAddr;
    private DataOutputStream mirrorOut;
    private Daemon responder = null;
    private Throttler throttler;
    private int lastLen = -1;
    private int curLen = -1;
    private FSDataset.BlockWriteStreams streams;
    private boolean isRecovery = false;
    private String clientName;

    BlockReceiver(Block block, DataInputStream in, String inAddr,
                  boolean isRecovery, String clientName)
        throws IOException {
      try{
        this.block = block;
        this.in = in;
        this.inAddr = inAddr;
        this.isRecovery = isRecovery;
        this.clientName = clientName;
        this.offsetInBlock = 0;
        this.checksum = DataChecksum.newDataChecksum(in);
        this.bytesPerChecksum = checksum.getBytesPerChecksum();
        this.checksumSize = checksum.getChecksumSize();
        this.buf = new byte[bytesPerChecksum + checksumSize];
        //
        // Open local disk out
        //
        streams = data.writeToBlock(block, isRecovery);
        this.finalized = data.isValidBlock(block);
        if (streams != null) {
          this.bufStream = new DFSBufferedOutputStream(
                                          streams.dataOut, BUFFER_SIZE);
          this.out = new DataOutputStream(bufStream);
          this.checksumOut = new DataOutputStream(new BufferedOutputStream(
                                          streams.checksumOut, BUFFER_SIZE));
        }
      } catch(IOException ioe) {
        IOUtils.closeStream(this);
        throw ioe;
      }
    }

    // close files
    public void close() throws IOException {

      IOException ioe = null;
      // close checksum file
      try {
        if (checksumOut != null) {
          checksumOut.close();
          checksumOut = null;
        }
      } catch(IOException e) {
        ioe = e;
      }
      // close block file
      try {
        if (out != null) {
          out.close();
          out = null;
        }
      } catch (IOException e) {
        ioe = e;
      }
      // disk check
      if(ioe != null) {
        checkDiskError(ioe);
        throw ioe;
      }
    }

    // flush block data and metadata files to disk.
    void flush() throws IOException {
      if (checksumOut != null) {
        checksumOut.flush();
      }
      if (out != null) {
        out.flush();
      }
    }

    /**
     * While writing to mirrorOut, failure to write to mirror should not
     * affect this datanode unless a client is writing the block.
     */
    private void handleMirrorOutError(IOException ioe) throws IOException {
      LOG.info(dnRegistration + ":Exception writing block " +
               block + " to mirror " + mirrorAddr + "\n" +
               StringUtils.stringifyException(ioe));
      mirrorOut = null;
      //
      // If stream-copy fails, continue
      // writing to disk for replication requests. For client
      // writes, return error so that the client can do error
      // recovery.
      //
      if (clientName.length() > 0) {
        throw ioe;
      }
    }
   
    /* receive a chunk: write it to disk & mirror it to another stream */
    private void receiveChunk( int len ) throws IOException {
      if (len <= 0 || len > bytesPerChecksum) {
        throw new IOException("Got wrong length during writeBlock(" + block
            + ") from " + inAddr + " at offset " + offsetInBlock + ": " + len
            + " expected <= " + bytesPerChecksum);
      }

      if (lastLen > 0 && lastLen != bytesPerChecksum) {
        throw new IOException("Got wrong length during receiveBlock(" + block
          + ") from " + inAddr + " : " + " got " + lastLen + " instead of "
          + bytesPerChecksum);
      }

      lastLen = curLen;
      curLen = len;

      in.readFully(buf, 0, len + checksumSize);

      /*
       * Verification is not included in the initial design. For now, it at
       * least catches some bugs. Later, we can include this after showing that
       * it does not affect performance much.
       */
      checksum.update(buf, 0, len);

      if (!checksum.compare(buf, len)) {
        throw new IOException("Unexpected checksum mismatch "
            + "while writing " + block + " from " + inAddr);
      }

      checksum.reset();
      offsetInBlock += len;

      // First write to remote node before writing locally.
      if (mirrorOut != null) {
        try {
          mirrorOut.writeInt(len);
          mirrorOut.write(buf, 0, len + checksumSize);
        } catch (IOException ioe) {
          handleMirrorOutError(ioe);
        }
      }

      try {
        if (!finalized) {
          out.write(buf, 0, len);
          // Write checksum
          checksumOut.write(buf, len, checksumSize);
          myMetrics.bytesWritten.inc(len);
        }
      } catch (IOException iex) {
        checkDiskError(iex);
        throw iex;
      }

      if (throttler != null) { // throttle I/O
        throttler.throttle(len + checksumSize + 4);
      }
    }
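
    /* The verification above is the standard DataChecksum cycle; the same
     * three calls appear in BlockSender.sendChunk(). In isolation:
     *
     *   checksum.reset();                    // clear any previous state
     *   checksum.update(buf, 0, len);        // fold in the chunk data
     *   if (!checksum.compare(buf, len)) {   // compare to the stored checksum
     *     // mismatch: data or checksum was corrupted in transit
     *   }
     */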

    /*
     * Receive and process a packet. A packet may contain one or more chunks.
     */
    private void receivePacket(int packetSize) throws IOException {

      offsetInBlock = in.readLong(); // get offset of packet in block
      long seqno = in.readLong();    // get seqno
      boolean lastPacketInBlock = in.readBoolean();
      int curPacketSize = 0;        
      LOG.debug("Receiving one packet for block " + block +
                " of size " + packetSize +
                " seqno " + seqno +
                " offsetInBlock " + offsetInBlock +
                " lastPacketInBlock " + lastPacketInBlock);
      setBlockPosition(offsetInBlock);

      int len = in.readInt();
      curPacketSize += 4;            // read an integer in previous line

      // send packet header to next datanode in pipeline
      if (mirrorOut != null) {
        try {
          mirrorOut.writeInt(packetSize);
          mirrorOut.writeLong(offsetInBlock);
          mirrorOut.writeLong(seqno);
          mirrorOut.writeBoolean(lastPacketInBlock);
        } catch (IOException e) {
          handleMirrorOutError(e);
        }
      }

      if (len == 0) {
        LOG.info("Receiving empty packet for block " + block);
        if (mirrorOut != null) {
          try {
            mirrorOut.writeInt(len);
            mirrorOut.flush();
          } catch (IOException e) {
            handleMirrorOutError(e);
          }
        }
      }

      while (len != 0) {
        LOG.debug("Receiving one chunk for block " + block +
                  " of size " + len);
        receiveChunk( len );
        curPacketSize += (len + checksumSize);
        if (curPacketSize > packetSize) {
          throw new IOException("Packet size for block " + block +
                                " too long " + curPacketSize +
                                " was expecting " + packetSize);
        }
        if (curPacketSize == packetSize) {
          if (mirrorOut != null) {
            try {
              mirrorOut.flush();
            } catch (IOException e) {
              handleMirrorOutError(e);
            }
          }
          break;
        }
        len = in.readInt();
        curPacketSize += 4;
      }

      // flush entire packet before sending ack
      flush();

      // put in queue for pending acks
      if (responder != null) {
        ((PacketResponder)responder.getRunnable()).enqueue(seqno,
                                        lastPacketInBlock);
      }
    }

    public void writeChecksumHeader(DataOutputStream mirrorOut) throws IOException {
      checksum.writeHeader(mirrorOut);
    }
  

    public void receiveBlock(
        DataOutputStream mirrOut, // output to next datanode
        DataInputStream mirrIn,   // input from next datanode
        DataOutputStream replyOut,  // output to previous datanode
        String mirrAddr, Throttler throttlerArg,
        int numTargets) throws IOException {

      mirrorOut = mirrOut;
      mirrorAddr = mirrAddr;
      throttler = throttlerArg;

      try {
        // write data chunk header
        if (!finalized) {
          checksumOut.writeShort(FSDataset.METADATA_VERSION);
          checksum.writeHeader(checksumOut);
        }
        if (clientName.length() > 0) {
          responder = new Daemon(threadGroup,
                                 new PacketResponder(this, block, mirrIn,
                                                     replyOut, numTargets,
                                                     clientName));
          responder.start(); // start thread to process responses
        }

        /*
         * Read packets until the zero-length end-of-block marker.
         * A response is needed for every packet.
         */
        int len = in.readInt(); // get packet size
        while (len != 0) {
          receivePacket(len);
          len = in.readInt(); // get packet size
        }

        // flush the mirror out
        if (mirrorOut != null) {
          try {
            mirrorOut.writeInt(0); // mark the end of the block
            mirrorOut.flush();
          } catch (IOException e) {
            handleMirrorOutError(e);
          }
        }

        // wait for all outstanding packet responses, then tell the
        // responder to shut down gracefully.
        if (responder != null) {
          ((PacketResponder)responder.getRunnable()).close();
        }

        // if this write is for a replication request (and not
        // from a client), then finalize block. For client-writes,
        // the block is finalized in the PacketResponder.
        if (clientName.length() == 0) {
          // close the block/crc files
          close();

          // Finalize the block. Does this fsync()?
          block.setNumBytes(offsetInBlock);
          data.finalizeBlock(block);
          myMetrics.blocksWritten.inc();
        }

      } catch (IOException ioe) {
        LOG.info("Exception in receiveBlock for block " + block +
                 " " + ioe);
        IOUtils.closeStream(this);
        if (responder != null) {
          responder.interrupt();
        }
        throw ioe;
      } finally {
        if (responder != null) {
          try {
            responder.join();
          } catch (InterruptedException e) {
            throw new IOException("Interrupted receiveBlock");
          }
          responder = null;
        }
      }
    }

    /**
     * Sets the file pointer in the local block file to the specified value.
     */
    private void setBlockPosition(long offsetInBlock) throws IOException {
      if (finalized) {
        if (!isRecovery) {
          throw new IOException("Write to offset " + offsetInBlock +
                                " of block " + block +
                                " that is already finalized.");
        }
        if (offsetInBlock > data.getLength(block)) {
          throw new IOException("Write to offset " + offsetInBlock +
                                " of block " + block +
                                " that is already finalized and is of size " +
                                data.getLength(block));
        }
        return;
      }
      if (data.getChannelPosition(block, streams) + bufStream.count() ==
                                                    offsetInBlock) {
        return;                   // nothing to do
      }
      if (offsetInBlock % bytesPerChecksum != 0) {
        throw new IOException("setBlockPosition trying to set position to " +
                              offsetInBlock +
                              " which is not a multiple of bytesPerChecksum " +
                               bytesPerChecksum);
      }
      long offsetInChecksum = checksum.getChecksumHeaderSize() +
                              offsetInBlock / bytesPerChecksum * checksumSize;
      if (out != null) {
       out.flush();
      }
      if (checksumOut != null) {
        checksumOut.flush();
      }
      LOG.info("Changing block file offset of block " + block + " from " +
               data.getChannelPosition(block, streams) +
               " to " + offsetInBlock +
               " meta file offset to " + offsetInChecksum);

      // set the position of the block file
      data.setChannelPosition(block, streams, offsetInBlock, offsetInChecksum);
    }
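
    /* Worked example of the meta-file arithmetic above, assuming the common
     * defaults of 512 bytes per checksum chunk and 4-byte CRC32 checksums:
     * positioning the block file at offsetInBlock = 1048576 positions the
     * meta file at
     *
     *   getChecksumHeaderSize() + 1048576 / 512 * 4
     *     = getChecksumHeaderSize() + 8192
     *
     * i.e. one 4-byte checksum for each of the 2048 chunks that precede the
     * offset, after the fixed checksum header.
     */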
  }

  /**
   * Used for transferring a block of data.  This class
   * sends a piece of data to another DataNode.
   */
  class DataTransfer implements Runnable {
    DatanodeInfo targets[];
    Block b;

    /**
     * Connect to the first item in the target list.  Pass along the
     * entire target list, the block, and the data.
     */
    public DataTransfer(DatanodeInfo targets[], Block b) throws IOException {
      this.targets = targets;
      this.b = b;
    }

    /**
     * Do the deed, write the bytes
     */
    public void run() {
      xmitsInProgress++;
      Socket sock = null;
      DataOutputStream out = null;
      BlockSender blockSender = null;
     
      try {
        InetSocketAddress curTarget =
          NetUtils.createSocketAddr(targets[0].getName());
        sock = new Socket();
        sock.connect(curTarget, socketTimeout);
        sock.setSoTimeout(targets.length * socketTimeout);

        out = new DataOutputStream(new BufferedOutputStream(
            sock.getOutputStream(), BUFFER_SIZE));
        blockSender = new BlockSender(b, 0, -1, false, false, false);

        //
        // Header info
        //
        out.writeShort(DATA_TRANFER_VERSION);
        out.writeByte(OP_WRITE_BLOCK);
        out.writeLong(b.getBlockId());
        out.writeInt(0);           // no pipelining
        out.writeBoolean(false);   // not part of recovery
        Text.writeString(out, ""); // client
        // write targets
        out.writeInt(targets.length - 1);
        for (int i = 1; i < targets.length; i++) {
          targets[i].write(out);
        }
        // send data & checksum
        blockSender.sendBlock(out, null);

        // no response necessary
        LOG.info(dnRegistration + ":Transmitted block " + b + " to " + curTarget);

      } catch (IOException ie) {
        LOG.warn(dnRegistration + ":Failed to transfer " + b + " to " + targets[0].getName()
            + " got " + StringUtils.stringifyException(ie));
      } finally {
        IOUtils.closeStream(blockSender);
        IOUtils.closeStream(out);
        IOUtils.closeSocket(sock);
        xmitsInProgress--;
      }
    }
  }
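
  /* A DataTransfer is meant to run on its own thread; a typical launch,
   * sketched with the Daemon wrapper this class already uses for its
   * PacketResponder threads:
   *
   *   new Daemon(new DataTransfer(targets, block)).start();
   *
   * The receiving datanode reads the OP_WRITE_BLOCK header written in run()
   * and replicates the block onward to any remaining targets.
   */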

  /**
   * No matter what kind of exception we get, keep retrying offerService().
   * That's the loop that connects to the NameNode and provides basic DataNode
   * functionality.
   *
   * Only stop when "shouldRun" is turned off (which can only happen at shutdown).
   */
  public void run() {
    LOG.info(dnRegistration + "In DataNode.run, data = " + data);

    // start block scanner
    if (blockScannerThread != null) {
      blockScannerThread.start();
    }

    // start dataXceiveServer
    dataXceiveServer.start();
       
    while (shouldRun) {
      try {
        startDistributedUpgradeIfNeeded();
        offerService();
      } catch (Exception ex) {
        LOG.error("Exception: " + StringUtils.stringifyException(ex));
        if (shouldRun) {
          try {
            Thread.sleep(5000);
          } catch (InterruptedException ie) {
          }
        }
      }
    }
       
    // wait for dataXceiveServer to terminate
    try {
      this.dataXceiveServer.join();
    } catch (InterruptedException ie) {
    }
       
    LOG.info(dnRegistration + ":Finishing DataNode in: "+data);
  }
   
  /** Start datanode daemon.
   */
  public static DataNode run(Configuration conf) throws IOException {
    String[] dataDirs = conf.getStrings("dfs.data.dir");
    DataNode dn = makeInstance(dataDirs, conf);
    if (dn != null) {
      dn.dataNodeThread = new Thread(dn, "DataNode: [" +
                                  StringUtils.arrayToString(dataDirs) + "]");
      dn.dataNodeThread.setDaemon(true); // needed for JUnit testing
      dn.dataNodeThread.start();
    }
    return dn;
  }
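
  /* Sketch of embedding a datanode in-process, as JUnit tests do; the data
   * directory shown is hypothetical:
   *
   *   Configuration conf = new Configuration();
   *   conf.set("dfs.data.dir", "/tmp/dfs/data");
   *   DataNode dn = DataNode.run(conf);
   *   if (dn != null) {
   *     dn.join();   // block until the datanode thread exits
   *   }
   */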

  /** Start a single datanode daemon and wait for it to finish.
   *  If this thread is specifically interrupted, it will stop waiting.
   */
  static DataNode createDataNode(String args[],
                                 Configuration conf) throws IOException {
    if (conf == null)
      conf = new Configuration();
    if (!parseArguments(args, conf)) {
      printUsage();
      return null;
    }
    return run(conf);
  }

  void join() {
    if (dataNodeThread != null) {
      try {
        dataNodeThread.join();
      } catch (InterruptedException e) {}
    }
  }

  /**
   * Make an instance of DataNode after ensuring that at least one of the
   * given data directories (and their parent directories, if necessary)
   * can be created.
   * @param dataDirs List of directories, where the new DataNode instance should
   * keep its files.
   * @param conf Configuration instance to use.
   * @return DataNode instance for given list of data dirs and conf, or null if
   * no directory from this directory list can be created.
   * @throws IOException
   */
  static DataNode makeInstance(String[] dataDirs, Configuration conf)
    throws IOException {
    ArrayList<File> dirs = new ArrayList<File>();
    for (int i = 0; i < dataDirs.length; i++) {
      File data = new File(dataDirs[i]);
      try {
        DiskChecker.checkDir(data);
        dirs.add(data);
      } catch(DiskErrorException e) {
        LOG.warn("Invalid directory in dfs.data.dir: " + e.getMessage());
      }
    }
    if (dirs.size() > 0)
      return new DataNode(conf, dirs);
    LOG.error("All directories in dfs.data.dir are invalid.");
    return null;
  }

  @Override
  public String toString() {
    return "DataNode{" +
      "data=" + data +
      ", localName='" + dnRegistration.getName() + "'" +
      ", storageID='" + dnRegistration.getStorageID() + "'" +
      ", xmitsInProgress=" + xmitsInProgress +
      "}";
  }
 
  private static void printUsage() {
    System.err.println("Usage: java DataNode");
    System.err.println("           [-r, --rack <network location>] |");
    System.err.println("           [-rollback]");
  }

  /**
   * Parse and verify command line arguments and set configuration parameters.
   *
   * @return false if passed arguments are incorrect
   */
  private static boolean parseArguments(String args[],
                                        Configuration conf) {
    int argsLen = (args == null) ? 0 : args.length;
    StartupOption startOpt = StartupOption.REGULAR;
    String networkLoc = null;
    for(int i=0; i < argsLen; i++) {
      String cmd = args[i];
      if ("-r".equalsIgnoreCase(cmd) || "--rack".equalsIgnoreCase(cmd)) {
        if (i==args.length-1)
          return false;
        networkLoc = args[++i];
        if (networkLoc.startsWith("-"))
          return false;
      } else if ("-rollback".equalsIgnoreCase(cmd)) {
        startOpt = StartupOption.ROLLBACK;
      } else if ("-regular".equalsIgnoreCase(cmd)) {
        startOpt = StartupOption.REGULAR;
      } else
        return false;
    }
    if (networkLoc != null)
      conf.set("dfs.datanode.rack", NodeBase.normalize(networkLoc));
    setStartupOption(conf, startOpt);
    return true;
  }
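
  /* Example command lines parseArguments() accepts (cf. printUsage());
   * the rack path shown is hypothetical:
   *
   *   java DataNode                        // regular startup
   *   java DataNode -rollback              // roll back to the previous state
   *   java DataNode -r /datacenter1/rack7  // sets dfs.datanode.rack
   */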

  private static void setStartupOption(Configuration conf, StartupOption opt) {
    conf.set("dfs.datanode.startup", opt.toString());
  }

  static StartupOption getStartupOption(Configuration conf) {
    return StartupOption.valueOf(conf.get("dfs.datanode.startup",
                                          StartupOption.REGULAR.toString()));
  }

  /* Get the network location by running a script configured in conf */
  private static String getNetworkLoc(Configuration conf)
    throws IOException {
    String locScript = conf.get("dfs.network.script");
    if (locScript == null)
      return NetworkTopology.DEFAULT_RACK;

    LOG.info("Starting to run script to get datanode network location");
    Process p = Runtime.getRuntime().exec(locScript);
    StringBuffer networkLoc = new StringBuffer();
    final BufferedReader inR = new BufferedReader(
                                                  new InputStreamReader(p.getInputStream()));
    final BufferedReader errR = new BufferedReader(
                                                   new InputStreamReader(p.getErrorStream()));

    // read & log any error messages from the running script
    Thread errThread = new Thread() {
        @Override
        public void run() {
          try {
            String errLine = errR.readLine();
            while(errLine != null) {
              LOG.warn("Network script error: "+errLine);
              errLine = errR.readLine();
            }
          } catch(IOException e) {
            // best-effort logging; ignore read errors on the error stream
          }
        }
      };
    try {
      errThread.start();
           
      // fetch output from the process
      String line = inR.readLine();
      while(line != null) {
        networkLoc.append(line);
        line = inR.readLine();
      }
      try {
        // wait for the process to finish
        int returnVal = p.waitFor();
        // check the exit code
        if (returnVal != 0) {
          throw new IOException("Process exits with nonzero status: "+locScript);
        }
      } catch (InterruptedException e) {
        throw new IOException(e.getMessage());
      } finally {
        try {
          // make sure that the error thread exits
          errThread.join();
        } catch (InterruptedException je) {
          LOG.warn(StringUtils.stringifyException(je));
        }
      }
    } finally {
      // close in & error streams
      try {
        inR.close();
      } catch (IOException ine) {
        throw ine;
      } finally {
        errR.close();
      }
    }

    return networkLoc.toString();
  }
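
  /* To use a rack-location script, point dfs.network.script at an executable
   * that prints this node's network location on stdout; script path and
   * output below are hypothetical:
   *
   *   conf.set("dfs.network.script", "/etc/hadoop/print-rack.sh");
   *   // where print-rack.sh might simply do:  echo /datacenter1/rack7
   *
   * If the property is unset, getNetworkLoc() falls back to
   * NetworkTopology.DEFAULT_RACK.
   */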
 
  /**
   * This method arranges for the data node to send the block report at the next heartbeat.
   */
  public void scheduleBlockReport(long delay) {
    if (delay > 0) { // send BR after random delay
      lastBlockReport = System.currentTimeMillis()
                - ( blockReportInterval - new Random().nextInt((int)(delay)));
    } else { // send at next heartbeat
      lastBlockReport = lastHeartbeat - blockReportInterval;
    }
    resetBlockReportTime = true; // reset future BRs for randomness
  }
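
  /* Worked example: with blockReportInterval = 1 hour and delay = 60000 ms,
   * lastBlockReport is backdated so that the next report comes due at a
   * random point within the next minute, staggering reports instead of
   * having every datanode send one at the same instant.
   */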
 
 
  /**
   * This method is used for testing.
   * Examples are adding and deleting blocks directly.
   * The most common usage will be when the data node's storage is simulated.
   *
   * @return the fsdataset that stores the blocks
   */
  public FSDatasetInterface getFSDataset() {
    return data;
  }

  /**
   */
  public static void main(String args[]) {
    try {
      StringUtils.startupShutdownMessage(DataNode.class, args, LOG);
      DataNode datanode = createDataNode(args, null);
      if (datanode != null)
        datanode.join();
    } catch (Throwable e) {
      LOG.error(StringUtils.stringifyException(e));
      System.exit(-1);
    }
  }
}