// Package: org.apache.hadoop.hdfs
// Source code of org.apache.hadoop.hdfs.DFSOutputStream

/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs;

import java.io.BufferedOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InterruptedIOException;
import java.net.InetSocketAddress;
import java.net.Socket;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Iterator;
import java.util.LinkedList;

import org.apache.hadoop.fs.FSOutputSummer;
import org.apache.hadoop.fs.FileAlreadyExistsException;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Syncable;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.hdfs.DFSClient.MultiDataInputStream;
import org.apache.hadoop.hdfs.DFSClient.MultiDataOutputStream;
import org.apache.hadoop.hdfs.protocol.AppendBlockHeader;
import org.apache.hadoop.hdfs.profiling.DFSWriteProfilingData;
import org.apache.hadoop.hdfs.profiling.DFSWriteProfilingData.WritePacketClientProfile;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.ClientDatanodeProtocol;
import org.apache.hadoop.hdfs.protocol.ClientProtocol;
import org.apache.hadoop.hdfs.protocol.DSQuotaExceededException;
import org.apache.hadoop.io.WriteOptions;
import org.apache.hadoop.hdfs.protocol.DataTransferProtocol;
import org.apache.hadoop.hdfs.protocol.DataTransferProtocol.PipelineAck;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.FSConstants;
import org.apache.hadoop.hdfs.protocol.LocatedBlock;
import org.apache.hadoop.hdfs.protocol.LocatedBlockWithMetaInfo;
import org.apache.hadoop.hdfs.protocol.LocatedBlockWithOldGS;
import org.apache.hadoop.hdfs.protocol.NSQuotaExceededException;
import org.apache.hadoop.hdfs.protocol.VersionedLocatedBlock;
import org.apache.hadoop.hdfs.protocol.WriteBlockHeader;
import org.apache.hadoop.hdfs.server.datanode.DataNode;
import org.apache.hadoop.hdfs.server.namenode.INode;
import org.apache.hadoop.hdfs.server.namenode.NotReplicatedYetException;
import org.apache.hadoop.hdfs.server.protocol.BlockAlreadyCommittedException;
import org.apache.hadoop.hdfs.util.InjectionEvent;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.ipc.Client;
import org.apache.hadoop.ipc.ProtocolProxy;
import org.apache.hadoop.ipc.RPC;
import org.apache.hadoop.ipc.RemoteException;
import org.apache.hadoop.net.NetUtils;
import org.apache.hadoop.security.AccessControlException;
import org.apache.hadoop.util.Daemon;
import org.apache.hadoop.util.DataChecksum;
import org.apache.hadoop.util.InjectionHandler;
import org.apache.hadoop.util.Progressable;
import org.apache.hadoop.util.NativeCrc32;
import org.apache.hadoop.util.StringUtils;

/****************************************************************
* DFSOutputStream creates files from a stream of bytes.
*
* The client application writes data that is cached internally by
* this stream. Data is broken up into packets, each packet is
* typically 64K in size. A packet comprises of chunks. Each chunk
* is typically 512 bytes and has an associated checksum with it.
*
* When a client application fills up the currentPacket, it is
* enqueued into dataQueue.  The DataStreamer thread picks up
* packets from the dataQueue, sends it to the first datanode in
* the pipeline and moves it from the dataQueue to the ackQueue.
* The ResponseProcessor receives acks from the datanodes. When a
* successful ack for a packet is received from all datanodes, the
* ResponseProcessor removes the corresponding packet from the
* ackQueue.
*
* In case of error, all outstanding packets are moved from the
* ackQueue. A new pipeline is setup by eliminating the bad
* datanode from the original pipeline. The DataStreamer now
* starts sending packets from the dataQueue.
****************************************************************/
class DFSOutputStream extends FSOutputSummer implements Syncable, Replicable {
  private final DFSClient dfsClient;
  private Socket[] s; // sockets of the current write pipeline — TODO confirm one per datanode
  boolean closed = false; // true once this stream is closed or aborted

  private String src; // path of the file being written
  private MultiDataOutputStream blockStream; // data output toward the pipeline
  private MultiDataInputStream blockReplyStream; // ack input from the pipeline
  private Block block; // block currently being written
  final private long blockSize;
  private boolean pktIncludeVersion = false; // whether packets carry a version field
  final private int packetVersion;
  private DataChecksum checksum;
  // Packets filled by the writer, waiting to be sent by the DataStreamer.
  private LinkedList<DFSOutputStreamPacket> dataQueue = new LinkedList<DFSOutputStreamPacket>();
  // Packets already sent, waiting for acks from the datanodes.
  private LinkedList<DFSOutputStreamPacket> ackQueue = new LinkedList<DFSOutputStreamPacket>();
  private int numPendingHeartbeats = 0; // heartbeat packets sent but not yet acked
  private long lastPacketSentTime = 0; // when the last data packet was sent
  private final long packetTimeout; // max time to wait for an ack before warning
  private DFSOutputStreamPacket currentPacket = null;
  private int maxPackets = 80; // each packet 64K, total 5MB
  // private int maxPackets = 1000; // each packet 64K, total 64MB
  private DataStreamer streamer;
  private ResponseProcessor response = null;
  private long currentSeqno = 0;
  private long lastQueuedSeqno = -1;
  private long lastAckedSeqno = -1;
  private long bytesCurBlock = 0; // bytes writen in current block
  private int packetSize = 0; // write packet size, including the header.
  private int chunksPerPacket = 0;
  DatanodeInfo[] nodes = null; // list of targets for current block
  private DatanodeInfo[] favoredNodes = null; // put replicas here if possible
  private volatile boolean hasError = false; // set on any pipeline error
  private volatile int errorIndex = 0; // index of the first bad datanode
  volatile IOException lastException = null; // first exception seen (see setLastException)
  private long artificialSlowdown = 0; // unit-test hook: ms to sleep per packet
  private long lastFlushOffset = 0; // offset when flush was invoked
  private boolean persistBlocks = false; // persist blocks on namenode
  private int recoveryErrorCount = 0; // number of times block recovery failed
  private final int maxRecoveryErrorCount;
  private volatile boolean appendChunk = false;   // appending to existing partial block
  private long initialFileSize = 0; // at time of file open
  private Progressable progress;
  private short blockReplication; // replication factor of file
  private long lastBlkOffset = 0; // end pos of last block already sent

  private boolean forceSync;
  private boolean doParallelWrites = false; // write to all replicas in parallel
   
  private final WriteOptions options;
  private void setLastException(IOException e) {
    if (lastException == null) {
      lastException = e;
    }
  }
 
  public void setOffsets(long offset) {
    DFSClient.LOG.info("set last block offsets in file: " + src + " pos: " + offset);
    lastBlkOffset = offset;
  }

  /** Decide if the write pipeline supports bidirectional heartbeat or not */
  private boolean supportClientHeartbeat() throws IOException {
    return dfsClient.getDataTransferProtocolVersion() >=
                 DataTransferProtocol.CLIENT_HEARTBEAT_VERSION;
  }

  /**
   * Check if the last outstanding packet has not received an ack before
   * it is timed out.
   * If true, for now just log it.
   * We will provide a decent solution to this later on.
   */
  private void checkIfLastPacketTimeout() {
     synchronized (ackQueue) {
             if( !ackQueue.isEmpty()  && (
                             System.currentTimeMillis() - lastPacketSentTime > packetTimeout) ) {
               DFSClient.LOG.warn("Packet " + ackQueue.getLast().seqno +
                             " of " + block + " is timed out");
             }
     }
  }


  //
  // The DataStreamer class is responsible for sending data packets to the
  // datanodes in the pipeline. It retrieves a new blockid and block locations
  // from the namenode, and starts streaming packets to the pipeline of
  // Datanodes. Every packet has a sequence number associated with
  // it. When all the packets for a block are sent out and acks for each
  // if them are received, the DataStreamer closes the current block.
  //
  private class DataStreamer extends Daemon {

    private volatile boolean closed = false;
    private long lastPacket;
    private boolean doSleep;

    DataStreamer() throws IOException {
      // explicitly invoke RPC so avoiding RPC in waitForWork
      // that might cause timeout
      dfsClient.getDataTransferProtocolVersion();
    }

    private void waitForWork() throws IOException {
      if ( supportClientHeartbeat() ) {  // send heart beat
        long now = System.currentTimeMillis();
        while ((!closed && !hasError && dfsClient.clientRunning
            && dataQueue.size() == &&
            (blockStream == null || (
                blockStream != null && now - lastPacket < dfsClient.timeoutValue/2)))
                || doSleep) {
          long timeout = dfsClient.timeoutValue/2 - (now-lastPacket);
          timeout = timeout <= 0 ? 1000 : timeout;

          try {
            dataQueue.wait(timeout);
            checkIfLastPacketTimeout();
            now = System.currentTimeMillis();
          } catch (InterruptedException  e) {
          }
          doSleep = false;
        }
      } else { // no sending heart beat
        while ((!closed && !hasError && dfsClient.clientRunning
            && dataQueue.size() == 0) || doSleep) {
          try {
            dataQueue.wait(1000);
          } catch (InterruptedException  e) {
          }
          doSleep = false;
        }
      }
    }

    public void run() {
      while (!closed && dfsClient.clientRunning) {

        // if the Responder encountered an error, shutdown Responder
        if (hasError && response != null) {
          try {
            response.close();
            response.join();
            response = null;
          } catch (InterruptedException  e) {
          }
        }

        DFSOutputStreamPacket one = null;

        // process IO errors if any
        doSleep = processDatanodeError(hasError, false);

        try {
          synchronized (dataQueue) {
            // wait for a packet to be sent.
            waitForWork();

            if (closed || hasError || !dfsClient.clientRunning) {
              continue;
            }

            InjectionHandler
                .processEventIO(InjectionEvent.DFSCLIENT_DATASTREAM_AFTER_WAIT, blockStream);
           
            // get packet to be sent.
            if (dataQueue.isEmpty()) {
              one = DFSOutputStreamPacketFactory.getHeartbeatPacket(
                  DFSOutputStream.this, ifPacketIncludeVersion(),
                  getPacketVersion()); // heartbeat
                                                                     // packet
            } else {
              one = dataQueue.getFirst(); // regular data packet
              one.eventPopFromDataQueue();
            }
          }
                 
          long offsetInBlock = one.offsetInBlock;

          // get new block from namenode.
          if (blockStream == null) {
            DFSClient.LOG.debug("Allocating new block: " + src + "  pos: " + lastBlkOffset);

            nodes = nextBlockOutputStream(src);
            this.setName("DataStreamer for file " + src +
                " block " + block);
            response = new ResponseProcessor(nodes);
            response.start();
          }

          if (offsetInBlock > blockSize
              || (offsetInBlock == blockSize && (one.dataLength > 0 || !one.lastPacketInBlock))) {
            throw new IOException("BlockSize " + blockSize +
                                  " is smaller than data size. " +
                                  " Offset of packet in block " +
                                  offsetInBlock +
                                  " Aborting file " + src);
          }

          ByteBuffer buf = one.getBuffer();

          InjectionHandler.processEventIO(
              InjectionEvent.DFSCLIENT_DATASTREAM_BEFORE_WRITE, blockStream);
         
          // write out data to remote datanode
          blockStream.write(buf.array(), buf.position(), buf.remaining());

          if (one.lastPacketInBlock) {
            blockStream.writeInt(0); // indicate end-of-block
          }
          blockStream.flush();
          lastPacket = System.currentTimeMillis();
          if (DFSClient.LOG.isDebugEnabled()) {
            DFSClient.LOG.debug("DataStreamer block " + block +
                      " wrote packet seqno:" + one.seqno +
                      " size:" + buf.remaining() +
                      " offsetInBlock:" + one.offsetInBlock +
                      " lastPacketInBlock:" + one.lastPacketInBlock);
          }

          // move packet from dataQueue to ackQueue
          synchronized (dataQueue) {
            if (!one.isHeartbeatPacket()) {
              dataQueue.removeFirst();
              dataQueue.notifyAll();
              synchronized (ackQueue) {
                ackQueue.addLast(one);
                one.eventAddToAckQueue();
                lastPacketSentTime = System.currentTimeMillis();
                ackQueue.notifyAll();
              }
            } else {
              synchronized (ackQueue) {
                numPendingHeartbeats++;
                ackQueue.notifyAll();                 
              }

              DFSClient.LOG.info("Sending a heartbeat packet for block " + block);
            }
          }
        } catch (Throwable e) {
          dfsClient.incWriteExpCntToStats();

          DFSClient.LOG.warn("DataStreamer Exception: ", e);
          if (e instanceof IOException) {
            setLastException((IOException)e);
          }
          hasError = true;
          if (blockStream != null) {
            // find the first datanode to which we could not write data.
            int possibleError =  blockStream.getErrorIndex();
            if (possibleError != -1) {
              errorIndex = possibleError;
              DFSClient.LOG.warn("DataStreamer bad datanode in pipeline:" +
                         possibleError);
            }
          }
        }

        if (closed || hasError || !dfsClient.clientRunning) {
          continue;
        }

        // Is this block full?
        if (one.lastPacketInBlock) {
          synchronized (ackQueue) {
            while (!hasError && ackQueue.size() != 0 && dfsClient.clientRunning) {
              try {
                ackQueue.wait();   // wait for acks to arrive from datanodes
              } catch (InterruptedException  e) {
              }
            }
          }
          DFSClient.LOG.debug("Closing old block " + block);
          this.setName("DataStreamer for file " + src);

          response.close();        // ignore all errors in Response
          try {
            response.join();
            response = null;
          } catch (InterruptedException  e) {
          }
         
          if (closed || hasError || !dfsClient.clientRunning) {
            continue;
          }

          synchronized (dataQueue) {
            try {
              blockStream.close();
              blockReplyStream.close();
            } catch (IOException e) {
            }
            nodes = null;
            response = null;
            blockStream = null;
            blockReplyStream = null;
          }
        }
       
        if (progress != null) { progress.progress(); }

        // This is used by unit test to trigger race conditions.
        if (artificialSlowdown != 0 && dfsClient.clientRunning) {
          DFSClient.sleepForUnitTest(artificialSlowdown);
        }
      }
    }

    // shutdown thread
    void close() {
      closed = true;
      synchronized (dataQueue) {
        dataQueue.notifyAll();
      }
      synchronized (ackQueue) {
        ackQueue.notifyAll();
      }
      this.interrupt();
    }
  }

  //
  // Processes responses from the datanodes.  A packet is removed
  // from the ackQueue when its response arrives.
  //
  private class ResponseProcessor extends Thread {

    private volatile boolean closed = false;
    private DatanodeInfo[] targets = null;
    private boolean lastPacketInBlock = false;

    ResponseProcessor (DatanodeInfo[] targets) {
      this.targets = targets;
    }

    public void run() {

      this.setName("ResponseProcessor for block " + block);

      while (!closed && dfsClient.clientRunning && !lastPacketInBlock) {
        // process responses from datanodes.
        int recordError = 0;
        try {
          long seqno = 0;
          synchronized (ackQueue) {
            while (!closed && dfsClient.clientRunning && ackQueue.isEmpty() &&
                   numPendingHeartbeats == 0) {
              try {
                ackQueue.wait();
              } catch (InterruptedException e) {
                // If the thread is being interrupted when waiting for
                // packet, we log the exception and treat it as a normal
                // exception.
                //
                DFSClient.LOG.info("ResponseProcessor thread interrupted when " +
                         "waiting for new packets");
                throw e;
              }
            }
          }
          if (closed || !dfsClient.clientRunning) {
            break;
          }

          eventStartReceiveAck();
          PipelineAck pipelineAck = null;
          if (!doParallelWrites) {
            // verify seqno from datanode
            if (supportClientHeartbeat()) {
              pipelineAck = new PipelineAck();
              pipelineAck.readFields(blockReplyStream.get(0), targets.length,
                  profileData != null);
             
              seqno = pipelineAck.getSeqno();
             
              if (!pipelineAck.isSuccess()) {
                for (int i = 0; i < targets.length && dfsClient.clientRunning; i++) {
                  short reply = pipelineAck.getReply(i);
                  if (reply != DataTransferProtocol.OP_STATUS_SUCCESS) {
                    recordError = i; // first bad datanode
                    throw new IOException("Bad response " + reply + " for block "
                        + block + " from datanode " + targets[i].getName());
                  }
                }               
              }
            } else {
              // Backward compatibility codes.
              seqno = blockReplyStream.get(0).readLong();
              DFSClient.LOG.debug("DFSClient received ack for seqno " + seqno);
              if (seqno == DFSOutputStreamPacket.HEART_BEAT_SEQNO) {
                continue;
              }
              // regular ack
              // processes response status from all datanodes.
              for (int i = 0; i < targets.length && dfsClient.clientRunning; i++) {
                short reply = blockReplyStream.get(0).readShort();
                if (reply != DataTransferProtocol.OP_STATUS_SUCCESS) {
                  recordError = i; // first bad datanode
                  throw new IOException("Bad response " + reply + " for block "
                      + block + " from datanode " + targets[i].getName());
                }
              }
            }
          } else {
            // The client is writing to all replicas in parallel. It also
            // expects an ack from all replicas.
            long lastsn = 0;
            assert blockReplyStream.size() > 0;
            for (int i = 0; i < blockReplyStream.size(); i++) {
              recordError = i; // remember the current slot
              seqno = blockReplyStream.get(i).readLong();
              if (DFSClient.LOG.isDebugEnabled()) {
                DFSClient.LOG.debug("DFSClient for block " + block + " " + seqno);
              }
              if (i != 0 && seqno != -2 && seqno != lastsn) {
                String msg = "Responses from datanodes do not match "
                    + " this replica acked " + seqno
                    + " but previous replica acked " + lastsn;
                DFSClient.LOG.warn(msg);
                throw new IOException(msg);
              }
              short reply = blockReplyStream.get(i).readShort();
              if (reply != DataTransferProtocol.OP_STATUS_SUCCESS) {
                recordError = i; // first bad datanode
                throw new IOException("Bad parallel response " + reply
                    + " for block " + block + " from datanode "
                    + targets[i].getName());
              }
              lastsn = seqno;
            }
          }

          assert seqno != -2 :
            "Ack for unkown seqno should be a failed ack!";
          if (seqno == DFSOutputStreamPacket.HEART_BEAT_SEQNO) {  // a heartbeat ack
            assert supportClientHeartbeat();
            synchronized(ackQueue) {
              assert numPendingHeartbeats > 0;
              numPendingHeartbeats--;
            }
            continue;
          }

          DFSOutputStreamPacket one = null;
          synchronized (ackQueue) {
            assert !ackQueue.isEmpty();
            one = ackQueue.getFirst();
          }
          if (one.seqno != seqno) {
            throw new IOException("Responseprocessor: Expecting seqno " +
                " for block " + block +
                one.seqno + " but received " + seqno);
          }
                   
          lastPacketInBlock = one.lastPacketInBlock;

          if (lastPacketInBlock) {
            if (DFSClient.LOG.isDebugEnabled()) {
              DFSClient.LOG
                  .debug("Update pos in file: " + src + " curBlckOffset: "
                      + lastBlkOffset + " blockSize: "
                      + one.getEndPosInCurrBlk());
            }
            lastBlkOffset += one.getEndPosInCurrBlk();
          }

          synchronized (ackQueue) {
            assert seqno == lastAckedSeqno + 1;
            lastAckedSeqno = seqno;
           
            ackQueue.removeFirst();
            ackQueue.notifyAll();
          }

          one.eventAckReceived();

          if (getProfileData() != null) {
            getProfileData().finishPacket(one.profile, pipelineAck);
            long slowWriteProfileThreshold = options
                .getLogSlowWriteProfileDataThreshold();
            long totalTime = getProfileData().recentPacketProfile.getTotalTime();
            if (slowWriteProfileThreshold > 0
                && totalTime > slowWriteProfileThreshold) {
              DFSClient.LOG.warn("Slow Write Packet for block : " + block +
                  ", packet seqno : " + one.seqno +  ", total time : " +
                  totalTime + " \n" + getProfileData().recentPacketProfile);
            }
          }
        } catch (Exception e) {
          if (!closed) {
            hasError = true;
            errorIndex = recordError;
            if (e instanceof IOException) {
              setLastException((IOException)e);
            }
            DFSClient.LOG.warn("DFSOutputStream ResponseProcessor exception " +
                     " for block " + block +
                      StringUtils.stringifyException(e));
            closed = true;
          }
        }

        synchronized (dataQueue) {
          dataQueue.notifyAll();
        }
        synchronized (ackQueue) {
          ackQueue.notifyAll();
        }
      }
    }

    void close() {
      closed = true;
      this.interrupt();
    }
  }

  // If this stream has encountered any errors so far, shutdown
  // threads and mark stream as closed. Returns true if we should
  // sleep for a while after returning from this call.
  //
  private boolean processDatanodeError(boolean hasError, boolean isAppend) {
    if (!hasError) {
      return false;
    }
    if (response != null) {
      DFSClient.LOG.info("Error Recovery for block " + block +
               " waiting for responder to exit. ");
      return true;
    }
    dfsClient.incWriteExpCntToStats();

    if (errorIndex >= 0) {
      DFSClient.LOG.warn("Error Recovery for block " + block
          + " bad datanode[" + errorIndex + "] "
          + (nodes == null? "nodes == null": nodes[errorIndex].getName()));
    }

    if (blockStream != null) {
      try {
        blockStream.close();
        blockReplyStream.close();
      } catch (IOException e) {
      }
    }
    blockStream = null;
    blockReplyStream = null;

    // move packets from ack queue to front of the data queue
    synchronized (dataQueue) {
      synchronized (ackQueue) {
        if (!ackQueue.isEmpty()) {
          DFSClient.LOG.info("First unacked packet in " + block + " starts at "
              + ackQueue.getFirst().offsetInBlock);
          dataQueue.addAll(0, ackQueue);
          ackQueue.clear();
        }
        numPendingHeartbeats = 0;
      }
    }

    boolean success = false;
    while (!success && dfsClient.clientRunning) {
      DatanodeInfo[] newnodes = null;
      if (nodes == null) {
        String msg = "Could not get block locations. " +
                                        "Source file \"" + src
                                        + "\" - Aborting...";
        DFSClient.LOG.warn(msg);
        setLastException(new IOException(msg));
        closed = true;
        if (streamer != null) streamer.close();
        return false;
      }
      StringBuilder pipelineMsg = new StringBuilder();
      for (int j = 0; j < nodes.length; j++) {
        pipelineMsg.append(nodes[j].getName());
        if (j < nodes.length - 1) {
          pipelineMsg.append(", ");
        }
      }
      // remove bad datanode from list of datanodes.
      // If errorIndex was not set (i.e. appends), then do not remove
      // any datanodes
      //
      if (errorIndex < 0) {
        newnodes = nodes;
      } else {
        if (nodes.length <= 1) {
          lastException = new IOException("All datanodes " + pipelineMsg +
                                          " are bad. Aborting...");
          closed = true;
          if (streamer != null) streamer.close();
          return false;
        }
        DFSClient.LOG.warn("Error Recovery for block " + block +
                 " in pipeline " + pipelineMsg +
                 ": bad datanode " + nodes[errorIndex].getName());
        newnodes =  new DatanodeInfo[nodes.length-1];
        System.arraycopy(nodes, 0, newnodes, 0, errorIndex);
        System.arraycopy(nodes, errorIndex+1, newnodes, errorIndex,
            newnodes.length-errorIndex);
      }

      // Tell the primary datanode to do error recovery
      // by stamping appropriate generation stamps.
      //
      LocatedBlock newBlock = null;
      DatanodeInfo primaryNode = null;
      boolean clientAdRecoveryPrimaryProtocolSupported = false;
      try {
        clientAdRecoveryPrimaryProtocolSupported = dfsClient.namenodeProtocolProxy
            .isMethodSupported("nextGenerationStamp", Block.class,
                boolean.class);
      } catch (InterruptedIOException iie) {
        return false;
      } catch (IOException ioe) {
        DFSClient.LOG.warn(
            "Error when trying to determine whether namenode protocol "
                + "supports client as block recovoery coordinator.", ioe);
      }
      boolean clientAsRecoveryPrimary = dfsClient.conf.getBoolean(
          "dfs.client.as.block.recovery.primary", true)
          && clientAdRecoveryPrimaryProtocolSupported;
      try {
        if (clientAsRecoveryPrimary) {
          BlockRecoveryCoordinator brc = new BlockRecoveryCoordinator(
              DFSClient.LOG, dfsClient.conf, dfsClient.socketTimeout, null,
              new BlockSyncer(dfsClient.getNamespaceId(),
                  dfsClient.getNameNodeRPC(), DFSClient.LOG), null);
          newBlock = brc.recoverBlock(dfsClient.getNamespaceId(), block, false,
              newnodes, false, System.currentTimeMillis() + dfsClient.socketTimeout * 8000);
        } else {
          // Pick the "least" datanode as the primary datanode to avoid
          // deadlock.
          primaryNode = Collections.min(Arrays.asList(newnodes));
          newBlock = recoverBlockFromPrimaryDataNode(primaryNode, newnodes,
              isAppend);
        }
       
        if (newBlock == null) {
          throw new IOException("all datanodes do not have the block");
        }
        boolean isEmpty;
        long nextByteToSend;
        long newBlockSize = newBlock.getBlockSize();
        int numPktRemoved;
        synchronized (dataQueue) {
          numPktRemoved = adjustDataQueueAfterBlockRecovery(newBlockSize);
          isEmpty = dataQueue.isEmpty();
          if (isEmpty) {
            if (currentPacket != null) {
              nextByteToSend = currentPacket.offsetInBlock;
            } else {
              nextByteToSend = bytesCurBlock;
            }
          } else {
            nextByteToSend = dataQueue.getFirst().offsetInBlock;
          }
        }
        if (numPktRemoved > 0) {
          DFSClient.LOG.info("Remove " + numPktRemoved
              + " packets in the packet queue after block recovery");
          if (nextByteToSend > newBlockSize) {
            DFSClient.LOG
                .warn("Missing bytes after removing packets! It should never happen. nextByteToSend "
                    + nextByteToSend + " new block size " + newBlockSize);
          }
        } else if (nextByteToSend > newBlockSize) {
          DFSClient.LOG.warn("Missing bytes! Error Recovery for block " + block
              + " end up with " + newBlock.getBlockSize()
              + " bytes but client already sent " + nextByteToSend
              + " bytes and data queue is " + (isEmpty ? "" : "not ")
              + "empty.");
        } else if (DFSClient.LOG.isDebugEnabled()) {
          DFSClient.LOG.debug("Didn't remove any block. nextByteToSend "
                    + nextByteToSend + " new block size " + newBlockSize);
        }
      } catch (BlockAlreadyCommittedException e) {
        dfsClient.incWriteExpCntToStats();

        DFSClient.LOG
            .warn("Error Recovery for block "
                + block
                + " failed "
                + " because block is already committed according to primary datanode "
                + primaryNode + ". " + " Pipeline was " + pipelineMsg
                + ". Aborting...", e);

        lastException = e;
        closed = true;
        if (streamer != null) streamer.close();
        return false;       // abort with IOexception
      } catch (IOException e) {
        dfsClient.incWriteExpCntToStats();

        DFSClient.LOG.warn("Failed recovery attempt #" + recoveryErrorCount +
            " from primary datanode " + primaryNode, e);
        recoveryErrorCount++;
        // For client as primary, no need to retry as all failures thrown by
        // data nodes are already handled.
        if (clientAsRecoveryPrimary || recoveryErrorCount > maxRecoveryErrorCount) {
          if (!clientAsRecoveryPrimary && nodes.length > 1) {
            // if the primary datanode failed, remove it from the list.
            // The original bad datanode is left in the list because it is
            // conservative to remove only one datanode in one iteration.
            for (int j = 0; j < nodes.length; j++) {
              if (nodes[j].equals(primaryNode)) {
                errorIndex = j; // forget original bad node.
              }
            }
            // remove primary node from list
            newnodes =  new DatanodeInfo[nodes.length-1];
            System.arraycopy(nodes, 0, newnodes, 0, errorIndex);
            System.arraycopy(nodes, errorIndex+1, newnodes, errorIndex,
                             newnodes.length-errorIndex);
            nodes = newnodes;
            DFSClient.LOG.warn("Error Recovery for block " + block + " failed " +
                     " because recovery from primary datanode " +
                     primaryNode + " failed " + recoveryErrorCount +
                     " times. " + " Pipeline was " + pipelineMsg +
                     ". Marking primary datanode as bad.");
            recoveryErrorCount = 0;
            errorIndex = -1;
            return true;          // sleep when we return from here
          }
          String emsg = "Error Recovery for block " + block + " failed " +
                        " because recovery from primary datanode " +
                        primaryNode + " failed " + recoveryErrorCount +
                        " times. "  + " Pipeline was " + pipelineMsg +
                        ". Aborting...";
          DFSClient.LOG.warn(emsg);
          lastException = new IOException(emsg);
          closed = true;
          if (streamer != null) streamer.close();
          return false;       // abort with IOexception
        }
        DFSClient.LOG.warn("Error Recovery for block " + block + " failed " +
                 " because recovery from primary datanode " +
                 primaryNode + " failed " + recoveryErrorCount +
                 " times. "  + " Pipeline was " + pipelineMsg +
                 ". Will retry...");
        return true;          // sleep when we return from here
      } finally {
      }
      recoveryErrorCount = 0; // block recovery successful

      // If the block recovery generated a new generation stamp, use that
      // from now on.  Also, setup new pipeline
      //
      if (newBlock != null) {
        block = newBlock.getBlock();
        nodes = newBlock.getLocations();
      }

      this.hasError = false;
      lastException = null;
      errorIndex = 0;
      success = createBlockOutputStream(nodes, dfsClient.clientName,
          true, false);
    }

    response = new ResponseProcessor(nodes);
    response.start();
    return false; // do not sleep, continue processing
  }
 
  /**
   * Asks the primary datanode to run block recovery for the current block,
   * selecting the most capable {@code recoverBlock} RPC variant the datanode
   * supports and falling back to progressively older signatures.
   *
   * @param primaryNode the datanode coordinating the recovery
   * @param newnodes the datanodes that should form the recovered pipeline
   * @param isAppend whether this recovery is part of an append
   * @return the recovered block's new location info
   * @throws BlockAlreadyCommittedException if the primary reports the block
   *         is already committed (unwrapped from the RemoteException)
   * @throws IOException on any other RPC failure
   */
  LocatedBlock recoverBlockFromPrimaryDataNode(DatanodeInfo primaryNode,
      DatanodeInfo[] newnodes, boolean isAppend) throws IOException {
    ProtocolProxy<ClientDatanodeProtocol> primary = null;
    try {
    // Copied from org.apache.hadoop.ipc.Client
    int connectTimeout = dfsClient.conf.getInt(
        Client.CONNECT_TIMEOUT_KEY, Client.CONNECT_TIMEOUT_DEFAULT);
    int maxRetries = dfsClient.conf.getInt(
        Client.CONNECT_MAX_RETRIES_KEY, Client.CONNECT_MAX_RETRIES_DEFAULT);
    /*
     * considering pipeline recovery needs 3 RPCs to DataNodes and 2 RPCs to
     * NameNode; So rpcTimeout sets to be 5 times of client socketTimeout.
     * Also each datanode RPC might take upto (connectTimeout * maxRetries)
     * to establish connection.
     */
    int recoverTimeout = 5 * dfsClient.socketTimeout + 3
        * (connectTimeout * maxRetries);
    primary = DFSClient.createClientDNProtocolProxy(primaryNode,
        dfsClient.conf, recoverTimeout);
    try {
      // Prefer the newest variant (with a client-supplied deadline), then the
      // namespace-aware variant, then the legacy signature.
      if (primary.isMethodSupported("recoverBlock", int.class, Block.class,
          boolean.class, DatanodeInfo[].class, long.class)) {
        // The deadline is up to RPC time out minus one socket timeout
        // to be more conservative.
        return primary.getProxy().recoverBlock(dfsClient.namespaceId, block,
            isAppend, newnodes,
            System.currentTimeMillis() + recoverTimeout -
            dfsClient.socketTimeout - (maxRetries * connectTimeout));
      } else if (primary.isMethodSupported("recoverBlock", int.class, Block.class, boolean.class, DatanodeInfo[].class)) {
        return primary.getProxy().recoverBlock(
            dfsClient.namespaceId, block, isAppend, newnodes);
      } else {
        return primary.getProxy().recoverBlock(block, isAppend, newnodes);
      }
    } catch (RemoteException re) {
      // Surface "already committed" as its dedicated exception type so the
      // caller can abort instead of retrying recovery.
      if (re.unwrapRemoteException() instanceof BlockAlreadyCommittedException) {
        throw new BlockAlreadyCommittedException(re);
      } else {
        throw re;
      }
    }
    } finally {
      // Always tear down the short-lived proxy to the primary datanode.
      if (primary != null) {
        RPC.stopProxy(primary.getProxy());
      }
    }
  }
 
 
  /**
   * Trims the head of the data queue after a block recovery truncated the
   * block to {@code newBlockSize}, so the first packet to resend lines up
   * with the recovered block length (or with a full-chunk boundary).
   *
   * @param newBlockSize the block length reported after recovery
   * @return the number of packets removed from the head of the data queue
   */
  private int adjustDataQueueAfterBlockRecovery(long newBlockSize) {
    // New block size should be one of the packet's ending position
    // If the block offset of the first packet is not the new block
    // size, we should be able to remove several packets in packet
    // queue and make sure the first packet is the new block size.
    // Otherwise, something went wrong.
    //
    // We are conservative here: if the first unacked packet starts
    // with a full chunk, it can always be a clean checkpoint. We
    // keep the packets starting from it.
    //
    int bytesPerChecksum = checksum.getBytesPerChecksum();
    int numPktRemoved = 0;
    long newAckedSeqno = -1;
    while (!dataQueue.isEmpty()) {
      DFSOutputStreamPacket first = dataQueue.getFirst();
      long endOffsetOfBlock = first.getEndPosInCurrBlk();
      if (first.isHeartbeatPacket()) {
        // Heartbeats carry no data; always safe to drop.
        dataQueue.removeFirst();
        numPktRemoved++;
      } else if (first.offsetInBlock % bytesPerChecksum == 0) {
        // The first unacked packet starts with a full chunk.
        //
        break;
      } else if (endOffsetOfBlock <= newBlockSize) {
        if (first.lastPacketInBlock) {
          // Last block is already acked in all remaining replicas
          // Resend an empty one to force the stream to finish.
          //
          if (endOffsetOfBlock != newBlockSize) {
            DFSClient.LOG.warn("Packet is the last packet in block with "
                + endOffsetOfBlock
                + " but new block length after block recovery is "
                + newBlockSize + ". Something went wrong.");
          }
         
          // Turn the packet into an empty trailer: its data is already
          // durable, so move its start offset to its end before resending.
          first.cleanup();
          first.offsetInBlock = endOffsetOfBlock;
          DFSClient.LOG
              .info("Resend last packet in block and make it empty, new offsetInBlock "
                  + endOffsetOfBlock);
          break;
        } else {
          // Packet fully covered by the recovered length: drop it and
          // remember the highest sequence number we implicitly acked.
          dataQueue.removeFirst();
          numPktRemoved++;
          if (first.seqno > lastAckedSeqno) {
            newAckedSeqno = first.seqno;
          }
        }
      } else {
        if (first.offsetInBlock != newBlockSize) {
          DFSClient.LOG.warn("Packet has start offset " + first.offsetInBlock
              + " and end offset " + endOffsetOfBlock
              + " but new block length after block recovery is " + newBlockSize
              + ". Something went wrong.");         
        }
        break;
      }
    }
   
    // If we dropped data packets, advance lastAckedSeqno and wake any
    // threads blocked waiting for acks (e.g. flush/close).
    if (numPktRemoved > 0 && newAckedSeqno != -1) {
      synchronized (ackQueue) {
        lastAckedSeqno = newAckedSeqno;
        ackQueue.notifyAll();
      }
    }

    return numPktRemoved;
  }

  private void isClosed() throws IOException {
    if ((closed || !dfsClient.clientRunning) && lastException != null) {
        throw lastException;
    }
  }

  //
  // returns the list of targets, if any, that is being currently used.
  //
  DatanodeInfo[] getPipeline() {
    synchronized (dataQueue) {
      if (nodes == null) {
        return null;
      }
      DatanodeInfo[] value = new DatanodeInfo[nodes.length];
      for (int i = 0; i < nodes.length; i++) {
        value[i] = nodes[i];
      }
      return value;
    }
  }
 
  static private DFSWriteProfilingData getProfile(DFSClient dfsClient) {
    DFSWriteProfilingData profile = DFSClient.getAndResetProfileDataForNextOutputStream();
    if (dfsClient != null) {
      boolean ifAutoPrint = dfsClient.conf.getBoolean(
          FSConstants.FS_OUTPUT_STREAM_AUTO_PRINT_PROFILE, false);
      if (ifAutoPrint) {
        if (profile == null) {
          profile = new DFSWriteProfilingData();
        }
        profile.setAutoPrintWhileClose(true);
      }
    }
    return profile;
  }

  private DFSOutputStream(DFSClient dfsClient, String src, long blockSize,
      Progressable progress, int bytesPerChecksum, short replication, boolean forceSync,
boolean doParallelWrites, DatanodeInfo[] favoredNodes,
      WriteOptions options)
  throws IOException {
    super(new NativeCrc32(), bytesPerChecksum, 4, getProfile(dfsClient));
    this.dfsClient = dfsClient;
    this.forceSync = forceSync;
    this.doParallelWrites = doParallelWrites;
    this.src = src;
    this.blockSize = blockSize;
    this.blockReplication = replication;
    this.progress = progress;
    this.options = options;
    this.pktIncludeVersion = dfsClient.ifPacketIncludeVersion();
    this.packetVersion = dfsClient.getOutPacketVersion();
   
    streamer = new DataStreamer();
   
    packetTimeout =
        dfsClient.conf.getLong("dfs.client.packet.timeout", 15000); // 15 seconds
    // try block recovery 5 times:
    maxRecoveryErrorCount =
        dfsClient.conf.getInt("dfs.client.block.recovery.retries", 5);
   
    if (progress != null) {
      DFSClient.LOG.debug("Set non-null progress callback on DFSOutputStream "+src);
    }

    this.favoredNodes = favoredNodes;

    if ( bytesPerChecksum < 1 || blockSize % bytesPerChecksum != 0) {
      throw new IOException("io.bytes.per.checksum(" + bytesPerChecksum +
                            ") and blockSize(" + blockSize +
                            ") do not match. " + "blockSize should be a " +
                            "multiple of io.bytes.per.checksum");

    }
    checksum = DataChecksum.newDataChecksum(FSConstants.CHECKSUM_TYPE,
                                            bytesPerChecksum,
                                            new NativeCrc32());
  }
 
  /**
   * Create a new output stream to the given DataNode.
   * Convenience overload that delegates to the namespace-aware append
   * constructor with a namespace id of 0.
   * @see ClientProtocol#create(String, FsPermission, String, boolean, short, long)
   */
  DFSOutputStream(DFSClient dfsClient, String src, int buffersize,
      Progressable progress, LocatedBlock lastBlock, FileStatus stat,
      int bytesPerChecksum)
      throws IOException {
    this(dfsClient, src, buffersize, progress, lastBlock, stat,
        bytesPerChecksum, 0);
  }

  /**
   * Create a new output stream for file creation. Convenience overload that
   * delegates to the full create constructor with default WriteOptions.
   */
  DFSOutputStream(DFSClient dfsClient, String src, FsPermission masked,
      boolean overwrite, boolean createParent, short replication,
      long blockSize, Progressable progress, int buffersize,
      int bytesPerChecksum, boolean forceSync, boolean doParallelWrites,
      DatanodeInfo[] favoredNodes) throws IOException {
    this(dfsClient, src, masked, overwrite, createParent, replication,
        blockSize, progress, buffersize, bytesPerChecksum, forceSync,
        doParallelWrites, favoredNodes, new WriteOptions());
  }
  /**
   * Create a new output stream to the given DataNode for file creation:
   * registers the file on the namenode (choosing the richer create RPC when
   * the namenode supports it) and starts the streamer thread.
   * @see ClientProtocol#create(String, FsPermission, String, boolean, short, long)
   */
  DFSOutputStream(DFSClient dfsClient, String src, FsPermission masked,
      boolean overwrite, boolean createParent, short replication, long blockSize,
      Progressable progress,int buffersize, int bytesPerChecksum,
      boolean forceSync, boolean doParallelWrites,
      DatanodeInfo[] favoredNodes, WriteOptions options)
      throws IOException {
    this(dfsClient, src, blockSize, progress, bytesPerChecksum, replication,
        forceSync, doParallelWrites, favoredNodes, options);

    computePacketChunkSize(dfsClient.writePacketSize, bytesPerChecksum);

    try {
      // Prefer the create() variant that takes createParent when the
      // namenode supports it; otherwise fall back to the legacy signature.
      if (dfsClient.namenodeProtocolProxy != null &&
            dfsClient.namenodeProtocolProxy.isMethodSupported("create", String.class,
               FsPermission.class, String.class, boolean.class, boolean.class,
               short.class, long.class)) {
        dfsClient.namenode.create(src, masked, dfsClient.clientName, overwrite,
                        createParent, replication, blockSize);
      } else {
        dfsClient.namenode.create(src, masked, dfsClient.clientName, overwrite,
                        replication, blockSize);
      }
    } catch(RemoteException re) {
      dfsClient.incWriteExpCntToStats();

      // Unwrap the well-known failure types so callers can catch them
      // directly; any other RemoteException propagates as-is.
      throw re.unwrapRemoteException(AccessControlException.class,
                                     FileAlreadyExistsException.class,
                                     FileNotFoundException.class,
                                     NSQuotaExceededException.class,
                                     DSQuotaExceededException.class);
    }
    streamer.start();
  }

  /**
   * Create a new output stream for appending to an existing file, with an
   * explicit namespace id. If the file ends in a partial block, sets up the
   * pipeline (or runs block recovery on old protocol versions) so writing
   * resumes at the end of that block; the last partial checksum chunk, if
   * any, is filled first.
   */
  DFSOutputStream(DFSClient dfsClient, String src, int buffersize,
      Progressable progress, LocatedBlock lastBlock, FileStatus stat,
      int bytesPerChecksum, int namespaceId) throws IOException {
    this(dfsClient, src, stat.getBlockSize(), progress, bytesPerChecksum,
        stat.getReplication(), false, false, null,
        new WriteOptions());
    initialFileSize = stat.getLen(); // length of file when opened
    dfsClient.updateNamespaceIdIfNeeded(namespaceId);
    //
    // The last partial block of the file has to be filled.
    //
    if (lastBlock != null) {
      block = lastBlock.getBlock();
      long usedInLastBlock = stat.getLen() % blockSize;
      int freeInLastBlock = (int)(blockSize - usedInLastBlock);

      // calculate the amount of free space in the pre-existing
      // last crc chunk
      int usedInCksum = (int)(stat.getLen() % bytesPerChecksum);
      super.bytesSentInChunk = usedInCksum;
      int freeInCksum = bytesPerChecksum - usedInCksum;

      // if there is space in the last block, then we have to
      // append to that block
      // NOTE(review): freeInLastBlock is blockSize - (len % blockSize), which
      // can never exceed blockSize, so this guard looks unreachable; possibly
      // '==' was intended (last block exactly full) — confirm before changing.
      if (freeInLastBlock > blockSize) {
        throw new IOException("The last block for file " +
                              src + " is full.");
      }

      int dataProtocolVersion = dfsClient.getDataTransferProtocolVersion();
      // indicate that we are appending to an existing block
      if (dataProtocolVersion >= DataTransferProtocol.APPEND_BLOCK_VERSION) {
        bytesCurBlock = lastBlock.getBlock().getNumBytes();
      } else {
        bytesCurBlock = lastBlock.getBlockSize();
      }
     
      if (usedInCksum > 0 && freeInCksum > 0) {
        // if there is space in the last partial chunk, then
        // setup in such a way that the next packet will have only
        // one chunk that fills up the partial chunk.
        //
        computePacketChunkSize(0, freeInCksum);
        resetChecksumChunk();
        this.appendChunk = true;
      } else {
        // if the remaining space in the block is smaller than
        // that expected size of of a packet, then create
        // smaller size packet.
        //
        computePacketChunkSize(Math.min(dfsClient.writePacketSize, freeInLastBlock),
                               bytesPerChecksum);
      }

      // setup pipeline to append to the last block
      nodes = lastBlock.getLocations();
      errorIndex = -1;   // no errors yet.
      if (nodes.length < 1) {
        throw new IOException("Unable to retrieve blocks locations" +
                              " for append to last block " + block +
                              " of file " + src);

      }
     
      if (dataProtocolVersion < DataTransferProtocol.APPEND_BLOCK_VERSION) {
        // go through the block recovery process to setup the pipeline for append
        // (processDatanodeError returns true to request a retry after a sleep)
        while(processDatanodeError(true, true)) {
          try {
            Thread.sleep(1000);
          } catch (InterruptedException  e) {
            lastException = new IOException(e);
            break;
          }
        }
      } else {
        setupPipelineForAppend(lastBlock);
      }
      if (lastException != null) {
        throw lastException;
      }
    }
    else {
      computePacketChunkSize(dfsClient.writePacketSize, bytesPerChecksum);
    }
   
    // Position the stream at the start of the (possibly partial) last block.
    long blockOffset = stat.getLen();
    blockOffset -= blockOffset % blockSize;
    setOffsets(blockOffset);
    streamer.start();
  }
 
  /**
   * Setup the Append pipeline, the length of current pipeline will shrink
   * if any datanodes are dead during the process.
   *
   * On success, bumps the block's generation stamp on the namenode; on
   * failure, falls back to the legacy block-recovery loop.
   *
   * @param lastBlock the last (partial) block of the file being appended to;
   *        NOTE(review): unconditionally cast to LocatedBlockWithOldGS below,
   *        so callers are assumed to always pass that subtype — confirm.
   * @return true if the pipeline was set up directly, false if we fell back
   *         to block recovery
   */
  private boolean setupPipelineForAppend(LocatedBlock lastBlock) throws IOException {
    if (nodes == null || nodes.length == 0) {
      String msg = "Could not get block locations. " +
          "Source file \"" + src
          + "\" - Aborting...";
      DFSClient.LOG.warn(msg);
      setLastException(new IOException(msg));
      closed = true;
      if (streamer != null) streamer.close();
      return false;
    }
   
    boolean success = createBlockOutputStream(nodes, dfsClient.clientName, false, true);
    long oldGenerationStamp =
        ((LocatedBlockWithOldGS)lastBlock).getOldGenerationStamp();
   
    if (success) {
      // bump up the generation stamp in NN.
      Block newBlock = lastBlock.getBlock();
      Block oldBlock = new Block(newBlock.getBlockId(), newBlock.getNumBytes(),
          oldGenerationStamp);
      dfsClient.namenode.updatePipeline(dfsClient.clientName,
          oldBlock, newBlock, nodes);
    } else {
      DFSClient.LOG.warn("Fall back to block recovery process when trying" +
          " to setup the append pipeline for file " + src);
      // set the old generation stamp
      block.setGenerationStamp(oldGenerationStamp);
      // fall back the block recovery
      // (processDatanodeError returns true to request a retry after a sleep)
      while(processDatanodeError(true, true)) {
        try {
          Thread.sleep(1000);
        } catch (InterruptedException  e) {
          lastException = new IOException(e);
          break;
        }
      }
    }
    return success;
  }

  private void computePacketChunkSize(int psize, int csize) {
    int chunkSize = csize + checksum.getChecksumSize();
    int n = getPacketHeaderLen() + DFSClient.SIZE_OF_INTEGER;
    chunksPerPacket = Math.max((psize - n + chunkSize-1)/chunkSize, 1);
    packetSize = n + chunkSize*chunksPerPacket;
    if (DFSClient.LOG.isDebugEnabled()) {
      DFSClient.LOG.debug("computePacketChunkSize: src=" + src +
                ", chunkSize=" + chunkSize +
                ", chunksPerPacket=" + chunksPerPacket +
                ", packetSize=" + packetSize);
    }
  }

  /**
   * Open a DataOutputStream to a DataNode so that it can be written to.
   * This happens when a file is created and each time a new block is allocated.
   * Must get block ID and the IDs of the destinations from the namenode.
   * Returns the list of target datanodes.
   *
   * Retries allocation up to "dfs.client.block.write.retries" times,
   * excluding the datanode blamed for each failure; as a last resort it
   * retries with the failing node trimmed out of the current pipeline.
   */
  private DatanodeInfo[] nextBlockOutputStream(String client) throws IOException {
    LocatedBlock lb = null;
    boolean retry = false;
    DatanodeInfo[] nodes;
    // Datanodes blamed for previous failed attempts; the namenode is asked
    // to avoid them on subsequent allocations.
    ArrayList<DatanodeInfo> excludedNodes = new ArrayList<DatanodeInfo>();
    int count = dfsClient.conf.getInt("dfs.client.block.write.retries", 3);
    boolean success;
    do {
      // Reset per-attempt error state.
      hasError = false;
      lastException = null;
      errorIndex = 0;
      retry = false;
      nodes = null;
      success = false;

      long startTime = System.currentTimeMillis();

      DatanodeInfo[] excluded = excludedNodes.toArray(new DatanodeInfo[0]);
      lb = locateFollowingBlock(startTime, excluded.length > 0 ? excluded
          : null);
      block = lb.getBlock();
      nodes = lb.getLocations();

      //
      // Connect to first DataNode in the list.
      //
      success = createBlockOutputStream(nodes, dfsClient.clientName,
          false, false);

      if (!success) {
        // Give the block back to the namenode and blame the datanode that
        // createBlockOutputStream recorded in errorIndex.
        DFSClient.LOG.info("Abandoning block " + block + " for file " + src);
        dfsClient.namenode.abandonBlock(block, src, dfsClient.clientName);

        if (errorIndex < nodes.length) {
          DFSClient.LOG.debug("Excluding datanode " + nodes[errorIndex]);
          excludedNodes.add(nodes[errorIndex]);
        }
        // Connection failed.  Let's wait a little bit and retry
        retry = true;
      }
    } while (retry && --count >= 0);

    if (!success && nodes != null) {
      // in the last fail time, we will retry with the remaining nodes.
      // Repeatedly drop the node at errorIndex and retry with the shorter
      // pipeline until it succeeds or only one node would remain.
      while (nodes.length > 1 && !success) {
        if (errorIndex >= nodes.length) {
          break;
        }
       
        DatanodeInfo[] remainingNodes = new DatanodeInfo[nodes.length - 1];
        for (int i = 0; i < errorIndex; i++) {
          remainingNodes[i] = nodes[i];
        }
       
        for (int i = errorIndex + 1; i < nodes.length; i++) {
          remainingNodes[i - 1] = nodes[i];
        }
       
        nodes = remainingNodes;
        success = createBlockOutputStream(nodes, dfsClient.clientName,
            false, false);
      }
    }
   
    if (!success) {
      throw new IOException("Unable to create new block.");
    }
    return nodes;
  }

  // For pipelined writes, connects to the first datanode in the pipeline.
  // For parallel writes, connect to all specified datanodes.
  // Returns true if success, otherwise return failure.
  //
  // On failure, records the suspected bad datanode in errorIndex, sets
  // hasError/lastException, and closes any sockets opened so far.
  //
  private boolean createBlockOutputStream(DatanodeInfo[] nodes, String client,
                  boolean recoveryFlag, boolean appendFlag) {
    String firstBadLink = "";
    if (DFSClient.LOG.isDebugEnabled()) {
      for (int i = 0; i < nodes.length; i++) {
        DFSClient.LOG.debug("pipeline = " + nodes[i].getName());
      }
    }

    // persist blocks on namenode on next flush
    persistBlocks = true;
    boolean result = false;
    int curNode = 0;
    int length = 0;
    int pipelineDepth;
    if (doParallelWrites) {
      length = nodes.length; // connect to all datanodes
      pipelineDepth = 1;
    } else {
      length = 1; // connect to only the first datanode
      pipelineDepth = nodes.length;
    }
    DataOutputStream[] tmpOut = new DataOutputStream[length];
    DataInputStream[] replyIn = new DataInputStream[length];
    Socket[] sockets = new Socket[length];

    try {
      for (curNode = 0; curNode < length;  curNode++) {

        DFSClient.LOG.debug("Connecting to " + nodes[curNode].getName());
        InetSocketAddress target = NetUtils.createSocketAddr(nodes[curNode].getName());
        Socket s = dfsClient.socketFactory.createSocket();
        sockets[curNode] = s;
        // Timeouts are extended per downstream hop so deeper pipelines get
        // proportionally more time.
        dfsClient.timeoutValue = dfsClient.socketReadExtentionTimeout *
            pipelineDepth + dfsClient.socketTimeout;
        NetUtils.connect(s, target, dfsClient.timeoutValue, dfsClient.ipTosValue);
        s.setSoTimeout(dfsClient.timeoutValue);
        s.setSendBufferSize(DFSClient.DEFAULT_DATA_SOCKET_SIZE);
        DFSClient.LOG.debug("Send buf size " + s.getSendBufferSize());
        long writeTimeout = dfsClient.datanodeWriteExtentionTimeout *
                            pipelineDepth + dfsClient.datanodeWriteTimeout;

        //
        // Xmit header info to datanode (see DataXceiver.java)
        //
        DataOutputStream out = new DataOutputStream(
          new BufferedOutputStream(NetUtils.getOutputStream(s, writeTimeout),
                                   DataNode.SMALL_BUFFER_SIZE));
        tmpOut[curNode] = out;
        DataInputStream brs = new DataInputStream(NetUtils.getInputStream(s));
        replyIn[curNode] = brs;
       
        if (getProfileData() != null) {
          getProfileData().nextBlock();
        }

        int version = dfsClient.getDataTransferProtocolVersion();
        // write the header: a WriteBlockHeader for new/recovered blocks, an
        // AppendBlockHeader when appending to an existing block.
        if (!appendFlag) {
          WriteBlockHeader header = new WriteBlockHeader(version,
              dfsClient.namespaceId, block.getBlockId(), block.getGenerationStamp(),
              pipelineDepth, recoveryFlag, false, null, pipelineDepth - 1,
              nodes, client);
          header.getWritePipelineInfo().setWriteOptions(options);
          header.getWritePipelineInfo().getWriteOptions()
              .setIfProfileEnabled(profileData != null);
          header.writeVersionAndOpCode(out);
          header.write(out);
        } else {
          AppendBlockHeader header = new AppendBlockHeader(version,
              dfsClient.namespaceId, block.getBlockId(), block.getNumBytes(),
              block.getGenerationStamp(),
              pipelineDepth, false, null, pipelineDepth - 1,
              nodes, client);
          header.writeVersionAndOpCode(out);
          header.write(out);
        }
        checksum.writeHeader(out);
        out.flush();

        // receive ack for connect; a non-empty string names the first
        // datanode in the pipeline that failed to connect.
        firstBadLink = Text.readString(brs);
        if (firstBadLink.length() != 0) {
          throw new IOException("Bad connect ack with firstBadLink " +
                                firstBadLink);
        }
      }
      result = true;     // success
      blockStream = dfsClient.new MultiDataOutputStream(tmpOut);
      blockReplyStream = dfsClient.new MultiDataInputStream(replyIn);
      this.s = sockets;
     
      if (appendFlag) {
        // start the responseProcessor if the pipeline is successfully setup
        // for append only
        response = new ResponseProcessor(nodes);
        response.start();
      }
    } catch (IOException ie) {

      // NOTE(review): the exception is string-concatenated into the message
      // (with no separator after src), so the stack trace is lost; consider
      // LOG.info(msg, ie) instead — confirm before changing.
      DFSClient.LOG.info("Exception in createBlockOutputStream " +
          nodes[curNode].getName() + " " + " for file " + src + ie);

      dfsClient.incWriteExpCntToStats();
     
      // find the datanode that matches
      if (firstBadLink.length() != 0) {
        for (int i = 0; i < nodes.length; i++) {
          if (nodes[i].getName().equals(firstBadLink)) {
            errorIndex = i;
            break;
          }
        }
      } else {
        // if we are doing parallel writes, then record the datanode that is bad
        errorIndex = curNode;
      }
      hasError = true;
      setLastException(ie);
      blockReplyStream = null;
      result = false;
    } finally {
      // On any failure, close every socket opened so far to avoid leaks.
      if (!result) {
        for (int i = 0; i < sockets.length; i++) {
          IOUtils.closeSocket(sockets[i]);
        }
        this.s = null;
      }
    }
   
    return result;
  }

  /**
   * Asks the namenode to allocate the next block of the file, probing for
   * the richest addBlock RPC variant the namenode supports and falling back
   * to progressively older signatures. Retries with exponential backoff
   * while the previous block has not yet reached minimal replication
   * (NotReplicatedYetException).
   *
   * @param start time (ms) the caller began this allocation;
   *        NOTE(review): appears unused in this body — confirm.
   * @param excludedNodes datanodes the namenode should avoid, or null
   * @return the located block for the newly allocated block
   * @throws IOException if allocation fails or retries are exhausted
   */
  private LocatedBlock locateFollowingBlock(long start,
                                            DatanodeInfo[] excludedNodes
                                            ) throws IOException {
    int retries = dfsClient.conf.getInt(
        "dfs.client.block.write.locateFollowingBlock.retries", 5);
   
    long sleeptime = 400;
    while (true) {
      long localstart = System.currentTimeMillis();
      while (true) {
        try {
          VersionedLocatedBlock loc = null;
          // Fallback chain, newest RPC signature first. The two oldest
          // variants return directly since they carry no version metadata.
          if (dfsClient.namenodeProtocolProxy != null
              && dfsClient.namenodeProtocolProxy.isMethodSupported(
                  "addBlockAndFetchMetaInfo", String.class, String.class,
                  DatanodeInfo[].class, DatanodeInfo[].class, long.class,
                  Block.class)) {
           loc = dfsClient.namenode.addBlockAndFetchMetaInfo(src,
               dfsClient.clientName, excludedNodes, favoredNodes,
               this.lastBlkOffset, getLastBlock());
          } else if (dfsClient.namenodeProtocolProxy != null
              && dfsClient.namenodeProtocolProxy.isMethodSupported(
                  "addBlockAndFetchMetaInfo", String.class, String.class,
                  DatanodeInfo[].class, DatanodeInfo[].class, long.class)) {
            loc = dfsClient.namenode.addBlockAndFetchMetaInfo(src,
                dfsClient.clientName, excludedNodes, favoredNodes, this.lastBlkOffset);
          } else if (dfsClient.namenodeProtocolProxy != null
              && dfsClient.namenodeProtocolProxy.isMethodSupported(
                  "addBlockAndFetchMetaInfo", String.class, String.class,
                  DatanodeInfo[].class, long.class)) {
            loc = dfsClient.namenode.addBlockAndFetchMetaInfo(src,
                dfsClient.clientName, excludedNodes, this.lastBlkOffset);
          } else if (dfsClient.namenodeProtocolProxy != null
              && dfsClient.namenodeProtocolProxy.isMethodSupported(
                  "addBlockAndFetchMetaInfo", String.class, String.class,
                  DatanodeInfo[].class)) {
            loc = dfsClient.namenode.addBlockAndFetchMetaInfo(src,
                dfsClient.clientName, excludedNodes);
          } else if (dfsClient.namenodeProtocolProxy != null
              && dfsClient.namenodeProtocolProxy.isMethodSupported(
                  "addBlockAndFetchVersion", String.class, String.class,
                  DatanodeInfo[].class)) {
            loc = dfsClient.namenode.addBlockAndFetchVersion(src,
                dfsClient.clientName, excludedNodes);
          } else if (dfsClient.namenodeProtocolProxy != null
              && dfsClient.namenodeProtocolProxy.isMethodSupported("addBlock",
                  String.class, String.class, DatanodeInfo[].class)) {
            return dfsClient.namenode.addBlock(src, dfsClient.clientName,
                excludedNodes);
          } else {
            return dfsClient.namenode.addBlock(src, dfsClient.clientName);
          }
          // Propagate protocol/namespace metadata piggybacked on the reply.
          dfsClient.updateDataTransferProtocolVersionIfNeeded(
              loc.getDataProtocolVersion());
          if (loc instanceof LocatedBlockWithMetaInfo) {
            LocatedBlockWithMetaInfo metaLoc = (LocatedBlockWithMetaInfo)loc;
            dfsClient.updateNamespaceIdIfNeeded(metaLoc.getNamespaceID());
            dfsClient.getNewNameNodeIfNeeded(metaLoc.getMethodFingerPrint());
          }
          return loc;
        } catch (RemoteException e) {
          IOException ue =
            e.unwrapRemoteException(FileNotFoundException.class,
                                    AccessControlException.class,
                                    NSQuotaExceededException.class,
                                    DSQuotaExceededException.class);
          if (ue != e) {
            throw ue; // no need to retry these exceptions
          }

          if (NotReplicatedYetException.class.getName().
              equals(e.getClassName())) {

              if (retries == 0) {
                throw e;
              } else {
                --retries;
                DFSClient.LOG.info(StringUtils.stringifyException(e));
                if (System.currentTimeMillis() - localstart > 5000) {
                  DFSClient.LOG.info("Waiting for replication for "
                      + (System.currentTimeMillis() - localstart) / 1000
                      + " seconds");
                }
                try {
                  DFSClient.LOG.warn("NotReplicatedYetException sleeping " + src
                      + " retries left " + retries);
                  Thread.sleep(sleeptime);
                  // exponential backoff between retries
                  sleeptime *= 2;
                } catch (InterruptedException ie) {
                  // NOTE(review): interrupt status is dropped here; consider
                  // Thread.currentThread().interrupt() — confirm.
                }
              }
          } else {
            throw e;
          }
        }
      }
    }
  }

  /**
   * Records a completed client write of {@code len} bytes in the
   * client-side metrics: one write operation plus its byte count.
   */
  @Override
  protected void incMetrics(int len){
    dfsClient.metrics.incWriteOps();
    dfsClient.metrics.incWriteSize(len);
  }
  /**
   * Writes one checksummed chunk of user data into the current outgoing
   * packet, allocating a new packet when none is open and enqueueing the
   * packet for transmission once it is full or the block boundary is hit.
   *
   * @param b        buffer holding the chunk's data
   * @param offset   offset of the chunk within {@code b}
   * @param len      chunk length in bytes; must not exceed bytesPerChecksum
   * @param checksum checksum bytes for this chunk; length must equal the
   *                 configured checksum size
   * @throws IOException if the stream is closed or the size invariants
   *                     above are violated
   * @see FSOutputSummer#writeChunk()
   */
  @Override
  protected synchronized void writeChunk(byte[] b, int offset, int len, byte[] checksum)
                                                        throws IOException {
    dfsClient.checkOpen();
    isClosed();


    int cklen = checksum.length;
    int bytesPerChecksum = this.checksum.getBytesPerChecksum();
    if (len > bytesPerChecksum) {
      throw new IOException("writeChunk() buffer size is " + len +
                            " is larger than supported  bytesPerChecksum " +
                            bytesPerChecksum);
    }
    if (checksum.length != this.checksum.getChecksumSize()) {
      throw new IOException("writeChunk() checksum size is supposed to be " +
                            this.checksum.getChecksumSize() +
                            " but found to be " + checksum.length);
    }

    eventStartEnqueuePacket();
   
    synchronized (dataQueue) {

      // If queue is full, then wait till we can create  enough space.
      // wait() uses a timeout so checkIfLastPacketTimeout() can run
      // periodically even when no space frees up.
      while (!closed && dataQueue.size() + ackQueue.size()  > maxPackets) {
        try {
          dataQueue.wait(packetTimeout);
          checkIfLastPacketTimeout();
        } catch (InterruptedException  e) {
        }
      }
      isClosed();
     
      // Lazily open a new packet for this chunk if none is in progress.
      if (currentPacket == null) {
        WritePacketClientProfile pktProfile = null;
        if (getProfileData() != null) {
          pktProfile = getProfileData().getWritePacketClientProfile();
        }

        currentPacket = DFSOutputStreamPacketFactory.getPacket(
            DFSOutputStream.this, ifPacketIncludeVersion(),
            getPacketVersion(), packetSize, chunksPerPacket, bytesCurBlock, pktProfile);

        if (DFSClient.LOG.isDebugEnabled()) {
          DFSClient.LOG.debug("DFSClient writeChunk allocating new packet seqno=" +
                    currentPacket.seqno +
                    ", src=" + src +
                    ", packetSize=" + packetSize +
                    ", chunksPerPacket=" + chunksPerPacket +
                    ", bytesCurBlock=" + bytesCurBlock +
                    ", forceSync=" + forceSync +
                    ", doParallelWrites=" + doParallelWrites +
                    ", len=" + len +
                    ", blocksize=" + blockSize);
        }
      }

      // The on-wire layout depends on the negotiated packet version:
      // either all checksums precede the data, or they are inlined after it.
      if (packetVersion == DataTransferProtocol.PACKET_VERSION_CHECKSUM_FIRST) {
        currentPacket.writeChecksum(checksum, 0, cklen);
        currentPacket.writeData(b, offset, len);
      } else {
        // packetVersion == DataTransferProtocol.PACKET_VERSION_CHECKSUM_INLINE
        currentPacket.writeData(b, offset, len);
        currentPacket.writeChecksum(checksum, 0, cklen);
      }
      currentPacket.numChunks++;
      bytesCurBlock += len;

      // If packet is full, enqueue it for transmission
      if (currentPacket.numChunks == currentPacket.maxChunks ||
          bytesCurBlock == blockSize) {
        if (DFSClient.LOG.isDebugEnabled()) {
          DFSClient.LOG.debug("DFSClient writeChunk packet full seqno=" +
                    currentPacket.seqno +
                    ", src=" + src +
                    ", bytesCurBlock=" + bytesCurBlock +
                    ", blockSize=" + blockSize +
                    ", appendChunk=" + appendChunk);
        }
        //
        // if we allocated a new packet because we encountered a block
        // boundary, reset bytesCurBlock.
        //
        if (bytesCurBlock == blockSize) {
          currentPacket.lastPacketInBlock = true;
          bytesCurBlock = 0;
          lastFlushOffset = 0;
        }
        enqueueCurrentPacket();
       
        eventEndEnquePacket();

        // If this was the first write after reopening a file, then the above
        // write filled up any partial chunk. Tell the summer to generate full
        // crc chunks from now on.
        if (appendChunk) {
          appendChunk = false;
          resetChecksumChunk();
        }
        // Size the next packet for whatever remains in this block, capped
        // at the configured write packet size.
        int psize = Math.min((int)(blockSize-bytesCurBlock),
            dfsClient.writePacketSize);
        computePacketChunkSize(psize, bytesPerChecksum);
      }
    }
   

    //LOG.debug("DFSClient writeChunk done length " + len +
    //          " checksum length " + cklen);
  }

  private synchronized void enqueueCurrentPacket() {
    synchronized (dataQueue) {
      if (currentPacket == null) return;
      dataQueue.addLast(currentPacket);
      currentPacket.eventAddToDataQueue();
      dataQueue.notifyAll();
      lastQueuedSeqno = currentPacket.seqno;
      currentPacket = null;
    }
  }

  /**
   * All data is written out to datanodes. It is not guaranteed
   * that data has been flushed to persistent store on the
   * datanode. Block allocations are persisted on namenode.
   *
   * On any IOException the stream is marked closed, its worker threads are
   * shut down, and the exception is rethrown.
   */
  public void sync() throws IOException {
    long start = System.currentTimeMillis();
    try {
      long toWaitFor;
      synchronized (this) {
        eventStartSync();
        /* Record current blockOffset. This might be changed inside
         * flushBuffer() where a partial checksum chunk might be flushed.
         * After the flush, reset the bytesCurBlock back to its previous value,
         * any partial checksum chunk will be sent now and in next packet.
         */
        long saveOffset = bytesCurBlock;
        DFSOutputStreamPacket oldCurrentPacket = currentPacket;

        // flush checksum buffer as an incomplete chunk
        flushBuffer(false, shouldKeepPartialChunkData());
        // bytesCurBlock potentially incremented if there was buffered data
       
        eventSyncStartWaitAck();
       
        if (DFSClient.LOG.isDebugEnabled()) {
          DFSClient.LOG.debug("DFSClient flush() : bytesCurBlock " + bytesCurBlock +
                    " lastFlushOffset " + lastFlushOffset);
        }

        // Flush only if we haven't already flushed till this offset.
        if (lastFlushOffset != bytesCurBlock) {
          assert bytesCurBlock > lastFlushOffset;
          // record the valid offset of this flush
          lastFlushOffset = bytesCurBlock;
          enqueueCurrentPacket();
        } else {
          // just discard the current packet since it is already been sent.
          if (oldCurrentPacket == null && currentPacket != null) {
            // If we didn't previously have a packet queued, and now we do,
            // but we don't plan on sending it, then we should not
            // skip a sequence number for it!
            currentSeqno--;
          }
          currentPacket = null;
        }

        if (shouldKeepPartialChunkData()) {
          // Restore state of stream. Record the last flush offset
          // of the last full chunk that was flushed.
          //
          bytesCurBlock = saveOffset;
        }
        toWaitFor = lastQueuedSeqno;
      }

      // Wait outside the stream lock so the streamer can make progress.
      waitForAckedSeqno(toWaitFor);
     
      eventSyncPktAcked();

      // If any new blocks were allocated since the last flush,
      // then persist block locations on namenode.
      //
      boolean willPersist;
      synchronized (this) {
        willPersist = persistBlocks;
        persistBlocks = false;
      }
      if (willPersist) {
        dfsClient.namenode.fsync(src, dfsClient.clientName);
      }
      long timeval = System.currentTimeMillis() - start;
      dfsClient.metrics.incSyncTime(timeval);

      eventEndSync();

    } catch (IOException e) {
        lastException = new IOException("IOException flush:", e);
        closed = true;
        closeThreads();
        throw e;
    }
  }
 
  private Block getLastBlock() {
    return this.block;
  }

  /**
   * Returns the number of replicas of current block. This can be different
   * from the designated replication factor of the file because the NameNode
   * does not replicate the block to which a client is currently writing to.
   * The client continues to write to a block even if a few datanodes in the
   * write pipeline have failed. If the current block is full and the next
   * block is not yet allocated, then this API will return 0 because there are
   * no replicas in the pipeline.
   */
  public int getNumCurrentReplicas() throws IOException {
    synchronized(dataQueue) {
      if (nodes == null) {
        return blockReplication;
      }
      return nodes.length;
    }
  }
 
  public DFSWriteProfilingData getProfileData() {
    return (DFSWriteProfilingData) profileData;
  }

  /**
   * Waits till all existing data is flushed and confirmations
   * received from datanodes.
   */
  private void flushInternal() throws IOException {
    isClosed();
    dfsClient.checkOpen();

    long toWaitFor;
    synchronized (this) {
      enqueueCurrentPacket();
      toWaitFor = lastQueuedSeqno;
    }

    waitForAckedSeqno(toWaitFor);
  }

  /**
   * Blocks until the datanodes have acknowledged every packet up to and
   * including {@code seqnumToWaitFor}, or until the stream is closed.
   * If interrupted while waiting, finishes the wait first and then
   * re-asserts the thread's interrupt status.
   *
   * @throws IOException if the stream was closed with a pending exception
   */
  private void waitForAckedSeqno(long seqnumToWaitFor) throws IOException {
    boolean interrupted = false;

    synchronized (ackQueue) {
      while (!closed) {
        isClosed();
        if (lastAckedSeqno >= seqnumToWaitFor) {
          break;
        }
        try {
          // Woken by the response processor whenever an ack arrives.
          ackQueue.wait();
        } catch (InterruptedException ie) {
          interrupted = true;
        }
      }
    }

    if (interrupted) {
      // Restore the interrupt that was deferred during the wait.
      Thread.currentThread().interrupt();
    }
    isClosed();
  }

  /**
   * Closes this output stream and releases any system
   * resources associated with this stream.
   */
  @Override
  public void close() throws IOException {
    try {
      if (closed) {
        IOException e = lastException;
        if (e == null)
          return;
        else
          throw e;
      }

      try {
        closeInternal();

        if (s != null) {
          for (int i = 0; i < s.length; i++) {
            s[i].close();
          }
          s = null;
        }
      } catch (IOException e) {
        lastException = e;
        throw e;
      }
      if (profileData != null && profileData.isAutoPrintWhileClose()) {
        DFSClient.LOG.info("Write Profile for " + this.src + ":"
            + profileData.toString());
      }
    } finally {
      // We always try to remove the connection from the lease to
      // avoid memory leak. In case of failed close(), it is possible
      // that later users' retry of close() could succeed but fail on
      // lease expiration. Since clients don't possibly write more data
      // after calling close(), this case doesn't change any guarantee
      // of data itself.
      dfsClient.leasechecker.remove(src);
    }
  }

  /**
   * Harsh abort method that should only be used from tests - this
   * is in order to prevent pipeline recovery when eg a DN shuts down.
   * Closes the streamer then the response thread without flushing,
   * and marks the stream closed.
   */
  void abortForTests() throws IOException {
    if (streamer != null) {
      streamer.close();
    }
    if (response != null) {
      response.close();
    }
    closed = true;
  }

  /**
   * Aborts this output stream and releases any system
   * resources associated with this stream.
   */
  synchronized void abort() throws IOException {
    if (closed) {
      return;
    }
    setLastException(new IOException("Lease timeout of " +
                                     (dfsClient.hdfsTimeout/1000) + " seconds expired."));
    closeThreads();
  }


  // shutdown datastreamer and responseprocessor threads.
  private void closeThreads() throws IOException {
    try {
      if (streamer != null) {
        streamer.close();
        streamer.join();
      }

      // shutdown response after streamer has exited.
      if (response != null) {
        response.close();
        response.join();
        response = null;
      }
    } catch (InterruptedException e) {
      throw new InterruptedIOException("Failed to shutdown response thread");
    }
  }

  /**
   * Closes this output stream and releases any system
   * resources associated with this stream: flushes all buffered data,
   * marks the final packet as last-in-block, waits for acks, stops the
   * worker threads, closes the datanode streams/sockets, and tells the
   * namenode to complete the file. Always leaves {@code closed == true}.
   */
  private synchronized void closeInternal() throws IOException {
    dfsClient.checkOpen();
    isClosed();

    try {
        eventStartWrite();

        flushBuffer(true, false);       // flush from all upper layers

        eventCloseAfterFlushBuffer();
       
        // Mark that this packet is the last packet in block.
        // If there are no outstanding packets and the last packet
        // was not the last one in the current block, then create a
        // packet with empty payload.
        synchronized (dataQueue) {
          if (currentPacket == null && bytesCurBlock != 0) {
            WritePacketClientProfile pktProfile = null;
            if (getProfileData() != null) {
              pktProfile = getProfileData().getWritePacketClientProfile();
            }
            currentPacket = DFSOutputStreamPacketFactory.getPacket(
              DFSOutputStream.this, ifPacketIncludeVersion(),
              getPacketVersion(), packetSize, chunksPerPacket, bytesCurBlock, pktProfile);
          }
          if (currentPacket != null) {
            currentPacket.lastPacketInBlock = true;
          }
        }
               
      flushInternal();             // flush all data to Datanodes
      isClosed(); // check to see if flushInternal had any exceptions
      closed = true; // allow closeThreads() to shutdown threads

      closeThreads();

      synchronized (dataQueue) {
        if (blockStream != null) {
          blockStream.writeInt(0); // indicate end-of-block to datanode
          blockStream.close();
          blockReplyStream.close();
        }
        if (s != null) {
          for (int i = 0; i < s.length; i++) {
            s[i].close();
          }
          s = null;
        }
      }

      streamer = null;
      blockStream = null;
      blockReplyStream = null;

      eventCloseReceivedAck();
     
      // Tell the namenode the file is complete.
      dfsClient.closeFile(src, lastBlkOffset, getLastBlock());
     
      eventEndClose();
    } finally {
      closed = true;
    }
  }

  void setArtificialSlowdown(long period) {
    artificialSlowdown = period;
  }

  synchronized void setChunksPerPacket(int value) {
    chunksPerPacket = Math.min(chunksPerPacket, value);
    packetSize = getPacketHeaderLen() + DFSClient.SIZE_OF_INTEGER +
             (checksum.getBytesPerChecksum() +
              checksum.getChecksumSize()) * chunksPerPacket;
  }

  synchronized void setTestFilename(String newname) {
    src = newname;
  }

  /**
   * Returns the size of a file as it was when this stream was opened
   */
  long getInitialLen() {
    return initialFileSize;
  }
 
  private void eventStartEnqueuePacket() {
    if (getProfileData() != null) {
      getProfileData().startEnqueuePacket();
    }
  }

  private void eventEndEnquePacket() {
    if (getProfileData() != null) {
      getProfileData().endEnquePacket();
    }
  }

  private void eventStartSync() {
    if (getProfileData() != null) {
      getProfileData().startSync();
    }
  }

  private void eventSyncStartWaitAck() {
    if (getProfileData() != null) {
      getProfileData().syncStartWaitAck();
    }
  }

  private void eventSyncPktAcked() {
    if (getProfileData() != null) {
      getProfileData().syncPktAcked();
    }
  }

  private void eventEndSync() {
    if (getProfileData() != null) {
      getProfileData().endSync();
    }
  }

  private void eventCloseAfterFlushBuffer() {
    if (getProfileData() != null) {
      getProfileData().closeAfterFlushBuffer();
    }
  }

  private void eventCloseReceivedAck() {
    if (getProfileData() != null) {
      getProfileData().closeReceivedAck();
    }
  }

  private void eventEndClose() {
    if (getProfileData() != null) {
      getProfileData().endClose();
    }
  }

  public void eventStartReceiveAck() {
    if (getProfileData() != null) {
      getProfileData().startReceiveAck();
    }
  }

  int getPacketHeaderLen() {
    return DataNode.getPacketHeaderLen(ifPacketIncludeVersion());
  }
 
  // NOTE(review): despite the name, this is get-then-increment: it returns
  // the sequence number *before* bumping it (like getAndIncrement). Callers
  // appear to depend on the pre-increment value, so do not "fix" this to
  // ++currentSeqno without auditing every call site.
  long incAndGetCurrentSeqno() {
    return currentSeqno++;
  }
 
  int getPacketVersion() {
    return packetVersion;
  }

  boolean ifPacketIncludeVersion() {
    return pktIncludeVersion;
  }
 
  boolean ifForceSync() {
    return forceSync;
  }
 
  int getBytesPerChecksum() {
    return checksum.getBytesPerChecksum();
  }
 
  int getChecksumSize() {
    return checksum.getChecksumSize();
  }

  @Override
  protected boolean shouldKeepPartialChunkData() throws IOException {
    return this.dfsClient.getDataTransferProtocolVersion() <
        DataTransferProtocol.NOT_RESEND_PARTIAL_CHUNK_VERSION;
  }
}
TOP

Related Classes of org.apache.hadoop.hdfs.DFSOutputStream

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.