Source Code of org.apache.hadoop.hdfs.DFSOutputStream$DataStreamer

/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs;

import static org.apache.hadoop.hdfs.protocol.DataTransferProtocol.Status.ERROR_ACCESS_TOKEN;
import static org.apache.hadoop.hdfs.protocol.DataTransferProtocol.Status.SUCCESS;

import java.io.BufferedOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.InetSocketAddress;
import java.net.Socket;
import java.nio.BufferOverflowException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.EnumSet;
import java.util.LinkedList;
import java.util.concurrent.atomic.AtomicBoolean;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.CreateFlag;
import org.apache.hadoop.fs.FSOutputSummer;
import org.apache.hadoop.fs.FileAlreadyExistsException;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Syncable;
import org.apache.hadoop.fs.UnresolvedLinkException;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.ClientProtocol;
import org.apache.hadoop.hdfs.protocol.DSQuotaExceededException;
import org.apache.hadoop.hdfs.protocol.DataTransferProtocol;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.HdfsFileStatus;
import org.apache.hadoop.hdfs.protocol.LocatedBlock;
import org.apache.hadoop.hdfs.protocol.NSQuotaExceededException;
import org.apache.hadoop.hdfs.protocol.UnresolvedPathException;
import org.apache.hadoop.hdfs.protocol.DataTransferProtocol.BlockConstructionStage;
import org.apache.hadoop.hdfs.protocol.DataTransferProtocol.PipelineAck;
import org.apache.hadoop.hdfs.security.BlockAccessToken;
import org.apache.hadoop.hdfs.security.InvalidAccessTokenException;
import org.apache.hadoop.hdfs.server.common.HdfsConstants;
import org.apache.hadoop.hdfs.server.datanode.DataNode;
import org.apache.hadoop.hdfs.server.namenode.NotReplicatedYetException;
import org.apache.hadoop.io.EnumSetWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.ipc.RemoteException;
import org.apache.hadoop.net.NetUtils;
import org.apache.hadoop.security.AccessControlException;
import org.apache.hadoop.util.Daemon;
import org.apache.hadoop.util.DataChecksum;
import org.apache.hadoop.util.Progressable;
import org.apache.hadoop.util.PureJavaCrc32;
import org.apache.hadoop.util.StringUtils;

/****************************************************************
* DFSOutputStream creates files from a stream of bytes.
*
* The client application writes data that is cached internally by
* this stream. Data is broken up into packets, each packet is
 * typically 64K in size. A packet is made up of chunks. Each chunk
 * is typically 512 bytes and has an associated checksum.
*
* When a client application fills up the currentPacket, it is
* enqueued into dataQueue.  The DataStreamer thread picks up
* packets from the dataQueue, sends it to the first datanode in
* the pipeline and moves it from the dataQueue to the ackQueue.
 * The ResponseProcessor receives acks from the datanodes. When a
 * successful ack for a packet is received from all datanodes, the
* ResponseProcessor removes the corresponding packet from the
* ackQueue.
*
 * In case of error, all outstanding packets are moved from the
 * ackQueue back to the dataQueue. A new pipeline is set up by
 * eliminating the bad datanode from the original pipeline. The
 * DataStreamer then resumes sending packets from the dataQueue.
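 *
 * A typical client does not use this class directly; it obtains the stream
 * through the FileSystem API. A minimal, hypothetical sketch (path and
 * variable names are illustrative only):
 *
 * <pre>
 *   Configuration conf = new Configuration();
 *   FileSystem fs = FileSystem.get(conf);
 *   FSDataOutputStream out = fs.create(new Path("/tmp/example.dat"));
 *   out.write(data);   // buffered into chunks/packets by this class
 *   out.hflush();      // durability call; older releases name it sync()
 *   out.close();       // queues the last packet, waits for acks, completes the file
 * </pre>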
****************************************************************/
class DFSOutputStream extends FSOutputSummer implements Syncable {
  /**
   *
   */
  private final DFSClient dfsClient;
  private Configuration conf;
  private static final int MAX_PACKETS = 80; // each packet 64K, total 5MB
  private Socket s;
  // closed is accessed by different threads under different locks.
  private volatile boolean closed = false;

  private String src;
  private final long blockSize;
  private final DataChecksum checksum;
  // both dataQueue and ackQueue are protected by dataQueue lock
  private final LinkedList<Packet> dataQueue = new LinkedList<Packet>();
  private final LinkedList<Packet> ackQueue = new LinkedList<Packet>();
  private Packet currentPacket = null;
  private DataStreamer streamer;
  private long currentSeqno = 0;
  private long bytesCurBlock = 0; // bytes written in current block
  private int packetSize = 0; // write packet size, including the header.
  private int chunksPerPacket = 0;
  private volatile IOException lastException = null;
  private long artificialSlowdown = 0;
  private long lastFlushOffset = -1; // offset when flush was invoked
  //persist blocks on namenode
  private final AtomicBoolean persistBlocks = new AtomicBoolean(false);
  private volatile boolean appendChunk = false;   // appending to existing partial block
  private long initialFileSize = 0; // at time of file open
  private Progressable progress;
  private short blockReplication; // replication factor of file
 
  private class Packet {
    ByteBuffer buffer;           // only one of buf and buffer is non-null
    byte[]  buf; 
    long    seqno;               // sequence number of buffer in block
    long    offsetInBlock;       // offset in block
    boolean lastPacketInBlock;   // is this the last packet in block?
    int     numChunks;           // number of chunks currently in packet
    int     maxChunks;           // max chunks in packet
    int     dataStart;
    int     dataPos;
    int     checksumStart;
    int     checksumPos;     
    private static final long HEART_BEAT_SEQNO = -1L;

    /**
     *  create a heartbeat packet
     */
    Packet() {
      this.lastPacketInBlock = false;
      this.numChunks = 0;
      this.offsetInBlock = 0;
      this.seqno = HEART_BEAT_SEQNO;
     
      buffer = null;
      int packetSize = DataNode.PKT_HEADER_LEN + DFSClient.SIZE_OF_INTEGER;
      buf = new byte[packetSize];
     
      checksumStart = dataStart = packetSize;
      checksumPos = checksumStart;
      dataPos = dataStart;
      maxChunks = 0;
    }
   
    // create a new packet
    Packet(int pktSize, int chunksPerPkt, long offsetInBlock) {
      this.lastPacketInBlock = false;
      this.numChunks = 0;
      this.offsetInBlock = offsetInBlock;
      this.seqno = currentSeqno;
      currentSeqno++;
     
      buffer = null;
      buf = new byte[pktSize];
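      // Rough layout of buf (a sketch; the exact header fields are written
      // out in getBuffer()):
      //
      //   [ packet header | checksums ...... | data .............. ]
      //   0               checksumStart      dataStart        pktSize
      //
      // For the last, partially filled packet a gap may remain between
      // checksumPos and dataStart; getBuffer() slides the checksums forward
      // to close that gap before the packet is sent.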
     
      checksumStart = DataNode.PKT_HEADER_LEN + DFSClient.SIZE_OF_INTEGER;
      checksumPos = checksumStart;
      dataStart = checksumStart + chunksPerPkt * checksum.getChecksumSize();
      dataPos = dataStart;
      maxChunks = chunksPerPkt;
    }

    void writeData(byte[] inarray, int off, int len) {
      if ( dataPos + len > buf.length) {
        throw new BufferOverflowException();
      }
      System.arraycopy(inarray, off, buf, dataPos, len);
      dataPos += len;
    }

    void  writeChecksum(byte[] inarray, int off, int len) {
      if (checksumPos + len > dataStart) {
        throw new BufferOverflowException();
      }
      System.arraycopy(inarray, off, buf, checksumPos, len);
      checksumPos += len;
    }
   
    /**
     * Returns ByteBuffer that contains one full packet, including header.
     */
    ByteBuffer getBuffer() {
      /* Once this is called, no more data can be added to the packet.
       * setting 'buf' to null ensures that.
       * This is called only when the packet is ready to be sent.
       */
      if (buffer != null) {
        return buffer;
      }
     
      //prepare the header and close any gap between checksum and data.
     
      int dataLen = dataPos - dataStart;
      int checksumLen = checksumPos - checksumStart;
     
      if (checksumPos != dataStart) {
        /* move the checksum to cover the gap.
         * This can happen for the last packet.
         */
        System.arraycopy(buf, checksumStart, buf,
                         dataStart - checksumLen , checksumLen);
      }
     
      int pktLen = DFSClient.SIZE_OF_INTEGER + dataLen + checksumLen;
     
      //normally dataStart == checksumPos, i.e., offset is zero.
      buffer = ByteBuffer.wrap(buf, dataStart - checksumPos,
                               DataNode.PKT_HEADER_LEN + pktLen);
      buf = null;
      buffer.mark();
     
      /* write the header and data length.
       * The format is described in comment before DataNode.BlockSender
       */
      buffer.putInt(pktLen);  // pktSize
      buffer.putLong(offsetInBlock);
      buffer.putLong(seqno);
      buffer.put((byte) ((lastPacketInBlock) ? 1 : 0));
      //end of pkt header
      buffer.putInt(dataLen); // actual data length, excluding checksum.
     
      buffer.reset();
      return buffer;
    }
   
    // get the packet's last byte's offset in the block
    long getLastByteOffsetBlock() {
      return offsetInBlock + dataPos - dataStart;
    }
   
    /**
     * Check if this packet is a heart beat packet
     * @return true if the sequence number is HEART_BEAT_SEQNO
     */
    private boolean isHeartbeatPacket() {
      return seqno == HEART_BEAT_SEQNO;
    }
   
    public String toString() {
      return "packet seqno:" + this.seqno +
      " offsetInBlock:" + this.offsetInBlock +
      " lastPacketInBlock:" + this.lastPacketInBlock +
      " lastByteOffsetInBlock: " + this.getLastByteOffsetBlock();
    }
  }

  //
  // The DataStreamer class is responsible for sending data packets to the
  // datanodes in the pipeline. It retrieves a new blockid and block locations
  // from the namenode, and starts streaming packets to the pipeline of
  // Datanodes. Every packet has a sequence number associated with
  // it. When all the packets for a block are sent out and acks for each
  // of them are received, the DataStreamer closes the current block.
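  //
  // The streamer moves through BlockConstructionStage roughly as follows
  // (a summary of the code below, not an exhaustive state machine):
  // PIPELINE_SETUP_CREATE (or PIPELINE_SETUP_APPEND) -> DATA_STREAMING while
  // packets are in flight -> PIPELINE_CLOSE once the last packet of a block
  // is sent, after which the stage resets to PIPELINE_SETUP_CREATE for the
  // next block.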
  //
  class DataStreamer extends Daemon {
    private volatile boolean streamerClosed = false;
    private Block block; // its length is number of bytes acked
    private BlockAccessToken accessToken;
    private DataOutputStream blockStream;
    private DataInputStream blockReplyStream;
    private ResponseProcessor response = null;
    private volatile DatanodeInfo[] nodes = null; // list of targets for current block
    private ArrayList<DatanodeInfo> excludedNodes = new ArrayList<DatanodeInfo>();
    volatile boolean hasError = false;
    volatile int errorIndex = -1;
    private BlockConstructionStage stage;  // block construction stage
    private long bytesSent = 0; // number of bytes that've been sent

    /**
     * Default construction for file create
     */
    private DataStreamer() {
      stage = BlockConstructionStage.PIPELINE_SETUP_CREATE;
    }
   
    /**
     * Construct a data streamer for append
     * @param lastBlock last block of the file to be appended
     * @param stat status of the file to be appended
     * @param bytesPerChecksum number of bytes per checksum
     * @throws IOException if error occurs
     */
    private DataStreamer(LocatedBlock lastBlock, HdfsFileStatus stat,
        int bytesPerChecksum) throws IOException {
      stage = BlockConstructionStage.PIPELINE_SETUP_APPEND;
      block = lastBlock.getBlock();
      bytesSent = block.getNumBytes();
      accessToken = lastBlock.getAccessToken();
      long usedInLastBlock = stat.getLen() % blockSize;
      int freeInLastBlock = (int)(blockSize - usedInLastBlock);

      // calculate the amount of free space in the pre-existing
      // last crc chunk
      int usedInCksum = (int)(stat.getLen() % bytesPerChecksum);
      int freeInCksum = bytesPerChecksum - usedInCksum;

      // if there is space in the last block, then we have to
      // append to that block
      if (freeInLastBlock == blockSize) {
        throw new IOException("The last block for file " +
            src + " is full.");
      }

      if (usedInCksum > 0 && freeInCksum > 0) {
        // if there is space in the last partial chunk, then
        // setup in such a way that the next packet will have only
        // one chunk that fills up the partial chunk.
        //
        computePacketChunkSize(0, freeInCksum);
        resetChecksumChunk(freeInCksum);
        appendChunk = true;
      } else {
        // if the remaining space in the block is smaller than
        // the expected size of a packet, then create
        // a smaller-sized packet.
        //
        computePacketChunkSize(Math.min(dfsClient.writePacketSize, freeInLastBlock),
            bytesPerChecksum);
      }

      // setup pipeline to append to the last block XXX retries??
      nodes = lastBlock.getLocations();
      errorIndex = -1;   // no errors yet.
      if (nodes.length < 1) {
        throw new IOException("Unable to retrieve block locations" +
            " for last block " + block +
            " of file " + src);

      }
    }
   
    /**
     * Initialize for data streaming
     */
    private void initDataStreaming() {
      this.setName("DataStreamer for file " + src +
          " block " + block);
      response = new ResponseProcessor(nodes);
      response.start();
      stage = BlockConstructionStage.DATA_STREAMING;
    }
   
    private void endBlock() {
      DFSClient.LOG.debug("Closing old block " + block);
      this.setName("DataStreamer for file " + src);
      closeResponder();
      closeStream();
      nodes = null;
      stage = BlockConstructionStage.PIPELINE_SETUP_CREATE;
    }
   
    /*
     * streamer thread is the only thread that opens streams to datanode,
     * and closes them. Any error recovery is also done by this thread.
     */
    public void run() {
      long lastPacket = System.currentTimeMillis();
      while (!streamerClosed && dfsClient.clientRunning) {

        // if the Responder encountered an error, shutdown Responder
        if (hasError && response != null) {
          try {
            response.close();
            response.join();
            response = null;
          } catch (InterruptedException  e) {
          }
        }

        Packet one = null;

        try {
          // process datanode IO errors if any
          boolean doSleep = false;
          if (hasError && errorIndex>=0) {
            doSleep = processDatanodeError();
          }

          synchronized (dataQueue) {
            // wait for a packet to be sent.
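            // The wait below serves two purposes: it blocks until a packet
            // is queued, or, while in DATA_STREAMING, until roughly half the
            // socket timeout has elapsed with nothing sent, in which case a
            // heartbeat packet is created to keep the pipeline alive.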
            long now = System.currentTimeMillis();
            while ((!streamerClosed && !hasError && dfsClient.clientRunning
                && dataQueue.size() == 0 &&
                (stage != BlockConstructionStage.DATA_STREAMING ||
                 stage == BlockConstructionStage.DATA_STREAMING &&
                 now - lastPacket < dfsClient.socketTimeout/2)) || doSleep ) {
              long timeout = dfsClient.socketTimeout/2 - (now-lastPacket);
              timeout = timeout <= 0 ? 1000 : timeout;
              timeout = (stage == BlockConstructionStage.DATA_STREAMING)?
                 timeout : 1000;
              try {
                dataQueue.wait(timeout);
              } catch (InterruptedException  e) {
              }
              doSleep = false;
              now = System.currentTimeMillis();
            }
            if (streamerClosed || hasError || !dfsClient.clientRunning) {
              continue;
            }
            // get packet to be sent.
            if (dataQueue.isEmpty()) {
              one = new Packet();  // heartbeat packet
            } else {
              one = dataQueue.getFirst(); // regular data packet
            }
          }

          // get new block from namenode.
          if (stage == BlockConstructionStage.PIPELINE_SETUP_CREATE) {
            DFSClient.LOG.debug("Allocating new block");
            nodes = nextBlockOutputStream(src);
            initDataStreaming();
          } else if (stage == BlockConstructionStage.PIPELINE_SETUP_APPEND) {
            DFSClient.LOG.debug("Append to block " + block);
            setupPipelineForAppendOrRecovery();
            initDataStreaming();
          }

          long lastByteOffsetInBlock = one.getLastByteOffsetBlock();
          if (lastByteOffsetInBlock > blockSize) {
            throw new IOException("BlockSize " + blockSize +
                " is smaller than data size. " +
                " Offset of packet in block " +
                lastByteOffsetInBlock +
                " Aborting file " + src);
          }

          if (one.lastPacketInBlock) {
            // wait for all outstanding data packets to be acked
            synchronized (dataQueue) {
              while (!streamerClosed && !hasError &&
                  ackQueue.size() != 0 && dfsClient.clientRunning) {
                try {
                  // wait for acks to arrive from datanodes
                  dataQueue.wait(1000);
                } catch (InterruptedException  e) {
                }
              }
            }
            if (streamerClosed || hasError || !dfsClient.clientRunning) {
              continue;
            }
            stage = BlockConstructionStage.PIPELINE_CLOSE;
          }
         
          // send the packet
          ByteBuffer buf = one.getBuffer();

          synchronized (dataQueue) {
            // move packet from dataQueue to ackQueue
            if (!one.isHeartbeatPacket()) {
              dataQueue.removeFirst();
              ackQueue.addLast(one);
              dataQueue.notifyAll();
            }
          }

          if (DFSClient.LOG.isDebugEnabled()) {
            DFSClient.LOG.debug("DataStreamer block " + block +
                " sending packet " + one);
          }

          // write out data to remote datanode
          blockStream.write(buf.array(), buf.position(), buf.remaining());
          blockStream.flush();
          lastPacket = System.currentTimeMillis();
         
          if (one.isHeartbeatPacket()) {  //heartbeat packet
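            // nothing more to do for a heartbeat packet: it was never added
            // to dataQueue or ackQueue, so no bookkeeping is needed here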
          }
         
          // update bytesSent
          long tmpBytesSent = one.getLastByteOffsetBlock();
          if (bytesSent < tmpBytesSent) {
            bytesSent = tmpBytesSent;
          }

          if (streamerClosed || hasError || !dfsClient.clientRunning) {
            continue;
          }

          // Is this block full?
          if (one.lastPacketInBlock) {
            // wait for the close packet to be acked
            synchronized (dataQueue) {
              while (!streamerClosed && !hasError &&
                  ackQueue.size() != 0 && dfsClient.clientRunning) {
                dataQueue.wait(1000);// wait for acks to arrive from datanodes
              }
            }
            if (streamerClosed || hasError || !dfsClient.clientRunning) {
              continue;
            }

            endBlock();
          }
          if (progress != null) { progress.progress(); }

          // This is used by unit test to trigger race conditions.
          if (artificialSlowdown != 0 && dfsClient.clientRunning) {
            Thread.sleep(artificialSlowdown);
          }
        } catch (Throwable e) {
          DFSClient.LOG.warn("DataStreamer Exception: " +
              StringUtils.stringifyException(e));
          if (e instanceof IOException) {
            setLastException((IOException)e);
          }
          hasError = true;
          if (errorIndex == -1) { // not a datanode error
            streamerClosed = true;
          }
        }
      }
      closeInternal();
    }

    private void closeInternal() {
      closeResponder();       // close and join
      closeStream();
      streamerClosed = true;
      closed = true;
      synchronized (dataQueue) {
        dataQueue.notifyAll();
      }
    }

    /*
     * close both streamer and DFSOutputStream, should be called only
     * by an external thread and only after all data to be sent has
     * been flushed to datanode.
     *
     * Interrupt this data streamer if force is true
     *
     * @param force if this data stream is forced to be closed
     */
    void close(boolean force) {
      streamerClosed = true;
      synchronized (dataQueue) {
        dataQueue.notifyAll();
      }
      if (force) {
        this.interrupt();
      }
    }

    private void closeResponder() {
      if (response != null) {
        try {
          response.close();
          response.join();
        } catch (InterruptedException  e) {
        } finally {
          response = null;
        }
      }
    }

    private void closeStream() {
      if (blockStream != null) {
        try {
          blockStream.close();
        } catch (IOException e) {
        } finally {
          blockStream = null;
        }
      }
      if (blockReplyStream != null) {
        try {
          blockReplyStream.close();
        } catch (IOException e) {
        } finally {
          blockReplyStream = null;
        }
      }
    }

    //
    // Processes responses from the datanodes.  A packet is removed
    // from the ackQueue when its response arrives.
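    // Acks are expected to arrive in the order the packets were sent, so
    // each ack is matched against the packet at the head of ackQueue; a
    // sequence-number mismatch is treated as an error.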
    //
    private class ResponseProcessor extends Daemon {

      private volatile boolean responderClosed = false;
      private DatanodeInfo[] targets = null;
      private boolean isLastPacketInBlock = false;

      ResponseProcessor (DatanodeInfo[] targets) {
        this.targets = targets;
      }

      public void run() {

        setName("ResponseProcessor for block " + block);
        PipelineAck ack = new PipelineAck();

        while (!responderClosed && dfsClient.clientRunning && !isLastPacketInBlock) {
          // process responses from datanodes.
          try {
            // read an ack from the pipeline
            ack.readFields(blockReplyStream);
            if (DFSClient.LOG.isDebugEnabled()) {
              DFSClient.LOG.debug("DFSClient " + ack);
            }
           
            long seqno = ack.getSeqno();
            // processes response status from datanodes.
            for (int i = ack.getNumOfReplies()-1; i >= 0 && dfsClient.clientRunning; i--) {
              final DataTransferProtocol.Status reply = ack.getReply(i);
              if (reply != SUCCESS) {
                errorIndex = i; // first bad datanode
                throw new IOException("Bad response " + reply +
                    " for block " + block +
                    " from datanode " +
                    targets[i].getName());
              }
            }
           
            assert seqno != PipelineAck.UNKOWN_SEQNO :
              "Ack for unkown seqno should be a failed ack: " + ack;
            if (seqno == Packet.HEART_BEAT_SEQNO) {  // a heartbeat ack
              continue;
            }

            // a success ack for a data packet
            Packet one = null;
            synchronized (dataQueue) {
              one = ackQueue.getFirst();
            }
            if (one.seqno != seqno) {
              throw new IOException("ResponseProcessor: Expecting seqno " +
                                    one.seqno + " for block " + block +
                                    " but received " + seqno);
            }
            isLastPacketInBlock = one.lastPacketInBlock;
            // update bytesAcked
            block.setNumBytes(one.getLastByteOffsetBlock());

            synchronized (dataQueue) {
              ackQueue.removeFirst();
              dataQueue.notifyAll();
            }
          } catch (Exception e) {
            if (!responderClosed) {
              if (e instanceof IOException) {
                setLastException((IOException)e);
              }
              hasError = true;
              errorIndex = errorIndex==-1 ? 0 : errorIndex;
              synchronized (dataQueue) {
                dataQueue.notifyAll();
              }
              DFSClient.LOG.warn("DFSOutputStream ResponseProcessor exception " +
                  " for block " + block +
                  StringUtils.stringifyException(e));
              responderClosed = true;
            }
          }
        }
      }

      void close() {
        responderClosed = true;
        this.interrupt();
      }
    }

    // If this stream has encountered any errors so far, shutdown
    // threads and mark stream as closed. Returns true if we should
    // sleep for a while after returning from this call.
    //
    private boolean processDatanodeError() throws IOException {
      if (response != null) {
        DFSClient.LOG.info("Error Recovery for block " + block +
        " waiting for responder to exit. ");
        return true;
      }
      closeStream();

      // move packets from ack queue to front of the data queue
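      // (these packets were sent but never fully acknowledged, so they must
      // be retransmitted through whatever pipeline is set up next)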
      synchronized (dataQueue) {
        dataQueue.addAll(0, ackQueue);
        ackQueue.clear();
      }

      boolean doSleep = setupPipelineForAppendOrRecovery();
     
      if (!streamerClosed && dfsClient.clientRunning) {
        if (stage == BlockConstructionStage.PIPELINE_CLOSE) {
          synchronized (dataQueue) {
            dataQueue.remove();  // remove the end-of-block packet
            dataQueue.notifyAll();
          }
          endBlock();
        } else {
          initDataStreaming();
        }
      }
     
      return doSleep;
    }


    /**
     * Open a DataOutputStream to a DataNode pipeline so that
     * it can be written to.
     * This happens when a file is appended to or when data streaming fails.
     * It keeps trying until a pipeline is set up.
     */
    private boolean setupPipelineForAppendOrRecovery() throws IOException {
      // check number of datanodes
      if (nodes == null || nodes.length == 0) {
        String msg = "Could not get block locations. " + "Source file \""
            + src + "\" - Aborting...";
        DFSClient.LOG.warn(msg);
        setLastException(new IOException(msg));
        streamerClosed = true;
        return false;
      }
     
      boolean success = false;
      long newGS = 0L;
      while (!success && !streamerClosed && dfsClient.clientRunning) {
        boolean isRecovery = hasError;
        // remove bad datanode from list of datanodes.
        // If errorIndex was not set (i.e. appends), then do not remove
        // any datanodes
        //
        if (errorIndex >= 0) {
          StringBuilder pipelineMsg = new StringBuilder();
          for (int j = 0; j < nodes.length; j++) {
            pipelineMsg.append(nodes[j].getName());
            if (j < nodes.length - 1) {
              pipelineMsg.append(", ");
            }
          }
          if (nodes.length <= 1) {
            lastException = new IOException("All datanodes " + pipelineMsg
                + " are bad. Aborting...");
            streamerClosed = true;
            return false;
          }
          DFSClient.LOG.warn("Error Recovery for block " + block +
              " in pipeline " + pipelineMsg +
              ": bad datanode " + nodes[errorIndex].getName());
          DatanodeInfo[] newnodes = new DatanodeInfo[nodes.length-1];
          System.arraycopy(nodes, 0, newnodes, 0, errorIndex);
          System.arraycopy(nodes, errorIndex+1, newnodes, errorIndex,
              newnodes.length-errorIndex);
          nodes = newnodes;
          hasError = false;
          lastException = null;
          errorIndex = -1;
        }

        // get a new generation stamp and an access token
        LocatedBlock lb = dfsClient.namenode.updateBlockForPipeline(block, dfsClient.clientName);
        newGS = lb.getBlock().getGenerationStamp();
        accessToken = lb.getAccessToken();
       
        // set up the pipeline again with the remaining nodes
        success = createBlockOutputStream(nodes, newGS, isRecovery);
      }

      if (success) {
        // update pipeline at the namenode
        Block newBlock = new Block(
            block.getBlockId(), block.getNumBytes(), newGS);
        dfsClient.namenode.updatePipeline(dfsClient.clientName, block, newBlock, nodes);
        // update client side generation stamp
        block = newBlock;
      }
      return false; // do not sleep, continue processing
    }

    /**
     * Open a DataOutputStream to a DataNode so that it can be written to.
     * This happens when a file is created and each time a new block is allocated.
     * Must get block ID and the IDs of the destinations from the namenode.
     * Returns the list of target datanodes.
     */
    private DatanodeInfo[] nextBlockOutputStream(String client) throws IOException {
      LocatedBlock lb = null;
      DatanodeInfo[] nodes = null;
      int count = conf.getInt("dfs.client.block.write.retries", 3);
      boolean success = false;
      do {
        hasError = false;
        lastException = null;
        errorIndex = -1;
        success = false;

        long startTime = System.currentTimeMillis();
        DatanodeInfo[] w = excludedNodes.toArray(
            new DatanodeInfo[excludedNodes.size()]);
        lb = locateFollowingBlock(startTime, w.length > 0 ? w : null);
        block = lb.getBlock();
        block.setNumBytes(0);
        accessToken = lb.getAccessToken();
        nodes = lb.getLocations();

        //
        // Connect to first DataNode in the list.
        //
        success = createBlockOutputStream(nodes, 0L, false);

        if (!success) {
          DFSClient.LOG.info("Abandoning block " + block);
          dfsClient.namenode.abandonBlock(block, src, dfsClient.clientName);
          block = null;
          DFSClient.LOG.info("Excluding datanode " + nodes[errorIndex]);
          excludedNodes.add(nodes[errorIndex]);
        }
      } while (!success && --count >= 0);

      if (!success) {
        throw new IOException("Unable to create new block.");
      }
      return nodes;
    }

    // Connects to the first datanode in the pipeline.
    // Returns true on success, false otherwise.
    //
    private boolean createBlockOutputStream(DatanodeInfo[] nodes, long newGS,
        boolean recoveryFlag) {
      DataTransferProtocol.Status pipelineStatus = SUCCESS;
      String firstBadLink = "";
      if (DFSClient.LOG.isDebugEnabled()) {
        for (int i = 0; i < nodes.length; i++) {
          DFSClient.LOG.debug("pipeline = " + nodes[i].getName());
        }
      }

      // persist blocks on namenode on next flush
      persistBlocks.set(true);

      try {
        DFSClient.LOG.debug("Connecting to " + nodes[0].getName());
        InetSocketAddress target = NetUtils.createSocketAddr(nodes[0].getName());
        s = dfsClient.socketFactory.createSocket();
        int timeoutValue = dfsClient.getDatanodeReadTimeout(nodes.length);
        NetUtils.connect(s, target, timeoutValue);
        s.setSoTimeout(timeoutValue);
        s.setSendBufferSize(DFSClient.DEFAULT_DATA_SOCKET_SIZE);
        DFSClient.LOG.debug("Send buf size " + s.getSendBufferSize());
        long writeTimeout = dfsClient.getDatanodeWriteTimeout(nodes.length);

        //
        // Xmit header info to datanode
        //
        DataOutputStream out = new DataOutputStream(new BufferedOutputStream(
            NetUtils.getOutputStream(s, writeTimeout),
            DataNode.SMALL_BUFFER_SIZE));
        blockReplyStream = new DataInputStream(NetUtils.getInputStream(s));

        // send the request
        DataTransferProtocol.Sender.opWriteBlock(out,
            block.getBlockId(), block.getGenerationStamp(),
            nodes.length, recoveryFlag?stage.getRecoveryStage():stage, newGS,
            block.getNumBytes(), bytesSent, dfsClient.clientName, null, nodes, accessToken);
        checksum.writeHeader(out);
        out.flush();

        // receive ack for connect
        pipelineStatus = DataTransferProtocol.Status.read(blockReplyStream);
        firstBadLink = Text.readString(blockReplyStream);
        if (pipelineStatus != SUCCESS) {
          if (pipelineStatus == ERROR_ACCESS_TOKEN) {
            throw new InvalidAccessTokenException(
                "Got access token error for connect ack with firstBadLink as "
                    + firstBadLink);
          } else {
            throw new IOException("Bad connect ack with firstBadLink as "
                + firstBadLink);
          }
        }

        blockStream = out;
        return true; // success

      } catch (IOException ie) {

        DFSClient.LOG.info("Exception in createBlockOutputStream " + ie);

        // find the datanode that matches
        if (firstBadLink.length() != 0) {
          for (int i = 0; i < nodes.length; i++) {
            if (nodes[i].getName().equals(firstBadLink)) {
              errorIndex = i;
              break;
            }
          }
        } else {
          errorIndex = 0;
        }
        hasError = true;
        setLastException(ie);
        blockReplyStream = null;
        return false;  // error
      }
    }

    private LocatedBlock locateFollowingBlock(long start,
        DatanodeInfo[] excludedNodes)
        throws IOException, UnresolvedLinkException {
      int retries = conf.getInt("dfs.client.block.write.locateFollowingBlock.retries", 5);
      long sleeptime = 400;
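      // On NotReplicatedYetException the addBlock call below is retried with
      // an exponentially increasing sleep (400ms, 800ms, ...), up to
      // 'retries' attempts; other remote exceptions are rethrown immediately.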
      while (true) {
        long localstart = System.currentTimeMillis();
        while (true) {
          try {
            return dfsClient.namenode.addBlock(src, dfsClient.clientName, block, excludedNodes);
          } catch (RemoteException e) {
            IOException ue =
              e.unwrapRemoteException(FileNotFoundException.class,
                                      AccessControlException.class,
                                      NSQuotaExceededException.class,
                                      DSQuotaExceededException.class,
                                      UnresolvedPathException.class);
            if (ue != e) {
              throw ue; // no need to retry these exceptions
            }
           
           
            if (NotReplicatedYetException.class.getName().
                equals(e.getClassName())) {
              if (retries == 0) {
                throw e;
              } else {
                --retries;
                DFSClient.LOG.info(StringUtils.stringifyException(e));
                if (System.currentTimeMillis() - localstart > 5000) {
                  DFSClient.LOG.info("Waiting for replication for "
                      + (System.currentTimeMillis() - localstart) / 1000
                      + " seconds");
                }
                try {
                  DFSClient.LOG.warn("NotReplicatedYetException sleeping " + src
                      + " retries left " + retries);
                  Thread.sleep(sleeptime);
                  sleeptime *= 2;
                } catch (InterruptedException ie) {
                }
              }
            } else {
              throw e;
            }

          }
        }
      }
    }

    Block getBlock() {
      return block;
    }

    DatanodeInfo[] getNodes() {
      return nodes;
    }

    BlockAccessToken getAccessToken() {
      return accessToken;
    }

    private void setLastException(IOException e) {
      if (lastException == null) {
        lastException = e;
      }
    }
  }

  private void isClosed() throws IOException {
    if (closed) {
      IOException e = lastException;
      throw e != null ? e : new IOException("DFSOutputStream is closed");
    }
  }

  //
  // Returns the list of targets, if any, that is currently being used.
  //
  synchronized DatanodeInfo[] getPipeline() {
    if (streamer == null) {
      return null;
    }
    DatanodeInfo[] currentNodes = streamer.getNodes();
    if (currentNodes == null) {
      return null;
    }
    DatanodeInfo[] value = new DatanodeInfo[currentNodes.length];
    for (int i = 0; i < currentNodes.length; i++) {
      value[i] = currentNodes[i];
    }
    return value;
  }

  private DFSOutputStream(DFSClient dfsClient, String src, long blockSize, Progressable progress,
      int bytesPerChecksum, short replication) throws IOException {
    super(new PureJavaCrc32(), bytesPerChecksum, 4);
    this.dfsClient = dfsClient;
    this.conf = dfsClient.conf;
    this.src = src;
    this.blockSize = blockSize;
    this.blockReplication = replication;
    this.progress = progress;
    if (progress != null) {
      DFSClient.LOG.debug("Set non-null progress callback on DFSOutputStream "+src);
    }
   
    if ( bytesPerChecksum < 1 || blockSize % bytesPerChecksum != 0) {
      throw new IOException("io.bytes.per.checksum(" + bytesPerChecksum +
                            ") and blockSize(" + blockSize +
                            ") do not match. " + "blockSize should be a " +
                            "multiple of io.bytes.per.checksum");
                           
    }
    checksum = DataChecksum.newDataChecksum(DataChecksum.CHECKSUM_CRC32,
                                            bytesPerChecksum);
  }

  /**
   * Create a new output stream to the given DataNode.
   * @see ClientProtocol#create(String, FsPermission, String, EnumSetWritable, boolean, short, long)
   */
  DFSOutputStream(DFSClient dfsClient, String src, FsPermission masked, EnumSet<CreateFlag> flag,
      boolean createParent, short replication, long blockSize, Progressable progress,
      int buffersize, int bytesPerChecksum)
      throws IOException, UnresolvedLinkException {
    this(dfsClient, src, blockSize, progress, bytesPerChecksum, replication);

    computePacketChunkSize(dfsClient.writePacketSize, bytesPerChecksum);

    try {
      dfsClient.namenode.create(
          src, masked, dfsClient.clientName, new EnumSetWritable<CreateFlag>(flag), createParent, replication, blockSize);
    } catch(RemoteException re) {
      throw re.unwrapRemoteException(AccessControlException.class,
                                     FileAlreadyExistsException.class,
                                     FileNotFoundException.class,
                                     NSQuotaExceededException.class,
                                     DSQuotaExceededException.class,
                                     UnresolvedPathException.class);
    }
    streamer = new DataStreamer();
    streamer.start();
  }

  /**
   * Create a new output stream to the given DataNode.
   * @see ClientProtocol#create(String, FsPermission, String, boolean, short, long)
   */
  DFSOutputStream(DFSClient dfsClient, String src, int buffersize, Progressable progress,
      LocatedBlock lastBlock, HdfsFileStatus stat,
      int bytesPerChecksum) throws IOException {
    this(dfsClient, src, stat.getBlockSize(), progress, bytesPerChecksum, stat.getReplication());
    initialFileSize = stat.getLen(); // length of file when opened

    //
    // The last partial block of the file has to be filled.
    //
    if (lastBlock != null) {
      // indicate that we are appending to an existing block
      bytesCurBlock = lastBlock.getBlockSize();
      streamer = new DataStreamer(lastBlock, stat, bytesPerChecksum);
    } else {
      computePacketChunkSize(dfsClient.writePacketSize, bytesPerChecksum);
      streamer = new DataStreamer();
    }
    streamer.start();
  }

  private void computePacketChunkSize(int psize, int csize) {
    int chunkSize = csize + checksum.getChecksumSize();
    int n = DataNode.PKT_HEADER_LEN + DFSClient.SIZE_OF_INTEGER;
    chunksPerPacket = Math.max((psize - n + chunkSize-1)/chunkSize, 1);
    packetSize = n + chunkSize*chunksPerPacket;
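    // Illustrative numbers (approximate; the exact header length comes from
    // DataNode.PKT_HEADER_LEN): with the typical 64K write packet size,
    // 512-byte chunks and 4-byte CRC32 checksums, chunkSize is 516, so on
    // the order of 127 chunks fit per packet and packetSize ends up close
    // to 64K.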
    if (DFSClient.LOG.isDebugEnabled()) {
      DFSClient.LOG.debug("computePacketChunkSize: src=" + src +
                ", chunkSize=" + chunkSize +
                ", chunksPerPacket=" + chunksPerPacket +
                ", packetSize=" + packetSize);
    }
  }

  private void queuePacket(Packet packet) {
    synchronized (dataQueue) {
      dataQueue.addLast(packet);
      dataQueue.notifyAll();
    }
  }

  private void waitAndQueuePacket(Packet packet) throws IOException {
    synchronized (dataQueue) {
      // If queue is full, then wait till we have enough space
      while (!closed && dataQueue.size() + ackQueue.size()  > MAX_PACKETS) {
        try {
          dataQueue.wait();
        } catch (InterruptedException  e) {
        }
      }
      isClosed();
      queuePacket(packet);
    }
  }

  // @see FSOutputSummer#writeChunk()
  @Override
  protected synchronized void writeChunk(byte[] b, int offset, int len, byte[] checksum)
                                                        throws IOException {
    dfsClient.checkOpen();
    isClosed();

    int cklen = checksum.length;
    int bytesPerChecksum = this.checksum.getBytesPerChecksum();
    if (len > bytesPerChecksum) {
      throw new IOException("writeChunk() buffer size " + len +
                            " is larger than the supported bytesPerChecksum " +
                            bytesPerChecksum);
    }
    if (checksum.length != this.checksum.getChecksumSize()) {
      throw new IOException("writeChunk() checksum size is supposed to be " +
                            this.checksum.getChecksumSize() +
                            " but found to be " + checksum.length);
    }

    if (currentPacket == null) {
      currentPacket = new Packet(packetSize, chunksPerPacket,
          bytesCurBlock);
      if (DFSClient.LOG.isDebugEnabled()) {
        DFSClient.LOG.debug("DFSClient writeChunk allocating new packet seqno=" +
            currentPacket.seqno +
            ", src=" + src +
            ", packetSize=" + packetSize +
            ", chunksPerPacket=" + chunksPerPacket +
            ", bytesCurBlock=" + bytesCurBlock);
      }
    }

    currentPacket.writeChecksum(checksum, 0, cklen);
    currentPacket.writeData(b, offset, len);
    currentPacket.numChunks++;
    bytesCurBlock += len;

    // If packet is full, enqueue it for transmission
    //
    if (currentPacket.numChunks == currentPacket.maxChunks ||
        bytesCurBlock == blockSize) {
      if (DFSClient.LOG.isDebugEnabled()) {
        DFSClient.LOG.debug("DFSClient writeChunk packet full seqno=" +
            currentPacket.seqno +
            ", src=" + src +
            ", bytesCurBlock=" + bytesCurBlock +
            ", blockSize=" + blockSize +
            ", appendChunk=" + appendChunk);
      }
      waitAndQueuePacket(currentPacket);
      currentPacket = null;

      // If the reopened file did not end at a chunk boundary and the above
      // write filled up its partial chunk, tell the summer to generate full
      // crc chunks from now on.
      if (appendChunk && bytesCurBlock%bytesPerChecksum == 0) {
        appendChunk = false;
        resetChecksumChunk(bytesPerChecksum);
      }

      if (!appendChunk) {
        int psize = Math.min((int)(blockSize-bytesCurBlock), dfsClient.writePacketSize);
        computePacketChunkSize(psize, bytesPerChecksum);
      }
      //
      // if encountering a block boundary, send an empty packet to
      // indicate the end of block and reset bytesCurBlock.
      //
      if (bytesCurBlock == blockSize) {
        currentPacket = new Packet(DataNode.PKT_HEADER_LEN+4, 0,
            bytesCurBlock);
        currentPacket.lastPacketInBlock = true;
        waitAndQueuePacket(currentPacket);
        currentPacket = null;
        bytesCurBlock = 0;
        lastFlushOffset = -1;
      }
    }
  }

  @Override
  @Deprecated
  public synchronized void sync() throws IOException {
    hflush();
  }
 
  /**
   * Flushes out to all replicas of the block.
   * The data is in the buffers of the DNs
   * but not necessarily in the DNs' OS buffers.
   *
   * It is a synchronous operation. When it returns,
   * it guarantees that flushed data becomes visible to new readers.
   * It is not guaranteed that data has been flushed to
   * persistent store on the datanode.
   * Block allocations are persisted on namenode.
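   *
   * A minimal illustration (hypothetical caller code, not part of this class):
   * <pre>
   *   out.write(record);   // data is buffered on the client side
   *   out.hflush();        // returns once the data is visible to new readers
   * </pre>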
   */
  @Override
  public synchronized void hflush() throws IOException {
    dfsClient.checkOpen();
    isClosed();
    try {
      /* Record current blockOffset. This might be changed inside
       * flushBuffer() where a partial checksum chunk might be flushed.
       * After the flush, reset bytesCurBlock back to its previous value; any
       * partial checksum chunk will be sent now and again in the next packet.
       */
      long saveOffset = bytesCurBlock;

      // flush checksum buffer, but keep checksum buffer intact
      flushBuffer(true);

      DFSClient.LOG.debug("DFSClient flush() : saveOffset " + saveOffset + 
                " bytesCurBlock " + bytesCurBlock +
                " lastFlushOffset " + lastFlushOffset);
     
      // Flush only if we haven't already flushed till this offset.
      if (lastFlushOffset != bytesCurBlock) {

        // record the valid offset of this flush
        lastFlushOffset = bytesCurBlock;

        // wait for all packets to be sent and acknowledged
        flushInternal();
      } else {
        // just discard the current packet since it has already been sent.
        currentPacket = null;
      }
     
      // Restore state of stream. Record the last flush offset
      // of the last full chunk that was flushed.
      //
      bytesCurBlock = saveOffset;

      // If any new blocks were allocated since the last flush,
      // then persist block locations on namenode.
      //
      if (persistBlocks.getAndSet(false)) {
        dfsClient.namenode.fsync(src, dfsClient.clientName);
      }
    } catch (IOException e) {
        lastException = new IOException("IOException flush:" + e);
        closeThreads(true);
        throw e;
    }
  }

  /**
   * The expected semantics is that all data has been flushed out to all
   * replicas and each replica has done the POSIX fsync equivalent, i.e. the
   * OS has flushed it to the disk device (but the disk may still have it in
   * its cache).
   *
   * Right now it is simply implemented as hflush().
   */
  @Override
  public synchronized void hsync() throws IOException {
    hflush();
  }

  /**
   * Returns the number of replicas of current block. This can be different
   * from the designated replication factor of the file because the NameNode
   * does not replicate the block to which a client is currently writing.
   * The client continues to write to a block even if a few datanodes in the
   * write pipeline have failed.
   * @return the number of valid replicas of the current block
   */
  public synchronized int getNumCurrentReplicas() throws IOException {
    dfsClient.checkOpen();
    isClosed();
    if (streamer == null) {
      return blockReplication; // no pipeline, return repl factor of file
    }
    DatanodeInfo[] currentNodes = streamer.getNodes();
    if (currentNodes == null) {
      return blockReplication; // no pipeline, return repl factor of file
    }
    return currentNodes.length;
  }
 
  /**
   * Waits till all existing data is flushed and confirmations
   * received from datanodes.
   */
  private synchronized void flushInternal() throws IOException {
    dfsClient.checkOpen();
    isClosed();
    //
    // If there is data in the current buffer, send it across
    //
    if (currentPacket != null) {
      queuePacket(currentPacket);
      currentPacket = null;
    }

    synchronized (dataQueue) {
      while (!closed && dataQueue.size() + ackQueue.size() > 0) {
        try {
          dataQueue.wait();
        } catch (InterruptedException  e) {
        }
      }
      isClosed();
    }
  }

  /**
   * Aborts this output stream and releases any system
   * resources associated with this stream.
   */
  synchronized void abort() throws IOException {
    if (closed) {
      return;
    }
    streamer.setLastException(new IOException("Lease timeout of " +
                             (dfsClient.hdfsTimeout/1000) + " seconds expired."));
    closeThreads(true);
  }

  // shutdown datastreamer and responseprocessor threads.
  // interrupt datastreamer if force is true
  private void closeThreads(boolean force) throws IOException {
    try {
      streamer.close(force);
      streamer.join();
      if (s != null) {
        s.close();
      }
    } catch (InterruptedException e) {
      throw new IOException("Failed to shutdown streamer");
    } finally {
      streamer = null;
      s = null;
      closed = true;
    }
  }
 
  /**
   * Closes this output stream and releases any system
   * resources associated with this stream.
   */
  @Override
  public synchronized void close() throws IOException {
    if (closed) {
      IOException e = lastException;
      if (e == null)
        return;
      else
        throw e;
    }

    try {
      flushBuffer();       // flush from all upper layers

      if (currentPacket != null) {
        waitAndQueuePacket(currentPacket);
      }

      if (bytesCurBlock != 0) {
        // send an empty packet to mark the end of the block
        currentPacket = new Packet(DataNode.PKT_HEADER_LEN+4, 0,
            bytesCurBlock);
        currentPacket.lastPacketInBlock = true;
      }

      flushInternal();             // flush all data to Datanodes
      // get last block before destroying the streamer
      Block lastBlock = streamer.getBlock();
      closeThreads(false);
      completeFile(lastBlock);
      dfsClient.leasechecker.remove(src);
    } finally {
      closed = true;
    }
  }

  // should be called holding (this) lock since setTestFilename() may
  // be called during unit tests
  private void completeFile(Block last) throws IOException {
    long localstart = System.currentTimeMillis();
    boolean fileComplete = false;
    while (!fileComplete) {
      fileComplete = dfsClient.namenode.complete(src, dfsClient.clientName, last);
      if (!fileComplete) {
        if (!dfsClient.clientRunning ||
              (dfsClient.hdfsTimeout > 0 &&
               localstart + dfsClient.hdfsTimeout < System.currentTimeMillis())) {
            String msg = "Unable to close file because dfsclient " +
                          " was unable to contact the HDFS servers." +
                          " clientRunning " + dfsClient.clientRunning +
                          " hdfsTimeout " + dfsClient.hdfsTimeout;
            DFSClient.LOG.info(msg);
            throw new IOException(msg);
        }
        try {
          Thread.sleep(400);
          if (System.currentTimeMillis() - localstart > 5000) {
            DFSClient.LOG.info("Could not complete file " + src + " retrying...");
          }
        } catch (InterruptedException ie) {
        }
      }
    }
  }

  void setArtificialSlowdown(long period) {
    artificialSlowdown = period;
  }

  synchronized void setChunksPerPacket(int value) {
    chunksPerPacket = Math.min(chunksPerPacket, value);
    packetSize = DataNode.PKT_HEADER_LEN + DFSClient.SIZE_OF_INTEGER +
                 (checksum.getBytesPerChecksum() +
                  checksum.getChecksumSize()) * chunksPerPacket;
  }

  synchronized void setTestFilename(String newname) {
    src = newname;
  }

  /**
   * Returns the size of a file as it was when this stream was opened
   */
  long getInitialLen() {
    return initialFileSize;
  }

  /**
   * Returns the access token currently used by streamer, for testing only
   */
  BlockAccessToken getAccessToken() {
    return streamer.getAccessToken();
  }

}