Package org.apache.hadoop.hdfs.server.datanode

Source Code of org.apache.hadoop.hdfs.server.datanode.BlockReceiver

/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs.server.datanode;

import java.io.BufferedOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.EOFException;
import java.io.FileDescriptor;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.ByteBuffer;
import java.util.LinkedList;
import java.util.zip.Checksum;

import org.apache.commons.logging.Log;
import org.apache.hadoop.fs.FSInputChecker;
import org.apache.hadoop.fs.FSOutputSummer;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.DataTransferProtocol;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.FSConstants;
import org.apache.hadoop.hdfs.protocol.LocatedBlock;
import org.apache.hadoop.hdfs.protocol.DataTransferProtocol.PipelineAck;
import org.apache.hadoop.hdfs.util.DataTransferThrottler;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.nativeio.NativeIO;
import org.apache.hadoop.util.Daemon;
import org.apache.hadoop.util.DataChecksum;
import org.apache.hadoop.util.PureJavaCrc32;
import org.apache.hadoop.util.StringUtils;
import static org.apache.hadoop.hdfs.server.datanode.DataNode.DN_CLIENTTRACE_FORMAT;

/** A class that receives a block and writes to its own disk, meanwhile
* may copies it to another site. If a throttler is provided,
* streaming throttling is also supported.
**/
class BlockReceiver implements java.io.Closeable, FSConstants {
  public static final Log LOG = DataNode.LOG;
  static final Log ClientTraceLog = DataNode.ClientTraceLog;
 
  private static final long CACHE_DROP_LAG_BYTES = 8 * 1024 * 1024;
 
  private Block block; // the block to receive
  protected boolean finalized;
  private DataInputStream in = null; // from where data are read
  private DataChecksum checksum; // from where chunks of a block can be read
  private OutputStream out = null; // to block file at local disk
  private OutputStream cout = null; // output stream for cehcksum file
  private FileDescriptor outFd;
  private DataOutputStream checksumOut = null; // to crc file at local disk
  private int bytesPerChecksum;
  private int checksumSize;
  private ByteBuffer buf; // contains one full packet.
  private int bufRead; //amount of valid data in the buf
  private int maxPacketReadLen;
  protected long offsetInBlock;
  protected final String inAddr;
  protected final String myAddr;
  private String mirrorAddr;
  private DataOutputStream mirrorOut;
  private Daemon responder = null;
  private DataTransferThrottler throttler;
  private FSDataset.BlockWriteStreams streams;
  private boolean isRecovery = false;
  private String clientName;
  DatanodeInfo srcDataNode = null;
  private Checksum partialCrc = null;
  private DataNode datanode = null;
  volatile private boolean mirrorError;

  // Cache management state
  private boolean dropCacheBehindWrites;
  private boolean syncBehindWrites;
  private long lastCacheDropOffset = 0;
 
  BlockReceiver(Block block, DataInputStream in, String inAddr,
                String myAddr, boolean isRecovery, String clientName,
                DatanodeInfo srcDataNode, DataNode datanode) throws IOException {
    try{
      this.block = block;
      this.in = in;
      this.inAddr = inAddr;
      this.myAddr = myAddr;
      this.isRecovery = isRecovery;
      this.clientName = clientName;
      this.offsetInBlock = 0;
      this.srcDataNode = srcDataNode;
      this.datanode = datanode;
      this.checksum = DataChecksum.newDataChecksum(in);
      this.bytesPerChecksum = checksum.getBytesPerChecksum();
      this.checksumSize = checksum.getChecksumSize();
      this.dropCacheBehindWrites = datanode.shouldDropCacheBehindWrites();
      this.syncBehindWrites = datanode.shouldSyncBehindWrites();
      //
      // Open local disk out
      //
      streams = datanode.data.writeToBlock(block, isRecovery,
                              clientName == null || clientName.length() == 0);
      this.finalized = false;
      if (streams != null) {
        this.out = streams.dataOut;
        this.cout = streams.checksumOut;
        if (out instanceof FileOutputStream) {
          this.outFd = ((FileOutputStream) out).getFD();
        } else {
          LOG.warn("Could not get file descriptor for outputstream of class "
              + out.getClass());
        }
        this.checksumOut = new DataOutputStream(new BufferedOutputStream(
                                                  streams.checksumOut,
                                                  SMALL_BUFFER_SIZE));
        // If this block is for appends, then remove it from periodic
        // validation.
        if (datanode.blockScanner != null && isRecovery) {
          datanode.blockScanner.deleteBlock(block);
        }
      }
    } catch (BlockAlreadyExistsException bae) {
      throw bae;
    } catch(IOException ioe) {
      IOUtils.closeStream(this);
      cleanupBlock();
     
      // check if there is a disk error
      IOException cause = FSDataset.getCauseIfDiskError(ioe);
      DataNode.LOG.warn("IOException in BlockReceiver constructor. Cause is ",
          cause);
     
      if (cause != null) { // possible disk error
        ioe = cause;
        datanode.checkDiskError(ioe); // may throw an exception here
      }
     
      throw ioe;
    }
  }

  /**
   * close files.
   */
  public void close() throws IOException {

    IOException ioe = null;
    // close checksum file
    try {
      if (checksumOut != null) {
        checksumOut.flush();
        if (datanode.syncOnClose && (cout instanceof FileOutputStream)) {
          ((FileOutputStream)cout).getChannel().force(true);
        }
        checksumOut.close();
        checksumOut = null;
      }
    } catch(IOException e) {
      ioe = e;
    }
    // close block file
    try {
      if (out != null) {
        out.flush();
        if (datanode.syncOnClose && (out instanceof FileOutputStream)) {
          ((FileOutputStream)out).getChannel().force(true);
        }
        out.close();
        out = null;
      }
    } catch (IOException e) {
      ioe = e;
    }
    // disk check
    if(ioe != null) {
      datanode.checkDiskError(ioe);
      throw ioe;
    }
  }

  /**
   * Flush block data and metadata files to disk.
   * @throws IOException
   */
  void flush() throws IOException {
    if (checksumOut != null) {
      checksumOut.flush();
    }
    if (out != null) {
      out.flush();
    }
  }

  /**
   * While writing to mirrorOut, failure to write to mirror should not
   * affect this datanode.
   */
  private void handleMirrorOutError(IOException ioe) throws IOException {
    LOG.info(datanode.dnRegistration + ": Exception writing " +
             block + " to mirror " + mirrorAddr + "\n" +
             StringUtils.stringifyException(ioe));
    if (Thread.interrupted()) { // shut down if the thread is interrupted
      throw ioe;
    } else { // encounter an error while writing to mirror
      // continue to run even if can not write to mirror
      // notify client of the error
      // and wait for the client to shut down the pipeline
      mirrorError = true;
    }
  }
 
  /**
   * Verify multiple CRC chunks.
   */
  private void verifyChunks( byte[] dataBuf, int dataOff, int len,
                             byte[] checksumBuf, int checksumOff )
                             throws IOException {
    while (len > 0) {
      int chunkLen = Math.min(len, bytesPerChecksum);
     
      checksum.update(dataBuf, dataOff, chunkLen);

      if (!checksum.compare(checksumBuf, checksumOff)) {
        if (srcDataNode != null) {
          try {
            LOG.info("report corrupt " + block + " from datanode " +
                      srcDataNode + " to namenode");
            LocatedBlock lb = new LocatedBlock(block,
                                            new DatanodeInfo[] {srcDataNode});
            datanode.namenode.reportBadBlocks(new LocatedBlock[] {lb});
          } catch (IOException e) {
            LOG.warn("Failed to report bad " + block +
                      " from datanode " + srcDataNode + " to namenode");
          }
        }
        throw new IOException("Unexpected checksum mismatch " +
                              "while writing " + block + " from " + inAddr);
      }

      checksum.reset();
      dataOff += chunkLen;
      checksumOff += checksumSize;
      len -= chunkLen;
    }
  }

  /**
   * Makes sure buf.position() is zero without modifying buf.remaining().
   * It moves the data if position needs to be changed.
   */
  private void shiftBufData() {
    if (bufRead != buf.limit()) {
      throw new IllegalStateException("bufRead should be same as " +
                                      "buf.limit()");
    }
   
    //shift the remaining data on buf to the front
    if (buf.position() > 0) {
      int dataLeft = buf.remaining();
      if (dataLeft > 0) {
        byte[] b = buf.array();
        System.arraycopy(b, buf.position(), b, 0, dataLeft);
      }
      buf.position(0);
      bufRead = dataLeft;
      buf.limit(bufRead);
    }
  }
 
  /**
   * reads upto toRead byte to buf at buf.limit() and increments the limit.
   * throws an IOException if read does not succeed.
   */
  private int readToBuf(int toRead) throws IOException {
    if (toRead < 0) {
      toRead = (maxPacketReadLen > 0 ? maxPacketReadLen : buf.capacity())
               - buf.limit();
    }
   
    int nRead = in.read(buf.array(), buf.limit(), toRead);
   
    if (nRead < 0) {
      throw new EOFException("while trying to read " + toRead + " bytes");
    }
    bufRead = buf.limit() + nRead;
    buf.limit(bufRead);
    return nRead;
  }
 
 
  /**
   * Reads (at least) one packet and returns the packet length.
   * buf.position() points to the start of the packet and
   * buf.limit() point to the end of the packet. There could
   * be more data from next packet in buf.<br><br>
   *
   * It tries to read a full packet with single read call.
   * Consecutive packets are usually of the same length.
   */
  private int readNextPacket() throws IOException {
    /* This dances around buf a little bit, mainly to read
     * full packet with single read and to accept arbitarary size 
     * for next packet at the same time.
     */
    if (buf == null) {
      /* initialize buffer to the best guess size:
       * 'chunksPerPacket' calculation here should match the same
       * calculation in DFSClient to make the guess accurate.
       */
      int chunkSize = bytesPerChecksum + checksumSize;
      int chunksPerPacket = (datanode.writePacketSize - DataNode.PKT_HEADER_LEN -
                             SIZE_OF_INTEGER + chunkSize - 1)/chunkSize;
      buf = ByteBuffer.allocate(DataNode.PKT_HEADER_LEN + SIZE_OF_INTEGER +
                                Math.max(chunksPerPacket, 1) * chunkSize);
      buf.limit(0);
    }
   
    // See if there is data left in the buffer :
    if (bufRead > buf.limit()) {
      buf.limit(bufRead);
    }
   
    while (buf.remaining() < SIZE_OF_INTEGER) {
      if (buf.position() > 0) {
        shiftBufData();
      }
      readToBuf(-1);
    }
   
    /* We mostly have the full packet or at least enough for an int
     */
    buf.mark();
    int payloadLen = buf.getInt();
    buf.reset();
   
    if (payloadLen == 0) {
      //end of stream!
      buf.limit(buf.position() + SIZE_OF_INTEGER);
      return 0;
    }
   
    // check corrupt values for pktLen, 100MB upper limit should be ok?
    if (payloadLen < 0 || payloadLen > (100*1024*1024)) {
      throw new IOException("Incorrect value for packet payload : " +
                            payloadLen);
    }
   
    int pktSize = payloadLen + DataNode.PKT_HEADER_LEN;
   
    if (buf.remaining() < pktSize) {
      //we need to read more data
      int toRead = pktSize - buf.remaining();
     
      // first make sure buf has enough space.       
      int spaceLeft = buf.capacity() - buf.limit();
      if (toRead > spaceLeft && buf.position() > 0) {
        shiftBufData();
        spaceLeft = buf.capacity() - buf.limit();
      }
      if (toRead > spaceLeft) {
        byte oldBuf[] = buf.array();
        int toCopy = buf.limit();
        buf = ByteBuffer.allocate(toCopy + toRead);
        System.arraycopy(oldBuf, 0, buf.array(), 0, toCopy);
        buf.limit(toCopy);
      }
     
      //now read:
      while (toRead > 0) {
        toRead -= readToBuf(toRead);
      }
    }
   
    if (buf.remaining() > pktSize) {
      buf.limit(buf.position() + pktSize);
    }
   
    if (pktSize > maxPacketReadLen) {
      maxPacketReadLen = pktSize;
    }
   
    return payloadLen;
  }
 
  /**
   * Receives and processes a packet. It can contain many chunks.
   * returns size of the packet.
   */
  private int receivePacket() throws IOException {
   
    int payloadLen = readNextPacket();
   
    if (payloadLen <= 0) {
      return payloadLen;
    }
   
    buf.mark();
    //read the header
    buf.getInt(); // packet length
    offsetInBlock = buf.getLong(); // get offset of packet in block
    long seqno = buf.getLong();    // get seqno
    boolean lastPacketInBlock = (buf.get() != 0);
   
    int endOfHeader = buf.position();
    buf.reset();
   
    if (LOG.isDebugEnabled()){
      LOG.debug("Receiving one packet for " + block +
                " of length " + payloadLen +
                " seqno " + seqno +
                " offsetInBlock " + offsetInBlock +
                " lastPacketInBlock " + lastPacketInBlock);
    }
   
    setBlockPosition(offsetInBlock);
   
    // First write the packet to the mirror:
    if (mirrorOut != null && !mirrorError) {
      try {
        mirrorOut.write(buf.array(), buf.position(), buf.remaining());
        mirrorOut.flush();
      } catch (IOException e) {
        handleMirrorOutError(e);
      }
    }

    buf.position(endOfHeader);       
    int len = buf.getInt();
   
    if (len < 0) {
      throw new IOException("Got wrong length during writeBlock(" + block +
                            ") from " + inAddr + " at offset " +
                            offsetInBlock + ": " + len);
    }

    if (len == 0) {
      LOG.debug("Receiving empty packet for " + block);
    } else {
      offsetInBlock += len;

      int checksumLen = ((len + bytesPerChecksum - 1)/bytesPerChecksum)*
                                                            checksumSize;

      if ( buf.remaining() != (checksumLen + len)) {
        throw new IOException("Data remaining in packet does not match " +
                              "sum of checksumLen and dataLen");
      }
      int checksumOff = buf.position();
      int dataOff = checksumOff + checksumLen;
      byte pktBuf[] = buf.array();

      buf.position(buf.limit()); // move to the end of the data.

      /* skip verifying checksum iff this is not the last one in the
       * pipeline and clientName is non-null. i.e. Checksum is verified
       * on all the datanodes when the data is being written by a
       * datanode rather than a client. Whe client is writing the data,
       * protocol includes acks and only the last datanode needs to verify
       * checksum.
       */
      if (mirrorOut == null || clientName.length() == 0) {
        verifyChunks(pktBuf, dataOff, len, pktBuf, checksumOff);
      }

      try {
        if (!finalized) {
          //finally write to the disk :
          out.write(pktBuf, dataOff, len);

          // If this is a partial chunk, then verify that this is the only
          // chunk in the packet. Calculate new crc for this chunk.
          if (partialCrc != null) {
            if (len > bytesPerChecksum) {
              throw new IOException("Got wrong length during writeBlock(" +
                                    block + ") from " + inAddr + " " +
                                    "A packet can have only one partial chunk."+
                                    " len = " + len +
                                    " bytesPerChecksum " + bytesPerChecksum);
            }
            partialCrc.update(pktBuf, dataOff, len);
            byte[] buf = FSOutputSummer.convertToByteStream(partialCrc, checksumSize);
            checksumOut.write(buf);
            LOG.debug("Writing out partial crc for data len " + len);
            partialCrc = null;
          } else {
            checksumOut.write(pktBuf, checksumOff, checksumLen);
          }
          datanode.myMetrics.incrBytesWritten(len);
          /// flush entire packet before sending ack
          flush();
         
          // update length only after flush to disk
          datanode.data.setVisibleLength(block, offsetInBlock);
          dropOsCacheBehindWriter(offsetInBlock);
        }
      } catch (IOException iex) {
        datanode.checkDiskError(iex);
        throw iex;
      }
    }

    // put in queue for pending acks
    if (responder != null) {
      ((PacketResponder)responder.getRunnable()).enqueue(seqno,
                                      lastPacketInBlock);
    }
   
    if (throttler != null) { // throttle I/O
      throttler.throttle(payloadLen);
    }
   
    return payloadLen;
  }

  private void dropOsCacheBehindWriter(long offsetInBlock) throws IOException {
    try {
      if (outFd != null
          && offsetInBlock > lastCacheDropOffset + CACHE_DROP_LAG_BYTES) {
        long twoWindowsAgo = lastCacheDropOffset - CACHE_DROP_LAG_BYTES;
        if (twoWindowsAgo > 0 && dropCacheBehindWrites) {
          NativeIO.posixFadviseIfPossible(outFd, 0, lastCacheDropOffset,
              NativeIO.POSIX_FADV_DONTNEED);
        }

        if (syncBehindWrites) {
          NativeIO.syncFileRangeIfPossible(outFd, lastCacheDropOffset,
              CACHE_DROP_LAG_BYTES, NativeIO.SYNC_FILE_RANGE_WRITE);
        }

        lastCacheDropOffset += CACHE_DROP_LAG_BYTES;
      }
    } catch (Throwable t) {
      LOG.warn("Couldn't drop os cache behind writer for " + block, t);
    }
  }

  void writeChecksumHeader(DataOutputStream mirrorOut) throws IOException {
    checksum.writeHeader(mirrorOut);
  }

  void receiveBlock(
      DataOutputStream mirrOut, // output to next datanode
      DataInputStream mirrIn,   // input from next datanode
      DataOutputStream replyOut,  // output to previous datanode
      String mirrAddr, DataTransferThrottler throttlerArg,
      int numTargets) throws IOException {

      mirrorOut = mirrOut;
      mirrorAddr = mirrAddr;
      throttler = throttlerArg;

    try {
      // write data chunk header
      if (!finalized) {
        BlockMetadataHeader.writeHeader(checksumOut, checksum);
      }
      if (clientName.length() > 0) {
        responder = new Daemon(datanode.threadGroup,
                               new PacketResponder(this, block, mirrIn,
                                                   replyOut, numTargets,
                                                   Thread.currentThread()));
        responder.start(); // start thread to processes reponses
      }

      /*
       * Receive until packet length is zero.
       */
      while (receivePacket() > 0) {}

      // flush the mirror out
      if (mirrorOut != null) {
        try {
          mirrorOut.writeInt(0); // mark the end of the block
          mirrorOut.flush();
        } catch (IOException e) {
          handleMirrorOutError(e);
        }
      }

      // wait for all outstanding packet responses. And then
      // indicate responder to gracefully shutdown.
      if (responder != null) {
        ((PacketResponder)responder.getRunnable()).close();
      }

      // if this write is for a replication request (and not
      // from a client), then finalize block. For client-writes,
      // the block is finalized in the PacketResponder.
      if (clientName.length() == 0) {
        // close the block/crc files
        close();

        // Finalize the block. Does this fsync()?
        block.setNumBytes(offsetInBlock);
        datanode.data.finalizeBlock(block);
        datanode.myMetrics.incrBlocksWritten();
      }

    } catch (IOException ioe) {
      LOG.info("Exception in receiveBlock for " + block + " " + ioe);
      IOUtils.closeStream(this);
      if (responder != null) {
        responder.interrupt();
      }
      cleanupBlock();
      throw ioe;
    } finally {
      if (responder != null) {
        try {
          responder.join();
        } catch (InterruptedException e) {
          throw new IOException("Interrupted receiveBlock");
        }
        responder = null;
      }
    }
  }

  /** Cleanup a partial block
   * if this write is for a replication request (and not from a client)
   */
  private void cleanupBlock() throws IOException {
    if (clientName.length() == 0) { // not client write
      datanode.data.unfinalizeBlock(block);
    }
  }

  /**
   * Sets the file pointer in the local block file to the specified value.
   */
  private void setBlockPosition(long offsetInBlock) throws IOException {
    if (finalized) {
      if (!isRecovery) {
        throw new IOException("Write to offset " + offsetInBlock +
                              " of block " + block +
                              " that is already finalized.");
      }
      if (offsetInBlock > datanode.data.getLength(block)) {
        throw new IOException("Write to offset " + offsetInBlock +
                              " of block " + block +
                              " that is already finalized and is of size " +
                              datanode.data.getLength(block));
      }
      return;
    }

    if (datanode.data.getChannelPosition(block, streams) == offsetInBlock) {
      return;                   // nothing to do
    }
    long offsetInChecksum = BlockMetadataHeader.getHeaderSize() +
                            offsetInBlock / bytesPerChecksum * checksumSize;
    if (out != null) {
     out.flush();
    }
    if (checksumOut != null) {
      checksumOut.flush();
    }

    // If this is a partial chunk, then read in pre-existing checksum
    if (offsetInBlock % bytesPerChecksum != 0) {
      LOG.info("setBlockPosition trying to set position to " +
               offsetInBlock + " for " + block +
               " which is not a multiple of bytesPerChecksum " +
               bytesPerChecksum);
      computePartialChunkCrc(offsetInBlock, offsetInChecksum, bytesPerChecksum);
    }

    if (LOG.isDebugEnabled()) {
      LOG.debug("Changing block file offset of block " + block + " from " +
        datanode.data.getChannelPosition(block, streams) +
             " to " + offsetInBlock +
             " meta file offset to " + offsetInChecksum);
    }

    // set the position of the block file
    datanode.data.setChannelPosition(block, streams, offsetInBlock, offsetInChecksum);
  }

  /**
   * reads in the partial crc chunk and computes checksum
   * of pre-existing data in partial chunk.
   */
  private void computePartialChunkCrc(long blkoff, long ckoff,
                                      int bytesPerChecksum) throws IOException {

    // find offset of the beginning of partial chunk.
    //
    int sizePartialChunk = (int) (blkoff % bytesPerChecksum);
    int checksumSize = checksum.getChecksumSize();
    blkoff = blkoff - sizePartialChunk;
    LOG.info("computePartialChunkCrc sizePartialChunk " +
              sizePartialChunk + " " + block +
              " offset in block " + blkoff +
              " offset in metafile " + ckoff);

    // create an input stream from the block file
    // and read in partial crc chunk into temporary buffer
    //
    byte[] buf = new byte[sizePartialChunk];
    byte[] crcbuf = new byte[checksumSize];
    FSDataset.BlockInputStreams instr = null;
    try {
      instr = datanode.data.getTmpInputStreams(block, blkoff, ckoff);
      IOUtils.readFully(instr.dataIn, buf, 0, sizePartialChunk);

      // open meta file and read in crc value computer earlier
      IOUtils.readFully(instr.checksumIn, crcbuf, 0, crcbuf.length);
    } finally {
      IOUtils.closeStream(instr);
    }

    // compute crc of partial chunk from data read in the block file.
    partialCrc = new PureJavaCrc32();
    partialCrc.update(buf, 0, sizePartialChunk);
    LOG.info("Read in partial CRC chunk from disk for " + block);

    // paranoia! verify that the pre-computed crc matches what we
    // recalculated just now
    if (partialCrc.getValue() != FSInputChecker.checksum2long(crcbuf)) {
      String msg = "Partial CRC " + partialCrc.getValue() +
                   " does not match value computed the " +
                   " last time file was closed " +
                   FSInputChecker.checksum2long(crcbuf);
      throw new IOException(msg);
    }
    //LOG.debug("Partial CRC matches 0x" +
    //            Long.toHexString(partialCrc.getValue()));
  }
 
 
  /**
   * Processed responses from downstream datanodes in the pipeline
   * and sends back replies to the originator.
   */
  class PacketResponder implements Runnable, FSConstants {  

    //packet waiting for ack
    private LinkedList<Packet> ackQueue = new LinkedList<Packet>();
    private volatile boolean running = true;
    private Block block;
    DataInputStream mirrorIn;   // input from downstream datanode
    DataOutputStream replyOut;  // output to upstream datanode
    private int numTargets;     // number of downstream datanodes including myself
    private BlockReceiver receiver; // The owner of this responder.
    private Thread receiverThread; // the thread that spawns this responder

    public String toString() {
      return "PacketResponder " + numTargets + " for " + this.block;
    }

    PacketResponder(BlockReceiver receiver, Block b, DataInputStream in,
                    DataOutputStream out, int numTargets,
                    Thread receiverThread) {
      this.receiver = receiver;
      this.block = b;
      mirrorIn = in;
      replyOut = out;
      this.numTargets = numTargets;
      this.receiverThread = receiverThread;
    }

    /**
     * enqueue the seqno that is still be to acked by the downstream datanode.
     * @param seqno
     * @param lastPacketInBlock
     */
    synchronized void enqueue(long seqno, boolean lastPacketInBlock) {
      if (running) {
        LOG.debug("PacketResponder " + numTargets + " adding seqno " + seqno +
                  " to ack queue.");
        ackQueue.addLast(new Packet(seqno, lastPacketInBlock));
        notifyAll();
      }
    }

    /**
     * wait for all pending packets to be acked. Then shutdown thread.
     */
    synchronized void close() {
      while (running && ackQueue.size() != 0 && datanode.shouldRun) {
        try {
          wait();
        } catch (InterruptedException e) {
          running = false;
        }
      }
      LOG.debug("PacketResponder " + numTargets +
               " for block " + block + " Closing down.");
      running = false;
      notifyAll();
    }

    /**
     * Thread to process incoming acks.
     * @see java.lang.Runnable#run()
     */
    public void run() {
      boolean lastPacketInBlock = false;
      boolean isInterrupted = false;
      final long startTime = ClientTraceLog.isInfoEnabled() ? System.nanoTime() : 0;
      while (running && datanode.shouldRun && !lastPacketInBlock) {

        try {
          /**
           * Sequence number -2 is a special value that is used when
           * a DN fails to read an ack from a downstream. In this case,
           * it needs to tell the client that there's been an error downstream
           * but has no valid sequence number to use. Thus, -2 is used
           * as an UNKNOWN value.
           */
          long expected = PipelineAck.UNKOWN_SEQNO;
          long seqno = PipelineAck.UNKOWN_SEQNO;;

          PipelineAck ack = new PipelineAck();
          boolean localMirrorError = mirrorError;
          try {
            Packet pkt = null;
            synchronized (this) {
              // wait for a packet to arrive
              while (running && datanode.shouldRun && ackQueue.size() == 0) {
                if (LOG.isDebugEnabled()) {
                  LOG.debug("PacketResponder " + numTargets +
                            " seqno = " + seqno +
                            " for block " + block +
                            " waiting for local datanode to finish write.");
                  }
                  wait();
                }
                if (!running || !datanode.shouldRun) {
                  break;
                }
                pkt = ackQueue.removeFirst();
                expected = pkt.seqno;
                notifyAll();
              }
              // receive an ack if DN is not the last one in the pipeline
              if (numTargets > 0 && !localMirrorError) {
                // read an ack from downstream datanode
                ack.readFields(mirrorIn);
                if (LOG.isDebugEnabled()) {
                  LOG.debug("PacketResponder " + numTargets +
                      " for block " + block + " got " + ack);
                }
                seqno = ack.getSeqno();
                // verify seqno
                if (seqno != expected) {
                  throw new IOException("PacketResponder " + numTargets +
                      " for block " + block +
                      " expected seqno:" + expected +
                      " received:" + seqno);
                }
              }
              lastPacketInBlock = pkt.lastPacketInBlock;
            } catch (InterruptedException ine) {
              isInterrupted = true;
            } catch (IOException ioe) {
              if (Thread.interrupted()) {
                isInterrupted = true;
              } else {
                // continue to run even if can not read from mirror
                // notify client of the error
                // and wait for the client to shut down the pipeline
                mirrorError = true;
                LOG.info("PacketResponder " + block + " " + numTargets +
                    " Exception " + StringUtils.stringifyException(ioe));
              }
            }

            if (Thread.interrupted() || isInterrupted) {
              /* The receiver thread cancelled this thread.
               * We could also check any other status updates from the
               * receiver thread (e.g. if it is ok to write to replyOut).
               * It is prudent to not send any more status back to the client
               * because this datanode has a problem. The upstream datanode
               * will detect that this datanode is bad, and rightly so.
               */
              LOG.info("PacketResponder " + block +  " " + numTargets +
                       " : Thread is interrupted.");
              break;
            }
           
            // If this is the last packet in block, then close block
            // file and finalize the block before responding success
            if (lastPacketInBlock && !receiver.finalized) {
              receiver.close();
              final long endTime = ClientTraceLog.isInfoEnabled() ? System.nanoTime() : 0;
              block.setNumBytes(receiver.offsetInBlock);
              datanode.data.finalizeBlock(block);
              datanode.myMetrics.incrBlocksWritten();
              datanode.notifyNamenodeReceivedBlock(block,
                  DataNode.EMPTY_DEL_HINT);
              if (ClientTraceLog.isInfoEnabled() &&
                  receiver.clientName.length() > 0) {
                long offset = 0;
                ClientTraceLog.info(String.format(DN_CLIENTTRACE_FORMAT,
                      receiver.inAddr, receiver.myAddr, block.getNumBytes(),
                      "HDFS_WRITE", receiver.clientName, offset,
                      datanode.dnRegistration.getStorageID(), block, endTime-startTime));
              } else {
                LOG.info("Received " + block + " of size " + block.getNumBytes() +
                         " from " + receiver.inAddr);
              }
            }

            // construct my ack message
            short[] replies = null;
            if (mirrorError) { // no ack is read
              replies = new short[2];
              replies[0] = DataTransferProtocol.OP_STATUS_SUCCESS;
              replies[1] = DataTransferProtocol.OP_STATUS_ERROR;
            } else {
              short ackLen = numTargets == 0 ? 0 : ack.getNumOfReplies();
              replies = new short[1+ackLen];
              replies[0] = DataTransferProtocol.OP_STATUS_SUCCESS;
              for (int i=0; i<ackLen; i++) {
                replies[i+1] = ack.getReply(i);
              }
            }
            PipelineAck replyAck = new PipelineAck(expected, replies);
            // send my ack back to upstream datanode
            replyAck.write(replyOut);
            replyOut.flush();
            if (LOG.isDebugEnabled()) {
              LOG.debug("PacketResponder " + numTargets +
                        " for block " + block +
                        " responded an ack: " + replyAck);
            }
        } catch (Throwable e) {
          LOG.warn("IOException in BlockReceiver.run(): ", e);
          if (running) {
            LOG.info("PacketResponder " + block + " " + numTargets +
                     " Exception " + StringUtils.stringifyException(e));
            running = false;
          }
          if (!Thread.interrupted()) { // error not caused by interruption
            receiverThread.interrupt();
          }
        }
      }
      LOG.info("PacketResponder " + numTargets + " for " + block +
          " terminating");
    }
  }
 
  /**
   * This information is cached by the Datanode in the ackQueue.
   */
  static private class Packet {
    long seqno;
    boolean lastPacketInBlock;

    Packet(long seqno, boolean lastPacketInBlock) {
      this.seqno = seqno;
      this.lastPacketInBlock = lastPacketInBlock;
    }
  }
}
TOP

Related Classes of org.apache.hadoop.hdfs.server.datanode.BlockReceiver

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.