/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs.server.datanode;
import java.io.BufferedOutputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.OutputStream;
import java.io.RandomAccessFile;
import java.nio.channels.ClosedChannelException;
import java.nio.channels.FileChannel;
import java.util.Arrays;
import java.util.zip.Checksum;
import org.apache.hadoop.fs.FSInputChecker;
import org.apache.hadoop.hdfs.DFSClient;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.DataTransferProtocol;
import org.apache.hadoop.hdfs.protocol.FSConstants;
import org.apache.hadoop.hdfs.server.datanode.BlockDataFile.RandomAccessor;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.util.CrcConcat;
import org.apache.hadoop.util.DataChecksum;
import org.apache.hadoop.util.NativeCrc32;
/**
 * Writes data into a block file and checksums into a separate checksum file.
 *
 * The on-disk file format is:
* Data file:
*
* +---------------+
* | |
* | Data |
* | . |
* | . |
* | . |
* | . |
* | . |
* | . |
* | |
* +---------------+
*
* Checksum file:
* +----------------------+
* | Checksum Header |
* +----------------------+
* | Checksum for Chunk 1 |
* +----------------------+
* | Checksum for Chunk 2 |
* +----------------------+
* | . |
* | . |
* | . |
* +----------------------+
* | Checksum for last |
* | Chunk (Partial) |
* +----------------------+
*
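 *
 * Illustrative write sequence (a sketch, not taken from the original source;
 * the block, buffers and datanode objects are assumed to be supplied by the
 * caller):
 * <pre>{@code
 * BlockWithChecksumFileWriter w =
 *     new BlockWithChecksumFileWriter(blockDataFile, metafile);
 * w.initializeStreams(bytesPerChecksum, checksumSize, block, inAddr,
 *     namespaceId, datanode);
 * w.writeHeader(checksum);   // writes the checksum header to the meta file
 * w.writePacket(pktBuf, len, dataOff, pktBufStartOff, numChunks,
 *     DataTransferProtocol.PACKET_VERSION_CHECKSUM_FIRST);
 * w.flush(true);             // flush and force both files to disk
 * w.close();
 * }</pre>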
*/
public class BlockWithChecksumFileWriter extends DatanodeBlockWriter {
  private final BlockDataFile blockDataFile;
protected BlockDataFile.Writer blockDataWriter = null;
File metafile;
  protected DataOutputStream checksumOut = null; // buffered stream to the crc file on local disk
  protected OutputStream cout = null; // underlying output stream of the checksum file
public BlockWithChecksumFileWriter(BlockDataFile blockDataFile, File metafile) {
this.blockDataFile = blockDataFile;
this.metafile = metafile;
}
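  /**
   * Opens the data writer and the checksum output stream if they are not
   * already open, wraps the checksum stream in a small buffered
   * DataOutputStream, and records the checksum parameters for later writes.
   */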
public void initializeStreams(int bytesPerChecksum, int checksumSize,
Block block, String inAddr, int namespaceId, DataNode datanode)
throws FileNotFoundException, IOException {
if (this.blockDataWriter == null) {
blockDataWriter = blockDataFile.getWriter(-1);
}
if (this.cout == null) {
this.cout = new FileOutputStream(
new RandomAccessFile(metafile, "rw").getFD());
}
checksumOut = new DataOutputStream(new BufferedOutputStream(cout,
FSConstants.SMALL_BUFFER_SIZE));
setParameters(bytesPerChecksum, checksumSize, block, inAddr, namespaceId,
datanode);
}
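  /**
   * Applies posix_fadvise to the block data file over the byte range
   * [offset, offset + len), when the platform supports it. The advise value
   * follows the POSIX constants (e.g. POSIX_FADV_DONTNEED to drop cached
   * pages).
   */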
@Override
public void fadviseStream(int advise, long offset, long len)
throws IOException {
fadviseStream(advise, offset, len, false);
}
@Override
public void fadviseStream(int advise, long offset, long len, boolean sync)
throws IOException {
if (LOG.isDebugEnabled()) {
LOG.debug("posix_fadvise with advise : " + advise + " for : "
+ blockDataFile.getFile());
}
blockDataWriter.posixFadviseIfPossible(offset, len, advise, sync);
}
@Override
public void writeHeader(DataChecksum checksum) throws IOException {
BlockMetadataHeader.writeHeader(checksumOut, checksum);
}
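  /**
   * Writes one packet worth of data and per-chunk checksums. Only
   * PACKET_VERSION_CHECKSUM_FIRST is supported, i.e. the packet buffer
   * carries all chunk checksums first, followed by the data bytes (layout
   * sketch, field widths illustrative):
   * <pre>
   *   pktBufStartOff                           dataOff
   *   v                                        v
   *   | crc 1 | crc 2 | ... | crc numChunks |  | data (len bytes) |
   * </pre>
   * If the packet does not begin on a chunk boundary, the first checksum is
   * concatenated with the running partial CRC before being written out.
   */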
@Override
public void writePacket(byte pktBuf[], int len, int dataOff,
int pktBufStartOff, int numChunks, int packetVersion) throws IOException {
if (packetVersion != DataTransferProtocol.PACKET_VERSION_CHECKSUM_FIRST) {
throw new IOException(
"non-inline checksum doesn't support packet version " + packetVersion);
}
if (len == 0) {
return;
}
    // Finally, write the data bytes to disk.
blockDataWriter.write(pktBuf, dataOff, len);
boolean lastChunkStartsFromChunkStart = false;
if (firstChunkOffset > 0) {
      // The packet doesn't start at the beginning of a chunk, so the
      // checksums of the two pieces must be concatenated.
int crcPart2 = DataChecksum.getIntFromBytes(pktBuf, pktBufStartOff);
partialCrcInt = CrcConcat.concatCrc(partialCrcInt, crcPart2,
Math.min(len, bytesPerChecksum - firstChunkOffset));
byte[] tempBuf = new byte[4];
DataChecksum.writeIntToBuf(partialCrcInt, tempBuf, 0);
checksumOut.write(tempBuf);
if (numChunks > 1) {
        // write the remaining chunks' checksums.
checksumOut.write(pktBuf, pktBufStartOff + checksumSize, (numChunks - 1)
* checksumSize);
lastChunkStartsFromChunkStart = true;
}
} else {
checksumOut.write(pktBuf, pktBufStartOff, numChunks * checksumSize);
lastChunkStartsFromChunkStart = true;
}
firstChunkOffset = (firstChunkOffset + len) % bytesPerChecksum;
if (firstChunkOffset > 0 && lastChunkStartsFromChunkStart) {
      // The last chunk is partial and starts on a chunk boundary; remember
      // its checksum so the next packet can extend it.
partialCrcInt = DataChecksum.getIntFromBytes(pktBuf, pktBufStartOff
+ (numChunks - 1) * checksumSize);
}
}
  /**
   * Retrieves the offset in the block at which the next write will write
   * data.
   */
public long getChannelPosition() throws IOException {
return blockDataWriter.getChannelPosition();
}
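  /**
   * Maps a byte offset in the block to the file offset of the corresponding
   * chunk checksum in the meta file. Worked example (numbers illustrative,
   * not from this source): with bytesPerChecksum = 512, checksumSize = 4 and
   * the usual 7-byte header, offsetInBlock = 1300 lies in chunk
   * 1300 / 512 = 2, so its checksum starts at 7 + 2 * 4 = 15.
   */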
private long getChecksumOffset(long offsetInBlock) {
return BlockMetadataHeader.getHeaderSize() + offsetInBlock
/ bytesPerChecksum * checksumSize;
}
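  /**
   * Positions both files for a write starting at offsetInBlock. If the write
   * begins mid-chunk, the pre-existing bytes of that partial chunk are read
   * back, their CRC recomputed and verified against the stored checksum, so
   * that subsequent writes can extend the chunk.
   */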
@Override
public void setPosAndRecomputeChecksumIfNeeded(long offsetInBlock, DataChecksum checksum) throws IOException {
firstChunkOffset = (int) (offsetInBlock % bytesPerChecksum);
if (getChannelPosition() == offsetInBlock) {
if (firstChunkOffset > 0) {
// Partial block, need to seek checksum stream back.
setChecksumOffset(getChecksumOffset(offsetInBlock));
}
return; // nothing to do
}
long offsetInChecksum = getChecksumOffset(offsetInBlock);
if (blockDataWriter != null) {
blockDataWriter.flush();
}
if (checksumOut != null) {
checksumOut.flush();
}
// If this is a partial chunk, then read in pre-existing checksum
if (offsetInBlock % bytesPerChecksum != 0) {
LOG.info("setBlockPosition trying to set position to " + offsetInBlock
+ " for block " + block
+ " which is not a multiple of bytesPerChecksum " + bytesPerChecksum);
computePartialChunkCrc(offsetInBlock, offsetInChecksum, bytesPerChecksum, checksum);
}
if (LOG.isDebugEnabled()) {
LOG.debug("Changing block file offset of block " + block + " from "
+ getChannelPosition() + " to " + offsetInBlock
+ " meta file offset to " + offsetInChecksum);
}
// set the position of the block file
setChannelPosition(offsetInBlock, offsetInChecksum);
}
  /**
   * Sets the offset in the block at which the next write will write data.
   */
public void setChannelPosition(long dataOffset, long ckOffset)
throws IOException {
    long channelSize = blockDataWriter.getChannelSize();
    if (channelSize < dataOffset) {
      String fileName;
      if (datanode.data instanceof FSDataset) {
        FSDataset fsDataset = (FSDataset) datanode.data;
        fileName = fsDataset.getDatanodeBlockInfo(namespaceId, block)
            .getBlockDataFile().getTmpFile(namespaceId, block).toString();
      } else {
        fileName = "unknown";
      }
      String msg = "Trying to change block file offset of block " + block
          + " file " + fileName + " to " + dataOffset
          + " but actual size of file is " + channelSize;
      throw new IOException(msg);
    }
if (dataOffset % bytesPerChecksum != 0 && channelSize != dataOffset) {
DFSClient.LOG.warn("Non-inline Checksum Block " + block
+ " channel size " + channelSize + " but data starts from "
+ dataOffset);
}
blockDataWriter.position(dataOffset);
setChecksumOffset(ckOffset);
}
private void setChecksumOffset(long ckOffset) throws IOException {
FileOutputStream file = (FileOutputStream) cout;
if (ckOffset > file.getChannel().size()) {
throw new IOException("Set position over the end of the checksum file.");
}
file.getChannel().position(ckOffset);
}
/**
 * Reads back the pre-existing bytes of the partial chunk from the block
 * file and recomputes their CRC, verifying it against the checksum stored
 * in the meta file.
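 * <p>
 * Worked example (numbers illustrative, not from this source): appending
 * at offsetInBlock = 1000 with bytesPerChecksum = 512 leaves a partial
 * chunk of 1000 % 512 = 488 bytes starting at offset 512; those bytes are
 * read back, their CRC recomputed, and the result compared with the stored
 * checksum before new bytes may extend the chunk.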
*/
private void computePartialChunkCrc(long blkoff, long ckoff,
int bytesPerChecksum, DataChecksum checksum) throws IOException {
// find offset of the beginning of partial chunk.
//
int sizePartialChunk = (int) (blkoff % bytesPerChecksum);
int checksumSize = checksum.getChecksumSize();
blkoff = blkoff - sizePartialChunk;
LOG.info("computePartialChunkCrc sizePartialChunk " + sizePartialChunk
+ " block " + block + " offset in block " + blkoff
+ " offset in metafile " + ckoff);
// create an input stream from the block file
// and read in partial crc chunk into temporary buffer
//
byte[] buf = new byte[sizePartialChunk];
byte[] crcbuf = new byte[checksumSize];
FileInputStream dataIn = null, metaIn = null;
try {
DatanodeBlockInfo info = datanode.data.getDatanodeBlockInfo(namespaceId,
block);
if (info == null) {
throw new IOException("Block " + block
+ " does not exist in volumeMap.");
}
File blockFile = info.getDataFileToRead();
if (blockFile == null) {
blockFile = info.getBlockDataFile().getTmpFile(namespaceId, block);
}
RandomAccessFile blockInFile = new RandomAccessFile(blockFile, "r");
if (blkoff > 0) {
blockInFile.seek(blkoff);
}
File metaFile = getMetaFile(blockFile, block);
RandomAccessFile metaInFile = new RandomAccessFile(metaFile, "r");
if (ckoff > 0) {
metaInFile.seek(ckoff);
}
dataIn = new FileInputStream(blockInFile.getFD());
metaIn = new FileInputStream(metaInFile.getFD());
IOUtils.readFully(dataIn, buf, 0, sizePartialChunk);
      // read in the crc value computed when the file was last closed
IOUtils.readFully(metaIn, crcbuf, 0, crcbuf.length);
} finally {
if (dataIn != null) {
dataIn.close();
}
if (metaIn != null) {
metaIn.close();
}
}
// compute crc of partial chunk from data read in the block file.
Checksum partialCrc = new NativeCrc32();
partialCrc.update(buf, 0, sizePartialChunk);
LOG.info("Read in partial CRC chunk from disk for block " + block);
// paranoia! verify that the pre-computed crc matches what we
// recalculated just now
if (partialCrc.getValue() != FSInputChecker.checksum2long(crcbuf)) {
      String msg = "Partial CRC " + partialCrc.getValue()
          + " does not match the value computed the last time the file"
          + " was closed: " + FSInputChecker.checksum2long(crcbuf);
throw new IOException(msg);
}
// LOG.debug("Partial CRC matches 0x" +
// Long.toHexString(partialCrc.getValue()));
partialCrcInt = (int) partialCrc.getValue();
}
  /**
   * Flushes the data and checksum out to the underlying streams. Pass
   * {@code forceSync = true} to also force them down to disk.
   *
   * @throws IOException
   */
@Override
public void flush(boolean forceSync)
throws IOException {
if (checksumOut != null) {
checksumOut.flush();
if (forceSync && (cout instanceof FileOutputStream)) {
((FileOutputStream) cout).getChannel().force(true);
}
}
if (blockDataWriter != null) {
blockDataWriter.flush();
if (forceSync) {
blockDataWriter.force(true);
}
}
}
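  /**
   * Queues the last {@code lastBytesToSync} bytes written to the block data
   * file for writeback via sync_file_range, when the platform supports it,
   * without blocking on a full fsync.
   */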
@Override
public void fileRangeSync(long lastBytesToSync, int flags) throws IOException {
    if (blockDataWriter != null && lastBytesToSync > 0) {
      // The sync range must be computed from the data file's position, not
      // the checksum stream's.
      long pos = blockDataWriter.getChannelPosition();
      long startOffset = pos - lastBytesToSync;
      if (startOffset < 0) {
        startOffset = 0;
      }
      if (LOG.isDebugEnabled()) {
        LOG.debug("file_range_sync " + block + " channel position " + pos
            + " offset " + startOffset);
      }
      blockDataWriter.syncFileRangeIfPossible(startOffset, pos - startOffset,
          flags);
    }
}
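  /**
   * Truncates the block to newlen bytes and rewrites the checksum of the new
   * last (possibly partial) chunk. Worked example (numbers illustrative, not
   * from this source): with bytesPerChecksum = 512, truncating to
   * newlen = 1300 keeps ceil(1300 / 512) = 3 checksums; the last chunk spans
   * bytes 1024..1299, so those 276 bytes are re-read, their checksum is
   * recomputed and rewritten, and the meta file is truncated to
   * header + 3 * checksumSize bytes.
   */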
public void truncateBlock(long oldBlockFileLen, long newlen)
throws IOException {
if (newlen == 0) {
// Special case for truncating to 0 length, since there's no previous
// chunk.
RandomAccessor ra = blockDataFile.getRandomAccessor();
try {
// truncate blockFile
ra.setLength(newlen);
} finally {
ra.close();
}
// update metaFile
RandomAccessFile metaRAF = new RandomAccessFile(metafile, "rw");
try {
metaRAF.setLength(BlockMetadataHeader.getHeaderSize());
} finally {
metaRAF.close();
}
return;
}
DataChecksum dcs = BlockMetadataHeader.readHeader(metafile).getChecksum();
int checksumsize = dcs.getChecksumSize();
int bpc = dcs.getBytesPerChecksum();
long newChunkCount = (newlen - 1) / bpc + 1;
long newmetalen = BlockMetadataHeader.getHeaderSize() + newChunkCount
* checksumsize;
long lastchunkoffset = (newChunkCount - 1) * bpc;
int lastchunksize = (int) (newlen - lastchunkoffset);
byte[] b = new byte[Math.max(lastchunksize, checksumsize)];
RandomAccessor ra = blockDataFile.getRandomAccessor();
try {
// truncate blockFile
ra.setLength(newlen);
// read last chunk
ra.seek(lastchunkoffset);
ra.readFully(b, 0, lastchunksize);
} finally {
ra.close();
}
// compute checksum
dcs.update(b, 0, lastchunksize);
dcs.writeValue(b, 0, false);
// update metaFile
RandomAccessFile metaRAF = new RandomAccessFile(metafile, "rw");
try {
metaRAF.setLength(newmetalen);
metaRAF.seek(newmetalen - checksumsize);
metaRAF.write(b, 0, checksumsize);
} finally {
metaRAF.close();
}
}
@Override
public void close() throws IOException {
close(0);
}
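  /**
   * Closes the checksum and data streams, forcing them to disk first when
   * datanode.syncOnClose is set. A non-zero fadvise value is applied to the
   * whole data file (offset 0, length 0) before closing.
   */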
public void close(int fadvise) throws IOException {
IOException ioe = null;
// close checksum file
try {
if (checksumOut != null) {
try {
checksumOut.flush();
if (datanode.syncOnClose && (cout instanceof FileOutputStream)) {
((FileOutputStream) cout).getChannel().force(true);
}
} finally {
checksumOut.close();
checksumOut = null;
}
}
} catch (IOException e) {
ioe = e;
}
// close block file
try {
if (blockDataWriter != null) {
try {
blockDataWriter.flush();
if (datanode.syncOnClose) {
blockDataWriter.force(true);
}
if (fadvise != 0) {
fadviseStream(fadvise, 0, 0, true);
}
} finally {
blockDataWriter.close();
blockDataWriter = null;
}
}
} catch (IOException e) {
ioe = e;
}
// disk check
// We don't check disk for ClosedChannelException as close() can be
// called twice and it is possible that out.close() throws.
// No need to check or recheck disk then.
//
if (ioe != null) {
if (!(ioe instanceof ClosedChannelException)) {
datanode.checkDiskError(ioe);
}
throw ioe;
}
}
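  /**
   * Builds the meta file name for a block file name and generation stamp,
   * e.g. (illustrative) "blk_3141" with genStamp 1001 becomes
   * "blk_3141_1001.meta", assuming METADATA_EXTENSION is ".meta".
   */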
static String getMetaFileName(String blockFileName, long genStamp) {
return blockFileName + "_" + genStamp + FSDataset.METADATA_EXTENSION;
}
  public static File getMetaFile(File f, Block b) {
return new File(getMetaFileName(f.getAbsolutePath(),
b.getGenerationStamp()));
}
/** Find the corresponding meta data file from a given block file */
public static File findMetaFile(final File blockFile) throws IOException {
return findMetaFile(blockFile, false);
}
static File findMetaFile(final File blockFile, boolean missingOk)
throws IOException {
final String prefix = blockFile.getName() + "_";
final File parent = blockFile.getParentFile();
File[] matches = parent.listFiles(new FilenameFilter() {
public boolean accept(File dir, String name) {
return dir.equals(parent)
&& name.startsWith(prefix) && name.endsWith(FSDataset.METADATA_EXTENSION);
}
});
if (matches == null || matches.length == 0) {
if (missingOk) {
return null;
} else {
throw new IOException("Meta file not found, blockFile=" + blockFile);
}
    } else if (matches.length > 1) {
      throw new IOException("Found more than one meta file: "
+ Arrays.asList(matches));
}
return matches[0];
}
}