/**
* Copyright 2008 - CommonCrawl Foundation
*
* CommonCrawl licenses this file to you under the Apache License,
* Version 2.0 (the "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.commoncrawl.hadoop.io.deprecated;
import java.io.BufferedReader;
import java.io.EOFException;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PushbackInputStream;
import java.io.SequenceInputStream;
import java.nio.ByteBuffer;
import java.nio.channels.Channels;
import java.nio.channels.ReadableByteChannel;
import java.nio.charset.Charset;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.LinkedList;
import java.util.StringTokenizer;
import java.util.Vector;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;
import java.util.zip.CRC32;
import java.util.zip.CheckedInputStream;
import java.util.zip.Inflater;
import java.util.zip.InflaterInputStream;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.commoncrawl.crawl.common.shared.Constants;
import org.commoncrawl.io.shared.NIODataSink;
import org.commoncrawl.protocol.shared.ArcFileHeaderItem;
import org.commoncrawl.protocol.shared.ArcFileItem;
import org.junit.Assert;
import org.junit.Test;
import org.apache.hadoop.record.Buffer;
import org.apache.hadoop.util.StringUtils;
/**
* Decompresses ARC file and returns documents as individual ArcFileItems
*
* @author rana
*
*/
public final class ArcFileReader extends InflaterInputStream implements
NIODataSink {
// ////////////////////////////////////////////////////////////////////////////////
// data members
// ////////////////////////////////////////////////////////////////////////////////
/** logging **/
private static final Log LOG = LogFactory.getLog(ArcFileReader.class);
/** internal ByteBuffer wrapper for queuing byte buffers **/
private static final class BufferItem {
public BufferItem(ByteBuffer bufferItem) {
_buffer = bufferItem;
}
public ByteBuffer _buffer;
};
/**
* helper dummy stream used to get around some stupid design decisions in
* InflaterInputStream
*/
private static InputStream _dummyStream = new InputStream() {
@Override
public int read()
throws IOException {
return 0;
}
};
/** blocking consumer queue **/
private LinkedBlockingQueue<BufferItem> _consumerQueue = null;
/** 32 bit crc **/
private CRC32 _crc = new CRC32();
/** flag indicating that this arc file has a header item **/
private boolean _hasHeaderItem = true;
/** the arc file header , when available **/
private String _arcFileHeader = null;
/** End Of Stream Indicator **/
private boolean _eosReached = false;
/** block size used for various operations **/
public static final int DEFAULT_BLOCK_SIZE = 32 * 1024;
/** default buffer queue size **/
public static final int DEFAULT_BUFFER_QUEUE_SIZE = 1024;
/** default timeout value **/
public static final int DEFAULT_TIMEOUT_VALUE = -1;
/** allocation block size **/
private static int _blockSize = DEFAULT_BLOCK_SIZE;
/** buffer queue size **/
private static int _bufferQueueSize = DEFAULT_BUFFER_QUEUE_SIZE;
/** timeout value for blocking input stream operations **/
private static int _ioTimeoutValue = DEFAULT_TIMEOUT_VALUE;
/** stream pos variable **/
private int _streamPos = 0;
// ////////////////////////////////////////////////////////////////////////////////
// public API
// ////////////////////////////////////////////////////////////////////////////////
/**
* Costructs a new ArcFileReader object with specified block size (for
* allocations)
*/
public ArcFileReader() {
super(_dummyStream, new Inflater(true), _blockSize);
// set up buffer queue ...
_consumerQueue = new LinkedBlockingQueue<BufferItem>(_bufferQueueSize);
// setup the proper stream...
super.in = new PushbackInputStream(new InputStream() {
ByteBuffer _activeBuffer = null;
byte oneByteArray[] = new byte[1];
@Override
public int read() throws IOException {
if (read(oneByteArray, 0, 1) != -1) {
return oneByteArray[0] & 0xff;
}
return -1;
}
@Override
public int read(byte b[], int off, int len) throws IOException {
if (_activeBuffer == null || _activeBuffer.remaining() == 0) {
BufferItem nextItem = null;
try {
// when io timeout is not specified, block indefinitely...
if (_ioTimeoutValue == -1) {
nextItem = _consumerQueue.take();
}
// otherwise wait for specified time on io
else {
nextItem = _consumerQueue.poll(_ioTimeoutValue,
TimeUnit.MILLISECONDS);
if (nextItem == null) {
throw new IOException("IO Timeout waiting for Buffer");
}
}
} catch (InterruptedException e) {
throw new IOException("Thread Interrupted waiting for Buffer");
}
if (nextItem._buffer == null) {
_eosReached = true;
// EOF CONDITION ...
return -1;
} else {
_activeBuffer = nextItem._buffer;
}
}
final int sizeAvailable = _activeBuffer.remaining();
final int readSize = Math.min(sizeAvailable, len);
_activeBuffer.get(b, off, readSize);
_streamPos += readSize;
return readSize;
}
}, _blockSize);
}
/** set the default block size **/
public static void setBlockSize(int blockSize) {
_blockSize = blockSize;
}
/** set the buffer queue size **/
public static void setBufferQueueSize(int bufferQueueSize) {
_bufferQueueSize = bufferQueueSize;
}
/** set the default timeout value for blocking io operations **/
public static void setIOTimeoutValue(int timeoutInMilliseconds) {
_ioTimeoutValue = timeoutInMilliseconds;
}
/** indicate whether this arc file has a header item **/
public void setArcFileHasHeaderItemFlag(boolean value) {
_hasHeaderItem = value;
}
/**
* Reset all interal variables and get the Reader ready to process a new
* ArcFile
*/
public void resetState() {
_arcFileHeader = null;
_consumerQueue.clear();
_crc.reset();
_eosReached = false;
resetInflater();
}
/**
* Checks to see if additional ArcFileItems can be extracted from the current
* ARC File Stream This is a BLOCKING CALL - it will block on
* _consumerQueue.take if no data is available...
*
* @return true if another ArcFileItem can be extracted from the stream.
* @throws IOException
* if an error occurs processing ARC file data
*/
public boolean hasMoreItems() throws IOException {
if (_arcFileHeader == null && _hasHeaderItem) {
readARCHeader();
}
return readHeader();
}
/**
* blocking call to retrieve next ArcFileItem contained within an ARC File
*
* @return Fully constructed ArcFileItem
* @throws IOException
*/
public void getNextItem(ArcFileItem itemOut) throws IOException {
// preserve incoming arc file name ...
String arcFileName = itemOut.getArcFileName();
// reset item
itemOut.clear();
// restore arc file name
itemOut.setArcFileName(arcFileName);
// read content
_crc.reset();
// and reset inflater
resetInflater();
// set the arc file stream positon up front
itemOut.setArcFilePos(getARCFileStreamPos());
ArcFileBuilder builder = new ArcFileBuilder(itemOut);
// read header line buffer
for (;;) {
byte scanBuffer[] = new byte[_blockSize];
ByteBuffer byteBuffer = ByteBuffer.wrap(scanBuffer);
// read up to scan buffer size of data ...
int readAmount = read(scanBuffer, 0, scanBuffer.length);
if (readAmount != -1) {
// update crc calculation
_crc.update(scanBuffer, 0, readAmount);
// and limit byte buffer ...
byteBuffer.limit(readAmount);
// and then input data input builder
builder.inputData(byteBuffer);
} else {
// validate crc and header length ...
readTrailer();
builder.finish();
// set the compressed content size ...
itemOut.setArcFileSize(getARCFileStreamPos() - itemOut.getArcFilePos());
return;
}
}
}
/**
* NIODataSink method - called by implementor when all ARC File data has been
* exhauseted
*
*/
public void finished() {
try {
_consumerQueue.put(new BufferItem(null));
} catch (InterruptedException e) {
}
}
/**
* NIODataSink method - called by the implementor to queue up compressed ARC
* File data for processing
*/
public void available(ByteBuffer availableReadBuffer) {
try {
_consumerQueue.put(new BufferItem(availableReadBuffer));
} catch (InterruptedException e) {
}
}
// ////////////////////////////////////////////////////////////////////////////////
// internal helpers
// ////////////////////////////////////////////////////////////////////////////////
private void resetInflater() {
inf.reset();
}
private void readARCHeader() throws IOException {
readHeader();
byte accumBuffer[] = new byte[4096];
int accumAmount = 0;
int readAmt = 0;
while ((readAmt = this.read(accumBuffer, accumAmount, accumBuffer.length
- accumAmount)) > 0) {
accumAmount += readAmt;
if (accumAmount == accumBuffer.length) {
throw new IOException("Invalid ARC File Header");
}
}
if (readAmt == 0 || accumAmount == 0) {
throw new IOException("Invalid ARC File Header");
} else {
// calculate header crc ...
_crc.reset();
_crc.update(accumBuffer, 0, accumAmount);
// validate crc and header length ...
readTrailer();
// and decode header string ...
_arcFileHeader = new String(accumBuffer, 0, accumAmount, "ISO-8859-1");
}
}
/**
* GZIP Code derived from GZIPInputStream code
*/
// GZIP header magic number.
private final static int GZIP_MAGIC = 0x8b1f;
/*
* File header flags.
*/
private final static int FHCRC = 2; // Header CRC
private final static int FEXTRA = 4; // Extra field
private final static int FNAME = 8; // File name
private final static int FCOMMENT = 16; // File comment
/*
* Reads GZIP member header.
*/
private boolean readHeader() throws IOException {
if (!_eosReached) {
CheckedInputStream in = new CheckedInputStream(this.in, _crc);
_crc.reset();
try {
// Check header magic
if (readUShort(in) != GZIP_MAGIC) {
throw new IOException("Not in GZIP format");
}
// Check compression method
if (readUByte(in) != 8) {
throw new IOException("Unsupported compression method");
}
// Read flags
int flg = readUByte(in);
// Skip MTIME, XFL, and OS fields
skipBytes(in, 6);
// Skip optional extra field
if ((flg & FEXTRA) == FEXTRA) {
skipBytes(in, readUShort(in));
}
// Skip optional file name
if ((flg & FNAME) == FNAME) {
while (readUByte(in) != 0)
;
}
// Skip optional file comment
if ((flg & FCOMMENT) == FCOMMENT) {
while (readUByte(in) != 0)
;
}
// Check optional header CRC
if ((flg & FHCRC) == FHCRC) {
int v = (int) _crc.getValue() & 0xffff;
if (readUShort(in) != v) {
throw new IOException("Corrupt GZIP header");
}
}
return true;
} catch (EOFException e) {
}
}
return false;
}
/*
* Reads GZIP member trailer.
*/
private void readTrailer() throws IOException {
PushbackInputStream in = (PushbackInputStream) this.in;
int n = inf.getRemaining();
if (n > 0) {
in.unread(buf, len - n, n);
}
// Uses left-to-right evaluation order
if ((readUInt(in) != _crc.getValue()) ||
// rfc1952; ISIZE is the input size modulo 2^32
(readUInt(in) != (inf.getBytesWritten() & 0xffffffffL)))
throw new IOException("Corrupt GZIP trailer");
}
/*
* Reads unsigned integer in Intel byte order.
*/
private static long readUInt(InputStream in) throws IOException {
long s = readUShort(in);
return ((long) readUShort(in) << 16) | s;
}
/*
* Reads unsigned short in Intel byte order.
*/
private static int readUShort(InputStream in) throws IOException {
int b = readUByte(in);
return ((int) readUByte(in) << 8) | b;
}
/*
* Reads unsigned byte.
*/
private static int readUByte(InputStream in) throws IOException {
int b = in.read();
if (b == -1) {
throw new EOFException();
}
if (b < -1 || b > 255) {
// Report on this.in, not argument in; see read{Header, Trailer}.
throw new IOException("read() returned value out of range -1..255: " + b);
}
return b;
}
private byte[] tmpbuf = new byte[128];
/*
* Skips bytes of input data blocking until all bytes are skipped. Does not
* assume that the input stream is capable of seeking.
*/
private void skipBytes(InputStream in, int n) throws IOException {
while (n > 0) {
int len = in.read(tmpbuf, 0, n < tmpbuf.length ? n : tmpbuf.length);
if (len == -1) {
throw new EOFException();
}
n -= len;
}
}
/**
* calculated raw arc file stream pos (taking into account any buffered data
* contained within PushBackInputStream
*
* @return current stream position in bytes
* @throws IOException
* if error occurs
*/
private final int getARCFileStreamPos() throws IOException {
PushbackInputStream in = (PushbackInputStream) this.in;
return _streamPos - in.available();
}
/**
*
* ArcFileBuilder helper class - used to construct ArcFileItem objects from an
* ARC File Entry in a stateful manner
*
*/
public static class ArcFileBuilder {
// various states of processing an ARC FILE
private enum State {
LookingForMetadata, LookingForHeaderTerminator, ReadingContent, Finished
}
// ARC FILE HEADER TIMESTAMP FORMAT
// Note: Not Thread-Safe, so every instance of builder needs its own copy
// ...
SimpleDateFormat TIMESTAMP14 = new SimpleDateFormat(
"yyyyMMddHHmmss");
// ArcFileItem this builder returns
ArcFileItem _item = null;
// the content buffer associated this item ...
Buffer _buffer = new Buffer();
// Builder State
State _state = State.LookingForMetadata;
// Queued Input State
LinkedList<ByteBuffer> _buffers = new LinkedList<ByteBuffer>();
// Active Input Buffer
ByteBuffer _activeBuffer = null;
// last matched pattern char
byte lastMatchChar = 0;
// match count
int matchCount = 0;
// End Of Stream Indicator
boolean eos = false;
// Charsets used during decoding process
static Charset UTF8_Charset = Charset.forName("UTF8");
static Charset ASCII_Charset = Charset.forName("ASCII");
/**
* Constructor
*
* @param itemToConstruct
* - the ArcFileItem to build
*/
public ArcFileBuilder(ArcFileItem itemToConstruct) {
_item = itemToConstruct;
}
private final boolean checkForCRLFTerminator(byte matchingChar) {
if (matchingChar == '\n') {
switch (matchCount) {
case 1:
++matchCount;
break;
case 3:
matchCount = 0;
return true;
default:
matchCount = 0;
}
} else if (matchingChar == '\r') {
switch (matchCount) {
case 2:
++matchCount;
break;
default:
matchCount = 1;
break;
}
} else {
matchCount = 0;
}
return false;
}
/** check for terminator pattern **/
private final boolean checkForTerminator(byte matchingChar) {
boolean terminatorFound = false;
switch (_state) {
// metadata line is terminated by a single line-feed
case LookingForMetadata: {
if (matchingChar == '\n') {
terminatorFound = true;
}
}
break;
// http headers are terminated by the standard crlf-crlf pattern
case LookingForHeaderTerminator: {
terminatorFound = checkForCRLFTerminator(matchingChar);
}
break;
}
if (terminatorFound) {
// reset state ...
matchCount = 0;
// if active buffer contains no more characters...
if (_activeBuffer.remaining() == 0) {
// add entire active buffer to input state
_activeBuffer.rewind();
_buffers.addLast(_activeBuffer);
_activeBuffer = null;
} else {
// otherwise, slice buffer at current position, and
// add one buffer to input state, and make the other current
ByteBuffer oldBuffer = _activeBuffer;
_activeBuffer = _activeBuffer.slice();
oldBuffer.limit(oldBuffer.position());
oldBuffer.rewind();
_buffers.addLast(oldBuffer);
}
}
return terminatorFound;
}
/**
* newInputStream
*
* @param buf
* - ByteBuffer to wrap as an InputStream
* @return InputStream - wrapped InputStream object
*/
private static InputStream newInputStream(final ByteBuffer buf) {
return new InputStream() {
public synchronized int read() throws IOException {
if (!buf.hasRemaining()) {
return -1;
}
return buf.get();
}
public synchronized int read(byte[] bytes, int off, int len)
throws IOException {
// Read only what's left
len = Math.min(len, buf.remaining());
buf.get(bytes, off, len);
return len;
}
};
}
/** construct a reader given a list of ByteBuffers **/
private static InputStreamReader readerFromScanBufferList(
LinkedList<ByteBuffer> buffers, Charset charset) throws IOException {
Vector<InputStream> inputStreams = new Vector<InputStream>();
for (ByteBuffer buffer : buffers) {
inputStreams.add(newInputStream(buffer));
}
buffers.clear();
SequenceInputStream seqInputStream = new SequenceInputStream(inputStreams
.elements());
;
return new InputStreamReader(seqInputStream, charset);
}
/** construct a single line from the current input state **/
private final String readLine(Charset charset) throws IOException {
BufferedReader reader = new BufferedReader(readerFromScanBufferList(
_buffers, charset));
return reader.readLine();
}
/** process the metadata line of an ARC File Entry **/
private final void processMetadataLine(String metadata) throws IOException {
StringTokenizer tokenizer = new StringTokenizer(metadata, " ");
int tokenCount = 0;
while (tokenizer.hasMoreElements() && tokenCount <= 5) {
switch (++tokenCount) {
// URI
case 1: {
_item.setUri(tokenizer.nextToken());
}
break;
// Host IP Address
case 2: {
_item.setHostIP(tokenizer.nextToken());
}
break;
// Timestamp
case 3: {
String timeStamp = tokenizer.nextToken();
try {
_item.setTimestamp(TIMESTAMP14.parse(timeStamp).getTime());
} catch (ParseException e) {
LOG.error("Invalid Timestamp Encountered in Item Metdata. URL:"
+ _item.getUri() + " Timestamp:" + timeStamp + " Metadata:"
+ metadata);
_item.setTimestamp(0);
}
}
break;
// MimeType
case 4: {
_item.setMimeType(tokenizer.nextToken());
}
break;
// and Record Length
case 5: {
_item.setRecordLength(Integer.parseInt(tokenizer.nextToken()));
}
break;
}
}
}
/** extract http headers from the current input state **/
private final void processHeaders() throws IOException {
BufferedReader reader = new BufferedReader(readerFromScanBufferList(
_buffers, ArcFileBuilder.UTF8_Charset));
String line = null;
_item.setFieldDirty(ArcFileItem.Field_HEADERITEMS);
while ((line = reader.readLine()) != null) {
if (line.length() != 0) {
int colonPos = line.indexOf(':');
ArcFileHeaderItem item = new ArcFileHeaderItem();
if (colonPos != -1 && colonPos != line.length() - 1) {
item.setItemKey(line.substring(0, colonPos));
item.setItemValue(line.substring(colonPos + 1));
// if this is our special truncation flag ...
if (item.getItemKey().equals(
Constants.ARCFileHeader_ContentTruncated)) {
String parts[] = item.getItemValue().split(",");
for (String part : parts) {
if (part.equals(ArcFileItem.Flags
.toString(ArcFileItem.Flags.TruncatedInInflate))) {
_item.setFlags(_item.getFlags()
| ArcFileItem.Flags.TruncatedInDownload);
} else if (part.equals(ArcFileItem.Flags
.toString(ArcFileItem.Flags.TruncatedInInflate))) {
_item.setFlags(_item.getFlags()
| ArcFileItem.Flags.TruncatedInInflate);
}
}
}
} else {
item.setItemValue(line);
}
_item.getHeaderItems().add(item);
}
}
}
/** transition from the current input state to the next input state **/
private final void transitionState() throws IOException {
switch (_state) {
case LookingForMetadata: {
// decode the string as a utf-8 string
processMetadataLine(readLine(ASCII_Charset));
// and advance to next state ...
_state = ArcFileBuilder.State.LookingForHeaderTerminator;
}
break;
case LookingForHeaderTerminator: {
// found header terminator
processHeaders();
// and advance to next state ...
_state = ArcFileBuilder.State.ReadingContent;
// and set up arc file item for read ...
_buffer.setCapacity(_blockSize);
}
break;
}
}
/**
* inform builder that input for the current item has been exhauseted
*
* @return ArcFileItem - the fully constructed ArcFileItem object if
* construction was successfull
* @throws IOException
* - if building fails
*/
public final void finish() throws IOException {
if (_state == State.ReadingContent) {
_state = State.Finished;
// generate warning in case of zero content edge case ...
if (_buffer.getCount() == 0) {
LOG
.error("ArcFileBuilder Encountered Item with Zero Length Content. URI:"
+ _item.getUri());
} else {
_item.setContent(_buffer,false);
_buffer = new Buffer();
}
_item = null;
} else {
throw new IOException(
"ArcBuilder finish calledin Invalid State. State:" + _state
+ " ArcFile:" + _item.getArcFileName() + " Position:"
+ _item.getArcFilePos() + " Item URI:" + _item.getUri());
}
}
/**
* Input Data into the builder
*
* @param buffer
* - a piece of uncompressed content
* @throws IOException
* - throws exception if building fails
*/
public final void inputData(ByteBuffer buffer) throws IOException {
// set the buffer as the active buffer ...
_activeBuffer = buffer;
// scan looking for terminator
while (_activeBuffer != null && _activeBuffer.remaining() != 0) {
// if not reading content then
if (_state != ArcFileBuilder.State.ReadingContent) {
// read a byte at a time ...
byte b = _activeBuffer.get();
// and if the byte is a delimiter ...
if (b == '\r' || b == '\n') {
// and check for pattern match (terminator match)
if (checkForTerminator(b)) {
transitionState();
}
}
// otherwise reset pattern buffer
else {
matchCount = 0;
}
} else {
// calculate available storage in buffer ...
int available = _buffer.getCapacity() - _buffer.getCount();
// if we need more room ...
if (available < _activeBuffer.remaining()) {
// figure out how much to grow buffer by ...
int growByAmount = Math.max(_activeBuffer.remaining() - available,
_blockSize * 2);
// and grow the buffer ...
_buffer.setCapacity(_buffer.getCapacity() + growByAmount);
}
// copy the buffer data in one go ...
_buffer.append(_activeBuffer.array(), _activeBuffer.position()
+ _activeBuffer.arrayOffset(), _activeBuffer.remaining());
_activeBuffer = null;
}
}
// now if we reached the end of the buffer while scanning for a token ...
if (_activeBuffer != null) {
// add entire buffer to buffer list ...
_activeBuffer.rewind();
_buffers.add(_activeBuffer);
_activeBuffer = null;
}
}
}
// ////////////////////////////////////////////////////////////////////////////////
// test routines
// ////////////////////////////////////////////////////////////////////////////////
public void checkCRLFStateMachine() throws Exception {
ArcFileItem item = new ArcFileItem();
ArcFileBuilder builder = new ArcFileBuilder(item);
Assert.assertFalse(builder.checkForCRLFTerminator((byte) '\r'));
Assert.assertFalse(builder.checkForCRLFTerminator((byte) '\n'));
Assert.assertFalse(builder.checkForCRLFTerminator((byte) '\r'));
Assert.assertTrue(builder.checkForCRLFTerminator((byte) '\n'));
Assert.assertFalse(builder.checkForCRLFTerminator((byte) '\r'));
Assert.assertFalse(builder.checkForCRLFTerminator((byte) '\n'));
Assert.assertFalse(builder.checkForCRLFTerminator((byte) '\r'));
Assert.assertFalse(builder.checkForCRLFTerminator((byte) '\r'));
Assert.assertFalse(builder.checkForCRLFTerminator((byte) '\n'));
Assert.assertFalse(builder.checkForCRLFTerminator((byte) '\r'));
Assert.assertTrue(builder.checkForCRLFTerminator((byte) '\n'));
Assert.assertFalse(builder.checkForCRLFTerminator((byte) '\n'));
Assert.assertFalse(builder.checkForCRLFTerminator((byte) '\r'));
Assert.assertFalse(builder.checkForCRLFTerminator((byte) '\r'));
Assert.assertFalse(builder.checkForCRLFTerminator((byte) '\n'));
Assert.assertFalse(builder.checkForCRLFTerminator((byte) '\r'));
Assert.assertTrue(builder.checkForCRLFTerminator((byte) '\n'));
Assert.assertFalse(builder.checkForCRLFTerminator((byte) '\r'));
Assert.assertFalse(builder.checkForCRLFTerminator((byte) '\r'));
Assert.assertFalse(builder.checkForCRLFTerminator((byte) '\r'));
Assert.assertFalse(builder.checkForCRLFTerminator((byte) '\n'));
Assert.assertFalse(builder.checkForCRLFTerminator((byte) '\r'));
Assert.assertTrue(builder.checkForCRLFTerminator((byte) '\n'));
}
@Test
public void testReader(File file) throws Exception {
checkCRLFStateMachine();
setIOTimeoutValue(30000);
resetState();
Thread thread = new Thread(new Runnable() {
public void run() {
try {
while (hasMoreItems()) {
ArcFileItem item = new ArcFileItem();
getNextItem(item);
LOG.info("GOT Item URL:" + item.getUri() + " StreamPos:"
+ item.getArcFilePos() + " Content Length:"
+ item.getContent().getCount());
for (ArcFileHeaderItem headerItem : item.getHeaderItems()) {
if (headerItem.isFieldDirty(ArcFileHeaderItem.Field_ITEMKEY)) {
// LOG.info("Header Item:" + headerItem.getItemKey() + " :" +
// headerItem.getItemValue());
} else {
// LOG.info("Header Item:" + headerItem.getItemValue());
}
}
// LOG.info("Content Length:" + item.getContent().getCount());
// LOG.info("Content:");
/*
* ByteArrayInputStream inputStream = new
* ByteArrayInputStream(item.getContent
* ().getReadOnlyBytes(),0,item.getContent().getCount());
* BufferedReader reader = new BufferedReader(new
* InputStreamReader(inputStream,Charset.forName("ASCII"))); String
* line = null; while ((line = reader.readLine()) != null) {
* LOG.info(line); }
*/
}
LOG.info("NO MORE ITEMS... BYE");
} catch (IOException e) {
LOG.error(StringUtils.stringifyException(e));
}
}
});
// run the thread ...
thread.start();
ReadableByteChannel channel = Channels
.newChannel(new FileInputStream(file));
try {
int totalBytesRead = 0;
for (;;) {
ByteBuffer buffer = ByteBuffer.allocate(32768);
int bytesRead = channel.read(buffer);
// LOG.info("Read "+bytesRead + " From File");
if (bytesRead == -1) {
finished();
break;
} else {
buffer.flip();
totalBytesRead += buffer.remaining();
available(buffer);
}
}
} finally {
channel.close();
}
// now wait for thread to die ...
LOG.info("Done Reading File.... Waiting for ArcFileThread to DIE");
thread.join();
LOG.info("Done Reading File.... ArcFileThread to DIED");
}
public static void main(String[] args) {
File file = new File(args[0]);
ArcFileReader reader = new ArcFileReader();
try {
reader.testReader(file);
} catch (Exception e) {
e.printStackTrace();
}
}
}