/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tajo.storage.v2;
import org.apache.commons.lang.ArrayUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.Seekable;
import org.apache.hadoop.io.compress.*;
import org.apache.tajo.catalog.TableMeta;
import org.apache.tajo.storage.Fragment;
import org.apache.tajo.storage.LazyTuple;
import org.apache.tajo.storage.Tuple;
import org.apache.tajo.storage.compress.CodecPool;
import org.apache.tajo.util.Bytes;

import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.IOException;
import java.io.InputStream;
/**
 * Scanner that reads one fragment of a delimiter-separated text file and turns
 * every line-feed terminated line into a {@link LazyTuple}.
 *
 * <p>The input is consumed page by page: each page is read into a byte buffer,
 * split on {@link #LF}, and the resulting lines are handed out one at a time by
 * {@link #nextTuple()}. A line that is cut by the end of a page is kept in
 * {@code tail} and prepended to the next page.
 *
 * <p>Compressed input is supported through Hadoop compression codecs. A file
 * whose codec is not a {@link SplittableCompressionCodec} is reported as
 * non-splittable.
 */
public class CSVFileScanner extends FileScannerV2 {
  /** Table option key that holds the column delimiter. */
  public static final String DELIMITER = "csvfile.delimiter";
  /** Delimiter used when the table option is absent. */
  public static final String DELIMITER_DEFAULT = "|";
  /** Line terminator separating tuples. */
  public static final byte LF = '\n';

  private static final Log LOG = LogFactory.getLog(CSVFileScanner.class);

  /** Size in bytes of one page buffer. */
  private final static int DEFAULT_BUFFER_SIZE = 256 * 1024;

  private int bufSize;
  private char delimiter;

  private ScheduledInputStream sin;
  private InputStream is; // decompressed stream (same object as sin when there is no codec)
  private CompressionCodecFactory factory;
  private CompressionCodec codec;
  private Decompressor decompressor;
  private Seekable filePosition; // position source; null for non-splittable codecs
  private boolean splittable = true;
  private long startOffset, length;

  private byte[] buf = null;
  private byte[][] tuples = null;      // lines of the current page
  private long[] tupleOffsets = null;  // offset of each valid line (uncompressed input only)
  private int currentIdx = 0, validIdx = 0; // next line to return / number of complete lines
  private byte[] tail = null;          // incomplete last line carried over to the next page
  private long pageStart = -1;
  private long prevTailLen = -1;
  private int[] targetColumnIndexes;

  private boolean eof = false;
  private boolean first = true;

  private long totalReadBytesForFetch;
  private long totalReadBytesFromDisk;

  /**
   * Creates a scanner for the given fragment and resolves the compression
   * codec from the fragment's path.
   *
   * @param conf     configuration used to look up compression codecs
   * @param meta     table metadata, passed to the superclass
   * @param fragment file fragment to scan
   * @throws IOException declared for the superclass constructor
   */
  public CSVFileScanner(Configuration conf, final TableMeta meta,
                        final Fragment fragment) throws IOException {
    super(conf, meta, fragment);
    factory = new CompressionCodecFactory(conf);
    codec = factory.getCodec(fragment.getPath());
    if (isCompress() && !(codec instanceof SplittableCompressionCodec)) {
      splittable = false;
    }
  }

  /**
   * Sets the page buffer size and reads the column delimiter from the table
   * options before delegating to the superclass.
   *
   * @throws IOException if the superclass initialization fails
   */
  @Override
  public void init() throws IOException {
    // Buffer size, Delimiter
    this.bufSize = DEFAULT_BUFFER_SIZE;
    String delim = fragment.getMeta().getOption(DELIMITER, DELIMITER_DEFAULT);
    // Only the first character of the option is used as the delimiter.
    this.delimiter = delim.charAt(0);

    super.init();
  }

  /**
   * Resets the end-of-file/first-scan flags and, if no stream is open yet,
   * opens the fragment's file and records the range to scan.
   *
   * @param maxBytesPerSchedule not used by this implementation
   * @return always {@code true}
   * @throws IOException if the file cannot be opened
   */
  @Override
  protected boolean initFirstScan(int maxBytesPerSchedule) throws IOException {
    synchronized (this) {
      eof = false;
      first = true;
      if (sin == null) {
        FSDataInputStream fin = fs.open(fragment.getPath(), 128 * 1024);
        sin = new ScheduledInputStream(fragment.getPath(), fin,
            fragment.getStartOffset(), fragment.getLength(), fs.getLength(fragment.getPath()));
        startOffset = fragment.getStartOffset();
        length = fragment.getLength();

        if (startOffset > 0) {
          startOffset--; // prev line feed
        }
      }
    }
    return true;
  }

  /**
   * Prepares the actual read stream (wrapping it in a decompressing stream if a
   * codec is present), resolves the projected column indexes, and positions
   * the stream on the first complete line of the fragment.
   *
   * @return {@code false} if nothing of the fragment is left to read (the
   *         scanner is closed in that case), {@code true} otherwise
   * @throws IOException if the stream cannot be created, positioned or read
   */
  private boolean scanFirst() throws IOException {
    if (codec != null) {
      decompressor = CodecPool.getDecompressor(codec);
      if (codec instanceof SplittableCompressionCodec) {
        SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
            sin, decompressor, startOffset, startOffset + length,
            SplittableCompressionCodec.READ_MODE.BYBLOCK);

        // The codec may move the range to block boundaries.
        startOffset = cIn.getAdjustedStart();
        length = cIn.getAdjustedEnd() - startOffset;
        filePosition = cIn;
        is = cIn;
      } else {
        is = new DataInputStream(codec.createInputStream(sin, decompressor));
      }
    } else {
      sin.seek(startOffset);
      filePosition = sin;
      is = sin;
    }

    // Start from an empty page. The indexes and the carried-over tail must be
    // cleared together with the tuple array, otherwise a re-scan would index
    // into the fresh empty array with stale positions.
    tuples = new byte[0][];
    currentIdx = 0;
    validIdx = 0;
    tail = null;

    if (targets == null) {
      targets = schema.toArray();
    }

    targetColumnIndexes = new int[targets.length];
    for (int i = 0; i < targets.length; i++) {
      targetColumnIndexes[i] = schema.getColumnIdByName(targets[i].getColumnName());
    }

    if (LOG.isDebugEnabled()) {
      LOG.debug("CSVScanner open:" + fragment.getPath() + "," + startOffset + "," + length +
          "," + fs.getFileStatus(fragment.getPath()).getLen());
    }

    if (startOffset != 0) {
      // Skip the (possibly partial) line the range starts in; it is consumed by
      // whoever scans the preceding range.
      int rbyte;
      while ((rbyte = is.read()) != LF) {
        if (rbyte == -1) {
          break;
        }
      }
    }

    if (fragmentable() < 1) {
      close();
      return false;
    }
    return true;
  }

  /**
   * @return {@code true} if the underlying scheduled stream exists and reports
   *         that it reached its end
   */
  @Override
  public boolean isStopScanScheduling() {
    return sin != null && sin.IsEndOfStream();
  }

  /**
   * @return number of bytes between the current position and the end of the
   *         range to scan; less than 1 when the range is exhausted
   */
  private long fragmentable() throws IOException {
    return startOffset + length - getFilePosition();
  }

  /**
   * @return the position reported by {@code filePosition} when it is set,
   *         otherwise the position of the scheduled stream
   */
  @Override
  protected long getFilePosition() throws IOException {
    if (filePosition != null) {
      return filePosition.getPos();
    }
    return sin.getPos();
  }

  /**
   * @return {@code true} if the scheduled stream exists and reports an
   *         available size of at least 64 MB
   */
  @Override
  public boolean isFetchProcessing() {
    return sin != null && sin.getAvaliableSize() >= 64 * 1024 * 1024;
  }

  /**
   * Reads the next page from the stream and splits it into lines.
   *
   * <p>After this call {@code tuples[0 .. validIdx)} are complete lines. If the
   * page ends in the middle of a line, that line is either completed by reading
   * up to the next line feed (when the range is exhausted or the read was
   * short) or stored in {@code tail} for the next page. {@code eof} is set when
   * the stream is exhausted and no line is pending.
   *
   * @throws IOException if reading from the stream fails
   */
  private void page() throws IOException {
    // Index initialization
    currentIdx = 0;

    // Buffer size set: never read beyond the end of a splittable range.
    if (isSplittable() && fragmentable() < DEFAULT_BUFFER_SIZE) {
      bufSize = (int) fragmentable();
    }

    if (this.tail == null || this.tail.length == 0) {
      this.pageStart = getFilePosition();
      this.prevTailLen = 0;
    } else {
      // The pending tail belongs to this page, so the page starts where the tail started.
      this.pageStart = getFilePosition() - this.tail.length;
      this.prevTailLen = this.tail.length;
    }

    // Read
    buf = new byte[bufSize];
    int rbyte = is.read(buf);

    if (rbyte < 0) {
      if (prevTailLen > 0) {
        // The stream ended without a final line feed: the pending tail is the
        // last line and must still be returned instead of being dropped.
        tuples = new byte[][]{tail};
        tail = null;
        validIdx = 1;
        if (!isCompress()) {
          makeTupleOffset();
        }
        return;
      }
      eof = true; // EOF
      return;
    }

    if (prevTailLen == 0) {
      tail = new byte[0];
      tuples = Bytes.splitPreserveAllTokens(buf, rbyte, (char) LF);
    } else {
      byte[] lastRow = ArrayUtils.addAll(tail, buf);
      tuples = Bytes.splitPreserveAllTokens(lastRow, rbyte + tail.length, (char) LF);
      tail = null;
    }

    // Check tail
    if (buf[rbyte - 1] != LF) {
      if (fragmentable() < 1 || rbyte != bufSize) {
        // Last page of the range (or a short read): finish the cut line now.
        byte[] rest = readRestOfLine();
        tuples[tuples.length - 1] = ArrayUtils.addAll(tuples[tuples.length - 1], rest);
        validIdx = tuples.length;
      } else {
        // More pages follow: carry the cut line over.
        tail = tuples[tuples.length - 1];
        validIdx = tuples.length - 1;
      }
    } else {
      // The page ends exactly on a line feed; the last split token is not a line.
      tail = new byte[0];
      validIdx = tuples.length - 1;
    }

    if (!isCompress()) {
      makeTupleOffset();
    }
  }

  /**
   * Reads single bytes up to (and consuming) the next line feed or the end of
   * the stream.
   *
   * <p>The value of {@code read()} is kept as an {@code int} so that data bytes
   * in the range 0x80-0xFF are not mistaken for the end-of-stream marker, and
   * the result grows as needed so the remainder may be of any length.
   *
   * @return the bytes read, without the line feed
   * @throws IOException if reading from the stream fails
   */
  private byte[] readRestOfLine() throws IOException {
    ByteArrayOutputStream line = new ByteArrayOutputStream();
    int b;
    while ((b = is.read()) != -1 && b != LF) {
      line.write(b);
    }
    return line.toByteArray();
  }

  /**
   * Computes the offset of every valid line of the current page, relative to
   * {@code pageStart}.
   */
  private void makeTupleOffset() {
    long curTupleOffset = 0;
    this.tupleOffsets = new long[this.validIdx];
    for (int i = 0; i < this.validIdx; i++) {
      this.tupleOffsets[i] = curTupleOffset + this.pageStart;
      curTupleOffset += this.tuples[i].length + 1;//tuple byte + 1byte line feed
    }
  }

  /**
   * Returns the next tuple of the fragment.
   *
   * @return the next tuple, or {@code null} when the fragment is exhausted or
   *         the scanner is already closed
   * @throws IOException if reading or splitting the input fails; failures are
   *         logged and propagated instead of being reported as end of data
   */
  protected Tuple nextTuple() throws IOException {
    if (isClosed()) {
      // Nothing can be read once the streams are released.
      return null;
    }

    if (first) {
      boolean more = scanFirst();
      first = false;
      if (!more) {
        return null;
      }
    }

    try {
      // Loop rather than page once: a line longer than one page yields a page
      // without any complete line, so more pages are needed before returning.
      while (currentIdx == validIdx) {
        if (isSplittable() && fragmentable() < 1) {
          close();
          return null;
        }
        page();
        if (eof) {
          close();
          return null;
        }
      }

      long offset = -1;
      if (!isCompress()) {
        offset = this.tupleOffsets[currentIdx];
      }

      byte[][] cells = Bytes.splitPreserveAllTokens(tuples[currentIdx++], delimiter, targetColumnIndexes);
      return new LazyTuple(schema, cells, offset);
    } catch (IOException e) {
      LOG.error(e.getMessage(), e);
      throw e;
    } catch (RuntimeException e) {
      LOG.error(e.getMessage(), e);
      throw new IOException(e);
    }
  }

  /** @return {@code true} if a compression codec was resolved for the file */
  private boolean isCompress() {
    return codec != null;
  }

  /**
   * Rewinds the open streams to position 0. I/O errors are logged and not
   * propagated. Does nothing when no stream is open.
   */
  @Override
  public void scannerReset() {
    if (sin == null) {
      return;
    }

    // filePosition is not set for non-splittable codecs or before the first scan.
    if (filePosition != null) {
      try {
        filePosition.seek(0);
      } catch (IOException e) {
        LOG.error(e.getMessage(), e);
      }
    }

    try {
      sin.seek(0);
      sin.reset();
    } catch (IOException e) {
      LOG.error(e.getMessage(), e);
    }
  }

  /**
   * Records the read statistics, closes the streams, returns the decompressor
   * to the pool and closes the superclass. Does nothing if already closed.
   *
   * @throws IOException if closing a stream or the superclass fails
   */
  @Override
  public void close() throws IOException {
    if (closed.get()) {
      return;
    }
    if (sin != null) {
      totalReadBytesForFetch = sin.getTotalReadBytesForFetch();
      totalReadBytesFromDisk = sin.getTotalReadBytesFromDisk();
    }
    try {
      if (is != null) {
        is.close();
      } else if (sin != null) {
        // Closed before the first scan wrapped the raw stream: close it directly
        // so the opened file is not leaked.
        sin.close();
      }
    } finally {
      is = null;
      sin = null;
      if (decompressor != null) {
        CodecPool.returnDecompressor(decompressor);
        decompressor = null;
      }
      tuples = null;
      super.close();
    }
  }

  /**
   * Asks the scheduled stream to read the next chunk.
   *
   * @param length number of bytes passed to the scheduled stream
   * @return the result of the scheduled stream, or {@code false} if the scanner
   *         is closed or has no open stream
   * @throws IOException if the read fails
   */
  @Override
  protected boolean scanNext(int length) throws IOException {
    synchronized (this) {
      ScheduledInputStream in = sin;
      if (isClosed() || in == null) {
        return false;
      }
      return in.readNext(length);
    }
  }

  @Override
  public boolean isProjectable() {
    return true;
  }

  @Override
  public boolean isSelectable() {
    return false;
  }

  /** Search conditions are not supported; the argument is ignored. */
  @Override
  public void setSearchCondition(Object expr) {
  }

  /** @return {@code false} only for compressed files whose codec is not splittable */
  @Override
  public boolean isSplittable() {
    return splittable;
  }

  /**
   * @return the two read counters captured from the scheduled stream at close
   *         time: bytes read for fetch, then bytes read from disk
   */
  @Override
  protected long[] reportReadBytes() {
    return new long[]{totalReadBytesForFetch, totalReadBytesFromDisk};
  }
}