/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.tajo.storage;

import org.apache.commons.codec.binary.Base64;
import org.apache.commons.lang.ArrayUtils;
import org.apache.commons.lang.StringEscapeUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.io.compress.*;
import org.apache.tajo.catalog.Column;
import org.apache.tajo.catalog.Schema;
import org.apache.tajo.catalog.TableMeta;
import org.apache.tajo.catalog.statistics.TableStat;
import org.apache.tajo.common.TajoDataTypes;
import org.apache.tajo.datum.CharDatum;
import org.apache.tajo.datum.Datum;
import org.apache.tajo.datum.NullDatum;
import org.apache.tajo.datum.ProtobufDatum;
import org.apache.tajo.datum.protobuf.ProtobufJsonFormat;
import org.apache.tajo.exception.UnsupportedException;
import org.apache.tajo.storage.compress.CodecPool;
import org.apache.tajo.storage.exception.AlreadyExistsStorageException;
import org.apache.tajo.util.Bytes;

import java.io.*;
import java.util.Arrays;

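/**
 * Delimited text file storage. Each tuple is serialized as one line,
 * terminated by a line feed, with columns separated by the character set
 * via the {@code csvfile.delimiter} option ('|' by default); the literal
 * written for NULL values can be overridden with {@code csvfile.null}.
 */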
public class CSVFile {
  public static byte[] trueBytes = "true".getBytes();
  public static byte[] falseBytes = "false".getBytes();

  public static final String DELIMITER = "csvfile.delimiter";
  public static final String NULL = "csvfile.null";     //read only
  public static final String DELIMITER_DEFAULT = "|";
  public static final byte LF = '\n';
  public static int EOF = -1;

  private static final Log LOG = LogFactory.getLog(CSVFile.class);

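  /**
   * Appender that writes tuples as delimiter-separated text lines. When the
   * table meta names a compression codec, output passes through the codec's
   * CompressionOutputStream; otherwise plain text is written. Per-column
   * statistics are collected when enabled.
   */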
  public static class CSVAppender extends FileAppender {
    private final TableMeta meta;
    private final Schema schema;
    private final FileSystem fs;
    private FSDataOutputStream fos;
    private DataOutputStream outputStream;
    private CompressionOutputStream deflateFilter;
    private char delimiter;
    private TableStatistics stats = null;
    private Compressor compressor;
    private CompressionCodecFactory codecFactory;
    private CompressionCodec codec;
    private Path compressedPath;
    private byte[] nullChars;
    private ProtobufJsonFormat protobufJsonFormat = ProtobufJsonFormat.getInstance();

    public CSVAppender(Configuration conf, final TableMeta meta,
                       final Path path) throws IOException {
      super(conf, meta, path);
      this.fs = path.getFileSystem(conf);
      this.meta = meta;
      this.schema = meta.getSchema();
      this.delimiter = StringEscapeUtils.unescapeJava(this.meta.getOption(DELIMITER, DELIMITER_DEFAULT)).charAt(0);

      String nullCharacters = StringEscapeUtils.unescapeJava(this.meta.getOption(NULL));
      if (StringUtils.isEmpty(nullCharacters)) {
        nullChars = NullDatum.get().asTextBytes();
      } else {
        nullChars = nullCharacters.getBytes();
      }
    }

    @Override
    public void init() throws IOException {
      if (!fs.exists(path.getParent())) {
        throw new FileNotFoundException(path.toString());
      }

      String codecName = this.meta.getOption(TableMeta.COMPRESSION_CODEC);
      if(!StringUtils.isEmpty(codecName)){
        codecFactory = new CompressionCodecFactory(conf);
        codec = codecFactory.getCodecByClassName(codecName);
        compressor = CodecPool.getCompressor(codec);
        if (compressor != null) compressor.reset(); // the builtin gzip codec returns a null compressor

        String extension = codec.getDefaultExtension();
        compressedPath = path.suffix(extension);

        if (fs.exists(compressedPath)) {
          throw new AlreadyExistsStorageException(compressedPath);
        }

        fos = fs.create(compressedPath);
        deflateFilter = codec.createOutputStream(fos, compressor);
        outputStream = new DataOutputStream(new BufferedOutputStream(deflateFilter));

      } else {
        if (fs.exists(path)) {
          throw new AlreadyExistsStorageException(path);
        }
        fos = fs.create(path);
        outputStream = fos;
      }

      if (enabledStats) {
        this.stats = new TableStatistics(this.schema);
      }

      super.init();
    }

    @Override
    public void addTuple(Tuple tuple) throws IOException {
      Column col;
      Datum datum;

      int colNum = schema.getColumnNum();
      if (tuple instanceof LazyTuple) {
        LazyTuple lTuple = (LazyTuple) tuple;
        for (int i = 0; i < colNum; i++) {
          TajoDataTypes.DataType dataType = schema.getColumn(i).getDataType();

          switch (dataType.getType()) {
            case TEXT: {
              datum = tuple.get(i);
              if (datum instanceof NullDatum) {
                outputStream.write(nullChars);
              } else {
                outputStream.write(datum.asTextBytes());
              }
              break;
            }
            case CHAR: {
              datum = tuple.get(i);
              if (datum instanceof NullDatum) {
                outputStream.write(nullChars);
              } else {
                byte[] pad = new byte[dataType.getLength() - datum.size()];
                outputStream.write(datum.asTextBytes());
                outputStream.write(pad);
              }
              break;
            }
            case BOOLEAN: {
              datum = tuple.get(i);
              if (datum instanceof NullDatum) {
                //null datum is zero length byte array
              } else {
                outputStream.write(datum.asBool() ? trueBytes : falseBytes);   //Compatibility with Apache Hive
              }
              break;
            }
            case NULL:
              break;
            case PROTOBUF:
              datum = tuple.get(i);
              ProtobufDatum protobufDatum = (ProtobufDatum) datum;
              protobufJsonFormat.print(protobufDatum.get(), outputStream);
              break;
            default:
              outputStream.write(lTuple.getTextBytes(i)); // write the stored text bytes as-is; avoids deserializing the lazy field
              break;
          }

          if(colNum - 1 > i){
            outputStream.write((byte) delimiter);
          }

          if (enabledStats) {
            datum = tuple.get(i);
            stats.analyzeField(i, datum);
          }
        }
      } else {
        for (int i = 0; i < schema.getColumnNum(); i++) {
          datum = tuple.get(i);
          if (enabledStats) {
            stats.analyzeField(i, datum);
          }
          if (datum instanceof NullDatum) {
            outputStream.write(nullChars);
          } else {
            col = schema.getColumn(i);
            switch (col.getDataType().getType()) {
              case BOOLEAN:
                outputStream.write(tuple.getBoolean(i).asBool() ? trueBytes : falseBytes);   //Compatibility with Apache Hive
                break;
              case BIT:
                outputStream.write(tuple.getByte(i).asTextBytes());
                break;
              case BLOB:
                outputStream.write(Base64.encodeBase64(tuple.getBytes(i).asByteArray(), false));
                break;
              case CHAR:
                CharDatum charDatum = tuple.getChar(i);
                byte[] pad = new byte[col.getDataType().getLength() - datum.size()];
                outputStream.write(charDatum.asTextBytes());
                outputStream.write(pad);
                break;
              case TEXT:
                outputStream.write(tuple.getText(i).asTextBytes());
                break;
              case INT2:
                outputStream.write(tuple.getShort(i).asTextBytes());
                break;
              case INT4:
                outputStream.write(tuple.getInt(i).asTextBytes());
                break;
              case INT8:
                outputStream.write(tuple.getLong(i).asTextBytes());
                break;
              case FLOAT4:
                outputStream.write(tuple.getFloat(i).asTextBytes());
                break;
              case FLOAT8:
                outputStream.write(tuple.getDouble(i).asTextBytes());
                break;
              case INET4:
                outputStream.write(tuple.getIPv4(i).asTextBytes());
                break;
              case INET6:
                outputStream.write(tuple.getIPv6(i).toString().getBytes());
                break;
              case PROTOBUF:
                ProtobufDatum protobuf = (ProtobufDatum) datum;
                ProtobufJsonFormat.getInstance().print(protobuf.get(), outputStream);
                break;
              default:
                throw new UnsupportedOperationException("Cannot write such field: "
                    + tuple.get(i).type());
            }
          }
          if(colNum - 1 > i){
            outputStream.write((byte) delimiter);
          }
        }
      }
      outputStream.write('\n');

      // Statistical section
      if (enabledStats) {
        stats.incrementRow();
      }
    }

    @Override
    public long getOffset() throws IOException {
      return fos.getPos();
    }

    @Override
    public void flush() throws IOException {
      outputStream.flush();
    }

    @Override
    public void close() throws IOException {
      // Statistical section
      if (enabledStats) {
        stats.setNumBytes(getOffset());
      }

      try {
        flush();

        if(deflateFilter != null) {
          deflateFilter.finish();
          deflateFilter.resetState();
          deflateFilter = null;
        }

        fos.close();
      } finally {
        if (compressor != null) {
          CodecPool.returnCompressor(compressor);
          compressor = null;
        }
      }
    }

    @Override
    public TableStat getStats() {
      if (enabledStats) {
        return stats.getTableStat();
      } else {
        return null;
      }
    }

    public boolean isCompress() {
      return compressor != null;
    }

    public String getExtension() {
      return codec != null ? codec.getDefaultExtension() : "";
    }
  }

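  /**
   * Scanner over a fragment of a delimited text file. Input is paged into a
   * fixed-size buffer and split on line feeds; a partial trailing line is
   * carried into the next page. Tuple-offset seeking works only on
   * uncompressed files.
   */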
  public static class CSVScanner extends FileScanner implements SeekableScanner {
    public CSVScanner(Configuration conf, final TableMeta meta,
                      final Fragment fragment) throws IOException {
      super(conf, meta, fragment);
      factory = new CompressionCodecFactory(conf);
      codec = factory.getCodec(fragment.getPath());
      if (isCompress() && !(codec instanceof SplittableCompressionCodec)) {
        splittable = false;
      }

      // Buffer size, Delimiter
      this.bufSize = DEFAULT_BUFFER_SIZE;
      String delim = fragment.getMeta().getOption(DELIMITER, DELIMITER_DEFAULT);
      this.delimiter = StringEscapeUtils.unescapeJava(delim).charAt(0);

      String nullCharacters = StringEscapeUtils.unescapeJava(fragment.getMeta().getOption(NULL));
      if (StringUtils.isEmpty(nullCharacters)) {
        nullChars = NullDatum.get().asTextBytes();
      } else {
        nullChars = nullCharacters.getBytes();
      }
    }

    private final static int DEFAULT_BUFFER_SIZE = 128 * 1024;
    private int bufSize;
    private char delimiter;
    private FileSystem fs;
    private FSDataInputStream fis;
    private InputStream is; // decompressed stream
    private CompressionCodecFactory factory;
    private CompressionCodec codec;
    private Decompressor decompressor;
    private Seekable filePosition;
    private boolean splittable = true;
    private long startOffset, length;
    private byte[] buf = null;
    private byte[][] tuples = null;
    private long[] tupleOffsets = null;
    private int currentIdx = 0, validIdx = 0;
    private byte[] tail = null;
    private long pageStart = -1;
    private long prevTailLen = -1;
    private int[] targetColumnIndexes;
    private boolean eof = false;
    private final byte[] nullChars;

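    /**
     * Opens the fragment. For splittable codecs the start offset and length
     * are adjusted to compression block boundaries; when the fragment does
     * not begin at offset 0, the scanner skips forward to the next line feed
     * so that reading starts on a whole row.
     */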
    @Override
    public void init() throws IOException {

      // Fragment information
      fs = fragment.getPath().getFileSystem(conf);
      fis = fs.open(fragment.getPath());
      startOffset = fragment.getStartOffset();
      length = fragment.getLength();

      if (startOffset > 0) startOffset--; // step back so the previous row's line feed is consumed below

      if (codec != null) {
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
          SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
              fis, decompressor, startOffset, startOffset + length,
              SplittableCompressionCodec.READ_MODE.BYBLOCK);

          startOffset = cIn.getAdjustedStart();
          length = cIn.getAdjustedEnd() - startOffset;
          filePosition = cIn;
          is = cIn;
        } else {
          is = new DataInputStream(codec.createInputStream(fis, decompressor));
        }
      } else {
        fis.seek(startOffset);
        filePosition = fis;
        is = fis;
      }

      tuples = new byte[0][];
      if (targets == null) {
        targets = schema.toArray();
      }

      targetColumnIndexes = new int[targets.length];
      for (int i = 0; i < targets.length; i++) {
        targetColumnIndexes[i] = schema.getColumnIdByName(targets[i].getColumnName());
      }

      super.init();
      Arrays.sort(targetColumnIndexes);
      if (LOG.isDebugEnabled()) {
        LOG.debug("CSVScanner open:" + fragment.getPath() + "," + startOffset + "," + length +
            "," + fs.getFileStatus(fragment.getPath()).getLen());
      }

      if (startOffset != 0) {
        int rbyte;
        while ((rbyte = is.read()) != LF) {
          if(rbyte == EOF) break;
        }
      }

      if (fragmentable() < 1) {
        close();
        return;
      }
      page();
    }

    private long fragmentable() throws IOException {
      return startOffset + length - getFilePosition();
    }

    private long getFilePosition() throws IOException {
      long retVal;
      if (filePosition != null) {
        retVal = filePosition.getPos();
      } else {
        retVal = fis.getPos();
      }
      return retVal;
    }

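    /**
     * Reads the next buffer-sized page and splits it into tuples on line
     * feeds. An incomplete trailing line is kept in {@code tail} and
     * prepended to the next page; {@code eof} is set once the stream is
     * exhausted.
     */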
    private void page() throws IOException {
      // Index initialization
      currentIdx = 0;

      // Buffer size set
      if (isSplittable() && fragmentable() < DEFAULT_BUFFER_SIZE) {
        bufSize = (int)fragmentable();
      }

      if (this.tail == null || this.tail.length == 0) {
        this.pageStart = getFilePosition();
        this.prevTailLen = 0;
      } else {
        this.pageStart = getFilePosition() - this.tail.length;
        this.prevTailLen = this.tail.length;
      }

      // Read
      int rbyte;
      buf = new byte[bufSize];
      rbyte = is.read(buf);

      if (prevTailLen == 0) {
        if(rbyte == EOF){
          eof = true; //EOF
          return;
        }

        tail = new byte[0];
        tuples = Bytes.splitPreserveAllTokens(buf, rbyte, (char) LF);
      } else {
        byte[] lastRow = ArrayUtils.addAll(tail, buf);
        tuples = Bytes.splitPreserveAllTokens(lastRow, rbyte + tail.length, (char) LF);
        tail = null;
      }

      // Check tail
      if ((char) buf[rbyte - 1] != LF) {
        // splittable bzip2 compression may return a short read when a sync marker is found
        if (isSplittable() && (fragmentable() < 1 || rbyte != bufSize)) {
          int lineFeedPos = 0;
          byte[] temp = new byte[DEFAULT_BUFFER_SIZE];

          // find line feed
          while ((temp[lineFeedPos] = (byte)is.read()) != (byte)LF) {
            lineFeedPos++;
          }

          tuples[tuples.length - 1] = ArrayUtils.addAll(tuples[tuples.length - 1],
              ArrayUtils.subarray(temp, 0, lineFeedPos));
          validIdx = tuples.length;

        } else {
          tail = tuples[tuples.length - 1];
          validIdx = tuples.length - 1;
        }
      } else {
        tail = new byte[0];
        validIdx = tuples.length - 1; // drop the trailing empty token produced by the final line feed
      }

      if (!isCompress()) makeTupleOffset();
    }

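    /**
     * Records the absolute file offset of each tuple in the current page.
     * Only computed for uncompressed input, where byte offsets map directly
     * to file positions; consumed by seek() and getNextOffset().
     */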
    private void makeTupleOffset() {
      long curTupleOffset = 0;
      this.tupleOffsets = new long[this.validIdx];
      for (int i = 0; i < this.validIdx; i++) {
        this.tupleOffsets[i] = curTupleOffset + this.pageStart;
        curTupleOffset += this.tuples[i].length + 1; // tuple bytes + 1-byte line feed
      }
    }

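    /**
     * Returns the next tuple as a {@link LazyTuple}, paging in more input
     * when the current page is exhausted, or null at end of fragment.
     */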
    @Override
    public Tuple next() throws IOException {
      try {
        if (currentIdx == validIdx) {
          if (isSplittable() && fragmentable() < 1) {
            close();
            return null;
          } else {
            page();
          }

          if(eof){
            close();
            return null;
          }
        }

        long offset = -1;
        if(!isCompress()){
          offset = this.tupleOffsets[currentIdx];
        }

        byte[][] cells = Bytes.splitPreserveAllTokens(tuples[currentIdx++], delimiter, targetColumnIndexes);
        return new LazyTuple(schema, cells, offset, nullChars);
      } catch (Throwable t) {
        LOG.error("Tuple list length: " + (tuples != null ? tuples.length : 0), t);
        LOG.error("Tuple list current index: " + currentIdx, t);
      }
      return null;
    }

    private boolean isCompress() {
      return codec != null;
    }

    @Override
    public void reset() throws IOException {
      init();
    }

    @Override
    public void close() throws IOException {
      try {
        is.close();
      } finally {
        if (decompressor != null) {
          decompressor.reset();
          CodecPool.returnDecompressor(decompressor);
          decompressor = null;
        }
      }
    }

    @Override
    public boolean isProjectable() {
      return true;
    }

    @Override
    public boolean isSelectable() {
      return false;
    }

    @Override
    public void setSearchCondition(Object expr) {
    }

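    /**
     * Repositions the scanner to a tuple-start offset, typically one
     * obtained from getNextOffset(). An offset inside the current page
     * reuses the buffered tuples; an offset outside it seeks the underlying
     * stream and forces a re-page on the next read. Not supported for
     * compressed input.
     */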
    @Override
    public void seek(long offset) throws IOException {
      if(isCompress()) throw new UnsupportedException();

      int tupleIndex = Arrays.binarySearch(this.tupleOffsets, offset);
      if (tupleIndex > -1) {
        this.currentIdx = tupleIndex;
      } else if (isSplittable() && offset >= this.pageStart + this.bufSize
          + this.prevTailLen - this.tail.length || offset <= this.pageStart) {
        filePosition.seek(offset);
        tail = new byte[0];
        buf = new byte[DEFAULT_BUFFER_SIZE];
        bufSize = DEFAULT_BUFFER_SIZE;
        this.currentIdx = 0;
        this.validIdx = 0;
        // pageBuffer();
      } else {
        throw new IOException("invalid offset " +
            " < pageStart : " + this.pageStart + " , " +
            "  pageLength : " + this.bufSize + " , " +
            "  tail length : " + this.tail.length +
            "  input offset : " + offset + " >");
      }

    }

    @Override
    public long getNextOffset() throws IOException {
      if(isCompress()) throw new UnsupportedException();

      if (this.currentIdx == this.validIdx) {
        if (fragmentable() < 1) {
          return -1;
        } else {
          page();
        }
      }
      return this.tupleOffsets[currentIdx];
    }

    @Override
    public boolean isSplittable(){
      return splittable;
    }
  }
}
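
A minimal usage sketch follows. Only the CSVAppender and CSVScanner calls are taken from the class above; how the Configuration, TableMeta, Path, and Fragment are obtained varies across Tajo versions, so the surrounding harness is hypothetical.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.tajo.catalog.TableMeta;
import org.apache.tajo.storage.CSVFile;
import org.apache.tajo.storage.Fragment;
import org.apache.tajo.storage.Tuple;

import java.io.IOException;

public class CSVFileUsageSketch {

  /** Writes tuples as delimited text lines through CSVAppender. */
  public static void write(Configuration conf, TableMeta meta, Path path,
                           Iterable<Tuple> tuples) throws IOException {
    CSVFile.CSVAppender appender = new CSVFile.CSVAppender(conf, meta, path);
    appender.init();            // opens the file and sets up codec/statistics
    for (Tuple t : tuples) {
      appender.addTuple(t);     // one '\n'-terminated line per tuple
    }
    appender.flush();
    appender.close();
  }

  /** Reads every tuple of a fragment through CSVScanner. */
  public static void read(Configuration conf, TableMeta meta, Fragment fragment)
      throws IOException {
    CSVFile.CSVScanner scanner = new CSVFile.CSVScanner(conf, meta, fragment);
    scanner.init();             // seeks the fragment, skipping a partial first row
    Tuple t;
    while ((t = scanner.next()) != null) {
      // fields are materialized lazily by LazyTuple when accessed
    }
    scanner.close();
  }
}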