Source Code of org.apache.drill.exec.store.text.DrillTextRecordReader

/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.drill.exec.store.text;

import java.io.IOException;
import java.util.Collections;
import java.util.List;

import org.apache.drill.common.exceptions.DrillRuntimeException;
import org.apache.drill.common.exceptions.ExecutionSetupException;
import org.apache.drill.common.expression.FieldReference;
import org.apache.drill.common.expression.SchemaPath;
import org.apache.drill.common.types.TypeProtos;
import org.apache.drill.common.types.Types;
import org.apache.drill.exec.ExecConstants;
import org.apache.drill.exec.exception.SchemaChangeException;
import org.apache.drill.exec.expr.TypeHelper;
import org.apache.drill.exec.ops.FragmentContext;
import org.apache.drill.exec.physical.impl.OutputMutator;
import org.apache.drill.exec.record.MaterializedField;
import org.apache.drill.exec.store.RecordReader;
import org.apache.drill.exec.vector.AllocationHelper;
import org.apache.drill.exec.vector.RepeatedVarCharVector;
import org.apache.drill.exec.vector.ValueVector;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;

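/**
 * A Drill RecordReader for delimited text files (e.g. CSV). Each input line
 * is split on a single-byte delimiter and exposed as one repeated VARCHAR
 * vector named "columns", so queries address fields as columns[0],
 * columns[1], and so on.
 */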
public class DrillTextRecordReader implements RecordReader {
  static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(DrillTextRecordReader.class);

  static final String COL_NAME = "columns";

  private org.apache.hadoop.mapred.RecordReader<LongWritable, Text> reader;
  private List<ValueVector> vectors = Lists.newArrayList();
  private byte delimiter;
  private int targetRecordCount;
  private FieldReference ref = new FieldReference(COL_NAME);
  private FragmentContext context;
  private RepeatedVarCharVector vector;
  private List<Integer> columnIds = Lists.newArrayList();
  private LongWritable key;
  private Text value;
  private int numCols = 0;
  private boolean redoRecord = false;
  private boolean first = true;

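  /**
   * @param split     the Hadoop file split to read
   * @param context   fragment context, used to read Drill configuration
   * @param delimiter single-byte field delimiter, e.g. ',' for CSV
   * @param columns   projected columns; each must be named "columns" and may
   *                  carry an array index such as columns[2]
   */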
  public DrillTextRecordReader(FileSplit split, FragmentContext context, char delimiter, List<SchemaPath> columns) {
    this.context = context;
    this.delimiter = (byte) delimiter;
    if(columns != null) {
      for (SchemaPath path : columns) {
        assert path.getRootSegment().isNamed();
        Preconditions.checkArgument(path.getRootSegment().getPath().equals(COL_NAME), "Selected column must have name 'columns'");
        // FIXME: need re-work for text column push-down.
        if (path.getRootSegment().getChild() != null) {
          Preconditions.checkArgument(path.getRootSegment().getChild().isArray(), "Selected column must be an array index");
          int index = path.getRootSegment().getChild().getArraySegment().getIndex();
          columnIds.add(index);
        }
      }
      Collections.sort(columnIds);
    }
    targetRecordCount = context.getConfig().getInt(ExecConstants.TEXT_LINE_READER_BATCH_SIZE);
    numCols = columnIds.size();
    TextInputFormat inputFormat = new TextInputFormat();
    JobConf job = new JobConf();
    job.setInt("io.file.buffer.size", context.getConfig().getInt(ExecConstants.TEXT_LINE_READER_BUFFER_SIZE));
    job.setInputFormat(inputFormat.getClass());
    try {
      reader = inputFormat.getRecordReader(split, job, Reporter.NULL);
      key = reader.createKey();
      value = reader.createValue();
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }

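  /**
   * Registers this reader's single output field with the output mutator:
   * a repeated VARCHAR vector named "columns".
   */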
  @Override
  public void setup(OutputMutator output) throws ExecutionSetupException {
    MaterializedField field = MaterializedField.create(ref, Types.repeated(TypeProtos.MinorType.VARCHAR));
    try {
      vector = output.addField(field, RepeatedVarCharVector.class);
    } catch (SchemaChangeException e) {
      throw new ExecutionSetupException(e);
    }
  }

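  /**
   * Reads lines until roughly 200 KB of field data has been copied or the
   * split is exhausted. Each line is split on the delimiter (quote-aware,
   * via find()) and its fields are appended to the repeated vector. When a
   * value does not fit in the vector, the current line is replayed on the
   * next call through the redoRecord flag.
   */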
  @Override
  public int next() {
    logger.debug("vector value capacity {}", vector.getValueCapacity());
    logger.debug("vector byte capacity {}", vector.getByteCapacity());
    int batchSize = 0;
    try {
      int recordCount = 0;
      while (redoRecord || (batchSize < 200*1000 && reader.next(key, value))) { // cap each batch at roughly 200 KB of field bytes
        redoRecord = false;
        int start;
        int end = -1;
        int p = 0;
        int i = 0;
        vector.getMutator().startNewGroup(recordCount);
        while (end < value.getLength() - 1) {
          if(numCols > 0 && p >= numCols) {
            break;
          }
          start = end;
          if (delimiter == '\n') {
            end = value.getLength();
          } else {
            end = find(value, delimiter, start + 1);
            if (end == -1) {
              end = value.getLength();
            }
          }
          // Projection push-down: a column that is not in columnIds is
          // written as an empty value so that array indexes still line up.
          if (numCols > 0 && i++ < columnIds.get(p)) {
            if (!vector.getMutator().addSafe(recordCount, value.getBytes(), start + 1, 0)) {
              redoRecord = true;
              vector.getMutator().setValueCount(recordCount);
              logger.debug("text scan batch size {}", batchSize);
              return recordCount;
            }
            continue;
          }
          p++;
          if (!vector.getMutator().addSafe(recordCount, value.getBytes(), start + 1, end - start - 1)) {
            redoRecord = true;
            vector.getMutator().setValueCount(recordCount);
            logger.debug("text scan batch size {}", batchSize);
            return recordCount;
          }
          batchSize += end - start;
        }
        recordCount++;
      }
      // Note: 'vectors' is never populated by this reader, so this loop is
      // effectively a no-op; only 'vector' below carries data.
      for (ValueVector v : vectors) {
        v.getMutator().setValueCount(recordCount);
      }
      vector.getMutator().setValueCount(recordCount);
      logger.debug("text scan batch size {}", batchSize);
      return recordCount;
    } catch (IOException e) {
      cleanup();
      throw new DrillRuntimeException(e);
    }
  }

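  /**
   * Returns the index of the next occurrence of 'what' at or after 'start',
   * ignoring bytes inside double-quoted sections, or -1 if the delimiter
   * does not occur again on this line.
   */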
  public int find(Text text, byte what, int start) {
    int len = text.getLength();
    int p = start;
    byte[] bytes = text.getBytes();
    boolean inQuotes = false;
    while (p < len) {
      if ('\"' == bytes[p]) {
        inQuotes = !inQuotes;
      }
      if (!inQuotes && bytes[p] == what) {
        return p;
      }
      p++;
    }
    return -1;
  }

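  /** Closes the underlying Hadoop record reader, logging any failure to close. */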
  @Override
  public void cleanup() {
    try {
      reader.close();
    } catch (IOException e) {
      logger.warn("Exception closing reader: {}", e);
    }
  }
}
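
For illustration, the quote-aware splitting that find() implements can be shown in isolation. The sketch below is a minimal, self-contained example (QuoteAwareSplitExample is a hypothetical class, not part of Drill) using the same approach: toggle an in-quotes flag on each double quote and treat the delimiter as a field boundary only when outside quotes. Like find(), it leaves the quote characters in the field values.

public class QuoteAwareSplitExample {
  /** Splits a line on the delimiter, ignoring delimiters inside double quotes. */
  static java.util.List<String> split(String line, char delimiter) {
    java.util.List<String> fields = new java.util.ArrayList<>();
    StringBuilder field = new StringBuilder();
    boolean inQuotes = false;
    for (char c : line.toCharArray()) {
      if (c == '"') {
        inQuotes = !inQuotes;           // toggle quote state, keep the quote char
      }
      if (c == delimiter && !inQuotes) {
        fields.add(field.toString());   // field boundary outside quotes
        field.setLength(0);
      } else {
        field.append(c);
      }
    }
    fields.add(field.toString());       // last field
    return fields;
  }

  public static void main(String[] args) {
    // The quoted comma stays inside the second field: prints [a, "b,c", d]
    System.out.println(split("a,\"b,c\",d", ','));
  }
}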