package com.twitter.elephantbird.pig.store;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Properties;

import org.apache.hadoop.hive.ql.io.RCFileInputFormat;
import org.apache.hadoop.hive.serde2.ByteStream;
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable;
import org.apache.hadoop.hive.serde2.columnar.BytesRefWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.pig.ResourceSchema;
import org.apache.pig.builtin.PigStorage;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.util.ObjectSerializer;
import org.apache.pig.impl.util.StorageUtil;
import org.apache.pig.impl.util.UDFContext;

import com.twitter.elephantbird.mapreduce.input.MapReduceInputFormatWrapper;
import com.twitter.elephantbird.mapreduce.output.RCFileOutputFormat;
import com.twitter.elephantbird.util.HadoopCompat;

/**
* RCFile version of PigStorage. Each field of a tuple is stored as a
* separate column in the RCFile; field values are serialized as text, the
* same way PigStorage serializes them. The Hive jars that provide the
* RCFile classes must be registered in the script. <p>
*
* Usage: <pre>
* register 'libs/*hive-common*.jar';
* register 'libs/*hive-exec*.jar';
*
* a = load 'input' using RCFilePigStorage() as (a:int, b:chararray, c:long);
*
* b = foreach a generate a, TOTUPLE(a, c);
* store b into 'output' using RCFilePigStorage();
*
* </pre>
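*
* Column projection is pushed down to RCFile, so only the columns a script
* actually references are read from disk. A hypothetical follow-up to the
* script above (relation and field names are assumed): <pre>
* c = load 'output' using RCFilePigStorage() as (x:int, t);
* d = foreach c generate x;  -- only the first column is read from the RCFile
* </pre>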
*/
public class RCFilePigStorage extends PigStorage {
private TupleFactory tupleFactory = TupleFactory.getInstance();
// for loader
private int[] requiredColumns = null;
// for storage
private int numColumns = -1;
private ByteStream.Output byteStream;
private BytesRefArrayWritable rowWritable;
private BytesRefWritable[] colValRefs;
public RCFilePigStorage() {
super();
}
/* TODO(raghu): support extra options for PigStorage() with pig 11
* public RCFilePigStorage(String options) {
* super("\t", options); // delimiter does not affect us
* }
*/
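/**
* Properties keyed by this class and the UDF signature. They carry the
* pushed projection and the output column count from the Pig frontend to
* the load and store tasks.
*/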
private Properties getUDFProperties() {
return UDFContext.getUDFContext()
.getUDFProperties(this.getClass(), new String[] { signature });
}
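/**
* Hive's RCFileInputFormat uses the older "mapred" API, so it is wrapped in
* MapReduceInputFormatWrapper to look like a "mapreduce" InputFormat to Pig.
*/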
@Override
public InputFormat<LongWritable, BytesRefArrayWritable> getInputFormat() {
return new MapReduceInputFormatWrapper<LongWritable, BytesRefArrayWritable>
(new RCFileInputFormat<LongWritable, BytesRefArrayWritable>());
}
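/**
* Note that this is elephant-bird's RCFileOutputFormat (mapreduce API),
* not Hive's.
*/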
@Override
public OutputFormat<NullWritable, Writable> getOutputFormat() {
return new RCFileOutputFormat();
}
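/**
* Remembers the columns required by the script in the UDF properties so
* that {@link #setLocation(String, Job)} can restrict which columns RCFile
* reads.
*/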
@Override
public RequiredFieldResponse pushProjection(RequiredFieldList requiredFieldList)
throws FrontendException {
// no need to invoke super.pushProjection();
try {
getUDFProperties().setProperty("requiredFieldList",
ObjectSerializer.serialize(requiredFieldList));
} catch (IOException e) {
throw new RuntimeException(e);
}
return new RequiredFieldResponse(true);
}
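/**
* Sets the column ids RCFile should read, based on the projection recorded
* in pushProjection(). When no projection was pushed, all columns are read.
*/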
@Override
public void setLocation(String location, Job job) throws IOException {
super.setLocation(location, job);
// sets columnIds config for RCFile
String obj = getUDFProperties().getProperty("requiredFieldList");
if (obj == null) {
// on the front end, or no projection was pushed down: read all columns
ColumnProjectionUtils.setFullyReadColumns(HadoopCompat.getConfiguration(job));
return;
}
RequiredFieldList fieldList = (RequiredFieldList)
ObjectSerializer.deserialize(obj);
ArrayList<Integer> ids = new ArrayList<Integer>();
requiredColumns = new int[fieldList.getFields().size()];
int i = 0;
for (RequiredField rf : fieldList.getFields()) {
requiredColumns[i++] = rf.getIndex();
ids.add(rf.getIndex());
}
ColumnProjectionUtils.setReadColumnIDs(HadoopCompat.getConfiguration(job), ids);
}
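/**
* Records the number of columns in the store schema so that
* setStoreLocation() can pass it to RCFileOutputFormat.
*/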
@Override
public void checkSchema(ResourceSchema s) throws IOException {
super.checkSchema(s);
getUDFProperties().setProperty("numColumns",
Integer.toString(s.getFields().length));
}
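/**
* Configures RCFileOutputFormat with the column count recorded by
* checkSchema().
*/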
@Override
public void setStoreLocation(String location, Job job) throws IOException {
super.setStoreLocation(location, job);
// pick up the column count recorded by checkSchema(), if present
Properties p = getUDFProperties();
if (p != null) {
numColumns = Integer.parseInt(p.getProperty("numColumns", "-1"));
}
if (numColumns > 0) {
RCFileOutputFormat.setColumnNumber(HadoopCompat.getConfiguration(job), numColumns);
}
}
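/**
* Reads the next row as a BytesRefArrayWritable and copies each required
* column into the tuple as a DataByteArray. Columns that are missing or
* empty are left as null fields.
*/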
@Override
public Tuple getNext() throws IOException {
try {
if (!in.nextKeyValue()) {
return null;
}
BytesRefArrayWritable byteRefs = (BytesRefArrayWritable) in.getCurrentValue();
boolean isProjected = requiredColumns != null;
int inputSize = byteRefs.size();
int tupleSize = isProjected ? requiredColumns.length : inputSize;
Tuple tuple = tupleFactory.newTuple(tupleSize);
int tupleIdx = 0;
for (int i = 0; i < inputSize && tupleIdx < tupleSize; i++) {
if (!isProjected || i == requiredColumns[tupleIdx]) {
// copy this column when all fields are required or it is the next projected column
BytesRefWritable ref = byteRefs.get(i);
if (ref != null && ref.getLength() > 0) {
tuple.set(tupleIdx, new DataByteArray(ref.getBytesCopy()));
}
tupleIdx++;
}
}
return tuple;
} catch (InterruptedException e) {
throw new IOException(e);
}
}
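/**
* Serializes each tuple field as text (the same format PigStorage writes)
* into a shared byte stream and writes the row as a BytesRefArrayWritable.
* The per-column writables are allocated lazily on the first call.
*/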
@SuppressWarnings("unchecked")
@Override
public void putNext(Tuple t) throws IOException {
// convert tuple fields to set of byte arrays and write to RCFile
if (rowWritable == null) { // initialize
if (numColumns < 1) {
throw new IOException("number of columns is not set");
}
byteStream = new ByteStream.Output();
rowWritable = new BytesRefArrayWritable();
colValRefs = new BytesRefWritable[numColumns];
for (int i = 0; i < numColumns; i++) {
colValRefs[i] = new BytesRefWritable();
rowWritable.set(i, colValRefs[i]);
}
}
byteStream.reset();
// serialize each field as text (just like PigStorage)
int sz = t.size();
int startPos = 0;
for (int i = 0; i < sz && i < numColumns; i++) {
StorageUtil.putField(byteStream, t.get(i));
colValRefs[i].set(byteStream.getData(),
startPos,
byteStream.getCount() - startPos);
startPos = byteStream.getCount();
}
try {
writer.write(null, rowWritable);
} catch (InterruptedException e) {
throw new IOException(e);
}
}
}