Source Code of com.twitter.elephantbird.pig.store.RCFilePigStorage

package com.twitter.elephantbird.pig.store;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Properties;

import com.twitter.elephantbird.util.HadoopCompat;
import org.apache.hadoop.hive.ql.io.RCFileInputFormat;
import org.apache.hadoop.hive.serde2.ByteStream;
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable;
import org.apache.hadoop.hive.serde2.columnar.BytesRefWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.pig.ResourceSchema;
import org.apache.pig.builtin.PigStorage;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.util.ObjectSerializer;
import org.apache.pig.impl.util.StorageUtil;
import org.apache.pig.impl.util.UDFContext;

import com.twitter.elephantbird.mapreduce.input.MapReduceInputFormatWrapper;
import com.twitter.elephantbird.mapreduce.output.RCFileOutputFormat;

/**
* RCFile version of PigStorage. <p>
*
* Usage: <pre>
 * register 'libs/*hive-common*.jar';
 * register 'libs/*hive-exec*.jar';
 *
 * a = load 'input' using RCFilePigStorage() as (a:int, b:chararray, c:long);
*
* b = foreach a generate a, TOTUPLE(a, c);
* store b into 'output' using RCFilePigStorage();
*
* </pre>
*/
public class RCFilePigStorage extends PigStorage {

  private TupleFactory tupleFactory = TupleFactory.getInstance();

  // for loader
  private int[] requiredColumns = null;

  // for storage
  private int numColumns = -1;
  private ByteStream.Output byteStream;
  private BytesRefArrayWritable rowWritable;
  private BytesRefWritable[] colValRefs;

  public RCFilePigStorage() {
    super();
  }

  /* TODO(raghu): support extra options for PigStorage() with pig 11
   * public RCFilePigStorage(String options) {
   *   super("\t", options); // delimiter does not affect us
   * }
   */

  private Properties getUDFProperties() {
    return UDFContext.getUDFContext()
              .getUDFProperties(this.getClass(), new String[] { signature });
  }

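  /**
   * Wraps Hive's mapred-API {@link RCFileInputFormat} in
   * {@link MapReduceInputFormatWrapper} so it can be used with Pig's
   * mapreduce-based load path.
   */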
  @Override
  public InputFormat<LongWritable, BytesRefArrayWritable> getInputFormat() {
    return new MapReduceInputFormatWrapper<LongWritable, BytesRefArrayWritable>
            (new RCFileInputFormat<LongWritable, BytesRefArrayWritable>());
  }

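  /** Writes rows using elephant-bird's {@link RCFileOutputFormat}. */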
  @Override
  public OutputFormat<NullWritable, Writable> getOutputFormat() {
    return new RCFileOutputFormat();
  }

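  /**
   * Saves the list of required (projected) fields in the UDF context so that
   * {@link #setLocation(String, Job)} can translate it into RCFile's
   * column-id projection on the backend.
   */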
  @Override
  public RequiredFieldResponse pushProjection(RequiredFieldList requiredFieldList)
          throws FrontendException {
    // no need to invoke super.pushProjection();
    try {
      getUDFProperties().setProperty("requiredFieldList",
              ObjectSerializer.serialize(requiredFieldList));
    } catch (IOException e) {
      throw new RuntimeException(e);
    }

    return new RequiredFieldResponse(true);
  }

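  /**
   * In addition to PigStorage's location handling, configures RCFile to read
   * either all columns or only the column ids recorded by
   * {@link #pushProjection(RequiredFieldList)}.
   */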
  @Override
  public void setLocation(String location, Job job) throws IOException {
    super.setLocation(location, job);

    // sets columnIds config for RCFile

    String obj = getUDFProperties().getProperty("requiredFieldList");
    if (obj == null) {
      // front end or there is no projection set
      ColumnProjectionUtils.setFullyReadColumns(HadoopCompat.getConfiguration(job));
      return;
    }

    RequiredFieldList fieldList = (RequiredFieldList)
                                  ObjectSerializer.deserialize(obj);

    ArrayList<Integer> ids = new ArrayList<Integer>();
    requiredColumns = new int[fieldList.getFields().size()];
    int i = 0;
    for (RequiredField rf : fieldList.getFields()) {
      requiredColumns[i++] = rf.getIndex();
      ids.add(rf.getIndex());
    }

    ColumnProjectionUtils.setReadColumnIDs(HadoopCompat.getConfiguration(job), ids);
  }

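  /**
   * Records the number of columns in the store schema; RCFile requires the
   * column count to be set in the job configuration before writing.
   */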
  @Override
  public void checkSchema(ResourceSchema s) throws IOException {
    super.checkSchema(s);
    getUDFProperties().setProperty("numColumns",
                                   Integer.toString(s.getFields().length));
  }

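  /**
   * Passes the column count saved in {@link #checkSchema(ResourceSchema)}
   * to {@link RCFileOutputFormat}.
   */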
  @Override
  public void setStoreLocation(String location, Job job) throws IOException {
    super.setStoreLocation(location, job);
    // set number of columns if this is set in context.
    Properties p = getUDFProperties();
    if (p != null) {
      numColumns = Integer.parseInt(p.getProperty("numColumns", "-1"));
    }

    if (numColumns > 0) {
      RCFileOutputFormat.setColumnNumber(HadoopCompat.getConfiguration(job), numColumns);
    }
  }

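  /**
   * Reads the next row and converts each (projected) column into a
   * {@link DataByteArray} field of the returned tuple. Empty columns are
   * left as null fields.
   */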
  @Override
  public Tuple getNext() throws IOException {
    try {
      if (!in.nextKeyValue()) {
        return null;
      }

      BytesRefArrayWritable byteRefs = (BytesRefArrayWritable) in.getCurrentValue();

      boolean isProjected = requiredColumns != null;
      int inputSize = byteRefs.size();
      int tupleSize = isProjected ? requiredColumns.length : inputSize;

      Tuple tuple = tupleFactory.newTuple(tupleSize);
      int tupleIdx = 0;

      for (int i=0; i<inputSize && tupleIdx<tupleSize; i++) {
        if (!isProjected || i == requiredColumns[tupleIdx]) {
          // set if all the fields are required or the field is projected
          BytesRefWritable ref = byteRefs.get(i);
          if (ref != null && ref.getLength() > 0) {
            tuple.set(tupleIdx, new DataByteArray(ref.getBytesCopy()));
          }
          tupleIdx++;
        }
      }

      return tuple;
    } catch (InterruptedException e) {
      throw new IOException(e);
    }
  }

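  /**
   * Serializes each tuple field as text (exactly like PigStorage) and writes
   * the row as a {@link BytesRefArrayWritable}. The per-column writables are
   * created lazily on the first call, once the column count is known.
   */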
  @SuppressWarnings("unchecked")
  @Override
  public void putNext(Tuple t) throws IOException {
    // convert tuple fields to set of byte arrays and write to RCFile

    if (rowWritable == null) { // initialize
      if (numColumns < 1) {
        throw new IOException("number of columns is not set");
      }

      byteStream = new ByteStream.Output();
      rowWritable = new BytesRefArrayWritable();
      colValRefs = new BytesRefWritable[numColumns];

      for (int i = 0; i < numColumns; i++) {
        colValRefs[i] = new BytesRefWritable();
        rowWritable.set(i, colValRefs[i]);
      }
    }

    byteStream.reset();

    // write each field as a text (just like PigStorage)
    int sz = t.size();
    int startPos = 0;

    for (int i = 0; i < sz && i < numColumns; i++) {

      StorageUtil.putField(byteStream, t.get(i));
      colValRefs[i].set(byteStream.getData(),
                        startPos,
                        byteStream.getCount() - startPos);
      startPos = byteStream.getCount();
    }

    try {
      writer.write(null, rowWritable);
    } catch (InterruptedException e) {
      throw new IOException(e);
    }
  }
}