package com.twitter.elephantbird.pig.store;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Properties;

import org.apache.hadoop.hive.ql.io.RCFileInputFormat;
import org.apache.hadoop.hive.serde2.ByteStream;
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable;
import org.apache.hadoop.hive.serde2.columnar.BytesRefWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.pig.ResourceSchema;
import org.apache.pig.builtin.PigStorage;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.util.ObjectSerializer;
import org.apache.pig.impl.util.StorageUtil;
import org.apache.pig.impl.util.UDFContext;

import com.twitter.elephantbird.mapreduce.input.MapReduceInputFormatWrapper;
import com.twitter.elephantbird.mapreduce.output.RCFileOutputFormat;
import com.twitter.elephantbird.util.HadoopCompat;

/**
* RCFile version of PigStorage. Each field of a tuple is stored as a
* separate column in the RCFile; field values are serialized as text, the
* same way PigStorage serializes them. The Hive jars that provide the
* RCFile classes must be registered in the script. <p>
*
* Usage: <pre>
* register 'libs/*hive-common*.jar';
* register 'libs/*hive-exec*.jar';
*
* a = load 'input' using RCFilePigStorage() as (a:int, b:chararray, c:long);
*
* b = foreach a generate a, TOTUPLE(a, c);
* store b into 'output' using RCFilePigStorage();
*
* </pre>
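*
* Column projection is pushed down to RCFile, so only the columns a script
* actually references are read from disk. A hypothetical follow-up to the
* script above (relation and field names are assumed): <pre>
* c = load 'output' using RCFilePigStorage() as (x:int, t);
* d = foreach c generate x;  -- only the first column is read from the RCFile
* </pre>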
*/
public class RCFilePigStorage extends PigStorage {
private TupleFactory tupleFactory = TupleFactory.getInstance();
// for loader
private int[] requiredColumns = null;
// for storage
private int numColumns = -1;
private ByteStream.Output byteStream;
private BytesRefArrayWritable rowWritable;
private BytesRefWritable[] colValRefs;
public RCFilePigStorage() {
super();
}
/* TODO(raghu): support extra options for PigStorage() with pig 11
* public RCFilePigStorage(String options) {
* super("\t", options); // delimiter does not affect us
* }
*/
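/**
* Properties keyed by this class and the UDF signature. They carry the
* pushed projection and the output column count from the Pig frontend to
* the load and store tasks.
*/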
private Properties getUDFProperties() {
return UDFContext.getUDFContext()
.getUDFProperties(this.getClass(), new String[] { signature });
}
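/**
* Hive's RCFileInputFormat uses the older "mapred" API, so it is wrapped in
* MapReduceInputFormatWrapper to look like a "mapreduce" InputFormat to Pig.
*/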
@Override
public InputFormat<LongWritable, BytesRefArrayWritable> getInputFormat() {
return new MapReduceInputFormatWrapper<LongWritable, BytesRefArrayWritable>
(new RCFileInputFormat<LongWritable, BytesRefArrayWritable>());
}
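/**
* Note that this is elephant-bird's RCFileOutputFormat (mapreduce API),
* not Hive's.
*/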
@Override
public OutputFormat<NullWritable, Writable> getOutputFormat() {
return new RCFileOutputFormat();
}
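/**
* Remembers the columns required by the script in the UDF properties so
* that {@link #setLocation(String, Job)} can restrict which columns RCFile
* reads.
*/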
@Override
public RequiredFieldResponse pushProjection(RequiredFieldList requiredFieldList)
throws FrontendException {
// no need to invoke super.pushProjection();
try {
getUDFProperties().setProperty("requiredFieldList",
ObjectSerializer.serialize(requiredFieldList));
} catch (IOException e) {
throw new RuntimeException(e);
}
return new RequiredFieldResponse(true);
}
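/**
* Sets the column ids RCFile should read, based on the projection recorded
* in pushProjection(). When no projection was pushed, all columns are read.
*/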
@Override
public void setLocation(String location, Job job) throws IOException {
super.setLocation(location, job);
// sets columnIds config for RCFile
String obj = getUDFProperties().getProperty("requiredFieldList");
if (obj == null) {
// on the front end, or no projection was pushed down: read all columns
ColumnProjectionUtils.setFullyReadColumns(HadoopCompat.getConfiguration(job));
return;
}
RequiredFieldList fieldList = (RequiredFieldList)
ObjectSerializer.deserialize(obj);
ArrayList<Integer> ids = new ArrayList<Integer>();
requiredColumns = new int[fieldList.getFields().size()];
int i = 0;
for (RequiredField rf : fieldList.getFields()) {
requiredColumns[i++] = rf.getIndex();
ids.add(rf.getIndex());
}
ColumnProjectionUtils.setReadColumnIDs(HadoopCompat.getConfiguration(job), ids);
}
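/**
* Records the number of columns in the store schema so that
* setStoreLocation() can pass it to RCFileOutputFormat.
*/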
@Override
public void checkSchema(ResourceSchema s) throws IOException {
super.checkSchema(s);
getUDFProperties().setProperty("numColumns",
Integer.toString(s.getFields().length));
}
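/**
* Configures RCFileOutputFormat with the column count recorded by
* checkSchema().
*/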
@Override
public void setStoreLocation(String location, Job job) throws IOException {
super.setStoreLocation(location, job);
// pick up the column count recorded by checkSchema(), if present
Properties p = getUDFProperties();
if (p != null) {
numColumns = Integer.parseInt(p.getProperty("numColumns", "-1"));
}
if (numColumns > 0) {
RCFileOutputFormat.setColumnNumber(HadoopCompat.getConfiguration(job), numColumns);
}
}
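/**
* Reads the next row as a BytesRefArrayWritable and copies each required
* column into the tuple as a DataByteArray. Columns that are missing or
* empty are left as null fields.
*/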
@Override
public Tuple getNext() throws IOException {
try {
if (!in.nextKeyValue()) {
return null;
}
BytesRefArrayWritable byteRefs = (BytesRefArrayWritable) in.getCurrentValue();
boolean isProjected = requiredColumns != null;
int inputSize = byteRefs.size();
int tupleSize = isProjected ? requiredColumns.length : inputSize;
Tuple tuple = tupleFactory.newTuple(tupleSize);
int tupleIdx = 0;
for (int i = 0; i < inputSize && tupleIdx < tupleSize; i++) {
if (!isProjected || i == requiredColumns[tupleIdx]) {
// copy this column when all fields are required or it is the next projected column
BytesRefWritable ref = byteRefs.get(i);
if (ref != null && ref.getLength() > 0) {
tuple.set(tupleIdx, new DataByteArray(ref.getBytesCopy()));
}
tupleIdx++;
}
}
return tuple;
} catch (InterruptedException e) {
throw new IOException(e);
}
}
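/**
* Serializes each tuple field as text (the same format PigStorage writes)
* into a shared byte stream and writes the row as a BytesRefArrayWritable.
* The per-column writables are allocated lazily on the first call.
*/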
@SuppressWarnings("unchecked")
@Override
public void putNext(Tuple t) throws IOException {
// convert tuple fields to set of byte arrays and write to RCFile
if (rowWritable == null) { // initialize
if (numColumns < 1) {
throw new IOException("number of columns is not set");
}
byteStream = new ByteStream.Output();
rowWritable = new BytesRefArrayWritable();
colValRefs = new BytesRefWritable[numColumns];
for (int i = 0; i < numColumns; i++) {
colValRefs[i] = new BytesRefWritable();
rowWritable.set(i, colValRefs[i]);
}
}
byteStream.reset();
// serialize each field as text (just like PigStorage)
int sz = t.size();
int startPos = 0;
for (int i = 0; i < sz && i < numColumns; i++) {
StorageUtil.putField(byteStream, t.get(i));
colValRefs[i].set(byteStream.getData(),
startPos,
byteStream.getCount() - startPos);
startPos = byteStream.getCount();
}
try {
writer.write(null, rowWritable);
} catch (InterruptedException e) {
throw new IOException(e);
}
}
}