package com.twitter.elephantbird.pig.load;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.Iterator;
import com.twitter.elephantbird.pig.util.PigTestUtil;
import com.twitter.elephantbird.util.HadoopCompat;
import com.twitter.elephantbird.util.CoreTestUtil;
import org.apache.commons.codec.binary.Base64;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.pig.PigServer;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.Tuple;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
import com.google.protobuf.Message;
import com.twitter.data.proto.tutorial.AddressBookProtos.Person;
import com.twitter.data.proto.tutorial.AddressBookProtos.PersonWithoutEmail;
import com.twitter.data.proto.tutorial.AddressBookProtos.Person.PhoneNumber;
import com.twitter.data.proto.tutorial.AddressBookProtos.Person.PhoneType;
import com.twitter.elephantbird.mapreduce.io.ProtobufWritable;
import com.twitter.elephantbird.mapreduce.output.RCFileProtobufOutputFormat;
import com.twitter.elephantbird.pig.piggybank.ProtobufBytesToTuple;
import com.twitter.elephantbird.pig.store.RCFileProtobufPigStorage;
import com.twitter.elephantbird.pig.util.ProtobufToPig;
import com.twitter.elephantbird.util.Codecs;
import com.twitter.elephantbird.util.Protobufs;
/**
* Test RCFile loader and storage with Protobufs.
*/
public class TestRCFileProtobufStorage {
private PigServer pigServer;
private final String testDir =
CoreTestUtil.getTestDataDir(TestRCFileProtobufStorage.class);
private final File inputDir = new File(testDir, "in");
private final File rcfile_in = new File(testDir, "rcfile_in");
private final Person[] records = new Person[]{
makePerson(0),
makePerson(1),
makePerson(2),
makePersonWithDefaults(3, true),
makePersonWithDefaults(4, false),
makePersonWithDefaults(4, true) };
private static final Base64 base64 = Codecs.createStandardBase64();
public static class B64ToTuple extends ProtobufBytesToTuple<Message> {
public B64ToTuple(String className) {
super(className);
}
@Override
public Tuple exec(Tuple input) throws IOException {
byte[] bytes = ((DataByteArray)input.get(0)).get();
input.set(0, new DataByteArray(base64.decode(bytes)));
return super.exec(input);
}
}
@Before
public void setUp() throws Exception {
FileUtil.fullyDelete(new File(testDir));
pigServer = PigTestUtil.makePigServer();
inputDir.mkdirs();
// create an text file with b64 encoded protobufs
FileOutputStream out = new FileOutputStream(new File(inputDir, "persons_b64.txt"));
for (Person rec : records) {
out.write(base64.encode(rec.toByteArray()));
out.write('\n');
}
out.close();
}
@Test
public void testRCFileStorage() throws Exception {
/* create a directory with three rcfiles :
* - one created with normal Person objects using RCFileProtobufPigStorage.
* - one created with Person objects where the optional fields are not set.
* - other with PersonWithoutEmail (for testing unknown fields)
* using the same objects as the first one.
*
* Then load both files using RCFileProtobufPigLoader
*/
// write to rcFile using RCFileProtobufStorage
for(String line : String.format(
"DEFINE b64ToTuple %s('%s');\n" +
"A = load '%s' as (line);\n" +
"A = foreach A generate b64ToTuple(line) as t;\n" +
"A = foreach A generate FLATTEN(t);\n" +
"STORE A into '%s' using %s('%s');\n"
, B64ToTuple.class.getName()
, Person.class.getName()
, inputDir.toURI().toString()
, rcfile_in.toURI().toString()
, RCFileProtobufPigStorage.class.getName()
, Person.class.getName()
).split("\n")) {
pigServer.registerQuery(line + "\n");
}
// create an rcfile with Person objects directly with out converting to a
// tuple so that optional fields that are not set are null in RCFile
ProtobufWritable<Person> personWritable = ProtobufWritable.newInstance(Person.class);
RecordWriter<Writable, Writable> protoWriter =
createProtoWriter(Person.class,
new File(rcfile_in, "persons_with_unset_fields.rc"));
for(Person person : records) {
personWritable.set(person);
protoWriter.write(null, personWritable);
}
protoWriter.close(null);
// create an rcFile with PersonWithoutEmail to test unknown fields
ProtobufWritable<PersonWithoutEmail> pweWritable =
ProtobufWritable.newInstance(PersonWithoutEmail.class);
protoWriter = createProtoWriter(PersonWithoutEmail.class,
new File(rcfile_in, "persons_with_unknows.rc"));
for(Person person : records) {
pweWritable.set(PersonWithoutEmail.newBuilder()
.mergeFrom(person.toByteArray()).build());
protoWriter.write(null, pweWritable);
}
protoWriter.close(null);
// load all the files
pigServer.registerQuery(String.format(
"A = load '%s' using %s('%s');\n"
, rcfile_in.toURI().toString()
, RCFileProtobufPigLoader.class.getName()
, Person.class.getName()));
// verify the result:
Iterator<Tuple> rows = pigServer.openIterator("A");
for (int i=0; i<3; i++) {
for(Person person : records) {
String expected = personToString(person);
Assert.assertEquals(expected, rows.next().toString());
}
}
// clean up on successful run
FileUtil.fullyDelete(new File(testDir));
}
@SuppressWarnings("unchecked")
private static RecordWriter<Writable, Writable>
createProtoWriter(Class<?> protoClass, final File file)
throws IOException, InterruptedException {
OutputFormat outputFormat = (
new RCFileProtobufOutputFormat(Protobufs.getTypeRef(protoClass.getName())) {
@Override
public Path getDefaultWorkFile(TaskAttemptContext context,
String extension) throws IOException {
return new Path(file.toURI().toString());
}
});
Configuration conf = new Configuration();
// TODO: figure out why Gzip or BZip2 compression fails on OSX
// conf.setBoolean("mapred.output.compress", true);
// conf.set("mapred.output.compression.codec", "org.apache.hadoop.io.compress.BZip2Codec");
return outputFormat.getRecordWriter(
HadoopCompat.newTaskAttemptContext(conf, new TaskAttemptID()));
}
// return a Person object
private static Person makePerson(int index) {
return Person.newBuilder()
.setName("bob_" + index + " jenkins")
.setId(index)
.setEmail("bob_" + index + "@example.com")
.addPhone(
PhoneNumber.newBuilder()
.setNumber("408-555-" + (5555 + index))
.setType(PhoneType.MOBILE))
.build();
}
// return a Person object. don't set optional fields
private static Person makePersonWithDefaults(int index, boolean add_phone) {
Person.Builder builder =
Person.newBuilder()
.setName("bob_" + index + " jenkins")
.setId(index);
if (add_phone) {
builder.addPhone(PhoneNumber.newBuilder()
.setNumber("408-555-" + (5555 + index)));
}
return builder.build();
}
private static String personToString(Person person) {
return new ProtobufToPig().toTuple(person).toString();
}
}