Package com.twitter.elephantbird.pig.load

Source Code of com.twitter.elephantbird.pig.load.TestRCFileProtobufStorage$B64ToTuple

package com.twitter.elephantbird.pig.load;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.Iterator;

import com.twitter.elephantbird.pig.util.PigTestUtil;
import com.twitter.elephantbird.util.HadoopCompat;
import com.twitter.elephantbird.util.CoreTestUtil;

import org.apache.commons.codec.binary.Base64;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.pig.PigServer;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.Tuple;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;

import com.google.protobuf.Message;
import com.twitter.data.proto.tutorial.AddressBookProtos.Person;
import com.twitter.data.proto.tutorial.AddressBookProtos.PersonWithoutEmail;
import com.twitter.data.proto.tutorial.AddressBookProtos.Person.PhoneNumber;
import com.twitter.data.proto.tutorial.AddressBookProtos.Person.PhoneType;
import com.twitter.elephantbird.mapreduce.io.ProtobufWritable;
import com.twitter.elephantbird.mapreduce.output.RCFileProtobufOutputFormat;
import com.twitter.elephantbird.pig.piggybank.ProtobufBytesToTuple;
import com.twitter.elephantbird.pig.store.RCFileProtobufPigStorage;
import com.twitter.elephantbird.pig.util.ProtobufToPig;
import com.twitter.elephantbird.util.Codecs;
import com.twitter.elephantbird.util.Protobufs;

/**
* Test RCFile loader and storage with Protobufs.
*/
public class TestRCFileProtobufStorage {

  private PigServer pigServer;
  private final String testDir =
          CoreTestUtil.getTestDataDir(TestRCFileProtobufStorage.class);
  private final File inputDir = new File(testDir, "in");
  private final File rcfile_in = new File(testDir, "rcfile_in");

  private final Person[] records = new Person[]{
                                          makePerson(0),
                                          makePerson(1),
                                          makePerson(2),
                                          makePersonWithDefaults(3, true),
                                          makePersonWithDefaults(4, false),
                                          makePersonWithDefaults(4, true) };

  private static final Base64 base64 = Codecs.createStandardBase64();

  public static class B64ToTuple extends ProtobufBytesToTuple<Message> {
    public B64ToTuple(String className) {
      super(className);
    }

    @Override
    public Tuple exec(Tuple input) throws IOException {
      byte[] bytes = ((DataByteArray)input.get(0)).get();
      input.set(0, new DataByteArray(base64.decode(bytes)));
      return super.exec(input);
    }
  }

  @Before
  public void setUp() throws Exception {

    FileUtil.fullyDelete(new File(testDir));

    pigServer = PigTestUtil.makePigServer();

    inputDir.mkdirs();

    // create an text file with b64 encoded protobufs

    FileOutputStream out = new FileOutputStream(new File(inputDir, "persons_b64.txt"));
    for (Person rec : records) {
      out.write(base64.encode(rec.toByteArray()));
      out.write('\n');
    }
    out.close();
  }

  @Test
  public void testRCFileStorage() throws Exception {
    /* create a directory with three rcfiles :
     *  - one created with normal Person objects using RCFileProtobufPigStorage.
     *  - one created with Person objects where the optional fields are not set.
     *  - other with PersonWithoutEmail (for testing unknown fields)
     *    using the same objects as the first one.
     *
     * Then load both files using RCFileProtobufPigLoader
     */

    // write to rcFile using RCFileProtobufStorage
    for(String line : String.format(
            "DEFINE b64ToTuple %s('%s');\n" +
            "A = load '%s' as (line);\n" +
            "A = foreach A generate b64ToTuple(line) as t;\n" +
            "A = foreach A generate FLATTEN(t);\n" +
            "STORE A into '%s' using %s('%s');\n"

            , B64ToTuple.class.getName()
            , Person.class.getName()
            , inputDir.toURI().toString()
            , rcfile_in.toURI().toString()
            , RCFileProtobufPigStorage.class.getName()
            , Person.class.getName()

            ).split("\n")) {

      pigServer.registerQuery(line + "\n");
    }

    // create an rcfile with Person objects directly with out converting to a
    // tuple so that optional fields that are not set are null in RCFile

    ProtobufWritable<Person> personWritable = ProtobufWritable.newInstance(Person.class);

    RecordWriter<Writable, Writable> protoWriter =
            createProtoWriter(Person.class,
                              new File(rcfile_in, "persons_with_unset_fields.rc"));

    for(Person person : records) {
      personWritable.set(person);
      protoWriter.write(null, personWritable);
    }
    protoWriter.close(null);

    // create an rcFile with PersonWithoutEmail to test unknown fields

    ProtobufWritable<PersonWithoutEmail> pweWritable =
            ProtobufWritable.newInstance(PersonWithoutEmail.class);

    protoWriter = createProtoWriter(PersonWithoutEmail.class,
                                    new File(rcfile_in, "persons_with_unknows.rc"));

    for(Person person : records) {
      pweWritable.set(PersonWithoutEmail.newBuilder()
                        .mergeFrom(person.toByteArray()).build());
      protoWriter.write(null, pweWritable);
    }
    protoWriter.close(null);

    // load all the files
    pigServer.registerQuery(String.format(
        "A = load '%s' using %s('%s');\n"
        , rcfile_in.toURI().toString()
        , RCFileProtobufPigLoader.class.getName()
        , Person.class.getName()));

    // verify the result:
    Iterator<Tuple> rows = pigServer.openIterator("A");
    for (int i=0; i<3; i++) {
      for(Person person : records) {
        String expected = personToString(person);
        Assert.assertEquals(expected, rows.next().toString());
      }
    }

    // clean up on successful run
    FileUtil.fullyDelete(new File(testDir));
  }

  @SuppressWarnings("unchecked")
  private static RecordWriter<Writable, Writable>
  createProtoWriter(Class<?> protoClass, final File file)
                    throws IOException, InterruptedException {

    OutputFormat outputFormat = (
      new RCFileProtobufOutputFormat(Protobufs.getTypeRef(protoClass.getName())) {
        @Override
        public Path getDefaultWorkFile(TaskAttemptContext context,
            String extension) throws IOException {
          return new Path(file.toURI().toString());
        }
    });

    Configuration conf = new Configuration();
    // TODO: figure out why Gzip or BZip2 compression fails on OSX
    // conf.setBoolean("mapred.output.compress", true);
    // conf.set("mapred.output.compression.codec", "org.apache.hadoop.io.compress.BZip2Codec");

    return outputFormat.getRecordWriter(
        HadoopCompat.newTaskAttemptContext(conf, new TaskAttemptID()));
  }

  // return a Person object
  private static Person makePerson(int index) {
    return Person.newBuilder()
      .setName("bob_" + index + " jenkins")
      .setId(index)
      .setEmail("bob_" + index + "@example.com")
      .addPhone(
          PhoneNumber.newBuilder()
              .setNumber("408-555-" + (5555 + index))
              .setType(PhoneType.MOBILE))
      .build();
  }

  // return a Person object. don't set optional fields
  private static Person makePersonWithDefaults(int index, boolean add_phone) {
    Person.Builder builder =
            Person.newBuilder()
            .setName("bob_" + index + " jenkins")
            .setId(index);
    if (add_phone) {
      builder.addPhone(PhoneNumber.newBuilder()
                                  .setNumber("408-555-" + (5555 + index)));
    }
    return builder.build();
  }

  private static String personToString(Person person) {
    return new ProtobufToPig().toTuple(person).toString();
  }
}
TOP

Related Classes of com.twitter.elephantbird.pig.load.TestRCFileProtobufStorage$B64ToTuple

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.