Package com.twitter.elephantbird.pig.load

Source Code of com.twitter.elephantbird.pig.load.TestRCFileThriftStorage$B64ToTuple

package com.twitter.elephantbird.pig.load;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.Iterator;

import com.twitter.elephantbird.pig.util.PigTestUtil;
import com.twitter.elephantbird.util.HadoopCompat;
import com.twitter.elephantbird.util.CoreTestUtil;

import org.apache.commons.codec.binary.Base64;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.pig.PigServer;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.Tuple;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;

import com.google.common.collect.ImmutableMap;
import com.twitter.elephantbird.mapreduce.io.ThriftConverter;
import com.twitter.elephantbird.mapreduce.output.RCFileThriftOutputFormat;
import com.twitter.elephantbird.pig.piggybank.ThriftBytesToTuple;
import com.twitter.elephantbird.pig.store.RCFileThriftPigStorage;
import com.twitter.elephantbird.pig.util.ThriftToPig;
import com.twitter.elephantbird.thrift.test.TestName;
import com.twitter.elephantbird.thrift.test.TestPerson;
import com.twitter.elephantbird.thrift.test.TestPersonExtended;
import com.twitter.elephantbird.thrift.test.TestPhoneType;
import com.twitter.elephantbird.util.Codecs;
import com.twitter.elephantbird.util.ThriftUtils;

/**
* Test RCFile loader and storage with Thrift objects
*/
public class TestRCFileThriftStorage {

  private PigServer pigServer;
  private final String testDir =
                    CoreTestUtil.getTestDataDir(TestRCFileThriftStorage.class);
  private final File inputDir = new File(testDir, "in");
  private final File rcfile_in = new File(testDir, "rcfile_in");

  private ThriftToPig<TestPersonExtended> thriftToPig = ThriftToPig.newInstance(TestPersonExtended.class);
  private ThriftConverter<TestPersonExtended> thriftConverter = ThriftConverter.newInstance(TestPersonExtended.class);

  private final TestPersonExtended[] records = new TestPersonExtended[]{
                                                    makePerson(0),
                                                    makePerson(1),
                                                    makePerson(2) };

  private static final Base64 base64 = Codecs.createStandardBase64();

  public static class B64ToTuple extends ThriftBytesToTuple<TestPersonExtended> {
    public B64ToTuple(String className) {
      super(className);
    }

    @Override
    public Tuple exec(Tuple input) throws IOException {
      byte[] bytes = ((DataByteArray)input.get(0)).get();
      input.set(0, new DataByteArray(base64.decode(bytes)));
      return super.exec(input);
    }
  }

  @Before
  public void setUp() throws Exception {

    FileUtil.fullyDelete(new File(testDir));

    pigServer = PigTestUtil.makePigServer();

    pigServer.getPigContext().getProperties().setProperty(
        "mapred.output.compress", "true"); //default codec

    inputDir.mkdirs();

    // create an text file with b64 encoded thrift objects.

    FileOutputStream out = new FileOutputStream(new File(inputDir, "persons_b64.txt"));
    for (TestPersonExtended rec : records) {
      out.write(base64.encode(thriftConverter.toBytes(rec)));
      out.write('\n');
    }
    out.close();
  }

  @Test
  public void testRCFileSThrifttorage() throws Exception {
    /* Create a directory with two files:
     *  - one created with TestPersonExtended objects using RCFileThriftPigStorage
     *  - one created with TestPerson using serialized TestPersonExtended objects
     *         to test handling of unknown fields.
     *
     *  Then load both files using RCFileThriftPigLoader.
     */

    // write to rcFile using RCFileThriftPigStorage
    for(String line : String.format(
            "DEFINE b64ToTuple %s('%s');\n" +
            "A = load '%s' as (line);\n" +
            "A = foreach A generate b64ToTuple(line) as t;\n" +
            "A = foreach A generate FLATTEN(t);\n" +
            "STORE A into '%s' using %s('%s');\n"

            , B64ToTuple.class.getName()
            , TestPersonExtended.class.getName()
            , inputDir.toURI().toString()
            , rcfile_in.toURI().toString()
            , RCFileThriftPigStorage.class.getName()
            , TestPersonExtended.class.getName()

            ).split("\n")) {

      pigServer.registerQuery(line + "\n");
    }
    // the RCFile created above has 5 columns : 4 fields in extended Person
    // and one for unknown fields (this column is empty in this case).

    // store another file with unknowns, by writing the person object with
    // TestPerson rather than with TestPersionExtended, but using
    // serialized TestPersionExtended objects.

    RecordWriter<Writable, Writable> thriftWriter =
      createThriftWriter(TestPerson.class, new File(rcfile_in, "persons_with_unknows.rc"));
    for(TestPersonExtended person : records) {
      // write the bytes from TestPersonExtened
      thriftWriter.write(null, new BytesWritable(thriftConverter.toBytes(person)));
    }
    thriftWriter.close(null);
    // this RCFile has 3 columns : 2 fields in TestPerson and one for unknown
    // fields. The unknowns-columns contains 2 fields from TestPersonExtended
    // that are not understood by TestPerson.

    // load using RCFileThriftPigLoader
    pigServer.registerQuery(String.format(
        "A = load '%s' using %s('%s');\n"
        , rcfile_in.toURI().toString()
        , RCFileThriftPigLoader.class.getName()
        , TestPersonExtended.class.getName()));

    // verify the result:
    Iterator<Tuple> rows = pigServer.openIterator("A");
    for (int i=0; i<2; i++) {
      for(TestPersonExtended person : records) {
        String expected = personToString(person);
        Assert.assertEquals(expected, rows.next().toString());
      }
    }

    // clean up on successful run
    FileUtil.fullyDelete(new File(testDir));
  }

  // return a Person thrift object
  private TestPersonExtended makePerson(int index) {
    return new TestPersonExtended(
              new TestName("bob " + index, "jenkins"),
              ImmutableMap.of(TestPhoneType.HOME,
                              "408-555-5555" + "ex" + index),
              "bob_" + index + "@examle.com",
              new TestName("alice " + index, "smith"));
  }

  @SuppressWarnings("unchecked")
  private static RecordWriter<Writable, Writable>
  createThriftWriter(Class<?> thriftClass, final File file)
                    throws IOException, InterruptedException {

    OutputFormat outputFormat = (
      new RCFileThriftOutputFormat(ThriftUtils.getTypeRef(thriftClass.getName())) {
        @Override
        public Path getDefaultWorkFile(TaskAttemptContext context,
            String extension) throws IOException {
          return new Path(file.toURI().toString());
        }
    });

    Configuration conf = new Configuration();
    // TODO: figure out why Gzip or BZip2 compression fails on OSX
    //conf.setBoolean("mapred.output.compress", true);
    //conf.set("mapred.output.compression.codec", "org.apache.hadoop.io.compress.BZip2Codec");


    return outputFormat.getRecordWriter(
        HadoopCompat.newTaskAttemptContext(conf, new TaskAttemptID()));
  }

  private String personToString(TestPersonExtended person) {
    return thriftToPig.getPigTuple(person).toString();
  }

}
TOP

Related Classes of com.twitter.elephantbird.pig.load.TestRCFileThriftStorage$B64ToTuple

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.