Package org.apache.pig.test

Source Code of org.apache.pig.test.TestBuiltInBagToTupleOrString

/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.pig.test;

import static junit.framework.Assert.*;

import java.util.Iterator;
import java.util.List;

import org.apache.pig.ExecType;
import org.apache.pig.PigServer;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.builtin.BagToString;
import org.apache.pig.builtin.BagToTuple;
import org.apache.pig.builtin.mock.Storage.Data;
import org.apache.pig.data.BagFactory;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.logicalLayer.schema.Schema;
import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema;
import static org.apache.pig.builtin.mock.Storage.*;

import org.junit.Test;

/**
*
* Test cases for BagToTuple and BagToString UDFs
*
* @author hluu
*
*/
public class TestBuiltInBagToTupleOrString {
  private BagFactory bf = BagFactory.getInstance();
  private TupleFactory tf = TupleFactory.getInstance();

  @Test
  public void testNullInputBagToTupleUDF() throws Exception {
    BagToTuple udf = new BagToTuple();
    Tuple udfInput = tf.newTuple(1);
    udfInput.set(0, null);
    Tuple output = udf.exec(udfInput);
    assertNull(output);
  }

  @Test
  public void testBasicBagToTupleUDF() throws Exception {

    Tuple t1 = tf.newTuple(2);
    t1.set(0, "a");
    t1.set(1, 5);

    Tuple t2 = tf.newTuple(2);
    t2.set(0, "c");
    t2.set(1, 6);

    DataBag bag = bf.newDefaultBag();
    bag.add(t1);
    bag.add(t2);

    Tuple udfInput = tf.newTuple(1);
    udfInput.set(0, bag);

    // invoking UDF
    BagToTuple udf = new BagToTuple();
    Tuple result = udf.exec(udfInput);

    int totalExpectedSize = t1.size() + t2.size();
    assertEquals(totalExpectedSize, result.size());

    for (int i = 0; i < t1.size(); i++) {
      assertEquals(t1.get(i), result.get(i));
    }

    for (int i = 0; i < t2.size(); i++) {
      assertEquals(t2.get(i), result.get(t1.size() + i));
    }
  }

  @Test
  public void testNonuniformTuplesInBagForBagToTupleUDF() throws Exception {

    Tuple t1 = tf.newTuple(2);
    t1.set(0, "a");
    t1.set(1, 5);

    Tuple t2 = tf.newTuple(3);
    t2.set(0, "b");
    t2.set(1, 6);
    t2.set(2, 7);

    Tuple t3 = tf.newTuple(4);
    t3.set(0, "c");
    t3.set(1, 8);
    t3.set(2, 9.7);
    t3.set(3, 10);

    DataBag bag = bf.newDefaultBag();
    bag.add(t1);
    bag.add(t2);
    bag.add(t3);

    Tuple udfInput = tf.newTuple(1);
    udfInput.set(0, bag);

    // invoking UDF
    BagToTuple udf = new BagToTuple();
    Tuple outputTuple = udf.exec(udfInput);

    int totalExpectedSize = t1.size() + t2.size() + t3.size();
    assertEquals(totalExpectedSize, outputTuple.size());

    for (int i = 0; i < t1.size(); i++) {
      assertEquals(t1.get(i), outputTuple.get(i));
    }

    for (int i = 0; i < t2.size(); i++) {
      assertEquals(t2.get(i), outputTuple.get(t1.size() + i));
    }

    int startIndex = t1.size() + t2.size();
    for (int i = 0; i < t3.size(); i++) {
      assertEquals(t3.get(i), outputTuple.get(startIndex + i));
    }
  }

  @Test
  public void testNestedDataElementsForBagToTupleUDF() throws Exception {

    DataBag inputBag = buildBagWithNestedTupleAndBag();


    BagToTuple udf = new BagToTuple();
    Tuple udfInput = tf.newTuple(1);
    udfInput.set(0, inputBag);
    Tuple outputTuple = udf.exec(udfInput);


    Iterator<Tuple> inputBagIterator = inputBag.iterator();
    Tuple firstTuple = inputBagIterator.next();
    for (int i = 0; i < firstTuple.size(); i++) {
      assertEquals(firstTuple.get(i), outputTuple.get(i));
    }

    Tuple secondTuple = inputBagIterator.next();
    for (int i = 0; i < secondTuple.size(); i++) {
      assertEquals(secondTuple.get(i), outputTuple.get(firstTuple.size() + i));
    }

    int startIndex = firstTuple.size() + secondTuple.size();
    Tuple thirdTuple = inputBagIterator.next();
    for (int i = 0; i < thirdTuple.size(); i++) {
      assertEquals(thirdTuple.get(i), outputTuple.get(startIndex + i));
    }
  }

  @Test
  public void testOutputSchemaForBagToTupleUDF() throws Exception {
    Schema expectedSch = Schema.generateNestedSchema(DataType.TUPLE,
        DataType.INTEGER, DataType.CHARARRAY);

    FieldSchema tupSch = new FieldSchema(null, DataType.TUPLE);
    tupSch.schema = new Schema();
    tupSch.schema.add(new FieldSchema(null, DataType.INTEGER));
    tupSch.schema.add(new FieldSchema(null, DataType.CHARARRAY));

    FieldSchema bagSch = new FieldSchema(null, DataType.BAG);
    bagSch.schema = new Schema(tupSch);

    Schema inputSch = new Schema();
    inputSch.add(bagSch);

    BagToTuple udf = new BagToTuple();
    Schema outputSchema = udf.outputSchema(inputSch);

    assertEquals("schema of BagToTuple input", expectedSch.size(),
        outputSchema.size());
    assertTrue("schema of BagToTuple input",
        Schema.equals(expectedSch, outputSchema, false, true));
  }

  @Test(expected=org.apache.pig.backend.executionengine.ExecException.class)
  public void testInvalidInputToBagToTupleUDF() throws Exception {
    TupleFactory tf = TupleFactory.getInstance();
    Tuple udfInput = tf.newTuple(1);
    // input contains tuple instead of bag
    udfInput.set(0, tf.newTuple());
    BagToTuple udf = new BagToTuple();

    // expecting an exception because the input if of type Tuple, not DataBag
    udf.exec(udfInput);
  }


  @Test
  public void testNullInputBagToStringUDF() throws Exception {
    BagToString udf = new BagToString();
    Tuple udfInput = tf.newTuple(1);
    udfInput.set(0, null);
    String output = udf.exec(udfInput);
    assertNull(output);
  }

  @Test(expected=org.apache.pig.backend.executionengine.ExecException.class)
  public void testInvalidInputForBagToStringUDF() throws Exception {
    TupleFactory tf = TupleFactory.getInstance();
    Tuple udfInput = tf.newTuple(1);
    // input contains tuple instead of bag
    udfInput.set(0, tf.newTuple());
    BagToString udf = new BagToString();

    // expecting an exception because the input if of type Tuple, not DataBag
    udf.exec(udfInput);
  }

  @Test
  public void testUseDefaultDelimiterBagToStringUDF() throws Exception {
    BagFactory bf = BagFactory.getInstance();
    TupleFactory tf = TupleFactory.getInstance();

    Tuple t1 = tf.newTuple(2);
    t1.set(0, "a");
    t1.set(1, 5);

    Tuple t2 = tf.newTuple(2);
    t2.set(0, "c");
    t2.set(1, 6);

    DataBag bag = bf.newDefaultBag();
    bag.add(t1);
    bag.add(t2);

    BagToString udf = new BagToString();
    Tuple udfInput = tf.newTuple(1);
    udfInput.set(0, bag);
    String result = udf.exec(udfInput);

    assertEquals("a_5_c_6", result);
  }

  @Test
  public void testBasicBagToStringUDF() throws Exception {
    BagFactory bf = BagFactory.getInstance();
    TupleFactory tf = TupleFactory.getInstance();

    Tuple t1 = tf.newTuple(2);
    t1.set(0, "a");
    t1.set(1, 5);

    Tuple t2 = tf.newTuple(2);
    t2.set(0, "c");
    t2.set(1, 6);

    DataBag bag = bf.newDefaultBag();
    bag.add(t1);
    bag.add(t2);

    BagToString udf = new BagToString();
    Tuple udfInput = tf.newTuple(2);
    udfInput.set(0, bag);
    udfInput.set(1, "-");
    String result = udf.exec(udfInput);

    assertEquals("a-5-c-6", result);
  }

  @Test
  public void testNestedTupleForBagToStringUDF() throws Exception {
    BagFactory bf = BagFactory.getInstance();
    TupleFactory tf = TupleFactory.getInstance();

    Tuple t1 = tf.newTuple(2);
    t1.set(0, "a");
    t1.set(1, 5);

    Tuple nestedTuple = tf.newTuple(2);
    nestedTuple.set(0, "d");
    nestedTuple.set(1, 7);

    Tuple t2 = tf.newTuple(3);
    t2.set(0, "c");
    t2.set(1, 6);
    t2.set(2, nestedTuple);

    DataBag inputBag = bf.newDefaultBag();
    inputBag.add(t1);
    inputBag.add(t2);

    BagToString udf = new BagToString();
    Tuple udfInput = tf.newTuple(2);
    udfInput.set(0, inputBag);
    udfInput.set(1, "_");
    String result = udf.exec(udfInput);

    assertEquals("a_5_c_6_(d,7)", result);
  }

  @Test
  public void testNestedDataElementsForBagToStringUDF() throws Exception {

    DataBag inputBag = buildBagWithNestedTupleAndBag();

    BagToString udf = new BagToString();
    Tuple udfInput = tf.newTuple(2);
    udfInput.set(0, inputBag);
    udfInput.set(1, "*");

    String result = udf.exec(udfInput);
    assertEquals("a*5*c*6*(d,7)*{(in bag,10)}", result);
  }


  @Test(expected=java.lang.RuntimeException.class)
  public void testInvalidZeroInputToOutputSchemaForBagToTupleStringUDF() throws Exception {


    Schema inputSch = new Schema();

    BagToString udf = new BagToString();
    Schema outputSchema = udf.outputSchema(inputSch);

    assertEquals("schema of BagToTuple input", outputSchema.getField(0).type,
        DataType.CHARARRAY);

  }

  @Test
  public void testOutputSchemaForBagToTupleStringUDF() throws Exception {

    FieldSchema tupSch = new FieldSchema(null, DataType.TUPLE);
    tupSch.schema = new Schema();
    tupSch.schema.add(new FieldSchema(null, DataType.INTEGER));
    tupSch.schema.add(new FieldSchema(null, DataType.CHARARRAY));

    FieldSchema bagSch = new FieldSchema(null, DataType.BAG);
    bagSch.schema = new Schema(tupSch);

    Schema inputSch = new Schema();
    inputSch.add(bagSch);
    inputSch.add(new FieldSchema(null, DataType.CHARARRAY));

    BagToString udf = new BagToString();
    Schema outputSchema = udf.outputSchema(inputSch);

    assertEquals("schema of BagToTuple input", outputSchema.getField(0).type,
        DataType.CHARARRAY);

  }

  @Test
  public void testOutputSchemaWithDefaultDelimiterForBagToTupleStringUDF() throws Exception {

    FieldSchema tupSch = new FieldSchema(null, DataType.TUPLE);
    tupSch.schema = new Schema();
    tupSch.schema.add(new FieldSchema(null, DataType.INTEGER));
    tupSch.schema.add(new FieldSchema(null, DataType.CHARARRAY));

    FieldSchema bagSch = new FieldSchema(null, DataType.BAG);
    bagSch.schema = new Schema(tupSch);

    Schema inputSch = new Schema();
    inputSch.add(bagSch);

    BagToString udf = new BagToString();
    Schema outputSchema = udf.outputSchema(inputSch);

    assertEquals("schema of BagToTuple input", outputSchema.getField(0).type,
        DataType.CHARARRAY);

  }

  @Test(expected=java.lang.RuntimeException.class)
  public void testInvalidOutputSchemaForBagToTupleStringUDF() throws Exception {

    FieldSchema tupSch = new FieldSchema(null, DataType.TUPLE);
    tupSch.schema = new Schema();
    tupSch.schema.add(new FieldSchema(null, DataType.INTEGER));
    tupSch.schema.add(new FieldSchema(null, DataType.CHARARRAY));

    FieldSchema bagSch = new FieldSchema(null, DataType.BAG);
    bagSch.schema = new Schema(tupSch);

    Schema inputSch = new Schema();
    inputSch.add(bagSch);
    inputSch.add(new FieldSchema(null, DataType.DOUBLE));

    BagToString udf = new BagToString();
    // expecting an exception because the delimiter is not of type Data.CHARARRAY
    udf.outputSchema(inputSch);
  }

  @Test
  public void testPigScriptForBagToTupleUDF() throws Exception {
    PigServer pigServer = new PigServer(ExecType.LOCAL);
    Data data = resetData(pigServer);

    // bag of chararray
    data.set("foo", "myBag:bag{t:(l:chararray)}",
        tuple(bag(tuple("a"), tuple("b"), tuple("c"))));
    pigServer.registerQuery("A = LOAD 'foo' USING mock.Storage();");
    pigServer.registerQuery("B = FOREACH A GENERATE BagToTuple(myBag) as myBag;");
      pigServer.registerQuery("STORE B INTO 'bar' USING mock.Storage();");

      assertEquals(schema("myBag:(l:chararray)"), data.getSchema("bar"));

      List<Tuple> out = data.get("bar");
      assertEquals(tuple("a", "b","c"), out.get(0).get(0));

      // bag of longs
      data = resetData(pigServer);
    data.set("foo", "myBag:bag{t:(l:long)}",
        tuple(bag(tuple(1), tuple(2), tuple(3))));
    pigServer.registerQuery("A = LOAD 'foo' USING mock.Storage();");
    pigServer.registerQuery("B = FOREACH A GENERATE BagToTuple(myBag) as myBag;");
      pigServer.registerQuery("STORE B INTO 'bar' USING mock.Storage();");

      out = data.get("bar");
      assertEquals(tuple(1, 2, 3), out.get(0).get(0));
  }

  @Test
  public void testPigScriptMultipleElmementsPerTupleForBagTupleUDF() throws Exception {
    PigServer pigServer = new PigServer(ExecType.LOCAL);
    Data data = resetData(pigServer);

    data.set("foo", "myBag:bag{t:(l:chararray)}",
        tuple(bag(tuple("a", "b"), tuple("c", "d"), tuple("e", "f"))));
    pigServer.registerQuery("A = LOAD 'foo' USING mock.Storage();");
    pigServer.registerQuery("B = FOREACH A GENERATE BagToTuple(myBag) as myBag;");
    pigServer.registerQuery("STORE B INTO 'bar' USING mock.Storage();");

      List<Tuple> out = data.get("bar");
      assertEquals(tuple("a", "b","c", "d", "e", "f"), out.get(0).get(0));
  }

  @Test
  public void testPigScriptNestedTupleForBagToTupleDF() throws Exception {
    PigServer pigServer = new PigServer(ExecType.LOCAL);
    Data data = resetData(pigServer);

      Tuple nestedTuple = tuple(bag(tuple("c"), tuple("d")));
      data.set("foo", "myBag:bag{t:(l:chararray)}",
        tuple(bag(tuple("a"), tuple("b"), nestedTuple, tuple("e"))));

    pigServer.registerQuery("A = LOAD 'foo' USING mock.Storage();");
    pigServer.registerQuery("B = FOREACH A GENERATE BagToTuple(myBag) as myBag;");
      pigServer.registerQuery("STORE B INTO 'bar' USING mock.Storage();");

      List<Tuple> out = data.get("bar");
      assertEquals(tuple("a", "b",bag(tuple("c"), tuple("d")), "e"), out.get(0).get(0));

  }

  @Test
  public void testPigScriptEmptyBagForBagToTupleUDF() throws Exception {
    PigServer pigServer = new PigServer(ExecType.LOCAL);
    Data data = resetData(pigServer);

      data.set("foo", "myBag:bag{t:(l:chararray)}",
        tuple(bag()));

    pigServer.registerQuery("A = LOAD 'foo' USING mock.Storage();");
    pigServer.registerQuery("B = FOREACH A GENERATE BagToTuple(myBag) as myBag;");
      pigServer.registerQuery("STORE B INTO 'bar' USING mock.Storage();");

      List<Tuple> out = data.get("bar");
      // empty bag will generate empty tuple
      assertEquals(tuple(), out.get(0).get(0));

  }

  @Test
  public void testPigScriptrForBagToStringUDF() throws Exception {
    PigServer pigServer = new PigServer(ExecType.LOCAL);
    Data data = resetData(pigServer);

    data.set("foo", "myBag:bag{t:(l:chararray)}",
        tuple(bag(tuple("a"), tuple("b"), tuple("c"))));
    pigServer.registerQuery("A = LOAD 'foo' USING mock.Storage();");
    pigServer.registerQuery("B = FOREACH A GENERATE BagToString(myBag) as myBag;");
      pigServer.registerQuery("STORE B INTO 'bar' USING mock.Storage();");

      pigServer.registerQuery("C = FOREACH A GENERATE BagToString(myBag, '==') as myBag;");
      pigServer.registerQuery("STORE C INTO 'baz' USING mock.Storage();");

      List<Tuple> out = data.get("bar");
      assertEquals(schema("myBag:chararray"), data.getSchema("bar"));
      assertEquals(tuple("a_b_c"), out.get(0));

      out = data.get("baz");
      assertEquals(tuple("a==b==c"), out.get(0));
  }

  @Test
  public void testPigScriptMultipleElmementsPerTupleForBagToStringUDF() throws Exception {
    PigServer pigServer = new PigServer(ExecType.LOCAL);
    Data data = resetData(pigServer);

    data.set("foo", "myBag:bag{t:(l:chararray)}",
        tuple(bag(tuple("a", "b"), tuple("c", "d"), tuple("e", "f"))));
    pigServer.registerQuery("A = LOAD 'foo' USING mock.Storage();");
    pigServer.registerQuery("B = FOREACH A GENERATE BagToString(myBag) as myBag;");
    pigServer.registerQuery("STORE B INTO 'bar' USING mock.Storage();");

    pigServer.registerQuery("C = FOREACH A GENERATE BagToString(myBag, '^') as myBag;");
    pigServer.registerQuery("STORE C INTO 'baz' USING mock.Storage();");

      List<Tuple> out = data.get("bar");
      assertEquals(tuple("a_b_c_d_e_f"), out.get(0));

      out = data.get("baz");
      assertEquals(tuple("a^b^c^d^e^f"), out.get(0));
  }

  @Test
  public void testPigScriptNestedTupleForBagToStringUDF() throws Exception {
    PigServer pigServer = new PigServer(ExecType.LOCAL);
    Data data = resetData(pigServer);

      Tuple nestedTuple = tuple(bag(tuple("c"), tuple("d")));
      data.set("foo", "myBag:bag{t:(l:chararray)}",
        tuple(bag(tuple("a"), tuple("b"), nestedTuple, tuple("e"))));

    pigServer.registerQuery("A = LOAD 'foo' USING mock.Storage();");
    pigServer.registerQuery("B = FOREACH A GENERATE BagToString(myBag) as myBag;");
      pigServer.registerQuery("STORE B INTO 'bar' USING mock.Storage();");

      List<Tuple> out = data.get("bar");
      assertEquals(tuple("a_b_{(c),(d)}_e"), out.get(0));

  }

  @Test
  public void testPigScriptEmptyBagForBagToStringUDF() throws Exception {
    PigServer pigServer = new PigServer(ExecType.LOCAL);
    Data data = resetData(pigServer);

      data.set("foo", "myBag:bag{t:(l:chararray)}",
        tuple(bag()));

    pigServer.registerQuery("A = LOAD 'foo' USING mock.Storage();");
    pigServer.registerQuery("B = FOREACH A GENERATE BagToString(myBag) as myBag;");
      pigServer.registerQuery("STORE B INTO 'bar' USING mock.Storage();");

      List<Tuple> out = data.get("bar");
      // empty bag will generate empty string
      assertEquals(tuple(""), out.get(0));

  }

  private DataBag buildBagWithNestedTupleAndBag() throws ExecException {
    Tuple t1 = tf.newTuple(2);
    t1.set(0, "a");
    t1.set(1, 5);

    Tuple nestedTuple = tf.newTuple(2);
    nestedTuple.set(0, "d");
    nestedTuple.set(1, 7);

    Tuple t2 = tf.newTuple(3);
    t2.set(0, "c");
    t2.set(1, 6);
    t2.set(2, nestedTuple);

    DataBag nestedBag = bf.newDefaultBag();
    Tuple tupleInNestedBag = tf.newTuple(2);
    tupleInNestedBag.set(0, "in bag");
    tupleInNestedBag.set(1, 10);
    nestedBag.add(tupleInNestedBag);

    Tuple t3 = tf.newTuple(1);
    t3.set(0, nestedBag);

    DataBag bag = bf.newDefaultBag();
    bag.add(t1);
    bag.add(t2);
    bag.add(t3);
    return bag;
  }
}
TOP

Related Classes of org.apache.pig.test.TestBuiltInBagToTupleOrString

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.