// Package org.apache.pig.test

// Source Code of org.apache.pig.test.TestDataBagAccess

/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.pig.test;

import static org.junit.Assert.*;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Properties;

import org.apache.pig.ExecType;
import org.apache.pig.PigServer;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.builtin.BinStorage;
import org.apache.pig.builtin.PigStorage;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DefaultTuple;
import org.apache.pig.data.NonSpillableDataBag;
import org.apache.pig.data.SingleTupleBag;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.util.MultiMap;
import org.junit.Before;
import org.junit.Test;

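/**
* Tests access to the contents of data bags: direct iteration over
* {@link SingleTupleBag} and {@link NonSpillableDataBag}, and Pig Latin
* queries that project, flatten, store and reload bags through a
* local-mode {@link PigServer}.
*/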
public class TestDataBagAccess {
    private PigServer pigServer;

    @Before
    public void setUp() throws Exception{
        pigServer = new PigServer(ExecType.LOCAL, new Properties());
    }

    @Test
    public void testSingleTupleBagAcess() throws Exception {
        Tuple inputTuple = new DefaultTuple();
        inputTuple.append("a");
        inputTuple.append("b");

        SingleTupleBag bg = new SingleTupleBag(inputTuple);
        Iterator<Tuple> it = bg.iterator();
        assertEquals(inputTuple, it.next());
        assertFalse(it.hasNext());
    }

    @Test
    public void testNonSpillableDataBag() throws Exception {
        String[][] tupleContents = new String[][] {{"a", "b"},{"c", "d" }, { "e", "f"} };
        NonSpillableDataBag bg = new NonSpillableDataBag();
        for (int i = 0; i < tupleContents.length; i++) {
            bg.add(Util.createTuple(tupleContents[i]));
        }
        Iterator<Tuple> it = bg.iterator();
        int j = 0;
        while(it.hasNext()) {
            Tuple t = it.next();
            assertEquals(Util.createTuple(tupleContents[j]), t);
            j++;
        }
        assertEquals(tupleContents.length, j);
    }

    @Test
    public void testBagConstantAccess() throws IOException, ExecException {
        File input = Util.createInputFile("tmp", "",
                new String[] {"sampledata\tnot_used"});
        pigServer.registerQuery("a = load '"
                + Util.generateURI(Util.encodeEscape(input.toString()), pigServer.getPigContext()) + "';");
        pigServer.registerQuery("b = foreach a generate {(16, 4.0e-2, 'hello', -101)} as mybag:{t:(i: int, d: double, c: chararray, e : int)};");
        pigServer.registerQuery("c = foreach b generate mybag.i, mybag.d, mybag.c, mybag.e;");
        Iterator<Tuple> it = pigServer.openIterator("c");
        Tuple t = it.next();
        Object[] results = new Object[] { new Integer(16), new Double(4.0e-2), "hello", new Integer( -101 ) };
        Class[] resultClasses = new Class[] { Integer.class, Double.class, String.class, Integer.class };
        assertEquals(results.length, t.size());
        for (int i = 0; i < results.length; i++) {
            DataBag bag = (DataBag)t.get(i);
            assertEquals(results[i], bag.iterator().next().get(0));
            assertEquals(resultClasses[i], bag.iterator().next().get(0).getClass());
        }
    }

    @Test
    public void testBagConstantAccessFailure() throws IOException, ExecException {
        File input = Util.createInputFile("tmp", "",
                new String[] {"sampledata\tnot_used"});
        boolean exceptionOccured = false;
        pigServer.setValidateEachStatement(true);
        try {
            pigServer.registerQuery("a = load '"
                    + Util.generateURI(Util.encodeEscape(input.toString()), pigServer.getPigContext()) + "';");
            pigServer.registerQuery("b = foreach a generate {(16, 4.0e-2, 'hello')} as mybag:{t:(i: int, d: double, c: chararray)};");
            pigServer.registerQuery("c = foreach b generate mybag.t;");
            pigServer.explain("c", System.out);
        } catch(FrontendException e) {
            exceptionOccured = true;
            String msg = e.getMessage();
            Util.checkStrContainsSubStr(msg, "Cannot find field t in i:int,d:double,c:chararray");
        }
        assertTrue(exceptionOccured);
    }

    @Test
    public void testBagConstantFlatten1() throws IOException, ExecException {
        File input = Util.createInputFile("tmp", "",
                new String[] {"sampledata\tnot_used"});
        pigServer.registerQuery("A = load '"
                + Util.generateURI(Util.encodeEscape(input.toString()), pigServer.getPigContext()) + "';");
        pigServer.registerQuery("B = foreach A generate {(('p1-t1-e1', 'p1-t1-e2'),('p1-t2-e1', 'p1-t2-e2'))," +
                "(('p2-t1-e1', 'p2-t1-e2'), ('p2-t2-e1', 'p2-t2-e2'))};");
        pigServer.registerQuery("C = foreach B generate $0 as pairbag : { pair: ( t1: (e1, e2), t2: (e1, e2) ) };");
        pigServer.registerQuery("D = foreach C generate FLATTEN(pairbag);");
        pigServer.registerQuery("E = foreach D generate t1.e2 as t1e2, t2.e1 as t2e1;");
        Iterator<Tuple> it = pigServer.openIterator("E");
        // We should get the following two tuples as the result:
        // (p1-t1-e2,p1-t2-e1)
        // (p2-t1-e2,p2-t2-e1)
        Tuple t = it.next();
        assertEquals("p1-t1-e2", t.get(0));
        assertEquals("p1-t2-e1", t.get(1));
        t = it.next();
        assertEquals("p2-t1-e2", t.get(0));
        assertEquals("p2-t2-e1", t.get(1));
        assertFalse(it.hasNext());
    }

    @Test
    public void testBagConstantFlatten2() throws IOException, ExecException {
        File input = Util.createInputFile("tmp", "",
                new String[] {"somestring\t10\t{(a,10),(b,20)}"});
        pigServer.registerQuery("a = load '"
                + Util.generateURI(Util.encodeEscape(input.toString()), pigServer.getPigContext())
                + "' " + "as (str:chararray, intval:int, bg:bag{t:tuple(s:chararray, i:int)});");
        pigServer.registerQuery("b = foreach a generate str, intval, flatten(bg);");
        pigServer.registerQuery("c = foreach b generate str, intval, s, i;");
        Iterator<Tuple> it = pigServer.openIterator("c");
        int i = 0;
        Object[][] results = new Object[][] { {"somestring", new Integer(10), "a", new Integer(10)},
                {"somestring", new Integer(10), "b", new Integer(20) }};
        Class[] resultClasses = new Class[] { String.class, Integer.class, String.class, Integer.class };
        while(it.hasNext()) {
            Tuple t = it.next();
            for (int j = 0; j < resultClasses.length; j++) {
                assertEquals(results[i][j], t.get(j));
                assertEquals(resultClasses[j], t.get(j).getClass());
            }
            i++;
        }
        assertEquals(results.length, i);

        pigServer.registerQuery("c = foreach b generate str, intval, bg::s, bg::i;");
        it = pigServer.openIterator("c");
        i = 0;
        while(it.hasNext()) {
            Tuple t = it.next();
            for (int j = 0; j < resultClasses.length; j++) {
                assertEquals(results[i][j], t.get(j));
                assertEquals(resultClasses[j], t.get(j).getClass());
            }
            i++;
        }
        assertEquals(results.length, i);
    }

    @Test
    public void testBagStoreLoad() throws IOException, ExecException {
        File input = Util.createInputFile("tmp", "",
                new String[] {"a\tid1", "a\tid2", "a\tid3", "b\tid4", "b\tid5", "b\tid6"});
        pigServer.registerQuery("a = load '"
                + Util.generateURI(Util.encodeEscape(input.toString()), pigServer.getPigContext())
                + "' " + "as (s:chararray, id:chararray);");
        pigServer.registerQuery("b = group a by s;");
        Class[] loadStoreClasses = new Class[] { BinStorage.class, PigStorage.class };
        for (int i = 0; i < loadStoreClasses.length; i++) {
            String output = "TestDataBagAccess-testBagStoreLoad-" +
                             loadStoreClasses[i].getName() + ".txt";
            pigServer.deleteFile(output);
            pigServer.store("b", output, loadStoreClasses[i].getName());
            pigServer.registerQuery("c = load '" + output + "' using " + loadStoreClasses[i].getName() + "() AS " +
                    "(gp: chararray, bg:bag { t: tuple (sReLoaded: chararray, idReLoaded: chararray)});;");
            Iterator<Tuple> it = pigServer.openIterator("c");
            MultiMap<Object, Object> results = new MultiMap<Object, Object>();
            results.put("a", "id1");
            results.put("a", "id2");
            results.put("a", "id3");
            results.put("b", "id4");
            results.put("b", "id5");
            results.put("b", "id6");
            int j = 0;
            while(it.hasNext()) {
                Tuple t = it.next();
                Object groupKey = t.get(0);
                DataBag groupBag = (DataBag)t.get(1);
                Iterator<Tuple> bgIt = groupBag.iterator();
                int k = 0;
                while(bgIt.hasNext()) {
                    // a hash to make sure we don't see the
                    // same "ids" twice
                    HashMap<Object, Boolean> seen = new HashMap<Object, Boolean>();
                    Tuple bgt = bgIt.next();
                    // the first col is the group by key
                    assertTrue(bgt.get(0).equals(groupKey));
                    Collection<Object> values = results.get(groupKey);
                    // check that the second column is one
                    // of the "id" values associated with this
                    // group by key
                    assertTrue(values.contains(bgt.get(1)));
                    // check that we have not seen the same "id" value
                    // before
                    if(seen.containsKey(bgt.get(1)))
                        fail("LoadStoreClass used : " + loadStoreClasses[i].getName() + " " +
                            ", duplicate value (" + bgt.get(1) + ")");
                    else
                        seen.put(bgt.get(1), true);
                    k++;
                }
                // check that we saw 3 tuples in each group bag
                assertEquals(3, k);
                j++;
            }
            // make sure we saw the right number of high
            // level tuples
            assertEquals(results.keySet().size(), j);

            pigServer.registerQuery("d = foreach c generate gp, flatten(bg);");
            // results should be
            // a a id1
            // a a id2
            // a a id3
            // b b id4
            // b b id5
            // b b id6
            // However order is not guaranteed
            List<Tuple> resultTuples = new ArrayList<Tuple>();
            resultTuples.add(Util.createTuple(new String[] { "a", "a", "id1"}));
            resultTuples.add(Util.createTuple(new String[] { "a", "a", "id2"}));
            resultTuples.add(Util.createTuple(new String[] { "a", "a", "id3"}));
            resultTuples.add(Util.createTuple(new String[] { "b", "b", "id4"}));
            resultTuples.add(Util.createTuple(new String[] { "b", "b", "id5"}));
            resultTuples.add(Util.createTuple(new String[] { "b", "b", "id6"}));
            it = pigServer.openIterator("d");
            j = 0;
            HashMap<Tuple, Boolean> seen = new HashMap<Tuple, Boolean>();
            while(it.hasNext()) {
                Tuple t = it.next();
                assertTrue(resultTuples.contains(t));
                if(seen.containsKey(t)) {
                    fail("LoadStoreClass used : " + loadStoreClasses[i].getName() + " " +
                                ", duplicate tuple (" + t + ") encountered.");
                } else {
                    seen.put(t, true);
                }
                j++;
            }
            // check we got expected number of tuples
            assertEquals(resultTuples.size(), j);

            // same output as above - but projection based on aliases
            pigServer.registerQuery("e = foreach d generate gp, sReLoaded, idReLoaded;");
            it = pigServer.openIterator("e");
            j = 0;
            seen = new HashMap<Tuple, Boolean>();
            while(it.hasNext()) {
                Tuple t = it.next();
                assertTrue(resultTuples.contains(t));
                if(seen.containsKey(t)) {
                    fail("LoadStoreClass used : " + loadStoreClasses[i].getName() + " " +
                                ", duplicate tuple (" + t + ") encountered.");
                } else {
                    seen.put(t, true);
                }
                j++;
            }
            // check we got expected number of tuples
            assertEquals(resultTuples.size(), j);

            // same result as above but projection based on position specifiers
            pigServer.registerQuery("f = foreach d generate $0, $1, $2;");
            it = pigServer.openIterator("f");
            j = 0;
            seen = new HashMap<Tuple, Boolean>();
            while(it.hasNext()) {
                Tuple t = it.next();
                assertTrue(resultTuples.contains(t));
                if(seen.containsKey(t)) {
                    fail("LoadStoreClass used : " + loadStoreClasses[i].getName() + " " +
                                ", duplicate tuple (" + t + ") encountered.");
                } else {
                    seen.put(t, true);
                }
                j++;
            }
            // check we got expected number of tuples
            assertEquals(resultTuples.size(), j);


        }
    }
}
// TOP

// Related Classes of org.apache.pig.test.TestDataBagAccess

// TOP
// Copyright © 2018 www.massapi.com. All rights reserved.
// All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.