Package org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators

Source Code of org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.JoinPackager

/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators;

import java.util.Iterator;
import java.util.List;

import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.POStatus;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.Result;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.plans.PhysicalPlan;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.NonSpillableDataBag;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.plan.NodeIdGenerator;
import org.apache.pig.impl.plan.OperatorKey;
import org.apache.pig.pen.Illustrator;

public class JoinPackager extends Packager {

    private static final long serialVersionUID = 1L;
    private POOptimizedForEach forEach;
    private boolean newKey = true;
    private Tuple res = null;
    private static final Result eopResult = new Result(POStatus.STATUS_EOP, null);

    public static final String DEFAULT_CHUNK_SIZE = "1000";

    private long chunkSize = Long.parseLong(DEFAULT_CHUNK_SIZE);
    private Result forEachResult;
    private DataBag[] dbs = null;

    private int lastBagIndex;

    private Iterator<Tuple> lastBagIter;

    public JoinPackager(Packager p, POForEach f) {
        super();
        String scope = f.getOperatorKey().getScope();
        NodeIdGenerator nig = NodeIdGenerator.getGenerator();
        forEach = new POOptimizedForEach(new OperatorKey(scope,nig.getNextNodeId(scope)));
        if (p!=null)
        {
            setKeyType(p.getKeyType());
            setNumInputs(p.getNumInputs());
            lastBagIndex = numInputs - 1;
            setInner(p.getInner());
            setKeyInfo(p.getKeyInfo());
            this.isKeyTuple = p.isKeyTuple;
            this.isKeyCompound = p.isKeyCompound;
        }
        if (f!=null)
        {
            setInputPlans(f.getInputPlans());
            setToBeFlattened(f.getToBeFlattened());
        }
    }

    /**
     * Calls getNext to get next ForEach result. The input for POJoinPackage is
     * a (key, NullableTuple) pair. We will materialize n-1 inputs into bags, feed input#n
     * one tuple a time to the delegated ForEach operator, the input for ForEach is
     *
     *     (input#1, input#2, input#3....input#n[i]), i=(1..k), suppose input#n consists
     *
     * of k tuples.
     * For every ForEach input, pull all the results from ForEach.
     * getNext will be called multiple times for a particular input,
     * it returns one output tuple from ForEach every time we call getNext,
     * so we need to maintain internal status to keep tracking of where we are.
     */
    @Override
    public Result getNext() throws ExecException {
        Tuple it = null;

        // If we see a new NullableTupleIterator, materialize n-1 inputs, construct ForEach input
        // tuple res = (key, input#1, input#2....input#n), the only missing value is input#n,
        // we will get input#n one tuple a time, fill in res, feed to ForEach.
        // After this block, we have the first tuple of input#n in hand (kept in variable it)
        if (newKey)
        {
            // Put n-1 inputs into bags
            dbs = new DataBag[numInputs];
            for (int i = 0; i < numInputs - 1; i++) {
                dbs[i] = bags[i];
            }

            // For last bag, we always use NonSpillableBag.
            dbs[lastBagIndex] = new NonSpillableDataBag((int)chunkSize);

            lastBagIter = bags[lastBagIndex].iterator();

            // If we don't have any tuple for input#n
            // we do not need any further process, return EOP
            if (!lastBagIter.hasNext()) {
                // we will return at this point because we ought
                // to be having a flatten on this last input
                // and we have an empty bag which should result
                // in this key being taken out of the output
                newKey = true;
                return eopResult;
            }

            res = mTupleFactory.newTuple(numInputs+1);
            for (int i = 0; i < dbs.length; i++)
                res.set(i+1,dbs[i]);

            res.set(0,key);
            // if we have an inner anywhere and the corresponding
            // bag is empty, we can just return
            for (int i = 0; i < dbs.length - 1; i++) {
                if(inner[i]&&dbs[i].size()==0){
                    detachInput();
                    return eopResult;
                }
            }
            newKey = false;
        }

        // Keep attaching input tuple to ForEach, until:
        // 1. We can initialize ForEach.getNext();
        // 2. There is no more input#n
        while (lastBagIter.hasNext() || forEach.processingPlan) {
            // if a previous call to foreach.getNext()
            // has still not returned all output, process it
            while (forEach.processingPlan) {
                forEachResult = forEach.getNextTuple();
                switch (forEachResult.returnStatus) {
                case POStatus.STATUS_OK:
                case POStatus.STATUS_ERR:
                    return forEachResult;
                case POStatus.STATUS_NULL:
                    continue;
                case POStatus.STATUS_EOP:
                    break;
                }
            }

            if (lastBagIter.hasNext()) {
                // try setting up a bag of CHUNKSIZE OR
                // the remainder of the bag of last input
                // (if < CHUNKSIZE) to foreach
                dbs[lastBagIndex].clear(); // clear last chunk
                for (int i = 0; i < chunkSize && lastBagIter.hasNext(); i++) {
                    it = lastBagIter.next();
                    dbs[lastBagIndex].add(it);
                }
            } else {
                detachInput();
                return eopResult;
            }

            // Attach the input to forEach
            forEach.attachInput(res);

            // pull output tuple from ForEach
            Result forEachResult = forEach.getNextTuple();
            {
                switch (forEachResult.returnStatus) {
                case POStatus.STATUS_OK:
                case POStatus.STATUS_ERR:
                    return forEachResult;
                case POStatus.STATUS_NULL:
                    continue;
                case POStatus.STATUS_EOP:
                    break;
                }
            }
        }
        detachInput();
        return eopResult;
    }

    @Override
    public void attachInput(Object key, DataBag[] bags, boolean[] readOnce)
            throws ExecException {
        checkBagType();

        this.key = key;
        this.bags = bags;
        this.readOnce = readOnce;
        // JoinPackager expects all but the last bag to be materialized
        for (int i = 0; i < bags.length - 1; i++) {
            if (readOnce[i]) {
                DataBag materializedBag = getBag();
                materializedBag.addAll(bags[i]);
                bags[i] = materializedBag;
            }
        }
        if (readOnce[numInputs - 1] != true) {
            throw new ExecException(
                    "JoinPackager expects the last input to be streamed");
        }
        this.newKey = true;
    }

    public List<PhysicalPlan> getInputPlans() {
        return forEach.getInputPlans();
    }

    public void setInputPlans(List<PhysicalPlan> plans) {
        forEach.setInputPlans(plans);
    }

    public void setToBeFlattened(List<Boolean> flattens) {
        forEach.setToBeFlattened(flattens);
    }

    /**
     * @return the forEach
     */
    public POOptimizedForEach getForEach() {
        return forEach;
    }

    /**
     * @param chunkSize - the chunk size for the biggest input
     */
    public void setChunkSize(long chunkSize) {
        this.chunkSize = chunkSize;
    }

    @Override
    public void setIllustrator(Illustrator illustrator) {
        this.illustrator = illustrator;
        forEach.setIllustrator(illustrator);
    }

    @Override
    public String name() {
        return this.getClass().getSimpleName() + "(" + forEach.getFlatStr() + ")";
    }
}
TOP

Related Classes of org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.JoinPackager

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.