Package org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators

Source Code of org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.Packager

/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators;

import java.io.Serializable;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Map;

import org.apache.pig.PigConfiguration;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigMapReduce;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.POStatus;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.Result;
import org.apache.pig.data.BagFactory;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.InternalCachedBag;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.io.NullableTuple;
import org.apache.pig.impl.io.PigNullableWritable;
import org.apache.pig.impl.util.IdentityHashSet;
import org.apache.pig.impl.util.Pair;
import org.apache.pig.pen.Illustrable;
import org.apache.pig.pen.Illustrator;
import org.apache.pig.pen.util.ExampleTuple;
import org.apache.pig.pen.util.LineageTracer;

public class Packager implements Illustrable, Serializable, Cloneable {

    private static final long serialVersionUID = 1L;

    protected boolean[] readOnce;

    protected DataBag[] bags;

    public static enum PackageType {
        GROUP, JOIN
    };

    protected transient Illustrator illustrator = null;

    // The key being worked on
    Object key;

    // marker to indicate if key is a tuple
    protected boolean isKeyTuple = false;
    // marker to indicate if the tuple key is compound in nature
    protected boolean isKeyCompound = false;

    // key's type
    byte keyType;

    // The number of inputs to this
    // co-group. 0 indicates a distinct, which means there will only be a
    // key, no value.
    int numInputs;

    // If the attaching map-reduce plan use secondary sort key
    boolean useSecondaryKey = false;

    // Denotes if inner is specified
    // on a particular input
    boolean[] inner;

    // flag to denote whether there is a distinct
    // leading to this package
    protected boolean distinct = false;

    // A mapping of input index to key information got from LORearrange
    // for that index. The Key information is a pair of boolean, Map.
    // The boolean indicates whether there is a lone project(*) in the
    // cogroup by. If not, the Map has a mapping of column numbers in the
    // "value" to column numbers in the "key" which contain the fields in
    // the "value"
    protected Map<Integer, Pair<Boolean, Map<Integer, Integer>>> keyInfo;

    private PackageType pkgType;

    private transient boolean initialized;
    private transient boolean useDefaultBag;

    protected POPackage parent = null;

    protected static final BagFactory mBagFactory = BagFactory.getInstance();
    protected static final TupleFactory mTupleFactory = TupleFactory.getInstance();

    public Object getKey(PigNullableWritable key) throws ExecException {
        Object keyObject = key.getValueAsPigType();
        if (useSecondaryKey) {
            return ((Tuple) keyObject).get(0);
        } else {
            return keyObject;
        }
    }

    public void attachInput(Object key, DataBag[] bags, boolean[] readOnce)
            throws ExecException {
        checkBagType();

        this.key = key;
        this.bags = bags;
        this.readOnce = readOnce;
        // We assume that we need all bags materialized. Specialized subclasses
        // may choose to handle this differently
        for (int i = 0; i < bags.length; i++) {
            if (readOnce[i]) {
                DataBag materializedBag = getBag();
                materializedBag.addAll(bags[i]);
                bags[i] = materializedBag;
            }
        }
    }

    public Result getNext() throws ExecException {
        if (bags == null) {
            return new Result(POStatus.STATUS_EOP, null);
        }
        Tuple res;

        if (isDistinct()) {
            // only set the key which has the whole
            // tuple
            res = mTupleFactory.newTuple(1);
            res.set(0, key);
        } else {
            // Construct the output tuple by appending
            // the key and all the above constructed bags
            // and return it.
            res = mTupleFactory.newTuple(numInputs + 1);
            res.set(0, key);
            int i = -1;
            for (DataBag bag : bags) {
                i++;
                if (inner[i]) {
                    if (bag.size() == 0) {
                        detachInput();
                        Result r = new Result();
                        r.returnStatus = POStatus.STATUS_NULL;
                        return r;
                    }
                }

                res.set(i + 1, bag);
            }
        }
        Result r = new Result();
        r.returnStatus = POStatus.STATUS_OK;
        r.result = illustratorMarkup(null, res, 0);
        detachInput();
        return r;
    }

    public void detachInput() {
        key = null;
        bags = null;
    }

    protected Tuple illustratorMarkup2(Object in, Object out) {
        if (illustrator != null) {
            ExampleTuple tOut;
            if (!(out instanceof ExampleTuple)) {
                tOut = new ExampleTuple((Tuple) out);
            } else {
                tOut = (ExampleTuple) out;
            }
            illustrator.getLineage().insert(tOut);
            tOut.synthetic = ((ExampleTuple) in).synthetic;
            illustrator.getLineage().union(tOut, (Tuple) in);
            return tOut;
        } else
            return (Tuple) out;
    }

    protected Tuple starMarkup(Tuple key, Tuple val, Tuple out){
        if (illustrator != null){
            Tuple copy = illustratorMarkup2(key, out);
            // For distinct, we also need to retain lineage information from the values.
            if (isDistinct())
                copy = illustratorMarkup2(val, out);
            return copy;
        } else
            return (Tuple) out;
    }

    public Tuple getValueTuple(PigNullableWritable keyWritable,
            NullableTuple ntup, int index) throws ExecException {
        Object key = getKey(keyWritable);
        // Need to make a copy of the value, as hadoop uses the same ntup
        // to represent each value.
        Tuple val = (Tuple) ntup.getValueAsPigType();

        Tuple copy = null;
        // The "value (val)" that we just got may not
        // be the complete "value". It may have some portions
        // in the "key" (look in POLocalRearrange for more comments)
        // If this is the case we need to stitch
        // the "value" together.
        Pair<Boolean, Map<Integer, Integer>> lrKeyInfo = keyInfo.get(index);
        boolean isProjectStar = lrKeyInfo.first;
        Map<Integer, Integer> keyLookup = lrKeyInfo.second;
        int keyLookupSize = keyLookup.size();

        if (keyLookupSize > 0) {

            // we have some fields of the "value" in the
            // "key".
            int finalValueSize = keyLookupSize + val.size();
            copy = mTupleFactory.newTuple(finalValueSize);
            int valIndex = 0; // an index for accessing elements from
                              // the value (val) that we have currently
            for (int i = 0; i < finalValueSize; i++) {
                Integer keyIndex = keyLookup.get(i);
                if (keyIndex == null) {
                    // the field for this index is not in the
                    // key - so just take it from the "value"
                    // we were handed
                    copy.set(i, val.get(valIndex));
                    valIndex++;
                } else {
                    // the field for this index is in the key
                    if (isKeyTuple && isKeyCompound) {
                        // the key is a tuple, extract the
                        // field out of the tuple
                        copy.set(i, ((Tuple) key).get(keyIndex));
                    } else {
                        copy.set(i, key);
                    }
                }
            }
            copy = illustratorMarkup2(val, copy);
        } else if (isProjectStar) {

            // the whole "value" is present in the "key"
            copy = mTupleFactory.newTuple(((Tuple) key).getAll());
            copy = starMarkup((Tuple) key, val, copy);
        } else {

            // there is no field of the "value" in the
            // "key" - so just make a copy of what we got
            // as the "value"
            copy = mTupleFactory.newTuple(val.getAll());
            copy = illustratorMarkup2(val, copy);
        }
        return copy;
    }

    public byte getKeyType() {
        return keyType;
    }

    public void setKeyType(byte keyType) {
        this.keyType = keyType;
    }

    /**
     * @return the isKeyTuple
     */
    public boolean getKeyTuple() {
        return isKeyTuple;
    }

    /**
     * @return the keyAsTuple
     */
    public Tuple getKeyAsTuple() {
        return isKeyTuple ? (Tuple) key : null;
    }

    /**
     * @return the key
     */
    public Object getKey() {
        return key;
    }

    public boolean[] getInner() {
        return inner;
    }

    public void setInner(boolean[] inner) {
        this.inner = inner;
    }

    /**
     * @param keyInfo the keyInfo to set
     */
    public void setKeyInfo(
            Map<Integer, Pair<Boolean, Map<Integer, Integer>>> keyInfo) {
        this.keyInfo = keyInfo;
    }

    /**
     * @param keyTuple the keyTuple to set
     */
    public void setKeyTuple(boolean keyTuple) {
        this.isKeyTuple = keyTuple;
    }

    /**
     * @param keyCompound the keyCompound to set
     */
    public void setKeyCompound(boolean keyCompound) {
        this.isKeyCompound = keyCompound;
    }

    /**
     * @return the keyInfo
     */
    public Map<Integer, Pair<Boolean, Map<Integer, Integer>>> getKeyInfo() {
        return keyInfo;
    }

    public Illustrator getIllustrator() {
        return illustrator;
    }

    @Override
    public void setIllustrator(Illustrator illustrator) {
        this.illustrator = illustrator;
    }

    /**
     * @return the distinct
     */
    public boolean isDistinct() {
        return distinct;
    }

    /**
     * @param distinct the distinct to set
     */
    public void setDistinct(boolean distinct) {
        this.distinct = distinct;
    }

    public void setUseSecondaryKey(boolean useSecondaryKey) {
        this.useSecondaryKey = useSecondaryKey;
    }

    public void setPackageType(PackageType type) {
        this.pkgType = type;
    }

    public PackageType getPackageType() {
        return pkgType;
    }

    public int getNumInputs(byte index) {
        return numInputs;
    }

    public int getNumInputs() {
        return numInputs;
    }

    public void setNumInputs(int numInputs) {
        this.numInputs = numInputs;
    }

    @Override
    public Packager clone() throws CloneNotSupportedException {
        Packager clone = (Packager) super.clone();
        clone.setNumInputs(numInputs);
        clone.setPackageType(pkgType);
        clone.setDistinct(distinct);
        if (inner != null) {
            clone.inner = new boolean[inner.length];
            for (int i = 0; i < inner.length; i++) {
                clone.inner[i] = inner[i];
            }
        } else
            clone.inner = null;
        if (keyInfo != null)
            clone.setKeyInfo(new HashMap<Integer, Pair<Boolean, Map<Integer, Integer>>>(
                    keyInfo));
        clone.setKeyCompound(isKeyCompound);
        clone.setKeyTuple(isKeyTuple);
        clone.setKeyType(keyType);
        clone.setUseSecondaryKey(useSecondaryKey);
        return clone;
    }

    public String name() {
        return this.getClass().getSimpleName();
    }

    @Override
    public Tuple illustratorMarkup(Object in, Object out, int eqClassIndex) {
        // All customized packagers are introduced during MRCompilaition.
        // Illustrate happens before that, so we only have to focus on the basic
        // POPackage
        if (illustrator != null) {
            ExampleTuple tOut = new ExampleTuple((Tuple) out);
            LineageTracer lineageTracer = illustrator.getLineage();
            lineageTracer.insert(tOut);
            boolean synthetic = false;
            if (illustrator.getEquivalenceClasses() == null) {
                LinkedList<IdentityHashSet<Tuple>> equivalenceClasses = new LinkedList<IdentityHashSet<Tuple>>();
                for (int i = 0; i < numInputs; ++i) {
                    IdentityHashSet<Tuple> equivalenceClass = new IdentityHashSet<Tuple>();
                    equivalenceClasses.add(equivalenceClass);
                }
                illustrator.setEquivalenceClasses(equivalenceClasses, parent);
            }

            if (isDistinct()) {
                int count = 0;
                for (Tuple tmp : bags[0]){
                    count++;
                    if (!tmp.equals(tOut))
                        lineageTracer.union(tOut, tmp);
                }
                if (count > 1) // only non-distinct tuples are inserted into the
                    // equivalence class
                    illustrator.getEquivalenceClasses().get(eqClassIndex)
                    .add(tOut);
                illustrator.addData((Tuple) tOut);
                return (Tuple) tOut;
            }
            boolean outInEqClass = true;
            try {
                for (int i = 1; i < numInputs + 1; i++) {
                    DataBag dbs = (DataBag) ((Tuple) out).get(i);
                    Iterator<Tuple> iter = dbs.iterator();
                    if (dbs.size() <= 1 && outInEqClass) // all inputs have >= 2
                        // records
                        outInEqClass = false;
                    while (iter.hasNext()) {
                        Tuple tmp = iter.next();
                        // any of synthetic data in bags causes the output tuple
                        // to be synthetic
                        if (!synthetic && ((ExampleTuple) tmp).synthetic)
                            synthetic = true;
                        lineageTracer.union(tOut, tmp);
                    }
                }
            } catch (ExecException e) {
                // TODO better exception handling
                throw new RuntimeException("Illustrator exception :"
                        + e.getMessage());
            }
            if (outInEqClass)
                illustrator.getEquivalenceClasses().get(eqClassIndex).add(tOut);
            tOut.synthetic = synthetic;
            illustrator.addData((Tuple) tOut);
            return tOut;
        } else
            return (Tuple) out;
    }

    public void setParent(POPackage pack) {
        parent = pack;
    }

    public int numberOfEquivalenceClasses() {
        return 1;
    }

    public void checkBagType() {
        if(!initialized){
            initialized = true;
            if (PigMapReduce.sJobConfInternal.get() != null) {
                String bagType = PigMapReduce.sJobConfInternal.get().get(PigConfiguration.PIG_CACHEDBAG_TYPE);
                if (bagType != null && bagType.equalsIgnoreCase("default")) {
                    useDefaultBag = true;
                }
            }
        }
    }

    public DataBag getBag(){
        return useDefaultBag ? mBagFactory.newDefaultBag()
                // In a very rare case if there is a POStream after this
                // POJoinPackage in the pipeline and is also blocking the pipeline;
                // constructor argument should be 2 * numInputs. But for one obscure
                // case we don't want to pay the penalty all the time.
                : new InternalCachedBag(numInputs);
    }
}
TOP

Related Classes of org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.Packager

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.