Package org.apache.pig.impl.builtin

Source Code of org.apache.pig.impl.builtin.MergeJoinIndexer

/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.pig.impl.builtin;

import java.io.IOException;
import java.util.List;
import java.util.Map;

import org.apache.pig.ExecType;
import org.apache.pig.LoadFunc;
import org.apache.pig.PigException;
import org.apache.pig.SamplableLoader;
import org.apache.pig.backend.datastorage.DataStorage;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.POStatus;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.PhysicalOperator;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.Result;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.plans.PhysicalPlan;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POLocalRearrange;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.PigContext;
import org.apache.pig.impl.io.BufferedPositionedInputStream;
import org.apache.pig.impl.logicalLayer.schema.Schema;
import org.apache.pig.impl.plan.NodeIdGenerator;
import org.apache.pig.impl.plan.OperatorKey;
import org.apache.pig.impl.util.ObjectSerializer;

/** Merge Join indexer is used to generate on the fly index for doing Merge Join efficiently.
*  It samples first record from every block of right side input.
*  and returns tuple in the following format :
*  (key0, key1,...,fileName, offset)
*  These tuples are then sorted before being written out to index file on HDFS.
*/

public class MergeJoinIndexer  implements LoadFunc{

    private boolean firstRec = true;
    private transient TupleFactory mTupleFactory;
    private String fileName;
    private POLocalRearrange lr;
    private PhysicalPlan precedingPhyPlan;
    private int keysCnt;
    private PhysicalOperator rightPipelineLeaf;
    private PhysicalOperator rightPipelineRoot;
    private Tuple dummyTuple = null;
    private SamplableLoader loader;

    /** @param funcSpec : Loader specification.
     *  @param innerPlan : This is serialized version of LR plan. We
     *  want to keep only keys in our index file and not the whole tuple. So, we need LR and thus its plan
     *  to get keys out of the sampled tuple. 
     * @param serializedPhyPlan Serialized physical plan on right side.
     * @throws ExecException
     */
    @SuppressWarnings("unchecked")
    public MergeJoinIndexer(String funcSpec, String innerPlan, String serializedPhyPlan) throws ExecException{
       
        loader = (SamplableLoader)PigContext.instantiateFuncFromSpec(funcSpec);
        try {
            List<PhysicalPlan> innerPlans = (List<PhysicalPlan>)ObjectSerializer.deserialize(innerPlan);
            lr = new POLocalRearrange(new OperatorKey("MergeJoin Indexer",NodeIdGenerator.getGenerator().getNextNodeId("MergeJoin Indexer")));
            lr.setPlans(innerPlans);
            keysCnt = innerPlans.size();
            precedingPhyPlan = (PhysicalPlan)ObjectSerializer.deserialize(serializedPhyPlan);
            if(precedingPhyPlan != null){
                    if(precedingPhyPlan.getLeaves().size() != 1 || precedingPhyPlan.getRoots().size() != 1){
                        int errCode = 2168;
                        String errMsg = "Expected physical plan with exactly one root and one leaf.";
                        throw new ExecException(errMsg,errCode,PigException.BUG);
                    }
                this.rightPipelineLeaf = precedingPhyPlan.getLeaves().get(0);
                this.rightPipelineRoot = precedingPhyPlan.getRoots().get(0);
                this.rightPipelineRoot.setInputs(null);                           
            }
        }
        catch (IOException e) {
            int errCode = 2094;
            String msg = "Unable to deserialize plans in Indexer.";
            throw new ExecException(msg,errCode,e);
        }
        mTupleFactory = TupleFactory.getInstance();
    }

    @Override
    public void bindTo(String fileName, BufferedPositionedInputStream is,long offset, long end) throws IOException {
        this.fileName = fileName;
        loader.bindTo(fileName, is, offset, end);
    }

    @Override
    public Tuple getNext() throws IOException {

        if(!firstRec)   // We sample only one record per block.
            return null;

        long curPos;
        Object key = null;
        Tuple wrapperTuple = mTupleFactory.newTuple(keysCnt+2);
       
        while(true){
            curPos = loader.getPosition();
            Tuple readTuple = loader.getNext();

            if(null == readTuple){    // We hit the end.

                for(int i =0; i < keysCnt; i++)
                    wrapperTuple.set(i, null);
                wrapperTuple.set(keysCnt, fileName);
                wrapperTuple.set(keysCnt+1, curPos);
                firstRec = false;
                return wrapperTuple;
            }

            if (null == precedingPhyPlan){

                lr.attachInput(readTuple);
                key = ((Tuple)lr.getNext(dummyTuple).result).get(1);
                lr.detachInput();
                if ( null == key) // Tuple with null key. Drop it.
                    continue;
                break;     
            }

            // There is a physical plan.

            rightPipelineRoot.attachInput(readTuple);
            boolean fetchNewTup;

            while(true){

                Result res = rightPipelineLeaf.getNext(dummyTuple);
                switch(res.returnStatus){

                case POStatus.STATUS_OK:

                    lr.attachInput((Tuple)res.result);
                    key = ((Tuple)lr.getNext(dummyTuple).result).get(1);
                    lr.detachInput();
                    if ( null == key) // Tuple with null key. Drop it.
                        continue;
                     fetchNewTup = false;
                    break;

                case POStatus.STATUS_EOP:
                    fetchNewTup = true;
                    break;

                default:
                    int errCode = 2164;
                    String errMsg = "Expected EOP/OK as return status. Found: "+res.returnStatus;
                    throw new ExecException(errMsg,errCode);
                }           
                break;
            }
            if (!fetchNewTup)
                break;
        }

        if(key instanceof Tuple){
            Tuple tupKey = (Tuple)key;
            for(int i =0; i < tupKey.size(); i++)
                wrapperTuple.set(i, tupKey.get(i));
        }

        else
            wrapperTuple.set(0, key);

        wrapperTuple.set(keysCnt, fileName);
        wrapperTuple.set(keysCnt+1, curPos);   
        firstRec = false;
        return wrapperTuple;
    }
   
    public Integer bytesToInteger(byte[] b) throws IOException {
        return loader.bytesToInteger(b);
    }

    public Long bytesToLong(byte[] b) throws IOException {
        return loader.bytesToLong(b);
    }

    public Float bytesToFloat(byte[] b) throws IOException {
        return loader.bytesToFloat(b);
    }

    public Double bytesToDouble(byte[] b) throws IOException {
        return loader.bytesToDouble(b);
    }

    public String bytesToCharArray(byte[] b) throws IOException {
        return loader.bytesToCharArray(b);
    }

    public Map<String, Object> bytesToMap(byte[] b) throws IOException {
        return loader.bytesToMap(b);
    }

    public Tuple bytesToTuple(byte[] b) throws IOException {
        return loader.bytesToTuple(b);
    }

    public DataBag bytesToBag(byte[] b) throws IOException {
        return loader.bytesToBag(b);
    }

    public void fieldsToRead(Schema schema) {
        loader.fieldsToRead(schema);
    }

    public Schema determineSchema(
            String fileName,
            ExecType execType,
            DataStorage storage) throws IOException {
        return loader.determineSchema(fileName, execType, storage);
    }
}
TOP

Related Classes of org.apache.pig.impl.builtin.MergeJoinIndexer

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.