// Source Code of org.apache.pig.impl.builtin.MergeJoinIndexer

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pig.impl.builtin;


import java.io.IOException;
import java.util.List;
import java.util.Map;


import org.apache.pig.ExecType;
import org.apache.pig.LoadFunc;
import org.apache.pig.PigException;
import org.apache.pig.SamplableLoader;
import org.apache.pig.backend.datastorage.DataStorage;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.POStatus;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.PhysicalOperator;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.Result;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.plans.PhysicalPlan;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POLocalRearrange;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.PigContext;
import org.apache.pig.impl.io.BufferedPositionedInputStream;
import org.apache.pig.impl.logicalLayer.schema.Schema;
import org.apache.pig.impl.plan.NodeIdGenerator;
import org.apache.pig.impl.plan.OperatorKey;
import org.apache.pig.impl.util.ObjectSerializer;


/** Merge Join indexer is used to generate an on-the-fly index for doing Merge Join efficiently.
 *  It samples the first record from every block of the right-side input
 *  and returns a tuple in the following format:
 *  (key0, key1,...,fileName, offset)
 *  These tuples are then sorted before being written out to the index file on HDFS.
 */


public class MergeJoinIndexer  implements LoadFunc{


    private boolean firstRec = true;
    private transient TupleFactory mTupleFactory;
    private String fileName;
    private POLocalRearrange lr;
    private PhysicalPlan precedingPhyPlan;
    private int keysCnt;
    private PhysicalOperator rightPipelineLeaf;
    private PhysicalOperator rightPipelineRoot;
    private Tuple dummyTuple = null;
    private SamplableLoader loader;


    /** @param funcSpec : Loader specification.
     *  @param innerPlan : This is serialized version of LR plan. We 
     *  want to keep only keys in our index file and not the whole tuple. So, we need LR and thus its plan
     *  to get keys out of the sampled tuple.  
     * @param serializedPhyPlan Serialized physical plan on right side.
     * @throws ExecException 
     */
    @SuppressWarnings("unchecked")
    public MergeJoinIndexer(String funcSpec, String innerPlan, String serializedPhyPlan) throws ExecException{
        
        loader = (SamplableLoader)PigContext.instantiateFuncFromSpec(funcSpec);
        try {
            List<PhysicalPlan> innerPlans = (List<PhysicalPlan>)ObjectSerializer.deserialize(innerPlan);
            lr = new POLocalRearrange(new OperatorKey("MergeJoin Indexer",NodeIdGenerator.getGenerator().getNextNodeId("MergeJoin Indexer")));
            lr.setPlans(innerPlans);
            keysCnt = innerPlans.size();
            precedingPhyPlan = (PhysicalPlan)ObjectSerializer.deserialize(serializedPhyPlan);
            if(precedingPhyPlan != null){
                    if(precedingPhyPlan.getLeaves().size() != 1 || precedingPhyPlan.getRoots().size() != 1){
                        int errCode = 2168;
                        String errMsg = "Expected physical plan with exactly one root and one leaf.";
                        throw new ExecException(errMsg,errCode,PigException.BUG);
                    }
                this.rightPipelineLeaf = precedingPhyPlan.getLeaves().get(0);
                this.rightPipelineRoot = precedingPhyPlan.getRoots().get(0);
                this.rightPipelineRoot.setInputs(null);                            
            }
        }
        catch (IOException e) {
            int errCode = 2094;
            String msg = "Unable to deserialize plans in Indexer.";
            throw new ExecException(msg,errCode,e);
        }
        mTupleFactory = TupleFactory.getInstance();
    }


    @Override
    public void bindTo(String fileName, BufferedPositionedInputStream is,long offset, long end) throws IOException {
        this.fileName = fileName;
        loader.bindTo(fileName, is, offset, end);
    }


    @Override
    public Tuple getNext() throws IOException {


        if(!firstRec)   // We sample only one record per block.
            return null;


        long curPos;
        Object key = null;
        Tuple wrapperTuple = mTupleFactory.newTuple(keysCnt+2);
        
        while(true){
            curPos = loader.getPosition();
            Tuple readTuple = loader.getNext();


            if(null == readTuple){    // We hit the end.


                for(int i =0; i < keysCnt; i++)
                    wrapperTuple.set(i, null);
                wrapperTuple.set(keysCnt, fileName);
                wrapperTuple.set(keysCnt+1, curPos);
                firstRec = false;
                return wrapperTuple;
            }


            if (null == precedingPhyPlan){


                lr.attachInput(readTuple);
                key = ((Tuple)lr.getNext(dummyTuple).result).get(1);
                lr.detachInput();
                if ( null == key) // Tuple with null key. Drop it.
                    continue;
                break;      
            }


            // There is a physical plan. 


            rightPipelineRoot.attachInput(readTuple);
            boolean fetchNewTup;


            while(true){


                Result res = rightPipelineLeaf.getNext(dummyTuple);
                switch(res.returnStatus){


                case POStatus.STATUS_OK:


                    lr.attachInput((Tuple)res.result);
                    key = ((Tuple)lr.getNext(dummyTuple).result).get(1);
                    lr.detachInput();
                    if ( null == key) // Tuple with null key. Drop it.
                        continue;
                     fetchNewTup = false;
                    break;


                case POStatus.STATUS_EOP:
                    fetchNewTup = true;
                    break;


                default:
                    int errCode = 2164;
                    String errMsg = "Expected EOP/OK as return status. Found: "+res.returnStatus;
                    throw new ExecException(errMsg,errCode);
                }            
                break;
            }
            if (!fetchNewTup)
                break;
        }


        if(key instanceof Tuple){
            Tuple tupKey = (Tuple)key;
            for(int i =0; i < tupKey.size(); i++)
                wrapperTuple.set(i, tupKey.get(i));
        }


        else
            wrapperTuple.set(0, key);


        wrapperTuple.set(keysCnt, fileName);
        wrapperTuple.set(keysCnt+1, curPos);    
        firstRec = false;
        return wrapperTuple;
    }
    
    public Integer bytesToInteger(byte[] b) throws IOException {
        return loader.bytesToInteger(b);
    }


    public Long bytesToLong(byte[] b) throws IOException {
        return loader.bytesToLong(b);
    }


    public Float bytesToFloat(byte[] b) throws IOException {
        return loader.bytesToFloat(b);
    }


    public Double bytesToDouble(byte[] b) throws IOException {
        return loader.bytesToDouble(b);
    }


    public String bytesToCharArray(byte[] b) throws IOException {
        return loader.bytesToCharArray(b);
    }


    public Map<String, Object> bytesToMap(byte[] b) throws IOException {
        return loader.bytesToMap(b);
    }


    public Tuple bytesToTuple(byte[] b) throws IOException {
        return loader.bytesToTuple(b);
    }


    public DataBag bytesToBag(byte[] b) throws IOException {
        return loader.bytesToBag(b);
    }


    public void fieldsToRead(Schema schema) {
        loader.fieldsToRead(schema);
    }


    public Schema determineSchema(
            String fileName,
            ExecType execType,
            DataStorage storage) throws IOException {
        return loader.determineSchema(fileName, execType, storage);
    }
}
// Source Code of org.apache.pig.impl.builtin.MergeJoinIndexer

// Related Classes of org.apache.pig.impl.builtin.MergeJoinIndexer