Package org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators

Examples of org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POStore


        // We want to use typed tuple comparator for this job, instead of default
        // raw binary comparator used by Pig, to make sure index entries are
        // sorted correctly by Hadoop.
        indexerMROp.useTypedComparator(true);

        POStore st = getStore();
        FileSpec strFile = getTempFileSpec();
        st.setSFile(strFile);
        indexerMROp.reducePlan.addAsLeaf(st);
        indexerMROp.setReduceDone(true);

        return strFile;
    }
View Full Code Here


            }
           
            else if(!rightMROpr.reduceDone){
                // Indexer must run in map. If we are in reduce, close it and start new MROper.
                // No need of yanking in this case. Since we are starting brand new MR Operator and it will contain nothing.
                POStore rightStore = getStore();
                FileSpec rightStrFile = getTempFileSpec();
                rightStore.setSFile(rightStrFile);
                rightMROpr.setReduceDone(true);
                rightMROpr = startNew(rightStrFile, rightMROpr);
                rightPipelinePlan = null;
            }
           
            else{
                int errCode = 2022;
                String msg = "Both map and reduce phases have been done. This is unexpected while compiling.";
                throw new PlanException(msg, errCode, PigException.BUG);
            }
           
            joinOp.setupRightPipeline(rightPipelinePlan);
            rightMROpr.requestedParallelism = 1; // we need exactly one reducer for indexing job.       
           
            // At this point, we must be operating on map plan of right input and it would contain nothing else other then a POLoad.
            POLoad rightLoader = (POLoad)rightMROpr.mapPlan.getRoots().get(0);
            joinOp.setSignature(rightLoader.getSignature());
            LoadFunc rightLoadFunc = rightLoader.getLoadFunc();
            List<String> udfs = new ArrayList<String>();
            if(IndexableLoadFunc.class.isAssignableFrom(rightLoadFunc.getClass())) {
                joinOp.setRightLoaderFuncSpec(rightLoader.getLFile().getFuncSpec());
                joinOp.setRightInputFileName(rightLoader.getLFile().getFileName());
                udfs.add(rightLoader.getLFile().getFuncSpec().toString());
               
                // we don't need the right MROper since
                // the right loader is an IndexableLoadFunc which can handle the index
                // itself
                MRPlan.remove(rightMROpr);
                if(rightMROpr == compiledInputs[0]) {
                    compiledInputs[0] = null;
                } else if(rightMROpr == compiledInputs[1]) {
                    compiledInputs[1] = null;
                }
                rightMROpr = null;
               
                // validate that the join keys in merge join are only                                                                                                                                                                             
                // simple column projections or '*' and not expression - expressions                                                                                                                                                              
                // cannot be handled when the index is built by the storage layer on the sorted                                                                                                                                                   
                // data when the sorted data (and corresponding index) is written.                                                                                                                                                                
                // So merge join will be restricted not have expressions as                                                                                                                                                                       
                // join keys     
                int numInputs = mPlan.getPredecessors(joinOp).size(); // should be 2
                for(int i = 0; i < numInputs; i++) {
                    List<PhysicalPlan> keyPlans = joinOp.getInnerPlansOf(i);
                    for (PhysicalPlan keyPlan : keyPlans) {
                        for(PhysicalOperator op : keyPlan) {
                            if(!(op instanceof POProject)) {
                                int errCode = 1106;
                                String errMsg = "Merge join is possible only for simple column or '*' join keys when using " +
                                rightLoader.getLFile().getFuncSpec() + " as the loader";
                                throw new MRCompilerException(errMsg, errCode, PigException.INPUT);
                            }
                        }
                    }
                }
            } else {
               
                // Replace POLoad with  indexer.

                LoadFunc loadFunc = rightLoader.getLoadFunc();
                if (! (OrderedLoadFunc.class.isAssignableFrom(loadFunc.getClass()))){
                    int errCode = 1104;
                    String errMsg = "Right input of merge-join must implement " +
                    "OrderedLoadFunc interface. The specified loader "
                    + loadFunc + " doesn't implement it";
                    throw new MRCompilerException(errMsg,errCode);
                }

                String[] indexerArgs = new String[6];
                List<PhysicalPlan> rightInpPlans = joinOp.getInnerPlansOf(1);
                FileSpec origRightLoaderFileSpec = rightLoader.getLFile();

                indexerArgs[0] = origRightLoaderFileSpec.getFuncSpec().toString();
                indexerArgs[1] = ObjectSerializer.serialize((Serializable)rightInpPlans);
                indexerArgs[2] = ObjectSerializer.serialize(rightPipelinePlan);
                indexerArgs[3] = rightLoader.getSignature();
                indexerArgs[4] = rightLoader.getOperatorKey().scope;
                indexerArgs[5] = Boolean.toString(true);
               
                FileSpec lFile = new FileSpec(rightLoader.getLFile().getFileName(),new FuncSpec(MergeJoinIndexer.class.getName(), indexerArgs));
                rightLoader.setLFile(lFile);
   
                // Loader of mro will return a tuple of form -
                // (keyFirst1, keyFirst2, .. , position, splitIndex) See MergeJoinIndexer

                simpleConnectMapToReduce(rightMROpr);
                rightMROpr.useTypedComparator(true);
               
                POStore st = getStore();
                FileSpec strFile = getTempFileSpec();
                st.setSFile(strFile);
                rightMROpr.reducePlan.addAsLeaf(st);
                rightMROpr.setReduceDone(true);
               
                // set up the DefaultIndexableLoader for the join operator
                String[] defaultIndexableLoaderArgs = new String[5];
                defaultIndexableLoaderArgs[0] = origRightLoaderFileSpec.getFuncSpec().toString();
                defaultIndexableLoaderArgs[1] = strFile.getFileName();
                defaultIndexableLoaderArgs[2] = strFile.getFuncSpec().toString();
                defaultIndexableLoaderArgs[3] = joinOp.getOperatorKey().scope;
                defaultIndexableLoaderArgs[4] = origRightLoaderFileSpec.getFileName();
                joinOp.setRightLoaderFuncSpec((new FuncSpec(DefaultIndexableLoader.class.getName(), defaultIndexableLoaderArgs)));
                joinOp.setRightInputFileName(origRightLoaderFileSpec.getFileName())
               
                joinOp.setIndexFile(strFile.getFileName());
                udfs.add(origRightLoaderFileSpec.getFuncSpec().toString());
            }
           
            // We are done with right side. Lets work on left now.
            // Join will be materialized in leftMROper.
            if(!curMROp.mapDone) // Life is easy
                curMROp.mapPlan.addAsLeaf(joinOp);
           
            else if(!curMROp.reduceDone){  // This is a map-side join. Close this MROper and start afresh.
                POStore leftStore = getStore();
                FileSpec leftStrFile = getTempFileSpec();
                leftStore.setSFile(leftStrFile);
                curMROp.setReduceDone(true);
                curMROp = startNew(leftStrFile, curMROp);
                curMROp.mapPlan.addAsLeaf(joinOp);
            }
           
View Full Code Here

      }
     
      //change plan to store the first join input into a temp file
      FileSpec fSpec = getTempFileSpec();
      MapReduceOper mro = compiledInputs[0];
      POStore str = getStore();
      str.setSFile(fSpec);
      if (!mro.isMapDone()) {
        mro.mapPlan.addAsLeaf(str);
        mro.setMapDoneSingle(true);
      } else if (mro.isMapDone() && !mro.isReduceDone()) {
        mro.reducePlan.addAsLeaf(str);
        mro.setReduceDone(true);
      } else {
        int errCode = 2022;
        String msg = "Both map and reduce phases have been done. This is unexpected while compiling.";
        throw new PlanException(msg, errCode, PigException.BUG);
      }
     
      FileSpec partitionFile = getTempFileSpec();
      int rp = op.getRequestedParallelism();
     
      Pair<MapReduceOper, Integer> sampleJobPair = getSkewedJoinSampleJob(op, mro, fSpec, partitionFile, rp);           
      rp = sampleJobPair.second;
     
      // set parallelism of SkewedJoin as the value calculated by sampling job
      // if "parallel" is specified in join statement, "rp" is equal to that number
      // if not specified, use the value that sampling process calculated
      // based on default.
      op.setRequestedParallelism(rp);
           
      // load the temp file for first table as input of join           
      MapReduceOper[] joinInputs = new MapReduceOper[] {startNew(fSpec, sampleJobPair.first), compiledInputs[1]};           
      MapReduceOper[] rearrangeOutputs = new MapReduceOper[2];                      
     
      compiledInputs = new MapReduceOper[] {joinInputs[0]};
      // run POLocalRearrange for first join table
      POLocalRearrange lr = new POLocalRearrange(new OperatorKey(scope,nig.getNextNodeId(scope)), rp);           
      try {
        lr.setIndex(0);               
      } catch (ExecException e) {
        int errCode = 2058;
        String msg = "Unable to set index on newly created POLocalRearrange.";
        throw new PlanException(msg, errCode, PigException.BUG, e);
      }
     
      List<PhysicalOperator> l = plan.getPredecessors(op);
      MultiMap<PhysicalOperator, PhysicalPlan> joinPlans = op.getJoinPlans();
      List<PhysicalPlan> groups = joinPlans.get(l.get(0));
      // check the type of group keys, if there are more than one field, the key is TUPLE.
      byte type = DataType.TUPLE;
      if (groups.size() == 1) {
        type = groups.get(0).getLeaves().get(0).getResultType();               
      }              
     
      lr.setKeyType(type);           
      lr.setPlans(groups);
      lr.setResultType(DataType.TUPLE);
     
      lr.visit(this);
      if(lr.getRequestedParallelism() > curMROp.requestedParallelism)
        curMROp.requestedParallelism = lr.getRequestedParallelism();
      rearrangeOutputs[0] = curMROp;
     
      compiledInputs = new MapReduceOper[] {joinInputs[1]};      
      // if the map for current input is already closed, then start a new job
      if (compiledInputs[0].isMapDone() && !compiledInputs[0].isReduceDone()) {
        FileSpec f = getTempFileSpec();
        POStore s = getStore();
        s.setSFile(f);
        compiledInputs[0].reducePlan.addAsLeaf(s);
        compiledInputs[0].setReduceDone(true);
        compiledInputs[0] = startNew(f, compiledInputs[0]);
      }              
     
View Full Code Here

        POForEach nfe3 = new POForEach(new OperatorKey(scope,nig.getNextNodeId(scope)), -1, ep4s, flattened3);
       
        mro.reducePlan.add(nfe3);
        mro.reducePlan.connect(nfe2, nfe3);
       
        POStore str = getStore();
        str.setSFile(sampleFile);
       
        mro.reducePlan.add(str);
        mro.reducePlan.connect(nfe3, str);
       
        mro.setReduceDone(true);
View Full Code Here

                    fixProjectionAfterLimit(limitAdjustMROp, mr);
                    limitAdjustMROp.setLimitAfterSort(true);
                    limitAdjustMROp.setSortOrder(mr.getSortOrder());
                }
               
                POStore st = getStore();
                st.setSFile(oldSpec);
                st.setIsTmpStore(oldIsTmpStore);
                limitAdjustMROp.reducePlan.addAsLeaf(st);
                limitAdjustMROp.requestedParallelism = 1;
                limitAdjustMROp.setLimitOnly(true);
               
                List<MapReduceOper> successorList = MRPlan.getSuccessors(mr);
View Full Code Here

    }
   
    @Override
    public void visit(LOStore loStore) throws VisitorException {
        String scope = loStore.getOperatorKey().scope;
        POStore store = new POStore(new OperatorKey(scope, nodeGen
                .getNextNodeId(scope)));
        store.setSFile(loStore.getOutputFile());
        store.setInputSpec(loStore.getInputSpec());
        try {
            // create a new schema for ourselves so that when
            // we serialize we are not serializing objects that
            // contain the schema - apparently Java tries to
            // serialize the object containing the schema if
            // we are trying to serialize the schema reference in
            // the containing object. The schema here will be serialized
            // in JobControlCompiler
            store.setSchema(new Schema(loStore.getSchema()));
        } catch (FrontendException e1) {
            int errorCode = 1060;
            String message = "Cannot resolve Store output schema"
            throw new VisitorException(message, errorCode, PigException.BUG, e1);   
        }
View Full Code Here

            }

            if (mapStores.size() + reduceStores.size() == 1) { // single store case
                log.info("Setting up single store job");
               
                POStore st;
                if (reduceStores.isEmpty()) {
                    st = mapStores.remove(0);
                    mro.mapPlan.remove(st);
                }
                else {
                    st = reduceStores.remove(0);
                    mro.reducePlan.remove(st);
                }

                // If the StoreFunc associate with the POStore is implements
                // getStorePreparationClass() and returns a non null value,
                // then it could be wanting to implement OutputFormat for writing out to hadoop
                // Check if this is the case, if so, use the OutputFormat class the
                // StoreFunc gives us else use our default PigOutputFormat
                Object storeFunc = PigContext.instantiateFuncFromSpec(st.getSFile().getFuncSpec());
                Class sPrepClass = null;
                try {
                    sPrepClass = ((StoreFunc)storeFunc).getStorePreparationClass();
                } catch(AbstractMethodError e) {
                    // this is for backward compatibility wherein some old StoreFunc
                    // which does not implement getStorePreparationClass() is being
                    // used. In this case, we want to just use PigOutputFormat
                    sPrepClass = null;
                }
                if(sPrepClass != null && OutputFormat.class.isAssignableFrom(sPrepClass)) {
                    jobConf.setOutputFormat(sPrepClass);
                } else {
                    jobConf.setOutputFormat(PigOutputFormat.class);
                }
               
                //set out filespecs
                String outputPath = st.getSFile().getFileName();
                FuncSpec outputFuncSpec = st.getSFile().getFuncSpec();
                FileOutputFormat.setOutputPath(jobConf, new Path(outputPath));
               
                // serialize the store func spec using ObjectSerializer
                // ObjectSerializer.serialize() uses default java serialization
                // and then further encodes the output so that control characters
                // get encoded as regular characters. Otherwise any control characters
                // in the store funcspec would break the job.xml which is created by
                // hadoop from the jobconf.
                jobConf.set("pig.storeFunc", ObjectSerializer.serialize(outputFuncSpec.toString()));
                jobConf.set(PIG_STORE_CONFIG,
                            ObjectSerializer.serialize(new StoreConfig(outputPath, st.getSchema())));

                jobConf.set("pig.streaming.log.dir",
                            new Path(outputPath, LOG_DIR).toString());
                jobConf.set("pig.streaming.task.output.dir", outputPath);
            }
           else { // multi store case
                log.info("Setting up multi store job");

                tmpLocation = makeTmpPath();

                FileSystem fs = tmpLocation.getFileSystem(conf);
                for (POStore st: mapStores) {
                    Path tmpOut = new Path(
                        tmpLocation,
                        PlanHelper.makeStoreTmpPath(st.getSFile().getFileName()));
                    fs.mkdirs(tmpOut);
                }

                jobConf.setOutputFormat(PigOutputFormat.class);
                FileOutputFormat.setOutputPath(jobConf, tmpLocation);
View Full Code Here

                else
                    simpleConnectMapToReduce(limitAdjustMROp);
                POLimit pLimit2 = new POLimit(new OperatorKey(scope,nig.getNextNodeId(scope)));
                pLimit2.setLimit(mr.limit);
                limitAdjustMROp.reducePlan.addAsLeaf(pLimit2);
                POStore st = getStore();
                st.setSFile(oldSpec);
                st.setIsTmpStore(false);
                limitAdjustMROp.reducePlan.addAsLeaf(st);
                limitAdjustMROp.requestedParallelism = 1;
                // If the operator we're following has global sort set, we
                // need to indicate that this is a limit after a sort.
                // This will assure that we get the right sort comparator
View Full Code Here

        ld.setPc(pigContext);
        return ld;
    }
   
    private POStore getStore(){
        POStore st = new POStore(new OperatorKey(scope,nig.getNextNodeId(scope)));
        // mark store as tmp store. These could be removed by the
        // optimizer, because it wasn't the user requesting it.
        st.setIsTmpStore(true);
        return st;
    }
View Full Code Here

                curMROp = mro;
            }
            else if(mro.isMapDone() && !mro.isReduceDone()){
                FileSpec fSpec = getTempFileSpec();
               
                POStore st = getStore();
                st.setSFile(fSpec);
                mro.reducePlan.addAsLeaf(st);
                mro.setReduceDone(true);
                curMROp = startNew(fSpec, mro);
                curMROp.setMapDone(true);
            }
View Full Code Here

TOP

Related Classes of org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POStore

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.