Examples of org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POStore

Package org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators

Examples of org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POStore

org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POStore
The store operator, which is used in two ways: 1) as a local operator, it can be used to store files; 2) in the MapReduce setting, it is used to create jobs from MapReduce operators, which keep the loads and stores in the Map and Reduce plans until the job is created.

        // We want to use typed tuple comparator for this job, instead of default 
        // raw binary comparator used by Pig, to make sure index entries are 
        // sorted correctly by Hadoop.
        indexerMROp.useTypedComparator(true); 


        POStore st = getStore();
        FileSpec strFile = getTempFileSpec();
        st.setSFile(strFile);
        indexerMROp.reducePlan.addAsLeaf(st);
        indexerMROp.setReduceDone(true);


        return strFile;
    }

View Full Code Here

            }
            
            else if(!rightMROpr.reduceDone){ 
                // Indexer must run in map. If we are in reduce, close it and start new MROper.
                // No need of yanking in this case. Since we are starting brand new MR Operator and it will contain nothing.
                POStore rightStore = getStore();
                FileSpec rightStrFile = getTempFileSpec();
                rightStore.setSFile(rightStrFile);
                rightMROpr.setReduceDone(true);
                rightMROpr = startNew(rightStrFile, rightMROpr);
                rightPipelinePlan = null; 
            }
            
            else{
                int errCode = 2022;
                String msg = "Both map and reduce phases have been done. This is unexpected while compiling.";
                throw new PlanException(msg, errCode, PigException.BUG);
            }
            
            joinOp.setupRightPipeline(rightPipelinePlan);
            rightMROpr.requestedParallelism = 1; // we need exactly one reducer for indexing job.        
            
            // At this point, we must be operating on the map plan of the right input, and it should contain nothing other than a POLoad.
            POLoad rightLoader = (POLoad)rightMROpr.mapPlan.getRoots().get(0);
            joinOp.setSignature(rightLoader.getSignature());
            LoadFunc rightLoadFunc = rightLoader.getLoadFunc();
            List<String> udfs = new ArrayList<String>();
            if(IndexableLoadFunc.class.isAssignableFrom(rightLoadFunc.getClass())) {
                joinOp.setRightLoaderFuncSpec(rightLoader.getLFile().getFuncSpec());
                joinOp.setRightInputFileName(rightLoader.getLFile().getFileName());
                udfs.add(rightLoader.getLFile().getFuncSpec().toString());
                
                // we don't need the right MROper since
                // the right loader is an IndexableLoadFunc which can handle the index
                // itself
                MRPlan.remove(rightMROpr);
                if(rightMROpr == compiledInputs[0]) {
                    compiledInputs[0] = null;
                } else if(rightMROpr == compiledInputs[1]) {
                    compiledInputs[1] = null;
                } 
                rightMROpr = null;
                
                // validate that the join keys in merge join are only                                                                                                                                                                              
                // simple column projections or '*' and not expression - expressions                                                                                                                                                               
                // cannot be handled when the index is built by the storage layer on the sorted                                                                                                                                                    
                // data when the sorted data (and corresponding index) is written.                                                                                                                                                                 
                // So merge join will be restricted not have expressions as                                                                                                                                                                        
                // join keys      
                int numInputs = mPlan.getPredecessors(joinOp).size(); // should be 2
                for(int i = 0; i < numInputs; i++) {
                    List<PhysicalPlan> keyPlans = joinOp.getInnerPlansOf(i);
                    for (PhysicalPlan keyPlan : keyPlans) {
                        for(PhysicalOperator op : keyPlan) {
                            if(!(op instanceof POProject)) {
                                int errCode = 1106;
                                String errMsg = "Merge join is possible only for simple column or '*' join keys when using " +
                                rightLoader.getLFile().getFuncSpec() + " as the loader";
                                throw new MRCompilerException(errMsg, errCode, PigException.INPUT);
                            }
                        }
                    }
                }
            } else {
                
                // Replace POLoad with  indexer.


                LoadFunc loadFunc = rightLoader.getLoadFunc();
                if (! (OrderedLoadFunc.class.isAssignableFrom(loadFunc.getClass()))){
                    int errCode = 1104;
                    String errMsg = "Right input of merge-join must implement " +
                    "OrderedLoadFunc interface. The specified loader " 
                    + loadFunc + " doesn't implement it";
                    throw new MRCompilerException(errMsg,errCode);
                }


                String[] indexerArgs = new String[6];
                List<PhysicalPlan> rightInpPlans = joinOp.getInnerPlansOf(1);
                FileSpec origRightLoaderFileSpec = rightLoader.getLFile();


                indexerArgs[0] = origRightLoaderFileSpec.getFuncSpec().toString();
                indexerArgs[1] = ObjectSerializer.serialize((Serializable)rightInpPlans);
                indexerArgs[2] = ObjectSerializer.serialize(rightPipelinePlan);
                indexerArgs[3] = rightLoader.getSignature();
                indexerArgs[4] = rightLoader.getOperatorKey().scope;
                indexerArgs[5] = Boolean.toString(true);
                
                FileSpec lFile = new FileSpec(rightLoader.getLFile().getFileName(),new FuncSpec(MergeJoinIndexer.class.getName(), indexerArgs));
                rightLoader.setLFile(lFile);
    
                // Loader of mro will return a tuple of form - 
                // (keyFirst1, keyFirst2, .. , position, splitIndex) See MergeJoinIndexer


                simpleConnectMapToReduce(rightMROpr);
                rightMROpr.useTypedComparator(true);
                
                POStore st = getStore();
                FileSpec strFile = getTempFileSpec();
                st.setSFile(strFile);
                rightMROpr.reducePlan.addAsLeaf(st);
                rightMROpr.setReduceDone(true);
                
                // set up the DefaultIndexableLoader for the join operator
                String[] defaultIndexableLoaderArgs = new String[5];
                defaultIndexableLoaderArgs[0] = origRightLoaderFileSpec.getFuncSpec().toString();
                defaultIndexableLoaderArgs[1] = strFile.getFileName();
                defaultIndexableLoaderArgs[2] = strFile.getFuncSpec().toString();
                defaultIndexableLoaderArgs[3] = joinOp.getOperatorKey().scope;
                defaultIndexableLoaderArgs[4] = origRightLoaderFileSpec.getFileName();
                joinOp.setRightLoaderFuncSpec((new FuncSpec(DefaultIndexableLoader.class.getName(), defaultIndexableLoaderArgs)));
                joinOp.setRightInputFileName(origRightLoaderFileSpec.getFileName());  
                
                joinOp.setIndexFile(strFile.getFileName());
                udfs.add(origRightLoaderFileSpec.getFuncSpec().toString());
            }
            
            // We are done with right side. Lets work on left now.
            // Join will be materialized in leftMROper.
            if(!curMROp.mapDone) // Life is easy 
                curMROp.mapPlan.addAsLeaf(joinOp);
            
            else if(!curMROp.reduceDone){  // This is a map-side join. Close this MROper and start afresh.
                POStore leftStore = getStore();
                FileSpec leftStrFile = getTempFileSpec();
                leftStore.setSFile(leftStrFile);
                curMROp.setReduceDone(true);
                curMROp = startNew(leftStrFile, curMROp);
                curMROp.mapPlan.addAsLeaf(joinOp);
            }

View Full Code Here

      }
      
      //change plan to store the first join input into a temp file
      FileSpec fSpec = getTempFileSpec();
      MapReduceOper mro = compiledInputs[0];
      POStore str = getStore();
      str.setSFile(fSpec);
      if (!mro.isMapDone()) {
        mro.mapPlan.addAsLeaf(str);
        mro.setMapDoneSingle(true);
      } else if (mro.isMapDone() && !mro.isReduceDone()) {
        mro.reducePlan.addAsLeaf(str);
        mro.setReduceDone(true);
      } else {
        int errCode = 2022;
        String msg = "Both map and reduce phases have been done. This is unexpected while compiling.";
        throw new PlanException(msg, errCode, PigException.BUG);
      }
      
      FileSpec partitionFile = getTempFileSpec();
      int rp = op.getRequestedParallelism();
      
      Pair<MapReduceOper, Integer> sampleJobPair = getSkewedJoinSampleJob(op, mro, fSpec, partitionFile, rp);            
      rp = sampleJobPair.second;
      
      // set parallelism of SkewedJoin as the value calculated by sampling job
      // if "parallel" is specified in join statement, "rp" is equal to that number
      // if not specified, use the value that sampling process calculated
      // based on default.
      op.setRequestedParallelism(rp);
            
      // load the temp file for first table as input of join            
      MapReduceOper[] joinInputs = new MapReduceOper[] {startNew(fSpec, sampleJobPair.first), compiledInputs[1]};            
      MapReduceOper[] rearrangeOutputs = new MapReduceOper[2];                       
      
      compiledInputs = new MapReduceOper[] {joinInputs[0]};
      // run POLocalRearrange for first join table
      POLocalRearrange lr = new POLocalRearrange(new OperatorKey(scope,nig.getNextNodeId(scope)), rp);            
      try {
        lr.setIndex(0);                
      } catch (ExecException e) {
        int errCode = 2058;
        String msg = "Unable to set index on newly created POLocalRearrange.";
        throw new PlanException(msg, errCode, PigException.BUG, e);
      }
      
      List<PhysicalOperator> l = plan.getPredecessors(op);
      MultiMap<PhysicalOperator, PhysicalPlan> joinPlans = op.getJoinPlans();
      List<PhysicalPlan> groups = joinPlans.get(l.get(0));
      // check the type of group keys, if there are more than one field, the key is TUPLE.
      byte type = DataType.TUPLE;
      if (groups.size() == 1) {
        type = groups.get(0).getLeaves().get(0).getResultType();                
      }               
      
      lr.setKeyType(type);            
      lr.setPlans(groups);
      lr.setResultType(DataType.TUPLE);
      
      lr.visit(this);
      if(lr.getRequestedParallelism() > curMROp.requestedParallelism)
        curMROp.requestedParallelism = lr.getRequestedParallelism();
      rearrangeOutputs[0] = curMROp;
      
      compiledInputs = new MapReduceOper[] {joinInputs[1]};       
      // if the map for current input is already closed, then start a new job
      if (compiledInputs[0].isMapDone() && !compiledInputs[0].isReduceDone()) {
        FileSpec f = getTempFileSpec();
        POStore s = getStore();
        s.setSFile(f);
        compiledInputs[0].reducePlan.addAsLeaf(s);
        compiledInputs[0].setReduceDone(true);
        compiledInputs[0] = startNew(f, compiledInputs[0]);
      }

View Full Code Here

        POForEach nfe3 = new POForEach(new OperatorKey(scope,nig.getNextNodeId(scope)), -1, ep4s, flattened3);
        
        mro.reducePlan.add(nfe3);
        mro.reducePlan.connect(nfe2, nfe3);
        
        POStore str = getStore();
        str.setSFile(sampleFile);
        
        mro.reducePlan.add(str);
        mro.reducePlan.connect(nfe3, str);
        
        mro.setReduceDone(true);

View Full Code Here

                    fixProjectionAfterLimit(limitAdjustMROp, mr);
                    limitAdjustMROp.setLimitAfterSort(true);
                    limitAdjustMROp.setSortOrder(mr.getSortOrder());
                }
                
                POStore st = getStore();
                st.setSFile(oldSpec);
                st.setIsTmpStore(oldIsTmpStore);
                limitAdjustMROp.reducePlan.addAsLeaf(st);
                limitAdjustMROp.requestedParallelism = 1;
                limitAdjustMROp.setLimitOnly(true);
                
                List<MapReduceOper> successorList = MRPlan.getSuccessors(mr);

View Full Code Here

    }
    
    @Override
    public void visit(LOStore loStore) throws VisitorException {
        String scope = loStore.getOperatorKey().scope;
        POStore store = new POStore(new OperatorKey(scope, nodeGen
                .getNextNodeId(scope)));
        store.setSFile(loStore.getOutputFile());
        store.setInputSpec(loStore.getInputSpec());
        try {
            // create a new schema for ourselves so that when
            // we serialize we are not serializing objects that
            // contain the schema - apparently Java tries to
            // serialize the object containing the schema if
            // we are trying to serialize the schema reference in
            // the containing object. The schema here will be serialized
            // in JobControlCompiler
            store.setSchema(new Schema(loStore.getSchema()));
        } catch (FrontendException e1) {
            int errorCode = 1060;
            String message = "Cannot resolve Store output schema";  
            throw new VisitorException(message, errorCode, PigException.BUG, e1);    
        }

View Full Code Here

            }


            if (mapStores.size() + reduceStores.size() == 1) { // single store case
                log.info("Setting up single store job");
                
                POStore st;
                if (reduceStores.isEmpty()) {
                    st = mapStores.remove(0);
                    mro.mapPlan.remove(st);
                }
                else {
                    st = reduceStores.remove(0);
                    mro.reducePlan.remove(st);
                }


                // If the StoreFunc associated with the POStore implements
                // getStorePreparationClass() and returns a non-null value,
                // then it may want to supply its own OutputFormat for writing to Hadoop.
                // Check whether this is the case; if so, use the OutputFormat class the
                // StoreFunc gives us, else use our default PigOutputFormat.
                Object storeFunc = PigContext.instantiateFuncFromSpec(st.getSFile().getFuncSpec());
                Class sPrepClass = null;
                try {
                    sPrepClass = ((StoreFunc)storeFunc).getStorePreparationClass();
                } catch(AbstractMethodError e) {
                    // this is for backward compatibility wherein some old StoreFunc
                    // which does not implement getStorePreparationClass() is being
                    // used. In this case, we want to just use PigOutputFormat
                    sPrepClass = null;
                }
                if(sPrepClass != null && OutputFormat.class.isAssignableFrom(sPrepClass)) {
                    jobConf.setOutputFormat(sPrepClass);
                } else {
                    jobConf.setOutputFormat(PigOutputFormat.class);
                }
                
                //set out filespecs
                String outputPath = st.getSFile().getFileName();
                FuncSpec outputFuncSpec = st.getSFile().getFuncSpec();
                FileOutputFormat.setOutputPath(jobConf, new Path(outputPath));
                
                // serialize the store func spec using ObjectSerializer
                // ObjectSerializer.serialize() uses default java serialization
                // and then further encodes the output so that control characters
                // get encoded as regular characters. Otherwise any control characters
                // in the store funcspec would break the job.xml which is created by
                // hadoop from the jobconf.
                jobConf.set("pig.storeFunc", ObjectSerializer.serialize(outputFuncSpec.toString()));
                jobConf.set(PIG_STORE_CONFIG, 
                            ObjectSerializer.serialize(new StoreConfig(outputPath, st.getSchema())));


                jobConf.set("pig.streaming.log.dir", 
                            new Path(outputPath, LOG_DIR).toString());
                jobConf.set("pig.streaming.task.output.dir", outputPath);
            } 
           else { // multi store case
                log.info("Setting up multi store job");


                tmpLocation = makeTmpPath();


                FileSystem fs = tmpLocation.getFileSystem(conf);
                for (POStore st: mapStores) {
                    Path tmpOut = new Path(
                        tmpLocation,
                        PlanHelper.makeStoreTmpPath(st.getSFile().getFileName()));
                    fs.mkdirs(tmpOut);
                }


                jobConf.setOutputFormat(PigOutputFormat.class);
                FileOutputFormat.setOutputPath(jobConf, tmpLocation);

View Full Code Here

                else
                    simpleConnectMapToReduce(limitAdjustMROp);
                POLimit pLimit2 = new POLimit(new OperatorKey(scope,nig.getNextNodeId(scope)));
                pLimit2.setLimit(mr.limit);
                limitAdjustMROp.reducePlan.addAsLeaf(pLimit2);
                POStore st = getStore();
                st.setSFile(oldSpec);
                st.setIsTmpStore(false);
                limitAdjustMROp.reducePlan.addAsLeaf(st);
                limitAdjustMROp.requestedParallelism = 1;
                // If the operator we're following has global sort set, we
                // need to indicate that this is a limit after a sort.
                // This will assure that we get the right sort comparator

View Full Code Here

        ld.setPc(pigContext);
        return ld;
    }
    
    private POStore getStore(){
        POStore st = new POStore(new OperatorKey(scope,nig.getNextNodeId(scope)));
        // mark store as tmp store. These could be removed by the
        // optimizer, because it wasn't the user requesting it.
        st.setIsTmpStore(true);
        return st;
    }

View Full Code Here

                curMROp = mro;
            }
            else if(mro.isMapDone() && !mro.isReduceDone()){
                FileSpec fSpec = getTempFileSpec();
                
                POStore st = getStore();
                st.setSFile(fSpec);
                mro.reducePlan.addAsLeaf(st);
                mro.setReduceDone(true);
                curMROp = startNew(fSpec, mro);
                curMROp.setMapDone(true);
            }

View Full Code Here

0 1 2 3 4 5 6 7 8 9

TOP

Related Classes of org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POStore

com.netflix.lipstick.util.OutputSamplerTest

org.apache.pig.backend.executionengine.util.ExecTools

org.apache.pig.backend.hadoop.executionengine.fetch.FetchLauncher

org.apache.pig.backend.hadoop.executionengine.HExecutionEngine

org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler

org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.LimitAdjuster

org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher

org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MRCompiler

org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MRCompiler$LimitAdjuster

org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MultiQueryOptimizer

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.