Package org.apache.pig.impl.io

Examples of org.apache.pig.impl.io.FileSpec
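
A FileSpec pairs a file name with the FuncSpec of the load or store function that understands that file; it is how the Pig MR compiler hands data between jobs. Before the compiler snippets, here is a minimal sketch of the constructor and accessors those snippets exercise (the path and the PigStorage class are illustrative choices, not taken from the examples):

    import org.apache.pig.FuncSpec;
    import org.apache.pig.impl.io.FileSpec;

    public class FileSpecSketch {
        public static void main(String[] args) {
            // Pair a (hypothetical) path with the function that reads/writes it.
            FileSpec spec = new FileSpec("/tmp/pig-temp-0001",
                    new FuncSpec("org.apache.pig.builtin.PigStorage"));

            System.out.println(spec.getFileName());  // the path
            System.out.println(spec.getFuncSpec());  // the load/store FuncSpec
        }
    }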


    /**
     * Returns a FileSpec pointing to a temporary DFS path.
     * @return FileSpec for the temporary path
     * @throws IOException
     */
    private FileSpec getTempFileSpec() throws IOException {
        return new FileSpec(FileLocalizer.getTemporaryPath(pigContext).toString(),
                new FuncSpec(Utils.getTmpFileCompressorName(pigContext)));
    }
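The temporary FileSpec returned by getTempFileSpec() is the glue used throughout these compiler methods: one MR operator stores its output to the temp file, and the next operator loads the same FileSpec back. A minimal sketch of that handoff, assuming the getStore()/getLoad() helpers seen in the snippets below; this is a fragment in the context of the compiler, not standalone code, and the variable names are illustrative:

    // Producing side: close the current job by writing to the temp FileSpec.
    FileSpec tmp = getTempFileSpec();
    POStore store = getStore();
    store.setSFile(tmp);
    mro.reducePlan.addAsLeaf(store);
    mro.setReduceDone(true);

    // Consuming side: a new job reads the same FileSpec back.
    POLoad load = getLoad();
    load.setLFile(tmp);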


    /**
     * @throws VisitorException
     */
    @Override
    public void visitSplit(POSplit op) throws VisitorException {
        try{
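            // The split result is materialized once to fSpec by this splitter job;
            // each successor of the split then starts a new MR operator whose map
            // plan loads fSpec back (startNew below).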
            FileSpec fSpec = op.getSplitStore();
            MapReduceOper mro = endSingleInputPlanWithStr(fSpec);
            mro.setSplitter(true);
            splitsSeen.put(op.getOperatorKey(), mro);
            curMROp = startNew(fSpec, mro);
            phyToMROpMap.put(op, curMROp);

                boolean combinable = !conf.getBoolean("pig.noSplitCombination", false);
               
                if (!mro.isMapDone()) {  
                    if (combinable && hasTooManyInputFiles(mro, conf)) {
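                        // Too many small input files: stage the map output to a
                        // temp file and insert an extra concatenation job ahead
                        // of curMROp.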
                        POStore tmpSto = getStore();
                        FileSpec fSpec = getTempFileSpec();
                        tmpSto.setSFile(fSpec);                        
                        mro.mapPlan.addAsLeaf(tmpSto);
                        mro.setMapDoneSingle(true);                   
                        MapReduceOper catMROp = getConcatenateJob(fSpec, mro, str);
                        MRPlan.connect(catMROp, curMROp);
                    } else {
                        mro.mapPlan.addAsLeaf(str);
                        mro.setMapDoneSingle(true);
                        MRPlan.connect(mro, curMROp);
                    }
                } else if (mro.isMapDone() && !mro.isReduceDone()) {
                    if (combinable && (mro.requestedParallelism >= fileConcatenationThreshold)) {
                        POStore tmpSto = getStore();
                        FileSpec fSpec = getTempFileSpec();
                        tmpSto.setSFile(fSpec);
                        mro.reducePlan.addAsLeaf(tmpSto);
                        mro.setReduceDone(true);
                        MapReduceOper catMROp = getConcatenateJob(fSpec, mro, str);
                        MRPlan.connect(catMROp, curMROp);

                    String errMsg = "Expected physical operator at root to be POLoad. Found : "+rootPOOp.getClass().getCanonicalName();
                    throw new MRCompilerException(errMsg,errCode);
                }
               
                POLoad sideLoader = (POLoad)rootPOOp;
                FileSpec loadFileSpec = sideLoader.getLFile();
                FuncSpec funcSpec = loadFileSpec.getFuncSpec();
                LoadFunc loadfunc = sideLoader.getLoadFunc();
                if(i == 0){
                   
                    if (!(CollectableLoadFunc.class.isAssignableFrom(loadfunc.getClass()))) {
                        int errCode = 2252;
                        throw new MRCompilerException("Base loader in Cogroup must implement CollectableLoadFunc.", errCode);
                    }
                   
                    ((CollectableLoadFunc)loadfunc).ensureAllKeyInstancesInSameSplit();
                    continue;
                }
                if(!(IndexableLoadFunc.class.isAssignableFrom(loadfunc.getClass()))){
                    int errCode = 2253;
                    throw new MRCompilerException("Side loaders in cogroup must implement IndexableLoadFunc.", errCode);
                }
               
                funcSpecs.add(funcSpec);
                fileSpecs.add(loadFileSpec.getFileName());
                loaderSigns.add(sideLoader.getSignature());
                MRPlan.remove(mrOper);
            }
           
            poCoGrp.setSideLoadFuncs(funcSpecs);
            poCoGrp.setSideFileSpecs(fileSpecs);
            poCoGrp.setLoaderSignatures(loaderSigns);
           
            // Use map-reduce operator of base relation for the cogroup operation.
            MapReduceOper baseMROp = phyToMROpMap.get(poCoGrp.getInputs().get(0));
            if(baseMROp.mapDone || !baseMROp.reducePlan.isEmpty()){
                int errCode = 2254;
                throw new MRCompilerException("Currently merged cogroup is not supported after blocking operators.", errCode);
            }
           
            // Create new map-reduce operator for indexing job and then configure it.
            MapReduceOper indexerMROp = getMROp();
            FileSpec idxFileSpec = getIndexingJob(indexerMROp, baseMROp, poCoGrp.getLRInnerPlansOf(0));
            poCoGrp.setIdxFuncSpec(idxFileSpec.getFuncSpec());
            poCoGrp.setIndexFileName(idxFileSpec.getFileName());
           
            baseMROp.mapPlan.addAsLeaf(poCoGrp);
            MRPlan.add(indexerMROp);
            MRPlan.connect(indexerMROp, baseMROp);

        throws MRCompilerException, PlanException, ExecException, IOException, CloneNotSupportedException {
       
        // First, replace the loader with MergeJoinIndexer.
        PhysicalPlan baseMapPlan = baseMROp.mapPlan;
        POLoad baseLoader = (POLoad)baseMapPlan.getRoots().get(0);                           
        FileSpec origLoaderFileSpec = baseLoader.getLFile();
        FuncSpec funcSpec = origLoaderFileSpec.getFuncSpec();
        LoadFunc loadFunc = baseLoader.getLoadFunc();
       
        if (!(OrderedLoadFunc.class.isAssignableFrom(loadFunc.getClass()))) {
            int errCode = 1104;
            String errMsg = "Base relation of merge-coGroup must implement "
                    + "OrderedLoadFunc interface. The specified loader "
                    + funcSpec + " doesn't implement it";
            throw new MRCompilerException(errMsg, errCode);
        }
       
        String[] indexerArgs = new String[6];
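        // Indexer constructor slots (the merge-join case below uses the same layout):
        // [0] FuncSpec of the underlying loader, [1] serialized inner plans,
        // [2] serialized pipeline plan (filled in further down), [3] loader signature,
        // [4] operator scope, [5] null-handling flag for MergeJoinIndexer.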
        indexerArgs[0] = funcSpec.toString();
        indexerArgs[1] = ObjectSerializer.serialize((Serializable)mapperLRInnerPlans);
        indexerArgs[3] = baseLoader.getSignature();
        indexerArgs[4] = baseLoader.getOperatorKey().scope;
        indexerArgs[5] = Boolean.toString(false); // we care for nulls.
           
        PhysicalPlan phyPlan;
        if (baseMapPlan.getSuccessors(baseLoader) == null
                || baseMapPlan.getSuccessors(baseLoader).isEmpty()) {
            // Load-Load-Cogroup case.
            phyPlan = null;
        } else { // We got something. Yank it and set it as the inner plan.
            phyPlan = baseMapPlan.clone();
            PhysicalOperator root = phyPlan.getRoots().get(0);
            phyPlan.disconnect(root, phyPlan.getSuccessors(root).get(0));
            phyPlan.remove(root);
        }
        indexerArgs[2] = ObjectSerializer.serialize(phyPlan);

        POLoad idxJobLoader = getLoad();
        idxJobLoader.setLFile(new FileSpec(origLoaderFileSpec.getFileName(),
                new FuncSpec(MergeJoinIndexer.class.getName(), indexerArgs)));
        indexerMROp.mapPlan.add(idxJobLoader);
       
        // The loader of this MR operator will return a tuple of the form
        // (key1, key2, ..., WritableComparable, splitIndex). See MergeJoinIndexer
        // for details.

        // After getting an index entry in each mapper, send all of them to one
        // reducer, where Hadoop will sort them on the way.
        simpleConnectMapToReduce(indexerMROp);
       
        indexerMROp.requestedParallelism = 1; // we need exactly one reducer for the indexing job.
       
        // We want to use typed tuple comparator for this job, instead of default
        // raw binary comparator used by Pig, to make sure index entries are
        // sorted correctly by Hadoop.
        indexerMROp.useTypedComparator(true);

        POStore st = getStore();
        FileSpec strFile = getTempFileSpec();
        st.setSFile(strFile);
        indexerMROp.reducePlan.addAsLeaf(st);
        indexerMROp.setReduceDone(true);

        return strFile;

           
            else if (!rightMROpr.reduceDone) {
                // The indexer must run in a map. If we are in reduce, close this
                // operator and start a new MROper. No yanking is needed in this case,
                // since the brand-new MR operator will contain nothing.
                POStore rightStore = getStore();
                FileSpec rightStrFile = getTempFileSpec();
                rightStore.setSFile(rightStrFile);
                rightMROpr.setReduceDone(true);
                rightMROpr = startNew(rightStrFile, rightMROpr);
                rightPipelinePlan = null;
            }
           
            else{
                int errCode = 2022;
                String msg = "Both map and reduce phases have been done. This is unexpected while compiling.";
                throw new PlanException(msg, errCode, PigException.BUG);
            }
           
            joinOp.setupRightPipeline(rightPipelinePlan);
            rightMROpr.requestedParallelism = 1; // we need exactly one reducer for the indexing job.
           
            // At this point, we must be operating on the map plan of the right input,
            // and it should contain nothing other than a POLoad.
            POLoad rightLoader = (POLoad)rightMROpr.mapPlan.getRoots().get(0);
            joinOp.setSignature(rightLoader.getSignature());
            LoadFunc rightLoadFunc = rightLoader.getLoadFunc();
            if(IndexableLoadFunc.class.isAssignableFrom(rightLoadFunc.getClass())) {
                joinOp.setRightLoaderFuncSpec(rightLoader.getLFile().getFuncSpec());
                joinOp.setRightInputFileName(rightLoader.getLFile().getFileName());
               
                // we don't need the right MROper since
                // the right loader is an IndexableLoadFunc which can handle the index
                // itself
                MRPlan.remove(rightMROpr);
                if(rightMROpr == compiledInputs[0]) {
                    compiledInputs[0] = null;
                } else if(rightMROpr == compiledInputs[1]) {
                    compiledInputs[1] = null;
                }
                rightMROpr = null;
               
                // Validate that the join keys in merge join are only simple column
                // projections or '*', and not expressions. Expressions cannot be
                // handled when the index is built by the storage layer on the sorted
                // data, at the time the sorted data (and corresponding index) is
                // written. So merge join is restricted to not have expressions as
                // join keys.
                int numInputs = mPlan.getPredecessors(joinOp).size(); // should be 2
                for(int i = 0; i < numInputs; i++) {
                    List<PhysicalPlan> keyPlans = joinOp.getInnerPlansOf(i);
                    for (PhysicalPlan keyPlan : keyPlans) {
                        for(PhysicalOperator op : keyPlan) {
                            if(!(op instanceof POProject)) {
                                int errCode = 1106;
                                String errMsg = "Merge join is possible only for simple column or '*' join keys when using " +
                                rightLoader.getLFile().getFuncSpec() + " as the loader";
                                throw new MRCompilerException(errMsg, errCode, PigException.INPUT);
                            }
                        }
                    }
                }
            } else {
               
                // Replace POLoad with the indexer.

                LoadFunc loadFunc = rightLoader.getLoadFunc();
                if (!(OrderedLoadFunc.class.isAssignableFrom(loadFunc.getClass()))) {
                    int errCode = 1104;
                    String errMsg = "Right input of merge-join must implement "
                            + "OrderedLoadFunc interface. The specified loader "
                            + loadFunc + " doesn't implement it";
                    throw new MRCompilerException(errMsg, errCode);
                }

                String[] indexerArgs = new String[6];
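                // Same slot layout as in the indexing job above: loader FuncSpec,
                // serialized plans, loader signature, scope, and the null-handling flag.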
                List<PhysicalPlan> rightInpPlans = joinOp.getInnerPlansOf(1);
                FileSpec origRightLoaderFileSpec = rightLoader.getLFile();

                indexerArgs[0] = origRightLoaderFileSpec.getFuncSpec().toString();
                indexerArgs[1] = ObjectSerializer.serialize((Serializable)rightInpPlans);
                indexerArgs[2] = ObjectSerializer.serialize(rightPipelinePlan);
                indexerArgs[3] = rightLoader.getSignature();
                indexerArgs[4] = rightLoader.getOperatorKey().scope;
                indexerArgs[5] = Boolean.toString(true);
               
                FileSpec lFile = new FileSpec(rightLoader.getLFile().getFileName(),new FuncSpec(MergeJoinIndexer.class.getName(), indexerArgs));
                rightLoader.setLFile(lFile);
   
                // The loader of this MR operator will return a tuple of the form
                // (keyFirst1, keyFirst2, ..., position, splitIndex). See MergeJoinIndexer.

                simpleConnectMapToReduce(rightMROpr);
                rightMROpr.useTypedComparator(true);
               
                POStore st = getStore();
                FileSpec strFile = getTempFileSpec();
                st.setSFile(strFile);
                rightMROpr.reducePlan.addAsLeaf(st);
                rightMROpr.setReduceDone(true);
               
                // set up the DefaultIndexableLoader for the join operator
                String[] defaultIndexableLoaderArgs = new String[5];
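                // DefaultIndexableLoader slots: [0] FuncSpec of the original right
                // loader, [1] index file name, [2] FuncSpec the index was stored with,
                // [3] join scope, [4] original right input file name.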
                defaultIndexableLoaderArgs[0] = origRightLoaderFileSpec.getFuncSpec().toString();
                defaultIndexableLoaderArgs[1] = strFile.getFileName();
                defaultIndexableLoaderArgs[2] = strFile.getFuncSpec().toString();
                defaultIndexableLoaderArgs[3] = joinOp.getOperatorKey().scope;
                defaultIndexableLoaderArgs[4] = origRightLoaderFileSpec.getFileName();
                joinOp.setRightLoaderFuncSpec((new FuncSpec(DefaultIndexableLoader.class.getName(), defaultIndexableLoaderArgs)));
                joinOp.setRightInputFileName(origRightLoaderFileSpec.getFileName());
               
                joinOp.setIndexFile(strFile.getFileName());
            }
           
            // We are done with the right side. Let's work on the left now.
            // The join will be materialized in leftMROper.
            if(!curMROp.mapDone) // Life is easy
                curMROp.mapPlan.addAsLeaf(joinOp);
           
            else if(!curMROp.reduceDone){  // This is a map-side join. Close this MROper and start afresh.
                POStore leftStore = getStore();
                FileSpec leftStrFile = getTempFileSpec();
                leftStore.setSFile(leftStrFile);
                curMROp.setReduceDone(true);
                curMROp = startNew(leftStrFile, curMROp);
                curMROp.mapPlan.addAsLeaf(joinOp);
            }

        int errCode = 2255;
        throw new VisitorException("POSkewedJoin operator has " + compiledInputs.length + " inputs. It should have 2.", errCode);
      }
     
      // Change the plan to store the first join input into a temp file.
      FileSpec fSpec = getTempFileSpec();
      MapReduceOper mro = compiledInputs[0];
      POStore str = getStore();
      str.setSFile(fSpec);
      if (!mro.isMapDone()) {
        mro.mapPlan.addAsLeaf(str);
        mro.setMapDoneSingle(true);
      } else if (mro.isMapDone() && !mro.isReduceDone()) {
        mro.reducePlan.addAsLeaf(str);
        mro.setReduceDone(true);
      } else {
        int errCode = 2022;
        String msg = "Both map and reduce phases have been done. This is unexpected while compiling.";
        throw new PlanException(msg, errCode, PigException.BUG);
      }
     
      FileSpec partitionFile = getTempFileSpec();
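      // Temp file where the sampling job will write the skewed-key partition
      // information for the join.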
      int rp = op.getRequestedParallelism();
     
      Pair<MapReduceOper, Integer> sampleJobPair = getSkewedJoinSampleJob(op, mro, fSpec, partitionFile, rp);           
      rp = sampleJobPair.second;
     
      // Set the parallelism of the skewed join to the value calculated by the
      // sampling job. If "parallel" is specified in the join statement, "rp"
      // equals that number; if not, use the value the sampling process
      // calculated based on the default.
      op.setRequestedParallelism(rp);
           
      // Load the temp file for the first table as input of the join.
      MapReduceOper[] joinInputs = new MapReduceOper[] {startNew(fSpec, sampleJobPair.first), compiledInputs[1]};           
      MapReduceOper[] rearrangeOutputs = new MapReduceOper[2];                      
     
      compiledInputs = new MapReduceOper[] {joinInputs[0]};
      // Run POLocalRearrange for the first join table.
      POLocalRearrange lr = new POLocalRearrange(new OperatorKey(scope,nig.getNextNodeId(scope)), rp);           
      try {
        lr.setIndex(0);               
      } catch (ExecException e) {
        int errCode = 2058;
        String msg = "Unable to set index on newly created POLocalRearrange.";
        throw new PlanException(msg, errCode, PigException.BUG, e);
      }
     
      List<PhysicalOperator> l = plan.getPredecessors(op);
      MultiMap<PhysicalOperator, PhysicalPlan> joinPlans = op.getJoinPlans();
      List<PhysicalPlan> groups = (List<PhysicalPlan>)joinPlans.get(l.get(0));
      // Check the type of the group keys; if there is more than one field, the key is a TUPLE.
      byte type = DataType.TUPLE;
      if (groups.size() == 1) {
        type = groups.get(0).getLeaves().get(0).getResultType();               
      }              
     
      lr.setKeyType(type);           
      lr.setPlans(groups);
      lr.setResultType(DataType.TUPLE);
     
      lr.visit(this);
      if(lr.getRequestedParallelism() > curMROp.requestedParallelism)
        curMROp.requestedParallelism = lr.getRequestedParallelism();
      rearrangeOutputs[0] = curMROp;
     
      compiledInputs = new MapReduceOper[] {joinInputs[1]};      
      // If the map for the current input is already closed, then start a new job.
      if (compiledInputs[0].isMapDone() && !compiledInputs[0].isReduceDone()) {
        FileSpec f = getTempFileSpec();
        POStore s = getStore();
        s.setSFile(f);
        compiledInputs[0].reducePlan.addAsLeaf(s);
        compiledInputs[0].setReduceDone(true);
        compiledInputs[0] = startNew(f, compiledInputs[0]);

    }

    @Override
    public void visitSort(POSort op) throws VisitorException {
        try{
            FileSpec fSpec = getTempFileSpec();
            MapReduceOper mro = endSingleInputPlanWithStr(fSpec);
            FileSpec quantFile = getTempFileSpec();
            int rp = op.getRequestedParallelism();
            Pair<Integer,Byte>[] fields = getSortCols(op.getSortPlans());
            Pair<MapReduceOper, Integer> quantJobParallelismPair =
                getQuantileJob(op, mro, fSpec, quantFile, rp, fields);
            curMROp = getSortJob(op, quantJobParallelismPair.first, fSpec, quantFile,

        // as its first constructor argument.
       
        rslargs[0] = (new FuncSpec(Utils.getTmpFileCompressorName(pigContext))).toString();
       
        rslargs[1] = "100"; // The value is calculated based on the file size for skewed join
        FileSpec quantLdFilName = new FileSpec(lFile.getFileName(),
            new FuncSpec(sampleLdrClassName, rslargs));
       
        MapReduceOper mro = startNew(quantLdFilName, prevJob);
      
        if(sort.isUDFComparatorUsed) {

        // Okay, we're on.
        // First, replace our RandomSampleLoader with a RandomSampleLoader that uses
        // the load function from our predecessor.
        String[] rslargs = new String[2];
        FileSpec predFs = predLoad.getLFile();
        // The first argument is the FuncSpec of the loader function to subsume;
        // this we want to set ourselves.
        rslargs[0] = predFs.getFuncSpec().toString();
        // Add the loader's FuncSpec to the list of UDFs associated with this MR operator.
        mr.UDFs.add(rslargs[0]);
        // The second argument is the number of samples per block; read it from the original.
        rslargs[1] = load.getLFile().getFuncSpec().getCtorArgs()[1];
        FileSpec fs = new FileSpec(predFs.getFileName(),new FuncSpec(loadFunc, rslargs));
        POLoad newLoad = new POLoad(load.getOperatorKey(),load.getRequestedParallelism(), fs);
        newLoad.setSignature(predLoad.getSignature());
        try {
            mr.mapPlan.replace(load, newLoad);
           
            // check if it has PartitionSkewedKeys
            List<PhysicalOperator> ls = mr.reducePlan.getLeaves();
            for (PhysicalOperator op : ls) {
                scan(mr, op, fs.getFileName());
            }
        } catch (PlanException e) {
            throw new VisitorException(e);
        }

        // Second, replace the loader in our successor with whatever the originally used loader was.
        fs = new FileSpec(predFs.getFileName(), predFs.getFuncSpec());
        newLoad = new POLoad(succLoad.getOperatorKey(), succLoad.getRequestedParallelism(), fs);
        newLoad.setSignature(predLoad.getSignature());
        try {
            succ.mapPlan.replace(succLoad, newLoad);
            // Add the loader's FuncSpec to the list of UDFs associated with this MR operator.
