Package org.apache.pig.backend.hadoop.executionengine.fetch

Source Code of org.apache.pig.backend.hadoop.executionengine.fetch.FetchOptimizer

/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.pig.backend.hadoop.executionengine.fetch;

import java.io.IOException;
import java.util.LinkedList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pig.PigConfiguration;
import org.apache.pig.backend.datastorage.DataStorageException;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PhyPlanSetter;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.PhysicalOperator;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.plans.PhyPlanVisitor;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.plans.PhysicalPlan;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POCollectedGroup;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POCounter;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POCross;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.PODemux;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.PODistinct;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POFRJoin;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POGlobalRearrange;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POLimit;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POLoad;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POLocalRearrange;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POMergeCogroup;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POMergeJoin;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.PONative;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POOptimizedForEach;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POPackage;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POPartialAgg;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POPartitionRearrange;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POPreCombinerLocalRearrange;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.PORank;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POSkewedJoin;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POSort;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POSplit;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POStore;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POStream;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.util.PlanHelper;
import org.apache.pig.impl.PigContext;
import org.apache.pig.impl.PigImplConstants;
import org.apache.pig.impl.builtin.SampleLoader;
import org.apache.pig.impl.plan.DepthFirstWalker;
import org.apache.pig.impl.plan.VisitorException;
import org.apache.pig.impl.util.Utils;

/**
* FetchOptimizer determines whether the entire physical plan is fetchable, meaning
* that the task's result can be directly read (fetched) from the underlying storage
* rather than creating MR jobs. During the check {@link FetchablePlanVisitor} is used
* to walk through the plan.
*
*/
public class FetchOptimizer {
    private static final Log LOG = LogFactory.getLog(FetchOptimizer.class);

    /**
     * Checks whether the fetch is enabled
     *
     * @param pc
     * @return true if fetching is enabled
     */
    public static boolean isFetchEnabled(PigContext pc) {
        return "true".equalsIgnoreCase(
                pc.getProperties().getProperty(PigConfiguration.OPT_FETCH, "true"));
    }

    /**
     * Visits the plan with {@link FetchablePlanVisitor} and checks whether the
     * plan is fetchable.
     *
     * @param pc PigContext
     * @param pp the physical plan to be examined
     * @return true if the plan is fetchable
     * @throws VisitorException
     */
    public static boolean isPlanFetchable(PigContext pc, PhysicalPlan pp) throws VisitorException {
        if (isEligible(pc, pp)) {
            FetchablePlanVisitor fpv = new FetchablePlanVisitor(pc, pp);
            fpv.visit();
            // Plan is fetchable only if FetchablePlanVisitor returns true AND
            // limit is present in the plan. Limit is a safeguard. If the input
            // is large, and there is no limit, fetch optimizer will fetch the
            // entire input to the client. That can be dangerous.
            boolean isFetchable = fpv.isPlanFetchable() &&
                    PlanHelper.containsPhysicalOperator(pp, POLimit.class);
            if (isFetchable) {
                pc.getProperties().setProperty(PigImplConstants.CONVERTED_TO_FETCH, "true");
                init(pp);
            }
            return isFetchable;
        }
        return false;
    }

    private static void init(PhysicalPlan pp) throws VisitorException {
        //mark POStream ops 'fetchable'
        LinkedList<POStream> posList = PlanHelper.getPhysicalOperators(pp, POStream.class);
        for (POStream pos : posList) {
            pos.setFetchable(true);
        }
    }

    /**
     * Checks whether the plan fulfills the prerequisites needed for fetching.
     *
     * @param pc PigContext
     * @param pp the physical plan to be examined
     * @return
     */
    private static boolean isEligible(PigContext pc, PhysicalPlan pp) {
        if (!isFetchEnabled(pc)) {
            return false;
        }

        List<PhysicalOperator> roots = pp.getRoots();
        for (PhysicalOperator po : roots) {
            if (!(po instanceof POLoad)) {
                String msg = "Expected physical operator at root is POLoad. Found : "
                        + po.getClass().getCanonicalName() + ". Fetch optimizer will be disabled.";
                LOG.debug(msg);
                return false;
            }
        }

        //consider single leaf jobs only
        int leafSize = pp.getLeaves().size();
        if (pp.getLeaves().size() != 1) {
            LOG.debug("Expected physical plan should have one leaf. Found " + leafSize);
            return false;
        }

        return true;
    }

    /**
     * A plan is considered 'fetchable' if:
     * <pre>
     * - it contains only: LIMIT, FILTER, FOREACH, STREAM, UNION(no implicit SPLIT is allowed)
     * - no STORE
     * - no scalar aliases ({@link org.apache.pig.impl.builtin.ReadScalars ReadScalars})
     * - {@link org.apache.pig.LoadFunc LoadFunc} is not a {@link org.apache.pig.impl.builtin.SampleLoader SampleLoader}
     * </pre>
     */
    private static class FetchablePlanVisitor extends PhyPlanVisitor {

        private boolean planFetchable = true;
        private PigContext pc;

        public FetchablePlanVisitor(PigContext pc, PhysicalPlan plan) {
            super(plan, new DepthFirstWalker<PhysicalOperator, PhysicalPlan>(plan));
            this.pc = pc;
        }

        @Override
        public void visit() throws VisitorException {
            new PhyPlanSetter(mPlan).visit();
            super.visit();
        }

        @Override
        public void visitLoad(POLoad ld) throws VisitorException{
            if (ld.getLoadFunc() instanceof SampleLoader) {
                planFetchable = false;
            }
        }

        @Override
        public void visitStore(POStore st) throws VisitorException{
            String basePathName = st.getSFile().getFileName();

            //plan is fetchable if POStore belongs to EXPLAIN
            if ("fakefile".equals(basePathName)) {
                return;
            }

            //Otherwise check if target storage format equals to the intermediate storage format
            //and its path points to a temporary storage path
            boolean hasTmpStorageClass = st.getStoreFunc().getClass()
                .equals(Utils.getTmpFileStorageClass(pc.getProperties()));

            try {
                boolean hasTmpTargetPath = isTempPath(basePathName);
                if (!(hasTmpStorageClass && hasTmpTargetPath)) {
                    planFetchable = false;
                }
            }
            catch (IOException e) {
                String msg = "Internal error. Could not retrieve temporary store location.";
                throw new VisitorException(msg, e);
            }
        }

        @Override
        public void visitNative(PONative nat) throws VisitorException {
            planFetchable = false;
        }

        @Override
        public void visitCollectedGroup(POCollectedGroup mg) throws VisitorException {
            planFetchable = false;
        }

        @Override
        public void visitLocalRearrange(POLocalRearrange lr) throws VisitorException {
            planFetchable = false;
        }

        @Override
        public void visitGlobalRearrange(POGlobalRearrange gr) throws VisitorException {
            planFetchable = false;
        }

        @Override
        public void visitPackage(POPackage pkg) throws VisitorException {
            planFetchable = false;
        }

        @Override
        public void visitSplit(POSplit spl) throws VisitorException {
            planFetchable = false;
        }

        @Override
        public void visitDemux(PODemux demux) throws VisitorException {
            planFetchable = false;
        }

        @Override
        public void visitCounter(POCounter poCounter) throws VisitorException {
            planFetchable = false;
        }

        @Override
        public void visitRank(PORank rank) throws VisitorException {
            planFetchable = false;
        }

        @Override
        public void visitDistinct(PODistinct distinct) throws VisitorException {
            planFetchable = false;
        }

        @Override
        public void visitSort(POSort sort) throws VisitorException {
            planFetchable = false;
        }

        @Override
        public void visitCross(POCross cross) throws VisitorException {
            planFetchable = false;
        }

        @Override
        public void visitFRJoin(POFRJoin join) throws VisitorException {
            planFetchable = false;
        }

        @Override
        public void visitMergeJoin(POMergeJoin join) throws VisitorException {
            planFetchable = false;
        }

        @Override
        public void visitMergeCoGroup(POMergeCogroup mergeCoGrp) throws VisitorException {
            planFetchable = false;
        }

        @Override
        public void visitSkewedJoin(POSkewedJoin sk) throws VisitorException {
            planFetchable = false;
        }

        @Override
        public void visitPartitionRearrange(POPartitionRearrange pr) throws VisitorException {
            planFetchable = false;
        }

        @Override
        public void visitPOOptimizedForEach(POOptimizedForEach optimizedForEach)
                throws VisitorException {
            planFetchable = false;
        }

        @Override
        public void visitPreCombinerLocalRearrange(
                POPreCombinerLocalRearrange preCombinerLocalRearrange) {
            planFetchable = false;
        }

        @Override
        public void visitPartialAgg(POPartialAgg poPartialAgg) {
            planFetchable = false;
        }

        private boolean isPlanFetchable() {
            return planFetchable;
        }

        private boolean isTempPath(String basePathName) throws DataStorageException {
            String tdir = pc.getProperties().getProperty("pig.temp.dir", "/tmp");
            String tempStore = pc.getDfs().asContainer(tdir + "/temp").toString();
            Matcher matcher = Pattern.compile(tempStore + "-?[0-9]+").matcher(basePathName);
            return matcher.lookingAt();
        }
    }

}
TOP

Related Classes of org.apache.pig.backend.hadoop.executionengine.fetch.FetchOptimizer

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.