/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.hadoop.hive.ql.optimizer;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Stack;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.hive.common.ObjectPair;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.Order;
import org.apache.hadoop.hive.ql.exec.ExtractOperator;
import org.apache.hadoop.hive.ql.exec.FileSinkOperator;
import org.apache.hadoop.hive.ql.exec.FilterOperator;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
import org.apache.hadoop.hive.ql.exec.SelectOperator;
import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker;
import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher;
import org.apache.hadoop.hive.ql.lib.Dispatcher;
import org.apache.hadoop.hive.ql.lib.GraphWalker;
import org.apache.hadoop.hive.ql.lib.Node;
import org.apache.hadoop.hive.ql.lib.NodeProcessor;
import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
import org.apache.hadoop.hive.ql.lib.Rule;
import org.apache.hadoop.hive.ql.lib.RuleRegExp;
import org.apache.hadoop.hive.ql.metadata.Partition;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.ql.parse.ParseContext;
import org.apache.hadoop.hive.ql.parse.PrunedPartitionList;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;
import org.apache.hadoop.hive.ql.plan.SelectDesc;

/**
* This transformation optimizes the enforcement of bucketing and sorting.
* For a query of the form:
* insert overwrite table T1 select * from T2;
* where T1 and T2 are bucketed/sorted on the same keys, no reducer is needed
* to enforce bucketing and sorting.
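*
* A minimal sketch of table definitions under which the rewrite applies
* (assumed DDL, not from the original source):
*   CREATE TABLE T2 (key INT, value STRING)
*     CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS;
*   CREATE TABLE T1 (key INT, value STRING)
*     CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS;
* With hive.enforce.bucketing/hive.enforce.sorting enabled, the insert plan has
* the shape TS -> (SEL|FIL)* -> RS -> EX -> FS; when the source is already
* bucketed and sorted compatibly, the RS and EX can be dropped, leaving a
* map-only job.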
*/
public class BucketingSortingReduceSinkOptimizer implements Transform {

  private static final Log LOG = LogFactory.getLog(BucketingSortingReduceSinkOptimizer.class
      .getName());

  public BucketingSortingReduceSinkOptimizer() {
  }

  @Override
  public ParseContext transform(ParseContext pctx) throws SemanticException {

    Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
    HiveConf conf = pctx.getConf();

    // process reduce sink added by hive.enforce.bucketing or hive.enforce.sorting
    opRules.put(new RuleRegExp("R1",
        ReduceSinkOperator.getOperatorName() + "%" +
            ExtractOperator.getOperatorName() + "%" +
            FileSinkOperator.getOperatorName() + "%"),
        getBucketSortReduceSinkProc(pctx));
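
    // For illustration (operator-name abbreviations assumed: RS = ReduceSink,
    // EX = Extract, FS = FileSink): the rule above matches an operator path of
    // the form RS%EX%FS%, i.e. a reduce sink feeding an extract feeding a file
    // sink, which is the shape produced when bucketing/sorting is enforced at
    // insert time.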

    // The dispatcher fires the processor corresponding to the closest matching rule
    Dispatcher disp = new DefaultRuleDispatcher(getDefaultProc(), opRules, null);
    GraphWalker ogw = new DefaultGraphWalker(disp);

    // Create a list of top (root) operator nodes
    ArrayList<Node> topNodes = new ArrayList<Node>();
    topNodes.addAll(pctx.getTopOps().values());
    ogw.startWalking(topNodes, null);

    return pctx;
  }

  private NodeProcessor getDefaultProc() {
    return new NodeProcessor() {
      @Override
      public Object process(Node nd, Stack<Node> stack,
          NodeProcessorCtx procCtx, Object... nodeOutputs) throws SemanticException {
        return null;
      }
    };
  }

  private NodeProcessor getBucketSortReduceSinkProc(ParseContext pctx) {
    return new BucketSortReduceSinkProcessor(pctx);
  }

  /**
   * BucketSortReduceSinkProcessor.
   *
   */
  public class BucketSortReduceSinkProcessor implements NodeProcessor {

    protected ParseContext pGraphContext;

    public BucketSortReduceSinkProcessor(ParseContext pGraphContext) {
      this.pGraphContext = pGraphContext;
    }

    // Get the bucket positions for the table
    private List<Integer> getBucketPositions(List<String> tabBucketCols,
        List<FieldSchema> tabCols) {
      List<Integer> posns = new ArrayList<Integer>();
      for (String bucketCol : tabBucketCols) {
        int pos = 0;
        for (FieldSchema tabCol : tabCols) {
          if (bucketCol.equals(tabCol.getName())) {
            posns.add(pos);
            break;
          }
          pos++;
        }
      }
      return posns;
    }
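
    // Worked example (hypothetical schema, not from the original source):
    // for tabCols = (key, value1, value2) and tabBucketCols = ["key"], this
    // returns [0]; for tabBucketCols = ["value2", "key"] it returns [2, 0],
    // preserving the order of the bucket columns. A bucket column that does
    // not appear in tabCols is silently skipped.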

    // Get the sort positions and sort order for the table
    private List<ObjectPair<Integer, Integer>> getSortPositions(List<Order> tabSortCols,
        List<FieldSchema> tabCols) {
      List<ObjectPair<Integer, Integer>> posns = new ArrayList<ObjectPair<Integer, Integer>>();
      for (Order sortCol : tabSortCols) {
        int pos = 0;
        for (FieldSchema tabCol : tabCols) {
          if (sortCol.getCol().equals(tabCol.getName())) {
            posns.add(new ObjectPair<Integer, Integer>(pos, sortCol.getOrder()));
            break;
          }
          pos++;
        }
      }
      return posns;
    }
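
    // Worked example (hypothetical schema, not from the original source):
    // for tabCols = (key, value) and tabSortCols = [Order("key", 1)], this
    // returns [(0, 1)]: each pair holds the column position and its sort
    // order (the metastore encodes ascending as 1 and descending as 0).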

    // Return true if the partition is bucketed/sorted by the specified positions.
    // The number of buckets and the sort order should also match, along with
    // the columns that are bucketed/sorted.
    private boolean checkPartition(Partition partition,
        List<Integer> bucketPositionsDest,
        List<ObjectPair<Integer, Integer>> sortPositionsDest,
        int numBucketsDest) {
      // The bucketing and sorting positions should exactly match
      int numBuckets = partition.getBucketCount();
      if (numBucketsDest != numBuckets) {
        return false;
      }

      List<Integer> partnBucketPositions =
          getBucketPositions(partition.getBucketCols(), partition.getTable().getCols());
      List<ObjectPair<Integer, Integer>> partnSortPositions =
          getSortPositions(partition.getSortCols(), partition.getTable().getCols());
      return bucketPositionsDest.equals(partnBucketPositions) &&
          sortPositionsDest.equals(partnSortPositions);
    }

    // Return true if the table is bucketed/sorted by the specified positions.
    // The number of buckets and the sort order should also match, along with
    // the columns that are bucketed/sorted.
    private boolean checkTable(Table table,
        List<Integer> bucketPositionsDest,
        List<ObjectPair<Integer, Integer>> sortPositionsDest,
        int numBucketsDest) {
      // The bucketing and sorting positions should exactly match
      int numBuckets = table.getNumBuckets();
      if (numBucketsDest != numBuckets) {
        return false;
      }

      List<Integer> tableBucketPositions =
          getBucketPositions(table.getBucketCols(), table.getCols());
      List<ObjectPair<Integer, Integer>> tableSortPositions =
          getSortPositions(table.getSortCols(), table.getCols());
      return bucketPositionsDest.equals(tableBucketPositions) &&
          sortPositionsDest.equals(tableSortPositions);
    }

    private void storeBucketPathMapping(TableScanOperator tsOp, FileStatus[] srcs) {
      Map<String, Integer> bucketFileNameMapping = new HashMap<String, Integer>();
      for (int pos = 0; pos < srcs.length; pos++) {
        bucketFileNameMapping.put(srcs[pos].getPath().getName(), pos);
      }
      tsOp.getConf().setBucketFileNameMapping(bucketFileNameMapping);
    }
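
    // Worked example (hypothetical bucket file names, not from the original
    // source): for srcs = [000000_0, 000001_0] the stored mapping is
    // {"000000_0" -> 0, "000001_0" -> 1}, so a mapper reading the file for
    // bucket N can name its output file after bucket N as well.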

    // Remove the reduce sink operator.
    // The optimizer will automatically convert the plan to a map-only job.
    private void removeReduceSink(ReduceSinkOperator rsOp,
        TableScanOperator tsOp,
        FileSinkOperator fsOp,
        FileStatus[] srcs) {
      if (srcs == null) {
        return;
      }

      removeReduceSink(rsOp, tsOp, fsOp);
      // Store the mapping from file name to bucket number.
      // This is needed since, for the map-only job, any mapper can process any file.
      // For example: if mapper 1 is processing the file corresponding to bucket 2, it
      // should also output the file corresponding to bucket 2 of the output table.
      storeBucketPathMapping(tsOp, srcs);
    }

    // Remove the reduce sink operator
    // Use bucketized hive input format so that one mapper processes exactly one file
    private void removeReduceSink(ReduceSinkOperator rsOp,
        TableScanOperator tsOp,
        FileSinkOperator fsOp) {
      Operator<? extends OperatorDesc> parRSOp = rsOp.getParentOperators().get(0);
      parRSOp.getChildOperators().set(0, fsOp);
      fsOp.getParentOperators().set(0, parRSOp);
      fsOp.getConf().setMultiFileSpray(false);
      fsOp.getConf().setTotalFiles(1);
      fsOp.getConf().setNumFiles(1);
      tsOp.setUseBucketizedHiveInputFormat(true);
    }
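
    // Sketch of the rewiring performed above (operator DAG):
    //   before: parent -> RS -> EX -> FS
    //   after:  parent -> FS
    // Disabling multi-file spray and forcing one file per mapper preserves a
    // one-to-one mapping between input and output bucket files.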

    private int findColumnPosition(List<FieldSchema> cols, String colName) {
      int pos = 0;
      for (FieldSchema col : cols) {
        if (colName.equals(col.getName())) {
          return pos;
        }
        pos++;
      }
      return -1;
    }

    @Override
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {

      // If the reduce sink has not been introduced due to bucketing/sorting, ignore it
      FileSinkOperator fsOp = (FileSinkOperator) nd;
      ExtractOperator exOp = (ExtractOperator) fsOp.getParentOperators().get(0);
      ReduceSinkOperator rsOp = (ReduceSinkOperator) exOp.getParentOperators().get(0);

      List<ReduceSinkOperator> rsOps = pGraphContext
          .getReduceSinkOperatorsAddedByEnforceBucketingSorting();
      // nothing to do
      if ((rsOps != null) && (!rsOps.contains(rsOp))) {
        return null;
      }

      // Support for dynamic partitions can be added later
      if (fsOp.getConf().getDynPartCtx() != null) {
        return null;
      }

      // If any reduce key is not a plain column reference, no conversion is possible
      for (ExprNodeDesc keyCol : rsOp.getConf().getKeyCols()) {
        if (!(keyCol instanceof ExprNodeColumnDesc)) {
          return null;
        }
      }

      Table destTable = pGraphContext.getFsopToTable().get(fsOp);
      if (destTable == null) {
        return null;
      }

      // Get the positions for sorted and bucketed columns
      // For sorted columns, also get the order (ascending/descending) - that should
      // also match for this to be converted to a map-only job.
      List<Integer> bucketPositions =
          getBucketPositions(destTable.getBucketCols(), destTable.getCols());
      List<ObjectPair<Integer, Integer>> sortPositions =
          getSortPositions(destTable.getSortCols(), destTable.getCols());

      // Walk up the operator tree from the reduce sink towards the table scan;
      // only selects and filters are allowed along the way.
      Operator<? extends OperatorDesc> op = rsOp;
      // The TableScan will also be followed by a Select operator. Find the expressions
      // that feed the bucketed/sorted columns of the destination table.
      List<ExprNodeColumnDesc> sourceTableBucketCols = new ArrayList<ExprNodeColumnDesc>();
      List<ExprNodeColumnDesc> sourceTableSortCols = new ArrayList<ExprNodeColumnDesc>();

      while (true) {
        if (op.getParentOperators().size() > 1) {
          return null;
        }

        op = op.getParentOperators().get(0);
        if (!(op instanceof TableScanOperator) &&
            !(op instanceof FilterOperator) &&
            !(op instanceof SelectOperator)) {
          return null;
        }

        // nothing to be done for filters - the output schema does not change.
        if (op instanceof TableScanOperator) {
          Table srcTable = pGraphContext.getTopToTable().get(op);

          // Find the positions of the bucketed columns in the table corresponding
          // to the select list.
          // Consider the following scenario:
          // T1(key, value1, value2) bucketed/sorted by key into 2 buckets
          // T2(dummy, key, value1, value2) bucketed/sorted by key into 2 buckets
          // A query like: insert overwrite table T2 select 1, key, value1, value2 from T1
          // should be optimized.

          // Start with the destination: T2, bucketed/sorted position is [1]
          // At the source T1, the column corresponding to that position is [key], which
          // maps to column [0] of T1, which is also bucketed/sorted into the same
          // number of buckets
          List<Integer> newBucketPositions = new ArrayList<Integer>();
          for (int pos = 0; pos < bucketPositions.size(); pos++) {
            ExprNodeColumnDesc col = sourceTableBucketCols.get(pos);
            String colName = col.getColumn();
            int bucketPos = findColumnPosition(srcTable.getCols(), colName);
            if (bucketPos < 0) {
              return null;
            }
            newBucketPositions.add(bucketPos);
          }

          // Find the positions/order of the sorted columns in the table corresponding
          // to the select list.
          List<ObjectPair<Integer, Integer>> newSortPositions =
              new ArrayList<ObjectPair<Integer, Integer>>();
          for (int pos = 0; pos < sortPositions.size(); pos++) {
            ExprNodeColumnDesc col = sourceTableSortCols.get(pos);
            String colName = col.getColumn();
            int sortPos = findColumnPosition(srcTable.getCols(), colName);
            if (sortPos < 0) {
              return null;
            }
            newSortPositions.add(
                new ObjectPair<Integer, Integer>(sortPos, sortPositions.get(pos).getSecond()));
          }


          if (srcTable.isPartitioned()) {
            PrunedPartitionList prunedParts = pGraphContext.getOpToPartList().get(op);
            List<Partition> partitions = prunedParts.getNotDeniedPartns();

            // Support for dynamic partitions can be added later
            // The following is not optimized:
            // insert overwrite table T1 partition (ds='1', hr) select key, value, hr from T2 where ds = '1';
            // where T1 and T2 are bucketed by the same keys and partitioned by ds, hr.
            if ((partitions == null) || (partitions.isEmpty()) || (partitions.size() > 1)) {
              return null;
            }
            for (Partition partition : partitions) {
              if (!checkPartition(partition, newBucketPositions, newSortPositions,
                  pGraphContext.getFsopToTable().get(fsOp).getNumBuckets())) {
                return null;
              }
            }

            removeReduceSink(rsOp, (TableScanOperator) op, fsOp,
                partitions.get(0).getSortedPaths());
            return null;
          }
          else {
            if (!checkTable(srcTable, newBucketPositions, newSortPositions,
                pGraphContext.getFsopToTable().get(fsOp).getNumBuckets())) {
              return null;
            }

            removeReduceSink(rsOp, (TableScanOperator) op, fsOp, srcTable.getSortedPaths());
            return null;
          }
        }
        // None of the operators is changing the positions
        else if (op instanceof SelectOperator) {
          SelectOperator selectOp = (SelectOperator) op;
          SelectDesc selectDesc = selectOp.getConf();

          // There may be multiple selects - choose the one closest to the table
          sourceTableBucketCols.clear();
          sourceTableSortCols.clear();

          // Only columns can be selected for both sorted and bucketed positions
          for (int pos : bucketPositions) {
            ExprNodeDesc selectColList = selectDesc.getColList().get(pos);
            if (!(selectColList instanceof ExprNodeColumnDesc)) {
              return null;
            }
            sourceTableBucketCols.add((ExprNodeColumnDesc) selectColList);
          }

          for (ObjectPair<Integer, Integer> pos : sortPositions) {
            ExprNodeDesc selectColList = selectDesc.getColList().get(pos.getFirst());
            if (!(selectColList instanceof ExprNodeColumnDesc)) {
              return null;
            }
            sourceTableSortCols.add((ExprNodeColumnDesc) selectColList);
          }
        }
      }
    }
  }
}