Package org.apache.hadoop.hive.ql.exec

Examples of org.apache.hadoop.hive.ql.exec.GroupByOperator


   * new AggregationDesc of the new GroupByOperator.
   */
  private static class NewQueryGroupbySchemaProc implements NodeProcessor {
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx ctx,
        Object... nodeOutputs) throws SemanticException {
      GroupByOperator operator = (GroupByOperator)nd;
      rewriteQueryCtx = (RewriteQueryUsingAggregateIndexCtx)ctx;

      //We need to replace the GroupByOperator which is in
      //groupOpToInputTables map with the new GroupByOperator
      if(rewriteQueryCtx.getParseContext().getGroupOpToInputTables().containsKey(operator)){
        List<ExprNodeDesc> gbyKeyList = operator.getConf().getKeys();
        String gbyKeys = null;
        Iterator<ExprNodeDesc> gbyKeyListItr = gbyKeyList.iterator();
        while(gbyKeyListItr.hasNext()){
          ExprNodeDesc expr = gbyKeyListItr.next().clone();
          if(expr instanceof ExprNodeColumnDesc){
            ExprNodeColumnDesc colExpr = (ExprNodeColumnDesc)expr;
            // accumulate every group-by key column in a comma-separated list
            // (a plain assignment here would keep only the last key)
            gbyKeys = (gbyKeys == null)
                ? colExpr.getColumn()
                : gbyKeys + "," + colExpr.getColumn();
          }
        }


        //the query contains the sum aggregation GenericUDAF
        String selReplacementCommand = "select sum(`"
          + rewriteQueryCtx.getAggregateFunction() + "`)"
          + " from " + rewriteQueryCtx.getIndexName()
          + " group by " + gbyKeys + " ";
        //create a new ParseContext for the query to retrieve its operator tree,
        //and the required GroupByOperator from it
        ParseContext newDAGContext = RewriteParseContextGenerator.generateOperatorTree(
            rewriteQueryCtx.getParseContext().getConf(),
            selReplacementCommand);

        //we get our new GroupByOperator here
        Map<GroupByOperator, Set<String>> newGbyOpMap = newDAGContext.getGroupOpToInputTables();
        GroupByOperator newGbyOperator = newGbyOpMap.keySet().iterator().next();
        GroupByDesc oldConf = operator.getConf();

        //we need this information to set the correct colList, outputColumnNames in SelectOperator
        ExprNodeColumnDesc aggrExprNode = null;

        //Construct the new AggregationDesc to get rid of the current
        //internal names and replace them with new internal names
        //as required by the operator tree
        GroupByDesc newConf = newGbyOperator.getConf();
        List<AggregationDesc> newAggrList = newConf.getAggregators();
        if(newAggrList != null && newAggrList.size() > 0){
          for (AggregationDesc aggregationDesc : newAggrList) {
            rewriteQueryCtx.setEval(aggregationDesc.getGenericUDAFEvaluator());
            aggrExprNode = (ExprNodeColumnDesc)aggregationDesc.getParameters().get(0);
View Full Code Here
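
Processors like NewQueryGroupbySchemaProc above are not called directly; they are registered against an operator-name pattern and driven by Hive's rule-based graph walker, and the same pattern applies to every NodeProcessor shown on this page. Below is a minimal sketch of that wiring; it is not taken from any of the quoted snippets, and the class name GroupByProcessorDriver and rule name "R1" are made up for illustration.

import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;

import org.apache.hadoop.hive.ql.exec.GroupByOperator;
import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker;
import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher;
import org.apache.hadoop.hive.ql.lib.Dispatcher;
import org.apache.hadoop.hive.ql.lib.GraphWalker;
import org.apache.hadoop.hive.ql.lib.Node;
import org.apache.hadoop.hive.ql.lib.NodeProcessor;
import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
import org.apache.hadoop.hive.ql.lib.Rule;
import org.apache.hadoop.hive.ql.lib.RuleRegExp;
import org.apache.hadoop.hive.ql.parse.ParseContext;
import org.apache.hadoop.hive.ql.parse.SemanticException;

public class GroupByProcessorDriver {
  // hypothetical driver: registers gbyProc for every GroupByOperator ("GBY") node
  // and walks the operator DAG starting from the top (TableScan) operators
  public static void walk(ParseContext pctx, NodeProcessor gbyProc,
      NodeProcessorCtx procCtx) throws SemanticException {
    Map<Rule, NodeProcessor> rules = new LinkedHashMap<Rule, NodeProcessor>();
    rules.put(new RuleRegExp("R1", GroupByOperator.getOperatorName() + "%"), gbyProc);

    // nodes that match no rule fall through to the default processor (none here)
    Dispatcher disp = new DefaultRuleDispatcher(null, rules, procCtx);
    GraphWalker walker = new DefaultGraphWalker(disp);

    ArrayList<Node> topNodes = new ArrayList<Node>(pctx.getTopOps().values());
    walker.startWalking(topNodes, new HashMap<Node, Object>());
  }
}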


  public static class GroupByRule implements NodeProcessor {

    @Override
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {
      GroupByOperator gbyOp = (GroupByOperator)nd;
      List<String> gbyKeys = new ArrayList<String>();
      for (ExprNodeDesc exprDesc : gbyOp.getConf().getKeys()) {
        for (Entry<String, ExprNodeDesc> entry : gbyOp.getColumnExprMap().entrySet()) {
          if (exprDesc.isSame(entry.getValue())) {
            gbyKeys.add(entry.getKey());
          }
        }
      }

      List<List<String>> listBucketCols = new ArrayList<List<String>>();
      listBucketCols.add(gbyKeys);
      OpTraits opTraits = new OpTraits(listBucketCols, -1);
      gbyOp.setOpTraits(opTraits);
      return null;
    }
View Full Code Here
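
For illustration, here is a small self-contained sketch (with hypothetical column and table names) of the isSame() match that GroupByRule relies on: the key expression held in GroupByDesc and the value stored in the column-expression map under an internal name such as "_col0" are structurally equal, so the internal name is what ends up in gbyKeys and, ultimately, in the operator's bucket columns.

import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;

public class OpTraitsKeyMappingExample {
  public static void main(String[] args) {
    // the key expression as it would appear in GroupByDesc.getKeys()
    ExprNodeColumnDesc keyExpr =
        new ExprNodeColumnDesc(TypeInfoFactory.stringTypeInfo, "key", "t1", false);
    // the same expression as it would appear in the column-expression map,
    // keyed by the internal output name "_col0"
    ExprNodeColumnDesc mapped =
        new ExprNodeColumnDesc(TypeInfoFactory.stringTypeInfo, "key", "t1", false);

    // GroupByRule matches the two via isSame() and records "_col0" as a bucket column
    System.out.println(keyExpr.isSame(mapped)); // true
  }
}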

    @Override
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {

      GroupByOperator gop = (GroupByOperator) nd;
      Operator<? extends OperatorDesc> parent = gop.getParentOperators().get(0);
      Statistics parentStats = parent.getStatistics();

      // parent stats are not populated yet
      if (parentStats == null) {
        return null;
      }

      AnnotateStatsProcCtx aspCtx = (AnnotateStatsProcCtx) procCtx;
      HiveConf conf = aspCtx.getConf();
      long maxSplitSize = HiveConf.getLongVar(conf, HiveConf.ConfVars.MAPREDMAXSPLITSIZE);
      List<AggregationDesc> aggDesc = gop.getConf().getAggregators();
      Map<String, ExprNodeDesc> colExprMap = gop.getColumnExprMap();
      RowSchema rs = gop.getSchema();
      Statistics stats = null;
      List<ColStatistics> colStats = StatsUtils.getColStatisticsFromExprMap(conf, parentStats,
          colExprMap, rs);
      long cardinality;
      long parallelism = 1L;
      boolean mapSide = false;
      boolean mapSideHashAgg = false;
      long inputSize = 1L;
      boolean containsGroupingSet = gop.getConf().isGroupingSetsPresent();
      long sizeOfGroupingSet =
          containsGroupingSet ? gop.getConf().getListGroupingSets().size() : 1L;

      // There are different cases for Group By depending on map/reduce side, hash aggregation,
      // grouping sets and column stats. If we don't have column stats, we just assume hash
      // aggregation is disabled. Following are the possible cases and rule for cardinality
      // estimation

      // MAP SIDE:
      // Case 1: NO column stats, NO hash aggregation, NO grouping sets - numRows
      // Case 2: NO column stats, NO hash aggregation, grouping sets - numRows * sizeOfGroupingSet
      // Case 3: column stats, hash aggregation, NO grouping sets - Min(numRows / 2, ndvProduct * parallelism)
      // Case 4: column stats, hash aggregation, grouping sets - Min((numRows * sizeOfGroupingSet) / 2, ndvProduct * parallelism * sizeOfGroupingSet)
      // Case 5: column stats, NO hash aggregation, NO grouping sets - numRows
      // Case 6: column stats, NO hash aggregation, grouping sets - numRows * sizeOfGroupingSet

      // REDUCE SIDE:
      // Case 7: NO column stats - numRows / 2
      // Case 8: column stats, grouping sets - Min(numRows, ndvProduct * sizeOfGroupingSet)
      // Case 9: column stats, NO grouping sets - Min(numRows, ndvProduct)

      if (gop.getChildOperators().get(0) instanceof ReduceSinkOperator ||
          gop.getChildOperators().get(0) instanceof AppMasterEventOperator) {

        mapSide = true;

        // consider approximate map side parallelism to be table data size
        // divided by max split size
        TableScanOperator top = OperatorUtils.findSingleOperatorUpstream(gop,
            TableScanOperator.class);
        // if top is null then there are multiple parents (RS as well), hence
        // let's use parent statistics to get data size. Also maxSplitSize should
        // be updated to bytes per reducer (1GB default)
        if (top == null) {
          inputSize = parentStats.getDataSize();
          maxSplitSize = HiveConf.getLongVar(conf, HiveConf.ConfVars.BYTESPERREDUCER);
        } else {
          inputSize = top.getConf().getStatistics().getDataSize();
        }
        parallelism = (int) Math.ceil((double) inputSize / maxSplitSize);
      }

      if (isDebugEnabled) {
        LOG.debug("STATS-" + gop.toString() + ": inputSize: " + inputSize + " maxSplitSize: " +
            maxSplitSize + " parallelism: " + parallelism + " containsGroupingSet: " +
            containsGroupingSet + " sizeOfGroupingSet: " + sizeOfGroupingSet);
      }

      try {
        // satisfying precondition means column statistics is available
        if (satisfyPrecondition(parentStats)) {

          // check if map side aggregation is possible or not based on column stats
          mapSideHashAgg = checkMapSideAggregation(gop, colStats, conf);

          if (isDebugEnabled) {
            LOG.debug("STATS-" + gop.toString() + " mapSideHashAgg: " + mapSideHashAgg);
          }

          stats = parentStats.clone();
          stats.setColumnStats(colStats);
          long ndvProduct = 1;
          final long parentNumRows = stats.getNumRows();

          // compute product of distinct values of grouping columns
          for (ColStatistics cs : colStats) {
            if (cs != null) {
              long ndv = cs.getCountDistint();
              if (cs.getNumNulls() > 0) {
                ndv = StatsUtils.safeAdd(ndv, 1);
              }
              ndvProduct = StatsUtils.safeMult(ndvProduct, ndv);
            } else {
              if (parentStats.getColumnStatsState().equals(Statistics.State.COMPLETE)) {
                // the column must be an aggregate column inserted by GBY. We
                // don't have to account for this column when computing product
                // of NDVs
                continue;
              } else {
                // partial column statistics on grouping attributes case.
                // if column statistics on grouping attribute is missing, then
                // assume worst case.
                // GBY rule will emit half the number of rows if ndvProduct is 0
                ndvProduct = 0;
              }
              break;
            }
          }

          // if ndvProduct is 0 then column stats state must be partial and we are missing
          // column stats for a group by column
          if (ndvProduct == 0) {
            ndvProduct = parentNumRows / 2;

            if (isDebugEnabled) {
              LOG.debug("STATS-" + gop.toString() + ": ndvProduct became 0 as some column does not" +
                  " have stats. ndvProduct changed to: " + ndvProduct);
            }
          }

          if (mapSide) {
            // MAP SIDE

            if (mapSideHashAgg) {
              if (containsGroupingSet) {
                // Case 4: column stats, hash aggregation, grouping sets
                cardinality = Math.min(
                    (StatsUtils.safeMult(parentNumRows, sizeOfGroupingSet)) / 2,
                    StatsUtils.safeMult(StatsUtils.safeMult(ndvProduct, parallelism), sizeOfGroupingSet));

                if (isDebugEnabled) {
                  LOG.debug("[Case 4] STATS-" + gop.toString() + ": cardinality: " + cardinality);
                }
              } else {
                // Case 3: column stats, hash aggregation, NO grouping sets
                cardinality = Math.min(parentNumRows / 2, StatsUtils.safeMult(ndvProduct, parallelism));

                if (isDebugEnabled) {
                  LOG.debug("[Case 3] STATS-" + gop.toString() + ": cardinality: " + cardinality);
                }
              }
            } else {
              if (containsGroupingSet) {
                // Case 6: column stats, NO hash aggregation, grouping sets
                cardinality = StatsUtils.safeMult(parentNumRows, sizeOfGroupingSet);

                if (isDebugEnabled) {
                  LOG.debug("[Case 6] STATS-" + gop.toString() + ": cardinality: " + cardinality);
                }
              } else {
                // Case 5: column stats, NO hash aggregation, NO grouping sets
                cardinality = parentNumRows;

                if (isDebugEnabled) {
                  LOG.debug("[Case 5] STATS-" + gop.toString() + ": cardinality: " + cardinality);
                }
              }
            }
          } else {
            // REDUCE SIDE

            // in reduce side GBY, we don't know if the grouping set was present or not. so get it
            // from map side GBY
            GroupByOperator mGop = OperatorUtils.findSingleOperatorUpstream(parent, GroupByOperator.class);
            if (mGop != null) {
              containsGroupingSet = mGop.getConf().isGroupingSetsPresent();
              sizeOfGroupingSet = mGop.getConf().getListGroupingSets().size();
            }

            if (containsGroupingSet) {
              // Case 8: column stats, grouping sets
              cardinality = Math.min(parentNumRows, StatsUtils.safeMult(ndvProduct, sizeOfGroupingSet));
View Full Code Here
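
To make the case analysis in the comment block of the snippet above concrete, here is a short worked example with made-up numbers. None of these values come from a real table; they only illustrate how the Min(...) formulas behave.

public class GroupByCardinalityExample {
  public static void main(String[] args) {
    long numRows = 1000000L;       // rows reaching the group-by from its parent
    long ndvProduct = 200L;        // product of distinct-value counts of the keys
    long parallelism = 10L;        // roughly inputSize / maxSplitSize on the map side
    long sizeOfGroupingSet = 4L;   // e.g. GROUPING SETS producing 4 groupings

    // Case 3 (map side, column stats, hash aggregation, no grouping sets)
    long case3 = Math.min(numRows / 2, ndvProduct * parallelism);          // 2000

    // Case 4 (map side, column stats, hash aggregation, grouping sets)
    long case4 = Math.min((numRows * sizeOfGroupingSet) / 2,
                          ndvProduct * parallelism * sizeOfGroupingSet);   // 8000

    // Case 9 (reduce side, column stats, no grouping sets)
    long case9 = Math.min(numRows, ndvProduct);                            // 200

    System.out.println(case3 + ", " + case4 + ", " + case9);
  }
}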

    @Override
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {

      LineageCtx lctx = (LineageCtx)procCtx;
      GroupByOperator gop = (GroupByOperator)nd;
      ArrayList<ColumnInfo> col_infos = gop.getSchema().getSignature();
      Operator<? extends OperatorDesc> inpOp = getParent(stack);
      int cnt = 0;

      for(ExprNodeDesc expr : gop.getConf().getKeys()) {
        lctx.getIndex().putDependency(gop, col_infos.get(cnt++),
            ExprProcFactory.getExprDependency(lctx, inpOp, expr));
      }

      for(AggregationDesc agg : gop.getConf().getAggregators()) {
        // Concatenate the dependencies of all the parameters to
        // create the new dependency
        Dependency dep = new Dependency();
        DependencyType new_type = LineageInfo.DependencyType.EXPRESSION;
        // TODO: Get the actual string here.
View Full Code Here

   */
  public static class ColumnPrunerGroupByProc implements NodeProcessor {
    @Override
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx ctx,
        Object... nodeOutputs) throws SemanticException {
      GroupByOperator op = (GroupByOperator) nd;
      ColumnPrunerProcCtx cppCtx = (ColumnPrunerProcCtx) ctx;
      List<String> colLists = new ArrayList<String>();
      GroupByDesc conf = op.getConf();
      ArrayList<ExprNodeDesc> keys = conf.getKeys();
      for (ExprNodeDesc key : keys) {
        colLists = Utilities.mergeUniqElems(colLists, key.getCols());
      }

View Full Code Here
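
A minimal sketch (hypothetical column names) of the merge step above: Utilities.mergeUniqElems combines the columns collected so far with the columns referenced by the next expression, dropping duplicates, so the pruner ends up asking its parent only for the columns the group-by actually needs.

import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.hive.ql.exec.Utilities;

public class MergeUniqElemsExample {
  public static void main(String[] args) {
    // columns collected from the group-by keys so far
    List<String> needed = new ArrayList<String>();
    needed.add("key");

    // columns referenced by the next expression (note the overlap on "key")
    List<String> next = new ArrayList<String>();
    next.add("key");
    next.add("value");

    // duplicates are dropped, so the pruner requests only [key, value]
    List<String> merged = Utilities.mergeUniqElems(needed, next);
    System.out.println(merged);
  }
}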

    }

    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {
      FileSinkOperator FS = (FileSinkOperator) nd;
      GroupByOperator cGBY = (GroupByOperator) stack.get(stack.size() - 3);
      ReduceSinkOperator RS = (ReduceSinkOperator) stack.get(stack.size() - 4);
      if (RS.getConf().getNumReducers() != 1 || !RS.getConf().getKeyCols().isEmpty()) {
        return null;
      }
      GroupByOperator pGBY = (GroupByOperator) stack.get(stack.size() - 5);

      Path fileName = FS.getConf().getFinalDirName();
      TableDesc tsDesc = createIntermediateFS(pGBY, fileName);

      for (AggregationDesc aggregation : cGBY.getConf().getAggregators()) {
View Full Code Here

    @Override
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {
      // GBY,RS,GBY... (top to bottom)
      GroupByOperator groupByOp = (GroupByOperator) stack.get(stack.size() - 3);

      GroupByOptimizerContext ctx = (GroupByOptimizerContext) procCtx;

      if (!checkGroupByOperatorProcessed(ctx, groupByOp)) {
        processGroupBy(ctx, stack, groupByOp, 2);
View Full Code Here

    @Override
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {
      // GBY,RS,GBY,RS,GBY... (top to bottom)
      GroupByOperator groupByOp = (GroupByOperator) stack.get(stack.size() - 5);
      GroupByOptimizerContext ctx = (GroupByOptimizerContext) procCtx;

      if (!checkGroupByOperatorProcessed(ctx, groupByOp)) {
        processGroupBy(ctx, stack, groupByOp, 4);
      }
View Full Code Here
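
The stack offsets in the two rules above simply index back from the node being processed along the matched operator path. A tiny sketch (plain strings stand in for the real operators) of why stack.size() - 3 and stack.size() - 5 land on the map-side GroupByOperator:

import java.util.Stack;

public class GroupByOptimizerStackExample {
  public static void main(String[] args) {
    // Rule 1 pattern: GBY - RS - GBY (top to bottom); nd is the bottom GBY
    Stack<String> s1 = new Stack<String>();
    s1.push("GBY");   // map-side group by, at stack.size() - 3
    s1.push("RS");
    s1.push("GBY");   // nd, the reduce-side group by on top of the stack
    System.out.println(s1.get(s1.size() - 3)); // GBY (map side)

    // Rule 2 pattern: GBY - RS - GBY - RS - GBY; nd is again the bottom GBY
    Stack<String> s2 = new Stack<String>();
    s2.push("GBY");   // map-side group by, at stack.size() - 5
    s2.push("RS");
    s2.push("GBY");
    s2.push("RS");
    s2.push("GBY");   // nd
    System.out.println(s2.get(s2.size() - 5)); // GBY (map side)
  }
}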

    GroupByDesc groupBy =
        new GroupByDesc(GroupByDesc.Mode.HASH, outputNames, groupByExprs,
            new ArrayList<AggregationDesc>(), false, groupByMemoryUsage, memoryThreshold,
            null, false, 0, true);

    GroupByOperator groupByOp =
        (GroupByOperator) OperatorFactory.getAndMakeChild(groupBy, selectOp);

    Map<String, ExprNodeDesc> colMap = new HashMap<String, ExprNodeDesc>();
    colMap.put(outputNames.get(0), groupByExpr);
    groupByOp.setColumnExprMap(colMap);

    // finally add the event broadcast operator
    DynamicPruningEventDesc eventDesc = new DynamicPruningEventDesc();
    eventDesc.setTableScan(ts);
    eventDesc.setTable(PlanUtils.getReduceValueTableDesc(PlanUtils
View Full Code Here
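
The groupByMemoryUsage and memoryThreshold values passed to GroupByDesc above are configuration-driven. A brief sketch, assuming a HiveConf instance is available in the surrounding method, of how such values are typically read:

import org.apache.hadoop.hive.conf.HiveConf;

public class GroupByMemorySettings {
  // reads the map-side aggregation memory settings that GroupByDesc expects
  public static float[] read(HiveConf conf) {
    // hive.map.aggr.hash.percentmemory
    float groupByMemoryUsage =
        HiveConf.getFloatVar(conf, HiveConf.ConfVars.HIVEMAPAGGRHASHMEMORY);
    // hive.map.aggr.hash.force.flush.memory.threshold
    float memoryThreshold =
        HiveConf.getFloatVar(conf, HiveConf.ConfVars.HIVEMAPAGGRMEMORYTHRESHOLD);
    return new float[] { groupByMemoryUsage, memoryThreshold };
  }
}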

   */
  public static class ConstantPropagateGroupByProc implements NodeProcessor {
    @Override
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx ctx, Object... nodeOutputs)
        throws SemanticException {
      GroupByOperator op = (GroupByOperator) nd;
      ConstantPropagateProcCtx cppCtx = (ConstantPropagateProcCtx) ctx;
      Map<ColumnInfo, ExprNodeDesc> colToConstants = cppCtx.getPropagatedConstants(op);
      cppCtx.getOpToConstantExprs().put(op, colToConstants);

      if (colToConstants.isEmpty()) {
        return null;
      }

      GroupByDesc conf = op.getConf();
      ArrayList<ExprNodeDesc> keys = conf.getKeys();
      for (int i = 0; i < keys.size(); i++) {
        ExprNodeDesc key = keys.get(i);
        ExprNodeDesc newkey = foldExpr(key, colToConstants, cppCtx, op, 0, false);
        keys.set(i, newkey);
View Full Code Here
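
An illustrative sketch (hypothetical column and value) of what the folding step above produces: when constant propagation knows a grouping column holds a single constant, the ExprNodeColumnDesc key is replaced by an ExprNodeConstantDesc, which keys.set(i, newkey) then installs in the GroupByDesc.

import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;

public class ConstantFoldedKeyExample {
  public static void main(String[] args) {
    // a group-by key on the (hypothetical) partition column "ds"
    ExprNodeDesc key =
        new ExprNodeColumnDesc(TypeInfoFactory.stringTypeInfo, "ds", "t1", true);

    // if constant propagation knows ds = '2018-01-01', the folded key is a literal
    ExprNodeDesc folded = new ExprNodeConstantDesc("2018-01-01");

    System.out.println(key.getExprString() + " -> " + folded.getExprString());
  }
}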


