Source code of org.apache.hadoop.hive.ql.optimizer.lineage.OpProcFactory

/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.hadoop.hive.ql.optimizer.lineage;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.Stack;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.Table;
import org.apache.hadoop.hive.ql.exec.ColumnInfo;
import org.apache.hadoop.hive.ql.exec.ForwardOperator;
import org.apache.hadoop.hive.ql.exec.GroupByOperator;
import org.apache.hadoop.hive.ql.exec.JoinOperator;
import org.apache.hadoop.hive.ql.exec.LateralViewJoinOperator;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
import org.apache.hadoop.hive.ql.exec.RowSchema;
import org.apache.hadoop.hive.ql.exec.SelectOperator;
import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.hooks.LineageInfo;
import org.apache.hadoop.hive.ql.hooks.LineageInfo.BaseColumnInfo;
import org.apache.hadoop.hive.ql.hooks.LineageInfo.Dependency;
import org.apache.hadoop.hive.ql.hooks.LineageInfo.DependencyType;
import org.apache.hadoop.hive.ql.hooks.LineageInfo.TableAliasInfo;
import org.apache.hadoop.hive.ql.lib.Node;
import org.apache.hadoop.hive.ql.lib.NodeProcessor;
import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
import org.apache.hadoop.hive.ql.lib.Utils;
import org.apache.hadoop.hive.ql.metadata.VirtualColumn;
import org.apache.hadoop.hive.ql.parse.ParseContext;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.plan.AggregationDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.JoinDesc;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;
import org.apache.hadoop.hive.ql.plan.ReduceSinkDesc;

/**
* Operator factory for the rule processors for lineage.
*/
public class OpProcFactory {

  /**
   * Returns the parent operator in the walk path to the current operator.
   *
   * @param stack The stack encoding the path.
   *
   * @return Operator The parent operator in the current path.
   */
  protected static Operator<? extends OperatorDesc> getParent(Stack<Node> stack) {
    return (Operator<? extends OperatorDesc>)Utils.getNthAncestor(stack, 1);
  }

  /**
   * Processor for Script and UDTF Operators.
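   * Since the transform is opaque, a single dependency (typed SCRIPT) is
   * built by concatenating the dependencies of all the input columns, and
   * that one dependency is assigned to every output column. For example, in
   * SELECT TRANSFORM(a, b) USING 'script' AS (x, y), both x and y depend on
   * the base columns of a and b.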
   */
  public static class TransformLineage extends DefaultLineage implements NodeProcessor {

    @Override
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {

      // LineageCtx
      LineageCtx lCtx = (LineageCtx) procCtx;

      // The operators
      Operator<? extends OperatorDesc> op = (Operator<? extends OperatorDesc>)nd;
      Operator<? extends OperatorDesc> inpOp = getParent(stack);

      // Create a single dependency by concatenating the dependencies of all
      // the input columns
      Dependency dep = new Dependency();
      DependencyType new_type = LineageInfo.DependencyType.SCRIPT;
      // TODO: Fix this to a non null value.
      dep.setExpr(null);

      LinkedHashSet<BaseColumnInfo> col_set = new LinkedHashSet<BaseColumnInfo>();
      for(ColumnInfo ci : inpOp.getSchema().getSignature()) {
        Dependency d = lCtx.getIndex().getDependency(inpOp, ci);
        if (d != null) {
          new_type = LineageCtx.getNewDependencyType(d.getType(), new_type);
          col_set.addAll(d.getBaseCols());
        }
      }

      dep.setType(new_type);
      dep.setBaseCols(new ArrayList<BaseColumnInfo>(col_set));

      // This dependency is then set for all the colinfos of the script operator
      for(ColumnInfo ci : op.getSchema().getSignature()) {
        lCtx.getIndex().putDependency(op, ci, dep);
      }

      return null;
    }

  }

  /**
   * Processor for TableScan Operator. This actually creates the base column mappings.
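   * Every column in the scan's row schema, including virtual columns, gets
   * a SIMPLE dependency on the matching field of the underlying metastore
   * table.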
   */
  public static class TableScanLineage extends DefaultLineage implements NodeProcessor {

    @Override
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {

      // LineageCtx
      LineageCtx lCtx = (LineageCtx) procCtx;
      ParseContext pctx = lCtx.getParseCtx();

      // Table scan operator.
      TableScanOperator top = (TableScanOperator)nd;
      org.apache.hadoop.hive.ql.metadata.Table t = pctx.getTopToTable().get(top);
      Table tab = t.getTTable();

      // Generate the mappings
      RowSchema rs = top.getSchema();
      List<FieldSchema> cols = t.getAllCols();
      Map<String, FieldSchema> fieldSchemaMap = new HashMap<String, FieldSchema>();
      for(FieldSchema col : cols) {
        fieldSchemaMap.put(col.getName(), col);
      }

      Iterator<VirtualColumn> vcs = VirtualColumn.getRegistry(pctx.getConf()).iterator();
      while (vcs.hasNext()) {
        VirtualColumn vc = vcs.next();
        fieldSchemaMap.put(vc.getName(), new FieldSchema(vc.getName(),
            vc.getTypeInfo().getTypeName(), ""));
      }

      TableAliasInfo tai = new TableAliasInfo();
      tai.setAlias(top.getConf().getAlias());
      tai.setTable(tab);
      for(ColumnInfo ci : rs.getSignature()) {
        // Create a dependency
        Dependency dep = new Dependency();
        BaseColumnInfo bci = new BaseColumnInfo();
        bci.setTabAlias(tai);
        bci.setColumn(fieldSchemaMap.get(ci.getInternalName()));

        // Populate the dependency
        dep.setType(LineageInfo.DependencyType.SIMPLE);
        // TODO: Find out how to get the expression here.
        dep.setExpr(null);
        dep.setBaseCols(new ArrayList<BaseColumnInfo>());
        dep.getBaseCols().add(bci);

        // Put the dependency in the map
        lCtx.getIndex().putDependency(top, ci, dep);
      }

      return null;
    }

  }

  /**
   * Processor for Join Operator.
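   * The processor fires once per parent ReduceSink. The tag carried by that
   * ReduceSink identifies which of the join's output columns originate from
   * it, and only those columns have their dependencies merged in.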
   */
  public static class JoinLineage extends DefaultLineage implements NodeProcessor {
    @Override
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {

      // Assert that there is at least one item in the stack. This should
      // never be called for leaves.
      assert(!stack.isEmpty());

      // LineageCtx
      LineageCtx lCtx = (LineageCtx) procCtx;
      JoinOperator op = (JoinOperator)nd;
      JoinDesc jd = op.getConf();

      // The input operator to the join is always a reduce sink operator
      ReduceSinkOperator inpOp = (ReduceSinkOperator)getParent(stack);
      ReduceSinkDesc rd = inpOp.getConf();
      int tag = rd.getTag();

      // Iterate over the outputs of the join operator and merge the
      // dependencies of the columns corresponding to the tag.
      int cnt = 0;
      List<ExprNodeDesc> exprs = jd.getExprs().get((byte)tag);
      for(ColumnInfo ci : op.getSchema().getSignature()) {
        if (jd.getReversedExprs().get(ci.getInternalName()) != tag) {
          continue;
        }

        // Otherwise look up the expression corresponding to this ci
        ExprNodeDesc expr = exprs.get(cnt++);
        lCtx.getIndex().mergeDependency(op, ci,
            ExprProcFactory.getExprDependency(lCtx, inpOp, expr));
      }

      return null;
    }

  }

  /**
   * Processor for Lateral View Join Operator.
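   * The operator has two parents: the select path carrying the original
   * rows and the UDTF path carrying the generated rows. Each visit merges
   * the parent's dependencies into the corresponding slice of the output
   * schema.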
   */
  public static class LateralViewJoinLineage extends DefaultLineage implements NodeProcessor {
    @Override
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {

      // Assert that there is at least one item in the stack. This should
      // never be called for leaves.
      assert(!stack.isEmpty());

      // LineageCtx
      LineageCtx lCtx = (LineageCtx) procCtx;
      LateralViewJoinOperator op = (LateralViewJoinOperator)nd;
      boolean isUdtfPath = true;
      Operator<? extends OperatorDesc> inpOp = getParent(stack);
      ArrayList<ColumnInfo> cols = inpOp.getSchema().getSignature();

      if (inpOp instanceof SelectOperator) {
        isUdtfPath = false;
      }

      // Dirty hack!!
      // For the select path the columns are the ones at the end of the
      // current operator's schema, and for the UDTF path the columns are
      // at the beginning of the operator schema.
      ArrayList<ColumnInfo> out_cols = op.getSchema().getSignature();
      int out_cols_size = out_cols.size();
      int cols_size = cols.size();
      if (isUdtfPath) {
        int cnt = 0;
        while (cnt < cols_size) {
          lCtx.getIndex().mergeDependency(op, out_cols.get(cnt),
              lCtx.getIndex().getDependency(inpOp, cols.get(cnt)));
          cnt++;
        }
      }
      else {
        int cnt = cols_size - 1;
        while (cnt >= 0) {
          lCtx.getIndex().mergeDependency(op, out_cols.get(out_cols_size - cols_size + cnt),
              lCtx.getIndex().getDependency(inpOp, cols.get(cnt)));
          cnt--;
        }
      }
      return null;
    }

  }

  /**
   * Processor for Select operator.
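   * Each output column's dependency is computed by walking its select
   * expression through ExprProcFactory. A select star with no computation
   * is instead handled by the pass-through default processor.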
   */
  public static class SelectLineage extends DefaultLineage implements NodeProcessor {
    @Override
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {

      LineageCtx lctx = (LineageCtx)procCtx;
      SelectOperator sop = (SelectOperator)nd;

      // If this is a selStarNoCompute, this select operator is treated
      // like a default operator, so just call the super class's process
      // method.
      if (sop.getConf().isSelStarNoCompute()) {
        return super.process(nd, stack, procCtx, nodeOutputs);
      }

      // Otherwise we treat this as a normal select operator and look at
      // the expressions.

      ArrayList<ColumnInfo> col_infos = sop.getSchema().getSignature();
      int cnt = 0;
      for(ExprNodeDesc expr : sop.getConf().getColList()) {
        lctx.getIndex().putDependency(sop, col_infos.get(cnt++),
            ExprProcFactory.getExprDependency(lctx, getParent(stack), expr));
      }

      return null;
    }

  }

  /**
   * Processor for GroupBy operator.
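   * The output schema lists the grouping keys first and the aggregates
   * after them, so dependencies are filled in that order. Aggregates over
   * constants only (e.g. count(1)) are made dependent on the tables feeding
   * the input operator.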
   */
  public static class GroupByLineage extends DefaultLineage implements NodeProcessor {
    @Override
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {

      LineageCtx lctx = (LineageCtx)procCtx;
      GroupByOperator gop = (GroupByOperator)nd;
      ArrayList<ColumnInfo> col_infos = gop.getSchema().getSignature();
      Operator<? extends OperatorDesc> inpOp = getParent(stack);
      int cnt = 0;

      for(ExprNodeDesc expr : gop.getConf().getKeys()) {
        lctx.getIndex().putDependency(gop, col_infos.get(cnt++),
            ExprProcFactory.getExprDependency(lctx, inpOp, expr));
      }

      for(AggregationDesc agg : gop.getConf().getAggregators()) {
        // Concatenate the dependencies of all the parameters to
        // create the new dependency
        Dependency dep = new Dependency();
        DependencyType new_type = LineageInfo.DependencyType.EXPRESSION;
        // TODO: Get the actual string here.
        dep.setExpr(null);
        LinkedHashSet<BaseColumnInfo> bci_set = new LinkedHashSet<BaseColumnInfo>();
        for(ExprNodeDesc expr : agg.getParameters()) {
          Dependency expr_dep = ExprProcFactory.getExprDependency(lctx, inpOp, expr);
          if (expr_dep != null) {
            new_type = LineageCtx.getNewDependencyType(expr_dep.getType(), new_type);
            bci_set.addAll(expr_dep.getBaseCols());
          }
        }

        // If the bci_set is empty, this means that the inputs to this
        // aggregate function were all constants (e.g. count(1)). In this case
        // the aggregate function is just dependent on all the tables that are in
        // the dependency list of the input operator.
        if (bci_set.isEmpty()) {
          Set<TableAliasInfo> tai_set = new LinkedHashSet<TableAliasInfo>();
          if (inpOp.getSchema() != null && inpOp.getSchema().getSignature() != null ) {
            for(ColumnInfo ci : inpOp.getSchema().getSignature()) {
              Dependency inp_dep = lctx.getIndex().getDependency(inpOp, ci);
              // The dependency can be null as some of the input cis may not have
              // been set in case of joins.
              if (inp_dep != null) {
                for(BaseColumnInfo bci : inp_dep.getBaseCols()) {
                  new_type = LineageCtx.getNewDependencyType(inp_dep.getType(), new_type);
                  tai_set.add(bci.getTabAlias());
                }
              }
            }
          }

          // Create the BaseColumnInfos and set them in the bci_set
          for(TableAliasInfo tai : tai_set) {
            BaseColumnInfo bci = new BaseColumnInfo();
            bci.setTabAlias(tai);
            // This is set to null to reflect that the dependency is not on any
            // particular column of the table.
            bci.setColumn(null);
            bci_set.add(bci);
          }
        }

        dep.setBaseCols(new ArrayList<BaseColumnInfo>(bci_set));
        dep.setType(new_type);
        lctx.getIndex().putDependency(gop, col_infos.get(cnt++), dep);
      }

      return null;
    }

  }

  /**
   * Union processor.
   * In this case we call mergeDependency as opposed to putDependency
   * in order to account for visits from different parents.
   */
  public static class UnionLineage extends DefaultLineage implements NodeProcessor {

    protected static final Log LOG = LogFactory.getLog(OpProcFactory.class.getName());

    @SuppressWarnings("unchecked")
    @Override
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {
      // Assert that there is at least one item in the stack. This should
      // never be called for leaves.
      assert(!stack.isEmpty());

      // LineageCtx
      LineageCtx lCtx = (LineageCtx) procCtx;
      Operator<? extends OperatorDesc> op = (Operator<? extends OperatorDesc>)nd;

      // Get the row schema of the current operator and the column
      // signature of its parent (input) operator.
      Operator<? extends OperatorDesc> inpOp = getParent(stack);
      RowSchema rs = op.getSchema();
      ArrayList<ColumnInfo> inp_cols = inpOp.getSchema().getSignature();
      int cnt = 0;
      for(ColumnInfo ci : rs.getSignature()) {
        Dependency inp_dep = lCtx.getIndex().getDependency(inpOp, inp_cols.get(cnt++));
        if (inp_dep != null) {
          lCtx.getIndex().mergeDependency(op, ci, inp_dep);
        }
      }
      return null;
    }
  }

  /**
   * ReduceSink processor.
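   * When the sink ultimately feeds a group by operator, the key columns
   * receive dependencies first; the value columns always follow.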
   */
  public static class ReduceSinkLineage implements NodeProcessor {

    protected static final Log LOG = LogFactory.getLog(OpProcFactory.class.getName());

    @SuppressWarnings("unchecked")
    @Override
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {
      // Assert that there is at least one item in the stack. This should
      // never be called for leaves.
      assert(!stack.isEmpty());

      // LineageCtx
      LineageCtx lCtx = (LineageCtx) procCtx;
      ReduceSinkOperator rop = (ReduceSinkOperator)nd;

      ArrayList<ColumnInfo> col_infos = rop.getSchema().getSignature();
      Operator<? extends OperatorDesc> inpOp = getParent(stack);
      int cnt = 0;

      // The keys are included only when the reduce sink feeds into a group
      // by operator, possibly through a chain of forward operators
      Operator<? extends OperatorDesc> op = rop.getChildOperators().get(0);
      while (op instanceof ForwardOperator) {
        op = op.getChildOperators().get(0);
      }

      if (op instanceof GroupByOperator) {
        for(ExprNodeDesc expr : rop.getConf().getKeyCols()) {
          lCtx.getIndex().putDependency(rop, col_infos.get(cnt++),
              ExprProcFactory.getExprDependency(lCtx, inpOp, expr));
        }
      }

      for(ExprNodeDesc expr : rop.getConf().getValueCols()) {
        lCtx.getIndex().putDependency(rop, col_infos.get(cnt++),
            ExprProcFactory.getExprDependency(lCtx, inpOp, expr));
      }

      return null;
    }
  }

  /**
   * Default processor. This basically passes the input dependencies as such
   * to the output dependencies.
   */
  public static class DefaultLineage implements NodeProcessor {

    protected static final Log LOG = LogFactory.getLog(OpProcFactory.class.getName());

    @SuppressWarnings("unchecked")
    @Override
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {
      // Assert that there is at least one item in the stack. This should
      // never be called for leaves.
      assert(!stack.isEmpty());

      // LineageCtx
      LineageCtx lCtx = (LineageCtx) procCtx;
      Operator<? extends OperatorDesc> op = (Operator<? extends OperatorDesc>)nd;

      // Get the row schema of the current operator and the column
      // signature of its parent (input) operator.
      Operator<? extends OperatorDesc> inpOp = getParent(stack);
      RowSchema rs = op.getSchema();
      ArrayList<ColumnInfo> inp_cols = inpOp.getSchema().getSignature();
      int cnt = 0;
      for(ColumnInfo ci : rs.getSignature()) {
        lCtx.getIndex().putDependency(op, ci,
            lCtx.getIndex().getDependency(inpOp, inp_cols.get(cnt++)));
      }
      return null;
    }
  }

  public static NodeProcessor getJoinProc() {
    return new JoinLineage();
  }

  public static NodeProcessor getLateralViewJoinProc() {
    return new LateralViewJoinLineage();
  }

  public static NodeProcessor getTSProc() {
    return new TableScanLineage();
  }

  public static NodeProcessor getTransformProc() {
    return new TransformLineage();
  }

  public static NodeProcessor getSelProc() {
    return new SelectLineage();
  }

  public static NodeProcessor getGroupByProc() {
    return new GroupByLineage();
  }

  public static NodeProcessor getUnionProc() {
    return new UnionLineage();
  }

  public static NodeProcessor getReduceSinkProc() {
    return new ReduceSinkLineage();
  }

  public static NodeProcessor getDefaultProc() {
    return new DefaultLineage();
  }

}
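
// ---------------------------------------------------------------------------
// Usage sketch (not part of the original file): how these factory methods
// are typically wired into a rule-based graph walker, in the style of the
// Generator class in this package. This is illustrative only: it assumes a
// ParseContext pctx and a LineageCtx lCtx are in scope, and the exact rule
// strings ("TS%", "SCR%", ...) depend on the Hive version.
//
//   Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
//   opRules.put(new RuleRegExp("R1", "TS%"), OpProcFactory.getTSProc());
//   opRules.put(new RuleRegExp("R2", "SCR%"), OpProcFactory.getTransformProc());
//   opRules.put(new RuleRegExp("R3", "UDTF%"), OpProcFactory.getTransformProc());
//   opRules.put(new RuleRegExp("R4", "SEL%"), OpProcFactory.getSelProc());
//   opRules.put(new RuleRegExp("R5", "GBY%"), OpProcFactory.getGroupByProc());
//   opRules.put(new RuleRegExp("R6", "UNION%"), OpProcFactory.getUnionProc());
//   opRules.put(new RuleRegExp("R7", "JOIN%"), OpProcFactory.getJoinProc());
//   opRules.put(new RuleRegExp("R8", "RS%"), OpProcFactory.getReduceSinkProc());
//   opRules.put(new RuleRegExp("R9", "LVJ%"), OpProcFactory.getLateralViewJoinProc());
//
//   // Operators that match no rule fall through to the pass-through
//   // default processor.
//   Dispatcher disp = new DefaultRuleDispatcher(OpProcFactory.getDefaultProc(),
//       opRules, lCtx);
//   GraphWalker ogw = new PreOrderWalker(disp);
//
//   ArrayList<Node> topNodes = new ArrayList<Node>(pctx.getTopOps().values());
//   ogw.startWalking(topNodes, null);
// ---------------------------------------------------------------------------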