/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.exec;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.Serializable;
import java.net.URI;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.Warehouse;
import org.apache.hadoop.hive.ql.DriverContext;
import org.apache.hadoop.hive.ql.ErrorMsg;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.metadata.Partition;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.ql.optimizer.listbucketingpruner.ListBucketingPrunerUtils;
import org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.tableSpec;
import org.apache.hadoop.hive.ql.plan.DynamicPartitionCtx;
import org.apache.hadoop.hive.ql.plan.LoadTableDesc;
import org.apache.hadoop.hive.ql.plan.StatsWork;
import org.apache.hadoop.hive.ql.plan.api.StageType;
import org.apache.hadoop.hive.ql.stats.StatsAggregator;
import org.apache.hadoop.hive.ql.stats.StatsFactory;
import org.apache.hadoop.hive.ql.stats.StatsPublisher;
import org.apache.hadoop.hive.ql.stats.StatsSetupConst;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.util.StringUtils;
/**
 * StatsTask implementation. Aggregates the statistics collected during query execution
 * and persists them to the metastore at the table and partition level.
 **/
public class StatsTask extends Task<StatsWork> implements Serializable {
private static final long serialVersionUID = 1L;
private static transient final Log LOG = LogFactory.getLog(StatsTask.class);
private Table table;
private List<LinkedHashMap<String, String>> dpPartSpecs;
private static final List<String> supportedStats = new ArrayList<String>();
private static final List<String> collectableStats = new ArrayList<String>();
private static final Map<String, String> nameMapping = new HashMap<String, String>();
static {
// supported statistics
supportedStats.add(StatsSetupConst.NUM_FILES);
supportedStats.add(StatsSetupConst.ROW_COUNT);
supportedStats.add(StatsSetupConst.TOTAL_SIZE);
supportedStats.add(StatsSetupConst.RAW_DATA_SIZE);
// statistics that need to be collected throughout the execution
collectableStats.add(StatsSetupConst.ROW_COUNT);
collectableStats.add(StatsSetupConst.RAW_DATA_SIZE);
nameMapping.put(StatsSetupConst.NUM_FILES, "num_files");
nameMapping.put(StatsSetupConst.ROW_COUNT, "num_rows");
nameMapping.put(StatsSetupConst.TOTAL_SIZE, "total_size");
nameMapping.put(StatsSetupConst.RAW_DATA_SIZE, "raw_data_size");
}
public StatsTask() {
super();
dpPartSpecs = null;
}
/**
*
* Partition Level Statistics.
*
*/
class PartitionStatistics {
Map<String, LongWritable> stats;
public PartitionStatistics() {
stats = new HashMap<String, LongWritable>();
for (String statType : supportedStats) {
stats.put(statType, new LongWritable(0L));
}
}
public PartitionStatistics(Map<String, Long> st) {
stats = new HashMap<String, LongWritable>();
for (String statType : st.keySet()) {
Long stValue = st.get(statType) == null ? 0L : st.get(statType);
stats.put(statType, new LongWritable(stValue));
}
}
public long getStat(String statType) {
return stats.get(statType) == null ? 0L : stats.get(statType).get();
}
public void setStat(String statType, long value) {
stats.put(statType, new LongWritable(value));
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
for (String statType : supportedStats) {
sb.append(nameMapping.get(statType)).append(": ").append(stats.get(statType)).append(", ");
}
sb.delete(sb.length() - 2, sb.length());
return sb.toString();
}
}
/**
* Table Level Statistics.
*/
class TableStatistics extends PartitionStatistics {
int numPartitions; // number of partitions
public TableStatistics() {
super();
numPartitions = 0;
}
public void setNumPartitions(int np) {
numPartitions = np;
}
public int getNumPartitions() {
return numPartitions;
}
/**
* Incrementally update the table statistics according to the old and new
* partition level statistics.
*
* @param oldStats
* The old statistics of a partition.
* @param newStats
* The new statistics of a partition.
*/
public void updateStats(PartitionStatistics oldStats, PartitionStatistics newStats) {
deletePartitionStats(oldStats);
addPartitionStats(newStats);
}
/**
* Update the table level statistics when a new partition is added.
*
* @param newStats
* the new partition statistics.
*/
public void addPartitionStats(PartitionStatistics newStats) {
for (String statType : supportedStats) {
LongWritable value = stats.get(statType);
if (value == null) {
stats.put(statType, new LongWritable(newStats.getStat(statType)));
} else {
value.set(value.get() + newStats.getStat(statType));
}
}
this.numPartitions++;
}
/**
* Update the table level statistics when an old partition is dropped.
*
* @param oldStats
* the old partition statistics.
*/
public void deletePartitionStats(PartitionStatistics oldStats) {
for (String statType : supportedStats) {
LongWritable value = stats.get(statType);
value.set(value.get() - oldStats.getStat(statType));
}
this.numPartitions--;
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
sb.append("num_partitions: ").append(numPartitions).append(", ");
sb.append(super.toString());
return sb.toString();
}
}
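/**
 * Receives the dynamic partition specs generated by the feeding MoveTask, so that
 * getPartitionsList() can later resolve the corresponding partitions.
 */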
@Override
protected void receiveFeed(FeedType feedType, Object feedValue) {
// this method should be called by MoveTask when there are dynamic partitions generated
if (feedType == FeedType.DYNAMIC_PARTITIONS) {
assert feedValue instanceof List<?>;
dpPartSpecs = (List<LinkedHashMap<String, String>>) feedValue;
}
}
@Override
public int execute(DriverContext driverContext) {
LOG.info("Executing stats task");
// Make sure that it is either an ANALYZE, INSERT OVERWRITE or CTAS command
short workComponentsPresent = 0;
if (work.getLoadTableDesc() != null) {
workComponentsPresent++;
}
if (work.getTableSpecs() != null) {
workComponentsPresent++;
}
if (work.getLoadFileDesc() != null) {
workComponentsPresent++;
}
assert (workComponentsPresent == 1);
String tableName = "";
try {
if (work.getLoadTableDesc() != null) {
tableName = work.getLoadTableDesc().getTable().getTableName();
} else if (work.getTableSpecs() != null) {
tableName = work.getTableSpecs().tableName;
} else {
tableName = work.getLoadFileDesc().getDestinationCreateTable();
}
table = db.getTable(tableName);
} catch (HiveException e) {
LOG.error("Cannot get table " + tableName, e);
console.printError("Cannot get table " + tableName, e.toString());
}
return aggregateStats();
}
@Override
public StageType getType() {
return StageType.STATS;
}
@Override
public String getName() {
return "STATS";
}
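/**
 * Aggregates the statistics published during query execution and writes them back to the
 * metastore, per partition for partitioned tables and at the table level otherwise.
 *
 * @return 0 on success; 1 if stats collection failed and stats are configured to be reliable.
 */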
private int aggregateStats() {
StatsAggregator statsAggregator = null;
int ret = 0;
try {
// Stats setup:
Warehouse wh = new Warehouse(conf);
if (!this.getWork().getNoStatsAggregator()) {
String statsImplementationClass = HiveConf.getVar(conf, HiveConf.ConfVars.HIVESTATSDBCLASS);
StatsFactory.setImplementation(statsImplementationClass, conf);
if (work.isNoScanAnalyzeCommand()) {
// Initialize the stats publishing table for a noscan analyze, which has only a stats task;
// for other queries the MR tasks initialize it in ExecDriver.java.
StatsPublisher statsPublisher = StatsFactory.getStatsPublisher();
if (!statsPublisher.init(conf)) { // creates the stats table if it does not exist
if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_STATS_RELIABLE)) {
throw
new HiveException(ErrorMsg.STATSPUBLISHER_INITIALIZATION_ERROR.getErrorCodedMsg());
}
}
}
// manufacture a StatsAggregator
statsAggregator = StatsFactory.getStatsAggregator();
if (!statsAggregator.connect(conf)) {
throw new HiveException("StatsAggregator connect failed " + statsImplementationClass);
}
}
TableStatistics tblStats = new TableStatistics();
org.apache.hadoop.hive.metastore.api.Table tTable = table.getTTable();
Map<String, String> parameters = tTable.getParameters();
boolean tableStatsExist = this.existStats(parameters);
for (String statType : supportedStats) {
if (parameters.containsKey(statType)) {
tblStats.setStat(statType, Long.parseLong(parameters.get(statType)));
}
}
if (parameters.containsKey(StatsSetupConst.NUM_PARTITIONS)) {
tblStats.setNumPartitions(Integer.parseInt(parameters.get(StatsSetupConst.NUM_PARTITIONS)));
}
List<Partition> partitions = getPartitionsList();
boolean atomic = HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_STATS_ATOMIC);
int maxPrefixLength = HiveConf.getIntVar(conf,
HiveConf.ConfVars.HIVE_STATS_KEY_PREFIX_MAX_LENGTH);
if (partitions == null) {
// non-partitioned tables:
if (!tableStatsExist && atomic) {
return 0;
}
long[] summary = summary(conf, table);
tblStats.setStat(StatsSetupConst.NUM_FILES, summary[0]);
tblStats.setStat(StatsSetupConst.TOTAL_SIZE, summary[1]);
// In the case of a non-partitioned table, the key for the stats temporary store is "rootDir"
if (statsAggregator != null) {
String aggKey = Utilities.getHashedStatsPrefix(work.getAggKey(), maxPrefixLength);
updateStats(collectableStats, tblStats, statsAggregator, parameters,
aggKey, atomic);
statsAggregator.cleanUp(aggKey);
} else if (work.isClearAggregatorStats()) {
// The collectable stats for the aggregator need to be cleared.
// For example, if a file is being loaded, the old row count is no longer valid.
for (String statType : collectableStats) {
if (parameters.containsKey(statType)) {
tblStats.setStat(statType, 0L);
}
}
}
} else {
// Partitioned table:
// Need to get the old stats of the partition
// and update the table stats based on the old and new stats.
for (Partition partn : partitions) {
//
// get the old partition stats
//
org.apache.hadoop.hive.metastore.api.Partition tPart = partn.getTPartition();
parameters = tPart.getParameters();
boolean hasStats = this.existStats(parameters);
if (!hasStats && atomic) {
continue;
}
Map<String, Long> currentValues = new HashMap<String, Long>();
for (String statType : supportedStats) {
Long val = parameters.containsKey(statType) ? Long.parseLong(parameters.get(statType))
: 0L;
currentValues.put(statType, val);
}
//
// get the new partition stats
//
PartitionStatistics newPartStats = new PartitionStatistics();
// In the case of a partition, the key for the stats temporary store is
// "rootDir/[dynamic_partition_specs/]%"
String partitionID = Utilities.getHashedStatsPrefix(
work.getAggKey() + Warehouse.makePartPath(partn.getSpec()), maxPrefixLength);
LOG.info("Stats aggregator : " + partitionID);
if (statsAggregator != null) {
updateStats(collectableStats, newPartStats, statsAggregator,
parameters, partitionID, atomic);
statsAggregator.cleanUp(partitionID);
} else {
for (String statType : collectableStats) {
// The collectable stats for the aggregator need to be cleared.
// For example, if a file is being loaded, the old row count is no longer valid.
if (work.isClearAggregatorStats()) {
if (parameters.containsKey(statType)) {
newPartStats.setStat(statType, 0L);
}
} else {
newPartStats.setStat(statType, currentValues.get(statType));
}
}
}
long[] summary = summary(conf, partn);
newPartStats.setStat(StatsSetupConst.NUM_FILES, summary[0]);
newPartStats.setStat(StatsSetupConst.TOTAL_SIZE, summary[1]);
if (hasStats) {
PartitionStatistics oldPartStats = new PartitionStatistics(currentValues);
tblStats.updateStats(oldPartStats, newPartStats);
} else {
tblStats.addPartitionStats(newPartStats);
}
//
// update the metastore
//
for (String statType : supportedStats) {
long statValue = newPartStats.getStat(statType);
if (statValue >= 0) {
parameters.put(statType, Long.toString(newPartStats.getStat(statType)));
}
}
tPart.setParameters(parameters);
String tableFullName = table.getDbName() + "." + table.getTableName();
db.alterPartition(tableFullName, new Partition(table, tPart));
console.printInfo("Partition " + tableFullName + partn.getSpec() +
" stats: [" + newPartStats.toString() + ']');
}
}
//
// write table stats to metastore
//
parameters = tTable.getParameters();
for (String statType : supportedStats) {
parameters.put(statType, Long.toString(tblStats.getStat(statType)));
}
parameters.put(StatsSetupConst.NUM_PARTITIONS, Integer.toString(tblStats.getNumPartitions()));
tTable.setParameters(parameters);
String tableFullName = table.getDbName() + "." + table.getTableName();
db.alterTable(tableFullName, new Table(tTable));
console.printInfo("Table " + tableFullName + " stats: [" + tblStats.toString() + ']');
} catch (Exception e) {
console.printInfo("[Warning] could not update stats.",
"Failed with exception " + e.getMessage() + "\n"
+ StringUtils.stringifyException(e));
// Fail the query if the stats are supposed to be reliable
if (work.isStatsReliable()) {
ret = 1;
}
} finally {
if (statsAggregator != null) {
statsAggregator.closeConnection();
}
}
// The return value of 0 indicates success,
// anything else indicates failure
return ret;
}
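/**
 * Returns {numFiles, totalSize} for a partition, descending into list bucketing
 * (skewed column) subdirectories when the partition has them.
 */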
private long[] summary(HiveConf conf, Partition partn) throws IOException {
Path path = partn.getPartitionPath();
FileSystem fs = path.getFileSystem(conf);
List<String> skewedColNames = partn.getSkewedColNames();
if (skewedColNames == null || skewedColNames.isEmpty()) {
return summary(fs, path);
}
List<List<String>> skewColValues = table.getSkewedColValues();
if (skewColValues == null || skewColValues.isEmpty()) {
return summary(fs, toDefaultLBPath(path));
}
return summary(fs, path, skewedColNames);
}
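/**
 * Returns {numFiles, totalSize} for a non-partitioned table, descending into list bucketing
 * (skewed column) subdirectories when the table has them.
 */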
private long[] summary(HiveConf conf, Table table) throws IOException {
Path path = table.getPath();
FileSystem fs = path.getFileSystem(conf);
List<String> skewedColNames = table.getSkewedColNames();
if (skewedColNames == null || skewedColNames.isEmpty()) {
return summary(fs, path);
}
List<List<String>> skewColValues = table.getSkewedColValues();
if (skewColValues == null || skewColValues.isEmpty()) {
return summary(fs, toDefaultLBPath(path));
}
return summary(fs, path, table.getSkewedColNames());
}
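/**
 * Returns the default list bucketing directory under the given path.
 */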
private Path toDefaultLBPath(Path path) {
return new Path(path, ListBucketingPrunerUtils.HIVE_LIST_BUCKETING_DEFAULT_DIR_NAME);
}
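/**
 * Returns {numFiles, totalSize} counting only the files directly under the given path;
 * a missing path yields {0, 0}.
 */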
private long[] summary(FileSystem fs, Path path) throws IOException {
try {
FileStatus status = fs.getFileStatus(path);
if (!status.isDir()) {
return new long[] {1, status.getLen()};
}
} catch (FileNotFoundException e) {
return new long[] {0, 0};
}
FileStatus[] children = fs.listStatus(path); // can be null
if (children == null) {
return new long[] {0, 0};
}
long numFiles = 0L;
long tableSize = 0L;
for (FileStatus child : children) {
if (!child.isDir()) {
tableSize += child.getLen();
numFiles++;
}
}
return new long[] {numFiles, tableSize};
}
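/**
 * Builds a regular expression that matches relative paths of the form
 * "skewCol1=value/skewCol2=value/.../fileName" under a list bucketing layout.
 */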
private Pattern toPattern(List<String> skewCols) {
StringBuilder builder = new StringBuilder();
for (String skewCol : skewCols) {
if (builder.length() > 0) {
builder.append(Path.SEPARATOR_CHAR);
}
builder.append(skewCol).append('=');
builder.append("[^").append(Path.SEPARATOR_CHAR).append("]*");
}
builder.append(Path.SEPARATOR_CHAR);
builder.append("[^").append(Path.SEPARATOR_CHAR).append("]*$");
return Pattern.compile(builder.toString());
}
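/**
 * Returns {numFiles, totalSize} for a list bucketing layout: only files under the default
 * directory or in directories matching the skewed-column pattern are counted.
 */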
private long[] summary(FileSystem fs, Path path, List<String> skewCols) throws IOException {
long numFiles = 0L;
long tableSize = 0L;
Pattern pattern = toPattern(skewCols);
for (FileStatus status : Utilities.getFileStatusRecurse(path, skewCols.size() + 1, fs)) {
if (status.isDir()) {
continue;
}
String relative = toRelativePath(path, status.getPath());
if (relative == null) {
continue;
}
if (relative.startsWith(ListBucketingPrunerUtils.HIVE_LIST_BUCKETING_DEFAULT_DIR_NAME) ||
pattern.matcher(relative).matches()) {
tableSize += status.getLen();
numFiles++;
}
}
return new long[] {numFiles, tableSize};
}
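/**
 * Returns the path of path2 relative to path1, or null when path2 cannot be relativized
 * against path1 (URI.relativize returns the original URI in that case).
 */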
private String toRelativePath(Path path1, Path path2) {
URI relative = path1.toUri().relativize(path2.toUri());
if (relative == path2.toUri()) {
return null;
}
return relative.getPath();
}
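/**
 * Returns true if any of the supported statistics are already present in the given
 * table or partition parameters.
 */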
private boolean existStats(Map<String, String> parameters) {
return parameters.containsKey(StatsSetupConst.ROW_COUNT)
|| parameters.containsKey(StatsSetupConst.NUM_FILES)
|| parameters.containsKey(StatsSetupConst.TOTAL_SIZE)
|| parameters.containsKey(StatsSetupConst.RAW_DATA_SIZE)
|| parameters.containsKey(StatsSetupConst.NUM_PARTITIONS);
}
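/**
 * Reads each requested statistic from the stats aggregator under the given key and records
 * it in stats; for a non-replacing load, the aggregated value is added to the existing one.
 * Throws HiveException when a statistic is unavailable and atomic stats are required.
 */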
private void updateStats(List<String> statsList, PartitionStatistics stats,
StatsAggregator statsAggregator, Map<String, String> parameters,
String aggKey, boolean atomic) throws HiveException {
String value;
Long longValue;
for (String statType : statsList) {
value = statsAggregator.aggregateStats(aggKey, statType);
if (value != null) {
longValue = Long.parseLong(value);
if (work.getLoadTableDesc() != null &&
!work.getLoadTableDesc().getReplace()) {
String originalValue = parameters.get(statType);
if (originalValue != null) {
longValue += Long.parseLong(originalValue);
}
}
stats.setStat(statType, longValue);
} else {
if (atomic) {
throw new HiveException("StatsAggregator failed to get statistics.");
}
}
}
}
/**
* Get the list of partitions whose statistics need to be updated.
* TODO: we should reuse the Partitions generated at compile time
* since getting the list of partitions is quite expensive.
*
* @return a list of partitions whose statistics need to be updated, or null for
*         CTAS and non-partitioned tables.
* @throws HiveException
*/
private List<Partition> getPartitionsList() throws HiveException {
if (work.getLoadFileDesc() != null) {
return null; //we are in CTAS, so we know there are no partitions
}
List<Partition> list = new ArrayList<Partition>();
if (work.getTableSpecs() != null) {
// ANALYZE command
tableSpec tblSpec = work.getTableSpecs();
table = tblSpec.tableHandle;
if (!table.isPartitioned()) {
return null;
}
// get all partitions that match the partition spec
List<Partition> partitions = tblSpec.partitions;
if (partitions != null) {
for (Partition partn : partitions) {
list.add(partn);
}
}
} else if (work.getLoadTableDesc() != null) {
// INSERT OVERWRITE command
LoadTableDesc tbd = work.getLoadTableDesc();
table = db.getTable(tbd.getTable().getTableName());
if (!table.isPartitioned()) {
return null;
}
DynamicPartitionCtx dpCtx = tbd.getDPCtx();
if (dpCtx != null && dpCtx.getNumDPCols() > 0) { // dynamic partitions
// If no dynamic partitions are generated, dpPartSpecs may not be initialized
if (dpPartSpecs != null) {
// load the list of DP partitions and return the list of partition specs
for (LinkedHashMap<String, String> partSpec : dpPartSpecs) {
Partition partn = db.getPartition(table, partSpec, false);
list.add(partn);
}
}
} else { // static partition
Partition partn = db.getPartition(table, tbd.getPartitionSpec(), false);
list.add(partn);
}
}
return list;
}
/**
* This method is static as it is called from the shutdown hook in ExecDriver.
*/
public static void cleanUp(String jobID, Configuration config) {
StatsAggregator statsAggregator;
String statsImplementationClass = HiveConf.getVar(config, HiveConf.ConfVars.HIVESTATSDBCLASS);
StatsFactory.setImplementation(statsImplementationClass, config);
statsAggregator = StatsFactory.getStatsAggregator();
if (statsAggregator.connect(config)) {
statsAggregator.cleanUp(jobID + Path.SEPARATOR); // append the path separator to avoid one ID
// being a prefix of another ID
statsAggregator.closeConnection();
}
}
}