Package org.apache.hadoop.hive.ql.metadata

Source Code of org.apache.hadoop.hive.ql.metadata.HiveMetaStoreChecker

package org.apache.hadoop.hive.ql.metadata;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.metastore.MetaStoreUtils;
import org.apache.hadoop.hive.metastore.Warehouse;
import org.apache.hadoop.hive.metastore.api.MetaException;
import org.apache.hadoop.hive.metastore.api.NoSuchObjectException;
import org.apache.hadoop.hive.ql.metadata.CheckResult.PartitionResult;
import org.apache.hadoop.hive.conf.HiveConf;

import org.apache.thrift.TException;

/**
* Verify that the information in the metastore matches what
* is on the filesystem. Return a CheckResult object
* containing lists of missing and any unexpected tables and partitions.
*/
public class HiveMetaStoreChecker {

  public static final Log LOG = LogFactory.getLog(HiveMetaStoreChecker.class);

  private Hive hive;
  private HiveConf conf;

  public HiveMetaStoreChecker(Hive hive) {
    super();
    this.hive = hive;
    this.conf = hive.getConf();
  }

  /**
   * Check the metastore for inconsistencies, data missing in either the
   * metastore or on the dfs.
   *
   * @param dbName
   *          name of the database, if not specified the default will be used.
   * @param tableName
   *          Table we want to run the check for. If null we'll check all the
   *          tables in the database.
   * @param partitions List of partition name value pairs,
   * if null or empty check all partitions
   * @param result Fill this with the results of the check
   * @throws HiveException Failed to get required information
   * from the metastore.
   * @throws IOException Most likely filesystem related
   */
  public void checkMetastore(String dbName, String tableName,
      List<Map<String, String>> partitions, CheckResult result)
    throws HiveException, IOException {

    if (dbName == null || "".equalsIgnoreCase(dbName)) {
      dbName = MetaStoreUtils.DEFAULT_DATABASE_NAME;
    }

    try {
      if (tableName == null || "".equals(tableName)) {
        // no table specified, check all tables and all partitions.
        List<String> tables = hive.getTablesForDb(dbName, ".*");
        for (String currentTableName : tables) {
          checkTable(dbName, currentTableName, null, result);
        }
 
        findUnknownTables(dbName, tables, result);
      } else if (partitions == null || partitions.isEmpty()) {
        // only one table, let's check all partitions
        checkTable(dbName, tableName, null, result);
      } else {
        // check the specified partitions
        checkTable(dbName, tableName, partitions, result);
      }
    } catch (MetaException e) {
      throw new HiveException(e);
    } catch (TException e) {
      throw new HiveException(e);
    }
  }

  /**
   * Check for table directories that aren't in the metastore.
   * @param dbName Name of the database
   * @param tables List of table names
   * @param result Add any found tables to this
   * @throws HiveException Failed to get required information
   * from the metastore.
   * @throws IOException Most likely filesystem related
   * @throws MetaException Failed to get required information
   * from the metastore.
   * @throws NoSuchObjectException Failed to get required information
   * from the metastore.
   * @throws TException Thrift communication error.
   */
  void findUnknownTables(String dbName, List<String> tables,
      CheckResult result) throws IOException, MetaException, TException,
      HiveException {

    Set<Path> dbPaths = new HashSet<Path>();
    Set<String> tableNames = new HashSet<String>(tables);

    for (String tableName : tables) {
      Table table = hive.getTable(dbName, tableName);
      // hack, instead figure out a way to get the db paths
      String isExternal = table.getParameters().get("EXTERNAL");
      if (isExternal == null || !"TRUE".equalsIgnoreCase(isExternal)) {
        dbPaths.add(table.getPath().getParent());
      }
    }

    for (Path dbPath : dbPaths) {
      FileSystem fs = dbPath.getFileSystem(conf);
      FileStatus[] statuses = fs.listStatus(dbPath);
      for (FileStatus status : statuses) {
       
        if (status.isDir()
            && !tableNames.contains(status.getPath().getName())) {
         
          result.getTablesNotInMs().add(status.getPath().getName());
        }
      }
    }
  }

  /**
   * Check the metastore for inconsistencies, data missing in either the
   * metastore or on the dfs.
   *
   * @param dbName Name of the database
   * @param tableName Name of the table
   * @param partitions Partitions to check, if null or empty
   * get all the partitions.
   * @param result Result object
   * @throws HiveException Failed to get required information
   * from the metastore.
   * @throws IOException Most likely filesystem related
   * @throws MetaException Failed to get required information
   * from the metastore.
   */
  void checkTable(String dbName, String tableName,
      List<Map<String, String>> partitions, CheckResult result)
      throws MetaException, IOException, HiveException {

    Table table = null;

    try {
      table = hive.getTable(dbName, tableName);
    } catch (HiveException e) {
      result.getTablesNotInMs().add(tableName);
      return;
    }

    List<Partition> parts = new ArrayList<Partition>();
    boolean findUnknownPartitions = true;
   
    if (table.isPartitioned()) {
      if (partitions == null || partitions.isEmpty()) {
        // no partitions specified, let's get all
        parts = hive.getPartitions(table);
      } else {
        //we're interested in specific partitions,
        //don't check for any others
        findUnknownPartitions = false;
        for (Map<String, String> map : partitions) {
          Partition part = hive.getPartition(table, map, false);
          if(part == null) {
            PartitionResult pr = new PartitionResult();
            pr.setTableName(tableName);
            pr.setPartitionName(Warehouse.makePartName(map));
            result.getPartitionsNotInMs().add(pr);
          } else {
            parts.add(part);
          }
        }
      }
    }

    checkTable(table, parts, findUnknownPartitions, result);
  }

  /**
   * Check the metastore for inconsistencies, data missing in either the
   * metastore or on the dfs.
   *
   * @param table Table to check
   * @param parts Partitions to check
   * @param result Result object
   * @param findUnknownPartitions Should we try to find unknown partitions?
   * @throws IOException Could not get information from filesystem
   * @throws HiveException Could not create Partition object
   */
  void checkTable(Table table, List<Partition> parts,
      boolean findUnknownPartitions, CheckResult result)
    throws IOException, HiveException {

    Path tablePath = table.getPath();
    FileSystem fs = tablePath.getFileSystem(conf);
    if (!fs.exists(tablePath)) {
      result.getTablesNotOnFs().add(table.getName());
      return;
    }

    Set<Path> partPaths = new HashSet<Path>();

    // check that the partition folders exist on disk
    for (Partition partition : parts) {
      if(partition == null) {
        //most likely the user specified an invalid partition
        continue;
      }
      Path partPath = partition.getPartitionPath();
      fs = partPath.getFileSystem(conf);
      if (!fs.exists(partPath)) {
        PartitionResult pr = new PartitionResult();
        pr.setPartitionName(partition.getName());
        pr.setTableName(partition.getTable().getName());
        result.getPartitionsNotOnFs().add(pr);
      }

      for (int i = 0; i < partition.getSpec().size(); i++) {
        partPaths.add(partPath.makeQualified(fs));
        partPath = partPath.getParent();
      }
    }

    if(findUnknownPartitions) {
      findUnknownPartitions(table, partPaths, result);
    }
  }

  /**
   * Find partitions on the fs that are
   * unknown to the metastore
   * @param table Table where the partitions would be located
   * @param partPaths Paths of the partitions the ms knows about
   * @param result Result object
   * @throws IOException Thrown if we fail at fetching listings from
   * the fs.
   */
  void findUnknownPartitions(Table table, Set<Path> partPaths,
      CheckResult result) throws IOException {
   
    Path tablePath = table.getPath();
    // now check the table folder and see if we find anything
    // that isn't in the metastore
    Set<Path> allPartDirs = new HashSet<Path>();
    getAllLeafDirs(tablePath, allPartDirs);
    // don't want the table dir
    allPartDirs.remove(tablePath);

    // remove the partition paths we know about
    allPartDirs.removeAll(partPaths);
   
    // we should now only have the unexpected folders left
    for (Path partPath : allPartDirs) {
      FileSystem fs = partPath.getFileSystem(conf);
      String partitionName = getPartitionName(fs.makeQualified(tablePath),
          partPath);
     
      if (partitionName != null) {
        PartitionResult pr = new PartitionResult();
        pr.setPartitionName(partitionName);
        pr.setTableName(table.getName());
       
        result.getPartitionsNotInMs().add(pr);
      }
    }
  }

  /**
   * Get the partition name from the path.
   *
   * @param tablePath Path of the table.
   * @param partitionPath Path of the partition.
   * @return Partition name, for example partitiondate=2008-01-01
   */
  private String getPartitionName(Path tablePath, Path partitionPath) {
    String result = null;
    Path currPath = partitionPath;
    while (currPath != null && !tablePath.equals(currPath)) {
      if(result == null) {
        result = currPath.getName();
      } else {
        result = currPath.getName() + Path.SEPARATOR + result;
      }
     
      currPath = currPath.getParent();
    }
    return result;
  }

  /**
   * Recursive method to get the leaf directories of a base path.
   * Example:
   * base/dir1/dir2
   * base/dir3
   *
   * This will return dir2 and dir3 but not dir1.
   *
   * @param basePath Start directory
   * @param allDirs This set will contain the leaf paths at the end.
   * @throws IOException Thrown if we can't get lists from the fs.
   */

  private void getAllLeafDirs(Path basePath, Set<Path> allDirs)
      throws IOException {
    getAllLeafDirs(basePath, allDirs, basePath.getFileSystem(conf));
  }

  private void getAllLeafDirs(Path basePath, Set<Path> allDirs, FileSystem fs)
      throws IOException {
   
    FileStatus[] statuses = fs.listStatus(basePath);

    if (statuses.length == 0) {
      allDirs.add(basePath);
    }

    for (FileStatus status : statuses) {
      if (status.isDir()) {
        getAllLeafDirs(status.getPath(), allDirs, fs);
      }
    }
  }

}
TOP

Related Classes of org.apache.hadoop.hive.ql.metadata.HiveMetaStoreChecker

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.