Package com.cloudera.cdk.data.hcatalog

Source Code of com.cloudera.cdk.data.hcatalog.HiveUtils

/*
* Copyright 2013 Cloudera.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*      http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.cloudera.cdk.data.hcatalog;

import com.cloudera.cdk.data.DatasetDescriptor;
import com.cloudera.cdk.data.FieldPartitioner;
import com.cloudera.cdk.data.Format;
import com.cloudera.cdk.data.Formats;
import com.cloudera.cdk.data.MetadataProviderException;
import com.cloudera.cdk.data.PartitionStrategy;
import com.cloudera.cdk.data.UnknownFormatException;
import com.cloudera.cdk.data.impl.Accessor;
import com.google.common.base.Joiner;
import com.google.common.base.Preconditions;
import com.google.common.base.Splitter;
import com.google.common.collect.BiMap;
import com.google.common.collect.HashBiMap;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.List;
import java.util.Map;
import org.apache.avro.Schema;
import org.apache.avro.Schema.Field;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.metastore.TableType;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;

class HiveUtils {
  static final String DEFAULT_DB = "default";
  static final String HDFS_SCHEME = "hdfs";

  private static final String CUSTOM_PROPERTIES_PROPERTY_NAME = "cdk.custom.property.names";
  private static final String PARTITION_EXPRESSION_PROPERTY_NAME = "cdk.partition.expression";
  private static final String AVRO_SCHEMA_URL_PROPERTY_NAME = "avro.schema.url";
  private static final String AVRO_SCHEMA_LITERAL_PROPERTY_NAME = "avro.schema.literal";

  private static final Splitter NAME_SPLITTER = Splitter.on(',');
  private static final Joiner NAME_JOINER = Joiner.on(',');

  private static final BiMap<Format, String> FORMAT_TO_SERDE = HashBiMap.create(2);
  private static final BiMap<String, Format> SERDE_TO_FORMAT = FORMAT_TO_SERDE.inverse();
  private static final Map<Format, String> FORMAT_TO_INPUT_FORMAT = Maps.newHashMap();
  private static final Map<Format, String> FORMAT_TO_OUTPUT_FORMAT = Maps.newHashMap();
  static {
    FORMAT_TO_SERDE.put(Formats.AVRO, "org.apache.hadoop.hive.serde2.avro.AvroSerDe");
    FORMAT_TO_SERDE.put(Formats.PARQUET, "parquet.hive.serde.ParquetHiveSerDe");

    FORMAT_TO_INPUT_FORMAT.put(Formats.AVRO, "org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat");
    FORMAT_TO_INPUT_FORMAT.put(Formats.PARQUET, "parquet.hive.DeprecatedParquetInputFormat");

    FORMAT_TO_OUTPUT_FORMAT.put(Formats.AVRO, "org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat");
    FORMAT_TO_OUTPUT_FORMAT.put(Formats.PARQUET, "parquet.hive.DeprecatedParquetOutputFormat");
  }

  static DatasetDescriptor descriptorForTable(Configuration conf, Table table) {
    final DatasetDescriptor.Builder builder = new DatasetDescriptor.Builder();

    final String serializationLib = table.getSerializationLib();
    if (SERDE_TO_FORMAT.containsKey(serializationLib)) {
      builder.format(SERDE_TO_FORMAT.get(serializationLib));
    } else {
      // TODO: should this use an "unknown" format? others fail in open()
      throw new UnknownFormatException(
          "Unknown format for serde:" + serializationLib);
    }

    final Path dataLocation = new Path(table.getDataLocation());
    final FileSystem fs = fsForPath(conf, dataLocation);

    builder.location(fs.makeQualified(dataLocation));

    // custom properties
    String namesProperty = table.getProperty(CUSTOM_PROPERTIES_PROPERTY_NAME);
    if (namesProperty != null) {
      for (String property : NAME_SPLITTER.split(namesProperty)) {
        builder.property(property, table.getProperty(property));
      }
    }

    if (table.getProperty(PARTITION_EXPRESSION_PROPERTY_NAME) != null) {
      builder.partitionStrategy(
          Accessor.getDefault().fromExpression(
              table.getProperty(PARTITION_EXPRESSION_PROPERTY_NAME)));
    }

    String schemaUrlString = table.getProperty(AVRO_SCHEMA_URL_PROPERTY_NAME);
    if (schemaUrlString != null) {
      try {
        builder.schemaUri(new URI(schemaUrlString));
      } catch (IOException e) {
        throw new MetadataProviderException("Could not read schema", e);
      } catch (URISyntaxException e) {
        // this library sets the URI, so it should always be valid
        throw new MetadataProviderException("[BUG] Invalid schema URI", e);
      }
    }

    String schemaLiteral = table.getProperty(AVRO_SCHEMA_LITERAL_PROPERTY_NAME);
    if (schemaLiteral != null) {
      builder.schemaLiteral(schemaLiteral);
    }

    try {
      return builder.build();
    catch (IllegalStateException ex) {
      throw new MetadataProviderException("Cannot find schema: missing metadata");
    }
  }

  static Table tableForDescriptor(
      String name, DatasetDescriptor descriptor, boolean external) {
    final Table table = new Table(DEFAULT_DB, name);

    if (external) {
      // you'd think this would do it...
      table.setTableType(TableType.EXTERNAL_TABLE);
      // but it doesn't work without some additional magic:
      table.getParameters().put("EXTERNAL", "TRUE");
      table.setDataLocation(descriptor.getLocation());
    } else {
      table.setTableType(TableType.MANAGED_TABLE);
    }

    // copy custom properties to the table
    if (!descriptor.listProperties().isEmpty()) {
      for (String property : descriptor.listProperties()) {
        // no need to check the reserved list, those are not set on descriptors
        table.setProperty(property, descriptor.getProperty(property));
      }
      // set which properties are custom and should be set on descriptors
      table.setProperty(CUSTOM_PROPERTIES_PROPERTY_NAME,
          NAME_JOINER.join(descriptor.listProperties()));
    }

    // translate from Format to SerDe
    final Format format = descriptor.getFormat();
    if (FORMAT_TO_SERDE.containsKey(format)) {
      table.setSerializationLib(FORMAT_TO_SERDE.get(format));
      try {
        table.setInputFormatClass(FORMAT_TO_INPUT_FORMAT.get(format));
        table.setOutputFormatClass(FORMAT_TO_OUTPUT_FORMAT.get(format));
      } catch (HiveException ex) {
        throw new MetadataProviderException(
            "Failed to set input/output formats for format:" +
            format.getName(), ex);
      }
    } else {
      throw new UnknownFormatException(
          "No known serde for format:" + format.getName());
    }
    try {
    } catch (Exception e) {
      throw new MetadataProviderException("Error configuring Hive Avro table, " +
          "table creation failed", e);
    }

    // copy schema info
    boolean useLiteral;
    final URL schemaURL = descriptor.getSchemaUrl();
    try {
      useLiteral = (schemaURL == null) ||
          !HDFS_SCHEME.equals(schemaURL.toURI().getScheme());
    } catch (URISyntaxException ex) {
      useLiteral = true;
    }

    if (useLiteral) {
      table.setProperty(
          AVRO_SCHEMA_LITERAL_PROPERTY_NAME,
          descriptor.getSchema().toString());
    } else {
      table.setProperty(
          AVRO_SCHEMA_URL_PROPERTY_NAME,
          schemaURL.toExternalForm());
    }

    // convert the schema to Hive columns
    table.setFields(convertSchema(descriptor.getSchema()));

    // copy partitioning info
    if (descriptor.isPartitioned()) {
      PartitionStrategy ps = descriptor.getPartitionStrategy();
      table.setProperty(PARTITION_EXPRESSION_PROPERTY_NAME,
          Accessor.getDefault().toExpression(ps));
      List<FieldSchema> partCols = Lists.newArrayList();
      for (FieldPartitioner fp : ps.getFieldPartitioners()) {
        partCols.add(new FieldSchema(fp.getName(), getHiveType(fp.getType()),
            "Partition column derived from '" + fp.getSourceName() + "' column, " +
                "generated by CDK."));
      }
      table.setPartCols(partCols);
    }

    return table;
  }


  public static void updateTableSchema(Table table, DatasetDescriptor descriptor) {
    if (table.getProperty(AVRO_SCHEMA_LITERAL_PROPERTY_NAME) != null) {
      table.setProperty(
          AVRO_SCHEMA_LITERAL_PROPERTY_NAME,
          descriptor.getSchema().toString());
    } else if (table.getProperty(AVRO_SCHEMA_URL_PROPERTY_NAME) != null) {
      if (descriptor.getSchemaUrl() == null) {
        throw new MetadataProviderException("Cannot update " +
            AVRO_SCHEMA_URL_PROPERTY_NAME + " since descriptor schema URL is not set.");
      }
      table.setProperty(
          AVRO_SCHEMA_URL_PROPERTY_NAME,
          descriptor.getSchemaUrl().toExternalForm());
    } else {
      throw new MetadataProviderException("Cannot update Avro schema since neither " +
          AVRO_SCHEMA_LITERAL_PROPERTY_NAME + " nor " + AVRO_SCHEMA_URL_PROPERTY_NAME +
          " is set.");
    }
  }

  static FileSystem fsForPath(Configuration conf, Path path) {
    try {
      return path.getFileSystem(conf);
    } catch (IOException ex) {
      throw new MetadataProviderException(
          "Cannot access FileSystem for uri:" + path, ex);
    }
  }

  static FileSystem fsForPath(Configuration conf, String fsUri, Path path) {
    try {
      if (fsUri == null) {
        return path.getFileSystem(conf);
      } else {
        return FileSystem.get(new URI(fsUri), conf);
      }
    } catch (IOException ex) {
      throw new MetadataProviderException(
          "Cannot access FileSystem for path:" + path, ex);
    } catch (URISyntaxException ex) {
      throw new MetadataProviderException(
          "Cannot access FileSystem for uri:" + fsUri, ex);
    }
  }

  /**
   * Returns the correct dataset path for the given name and root directory.
   *
   * @param root A Path
   * @param name A String dataset name
   * @return the correct dataset Path
   */
  static Path pathForDataset(Path root, String name) {
    Preconditions.checkArgument(name != null, "Dataset name cannot be null");

    // Why replace '.' here? Is this a namespacing hack?
    return new Path(root, name.replace('.', Path.SEPARATOR_CHAR));
  }

  static String getHiveType(Class<?> type) {
    String typeName = PrimitiveObjectInspectorUtils.getTypeNameFromPrimitiveJava(type);
    if (typeName == null) {
      throw new MetadataProviderException("Unsupported FieldPartitioner type: " + type);
    }
    return typeName;
  }

  static List<FieldSchema> convertSchema(Schema avroSchema) {
    List<FieldSchema> columns = Lists.newArrayList();
    if (Schema.Type.RECORD.equals(avroSchema.getType())) {
      for (Field field : avroSchema.getFields()) {
        columns.add(new FieldSchema(
            field.name(), convert(field.schema()).getTypeName(), field.doc()));
      }
    } else {
      columns.add(new FieldSchema(
          "column", convert(avroSchema).getTypeName(), avroSchema.getDoc()));
    }
    return columns;
  }

  static TypeInfo convert(Schema avroSchema) {
    // TODO: throw an error on recursive types
    switch (avroSchema.getType()) {
      case RECORD:
        return convertRecord(avroSchema);
      case UNION:
        return convertUnion(avroSchema);
      case ARRAY:
        return convertArray(avroSchema);
      case MAP:
        return convertMap(avroSchema);
      default:
        return convertPrimitive(avroSchema);
    }
  }

  static TypeInfo convertRecord(Schema avroRecord) {
    Preconditions.checkArgument(
        Schema.Type.RECORD.equals(avroRecord.getType()),
        "Avro Schema must be a Record");

    final List<Field> fields = avroRecord.getFields();
    final List<String> names = Lists.newArrayListWithExpectedSize(fields.size());
    final List<TypeInfo> types = Lists.newArrayListWithExpectedSize(fields.size());
    for (Field field : fields) {
      names.add(field.name());
      types.add(convert(field.schema()));
    }

    return TypeInfoFactory.getStructTypeInfo(names, types);
  }

  static TypeInfo convertUnion(Schema avroUnion) {
    Preconditions.checkArgument(
        Schema.Type.UNION.equals(avroUnion.getType()),
        "Avro Schema must be a Union");

    final List<TypeInfo> nonNullTypes = Lists.newArrayList();
    for (Schema schema : avroUnion.getTypes()) {
      if (!Schema.Type.NULL.equals(schema.getType())) {
        nonNullTypes.add(convert(schema));
      }
    }

    // check for a single, possible nullable field
    if (nonNullTypes.size() == 1) {
      // TODO: where does nullability get passed?
      // nullable if avroUnion.getTypes().size() == 2
      return nonNullTypes.get(0);
    }

    return TypeInfoFactory.getUnionTypeInfo(nonNullTypes);
  }

  static TypeInfo convertArray(Schema avroArray) {
    Preconditions.checkArgument(
        Schema.Type.ARRAY.equals(avroArray.getType()),
        "Avro Schema must be an Array");

    return TypeInfoFactory.getListTypeInfo(convert(avroArray.getElementType()));
  }

  static TypeInfo convertMap(Schema avroMap) {
    Preconditions.checkArgument(
        Schema.Type.MAP.equals(avroMap.getType()),
        "Avro Schema must be a Map");

    return TypeInfoFactory.getMapTypeInfo(
        TypeInfoFactory.stringTypeInfo, convert(avroMap.getValueType()));
  }

  static final ImmutableMap<Schema.Type, TypeInfo> TYPE_TO_TYPEINFO =
      ImmutableMap.<Schema.Type, TypeInfo>builder()
          .put(Schema.Type.BOOLEAN, TypeInfoFactory.booleanTypeInfo)
          .put(Schema.Type.INT, TypeInfoFactory.intTypeInfo)
          .put(Schema.Type.LONG, TypeInfoFactory.longTypeInfo)
          .put(Schema.Type.FLOAT, TypeInfoFactory.floatTypeInfo)
          .put(Schema.Type.DOUBLE, TypeInfoFactory.doubleTypeInfo)
          .put(Schema.Type.STRING, TypeInfoFactory.stringTypeInfo)
          .put(Schema.Type.ENUM, TypeInfoFactory.stringTypeInfo)
          .put(Schema.Type.BYTES, TypeInfoFactory.binaryTypeInfo)
          .put(Schema.Type.FIXED, TypeInfoFactory.binaryTypeInfo)
          .build();

  static TypeInfo convertPrimitive(Schema avroPrimitive) {
    TypeInfo type = TYPE_TO_TYPEINFO.get(avroPrimitive.getType());

    Preconditions.checkArgument(type != null,
        "Avro Schema must be a primitive type, not " + avroPrimitive.getType());

    return type;
  }

}
TOP

Related Classes of com.cloudera.cdk.data.hcatalog.HiveUtils

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.