Source Code of org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe

/**
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.io.parquet.serde;

import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Properties;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.AbstractSerDe;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.SerDeSpec;
import org.apache.hadoop.hive.serde2.SerDeStats;
import org.apache.hadoop.hive.serde2.io.ByteWritable;
import org.apache.hadoop.hive.serde2.io.DoubleWritable;
import org.apache.hadoop.hive.serde2.io.ShortWritable;
import org.apache.hadoop.hive.serde2.io.TimestampWritable;
import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.BooleanObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.ByteObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.FloatObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveCharObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveVarcharObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.ShortObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.TimestampObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import org.apache.hadoop.io.ArrayWritable;
import org.apache.hadoop.io.BooleanWritable;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import parquet.hadoop.ParquetOutputFormat;
import parquet.hadoop.ParquetWriter;
import parquet.io.api.Binary;

/**
 * A SerDe for Hive tables stored as Parquet (based on the deprecated
 * mapred package).
 */
@SerDeSpec(schemaProps = {serdeConstants.LIST_COLUMNS, serdeConstants.LIST_COLUMN_TYPES,
        ParquetOutputFormat.COMPRESSION})
public class ParquetHiveSerDe extends AbstractSerDe {
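  // Synthetic field names used in the Parquet representation of complex types:
  // maps serialize as repeated (key, value) groups and lists as a single-field
  // "bag" group (see createMap/createArray below).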
  public static final Text MAP_KEY = new Text("key");
  public static final Text MAP_VALUE = new Text("value");
  public static final Text MAP = new Text("map");
  public static final Text ARRAY = new Text("bag");

  // Default compression codec for the Parquet output format.
  private static final String DEFAULT_COMPRESSION =
          ParquetWriter.DEFAULT_COMPRESSION_CODEC_NAME.name();

  // Maps precision to the number of bytes needed for the binary representation.
  public static final int[] PRECISION_TO_BYTE_COUNT = new int[38];
  static {
    for (int prec = 1; prec <= 38; prec++) {
      // Estimated number of bytes needed.
      PRECISION_TO_BYTE_COUNT[prec - 1] = (int)
          Math.ceil((Math.log(Math.pow(10, prec) - 1) / Math.log(2) + 1) / 8);
    }
  }
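  // A few resulting values, for reference (computed from the formula above):
  // precision 1 -> 1 byte, precision 9 -> 4 bytes, precision 18 -> 8 bytes,
  // precision 38 -> 16 bytes.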

  private SerDeStats stats;
  private ObjectInspector objInspector;

  private enum LAST_OPERATION {
    SERIALIZE,
    DESERIALIZE,
    UNKNOWN
  }

  private LAST_OPERATION status;
  private long serializedSize;
  private long deserializedSize;
  private String compressionType;

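  /**
   * Reads the comma-separated column names, the column types and the requested
   * compression codec from the table properties, then builds the row
   * ObjectInspector used for (de)serialization.
   */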
  @Override
  public final void initialize(final Configuration conf, final Properties tbl) throws SerDeException {

    final TypeInfo rowTypeInfo;
    final List<String> columnNames;
    final List<TypeInfo> columnTypes;
    // Get column names and column types
    final String columnNameProperty = tbl.getProperty(serdeConstants.LIST_COLUMNS);
    final String columnTypeProperty = tbl.getProperty(serdeConstants.LIST_COLUMN_TYPES);

    // Get compression properties
    compressionType = tbl.getProperty(ParquetOutputFormat.COMPRESSION, DEFAULT_COMPRESSION);

    if (columnNameProperty.length() == 0) {
      columnNames = new ArrayList<String>();
    } else {
      columnNames = Arrays.asList(columnNameProperty.split(","));
    }
    if (columnTypeProperty.length() == 0) {
      columnTypes = new ArrayList<TypeInfo>();
    } else {
      columnTypes = TypeInfoUtils.getTypeInfosFromTypeString(columnTypeProperty);
    }
    if (columnNames.size() != columnTypes.size()) {
      throw new IllegalArgumentException("ParquetHiveSerDe initialization failed. Number of column " +
        "names and column types differs. columnNames = " + columnNames + ", columnTypes = " +
        columnTypes);
    }
    // Create row related objects
    rowTypeInfo = TypeInfoFactory.getStructTypeInfo(columnNames, columnTypes);
    this.objInspector = new ArrayWritableObjectInspector((StructTypeInfo) rowTypeInfo);

    // Stats part
    stats = new SerDeStats();
    serializedSize = 0;
    deserializedSize = 0;
    status = LAST_OPERATION.UNKNOWN;
  }

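  /**
   * Deserialization is a pass-through: the Parquet record reader already
   * materializes each row as an ArrayWritable, so the blob is returned as-is
   * (or null if it is not an ArrayWritable).
   */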
  @Override
  public Object deserialize(final Writable blob) throws SerDeException {
    status = LAST_OPERATION.DESERIALIZE;
    deserializedSize = 0;
    if (blob instanceof ArrayWritable) {
      deserializedSize = ((ArrayWritable) blob).get().length;
      return blob;
    } else {
      return null;
    }
  }

  @Override
  public ObjectInspector getObjectInspector() throws SerDeException {
    return objInspector;
  }

  @Override
  public Class<? extends Writable> getSerializedClass() {
    return ArrayWritable.class;
  }

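  /**
   * Serializes a Hive row (which must be a struct) into the nested
   * ArrayWritable tree consumed by the Parquet record writer.
   */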
  @Override
  public Writable serialize(final Object obj, final ObjectInspector objInspector)
      throws SerDeException {
    if (objInspector.getCategory() != Category.STRUCT) {
      throw new SerDeException("Cannot serialize " + objInspector.getCategory() + ". Can only serialize a struct");
    }
    final ArrayWritable serializeData = createStruct(obj, (StructObjectInspector) objInspector);
    serializedSize = serializeData.get().length;
    status = LAST_OPERATION.SERIALIZE;
    return serializeData;
  }

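  /**
   * Converts each struct field recursively; null fields stay null in the
   * resulting array so positions remain aligned with the schema.
   */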
  private ArrayWritable createStruct(final Object obj, final StructObjectInspector inspector)
      throws SerDeException {
    final List<? extends StructField> fields = inspector.getAllStructFieldRefs();
    final Writable[] arr = new Writable[fields.size()];
    for (int i = 0; i < fields.size(); i++) {
      final StructField field = fields.get(i);
      final Object subObj = inspector.getStructFieldData(obj, field);
      final ObjectInspector subInspector = field.getFieldObjectInspector();
      arr[i] = createObject(subObj, subInspector);
    }
    return new ArrayWritable(Writable.class, arr);
  }

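  /**
   * Serializes a map as a list of two-element (key, value) ArrayWritables,
   * wrapped in one extra ArrayWritable level to match the single repeated
   * group in the Parquet schema. Entries with a null key are skipped; an
   * empty or null map serializes as null.
   */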
  private Writable createMap(final Object obj, final MapObjectInspector inspector)
      throws SerDeException {
    final Map<?, ?> sourceMap = inspector.getMap(obj);
    final ObjectInspector keyInspector = inspector.getMapKeyObjectInspector();
    final ObjectInspector valueInspector = inspector.getMapValueObjectInspector();
    final List<ArrayWritable> array = new ArrayList<ArrayWritable>();

    if (sourceMap != null) {
      for (final Entry<?, ?> keyValue : sourceMap.entrySet()) {
        final Writable key = createObject(keyValue.getKey(), keyInspector);
        final Writable value = createObject(keyValue.getValue(), valueInspector);
        if (key != null) {
          Writable[] arr = new Writable[2];
          arr[0] = key;
          arr[1] = value;
          array.add(new ArrayWritable(Writable.class, arr));
        }
      }
    }
    if (array.size() > 0) {
      final ArrayWritable subArray = new ArrayWritable(ArrayWritable.class,
          array.toArray(new ArrayWritable[array.size()]));
      return new ArrayWritable(Writable.class, new Writable[] {subArray});
    } else {
      return null;
    }
  }

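  /**
   * Serializes a list as an ArrayWritable of its non-null elements, wrapped
   * in one extra ArrayWritable level (the "bag"); an empty or null list
   * serializes as null.
   */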
  private ArrayWritable createArray(final Object obj, final ListObjectInspector inspector)
      throws SerDeException {
    final List<?> sourceArray = inspector.getList(obj);
    final ObjectInspector subInspector = inspector.getListElementObjectInspector();
    final List<Writable> array = new ArrayList<Writable>();
    if (sourceArray != null) {
      for (final Object curObj : sourceArray) {
        final Writable newObj = createObject(curObj, subInspector);
        if (newObj != null) {
          array.add(newObj);
        }
      }
    }
    if (array.size() > 0) {
      final ArrayWritable subArray = new ArrayWritable(array.get(0).getClass(),
          array.toArray(new Writable[array.size()]));
      return new ArrayWritable(Writable.class, new Writable[] {subArray});
    } else {
      return null;
    }
  }

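  /**
   * Converts a single primitive value to its Writable form. Strings, chars
   * (stripped of trailing padding) and varchars become UTF-8 BytesWritable;
   * decimals become fixed-length binary (see the DECIMAL case below).
   */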
  private Writable createPrimitive(final Object obj, final PrimitiveObjectInspector inspector)
      throws SerDeException {
    if (obj == null) {
      return null;
    }
    switch (inspector.getPrimitiveCategory()) {
    case VOID:
      return null;
    case BOOLEAN:
      return new BooleanWritable(((BooleanObjectInspector) inspector).get(obj));
    case BYTE:
      return new ByteWritable(((ByteObjectInspector) inspector).get(obj));
    case DOUBLE:
      return new DoubleWritable(((DoubleObjectInspector) inspector).get(obj));
    case FLOAT:
      return new FloatWritable(((FloatObjectInspector) inspector).get(obj));
    case INT:
      return new IntWritable(((IntObjectInspector) inspector).get(obj));
    case LONG:
      return new LongWritable(((LongObjectInspector) inspector).get(obj));
    case SHORT:
      return new ShortWritable(((ShortObjectInspector) inspector).get(obj));
    case STRING:
      String v = ((StringObjectInspector) inspector).getPrimitiveJavaObject(obj);
      try {
        return new BytesWritable(v.getBytes("UTF-8"));
      } catch (UnsupportedEncodingException e) {
        throw new SerDeException("Failed to encode string in UTF-8", e);
      }
    case DECIMAL:
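      // Decimals are written as fixed-length big-endian two's-complement
      // binary, padded to PRECISION_TO_BYTE_COUNT[prec - 1] bytes.
      // Worked example: HiveDecimal -1.00 as decimal(5, 2) has unscaled value
      // -100, i.e. src = {0x9C}; precision 5 needs 3 bytes, so after
      // sign-extension the emitted value is {0xFF, 0xFF, 0x9C}.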
      HiveDecimal hd = (HiveDecimal)inspector.getPrimitiveJavaObject(obj);
      DecimalTypeInfo decTypeInfo = (DecimalTypeInfo) inspector.getTypeInfo();
      int prec = decTypeInfo.precision();
      int scale = decTypeInfo.scale();
      byte[] src = hd.setScale(scale).unscaledValue().toByteArray();
      // Estimated number of bytes needed.
      int bytes =  PRECISION_TO_BYTE_COUNT[prec - 1];
      if (bytes == src.length) {
        // No padding needed.
        return new BytesWritable(src);
      }
      byte[] tgt = new byte[bytes];
      if (hd.signum() == -1) {
        // For a negative number, pre-fill with 1 bits so the leading pad
        // bytes sign-extend the value; the copy below overwrites the rest.
        for (int i = 0; i < bytes; i++) {
          tgt[i] = (byte) 0xFF;
        }
      }
      System.arraycopy(src, 0, tgt, bytes - src.length, src.length); // Padding leading zeroes/ones.
      return new BytesWritable(tgt);
    case TIMESTAMP:
      return new TimestampWritable(((TimestampObjectInspector) inspector).getPrimitiveJavaObject(obj));
    case CHAR:
      String strippedValue = ((HiveCharObjectInspector) inspector).getPrimitiveJavaObject(obj).getStrippedValue();
      return new BytesWritable(Binary.fromString(strippedValue).getBytes());
    case VARCHAR:
      String value = ((HiveVarcharObjectInspector) inspector).getPrimitiveJavaObject(obj).getValue();
      return new BytesWritable(Binary.fromString(value).getBytes());
    default:
      throw new SerDeException("Unknown primitive : " + inspector.getPrimitiveCategory());
    }
  }

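  /**
   * Dispatches on the inspector category to the matching create* helper.
   */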
  private Writable createObject(final Object obj, final ObjectInspector inspector) throws SerDeException {
    switch (inspector.getCategory()) {
    case STRUCT:
      return createStruct(obj, (StructObjectInspector) inspector);
    case LIST:
      return createArray(obj, (ListObjectInspector) inspector);
    case MAP:
      return createMap(obj, (MapObjectInspector) inspector);
    case PRIMITIVE:
      return createPrimitive(obj, (PrimitiveObjectInspector) inspector);
    default:
      throw new SerDeException("Unknown data type" + inspector.getCategory());
    }
  }

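  /**
   * Reports the raw size recorded by the most recent serialize() or
   * deserialize() call.
   */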
  @Override
  public SerDeStats getSerDeStats() {
    // Stats are only meaningful after a serialize() or deserialize() call.
    assert (status != LAST_OPERATION.UNKNOWN);
    if (status == LAST_OPERATION.SERIALIZE) {
      stats.setRawDataSize(serializedSize);
    } else {
      stats.setRawDataSize(deserializedSize);
    }
    return stats;
  }
}