/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.exec.vector;
import java.io.IOException;
import java.sql.Date;
import java.sql.Timestamp;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.common.type.Decimal128;
import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.io.HiveFileFormatUtils;
import org.apache.hadoop.hive.ql.io.IOPrepareCache;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
import org.apache.hadoop.hive.serde2.Deserializer;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.FileSplit;
/**
* Context for Vectorized row batch. this calss does eager deserialization of row data using serde
* in the RecordReader layer.
* It has supports partitions in this layer so that the vectorized batch is populated correctly
* with the partition column.
*/
public class VectorizedRowBatchCtx {
private static final Log LOG = LogFactory.getLog(VectorizedRowBatchCtx.class.getName());
// OI for raw row data (EG without partition cols)
private StructObjectInspector rawRowOI;
// OI for the row (Raw row OI + partition OI)
private StructObjectInspector rowOI;
// Deserializer for the row data
private Deserializer deserializer;
// Hash map of partition values. Key=TblColName value=PartitionValue
private Map<String, Object> partitionValues;
//partition types
private Map<String, PrimitiveCategory> partitionTypes;
// Column projection list - List of column indexes to include. This
// list does not contain partition columns
private List<Integer> colsToInclude;
private Map<Integer, String> columnTypeMap = null;
/**
* Constructor for VectorizedRowBatchCtx
*
* @param rawRowOI
* OI for raw row data (EG without partition cols)
* @param rowOI
* OI for the row (Raw row OI + partition OI)
* @param deserializer
* Deserializer for the row data
* @param partitionValues
* Hash map of partition values. Key=TblColName value=PartitionValue
*/
public VectorizedRowBatchCtx(StructObjectInspector rawRowOI, StructObjectInspector rowOI,
Deserializer deserializer, Map<String, Object> partitionValues,
Map<String, PrimitiveCategory> partitionTypes) {
this.rowOI = rowOI;
this.rawRowOI = rawRowOI;
this.deserializer = deserializer;
this.partitionValues = partitionValues;
this.partitionTypes = partitionTypes;
}
/**
* Constructor for VectorizedRowBatchCtx
*/
public VectorizedRowBatchCtx() {
}
/**
* Initializes the VectorizedRowBatch context based on an arbitrary object inspector
* Used by non-tablescan operators when they change the vectorization context
* @param hiveConf
* @param fileKey
* The key on which to retrieve the extra column mapping from the map scratch
* @param rowOI
* Object inspector that shapes the column types
*/
public void init(Configuration hiveConf, String fileKey,
StructObjectInspector rowOI) {
columnTypeMap = Utilities
.getMapRedWork(hiveConf).getMapWork().getScratchColumnVectorTypes()
.get(fileKey);
this.rowOI= rowOI;
this.rawRowOI = rowOI;
}
/**
* Initializes VectorizedRowBatch context based on the
* split and Hive configuration (Job conf with hive Plan).
*
* @param hiveConf
* Hive configuration using Hive plan is extracted
* @param split
* File split of the file being read
* @throws ClassNotFoundException
* @throws IOException
* @throws SerDeException
* @throws InstantiationException
* @throws IllegalAccessException
* @throws HiveException
*/
public void init(Configuration hiveConf, FileSplit split) throws ClassNotFoundException,
IOException,
SerDeException,
InstantiationException,
IllegalAccessException, HiveException {
Map<String, PartitionDesc> pathToPartitionInfo = Utilities
.getMapRedWork(hiveConf).getMapWork().getPathToPartitionInfo();
PartitionDesc part = HiveFileFormatUtils
.getPartitionDescFromPathRecursively(pathToPartitionInfo,
split.getPath(), IOPrepareCache.get().getPartitionDescMap());
String partitionPath = split.getPath().getParent().toString();
columnTypeMap = Utilities
.getMapRedWork(hiveConf).getMapWork().getScratchColumnVectorTypes()
.get(partitionPath);
Properties partProps =
(part.getPartSpec() == null || part.getPartSpec().isEmpty()) ?
part.getTableDesc().getProperties() : part.getProperties();
Class serdeclass = hiveConf.getClassByName(part.getSerdeClassName());
Deserializer partDeserializer = (Deserializer) serdeclass.newInstance();
partDeserializer.initialize(hiveConf, partProps);
StructObjectInspector partRawRowObjectInspector = (StructObjectInspector) partDeserializer
.getObjectInspector();
deserializer = partDeserializer;
// Check to see if this split is part of a partition of a table
String pcols = partProps.getProperty(hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS);
if (pcols != null && pcols.length() > 0) {
// Partitions exist for this table. Get the partition object inspector and
// raw row object inspector (row with out partition col)
LinkedHashMap<String, String> partSpec = part.getPartSpec();
String[] partKeys = pcols.trim().split("/");
String pcolTypes = partProps.getProperty(hive_metastoreConstants.META_TABLE_PARTITION_COLUMN_TYPES);
String[] partKeyTypes = pcolTypes.trim().split(":");
if (partKeys.length > partKeyTypes.length) {
throw new HiveException("Internal error : partKeys length, " +partKeys.length +
" greater than partKeyTypes length, " + partKeyTypes.length);
}
List<String> partNames = new ArrayList<String>(partKeys.length);
List<ObjectInspector> partObjectInspectors = new ArrayList<ObjectInspector>(partKeys.length);
partitionValues = new LinkedHashMap<String, Object>();
partitionTypes = new LinkedHashMap<String, PrimitiveCategory>();
for (int i = 0; i < partKeys.length; i++) {
String key = partKeys[i];
partNames.add(key);
ObjectInspector objectInspector = null;
Object objectVal;
if (partSpec == null) {
// for partitionless table, initialize partValue to empty string.
// We can have partitionless table even if we have partition keys
// when there is only only partition selected and the partition key is not
// part of the projection/include list.
objectVal = null;
objectInspector = PrimitiveObjectInspectorFactory.javaStringObjectInspector;
partitionTypes.put(key, PrimitiveCategory.STRING);
} else {
// Create a Standard java object Inspector
objectInspector = TypeInfoUtils.getStandardJavaObjectInspectorFromTypeInfo(
TypeInfoFactory.getPrimitiveTypeInfo(partKeyTypes[i]));
objectVal =
ObjectInspectorConverters.
getConverter(PrimitiveObjectInspectorFactory.
javaStringObjectInspector, objectInspector).
convert(partSpec.get(key));
partitionTypes.put(key, TypeInfoFactory.getPrimitiveTypeInfo(partKeyTypes[i]).getPrimitiveCategory());
}
if (LOG.isDebugEnabled()) {
LOG.debug("Partition column: name: " + key + ", value: " + objectVal + ", type: " + partitionTypes.get(key));
}
partitionValues.put(key, objectVal);
partObjectInspectors.add(objectInspector);
}
// Create partition OI
StructObjectInspector partObjectInspector = ObjectInspectorFactory
.getStandardStructObjectInspector(partNames, partObjectInspectors);
// Get row OI from partition OI and raw row OI
StructObjectInspector rowObjectInspector = ObjectInspectorFactory
.getUnionStructObjectInspector(Arrays
.asList(new StructObjectInspector[] {partRawRowObjectInspector, partObjectInspector}));
rowOI = rowObjectInspector;
rawRowOI = partRawRowObjectInspector;
} else {
// No partitions for this table, hence row OI equals raw row OI
rowOI = partRawRowObjectInspector;
rawRowOI = partRawRowObjectInspector;
}
colsToInclude = ColumnProjectionUtils.getReadColumnIDs(hiveConf);
}
/**
* Creates a Vectorized row batch and the column vectors.
*
* @return VectorizedRowBatch
* @throws HiveException
*/
public VectorizedRowBatch createVectorizedRowBatch() throws HiveException
{
List<? extends StructField> fieldRefs = rowOI.getAllStructFieldRefs();
VectorizedRowBatch result = new VectorizedRowBatch(fieldRefs.size());
for (int j = 0; j < fieldRefs.size(); j++) {
// If the column is included in the include list or if the column is a
// partition column then create the column vector. Also note that partition columns are not
// in the included list.
if ((colsToInclude == null) || colsToInclude.contains(j)
|| ((partitionValues != null) &&
partitionValues.containsKey(fieldRefs.get(j).getFieldName()))) {
ObjectInspector foi = fieldRefs.get(j).getFieldObjectInspector();
switch (foi.getCategory()) {
case PRIMITIVE: {
PrimitiveObjectInspector poi = (PrimitiveObjectInspector) foi;
// Vectorization currently only supports the following data types:
// BOOLEAN, BYTE, SHORT, INT, LONG, FLOAT, DOUBLE, STRING, TIMESTAMP,
// DATE and DECIMAL
switch (poi.getPrimitiveCategory()) {
case BOOLEAN:
case BYTE:
case SHORT:
case INT:
case LONG:
case TIMESTAMP:
case DATE:
result.cols[j] = new LongColumnVector(VectorizedRowBatch.DEFAULT_SIZE);
break;
case FLOAT:
case DOUBLE:
result.cols[j] = new DoubleColumnVector(VectorizedRowBatch.DEFAULT_SIZE);
break;
case STRING:
result.cols[j] = new BytesColumnVector(VectorizedRowBatch.DEFAULT_SIZE);
break;
case DECIMAL:
DecimalTypeInfo tInfo = (DecimalTypeInfo) poi.getTypeInfo();
result.cols[j] = new DecimalColumnVector(VectorizedRowBatch.DEFAULT_SIZE,
tInfo.precision(), tInfo.scale());
break;
default:
throw new RuntimeException("Vectorizaton is not supported for datatype:"
+ poi.getPrimitiveCategory());
}
break;
}
case LIST:
case MAP:
case STRUCT:
case UNION:
throw new HiveException("Vectorizaton is not supported for datatype:"
+ foi.getCategory());
default:
throw new HiveException("Unknown ObjectInspector category!");
}
}
}
result.numCols = fieldRefs.size();
this.addScratchColumnsToBatch(result);
result.reset();
return result;
}
/**
* Adds the row to the batch after deserializing the row
*
* @param rowIndex
* Row index in the batch to which the row is added
* @param rowBlob
* Row blob (serialized version of row)
* @param batch
* Vectorized batch to which the row is added
* @param buffer a buffer to copy strings into
* @throws HiveException
* @throws SerDeException
*/
public void addRowToBatch(int rowIndex, Writable rowBlob,
VectorizedRowBatch batch,
DataOutputBuffer buffer
) throws HiveException, SerDeException
{
Object row = this.deserializer.deserialize(rowBlob);
VectorizedBatchUtil.addRowToBatch(row, this.rawRowOI, rowIndex, batch, buffer);
}
/**
* Deserialized set of rows and populates the batch
*
* @param rowBlob
* to deserialize
* @param batch
* Vectorized row batch which contains deserialized data
* @throws SerDeException
*/
public void convertRowBatchBlobToVectorizedBatch(Object rowBlob, int rowsInBlob,
VectorizedRowBatch batch)
throws SerDeException {
if (deserializer instanceof VectorizedSerde) {
((VectorizedSerde) deserializer).deserializeVector(rowBlob, rowsInBlob, batch);
} else {
throw new SerDeException(
"Not able to deserialize row batch. Serde does not implement VectorizedSerde");
}
}
private int getColIndexBasedOnColName(String colName) throws HiveException
{
List<? extends StructField> fieldRefs = rowOI.getAllStructFieldRefs();
for (int i = 0; i < fieldRefs.size(); i++) {
if (fieldRefs.get(i).getFieldName().equals(colName)) {
return i;
}
}
throw new HiveException("Not able to find column name in row object inspector");
}
/**
* Add the partition values to the batch
*
* @param batch
* @throws HiveException
*/
public void addPartitionColsToBatch(VectorizedRowBatch batch) throws HiveException
{
int colIndex;
Object value;
PrimitiveCategory pCategory;
if (partitionValues != null) {
for (String key : partitionValues.keySet()) {
colIndex = getColIndexBasedOnColName(key);
value = partitionValues.get(key);
pCategory = partitionTypes.get(key);
switch (pCategory) {
case BOOLEAN: {
LongColumnVector lcv = (LongColumnVector) batch.cols[colIndex];
if (value == null) {
lcv.noNulls = false;
lcv.isNull[0] = true;
lcv.isRepeating = true;
} else {
lcv.fill((Boolean)value == true ? 1 : 0);
lcv.isNull[0] = false;
}
}
break;
case BYTE: {
LongColumnVector lcv = (LongColumnVector) batch.cols[colIndex];
if (value == null) {
lcv.noNulls = false;
lcv.isNull[0] = true;
lcv.isRepeating = true;
} else {
lcv.fill((Byte)value);
lcv.isNull[0] = false;
}
}
break;
case SHORT: {
LongColumnVector lcv = (LongColumnVector) batch.cols[colIndex];
if (value == null) {
lcv.noNulls = false;
lcv.isNull[0] = true;
lcv.isRepeating = true;
} else {
lcv.fill((Short)value);
lcv.isNull[0] = false;
}
}
break;
case INT: {
LongColumnVector lcv = (LongColumnVector) batch.cols[colIndex];
if (value == null) {
lcv.noNulls = false;
lcv.isNull[0] = true;
lcv.isRepeating = true;
} else {
lcv.fill((Integer)value);
lcv.isNull[0] = false;
}
}
break;
case LONG: {
LongColumnVector lcv = (LongColumnVector) batch.cols[colIndex];
if (value == null) {
lcv.noNulls = false;
lcv.isNull[0] = true;
lcv.isRepeating = true;
} else {
lcv.fill((Long)value);
lcv.isNull[0] = false;
}
}
break;
case DATE: {
LongColumnVector lcv = (LongColumnVector) batch.cols[colIndex];
if (value == null) {
lcv.noNulls = false;
lcv.isNull[0] = true;
lcv.isRepeating = true;
} else {
lcv.fill(((Date)value).getTime());
lcv.isNull[0] = false;
}
}
break;
case TIMESTAMP: {
LongColumnVector lcv = (LongColumnVector) batch.cols[colIndex];
if (value == null) {
lcv.noNulls = false;
lcv.isNull[0] = true;
lcv.isRepeating = true;
} else {
lcv.fill((long)(((Timestamp) value).getTime()));
lcv.isNull[0] = false;
}
}
break;
case FLOAT: {
DoubleColumnVector dcv = (DoubleColumnVector) batch.cols[colIndex];
if (value == null) {
dcv.noNulls = false;
dcv.isNull[0] = true;
dcv.isRepeating = true;
} else {
dcv.fill((Float) value);
dcv.isNull[0] = false;
}
}
break;
case DOUBLE: {
DoubleColumnVector dcv = (DoubleColumnVector) batch.cols[colIndex];
if (value == null) {
dcv.noNulls = false;
dcv.isNull[0] = true;
dcv.isRepeating = true;
} else {
dcv.fill((Double) value);
dcv.isNull[0] = false;
}
}
break;
case DECIMAL: {
DecimalColumnVector dv = (DecimalColumnVector) batch.cols[colIndex];
if (value == null) {
dv.noNulls = false;
dv.isNull[0] = true;
dv.isRepeating = true;
} else {
HiveDecimal hd = (HiveDecimal)(value);
dv.vector[0] = new Decimal128(hd.toString(), (short)hd.scale());
dv.isRepeating = true;
dv.isNull[0] = false;
}
}
break;
case STRING: {
BytesColumnVector bcv = (BytesColumnVector) batch.cols[colIndex];
String sVal = (String)value;
if (sVal == null) {
bcv.noNulls = false;
bcv.isNull[0] = true;
bcv.isRepeating = true;
} else {
bcv.fill(sVal.getBytes());
bcv.isNull[0] = false;
}
}
break;
default:
throw new HiveException("Unable to recognize the partition type " + pCategory +
" for column " + key);
}
}
}
}
private void addScratchColumnsToBatch(VectorizedRowBatch vrb) {
if (columnTypeMap != null && !columnTypeMap.isEmpty()) {
int origNumCols = vrb.numCols;
int newNumCols = vrb.cols.length+columnTypeMap.keySet().size();
vrb.cols = Arrays.copyOf(vrb.cols, newNumCols);
for (int i = origNumCols; i < newNumCols; i++) {
vrb.cols[i] = allocateColumnVector(columnTypeMap.get(i),
VectorizedRowBatch.DEFAULT_SIZE);
}
vrb.numCols = vrb.cols.length;
}
}
/**
* Get the scale and precision for the given decimal type string. The decimal type is assumed to be
* of the format decimal(precision,scale) e.g. decimal(20,10).
* @param decimalType The given decimal type string.
* @return An integer array of size 2 with first element set to precision and second set to scale.
*/
private int[] getScalePrecisionFromDecimalType(String decimalType) {
Pattern p = Pattern.compile("\\d+");
Matcher m = p.matcher(decimalType);
m.find();
int precision = Integer.parseInt(m.group());
m.find();
int scale = Integer.parseInt(m.group());
int [] precScale = { precision, scale };
return precScale;
}
private ColumnVector allocateColumnVector(String type, int defaultSize) {
if (type.equalsIgnoreCase("double")) {
return new DoubleColumnVector(defaultSize);
} else if (type.equalsIgnoreCase("string")) {
return new BytesColumnVector(defaultSize);
} else if (VectorizationContext.decimalTypePattern.matcher(type).matches()){
int [] precisionScale = getScalePrecisionFromDecimalType(type);
return new DecimalColumnVector(defaultSize, precisionScale[0], precisionScale[1]);
} else {
return new LongColumnVector(defaultSize);
}
}
}