/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.hive.hcatalog.pig;
import java.io.IOException;
import java.math.BigDecimal;
import java.sql.Date;
import java.sql.Timestamp;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Properties;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.type.HiveChar;
import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.common.type.HiveVarchar;
import org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hive.hcatalog.common.HCatException;
import org.apache.hive.hcatalog.data.DefaultHCatRecord;
import org.apache.hive.hcatalog.data.HCatRecord;
import org.apache.hive.hcatalog.data.schema.HCatFieldSchema;
import org.apache.hive.hcatalog.data.schema.HCatFieldSchema.Type;
import org.apache.hive.hcatalog.data.schema.HCatSchema;
import org.apache.pig.ResourceSchema;
import org.apache.pig.ResourceStatistics;
import org.apache.pig.StoreFunc;
import org.apache.pig.StoreMetadata;
import org.apache.pig.backend.BackendException;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.logicalLayer.schema.Schema;
import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema;
import org.apache.pig.impl.util.ObjectSerializer;
import org.apache.pig.impl.util.UDFContext;
import org.apache.pig.impl.util.Utils;
import org.joda.time.DateTime;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.collect.Lists;
/**
* Base class for HCatStorer and HCatEximStorer
*
*/
abstract class HCatBaseStorer extends StoreFunc implements StoreMetadata {
private static final Logger LOG = LoggerFactory.getLogger( HCatBaseStorer.class );
private static final List<Type> SUPPORTED_INTEGER_CONVERSIONS =
Lists.newArrayList(Type.TINYINT, Type.SMALLINT, Type.INT);
protected static final String COMPUTED_OUTPUT_SCHEMA = "hcat.output.schema";
protected final List<String> partitionKeys;
protected final Map<String, String> partitions;
protected Schema pigSchema;
private RecordWriter<WritableComparable<?>, HCatRecord> writer;
protected HCatSchema computedSchema;
protected static final String PIG_SCHEMA = "hcat.pig.store.schema";
/**
* Controls what happens when incoming Pig value is out-of-range for target Hive column
*/
static final String ON_OOR_VALUE_OPT = "onOutOfRangeValue";
/**
* prop name in Configuration/context
*/
static final String ON_OORA_VALUE_PROP = "hcat.pig.store.onoutofrangevalue";
/**
* valid values for ON_OOR_VALUE_OPT
*/
public static enum OOR_VALUE_OPT_VALUES {Null, Throw}
protected String sign;
//it's key that this is a per HCatStorer instance object
private final DataLossLogger dataLossLogger = new DataLossLogger();
private final OOR_VALUE_OPT_VALUES onOutOfRange;
public HCatBaseStorer(String partSpecs, String schema) throws Exception {
partitionKeys = new ArrayList<String>();
partitions = new HashMap<String, String>();
if (partSpecs != null && !partSpecs.trim().isEmpty()) {
String[] partKVPs = partSpecs.split(",");
for (String partKVP : partKVPs) {
String[] partKV = partKVP.split("=");
if (partKV.length == 2) {
String partKey = partKV[0].trim();
partitionKeys.add(partKey);
partitions.put(partKey, partKV[1].trim());
} else {
throw new FrontendException("Invalid partition column specification. " + partSpecs, PigHCatUtil.PIG_EXCEPTION_CODE);
}
}
}
if (schema != null && !schema.trim().isEmpty()) {
pigSchema = Utils.getSchemaFromString(schema);
}
Properties udfProps = UDFContext.getUDFContext().getUDFProperties(this.getClass(), new String[]{sign});
onOutOfRange = OOR_VALUE_OPT_VALUES.valueOf(udfProps.getProperty(ON_OORA_VALUE_PROP, getDefaultValue().name()));
}
static OOR_VALUE_OPT_VALUES getDefaultValue() {
return OOR_VALUE_OPT_VALUES.Null;
}
@Override
public void checkSchema(ResourceSchema resourceSchema) throws IOException {
/* Schema provided by user and the schema computed by Pig
* at the time of calling store must match.
*/
Schema runtimeSchema = Schema.getPigSchema(resourceSchema);
if (pigSchema != null) {
if (!Schema.equals(runtimeSchema, pigSchema, false, true)) {
throw new FrontendException("Schema provided in store statement doesn't match with the Schema" +
"returned by Pig run-time. Schema provided in HCatStorer: " + pigSchema.toString() + " Schema received from Pig runtime: " + runtimeSchema.toString(), PigHCatUtil.PIG_EXCEPTION_CODE);
}
} else {
pigSchema = runtimeSchema;
}
UDFContext.getUDFContext().getUDFProperties(this.getClass(), new String[]{sign}).setProperty(PIG_SCHEMA, ObjectSerializer.serialize(pigSchema));
}
/** Constructs HCatSchema from pigSchema. Passed tableSchema is the existing
* schema of the table in metastore.
*/
protected HCatSchema convertPigSchemaToHCatSchema(Schema pigSchema, HCatSchema tableSchema) throws FrontendException {
if(LOG.isDebugEnabled()) {
LOG.debug("convertPigSchemaToHCatSchema(pigSchema,tblSchema)=(" + pigSchema + "," + tableSchema + ")");
}
List<HCatFieldSchema> fieldSchemas = new ArrayList<HCatFieldSchema>(pigSchema.size());
for (FieldSchema fSchema : pigSchema.getFields()) {
try {
HCatFieldSchema hcatFieldSchema = getColFromSchema(fSchema.alias, tableSchema);
//if writing to a partitioned table, then pigSchema will have more columns than tableSchema
//partition columns are not part of tableSchema... e.g. TestHCatStorer#testPartColsInData()
// HCatUtil.assertNotNull(hcatFieldSchema, "Nothing matching '" + fSchema.alias + "' found " +
// "in target table schema", LOG);
fieldSchemas.add(getHCatFSFromPigFS(fSchema, hcatFieldSchema, pigSchema, tableSchema));
} catch (HCatException he) {
throw new FrontendException(he.getMessage(), PigHCatUtil.PIG_EXCEPTION_CODE, he);
}
}
HCatSchema s = new HCatSchema(fieldSchemas);
LOG.debug("convertPigSchemaToHCatSchema(computed)=(" + s + ")");
return s;
}
public static boolean removeTupleFromBag(HCatFieldSchema hcatFieldSchema, FieldSchema bagFieldSchema) throws HCatException {
if (hcatFieldSchema != null && hcatFieldSchema.getArrayElementSchema().get(0).getType() != Type.STRUCT) {
return true;
}
// Column was not found in table schema. Its a new column
List<FieldSchema> tupSchema = bagFieldSchema.schema.getFields();
if (hcatFieldSchema == null && tupSchema.size() == 1 && (tupSchema.get(0).schema == null || (tupSchema.get(0).type == DataType.TUPLE && tupSchema.get(0).schema.size() == 1))) {
return true;
}
return false;
}
/**
* Here we are processing HCat table schema as derived from metastore,
* thus it should have information about all fields/sub-fields, but not for partition columns
*/
private HCatFieldSchema getHCatFSFromPigFS(FieldSchema fSchema, HCatFieldSchema hcatFieldSchema,
Schema pigSchema, HCatSchema tableSchema)
throws FrontendException, HCatException {
if(hcatFieldSchema == null) {
if(LOG.isDebugEnabled()) {
LOG.debug("hcatFieldSchema is null for fSchema '" + fSchema.alias + "'");
//throw new IllegalArgumentException("hcatFiledSchema is null; fSchema=" + fSchema + " " +
// "(pigSchema, tableSchema)=(" + pigSchema + "," + tableSchema + ")");
}
}
byte type = fSchema.type;
switch (type) {
case DataType.CHARARRAY:
case DataType.BIGCHARARRAY:
if(hcatFieldSchema != null && hcatFieldSchema.getTypeInfo() != null) {
return new HCatFieldSchema(fSchema.alias, hcatFieldSchema.getTypeInfo(), null);
}
return new HCatFieldSchema(fSchema.alias, TypeInfoFactory.stringTypeInfo, null);
case DataType.INTEGER:
if (hcatFieldSchema != null) {
if (!SUPPORTED_INTEGER_CONVERSIONS.contains(hcatFieldSchema.getType())) {
throw new FrontendException("Unsupported type: " + type + " in Pig's schema",
PigHCatUtil.PIG_EXCEPTION_CODE);
}
return new HCatFieldSchema(fSchema.alias, hcatFieldSchema.getTypeInfo(), null);
}
return new HCatFieldSchema(fSchema.alias, TypeInfoFactory.intTypeInfo, null);
case DataType.LONG:
return new HCatFieldSchema(fSchema.alias, TypeInfoFactory.longTypeInfo, null);
case DataType.FLOAT:
return new HCatFieldSchema(fSchema.alias, TypeInfoFactory.floatTypeInfo, null);
case DataType.DOUBLE:
return new HCatFieldSchema(fSchema.alias, TypeInfoFactory.doubleTypeInfo, null);
case DataType.BYTEARRAY:
return new HCatFieldSchema(fSchema.alias, TypeInfoFactory.binaryTypeInfo, null);
case DataType.BOOLEAN:
return new HCatFieldSchema(fSchema.alias, TypeInfoFactory.booleanTypeInfo, null);
case DataType.DATETIME:
//Pig DATETIME can map to DATE or TIMESTAMP (see HCatBaseStorer#validateSchema()) which
//is controlled by Hive target table information
if(hcatFieldSchema != null && hcatFieldSchema.getTypeInfo() != null) {
return new HCatFieldSchema(fSchema.alias, hcatFieldSchema.getTypeInfo(), null);
}
return new HCatFieldSchema(fSchema.alias, TypeInfoFactory.timestampTypeInfo, null);
case DataType.BIGDECIMAL:
if(hcatFieldSchema != null && hcatFieldSchema.getTypeInfo() != null) {
return new HCatFieldSchema(fSchema.alias, hcatFieldSchema.getTypeInfo(), null);
}
return new HCatFieldSchema(fSchema.alias, TypeInfoFactory.decimalTypeInfo, null);
case DataType.BAG:
Schema bagSchema = fSchema.schema;
List<HCatFieldSchema> arrFields = new ArrayList<HCatFieldSchema>(1);
FieldSchema field;
// Find out if we need to throw away the tuple or not.
if (removeTupleFromBag(hcatFieldSchema, fSchema)) {
field = bagSchema.getField(0).schema.getField(0);
} else {
field = bagSchema.getField(0);
}
arrFields.add(getHCatFSFromPigFS(field, hcatFieldSchema == null ? null : hcatFieldSchema
.getArrayElementSchema().get(0), pigSchema, tableSchema));
return new HCatFieldSchema(fSchema.alias, Type.ARRAY, new HCatSchema(arrFields), "");
case DataType.TUPLE:
List<HCatFieldSchema> hcatFSs = new ArrayList<HCatFieldSchema>();
HCatSchema structSubSchema = hcatFieldSchema == null ? null : hcatFieldSchema.getStructSubSchema();
List<FieldSchema> fields = fSchema.schema.getFields();
for (int i = 0; i < fields.size(); i++) {
FieldSchema fieldSchema = fields.get(i);
hcatFSs.add(getHCatFSFromPigFS(fieldSchema, structSubSchema == null ? null : structSubSchema.get(i), pigSchema, tableSchema));
}
return new HCatFieldSchema(fSchema.alias, Type.STRUCT, new HCatSchema(hcatFSs), "");
case DataType.MAP: {
// Pig's schema contain no type information about map's keys and
// values. So, if its a new column assume <string,string> if its existing
// return whatever is contained in the existing column.
HCatFieldSchema valFS;
List<HCatFieldSchema> valFSList = new ArrayList<HCatFieldSchema>(1);
if (hcatFieldSchema != null) {
return HCatFieldSchema.createMapTypeFieldSchema(fSchema.alias, hcatFieldSchema.getMapKeyTypeInfo(),
hcatFieldSchema.getMapValueSchema(), "");
}
// Column not found in target table. Its a new column. Its schema is map<string,string>
valFS = new HCatFieldSchema(fSchema.alias, TypeInfoFactory.stringTypeInfo, "");
valFSList.add(valFS);
return HCatFieldSchema.createMapTypeFieldSchema(fSchema.alias,
TypeInfoFactory.stringTypeInfo, new HCatSchema(valFSList), "");
}
case DataType.BIGINTEGER:
//fall through; doesn't map to Hive/Hcat type; here for completeness
default:
throw new FrontendException("Unsupported type: " + type + " in Pig's schema", PigHCatUtil.PIG_EXCEPTION_CODE);
}
}
@Override
public void prepareToWrite(RecordWriter writer) throws IOException {
this.writer = writer;
computedSchema = (HCatSchema) ObjectSerializer.deserialize(UDFContext.getUDFContext().getUDFProperties(this.getClass(), new String[]{sign}).getProperty(COMPUTED_OUTPUT_SCHEMA));
}
@Override
public void putNext(Tuple tuple) throws IOException {
List<Object> outgoing = new ArrayList<Object>(tuple.size());
int i = 0;
for (HCatFieldSchema fSchema : computedSchema.getFields()) {
outgoing.add(getJavaObj(tuple.get(i++), fSchema));
}
try {
writer.write(null, new DefaultHCatRecord(outgoing));
} catch (InterruptedException e) {
throw new BackendException("Error while writing tuple: " + tuple, PigHCatUtil.PIG_EXCEPTION_CODE, e);
}
}
/**
* Convert from Pig value object to Hive value object
* This method assumes that {@link #validateSchema(org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema, org.apache.hive.hcatalog.data.schema.HCatFieldSchema, org.apache.pig.impl.logicalLayer.schema.Schema, org.apache.hive.hcatalog.data.schema.HCatSchema, int)}
* which checks the types in Pig schema are compatible with target Hive table, has been called.
*/
private Object getJavaObj(Object pigObj, HCatFieldSchema hcatFS) throws HCatException, BackendException {
try {
if(pigObj == null) return null;
// The real work-horse. Spend time and energy in this method if there is
// need to keep HCatStorer lean and go fast.
Type type = hcatFS.getType();
switch (type) {
case BINARY:
return ((DataByteArray) pigObj).get();
case STRUCT:
HCatSchema structSubSchema = hcatFS.getStructSubSchema();
// Unwrap the tuple.
List<Object> all = ((Tuple) pigObj).getAll();
ArrayList<Object> converted = new ArrayList<Object>(all.size());
for (int i = 0; i < all.size(); i++) {
converted.add(getJavaObj(all.get(i), structSubSchema.get(i)));
}
return converted;
case ARRAY:
// Unwrap the bag.
DataBag pigBag = (DataBag) pigObj;
HCatFieldSchema tupFS = hcatFS.getArrayElementSchema().get(0);
boolean needTuple = tupFS.getType() == Type.STRUCT;
List<Object> bagContents = new ArrayList<Object>((int) pigBag.size());
Iterator<Tuple> bagItr = pigBag.iterator();
while (bagItr.hasNext()) {
// If there is only one element in tuple contained in bag, we throw away the tuple.
bagContents.add(getJavaObj(needTuple ? bagItr.next() : bagItr.next().get(0), tupFS));
}
return bagContents;
case MAP:
Map<?, ?> pigMap = (Map<?, ?>) pigObj;
Map<Object, Object> typeMap = new HashMap<Object, Object>();
for (Entry<?, ?> entry : pigMap.entrySet()) {
// the value has a schema and not a FieldSchema
typeMap.put(
// Schema validation enforces that the Key is a String
(String) entry.getKey(),
getJavaObj(entry.getValue(), hcatFS.getMapValueSchema().get(0)));
}
return typeMap;
case STRING:
case INT:
case BIGINT:
case FLOAT:
case DOUBLE:
return pigObj;
case SMALLINT:
if ((Integer) pigObj < Short.MIN_VALUE || (Integer) pigObj > Short.MAX_VALUE) {
handleOutOfRangeValue(pigObj, hcatFS);
return null;
}
return ((Integer) pigObj).shortValue();
case TINYINT:
if ((Integer) pigObj < Byte.MIN_VALUE || (Integer) pigObj > Byte.MAX_VALUE) {
handleOutOfRangeValue(pigObj, hcatFS);
return null;
}
return ((Integer) pigObj).byteValue();
case BOOLEAN:
if( pigObj instanceof String ) {
if( ((String)pigObj).trim().compareTo("0") == 0 ) {
return Boolean.FALSE;
}
if( ((String)pigObj).trim().compareTo("1") == 0 ) {
return Boolean.TRUE;
}
throw new BackendException("Unexpected type " + type + " for value " + pigObj
+ " of class " + pigObj.getClass().getName(), PigHCatUtil.PIG_EXCEPTION_CODE);
}
return Boolean.parseBoolean( pigObj.toString() );
case DECIMAL:
BigDecimal bd = (BigDecimal)pigObj;
DecimalTypeInfo dti = (DecimalTypeInfo)hcatFS.getTypeInfo();
if(bd.precision() > dti.precision() || bd.scale() > dti.scale()) {
handleOutOfRangeValue(pigObj, hcatFS);
return null;
}
return HiveDecimal.create(bd);
case CHAR:
String charVal = (String)pigObj;
CharTypeInfo cti = (CharTypeInfo)hcatFS.getTypeInfo();
if(charVal.length() > cti.getLength()) {
handleOutOfRangeValue(pigObj, hcatFS);
return null;
}
return new HiveChar(charVal, cti.getLength());
case VARCHAR:
String varcharVal = (String)pigObj;
VarcharTypeInfo vti = (VarcharTypeInfo)hcatFS.getTypeInfo();
if(varcharVal.length() > vti.getLength()) {
handleOutOfRangeValue(pigObj, hcatFS);
return null;
}
return new HiveVarchar(varcharVal, vti.getLength());
case TIMESTAMP:
DateTime dt = (DateTime)pigObj;
return new Timestamp(dt.getMillis());//getMillis() returns UTC time regardless of TZ
case DATE:
/**
* We ignore any TZ setting on Pig value since java.sql.Date doesn't have it (in any
* meaningful way). So the assumption is that if Pig value has 0 time component (midnight)
* we assume it reasonably 'fits' into a Hive DATE. If time part is not 0, it's considered
* out of range for target type.
*/
DateTime dateTime = ((DateTime)pigObj);
if(dateTime.getMillisOfDay() != 0) {
handleOutOfRangeValue(pigObj, hcatFS, "Time component must be 0 (midnight) in local timezone; Local TZ val='" + pigObj + "'");
return null;
}
/*java.sql.Date is a poorly defined API. Some (all?) SerDes call toString() on it
[e.g. LazySimpleSerDe, uses LazyUtils.writePrimitiveUTF8()], which automatically adjusts
for local timezone. Date.valueOf() also uses local timezone (as does Date(int,int,int).
Also see PigHCatUtil#extractPigObject() for corresponding read op. This way a DATETIME from Pig,
when stored into Hive and read back comes back with the same value.*/
return new Date(dateTime.getYear() - 1900, dateTime.getMonthOfYear() - 1, dateTime.getDayOfMonth());
default:
throw new BackendException("Unexpected HCat type " + type + " for value " + pigObj
+ " of class " + pigObj.getClass().getName(), PigHCatUtil.PIG_EXCEPTION_CODE);
}
} catch (BackendException e) {
// provide the path to the field in the error message
throw new BackendException(
(hcatFS.getName() == null ? " " : hcatFS.getName() + ".") + e.getMessage(), e);
}
}
private void handleOutOfRangeValue(Object pigObj, HCatFieldSchema hcatFS) throws BackendException {
handleOutOfRangeValue(pigObj, hcatFS, null);
}
/**
* depending on user config, throws an exception or logs a msg if the incoming Pig value is
* out-of-range for target type.
* @param additionalMsg may be {@code null}
*/
private void handleOutOfRangeValue(Object pigObj, HCatFieldSchema hcatFS, String additionalMsg) throws BackendException {
String msg = "Pig value '" + pigObj + "' is outside the bounds of column " + hcatFS.getName() +
" with type " + (hcatFS.getTypeInfo() == null ? hcatFS.getType() : hcatFS.getTypeInfo().getTypeName()) +
(additionalMsg == null ? "" : "[" + additionalMsg + "]");
switch (onOutOfRange) {
case Throw:
throw new BackendException(msg, PigHCatUtil.PIG_EXCEPTION_CODE);
case Null:
dataLossLogger.logDataLossMsg(hcatFS, pigObj, msg);
break;
default:
throw new BackendException("Unexpected " + ON_OOR_VALUE_OPT + " value: '" + onOutOfRange + "'");
}
}
@Override
public String relToAbsPathForStoreLocation(String location, Path curDir) throws IOException {
// Need to necessarily override this method since default impl assumes HDFS
// based location string.
return location;
}
@Override
public void setStoreFuncUDFContextSignature(String signature) {
sign = signature;
}
protected void doSchemaValidations(Schema pigSchema, HCatSchema tblSchema) throws FrontendException, HCatException {
// Iterate through all the elements in Pig Schema and do validations as
// dictated by semantics, consult HCatSchema of table when need be.
int columnPos = 0;//helps with debug messages
for (FieldSchema pigField : pigSchema.getFields()) {
HCatFieldSchema hcatField = getColFromSchema(pigField.alias, tblSchema);
validateSchema(pigField, hcatField, pigSchema, tblSchema, columnPos++);
}
try {
PigHCatUtil.validateHCatTableSchemaFollowsPigRules(tblSchema);
} catch (IOException e) {
throw new FrontendException("HCatalog schema is not compatible with Pig: " + e.getMessage(), PigHCatUtil.PIG_EXCEPTION_CODE, e);
}
}
/**
* This method encodes which Pig type can map (be stored in) to which HCat type.
* @throws HCatException
* @throws FrontendException
*/
private void validateSchema(FieldSchema pigField, HCatFieldSchema hcatField,
Schema topLevelPigSchema, HCatSchema topLevelHCatSchema,
int columnPos)
throws HCatException, FrontendException {
validateAlias(pigField.alias);
byte type = pigField.type;
if (DataType.isComplex(type)) {
switch (type) {
case DataType.MAP:
if (hcatField != null) {
if (hcatField.getMapKeyType() != Type.STRING) {
throw new FrontendException("Key Type of map must be String " + hcatField, PigHCatUtil.PIG_EXCEPTION_CODE);
}
// Map values can be primitive or complex
}
break;
case DataType.BAG:
HCatSchema arrayElementSchema = hcatField == null ? null : hcatField.getArrayElementSchema();
for (FieldSchema innerField : pigField.schema.getField(0).schema.getFields()) {
validateSchema(innerField, getColFromSchema(pigField.alias, arrayElementSchema),
topLevelPigSchema, topLevelHCatSchema, columnPos);
}
break;
case DataType.TUPLE:
HCatSchema structSubSchema = hcatField == null ? null : hcatField.getStructSubSchema();
for (FieldSchema innerField : pigField.schema.getFields()) {
validateSchema(innerField, getColFromSchema(pigField.alias, structSubSchema),
topLevelPigSchema, topLevelHCatSchema, columnPos);
}
break;
default:
throw new FrontendException("Internal Error.", PigHCatUtil.PIG_EXCEPTION_CODE);
}
}
else if(hcatField != null) {
//there is no point trying to validate further if we have no type info about target field
switch (type) {
case DataType.BIGDECIMAL:
throwTypeMismatchException(type, Lists.newArrayList(Type.DECIMAL), hcatField, columnPos);
break;
case DataType.DATETIME:
throwTypeMismatchException(type, Lists.newArrayList(Type.TIMESTAMP, Type.DATE), hcatField, columnPos);
break;
case DataType.BYTEARRAY:
throwTypeMismatchException(type, Lists.newArrayList(Type.BINARY), hcatField, columnPos);
break;
case DataType.BIGINTEGER:
throwTypeMismatchException(type, Collections.<Type>emptyList(), hcatField, columnPos);
break;
case DataType.BOOLEAN:
throwTypeMismatchException(type, Lists.newArrayList(Type.BOOLEAN), hcatField, columnPos);
break;
case DataType.CHARARRAY:
throwTypeMismatchException(type, Lists.newArrayList(Type.STRING, Type.CHAR, Type.VARCHAR),
hcatField, columnPos);
break;
case DataType.DOUBLE:
throwTypeMismatchException(type, Lists.newArrayList(Type.DOUBLE), hcatField, columnPos);
break;
case DataType.FLOAT:
throwTypeMismatchException(type, Lists.newArrayList(Type.FLOAT), hcatField, columnPos);
break;
case DataType.INTEGER:
throwTypeMismatchException(type, Lists.newArrayList(Type.INT, Type.BIGINT,
Type.TINYINT, Type.SMALLINT), hcatField, columnPos);
break;
case DataType.LONG:
throwTypeMismatchException(type, Lists.newArrayList(Type.BIGINT), hcatField, columnPos);
break;
default:
throw new FrontendException("'" + type +
"' Pig datatype in column " + columnPos + "(0-based) is not supported by HCat",
PigHCatUtil.PIG_EXCEPTION_CODE);
}
}
else {
if(false) {
//see HIVE-6194
throw new FrontendException("(pigSch,hcatSchema)=(" + pigField + "," +
"" + hcatField + ") (topPig, topHcat)=(" + topLevelPigSchema + "," +
"" + topLevelHCatSchema + ")");
}
}
}
private static void throwTypeMismatchException(byte pigDataType,
List<Type> hcatRequiredType, HCatFieldSchema hcatActualField,
int columnPos) throws FrontendException {
if(!hcatRequiredType.contains(hcatActualField.getType())) {
throw new FrontendException(
"Pig '" + DataType.findTypeName(pigDataType) + "' type in column " +
columnPos + "(0-based) cannot map to HCat '" +
hcatActualField.getType() + "'type. Target filed must be of HCat type {" +
StringUtils.join(hcatRequiredType, " or ") + "}");
}
}
private void validateAlias(String alias) throws FrontendException {
if (alias == null) {
throw new FrontendException("Column name for a field is not specified. Please provide the full schema as an argument to HCatStorer.", PigHCatUtil.PIG_EXCEPTION_CODE);
}
if (alias.matches(".*[A-Z]+.*")) {
throw new FrontendException("Column names should all be in lowercase. Invalid name found: " + alias, PigHCatUtil.PIG_EXCEPTION_CODE);
}
}
// Finds column by name in HCatSchema, if not found returns null.
private HCatFieldSchema getColFromSchema(String alias, HCatSchema tblSchema) {
if (tblSchema != null) {
for (HCatFieldSchema hcatField : tblSchema.getFields()) {
if (hcatField != null && hcatField.getName() != null && hcatField.getName().equalsIgnoreCase(alias)) {
return hcatField;
}
}
}
// Its a new column
return null;
}
@Override
public void cleanupOnFailure(String location, Job job) throws IOException {
// No-op.
}
@Override
public void storeStatistics(ResourceStatistics stats, String arg1, Job job) throws IOException {
}
/**
* todo: when job is complete, should print the msgCount table to log
*/
private static final class DataLossLogger {
private static final Map<String, Integer> msgCount = new HashMap<String, Integer>();
private static String getColumnTypeKey(HCatFieldSchema fieldSchema) {
return fieldSchema.getName() + "_" + (fieldSchema.getTypeInfo() == null ?
fieldSchema.getType() : fieldSchema.getTypeInfo());
}
private void logDataLossMsg(HCatFieldSchema fieldSchema, Object pigOjb, String msg) {
String key = getColumnTypeKey(fieldSchema);
if(!msgCount.containsKey(key)) {
msgCount.put(key, 0);
LOG.warn(msg + " " + "Will write NULL instead. Only 1 such message per type/column is emitted.");
}
msgCount.put(key, msgCount.get(key) + 1);
}
}
}