/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.exec;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.commons.logging.Log;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.FileUtils;
import org.apache.hadoop.hive.ql.io.HiveFileFormatUtils;
import org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat;
import org.apache.hadoop.hive.ql.io.HiveKey;
import org.apache.hadoop.hive.ql.io.HiveOutputFormat;
import org.apache.hadoop.hive.ql.io.HivePartitioner;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.DynamicPartitionCtx;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.FileSinkDesc;
import org.apache.hadoop.hive.ql.plan.PlanUtils;
import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.hive.ql.plan.api.OperatorType;
import org.apache.hadoop.hive.ql.stats.StatsPublisher;
import org.apache.hadoop.hive.ql.stats.StatsSetupConst;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.Serializer;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.ObjectInspectorCopyOption;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.SubStructObjectInspector;
import org.apache.hadoop.hive.shims.ShimLoader;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.util.ReflectionUtils;
/**
* File Sink operator implementation.
**/
public class FileSinkOperator extends TerminalOperator<FileSinkDesc> implements
Serializable {
// Maps a dynamic-partition directory name to the FSPaths tracking its output files.
// When dynamic partitioning is off, the single FSPaths is stored under the empty-string key.
protected transient HashMap<String, FSPaths> valToPaths;
// Number of dynamic-partition columns, read from dpCtx.getNumDPCols().
protected transient int numDynParts;
// Names of the dynamic-partition columns, read from dpCtx.getDPColNames().
protected transient List<String> dpColNames;
// Dynamic-partition context taken from the descriptor; dpSetup() is only run when it is non-null.
protected transient DynamicPartitionCtx dpCtx;
// Value of conf.getCompressed(), captured in initializeOp.
protected transient boolean isCompressed;
// Temp path derived from conf.getDirName() via Utilities.toTempPath.
protected transient Path parent;
// Instance of the table's output file format class, created in initializeOp.
protected transient HiveOutputFormat<?, ?> hiveOutputFormat;
// Path built from conf.getDirName(); also used to obtain the FileSystem.
protected transient Path specPath;
protected transient int dpStartCol; // start column # for DP columns
protected transient List<String> dpVals; // array of values corresponding to DP columns
// DP column values copied out of the current input row; cleared and refilled per row in processOp.
protected transient List<Object> dpWritables;
protected transient RecordWriter[] rowOutWriters; // row specific RecordWriters
// Per-node limit on dynamic partitions, read from dpCtx.getMaxPartitionsPerNode().
protected transient int maxPartitions;
// Fatal error messages, indexed by the counter code passed to fatalErrorMessage().
private static final transient String[] FATAL_ERR_MSG = {
null, // counter value 0 means no error
"Number of dynamic partitions exceeded hive.exec.max.dynamic.partitions.pernode."
};
/**
* RecordWriter.
*
*/
public static interface RecordWriter {
void write(Writable w) throws IOException;
void close(boolean abort) throws IOException;
}
/**
 * Bookkeeping for one output directory: the temp directory, the per-file temporary and
 * final paths, the record writers, and the row statistics.
 *
 * This is an inner (non-static) class because the array sizes come from the enclosing
 * operator's numFiles and progress is reported through the enclosing updateProgress().
 */
public class FSPaths implements Cloneable {
  Path tmpPath;
  Path[] outPaths;
  Path[] finalPaths;
  RecordWriter[] outWriters;
  Stat stat;

  /** Creates an empty instance; all fields are left unset. */
  public FSPaths() {
  }

  /**
   * Creates an instance whose temp path is derived from specPath and whose arrays have
   * one slot per file written by this operator (numFiles).
   *
   * @param specPath the output directory of the sink
   */
  public FSPaths(Path specPath) {
    tmpPath = Utilities.toTempPath(specPath);
    outPaths = new Path[numFiles];
    finalPaths = new Path[numFiles];
    outWriters = new RecordWriter[numFiles];
    stat = new Stat();
  }

  /**
   * Append a subdirectory to the tmp path.
   *
   * @param dp subdirectory name
   */
  public void appendTmpPath(String dp) {
    tmpPath = new Path(tmpPath, dp);
  }

  /**
   * Computes the temporary output path for a task under this object's tmpPath.
   * Does not modify outPaths.
   *
   * @param taskId the task id used as the file name
   * @return the temporary output path
   */
  public Path getOutPath(String taskId) {
    return getOutPath(taskId, this.tmpPath);
  }

  /**
   * Computes the temporary output path for a task under the given directory.
   * Does not modify outPaths.
   *
   * @param taskId the task id used as the file name
   * @param tmp the directory the file is placed in
   * @return the temporary output path
   */
  public Path getOutPath(String taskId, Path tmp) {
    return new Path(tmp, Utilities.toTempPath(taskId));
  }

  /**
   * Computes the final path for a task under this object's tmpPath, without extension.
   * Does not modify finalPaths.
   *
   * @param taskId the task id used as the file name
   * @return the final path
   */
  public Path getFinalPath(String taskId) {
    return getFinalPath(taskId, this.tmpPath, null);
  }

  /**
   * Computes the final path for a task under the given directory.
   * Does not modify finalPaths.
   *
   * @param taskId the task id used as the file name
   * @param tmpPath the directory the file is placed in
   * @param extension suffix appended to the task id, or null for none
   * @return the final path
   */
  public Path getFinalPath(String taskId, Path tmpPath, String extension) {
    if (extension != null) {
      return new Path(tmpPath, taskId + extension);
    } else {
      return new Path(tmpPath, taskId);
    }
  }

  public void setOutWriters(RecordWriter[] out) {
    outWriters = out;
  }

  public RecordWriter[] getOutWriters() {
    return outWriters;
  }

  /**
   * Closes every non-null writer, reporting progress after each close.
   *
   * @param abort flag forwarded to each writer's close
   * @throws HiveException wrapping the first IOException raised by a close
   */
  public void closeWriters(boolean abort) throws HiveException {
    for (int idx = 0; idx < outWriters.length; idx++) {
      if (outWriters[idx] != null) {
        try {
          outWriters[idx].close(abort);
          updateProgress();
        } catch (IOException e) {
          throw new HiveException(e);
        }
      }
    }
  }

  /**
   * Renames each temporary output file to its final path.
   *
   * @param fs the file system holding the files
   * @throws HiveException if a rename returns false or raises an IOException
   */
  private void commit(FileSystem fs) throws HiveException {
    for (int idx = 0; idx < outPaths.length; ++idx) {
      try {
        if (!fs.rename(outPaths[idx], finalPaths[idx])) {
          throw new HiveException("Unable to rename output to: "
              + finalPaths[idx]);
        }
        updateProgress();
      } catch (IOException e) {
        // Keep the IOException as the cause instead of concatenating its toString()
        // into the message, so the original stack trace is not lost.
        throw new HiveException("Unable to rename output to: "
            + finalPaths[idx], e);
      }
    }
  }

  /**
   * Closes every non-null writer and optionally deletes its temporary output file.
   *
   * @param fs the file system holding the files
   * @param abort flag forwarded to each writer's close
   * @param delete whether to recursively delete each writer's temporary output path
   * @throws HiveException wrapping the first IOException raised by a close or delete
   */
  public void abortWriters(FileSystem fs, boolean abort, boolean delete) throws HiveException {
    for (int idx = 0; idx < outWriters.length; idx++) {
      if (outWriters[idx] != null) {
        try {
          outWriters[idx].close(abort);
          if (delete) {
            fs.delete(outPaths[idx], true);
          }
          updateProgress();
        } catch (IOException e) {
          throw new HiveException(e);
        }
      }
    }
  }
} // class FSPaths
private static final long serialVersionUID = 1L;
// File system of specPath, resolved in initializeOp.
protected transient FileSystem fs;
// Instance of the table's deserializer class, cast to Serializer and initialized with the
// table properties in initializeOp.
protected transient Serializer serializer;
// NOTE(review): not referenced anywhere else in this class as shown; possibly used by subclasses.
protected transient BytesWritable commonKey = new BytesWritable();
// Row-count counter name chosen from conf.getDestTableId(); stays null when the id has no
// matching TableIdEnum constant.
protected transient TableIdEnum tabIdEnum = null;
// Row counter registered in statsMap under tabIdEnum; incremented once per row in processOp.
private transient LongWritable row_count;
// Negation of conf.getTableInfo().isNonNative(), set in initializeOp.
private transient boolean isNativeTable = true;
/**
* The evaluators for the multiFile sprayer. If the table under consideration has 1000 buckets,
* it is not a good idea to start so many reducers - if the maximum number of reducers is 100,
* each reducer can write 10 files - this way we effectively get 1000 files.
*/
private transient ExprNodeEvaluator[] partitionEval;
// conf.getTotalFiles(): upper bound of the bucket loop in createBucketFiles.
private transient int totalFiles;
// conf.getNumFiles(): number of files this task writes; sizes the arrays in FSPaths.
private transient int numFiles;
// conf.isMultiFileSpray(): when true, each row is routed to a writer chosen by bucket number.
private transient boolean multiFileSpray;
// Maps a bucket number to the index of its writer within the FSPaths arrays.
private transient final Map<Integer, Integer> bucketMap = new HashMap<Integer, Integer>();
// Object inspectors returned by initEvaluators for partitionEval.
private transient ObjectInspector[] partitionObjectInspectors;
// Partitioner instantiated from jc.getPartitionerClass(); only set when multiFileSpray is true.
private transient HivePartitioner<HiveKey, Object> prtner;
// Reusable key whose hash code is set before each call into prtner.
private transient final HiveKey key = new HiveKey();
// Configuration passed to initializeOp.
private transient Configuration hconf;
// The single FSPaths used when dynamic partitioning is off.
private transient FSPaths fsp;
// True once dpSetup() finds at least one dynamic-partition column.
private transient boolean bDynParts;
// Inspector over the first dpStartCol fields of the input row; used to serialize rows
// without their dynamic-partition columns.
private transient SubStructObjectInspector subSetOI;
// Half of mapred.healthChecker.script.timeout (default 600000), in msec.
private transient int timeOut; // JT timeout in msec.
// Time (msec) of the last progress report; compared against timeOut in updateProgress().
private transient long lastProgressReport = System.currentTimeMillis();
/**
 * Counter names for per-destination-table row counts.
 *
 * initializeOp resolves destination table id N to the constant named
 * {@code TABLE_ID_<N>_ROWCOUNT} via valueOf, so the constant names are significant.
 */
public enum TableIdEnum {
  TABLE_ID_1_ROWCOUNT,
  TABLE_ID_2_ROWCOUNT,
  TABLE_ID_3_ROWCOUNT,
  TABLE_ID_4_ROWCOUNT,
  TABLE_ID_5_ROWCOUNT,
  TABLE_ID_6_ROWCOUNT,
  TABLE_ID_7_ROWCOUNT,
  TABLE_ID_8_ROWCOUNT,
  TABLE_ID_9_ROWCOUNT,
  TABLE_ID_10_ROWCOUNT,
  TABLE_ID_11_ROWCOUNT,
  TABLE_ID_12_ROWCOUNT,
  TABLE_ID_13_ROWCOUNT,
  TABLE_ID_14_ROWCOUNT,
  TABLE_ID_15_ROWCOUNT
}
// Result of the shim's fileSystemDeleteOnExit call in createBucketFiles; when true, the abort
// path in closeOp does not delete the temporary files explicitly.
protected transient boolean autoDelete = false;
// The configuration as a JobConf: hconf itself when it already is one, otherwise a new JobConf
// wrapping it (test code path).
protected transient JobConf jc;
// Serialized class reported by the serializer (serializer.getSerializedClass()).
Class<? extends Writable> outputClass;
// Task id from Utilities.getTaskId(hconf); may be rewritten inside createBucketFiles.
String taskId;
// Set to true at the end of createBucketFiles; reset to false in initializeOp.
private boolean filesCreated = false;
/**
 * Initializes the sink from its descriptor: resolves the output path and file system,
 * instantiates the output format and serializer, sets up the optional multi-file-spray
 * partitioner, the optional row-count counter and the optional dynamic-partition state.
 *
 * No output file is created here; when dynamic partitioning is off a single FSPaths is
 * registered under the empty-string key and its files are created lazily by
 * processOp/closeOp.
 *
 * @param hconf the job configuration
 * @throws HiveException if any step fails; non-Hive exceptions are logged and wrapped
 */
@Override
protected void initializeOp(Configuration hconf) throws HiveException {
  try {
    this.hconf = hconf;
    filesCreated = false;
    isNativeTable = !conf.getTableInfo().isNonNative();
    multiFileSpray = conf.isMultiFileSpray();
    totalFiles = conf.getTotalFiles();
    numFiles = conf.getNumFiles();
    dpCtx = conf.getDynPartCtx();
    valToPaths = new HashMap<String, FSPaths>();
    taskId = Utilities.getTaskId(hconf);
    specPath = new Path(conf.getDirName());
    fs = specPath.getFileSystem(hconf);
    hiveOutputFormat = conf.getTableInfo().getOutputFileFormatClass().newInstance();
    isCompressed = conf.getCompressed();
    parent = Utilities.toTempPath(conf.getDirName());

    serializer = (Serializer) conf.getTableInfo().getDeserializerClass().newInstance();
    serializer.initialize(null, conf.getTableInfo().getProperties());
    outputClass = serializer.getSerializedClass();

    // Timeout is chosen to make sure that even if one iteration takes more than
    // half of the script.timeout but less than script.timeout, we will still
    // be able to report progress.
    timeOut = hconf.getInt("mapred.healthChecker.script.timeout", 600000)/2;

    if (hconf instanceof JobConf) {
      jc = (JobConf) hconf;
    } else {
      // test code path
      jc = new JobConf(hconf, ExecDriver.class);
    }

    if (multiFileSpray) {
      // One evaluator per partition column; their hash codes pick the bucket in processOp.
      partitionEval = new ExprNodeEvaluator[conf.getPartitionCols().size()];
      int i = 0;
      for (ExprNodeDesc e : conf.getPartitionCols()) {
        partitionEval[i++] = ExprNodeEvaluatorFactory.get(e);
      }
      partitionObjectInspectors = initEvaluators(partitionEval, outputObjInspector);
      prtner = (HivePartitioner<HiveKey, Object>) ReflectionUtils.newInstance(
          jc.getPartitionerClass(), null);
    }

    // Register a row counter when the destination table id maps to a TableIdEnum constant.
    int id = conf.getDestTableId();
    if ((id != 0) && (id <= TableIdEnum.values().length)) {
      String enumName = "TABLE_ID_" + String.valueOf(id) + "_ROWCOUNT";
      tabIdEnum = TableIdEnum.valueOf(enumName);
      row_count = new LongWritable();
      statsMap.put(tabIdEnum, row_count);
    }

    if (dpCtx != null) {
      dpSetup();
    }

    if (!bDynParts) {
      fsp = new FSPaths(specPath);
      // Bucket files are not created here; they are created on the first processOp call or,
      // if no row arrives, in closeOp.
      valToPaths.put("", fsp); // special entry for non-DP case
    }
    initializeChildren(hconf);
  } catch (HiveException e) {
    throw e;
  } catch (Exception e) {
    // Log through the operator's logger rather than printing to stderr.
    LOG.error("Failed to initialize FileSinkOperator", e);
    throw new HiveException(e);
  }
}
/**
 * Set up for dynamic partitioning including a new ObjectInspector for the output row.
 *
 * Reads the DP column metadata from dpCtx. When at least one DP column is present,
 * bDynParts is turned on, dpStartCol is set to the number of input fields that are not DP
 * columns, and subSetOI is built over the first dpStartCol fields of the input struct.
 */
private void dpSetup() {
  this.bDynParts = false;
  this.numDynParts = dpCtx.getNumDPCols();
  this.dpColNames = dpCtx.getDPColNames();
  this.maxPartitions = dpCtx.getMaxPartitionsPerNode();

  // Null-safe: the check below tolerates a null column list, so the assertion must too.
  assert (dpColNames == null ? numDynParts == 0 : numDynParts == dpColNames.size()):
      "number of dynamic paritions should be the same as the size of DP mapping";

  if (dpColNames == null || dpColNames.isEmpty()) {
    return;
  }

  this.bDynParts = true;
  assert inputObjInspectors.length == 1: "FileSinkOperator should have 1 parent, but it has "
      + inputObjInspectors.length;
  StructObjectInspector soi = (StructObjectInspector) inputObjInspectors[0];

  // Count the input fields that are not DP columns; the sub-struct inspector below
  // covers exactly that many leading fields.
  this.dpStartCol = 0;
  for (StructField sf : soi.getAllStructFieldRefs()) {
    if (!dpCtx.getInputToDPCols().containsKey(sf.getFieldName())) {
      this.dpStartCol++;
    }
  }
  assert this.dpStartCol > 0: "new Fields ObjectInspector is empty";

  this.subSetOI = new SubStructObjectInspector(soi, 0, this.dpStartCol);
  this.dpVals = new ArrayList<String>(numDynParts);
  this.dpWritables = new ArrayList<Object>(numDynParts);
}
/**
* Computes the output/final paths and opens the record writers for the given FSPaths.
*
* Loops over totalFiles candidate indexes and fills consecutive slots (filesIdx) of
* fsp.outPaths, fsp.finalPaths and fsp.outWriters. In multi-file-spray mode, candidates that
* belong to another reducer or map to an already-seen bucket are skipped, and bucketMap
* records which slot serves each bucket. Sets filesCreated to true on success.
*
* Side effects on the operator: taskId may be rewritten, bucketMap is populated, autoDelete
* is updated, and the CREATED_FILES counter is incremented per file.
*
* @param fsp the path/writer holder to populate
* @throws HiveException if path computation or writer creation fails
*/
private void createBucketFiles(FSPaths fsp) throws HiveException {
try {
// Next free slot in the fsp arrays.
int filesIdx = 0;
Set<Integer> seenBuckets = new HashSet<Integer>();
for (int idx = 0; idx < totalFiles; idx++) {
// When the exec context carries a file id, it replaces the task id in the file name.
if (this.getExecContext() != null && this.getExecContext().getFileId() != -1) {
LOG.info("replace taskId from execContext ");
taskId = Utilities.replaceTaskIdFromFilename(taskId, this.getExecContext().getFileId());
LOG.info("new taskId: FS " + taskId);
assert !multiFileSpray;
assert totalFiles == 1;
}
if (multiFileSpray) {
// Use the candidate index as the hash code to ask the partitioner where it lands.
key.setHashCode(idx);
// Does this hashcode belong to this reducer
int numReducers = totalFiles/numFiles;
if (numReducers > 1) {
int currReducer = Integer.valueOf(Utilities.getTaskIdFromFilename(Utilities.getTaskId(hconf)));
int reducerIdx = prtner.getPartition(key, null, numReducers);
if (currReducer != reducerIdx) {
continue;
}
}
int bucketNum = prtner.getBucket(key, null, totalFiles);
if (seenBuckets.contains(bucketNum)) {
continue;
}
seenBuckets.add(bucketNum);
// processOp looks up this mapping to pick the writer for a row's bucket.
bucketMap.put(bucketNum, filesIdx);
taskId = Utilities.replaceTaskIdFromFilename(Utilities.getTaskId(hconf), bucketNum);
}
if (isNativeTable) {
fsp.finalPaths[filesIdx] = fsp.getFinalPath(taskId);
LOG.info("Final Path: FS " + fsp.finalPaths[filesIdx]);
fsp.outPaths[filesIdx] = fsp.getOutPath(taskId);
LOG.info("Writing to temp file: FS " + fsp.outPaths[filesIdx]);
} else {
// Non-native tables write to specPath directly; closeOp skips the commit rename for them.
fsp.finalPaths[filesIdx] = fsp.outPaths[filesIdx] = specPath;
}
try {
// The reason to keep these instead of using
// OutputFormat.getRecordWriter() is that
// getRecordWriter does not give us enough control over the file name that
// we create.
// The final path computed above is replaced here for both the DP and non-DP cases.
if (!bDynParts) {
fsp.finalPaths[filesIdx] = HiveFileFormatUtils.getOutputFormatFinalPath(
parent, taskId, jc, hiveOutputFormat, isCompressed, fsp.finalPaths[filesIdx]);
} else {
String extension = null;
if (hiveOutputFormat instanceof HiveIgnoreKeyTextOutputFormat) {
extension = Utilities.getFileExtension(jc, isCompressed);
}
fsp.finalPaths[filesIdx] = fsp.getFinalPath(taskId, fsp.tmpPath, extension);
}
} catch (Exception e) {
// NOTE(review): printStackTrace writes to stderr; consider LOG.error instead.
e.printStackTrace();
throw new HiveException(e);
}
LOG.info("New Final Path: FS " + fsp.finalPaths[filesIdx]);
if (isNativeTable) {
try {
// in recent hadoop versions, use deleteOnExit to clean tmp files.
autoDelete = ShimLoader.getHadoopShims().fileSystemDeleteOnExit(
fs, fsp.outPaths[filesIdx]);
} catch (IOException e) {
throw new HiveException(e);
}
}
Utilities.copyTableJobPropertiesToConf(conf.getTableInfo(), jc);
// Open the record writer for this slot on its temporary output path.
fsp.outWriters[filesIdx] = HiveFileFormatUtils.getHiveRecordWriter(
jc, conf.getTableInfo(), outputClass, conf, fsp.outPaths[filesIdx]);
// increment the CREATED_FILES counter
if (reporter != null) {
reporter.incrCounter(ProgressCounter.CREATED_FILES, 1);
}
filesIdx++;
}
assert filesIdx == numFiles;
// in recent hadoop versions, use deleteOnExit to clean tmp files.
// NOTE(review): this re-registers outPaths[0], which the loop above already registered;
// it looks redundant - confirm before removing.
if (isNativeTable) {
autoDelete = ShimLoader.getHadoopShims().fileSystemDeleteOnExit(fs, fsp.outPaths[0]);
}
} catch (HiveException e) {
throw e;
} catch (Exception e) {
// NOTE(review): printStackTrace writes to stderr; consider LOG.error instead.
e.printStackTrace();
throw new HiveException(e);
}
filesCreated = true;
}
/**
 * Report status to JT so that JT won't kill this task if closing takes too long
 * due to too many files to close and the NN is overloaded.
 *
 * Progress is reported only when a reporter is present and more than timeOut msec have
 * elapsed since lastProgressReport, which is then refreshed.
 *
 * @return true if a new progress update is reported, false otherwise.
 */
private boolean updateProgress() {
  if (reporter == null) {
    return false;
  }
  long sinceLastReport = System.currentTimeMillis() - lastProgressReport;
  if (sinceLastReport <= timeOut) {
    return false;
  }
  reporter.progress();
  lastProgressReport = System.currentTimeMillis();
  return true;
}
// Serialized form of the row currently being written; reassigned on every processOp call.
Writable recordValue;

/**
 * Serializes one input row and writes it to the appropriate record writer.
 *
 * Bucket files are created lazily on the first row when dynamic partitioning is off.
 * With dynamic partitioning, the DP column values select (and, if needed, create) the
 * partition's writers and the row is serialized without its DP columns. With
 * multi-file spray, the hash of the partition columns selects the bucket writer.
 *
 * @param row the input row
 * @param tag the input tag (not used here)
 * @throws HiveException on I/O or serialization failure, or when the row hashes to a
 *         bucket for which this task has no writer
 */
@Override
public void processOp(Object row, int tag) throws HiveException {
  if (!bDynParts && !filesCreated) {
    createBucketFiles(fsp);
  }

  // Since File Sink is a terminal operator, forward is not called - so,
  // maintain the number of output rows explicitly
  if (counterNameToEnum != null) {
    ++outputRows;
    if (outputRows % 1000 == 0) {
      incrCounter(numOutputRowsCntr, outputRows);
      outputRows = 0;
    }
  }

  try {
    updateProgress();

    // if DP is enabled, get the final output writers and prepare the real output row
    assert inputObjInspectors[0].getCategory() == ObjectInspector.Category.STRUCT:
        "input object inspector is not struct";

    if (bDynParts) {
      // copy the DP column values from the input row to dpVals
      dpVals.clear();
      dpWritables.clear();
      ObjectInspectorUtils.partialCopyToStandardObject(dpWritables, row, dpStartCol, numDynParts,
          (StructObjectInspector) inputObjInspectors[0], ObjectInspectorCopyOption.WRITABLE);
      // get a set of RecordWriter based on the DP column values
      // pass the null value along to the escaping process to determine what the dir should be
      for (Object o: dpWritables) {
        if (o == null || o.toString().length() == 0) {
          dpVals.add(dpCtx.getDefaultPartitionName());
        } else {
          dpVals.add(o.toString());
        }
      }
      rowOutWriters = getDynOutWriters(dpVals);
      // use SubStructObjectInspector to serialize the non-partitioning columns in the input row
      recordValue = serializer.serialize(row, subSetOI);
    } else {
      rowOutWriters = fsp.outWriters;
      if (conf.isGatherStats()) {
        fsp.stat.increaseNumRows(1);
      }
      // use SerDe to serialize r, and write it out
      recordValue = serializer.serialize(row, inputObjInspectors[0]);
    }

    if (row_count != null) {
      row_count.set(row_count.get() + 1);
    }

    if (!multiFileSpray) {
      rowOutWriters[0].write(recordValue);
    } else {
      int keyHashCode = 0;
      for (int i = 0; i < partitionEval.length; i++) {
        Object o = partitionEval[i].evaluate(row);
        keyHashCode = keyHashCode * 31
            + ObjectInspectorUtils.hashCode(o, partitionObjectInspectors[i]);
      }
      key.setHashCode(keyHashCode);
      int bucketNum = prtner.getBucket(key, null, totalFiles);
      // Look up as Integer: auto-unboxing a missing entry would raise a bare
      // NullPointerException with no hint about which bucket was unmapped.
      Integer idx = bucketMap.get(bucketNum);
      if (idx == null) {
        throw new HiveException("No output file is registered for bucket " + bucketNum
            + " (total files: " + totalFiles + ")");
      }
      rowOutWriters[idx].write(recordValue);
    }
  } catch (IOException e) {
    throw new HiveException(e);
  } catch (SerDeException e) {
    throw new HiveException(e);
  }
}
/**
 * Returns the record writers for the dynamic partition identified by the given DP values,
 * creating the partition's FSPaths and bucket files the first time the partition is seen.
 *
 * When a new partition is needed and valToPaths already holds more than maxPartitions
 * entries, the fatal-error counter is incremented and fatalError is set; the partition is
 * still created. When stats gathering is enabled, the partition's row count is
 * incremented on every call.
 *
 * @param row the DP column values of the current row
 * @return the writers for that partition, or the non-DP writers if no directory name
 *         could be built
 * @throws HiveException if the bucket files cannot be created
 */
private RecordWriter[] getDynOutWriters(List<String> row) throws HiveException {
  // get the path corresponding to the dynamic partition columns
  String partitionDir = getDynPartDirectory(row, dpColNames, numDynParts);
  if (partitionDir == null) {
    return fsp.outWriters;
  }

  FSPaths partitionPaths = valToPaths.get(partitionDir);
  if (partitionPaths == null) {
    // check # of dp
    if (valToPaths.size() > maxPartitions) {
      // throw fatal error
      incrCounter(fatalErrorCntr, 1);
      fatalError = true;
      LOG.error("Fatal error was thrown due to exceeding number of dynamic partitions");
    }
    partitionPaths = new FSPaths(specPath);
    partitionPaths.appendTmpPath(partitionDir);
    createBucketFiles(partitionPaths);
    valToPaths.put(partitionDir, partitionPaths);
  }

  if (conf.isGatherStats()) {
    partitionPaths.stat.increaseNumRows(1);
  }
  return partitionPaths.outWriters;
}
/**
 * Builds the relative partition path for a row from its dynamic-partition values,
 * e.g., ds=2008-04-08/hr=11.
 *
 * @param row the DP column values of the current row
 * @param dpColNames the DP column names, in the same order as the values
 * @param numDynParts the expected number of DP columns
 * @return the partition name produced by FileUtils.makePartName
 */
private String getDynPartDirectory(List<String> row, List<String> dpColNames, int numDynParts) {
  assert row.size() == numDynParts && numDynParts == dpColNames.size():
      "data length is different from num of DP columns";
  final String partName = FileUtils.makePartName(dpColNames, row);
  return partName;
}
/**
 * Appends a description of a fatal error to errMsg, prefixed with this operator's id.
 *
 * The counter code is used as an index into FATAL_ERR_MSG. Codes outside the table
 * (including negative ones) and codes whose table entry is null produce the generic
 * text "fatal error".
 *
 * @param errMsg the buffer to append to
 * @param counterCode the fatal error counter value
 */
@Override
protected void fatalErrorMessage(StringBuilder errMsg, long counterCode) {
  errMsg.append("Operator ").append(getOperatorId()).append(" (id=").append(id).append("): ");
  String msg = null;
  // Bounds-check both ends: a negative code must not index the array.
  if (counterCode >= 0 && counterCode < FATAL_ERR_MSG.length) {
    msg = FATAL_ERR_MSG[(int) counterCode];
  }
  // Entry 0 is null ("no error"); avoid appending the literal text "null".
  errMsg.append(msg != null ? msg : "fatal error");
}
/**
 * Closes all writers held by this operator.
 *
 * If no file was created yet and dynamic partitioning is off, the bucket files are
 * created first. On a normal close every FSPaths has its writers closed and, for native
 * tables, its files renamed to their final paths; stats are then published if enabled.
 * On abort every FSPaths has its writers aborted, deleting the temporary files unless
 * deleteOnExit is active or the table is non-native.
 *
 * @param abort true if the task is being aborted
 * @throws HiveException if closing, committing or aborting fails
 */
@Override
public void closeOp(boolean abort) throws HiveException {
  if (!(bDynParts || filesCreated)) {
    createBucketFiles(fsp);
  }

  lastProgressReport = System.currentTimeMillis();

  if (abort) {
    // Will come here if an Exception was thrown in map() or reduce().
    // Hadoop always call close() even if an Exception was thrown in map() or
    // reduce().
    final boolean deleteOutput = !autoDelete && isNativeTable;
    for (FSPaths paths : valToPaths.values()) {
      paths.abortWriters(fs, abort, deleteOutput);
    }
    return;
  }

  for (FSPaths paths : valToPaths.values()) {
    paths.closeWriters(abort);
    if (isNativeTable) {
      paths.commit(fs);
    }
  }
  // Only publish stats if this operator's flag was set to gather stats
  if (conf.isGatherStats()) {
    publishStats();
  }
}
/**
 * @return the name of the operator, "FS"
 */
@Override
public String getName() {
  // Return the literal directly; wrapping it in new String(...) only allocates a copy.
  return "FS";
}
/**
 * Job-level close: for native tables with a descriptor, moves (or, on failure, removes)
 * the temporary output via mvFileToFinalPath, then delegates to the superclass.
 *
 * @param hconf the job configuration
 * @param success whether the job succeeded
 * @param feedBack feedback object passed through to the superclass
 * @throws HiveException if moving the output fails
 */
@Override
public void jobClose(Configuration hconf, boolean success, JobCloseFeedBack feedBack)
    throws HiveException {
  final boolean moveOutput = (conf != null) && isNativeTable;
  if (moveOutput) {
    try {
      mvFileToFinalPath(conf.getDirName(), hconf, success, LOG, conf.getDynPartCtx());
    } catch (IOException e) {
      throw new HiveException(e);
    }
  }
  super.jobClose(hconf, success, feedBack);
}
/**
 * Publishes or discards the temporary output directory of a finished job.
 *
 * On failure the temp directory is deleted recursively. On success, if the temp directory
 * exists, it is renamed to an ".intermediate" sibling, cleaned of temp/duplicate files,
 * completed with empty bucket files where needed, and finally moved to specPath.
 *
 * @param specPath the final output directory
 * @param hconf the job configuration
 * @param success whether the job succeeded
 * @param log the logger to report the moves to
 * @param dpCtx dynamic partition context, passed to the duplicate-file cleanup
 * @throws IOException on file system errors
 * @throws HiveException if a rename or the empty-bucket creation fails
 */
public void mvFileToFinalPath(String specPath, Configuration hconf,
    boolean success, Log log, DynamicPartitionCtx dpCtx) throws IOException, HiveException {
  FileSystem targetFs = (new Path(specPath)).getFileSystem(hconf);
  Path tmpDir = Utilities.toTempPath(specPath);
  Path stagingDir = new Path(tmpDir.getParent(), tmpDir.getName()
      + ".intermediate");
  Path destination = new Path(specPath);

  if (!success) {
    targetFs.delete(tmpDir, true);
    return;
  }
  if (!targetFs.exists(tmpDir)) {
    return;
  }

  // Step1: rename tmp output folder to intermediate path. After this
  // point, updates from speculative tasks still writing to tmpPath
  // will not appear in finalPath.
  log.info("Moving tmp dir: " + tmpDir + " to: " + stagingDir);
  Utilities.rename(targetFs, tmpDir, stagingDir);

  // Step2: remove any tmp file or double-committed output files
  ArrayList<String> emptyBuckets =
      Utilities.removeTempOrDuplicateFiles(targetFs, stagingDir, dpCtx);
  // create empty buckets if necessary
  if (!emptyBuckets.isEmpty()) {
    createEmptyBuckets(hconf, emptyBuckets);
  }

  // Step3: move to the file destination
  log.info("Moving tmp dir: " + stagingDir + " to: " + destination);
  Utilities.renameOrMoveFiles(targetFs, stagingDir, destination);
}
/**
 * Creates an empty file at each of the given paths by opening a record writer for the
 * table's output format and closing it immediately.
 *
 * @param hconf the job configuration; copied into a fresh JobConf
 * @param paths the paths of the bucket files to create
 * @throws HiveException if the serializer or output format cannot be instantiated
 * @throws IOException if a writer cannot be closed
 */
private void createEmptyBuckets(Configuration hconf, ArrayList<String> paths)
    throws HiveException, IOException {
  // A plain Configuration only occurs on the test code path.
  JobConf jobConf = (hconf instanceof JobConf)
      ? new JobConf(hconf)
      : new JobConf(hconf, ExecDriver.class);

  HiveOutputFormat<?, ?> outputFormat = null;
  Class<? extends Writable> serializedClass = null;
  boolean compressed = conf.getCompressed();
  TableDesc tableInfo = conf.getTableInfo();

  try {
    Serializer tableSerializer = (Serializer) tableInfo.getDeserializerClass().newInstance();
    tableSerializer.initialize(null, tableInfo.getProperties());
    serializedClass = tableSerializer.getSerializedClass();
    outputFormat = tableInfo.getOutputFileFormatClass().newInstance();
  } catch (SerDeException e) {
    throw new HiveException(e);
  } catch (InstantiationException e) {
    throw new HiveException(e);
  } catch (IllegalAccessException e) {
    throw new HiveException(e);
  }

  for (String bucketFile : paths) {
    Path path = new Path(bucketFile);
    RecordWriter emptyWriter = HiveFileFormatUtils.getRecordWriter(
        jobConf, outputFormat, serializedClass, compressed, tableInfo.getProperties(), path);
    emptyWriter.close(false);
    LOG.info("created empty bucket for enforcing bucketing at " + path);
  }
}
/**
* @return the operator type of this operator, OperatorType.FILESINK
*/
@Override
public OperatorType getType() {
return OperatorType.FILESINK;
}
/**
 * Passes this sink's table descriptor to
 * PlanUtils.configureTableJobPropertiesForStorageHandler.
 */
@Override
public void augmentPlan() {
  TableDesc sinkTable = getConf().getTableInfo();
  PlanUtils.configureTableJobPropertiesForStorageHandler(sinkTable);
}
/**
 * Publishes the row count of every output directory tracked in valToPaths.
 *
 * If no publisher is available or it cannot connect, an error is logged and the method
 * returns without publishing, so that stats gathering never blocks the main query.
 * The publisher connection is closed even if publishing raises an exception.
 */
private void publishStats() {
  // Initializing a stats publisher
  StatsPublisher statsPublisher = Utilities.getStatsPublisher(jc);
  if (statsPublisher == null || !statsPublisher.connect(hconf)) {
    // just return, stats gathering should not block the main query
    LOG.error("StatsPublishing error: cannot connect to database");
    return;
  }

  try {
    String taskID = Utilities.getTaskIdFromFilename(Utilities.getTaskId(hconf));
    String staticSpec = conf.getStaticSpec();
    String spSpec = staticSpec != null ? staticSpec : "";

    for (Map.Entry<String, FSPaths> entry : valToPaths.entrySet()) {
      String fspKey = entry.getKey();
      // construct the key(fileID) to insert into the intermediate stats table
      String statsKey;
      // Compare by value: reference equality on strings only works by accident of interning.
      if ("".equals(fspKey)) {
        // for non-partitioned/static partitioned table, the key for temp storage is
        // common key prefix + static partition spec + taskID
        statsKey = conf.getStatsAggPrefix() + spSpec + taskID;
      } else {
        // for partitioned table, the key is
        // common key prefix + static partition spec + DynamicPartSpec + taskID
        statsKey = conf.getStatsAggPrefix() + spSpec + fspKey + Path.SEPARATOR + taskID;
      }
      statsPublisher.publishStat(statsKey, StatsSetupConst.ROW_COUNT,
          Long.toString(entry.getValue().stat.getNumRows()));
    }
  } finally {
    statsPublisher.closeConnection();
  }
}
}