/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.hadoop.hive.ql.exec;

import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.FileUtils;
import org.apache.hadoop.hive.ql.io.HiveFileFormatUtils;
import org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat;
import org.apache.hadoop.hive.ql.io.HiveKey;
import org.apache.hadoop.hive.ql.io.HiveOutputFormat;
import org.apache.hadoop.hive.ql.io.HivePartitioner;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.DynamicPartitionCtx;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.FileSinkDesc;
import org.apache.hadoop.hive.ql.plan.PlanUtils;
import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.hive.ql.plan.api.OperatorType;
import org.apache.hadoop.hive.ql.stats.StatsPublisher;
import org.apache.hadoop.hive.ql.stats.StatsSetupConst;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.Serializer;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.ObjectInspectorCopyOption;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.SubStructObjectInspector;
import org.apache.hadoop.hive.shims.ShimLoader;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.util.ReflectionUtils;

/**
* File Sink operator implementation.
**/
public class FileSinkOperator extends TerminalOperator<FileSinkDesc> implements
    Serializable {

  protected transient HashMap<String, FSPaths> valToPaths;
  protected transient int numDynParts;
  protected transient List<String> dpColNames;
  protected transient DynamicPartitionCtx dpCtx;
  protected transient boolean isCompressed;
  protected transient Path parent;
  protected transient HiveOutputFormat<?, ?> hiveOutputFormat;
  protected transient Path specPath;
  protected transient int dpStartCol; // start column # for DP columns
  protected transient List<String> dpVals; // array of values corresponding to DP columns
  protected transient List<Object> dpWritables;
  protected transient RecordWriter[] rowOutWriters;      // row specific RecordWriters
  protected transient int maxPartitions;

  private static final transient String[] FATAL_ERR_MSG = {
    null, // counter value 0 means no error
    "Number of dynamic partitions exceeded hive.exec.max.dynamic.partitions.pernode."
  };

  /**
   * RecordWriter.
   *
   */
  public static interface RecordWriter {
    void write(Writable w) throws IOException;

    void close(boolean abort) throws IOException;
  }
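
  // Illustration: implementations of RecordWriter are obtained through
  // HiveFileFormatUtils.getHiveRecordWriter() below and typically just forward to an
  // underlying file-format writer. A minimal sketch (formatWriter is a placeholder):
  //
  //   RecordWriter rw = new RecordWriter() {
  //     public void write(Writable w) throws IOException { formatWriter.write(w); }
  //     public void close(boolean abort) throws IOException { formatWriter.close(); }
  //   };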

  public class FSPaths implements Cloneable {
    Path tmpPath;
    Path[] outPaths;
    Path[] finalPaths;
    RecordWriter[] outWriters;
    Stat stat;

    public FSPaths() {
    }

    public FSPaths(Path specPath) {
      tmpPath = Utilities.toTempPath(specPath);
      outPaths   = new Path[numFiles];
      finalPaths = new Path[numFiles];
      outWriters = new RecordWriter[numFiles];
      stat = new Stat();
    }

    /**
     * Append a subdirectory to the tmp path.
     * @param dp subdirectory name
     */
    public void appendTmpPath(String dp) {
      tmpPath = new Path(tmpPath, dp);
    }

    /**
     * Build the temporary output path for this task under tmpPath.
     */
    public Path getOutPath(String taskId) {
      return getOutPath(taskId, this.tmpPath);
    }

    /**
     * Build the temporary output path for this task under the given tmp path.
     */
    public Path getOutPath(String taskId, Path tmp) {
      return new Path(tmp, Utilities.toTempPath(taskId));
    }

    /**
     * Build the final output path for this task under tmpPath.
     */
    public Path getFinalPath(String taskId) {
      return getFinalPath(taskId, this.tmpPath, null);
    }

    /**
     * Build the final output path for this task under the given tmp path, optionally with a file extension.
     */
    public Path getFinalPath(String taskId, Path tmpPath, String extension) {
      if (extension != null) {
        return new Path(tmpPath, taskId + extension);
      } else {
        return new Path(tmpPath, taskId);
      }
    }
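
    // Assumed example layout (actual temp naming comes from Utilities.toTempPath):
    // for specPath /warehouse/t/-ext-10000 and taskId 000000_0,
    //   tmpPath              ~ /warehouse/t/_tmp.-ext-10000
    //   getOutPath(taskId)   ~ <tmpPath>/_tmp.000000_0   (written here, renamed in commit())
    //   getFinalPath(taskId) ~ <tmpPath>/000000_0        (rename target used by commit())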

    public void setOutWriters(RecordWriter[] out) {
      outWriters = out;
    }

    public RecordWriter[] getOutWriters() {
      return outWriters;
    }

    public void closeWriters(boolean abort) throws HiveException {
      for (int idx = 0; idx < outWriters.length; idx++) {
        if (outWriters[idx] != null) {
          try {
            outWriters[idx].close(abort);
            updateProgress();
          } catch (IOException e) {
            throw new HiveException(e);
          }
        }
      }
    }

    private void commit(FileSystem fs) throws HiveException {
      for (int idx = 0; idx < outPaths.length; ++idx) {
        try {
          if (!fs.rename(outPaths[idx], finalPaths[idx])) {
            throw new HiveException("Unable to rename output to: "
                + finalPaths[idx]);
          }
          updateProgress();
        } catch (IOException e) {
          throw new HiveException("Unable to rename output to: "
              + finalPaths[idx], e);
        }
      }
    }

    public void abortWriters(FileSystem fs, boolean abort, boolean delete) throws HiveException {
      for (int idx = 0; idx < outWriters.length; idx++) {
        if (outWriters[idx] != null) {
          try {
            outWriters[idx].close(abort);
            if (delete) {
              fs.delete(outPaths[idx], true);
            }
            updateProgress();
          } catch (IOException e) {
            throw new HiveException(e);
          }
        }
      }
    }
  } // class FSPaths

  private static final long serialVersionUID = 1L;
  protected transient FileSystem fs;
  protected transient Serializer serializer;
  protected transient BytesWritable commonKey = new BytesWritable();
  protected transient TableIdEnum tabIdEnum = null;
  private transient LongWritable row_count;
  private transient boolean isNativeTable = true;

  /**
   * The evaluators for the multiFile sprayer. If the table under consideration has 1000 buckets,
   * it is not a good idea to start so many reducers - if the maximum number of reducers is 100,
   * each reducer can write 10 files - this way we effectively get 1000 files.
   */
  private transient ExprNodeEvaluator[] partitionEval;
  private transient int      totalFiles;
  private transient int      numFiles;
  private transient boolean  multiFileSpray;
  private transient final Map<Integer, Integer> bucketMap = new HashMap<Integer, Integer>();
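
  // Worked example for the multi-file sprayer described above: with 1000 buckets and
  // at most 100 reducers, totalFiles = 1000 and numFiles = 10, so each reducer writes
  // 10 of the files; bucketMap maps each bucket number handled by this reducer to a
  // local writer index in [0, numFiles).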

  private transient ObjectInspector[] partitionObjectInspectors;
  private transient HivePartitioner<HiveKey, Object> prtner;
  private transient final HiveKey key = new HiveKey();
  private transient Configuration hconf;
  private transient FSPaths fsp;
  private transient boolean bDynParts;
  private transient SubStructObjectInspector subSetOI;
  private transient int timeOut; // JT timeout in msec.
  private transient long lastProgressReport = System.currentTimeMillis();

  /**
   * TableIdEnum.
   *
   */
  public static enum TableIdEnum {
    TABLE_ID_1_ROWCOUNT,
    TABLE_ID_2_ROWCOUNT,
    TABLE_ID_3_ROWCOUNT,
    TABLE_ID_4_ROWCOUNT,
    TABLE_ID_5_ROWCOUNT,
    TABLE_ID_6_ROWCOUNT,
    TABLE_ID_7_ROWCOUNT,
    TABLE_ID_8_ROWCOUNT,
    TABLE_ID_9_ROWCOUNT,
    TABLE_ID_10_ROWCOUNT,
    TABLE_ID_11_ROWCOUNT,
    TABLE_ID_12_ROWCOUNT,
    TABLE_ID_13_ROWCOUNT,
    TABLE_ID_14_ROWCOUNT,
    TABLE_ID_15_ROWCOUNT;
  }

  protected transient boolean autoDelete = false;
  protected transient JobConf jc;
  Class<? extends Writable> outputClass;
  String taskId;

  private boolean filesCreated = false;
  @Override
  protected void initializeOp(Configuration hconf) throws HiveException {
    try {
      this.hconf       = hconf;
      filesCreated     = false;
      isNativeTable    = !conf.getTableInfo().isNonNative();
      multiFileSpray   = conf.isMultiFileSpray();
      totalFiles       = conf.getTotalFiles();
      numFiles         = conf.getNumFiles();
      dpCtx            = conf.getDynPartCtx();
      valToPaths       = new HashMap<String, FSPaths>();
      taskId           = Utilities.getTaskId(hconf);
      specPath         = new Path(conf.getDirName());
      fs               = specPath.getFileSystem(hconf);
      hiveOutputFormat = conf.getTableInfo().getOutputFileFormatClass().newInstance();
      isCompressed     = conf.getCompressed();
      parent           = Utilities.toTempPath(conf.getDirName());
      serializer       = (Serializer) conf.getTableInfo().getDeserializerClass().newInstance();
      serializer.initialize(null, conf.getTableInfo().getProperties());
      outputClass = serializer.getSerializedClass();

      // Timeout is chosen to make sure that even if one iteration takes more than
      // half of the script.timeout but less than script.timeout, we will still
      // be able to report progress.
      timeOut = hconf.getInt("mapred.healthChecker.script.timeout", 600000)/2;
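      // e.g. with the 600000 ms default above, timeOut is 300000 ms, so updateProgress()
      // sends a new progress report only if at least 5 minutes have passed since the last one.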

      if (hconf instanceof JobConf) {
        jc = (JobConf) hconf;
      } else {
        // test code path
        jc = new JobConf(hconf, ExecDriver.class);
      }

      if (multiFileSpray) {
        partitionEval = new ExprNodeEvaluator[conf.getPartitionCols().size()];
        int i = 0;
        for (ExprNodeDesc e : conf.getPartitionCols()) {
          partitionEval[i++] = ExprNodeEvaluatorFactory.get(e);
        }

        partitionObjectInspectors = initEvaluators(partitionEval, outputObjInspector);
        prtner = (HivePartitioner<HiveKey, Object>) ReflectionUtils.newInstance(
            jc.getPartitionerClass(), null);
      }
      int id = conf.getDestTableId();
      if ((id != 0) && (id <= TableIdEnum.values().length)) {
        String enumName = "TABLE_ID_" + String.valueOf(id) + "_ROWCOUNT";
        tabIdEnum = TableIdEnum.valueOf(enumName);
        row_count = new LongWritable();
        statsMap.put(tabIdEnum, row_count);
      }

      if (dpCtx != null) {
        dpSetup();
      }

      if (!bDynParts) {
        fsp = new FSPaths(specPath);

        // Create all the files - this is required because empty files need to be created for
        // empty buckets
        // createBucketFiles(fsp);
        valToPaths.put("", fsp); // special entry for non-DP case
      }
      initializeChildren(hconf);
    } catch (HiveException e) {
      throw e;
    } catch (Exception e) {
      e.printStackTrace();
      throw new HiveException(e);
    }
  }

  /**
   * Set up for dynamic partitioning including a new ObjectInspector for the output row.
   */
  private void dpSetup() {

    this.bDynParts      = false;
    this.numDynParts    = dpCtx.getNumDPCols();
    this.dpColNames     = dpCtx.getDPColNames();
    this.maxPartitions  = dpCtx.getMaxPartitionsPerNode();

    assert numDynParts == dpColNames.size():
      "number of dynamic paritions should be the same as the size of DP mapping";

    if (dpColNames != null && dpColNames.size() > 0) {
      this.bDynParts = true;
      assert inputObjInspectors.length == 1: "FileSinkOperator should have 1 parent, but it has "
        + inputObjInspectors.length;
      StructObjectInspector soi = (StructObjectInspector) inputObjInspectors[0];
      // remove the last dpMapping.size() columns from the OI
      List<? extends StructField> fieldOI = soi.getAllStructFieldRefs();
      ArrayList<ObjectInspector> newFieldsOI = new ArrayList<ObjectInspector>();
      ArrayList<String> newFieldsName = new ArrayList<String>();
      this.dpStartCol = 0;
      for (StructField sf: fieldOI) {
        String fn = sf.getFieldName();
        if (!dpCtx.getInputToDPCols().containsKey(fn)) {
          newFieldsOI.add(sf.getFieldObjectInspector());
          newFieldsName.add(sf.getFieldName());
          this.dpStartCol++;
        }
      }
      assert newFieldsOI.size() > 0: "new Fields ObjectInspector is empty";

      this.subSetOI = new SubStructObjectInspector(soi, 0, this.dpStartCol);
      this.dpVals = new ArrayList<String>(numDynParts);
      this.dpWritables = new ArrayList<Object>(numDynParts);
    }
  }
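
  // Illustration of dpSetup() with hypothetical column names: for an input row over
  // (key, value, ds, hr) where ds and hr are the dynamic partition columns, dpStartCol
  // becomes 2, subSetOI exposes only (key, value) for serialization, and the (ds, hr)
  // values are copied into dpVals/dpWritables in processOp() to pick the output
  // directory, e.g. ds=2008-04-08/hr=11.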

  private void createBucketFiles(FSPaths fsp) throws HiveException {
    try {
      int filesIdx = 0;
      Set<Integer> seenBuckets = new HashSet<Integer>();
      for (int idx = 0; idx < totalFiles; idx++) {
        if (this.getExecContext() != null && this.getExecContext().getFileId() != -1) {
          LOG.info("replace taskId from execContext ");

          taskId = Utilities.replaceTaskIdFromFilename(taskId, this.getExecContext().getFileId());

          LOG.info("new taskId: FS " + taskId);

          assert !multiFileSpray;
          assert totalFiles == 1;
        }

        if (multiFileSpray) {
          key.setHashCode(idx);

          // Does this hashcode belong to this reducer
          int numReducers = totalFiles/numFiles;

          if (numReducers > 1) {
            int currReducer = Integer.valueOf(Utilities.getTaskIdFromFilename(Utilities.getTaskId(hconf)));

            int reducerIdx = prtner.getPartition(key, null, numReducers);
            if (currReducer != reducerIdx) {
              continue;
            }
          }

          int bucketNum = prtner.getBucket(key, null, totalFiles);
          if (seenBuckets.contains(bucketNum)) {
            continue;
          }
          seenBuckets.add(bucketNum);

          bucketMap.put(bucketNum, filesIdx);
          taskId = Utilities.replaceTaskIdFromFilename(Utilities.getTaskId(hconf), bucketNum);
        }
        if (isNativeTable) {
          fsp.finalPaths[filesIdx] = fsp.getFinalPath(taskId);
          LOG.info("Final Path: FS " + fsp.finalPaths[filesIdx]);
          fsp.outPaths[filesIdx] = fsp.getOutPath(taskId);
          LOG.info("Writing to temp file: FS " + fsp.outPaths[filesIdx]);
        } else {
          fsp.finalPaths[filesIdx] = fsp.outPaths[filesIdx] = specPath;
        }
        try {
          // The reason to keep these instead of using
          // OutputFormat.getRecordWriter() is that
          // getRecordWriter does not give us enough control over the file name that
          // we create.
          if (!bDynParts) {
            fsp.finalPaths[filesIdx] = HiveFileFormatUtils.getOutputFormatFinalPath(
                parent, taskId, jc, hiveOutputFormat, isCompressed, fsp.finalPaths[filesIdx]);
          } else {
            String extension = null;
            if (hiveOutputFormat instanceof HiveIgnoreKeyTextOutputFormat) {
              extension = Utilities.getFileExtension(jc, isCompressed);
            }
            fsp.finalPaths[filesIdx] = fsp.getFinalPath(taskId, fsp.tmpPath, extension);
          }

        } catch (Exception e) {
          e.printStackTrace();
          throw new HiveException(e);
        }
        LOG.info("New Final Path: FS " + fsp.finalPaths[filesIdx]);

        if (isNativeTable) {
          try {
            // in recent hadoop versions, use deleteOnExit to clean tmp files.
            autoDelete = ShimLoader.getHadoopShims().fileSystemDeleteOnExit(
                fs, fsp.outPaths[filesIdx]);
          } catch (IOException e) {
            throw new HiveException(e);
          }
        }

        Utilities.copyTableJobPropertiesToConf(conf.getTableInfo(), jc);
        // create bucket files only if there are no dynamic partitions;
        // buckets for dynamic partitions will be created for each newly created partition
        fsp.outWriters[filesIdx] = HiveFileFormatUtils.getHiveRecordWriter(
            jc, conf.getTableInfo(), outputClass, conf, fsp.outPaths[filesIdx]);
        // increment the CREATED_FILES counter
        if (reporter != null) {
          reporter.incrCounter(ProgressCounter.CREATED_FILES, 1);
        }
        filesIdx++;
      }
      assert filesIdx == numFiles;

      // in recent hadoop versions, use deleteOnExit to clean tmp files.
      if (isNativeTable) {
        autoDelete = ShimLoader.getHadoopShims().fileSystemDeleteOnExit(fs, fsp.outPaths[0]);
      }
    } catch (HiveException e) {
      throw e;
    } catch (Exception e) {
      e.printStackTrace();
      throw new HiveException(e);
    }

    filesCreated = true;
  }

  /**
   * Report status to the JobTracker so that it won't kill this task if closing takes too long
   * due to too many files to close and the NameNode being overloaded.
   *
   * @return true if a new progress update is reported, false otherwise.
   */
  private boolean updateProgress() {
    if (reporter != null &&
        (System.currentTimeMillis() - lastProgressReport) > timeOut) {
      reporter.progress();
      lastProgressReport = System.currentTimeMillis();
      return true;
    } else {
      return false;
    }
  }

  Writable recordValue;

  @Override
  public void processOp(Object row, int tag) throws HiveException {
    if (!bDynParts && !filesCreated) {
      createBucketFiles(fsp);
    }

    // Since File Sink is a terminal operator, forward is not called - so,
    // maintain the number of output rows explicitly
    if (counterNameToEnum != null) {
      ++outputRows;
      if (outputRows % 1000 == 0) {
        incrCounter(numOutputRowsCntr, outputRows);
        outputRows = 0;
      }
    }

    try {
      updateProgress();

      // if DP is enabled, get the final output writers and prepare the real output row
      assert inputObjInspectors[0].getCategory() == ObjectInspector.Category.STRUCT:
        "input object inspector is not struct";

      if (bDynParts) {
        // copy the DP column values from the input row to dpVals
        dpVals.clear();
        dpWritables.clear();
        ObjectInspectorUtils.partialCopyToStandardObject(dpWritables, row, dpStartCol, numDynParts,
            (StructObjectInspector) inputObjInspectors[0], ObjectInspectorCopyOption.WRITABLE);
        // get a set of RecordWriter based on the DP column values
        // pass the null value along to the escaping process to determine what the dir should be
        for (Object o: dpWritables) {
          if (o == null || o.toString().length() == 0) {
            dpVals.add(dpCtx.getDefaultPartitionName());
          } else {
            dpVals.add(o.toString());
          }
        }
        rowOutWriters = getDynOutWriters(dpVals);
        // use SubStructObjectInspector to serialize the non-partitioning columns in the input row
        recordValue = serializer.serialize(row, subSetOI);
      } else {
        rowOutWriters = fsp.outWriters;

        if (conf.isGatherStats()) {
          fsp.stat.increaseNumRows(1);
        }

        // use SerDe to serialize r, and write it out
        recordValue = serializer.serialize(row, inputObjInspectors[0]);
      }

      if (row_count != null) {
        row_count.set(row_count.get() + 1);
      }

      if (!multiFileSpray) {
        rowOutWriters[0].write(recordValue);
      }
      else {
        int keyHashCode = 0;
        for (int i = 0; i < partitionEval.length; i++) {
          Object o = partitionEval[i].evaluate(row);
          keyHashCode = keyHashCode * 31
              + ObjectInspectorUtils.hashCode(o, partitionObjectInspectors[i]);
        }
        key.setHashCode(keyHashCode);
        int bucketNum = prtner.getBucket(key, null, totalFiles);
        int idx = bucketMap.get(bucketNum);
        rowOutWriters[idx].write(recordValue);
      }
    } catch (IOException e) {
      throw new HiveException(e);
    } catch (SerDeException e) {
      throw new HiveException(e);
    }
  }

  private RecordWriter[] getDynOutWriters(List<String> row) throws HiveException {

    RecordWriter[] rw;  // return value

    // get the path corresponding to the dynamic partition columns,
    String dpDir = getDynPartDirectory(row, dpColNames, numDynParts);

    if (dpDir != null) {
      FSPaths fsp2 = valToPaths.get(dpDir);

      if (fsp2 == null) {
        // check # of dp
        if (valToPaths.size() > maxPartitions) {
          // throw fatal error
          incrCounter(fatalErrorCntr, 1);
          fatalError = true;
          LOG.error("Fatal error was thrown due to exceeding number of dynamic partitions");
        }
        fsp2 = new FSPaths(specPath);
        fsp2.tmpPath = new Path(fsp2.tmpPath, dpDir);
        createBucketFiles(fsp2);
        valToPaths.put(dpDir, fsp2);
      }
      if (conf.isGatherStats()) {
        fsp2.stat.increaseNumRows(1);
      }
      rw = fsp2.outWriters;
    } else {
      rw = fsp.outWriters;
    }
    return rw;
  }

  // given the current input row, the mapping for input col info to dp columns, and # of dp cols,
  // return the relative path corresponding to the row.
  // e.g., ds=2008-04-08/hr=11
  private String getDynPartDirectory(List<String> row, List<String> dpColNames, int numDynParts) {
    assert row.size() == numDynParts && numDynParts == dpColNames.size():
      "data length is different from num of DP columns";
    return FileUtils.makePartName(dpColNames, row);
  }

  @Override
  protected void fatalErrorMessage(StringBuilder errMsg, long counterCode) {
    errMsg.append("Operator ").append(getOperatorId()).append(" (id=").append(id).append("): ");
    errMsg.append(counterCode > FATAL_ERR_MSG.length - 1 ?
        "fatal error":
          FATAL_ERR_MSG[(int) counterCode]);
  }

  @Override
  public void closeOp(boolean abort) throws HiveException {

    if (!bDynParts && !filesCreated) {
      createBucketFiles(fsp);
    }

    lastProgressReport = System.currentTimeMillis();
    if (!abort) {
      for (FSPaths fsp: valToPaths.values()) {
        fsp.closeWriters(abort);
        if (isNativeTable) {
          fsp.commit(fs);
        }
      }
      // Only publish stats if this operator's flag was set to gather stats
      if (conf.isGatherStats()) {
        publishStats();
      }
    } else {
      // Will come here if an Exception was thrown in map() or reduce().
      // Hadoop always call close() even if an Exception was thrown in map() or
      // reduce().
      for (FSPaths fsp: valToPaths.values()) {
        fsp.abortWriters(fs, abort, !autoDelete && isNativeTable);
      }
    }
  }

  /**
   * @return the name of the operator
   */
  @Override
  public String getName() {
    return new String("FS");
  }

  @Override
  public void jobClose(Configuration hconf, boolean success, JobCloseFeedBack feedBack)
      throws HiveException {
    try {
      if ((conf != null) && isNativeTable) {
        String specPath = conf.getDirName();
        DynamicPartitionCtx dpCtx = conf.getDynPartCtx();
        mvFileToFinalPath(specPath, hconf, success, LOG, dpCtx);
      }
    } catch (IOException e) {
      throw new HiveException(e);
    }
    super.jobClose(hconf, success, feedBack);
  }

  public void mvFileToFinalPath(String specPath, Configuration hconf,
      boolean success, Log log, DynamicPartitionCtx dpCtx) throws IOException, HiveException {

    FileSystem fs = (new Path(specPath)).getFileSystem(hconf);
    Path tmpPath = Utilities.toTempPath(specPath);
    Path intermediatePath = new Path(tmpPath.getParent(), tmpPath.getName()
        + ".intermediate");
    Path finalPath = new Path(specPath);
    if (success) {
      if (fs.exists(tmpPath)) {
        // Step1: rename tmp output folder to intermediate path. After this
        // point, updates from speculative tasks still writing to tmpPath
        // will not appear in finalPath.
        log.info("Moving tmp dir: " + tmpPath + " to: " + intermediatePath);
        Utilities.rename(fs, tmpPath, intermediatePath);
        // Step2: remove any tmp file or double-committed output files
        ArrayList<String> emptyBuckets =
          Utilities.removeTempOrDuplicateFiles(fs, intermediatePath, dpCtx);
        // create empty buckets if necessary
        if (emptyBuckets.size() > 0) {
          createEmptyBuckets(hconf, emptyBuckets);
        }

        // Step3: move to the file destination
        log.info("Moving tmp dir: " + intermediatePath + " to: " + finalPath);
        Utilities.renameOrMoveFiles(fs, intermediatePath, finalPath);
      }
    } else {
      fs.delete(tmpPath, true);
    }
  }
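
  // Recap of the three steps above: the tmp output dir is first renamed to a sibling
  // ending in ".intermediate" so that speculative tasks still writing to the tmp dir
  // cannot affect the final output, temp and double-committed files are then removed
  // (collecting any buckets that must be created empty), and finally the intermediate
  // dir is moved to the destination specPath.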

  /**
   * Check the existence of buckets according to the bucket specification. Create empty bucket
   * files if needed.
   *
   * @param hconf the configuration used to build the JobConf and the record writers.
   * @param paths the bucket file paths that need to be created as empty files.
   * @throws HiveException
   * @throws IOException
   */
  private void createEmptyBuckets(Configuration hconf, ArrayList<String> paths)
      throws HiveException, IOException {

    JobConf jc;
    if (hconf instanceof JobConf) {
      jc = new JobConf(hconf);
    } else {
      // test code path
      jc = new JobConf(hconf, ExecDriver.class);
    }
    HiveOutputFormat<?, ?> hiveOutputFormat = null;
    Class<? extends Writable> outputClass = null;
    boolean isCompressed = conf.getCompressed();
    TableDesc tableInfo = conf.getTableInfo();
    try {
      Serializer serializer = (Serializer) tableInfo.getDeserializerClass().newInstance();
      serializer.initialize(null, tableInfo.getProperties());
      outputClass = serializer.getSerializedClass();
      hiveOutputFormat = conf.getTableInfo().getOutputFileFormatClass().newInstance();
    } catch (SerDeException e) {
      throw new HiveException(e);
    } catch (InstantiationException e) {
      throw new HiveException(e);
    } catch (IllegalAccessException e) {
      throw new HiveException(e);
    }

    for (String p: paths) {
      Path path = new Path(p);
      RecordWriter writer = HiveFileFormatUtils.getRecordWriter(
          jc, hiveOutputFormat, outputClass, isCompressed, tableInfo.getProperties(), path);
      writer.close(false);
      LOG.info("created empty bucket for enforcing bucketing at " + path);
    }
  }
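
  // Example with hypothetical file names: if the table declares 4 buckets but only
  // 000000_0 and 000002_0 were produced, the missing bucket paths collected by
  // Utilities.removeTempOrDuplicateFiles() in mvFileToFinalPath() are passed here and
  // zero-row files are written for them, enforcing the bucket spec.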

  @Override
  public OperatorType getType() {
    return OperatorType.FILESINK;
  }

  @Override
  public void augmentPlan() {
    PlanUtils.configureTableJobPropertiesForStorageHandler(
        getConf().getTableInfo());
  }

  private void publishStats() {
    // Initializing a stats publisher
    StatsPublisher statsPublisher = Utilities.getStatsPublisher(jc);

    if (statsPublisher == null || !statsPublisher.connect(hconf)) {
      // just return, stats gathering should not block the main query
      LOG.error("StatsPublishing error: cannot connect to database");
      return;
    }

    String taskID = Utilities.getTaskIdFromFilename(Utilities.getTaskId(hconf));
    String spSpec = conf.getStaticSpec() != null ? conf.getStaticSpec() : "";

    for (String fspKey : valToPaths.keySet()) {
      FSPaths fspValue = valToPaths.get(fspKey);
      String key;

      // construct the key(fileID) to insert into the intermediate stats table
      if (fspKey.isEmpty()) {
        // for a non-partitioned/statically partitioned table, the key for temp storage is
        // common key prefix + static partition spec + taskID
        key = conf.getStatsAggPrefix() + spSpec + taskID;
      } else {
        // for partitioned table, the key is
        // common key prefix + static partition spec + DynamicPartSpec + taskID
        key = conf.getStatsAggPrefix() + spSpec + fspKey + Path.SEPARATOR + taskID;
      }
      statsPublisher.publishStat(key, StatsSetupConst.ROW_COUNT, Long.toString(fspValue.stat.getNumRows()));
    }
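    // Example keys with hypothetical values: given statsAggPrefix "counters/", static
    // spec "ds=2008-04-08/", dynamic spec "hr=11" and taskID "000000", the non-DP case
    // publishes under "counters/ds=2008-04-08/000000" and the DP case under
    // "counters/ds=2008-04-08/hr=11/000000", each with a ROW_COUNT value.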
    statsPublisher.closeConnection();
  }
}