
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.hadoop.hive.ql.exec;

import java.io.File;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Random;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.mapJoinDesc;
import org.apache.hadoop.hive.ql.plan.tableDesc;
import org.apache.hadoop.hive.ql.util.jdbm.RecordManager;
import org.apache.hadoop.hive.ql.util.jdbm.RecordManagerFactory;
import org.apache.hadoop.hive.ql.util.jdbm.RecordManagerOptions;
import org.apache.hadoop.hive.ql.util.jdbm.htree.HTree;
import org.apache.hadoop.hive.serde2.SerDe;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.ObjectInspectorCopyOption;
import org.apache.hadoop.util.ReflectionUtils;

/**
* Map side Join operator implementation.
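*
* The input at position posBigTable is streamed through the mapper, while
* every other (small) input is loaded into a JDBM-backed persistent hash
* table (HTree) keyed by the join key. Each big-table row then probes the
* small tables' hash tables, and the matching rows are joined and forwarded.
* In HiveQL, this operator is typically generated for queries that carry a
* MAPJOIN hint naming the small table(s).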
*/
public class MapJoinOperator extends CommonJoinOperator<mapJoinDesc> implements Serializable {
  private static final long serialVersionUID = 1L;
  private static final Log LOG = LogFactory.getLog(MapJoinOperator.class.getName());

  /**
   * The expressions for each input's join keys.
   */
  transient protected Map<Byte, List<ExprNodeEvaluator>> joinKeys;
  /**
   * The ObjectInspectors for each input's join keys.
   */
  transient protected Map<Byte, List<ObjectInspector>> joinKeysObjectInspectors;
  /**
   * The standard ObjectInspectors for each input's join keys.
   */
  transient protected Map<Byte, List<ObjectInspector>> joinKeysStandardObjectInspectors;

  transient private int posBigTable;       // position of the big table: the one input that is streamed rather than held in memory
  transient int mapJoinRowsKey;            // threshold on the number of rows per key before a warning is logged
 
  transient protected Map<Byte, HTree> mapJoinTables;

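  /**
   * Serialization context for one class of persisted objects: pairs the
   * standard ObjectInspector describing the objects with the SerDe used to
   * write them into, and read them back from, the JDBM hash tables.
   */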
  public static class MapJoinObjectCtx {
    ObjectInspector standardOI;
    SerDe      serde;
   
    /**
     * @param standardOI the standard ObjectInspector for the persisted objects
     * @param serde      the SerDe used to serialize and deserialize them
     */
    public MapJoinObjectCtx(ObjectInspector standardOI, SerDe serde) {
      this.standardOI = standardOI;
      this.serde = serde;
    }
   
    /**
     * @return the standardOI
     */
    public ObjectInspector getStandardOI() {
      return standardOI;
    }

    /**
     * @return the serde
     */
    public SerDe getSerDe() {
      return serde;
    }

  }

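  // Static, per-JVM registry of serialization metadata, keyed by tag.
  // The externalizable MapJoinObjectKey/MapJoinObjectValue wrappers use
  // getMapMetadata() to find the ObjectInspector/SerDe pair they need
  // when persisting to, or reading from, the JDBM hash tables.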
  transient static Map<Integer, MapJoinObjectCtx> mapMetadata = new HashMap<Integer, MapJoinObjectCtx>();
  transient static int nextVal = 0;
 
  static public Map<Integer, MapJoinObjectCtx> getMapMetadata() {
    return mapMetadata;
  }
 
  transient boolean firstRow;
 
  transient int   metadataKeyTag;
  transient int[] metadataValueTag;
  transient List<File> hTables;
  transient int      numMapRowsRead;
  transient int      heartbeatInterval;
 
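  /**
   * Sets up the join-key evaluators and their ObjectInspectors, and creates
   * one disk-backed JDBM hash table (HTree) per small-table alias. Each
   * hash table lives in a scratch directory under /tmp that is deleted
   * again in closeOp().
   */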
  @Override
  protected void initializeOp(Configuration hconf) throws HiveException {
    super.initializeOp(hconf);
    numMapRowsRead = 0;
 
    firstRow = true;
    try {
      heartbeatInterval = HiveConf.getIntVar(hconf, HiveConf.ConfVars.HIVESENDHEARTBEAT);

      joinKeys  = new HashMap<Byte, List<ExprNodeEvaluator>>();
     
      populateJoinKeyValue(joinKeys, conf.getKeys());
      joinKeysObjectInspectors = getObjectInspectorsFromEvaluators(joinKeys, inputObjInspectors);
      joinKeysStandardObjectInspectors = getStandardObjectInspectors(joinKeysObjectInspectors);
       
      // all other tables are small, and are cached in the hash table
      posBigTable = conf.getPosBigTable();

      metadataValueTag = new int[numAliases];
      for (int pos = 0; pos < numAliases; pos++)
        metadataValueTag[pos] = -1;
     
      mapJoinTables = new HashMap<Byte, HTree>();
      hTables = new ArrayList<File>();
     
      // initialize a disk-backed JDBM hash table for each small-table alias
      for (int pos = 0; pos < numAliases; pos++) {
        if (pos == posBigTable)
          continue;
       
        Properties props = new Properties();
        props.setProperty(RecordManagerOptions.CACHE_SIZE,
          String.valueOf(HiveConf.getIntVar(hconf, HiveConf.ConfVars.HIVEMAPJOINCACHEROWS)));
       
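        // pick a unique scratch directory under /tmp for the JDBM record
        // manager; keep trying random names until mkdir succeeds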
        Random rand = new Random();
        File newDir = new File("/tmp/" + rand.nextInt());
        String newDirName = null;
        while (true) {
          if (newDir.mkdir()) {
            newDirName = newDir.getAbsolutePath();
            hTables.add(newDir);
            break;
          }
          newDir = new File("/tmp" + rand.nextInt());
        }
       
        RecordManager recman = RecordManagerFactory.createRecordManager(newDirName + "/" + pos, props );
        HTree hashTable = HTree.createInstance(recman);
       
        mapJoinTables.put(Byte.valueOf((byte)pos), hashTable);
      }

      storage.put((byte)posBigTable, new ArrayList<ArrayList<Object>>());
     
      mapJoinRowsKey = HiveConf.getIntVar(hconf, HiveConf.ConfVars.HIVEMAPJOINROWSIZE);
     
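      // if column pruning removed some output columns, narrow the output
      // ObjectInspector to just the retained fields of each alias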
      List<? extends StructField> structFields = ((StructObjectInspector)outputObjInspector).getAllStructFieldRefs();
      if (conf.getOutputColumnNames().size() < structFields.size()) {
        List<ObjectInspector> structFieldObjectInspectors = new ArrayList<ObjectInspector>();
        for (Byte alias : order) {
          int sz = conf.getExprs().get(alias).size();
          List<Integer> retained = conf.getRetainList().get(alias);
          for (int i = 0; i < sz; i++) {
            int pos = retained.get(i);
            structFieldObjectInspectors.add(structFields.get(pos)
                .getFieldObjectInspector());
          }
        }
        outputObjInspector = ObjectInspectorFactory
            .getStandardStructObjectInspector(conf.getOutputColumnNames(),
                structFieldObjectInspectors);
      }
      initializeChildren(hconf);
    } catch (IOException e) {
      throw new HiveException(e);
    }
  }

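  /**
   * Processes one input row. A row from a small table is appended to the
   * per-key value list in that table's JDBM hash table; a row from the big
   * table probes every small table's hash table, and the matches are handed
   * to checkAndGenObject() to emit the joined output rows.
   */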
  @Override
  public void process(Object row, int tag) throws HiveException {
    try {

      // get alias
      alias = (byte)tag;

      if ((lastAlias == null) || (!lastAlias.equals(alias)))
        nextSz = joinEmitInterval;
     
      // compute keys and values as StandardObjects    
      ArrayList<Object> key   = computeValues(row, joinKeys.get(alias), joinKeysObjectInspectors.get(alias));
      ArrayList<Object> value = computeValues(row, joinValues.get(alias), joinValuesObjectInspectors.get(alias));

      // does this source need to be stored in the hash map
      if (tag != posBigTable) {
        if (firstRow) {
          metadataKeyTag = nextVal++;
         
          tableDesc keyTableDesc = conf.getKeyTblDesc();
          SerDe keySerializer = (SerDe)ReflectionUtils.newInstance(keyTableDesc.getDeserializerClass(), null);
          keySerializer.initialize(null, keyTableDesc.getProperties());

          mapMetadata.put(Integer.valueOf(metadataKeyTag),
              new MapJoinObjectCtx(
                  ObjectInspectorUtils.getStandardObjectInspector(keySerializer.getObjectInspector(),
                      ObjectInspectorCopyOption.WRITABLE),
                  keySerializer));
         
          firstRow = false;
        }
       
        // Send some status periodically
        numMapRowsRead++;
        if (((numMapRowsRead % heartbeatInterval) == 0) && (reporter != null))
          reporter.progress();

        HTree hashTable = mapJoinTables.get(alias);
        MapJoinObjectKey keyMap = new MapJoinObjectKey(metadataKeyTag, key);
        MapJoinObjectValue o = (MapJoinObjectValue)hashTable.get(keyMap);
        ArrayList<ArrayList<Object>> res = null;
       
        if (o == null) {
          res = new ArrayList<ArrayList<Object>>();
        }
        else {
          res = o.getObj();
        }
       
        res.add(value);
 
        if (metadataValueTag[tag] == -1) {
          metadataValueTag[tag] = nextVal++;
                   
          tableDesc valueTableDesc = conf.getValueTblDescs().get(tag);
          SerDe valueSerDe = (SerDe)ReflectionUtils.newInstance(valueTableDesc.getDeserializerClass(), null);
          valueSerDe.initialize(null, valueTableDesc.getProperties());
          mapMetadata.put(Integer.valueOf(metadataValueTag[tag]),
              new MapJoinObjectCtx(
                  ObjectInspectorUtils.getStandardObjectInspector(valueSerDe.getObjectInspector(),
                      ObjectInspectorCopyOption.WRITABLE),
              valueSerDe));
        }
       
        // Construct externalizable objects for key and value
        MapJoinObjectKey keyObj = new MapJoinObjectKey(metadataKeyTag, key);
        MapJoinObjectValue valueObj = new MapJoinObjectValue(metadataValueTag[tag], res);

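        // res.size() > 1 means the key is already present in the hash
        // table, so drop the stale entry before re-inserting the key with
        // the enlarged value list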
        if (res.size() > 1)
          hashTable.remove(keyObj);

        // A single key that accumulates many values grows the map-side hash
        // table; warn once the per-key row count exceeds the configured limit
        if (res.size() > mapJoinRowsKey) {
          LOG.warn("Number of values for key " + keyObj + " is " + res.size());
          LOG.warn("Used memory: " + Runtime.getRuntime().totalMemory());
        }
       
        hashTable.put(keyObj, valueObj);
        return;
      }

      // row from the big table: add its value to this alias's storage
      storage.get(alias).add(value);

      for (Byte pos : order) {
        if (pos.intValue() != tag) {
          MapJoinObjectKey keyMap = new MapJoinObjectKey(metadataKeyTag, key);
          MapJoinObjectValue o = (MapJoinObjectValue)mapJoinTables.get(pos).get(keyMap);

          if (o == null) {
            storage.put(pos, new ArrayList<ArrayList<Object>>());
          }
          else {
            storage.put(pos, o.getObj());
          }
        }
      }
     
      // generate the output records
      checkAndGenObject();
   
      // done with the row
      storage.get(alias).clear();

      for (Byte pos : order)
        if (pos.intValue() != tag)
          storage.put(pos, null);
   
    } catch (SerDeException e) {
      e.printStackTrace();
      throw new HiveException(e);
    } catch (IOException e) {
      e.printStackTrace();
      throw new HiveException(e);
    }
  }
 
  /**
   * Implements the getName function for the Node Interface.
   * @return the name of the operator
   */
  public String getName() {
    return "MAPJOIN";
  }
 
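  /**
   * Deletes the scratch directories backing the JDBM hash tables.
   */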
  public void closeOp(boolean abort) throws HiveException {
    for (File hTbl : hTables) {
      deleteDir(hTbl);
    }
  }
 
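  /**
   * Recursively deletes a directory and everything underneath it.
   */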
  private void deleteDir(File dir) {
    if (dir.isDirectory()) {
      // dir.list() returns null if the directory cannot be read
      String[] children = dir.list();
      if (children != null) {
        for (int i = 0; i < children.length; i++) {
          deleteDir(new File(dir, children[i]));
        }
      }
    }

    dir.delete();
  }
}