Package com.ebay.erl.mobius.core.mapred

Source Code of com.ebay.erl.mobius.core.mapred.AbstractMobiusMapper

package com.ebay.erl.mobius.core.mapred;

import java.io.Closeable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.IllegalFormatException;
import java.util.List;
import java.util.Map;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;

import com.ebay.erl.mobius.core.ConfigureConstants;
import com.ebay.erl.mobius.core.collection.BigTupleList;
import com.ebay.erl.mobius.core.criterion.TupleCriterion;
import com.ebay.erl.mobius.core.datajoin.DataJoinMapper;
import com.ebay.erl.mobius.core.model.ComputedColumns;
import com.ebay.erl.mobius.core.model.KeyTuple;
import com.ebay.erl.mobius.core.model.Tuple;
import com.ebay.erl.mobius.util.SerializableUtil;
import com.ebay.erl.mobius.util.Util;

/**
* Base class for implementing a customized Mobius mapper.
* <p>
*
* Extends this class if the built-in mappers,
* {@link com.ebay.erl.mobius.core.mapred.TSVMapper} and
* {@link com.ebay.erl.mobius.core.mapred.SequenceFileMapper},
* does not meet the needs.
* <p>
*
* This class provides filtering (by taking user specified
* {@link #tuple_criteria}), compute {@link #computedColumns},
* and updating counters.
* <p>
*
* Override the {@link #parse(Object, Object)} method to convert
* the K-V objects into a tuple, then the underlying data source
* can be processed by mobius.
* <p>
*
* <p>
* This product is licensed under the Apache License,  Version 2.0,
* available at http://www.apache.org/licenses/LICENSE-2.0.
*
* This product contains portions derived from Apache hadoop which is
* licensed under the Apache License, Version 2.0, available at
* http://hadoop.apache.org.
*
* © 2007 – 2012 eBay Inc., Evan Chiu, Woody Zhou, Neel Sundaresan
*
* @param <IK> input key type.
* @param <IV> input value type.
*
*/
@SuppressWarnings("deprecation")
public abstract class AbstractMobiusMapper<IK, IV> extends DataJoinMapper<IK, IV, WritableComparable<?>, WritableComparable<?>>
{
 
  /**
   * filters
   */
  protected TupleCriterion tuple_criteria;
 
  /**
   * columns to be emitted as key of this {@link Mapper}
   */
  protected String[] key_columns;
 
  /**
   * columns to be emitted as value of this {@link Mapper}
   */
  protected String[] value_columns;
 
  /**
   * Output column names for map only job, ex: listing.
   */
  protected String[] projection_order;
 
 
  /**
   * The current dataset ID.
   */
  protected Byte currentDatasetID = null;
 
 
  /**
   * The normalized name of the dataset been processed by this
   * mapper currently, it is used as counter ID to update the
   * corresponding Hadoop counters for this dataset.
   * <p>
   *
   * The name is normalized from the {@link #currentDatasetID} by
   * removing the serial number part.
   */
  protected String dataset_display_id;
 
  /**
   * A background thread responsible for updating the
   * Hadoop counters.
   */
  protected CounterUpdateThread counterThread;
 
  /**
   * Counts for the number of input records.
   * <p>
   *
   * #INPUT_RECORDS = #FILTERED_RECORDS + #OUTPUT_RECORDS.
   */
  protected long _COUNTER_INPUT_RECORD;
 
  /**
   * Counts for the number of outputted records.
   */
  protected long _COUNTER_OUTPUT_RECORD;
 
  /**
   * Counts for the number of filtered records,
   * filtered by user specified {@link #tuple_criteria}.
   */
  protected long _COUNTER_FILTERED_RECORD;
 
  /**
   * Counts for invalidate format records.
   */
  protected long _COUNTER_INVALIDATE_FORMAT_RECORD;
 
  /**
   * {@link ComputedColumns} specified by user.
   */
  protected List<ComputedColumns> computedColumns = null;
 
  protected boolean _IS_MAP_ONLY_JOB = false;
 
  public static final long _100MB = 100L*1024L*1024L;
 
  protected boolean reporterSet = false;
 
  private static final Log LOGGER = LogFactory.getLog(AbstractMobiusMapper.class);
 
 
  /**
   * Setup Mapper.
   * <p>
   *
   * Override this method if there is extra initial
   * settings need to be done.
   * <p>
   *
   * Make sure to call <code>super.configure(JobConf)</code>
   * when overriding.
   *
   */
  @SuppressWarnings("unchecked")
  @Override
  public void configure(JobConf conf)
  {
    super.configure(conf);   
   
    this.conf = conf;
   
    this._IS_MAP_ONLY_JOB = this.conf.getInt("mapred.reduce.tasks", 1)==0;
   
    // catch the current dataset ID, the {@link Configuration#get(String)}
    // is costly as it compose Pattern every time.
    this.currentDatasetID = Byte.valueOf(this.conf.get (ConfigureConstants.CURRENT_DATASET_ID));
   
    String[] datasetIDstoNames = this.conf.getStrings(ConfigureConstants.DATASET_ID_TO_NAME_MAPPING);
    Map<Byte, String> mapping = new HashMap<Byte, String>();
    for( String aMapping:datasetIDstoNames )
    {
      int cut = aMapping.indexOf(";");
      Byte datasetID = Byte.parseByte(aMapping.substring(0, cut));
      String datasetDisplayName = aMapping.substring(cut+1);
     
      mapping.put(datasetID, datasetDisplayName);
    }
    if( mapping.size()==0 )
      throw new IllegalArgumentException(ConfigureConstants.DATASET_ID_TO_NAME_MAPPING+" is not set.");
   
    this.dataset_display_id = mapping.get(this.currentDatasetID);
    if( this.dataset_display_id==null )
    {
      throw new IllegalArgumentException("Cannot find display name for datasetID:"+this.currentDatasetID +
          " from "+ConfigureConstants.DATASET_ID_TO_NAME_MAPPING+":"+
          this.conf.get(ConfigureConstants.DATASET_ID_TO_NAME_MAPPING));
    }
   
   
     // initialize counters
    this._COUNTER_INPUT_RECORD        = 0L;
    this._COUNTER_OUTPUT_RECORD        = 0L;
    this._COUNTER_FILTERED_RECORD      = 0L;
    this._COUNTER_INVALIDATE_FORMAT_RECORD   = 0L;
   
    try
    {
      this.key_columns  = (String[])this.conf.getStrings(this.getDatasetID()+".key.columns", Util.ZERO_SIZE_STRING_ARRAY);     
      this.value_columns  = (String[])this.conf.getStrings(this.getDatasetID()+".value.columns", Util.ZERO_SIZE_STRING_ARRAY);
      this.tuple_criteria = (TupleCriterion)this.get("tuple.criteria");
      this.computedColumns= (List<ComputedColumns>)this.get("computed.columns");
     
      if( this._IS_MAP_ONLY_JOB )
      {
        this.projection_order = (String[])this.conf.getStrings(this.getDatasetID()+".columns.in.original.order", Util.ZERO_SIZE_STRING_ARRAY);
      }
    }catch(IOException e)
    {
      e.printStackTrace();
      throw new RuntimeException(e);
    }
  }
 
  /**
   * map()
   */
  @SuppressWarnings("unchecked")
  @Override
  public void joinmap(IK key, IV value, OutputCollector<WritableComparable<?>, WritableComparable<?>> output, Reporter reporter)
    throws IOException
  {
    // initializing counter updating thread, to be run in the background.
    if( this.counterThread==null )
    {
      this.counterThread = new CounterUpdateThread(reporter);
      new Thread(this.counterThread).start();
    }
   
    if (!reporterSet){
      if( this.computedColumns!=null ){
        for( ComputedColumns c:this.computedColumns ){
          c.setReporter(reporter);
        }
      }
      reporterSet = true;
    }
   
    Tuple record = null;
    try
    {
      record = this.parse(key, value);
    }
    catch(IllegalFormatException e)
    {
      this._COUNTER_INVALIDATE_FORMAT_RECORD++;
      this.updateCounter(this.dataset_display_id, "INVALIDATE_RECORDS", this._COUNTER_INVALIDATE_FORMAT_RECORD);
      return;
    }
   
    this._COUNTER_INPUT_RECORD++;
    this.updateCounter(this.dataset_display_id, "INPUT_RECORDS", this._COUNTER_INPUT_RECORD);
   
   
    Iterable<Tuple> rows_to_be_output = new ArrayList<Tuple>();
    ((List<Tuple>)rows_to_be_output).add(record);
   
    // apply computed column if any
    if( this.computedColumns!=null )
    {
      for( ComputedColumns aComputedColumn:this.computedColumns )
      {
        aComputedColumn.reset();
        aComputedColumn.consume(Tuple.immutable(record));
       
        if ( aComputedColumn.getResult()!=null && aComputedColumn.getResult().size()>0 )
        {
          BigTupleList computedResult = aComputedColumn.getResult();
          if( computedResult.size()<5000 )
          {
            // use in memory cross product
            Iterable<Tuple>[] allValues = new Iterable[2];
            allValues[0] = rows_to_be_output;
            allValues[1] = aComputedColumn.getResult();
           
            rows_to_be_output = Util.inMemoryCrossProduct(allValues);
          }
          else
          {
            // computed result is too big, don't use in memory cross
            // product.
            Iterable<Tuple>[] allValues = new Iterable[2];
            allValues[0] = rows_to_be_output;
            allValues[1] = aComputedColumn.getResult();
           
            rows_to_be_output = Util.crossProduct(this.conf, reporter,  allValues);
          }
        }
      }     
    }
   
   
    // apply the criteria if any and prepare output
    for( Tuple aRow:rows_to_be_output)
    {
      Tuple out_key  = this.getKeyTuple(this.key_columns, aRow, null);
      Tuple out_value = null;
      if( !this._IS_MAP_ONLY_JOB )
      {
        // tuple will go to reducer phase, we use the sorted column
        // so the reducer can set the schema back correctly.
        out_value = this.getTuple(this.value_columns, aRow, Tuple.NULL);
      }
      else
      {
        out_value = this.getTuple(this.projection_order, aRow, Tuple.NULL);
      }
     
       
      /**
       * TODO some tuple criteria can be applied earlier, if the
       * columns it is evaluating are not derived.  This can
       * save the time to compute the derived columns, as it might
       * be costly.
       */
      if( this.tuple_criteria!=null )
      {
        // use the aRow as the criteria might use column(s)
        // not within the projection columns (<code>value_columns</code>).
        if ( this.tuple_criteria.accept(aRow, this.conf) )
        {
          outputRecords(out_key, out_value, output);
          this._COUNTER_OUTPUT_RECORD++;
          this.updateCounter(this.dataset_display_id, "OUTPUT_RECORDS", this._COUNTER_OUTPUT_RECORD);
        }
        else
        {
          this._COUNTER_FILTERED_RECORD++;
          this.updateCounter(this.dataset_display_id, "FILTERED_RECORDS", this._COUNTER_FILTERED_RECORD);
        }
      }
      else
      {
        outputRecords(out_key, out_value, output);
        this._COUNTER_OUTPUT_RECORD++;
        this.updateCounter(this.dataset_display_id, "OUTPUT_RECORDS", this._COUNTER_OUTPUT_RECORD);
      }
    }
   
    if( rows_to_be_output instanceof Closeable )
    {
      ((Closeable)rows_to_be_output).close();
    }
  }
 
  protected void outputRecords(Tuple key, Tuple value, OutputCollector<WritableComparable<?>, WritableComparable<?>> output)
    throws IOException
  {
    if( this._IS_MAP_ONLY_JOB )
    {
      // map only job, key is not needed as no join is required.
      output.collect(NullWritable.get(), value);
    }
    else
    {
      if( key==null )
      {
        // should never happen, this is to perform join/group by, but there
        // is no key
        throw new IllegalArgumentException("key for dataset: "+this.getDatasetID()+
            " cannot be empty when performing join/group by.");
      }     
      output.collect(key, value);
    }
  }
 
  /**
   * close Mapper
   */
  @Override
  public void close()
    throws IOException
  {   
    this.counterThread.stop();
  }
 
  /**
   * Parse the input key and input value into {@link Tuple}
   */
  public abstract Tuple parse(IK inkey, IV invalue)
    throws IllegalArgumentException, IOException;
 
 
  /**
   * update certain counter
   */
  protected final void updateCounter(String group, String couter, long number)
  {
    this.counterThread.updateCounter(group, couter, number);
  }
 
 
  /**
   * Get the current dataset ID.
   */
  public final Byte getDatasetID()
  {
    return this.currentDatasetID; 
  }
 
  /**
   * Get object from {@link JobConf}, assuming the value
   * is Base64 encoded, and can be decoded back to Java
   * object.
   * <p>
   *
   * If the value from {@link JobConf} for the given
   * <code>key</code> is null or empty, null is returned.
   */
  protected final Object get(String key) throws IOException
  {
    String value = this.conf.get(this.getDatasetID()+"."+key);
    if( value==null || (value=value.trim()).isEmpty() )
      return null;
   
    return SerializableUtil.deserializeFromBase64(value, this.conf);
  }
 
  private final Tuple getKeyTuple(String[] columns, Tuple record, Tuple defaultValue)
  {
    if( columns==null || columns.length==0 )
      return defaultValue;
    else
    {
      Tuple t = new KeyTuple();
      for( String aColumn:columns )
      {
        t.insert(aColumn, record.get(aColumn));
      }
      return t;
    }
  }
 
  /**
   * retrieve columns from the record tuple, and return
   * a new tuple instance which contains only the specified
   * columns.
   */
  private final Tuple getTuple(String[] columns, Tuple record, Tuple defaultValue)
  {
    if( columns==null || columns.length==0 )
      return defaultValue;
    else
    {
      Tuple t = new Tuple();
      for( String aColumn:columns )
      {
        t.insert(aColumn, record.get(aColumn));
      }
      return t;
    }
  }
 
 
}
TOP

Related Classes of com.ebay.erl.mobius.core.mapred.AbstractMobiusMapper

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.