Package com.ebay.erl.mobius.core.mapred

Source Code of com.ebay.erl.mobius.core.mapred.TSVMapper

package com.ebay.erl.mobius.core.mapred;

import java.io.IOException;
import java.util.IllegalFormatException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;

import com.ebay.erl.mobius.core.builder.TSVDatasetBuilder;
import com.ebay.erl.mobius.core.model.Tuple;
import com.ebay.erl.mobius.util.SerializableUtil;

/**
* A mapper that parses the values (in {@link Text} format)
* into {@link Tuple} elements.
*
* <p>
* This product is licensed under the Apache License,  Version 2.0,
* available at http://www.apache.org/licenses/LICENSE-2.0.
*
* This product contains portions derived from Apache hadoop which is
* licensed under the Apache License, Version 2.0, available at
* http://hadoop.apache.org.
*
* © 2007 – 2012 eBay Inc., Evan Chiu, Woody Zhou, Neel Sundaresan
*/
@SuppressWarnings("deprecation")
public class TSVMapper extends AbstractMobiusMapper<LongWritable, Text>
{
  /**
   * delimiter of the underline file, it is
   * tab by default, and can be changed using
   * {@link TSVDatasetBuilder#setDelimiter(String)}
   *
   */
  protected String delimiter;
 
 
  /**
   * schema of this TSV dataset
   */
  protected String[] schema;
 
  /**
   * {@inheritDoc}
   * <p>
   *
   * setup the {@link #delimiter} and {@link #schema}
   * from Hadoop configuration.
   * <p>
   *
   * {@link #delimiter} is set in {@link TSVDatasetBuilder#setDelimiter(String)}
   * and {@link #schema} is set in {@link TSVDatasetBuilder#newInstance(com.ebay.erl.mobius.core.MobiusJob, String, String[])},
   * both are stored in Hadoop configuration and Mobius
   * retrieve the values from here.
   *
   */
  @Override
  public void configure(JobConf conf)
  {
    super.configure (conf);
   
    ///////////////////////////
    // setup delimiter
    ///////////////////////////
    try
    {
      this.delimiter = this.conf.get(this.getDatasetID()+".delimiter", "\t");
      if ( !this.delimiter.equals("\t") )
      {
        // if the delimiter is not tab, by default, Mobius will encoded the delimiter,
        // attempting to decode it.
        this.delimiter = (String)SerializableUtil.deserializeFromBase64(delimiter, null);
      }
    }
    catch(IOException e)
    {
      throw new RuntimeException("Unable to decode the delimiter["+this.conf.get(this.getDatasetID()+".delimiter")+"]" +
          " for dataset ["+this.getDatasetID()+"] using Base64.");
    }
   
   
    ///////////////////////////
    // setup schema
    ///////////////////////////
    if( this.conf.get(this.getDatasetID()+".schema", "").trim().isEmpty() )
      throw new IllegalArgumentException("Please define schema for dataset:"+this.getDatasetID ());
       
    this.schema = this.conf.get(this.getDatasetID()+".schema", "").split(",");
  }

  /**
   * Parse the <code>invalue</code> into {@link Tuple}
   * with the schema given in {@link TSVDatasetBuilder}.
   * <p>
   *
   * <code>invalue</code> is delimited by the {@link #delimiter},
   * and stored in a {@link Tuple} all in java.lang.String type,
   * then the {@link #schema} will be assigned to the tuple.
   * <p>
   *
   * If the length of the value array (the delimited result
   * of <code>invalue</code>) is lesser than the length
   * of {@link #schema}, the <code>null</code> will be placed
   * for those columns which don't have value.
   * <p>
   *
   * If the length of the value array is longer that
   * the {@link #schema}, then "IDX_$i" will be used
   * for those unnamed values, where $i start from
   * the length of {@link #schema}.
   */
  @Override
  public Tuple parse(LongWritable inkey, Text invalue)
      throws IllegalFormatException, IOException
  {   
    Tuple tuple    = Tuple.valueOf(invalue, this.schema, this.delimiter);   
    return tuple;
  }
}
TOP

Related Classes of com.ebay.erl.mobius.core.mapred.TSVMapper

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.