package com.ebay.erl.mobius.core;
import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapred.JobConf;
import com.ebay.erl.mobius.core.builder.Dataset;
import com.ebay.erl.mobius.core.mapred.AbstractMobiusMapper;
import com.ebay.erl.mobius.core.model.Column;
import com.ebay.erl.mobius.core.model.Tuple;
import com.ebay.erl.mobius.core.model.WriteImpl;
import com.ebay.erl.mobius.util.SerializableUtil;
import com.ebay.erl.mobius.util.Util;
/**
* Specify the join relationship in a Mobius join job.
* <p>
*
* A join relationship defines the columns to be
* used as the join keys of the participating
* datasets.
* <p>
*
* Use {@link MobiusJob#innerJoin(Dataset...)},
* {@link MobiusJob#leftOuterJoin(Dataset, Dataset, Object)} or
* {@link MobiusJob#rightOuterJoin(Dataset, Dataset, Object)} to
* obtain an instance of this class to create a join job.
*
*
*
* This product is licensed under the Apache License, Version 2.0,
* available at http://www.apache.org/licenses/LICENSE-2.0.
*
* This product contains portions derived from Apache hadoop which is
* licensed under the Apache License, Version 2.0, available at
* http://hadoop.apache.org.
*
* © 2007 – 2012 eBay Inc., Evan Chiu, Woody Zhou, Neel Sundaresan
*/
@SuppressWarnings("deprecation")
public class JoinOnConfigure
{
private Configuration jobConf;
private Dataset[] datasets;
// hide the constructor to default level
JoinOnConfigure(Configuration jobConf, Dataset... datasets)
{
if( datasets==null||datasets.length<=1 )
{
throw new IllegalArgumentException("Join must be performed with at least two or more dataset.");
}
this.jobConf = jobConf;
this.datasets = datasets;
}
JoinOnConfigure(Object nullReplacement, Configuration jobConf, Dataset... datasets)
throws IOException
{
this(jobConf, datasets);
// validate the type of nullReplacement
byte type = Tuple.getType(nullReplacement);
this.jobConf.setInt(ConfigureConstants.NULL_REPLACEMENT_TYPE, type);
if( nullReplacement!=null )
{
ByteArrayOutputStream buffer = new ByteArrayOutputStream();
DataOutputStream out = new DataOutputStream(buffer);
WriteImpl writer = new WriteImpl(out);
writer.setValue(nullReplacement);
writer.handle(type);
out.flush();
out.close();
byte[] binary = buffer.toByteArray();
String base64 = SerializableUtil.serializeToBase64(binary);
this.jobConf.set(ConfigureConstants.NULL_REPLACEMENT, base64);
}
}
/**
* Specify the joining columns from the dataset.
* <p>
*
* Where there are more than one {@link EQ} in the
* argument, they will be concatenated together
* with AND.
* <p>
*
* Mobius only supports equal-join, ex: dataset1.column1=dataset2.column1.
*/
public Persistable on(EQ... eqs)
throws IOException
{
if( eqs==null||eqs.length==0 )
{
throw new IllegalArgumentException ("Please set at least one join key");
}
Set<Column> keyColumns = new HashSet<Column>();
for( EQ anEQ:eqs )
{
for( Column aColumn:anEQ.columns )
{
this.setJoinKey (aColumn);
keyColumns.add(aColumn);
}
}
this.jobConf.set(ConfigureConstants.ALL_GROUP_KEY_COLUMNS, SerializableUtil.serializeToBase64(keyColumns.toArray(new Column[0])));
StringBuffer involvedDSName = new StringBuffer();
for( int i=0;i<this.datasets.length;i++ )
{
involvedDSName.append(this.datasets[i].getName());
if( i<this.datasets.length-1 )
involvedDSName.append(", ");
}
boolean isOuterJoin = this.jobConf.getBoolean(ConfigureConstants.IS_OUTER_JOIN, false);
this.jobConf.set("mapred.job.name", (isOuterJoin?"Outer Join ":"Inner Join ")+involvedDSName.toString()+" On "+Arrays.toString(eqs));
return new Persistable(new JobConf(this.jobConf), this.datasets);
}
/**
* set the $datasetID.key.columns so the {@link AbstractMobiusMapper} can
* emit the correct key to perform join.
*
* @param aColumn
*/
private void setJoinKey(Column aColumn)
throws IOException
{
// represent property name, $datasetID.key.columns
String joinKeyPropertyName = null;
for( byte assignedDatasetID=0;assignedDatasetID<this.datasets.length;assignedDatasetID++ )
{
Dataset aDataset = this.datasets[assignedDatasetID];
if( aColumn.getDataset().equals(aDataset) )
{
JobSetup.validateColumns(aDataset, aColumn);
Configuration aJobConf = aDataset.createJobConf(assignedDatasetID);
this.jobConf = Util.merge(this.jobConf, aJobConf);
joinKeyPropertyName = assignedDatasetID+".key.columns";
break;
}
}
if ( joinKeyPropertyName==null )
{
throw new IllegalArgumentException (aColumn.getDataset() + " doesn't in the selected join dataset.");
}
if( this.jobConf.get(joinKeyPropertyName, "").isEmpty() )
{
this.jobConf.set (joinKeyPropertyName, aColumn.getInputColumnName ());
}
else
{
this.jobConf.set (joinKeyPropertyName, this.jobConf.get (joinKeyPropertyName) + "," + aColumn.getInputColumnName ());
}
}
/**
* Specify the equal relationship
* of columns from different datasets.
* <p>
*
* Each {@link EQ} is used in a join job
* as one of the join conditions.
*/
public static class EQ
{
private Column[] columns;
/**
* Build a equal join condition from columns
* in the participated datasets in a join job.
*
*
* @param columns Columns from different datasets
* to be used to build equal relationship.
*/
public EQ(Column... columns)
{
// validation, lengh of columns
// must >=2, and all from different
// dataset
if( columns==null || columns.length<2 )
throw new IllegalArgumentException("Please specify at least two columns to form a equal relation.");
// make sure columns all from different dataset
Set<Dataset> ds = new HashSet<Dataset>();
for( Column aColumn:columns )
ds.add(aColumn.getDataset());
if( ds.size()!=columns.length )
throw new IllegalArgumentException("The specified columns must from different datasets.");
this.columns = columns;
}
public String toString()
{
StringBuffer str = new StringBuffer();
for( int i=0;i<this.columns.length;i++ )
{
str.append(this.columns[i].getInputColumnName());
if( i<this.columns.length-1 )
str.append(" = ");
}
return str.toString();
}
}
}