package org.pygmalion.udf;
import java.io.IOException;
import java.util.regex.Pattern;
import org.apache.pig.EvalFunc;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.data.BagFactory;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DataType;
import org.apache.pig.impl.logicalLayer.schema.Schema;
/**
* UDF to take the data structure that Cassandra outputs:
* <code>(key, columns: bag {T: tuple(name, value)})</code>
* and projects out just the key and column values that I
* would like to use:
* <code>bag: {(key, columns: bag {values})}</code>
*
* A selector may be an exact column name or a prefix ending in '*'.
* For a prefix selector, the corresponding field of the result is a
* bag holding the (name, value) tuples of every matching column.
*
* Example:
* ('account_id', 'tag*') as a set of fields to project
* will return this bag of values:
* (12345, {('tag123', 'lol'), ('tag456', 'cat')})
*
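* NOTE(review): this class is declared as EvalFunc&lt;Tuple&gt; and exec
* returns a single tuple with one field per selector, so the
* rationale below about returning a bag may be out of date -
* TODO confirm.
*
* We are returning a bag when we really want to
* be returning a tuple. The reason is that if we
* return a tuple then pig wraps it up in another
* tuple (which we don't want). By stuffing our
* single record result into a bag we can simply
* FLATTEN it and get what we want.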
*
* NB: When a row has a ton of columns and if we don't
* specify a slice predicate, this will be inefficient.
*/
public class FromCassandraBag extends EvalFunc<Tuple> {
private static Pattern DELIM_PATTERN = Pattern.compile("[\\s,]+");
private static String GREEDY_OPERATOR = "*";
public Tuple exec(Tuple input) throws IOException {
// Size must be two (column_selector,cassandra_bag)
if (input == null || input.size() < 2)
throw new IOException("Invalid input. Please pass in both a list of column names and the columns themselves.");
if (input.isNull(0) || input.isNull(1))
return null;
String columnSelector = input.get(0).toString();
DataBag cassandraBag = (DataBag)input.get(1);
String[] selections = DELIM_PATTERN.split(columnSelector);
Tuple output = TupleFactory.getInstance().newTuple(selections.length);
for (int i = 0; i < selections.length; i++) {
String selection = selections[i];
if (selection.endsWith(GREEDY_OPERATOR)) {
String namePrefix = selection.substring(0,selection.length()-1);
DataBag columnsBag = BagFactory.getInstance().newDefaultBag();
// Find all columns in the input bag that begin with 'namePrefix'
// and add them to the 'columnsBag'
for (Tuple cassandraColumn : cassandraBag) {
String name = cassandraColumn.get(0).toString();
if (name.startsWith(namePrefix)) {
columnsBag.add(cassandraColumn);
}
}
// Sometimes this bag will have no columns in it, this _is_ the desired behavior.
output.set(i, columnsBag);
} else {
// Find the column in the input bag that has a name equal to 'selection'
// and add _only_ the value to the output tuple. This is what you actually
// want since you're specifying both an order and a name in the 'columnSelector'
// string.
for (Tuple cassandraColumn : cassandraBag) {
String name = cassandraColumn.get(0).toString();
if (name.equals(selection)) {
output.set(i, cassandraColumn.get(1));
break;
}
}
}
}
return output;
}
public Schema outputSchema(Schema input) {
try {
return new Schema(new Schema.FieldSchema(getSchemaName(this.getClass().getName().toLowerCase(), input), DataType.TUPLE));
} catch (Exception e) {
return null;
}
}
}