Package org.pygmalion.udf

Source Code of org.pygmalion.udf.DeleteColumns

package org.pygmalion.udf;


import org.apache.pig.EvalFunc;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.BagFactory;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.logicalLayer.schema.Schema;
import org.apache.pig.impl.util.UDFContext;

import java.io.IOException;
import java.util.List;
import java.util.Properties;

/**
* EvalFunc to take the given set of column names and convert them
* into a CassandraBag to delete those columns.  Works similarly
* to ToCassandraBag.
*
* The first value in the input bag *has* to be the key. For
* the rest of the fields, this UDF will interrogate the values
* that you have named the variables to be the column names.
*/
public class DeleteColumns extends EvalFunc<Tuple>  {

    private static String UDFCONTEXT_SCHEMA_KEY = "cassandra.input_field_schema";
    private static String INPUT_DELIM = "[\\s,]+";
    private static char OUTPUT_DELIM = ',';

    public Tuple exec(Tuple input) throws IOException {
        Tuple row = TupleFactory.getInstance().newTuple(2);
        DataBag columns = BagFactory.getInstance().newDefaultBag();
        UDFContext context = UDFContext.getUDFContext();
        Properties property = context.getUDFProperties(DeleteColumns.class);
        String fieldString = property.getProperty(UDFCONTEXT_SCHEMA_KEY);
        String [] fieldnames = fieldString.split(INPUT_DELIM);

        // IT IS ALWAYS ASSUMED THAT THE OBJECT AT INDEX 0 IS THE ROW KEY

        for (int i=1; i<input.size(); i++) {
            if (input.get(i) instanceof DataBag) {
                for (Tuple cassandraColumn : (DataBag) input.get(i)) {
                    String name = cassandraColumn.get(0).toString();
                    columns.add(getColumnDef(name, null));
                }
            } else {
                columns.add(getColumnDef(fieldnames[i], null));
            }
        }

        row.set(0, input.get(0));
        row.set(1, columns);
        return row;
    }

    private Tuple getColumnDef(String name, Object value) throws ExecException {
        Tuple column = TupleFactory.getInstance().newTuple(2);
        column.set(0, name);
        column.set(1, value);
        return column;
    }

    public Schema outputSchema(Schema input) {
        StringBuilder builder = new StringBuilder();
        List<Schema.FieldSchema> fields = input.getFields();
        for (int i=0; i<fields.size(); i++) {
            builder.append(fields.get(i).alias);
            if (i != fields.size()-1) {
                builder.append(OUTPUT_DELIM);
            }
        }

        UDFContext context = UDFContext.getUDFContext();
        Properties property = context.getUDFProperties(DeleteColumns.class);
        property.setProperty(UDFCONTEXT_SCHEMA_KEY, builder.toString());

        return super.outputSchema(input);
    }
}
TOP

Related Classes of org.pygmalion.udf.DeleteColumns

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.