Package elephantdb.cascading

Source Code of elephantdb.cascading.KeyValTailAssembly$Shardize

package elephantdb.cascading;

import cascading.flow.FlowProcess;
import cascading.operation.BaseOperation;
import cascading.operation.Function;
import cascading.operation.FunctionCall;
import cascading.operation.Identity;
import cascading.pipe.Each;
import cascading.pipe.GroupBy;
import cascading.pipe.Pipe;
import cascading.pipe.SubAssembly;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import elephantdb.DomainSpec;
import elephantdb.Utils;
import elephantdb.partition.ShardingScheme;
import org.apache.hadoop.io.BytesWritable;
import org.apache.log4j.Logger;

import java.util.UUID;

public class KeyValTailAssembly extends SubAssembly {
    public static Logger LOG = Logger.getLogger(KeyValTailAssembly.class);

    public static class Shardize extends BaseOperation implements Function {
        ShardingScheme shardScheme;
        int shardCount;

        public Shardize(String outfield, DomainSpec spec) {
            super(new Fields(outfield));
            shardScheme = spec.getShardScheme();
            shardCount = spec.getNumShards();
        }

        public int shardIndex(byte[] key) {
            return shardScheme.shardIndex(key, shardCount);
        }

        public void operate(FlowProcess process, FunctionCall call) {
            Object key = call.getArguments().getObject(0);

            int shard = shardIndex((byte[])key);
            call.getOutputCollector().add(new Tuple(shard));
        }
    }

    public static class MakeSortableKey extends BaseOperation implements Function {

        public MakeSortableKey(String outfield, DomainSpec spec) {
            super(new Fields(outfield));
        }

        public void operate(FlowProcess process, FunctionCall call) {
            Object f1 = call.getArguments().getObject(0);
            byte[] key = (byte[]) f1;
            BytesWritable sortField = new BytesWritable(key);
            call.getOutputCollector().add(new Tuple(sortField));
        }
    }

    public KeyValTailAssembly(Pipe keyValuePairs, ElephantDBTap outTap) {
        // generate two random field names
        String shardField = "shard" + UUID.randomUUID().toString();
        String keySortField = "keysort" + UUID.randomUUID().toString();

        DomainSpec spec = outTap.getSpec();
        LOG.info("Instantiating spec: " + spec);

        // Add the shard index as field #2.
        Pipe out = new Each(keyValuePairs, new Fields(0), new Shardize(shardField, spec), Fields.ALL);

        // Add the serialized key itself as field #3 for sorting.
        // TODO: Make secondary sorting optional, and come up with a function to generate
        // a sortable key (vs just using the same serialization as for sharding).
        out = new Each(out, new Fields(0), new MakeSortableKey(keySortField, spec), Fields.ALL);

        //put in order of shard, key, value, sortablekey
        out = new Each(out, new Fields(2, 0, 1, 3), new Identity(), Fields.RESULTS);
        out = new GroupBy(out, new Fields(0), new Fields(3)); // group by shard

        // emit shard, key, value
        out = new Each(out, new Fields(0, 1, 2), new Identity());
        setTails(out);
    }
}
TOP

Related Classes of elephantdb.cascading.KeyValTailAssembly$Shardize

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.