Package com.aamend.hadoop.clustering.mapreduce

Source Code of com.aamend.hadoop.clustering.mapreduce.CanopyCreateReducer

package com.aamend.hadoop.clustering.mapreduce;

import com.aamend.hadoop.clustering.cluster.Canopy;
import com.aamend.hadoop.clustering.cluster.CanopyWritable;
import com.aamend.hadoop.clustering.cluster.Cluster;
import com.aamend.hadoop.clustering.distance.DistanceMeasure;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

/**
* Author: antoine.amend@gmail.com
* Date: 21/03/14
*/
public class CanopyCreateReducer extends Reducer<Text, CanopyWritable, Text, CanopyWritable> {

    private static final Text KEY = new Text("canopies");
    private static final Logger LOGGER = LoggerFactory.getLogger(CanopyCreateReducer.class);
    public static final String COUNTER = "data";
    public static final String COUNTER_CANOPY = "canopies";
    public static final String COUNTER_REJECTED_CANOPY = "canopies.rejected";

    private boolean lastIteration;
    private long minObservations;
    private DistanceMeasure measure;
    private int nextCanopyId;

    @Override
    protected void setup(Context context) throws IOException {
        Configuration conf = context.getConfiguration();
        minObservations = conf.getLong(Canopy.MIN_OBSERVATIONS, 1);
        lastIteration = conf.getBoolean(Canopy.LAST_ITERATION, false);
        measure = Canopy.configureMeasure(conf);
    }

    @Override
    protected void reduce(Text key, Iterable<CanopyWritable> values, Context context)
            throws IOException, InterruptedException {

        // Try to find a center that could minimize all data points
        List<int[]> points = new ArrayList<int[]>();

        long obs = 0L;
        Cluster clusterTemplate = null;
        for (CanopyWritable value : values) {
            if (clusterTemplate == null) {
                clusterTemplate = value.get();
            } else {
                obs += value.get().getNum();
                points.add(value.get().getCenter());
            }
        }

        // Increment number of observations for this cluster
        clusterTemplate.observe(obs);

        if (lastIteration) {
            if (clusterTemplate.getNum() < minObservations) {
                context.getCounter(COUNTER, COUNTER_REJECTED_CANOPY).increment(1L);
                return;
            }
        }

        LOGGER.info("Minimizing distance across {} data points in cluster center {}",
                points.size(), Arrays.toString(clusterTemplate.getCenter()));

        clusterTemplate.computeCenter(points, measure);
        nextCanopyId++;
        Cluster newCluster = new Canopy(nextCanopyId, clusterTemplate.getCenter(), clusterTemplate.getNum());
        context.getCounter(COUNTER, COUNTER_CANOPY).increment(1L);
        context.write(KEY, new CanopyWritable(newCluster));

    }

}
TOP

Related Classes of com.aamend.hadoop.clustering.mapreduce.CanopyCreateReducer

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.