Package org.apache.mahout.clustering.fuzzykmeans

Source Code of org.apache.mahout.clustering.fuzzykmeans.TestFuzzyKmeansClustering

/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.mahout.clustering.fuzzykmeans;

import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.clustering.AbstractCluster;
import org.apache.mahout.clustering.ClusterObservations;
import org.apache.mahout.clustering.ClusteringTestUtils;
import org.apache.mahout.clustering.WeightedVectorWritable;
import org.apache.mahout.clustering.kmeans.TestKmeansClustering;
import org.apache.mahout.common.DummyRecordWriter;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.MahoutTestCase;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
import org.apache.mahout.common.distance.DistanceMeasure;
import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import org.junit.Before;
import org.junit.Test;

public final class TestFuzzyKmeansClustering extends MahoutTestCase {

  private FileSystem fs;
  private final DistanceMeasure measure = new EuclideanDistanceMeasure();

  @Override
  @Before
  public void setUp() throws Exception {
    super.setUp();
    Configuration conf = new Configuration();
    fs = FileSystem.get(conf);
  }

  private static double round(double val, int places) {
    long factor = (long) Math.pow(10, places);

    // Shift the decimal the correct number of places
    // to the right.
    val *= factor;

    // Round to the nearest integer.
    long tmp = Math.round(val);

    // Shift the decimal the correct number of places
    // back to the left.
    return (double) tmp / factor;
  }

  private static Vector tweakValue(Vector point) {
    return point.plus(0.1);
  }

  private static void computeCluster(Iterable<Vector> points,
                                     List<SoftCluster> clusterList,
                                     FuzzyKMeansClusterer clusterer,
                                     Map<Integer, List<WeightedVectorWritable>> pointClusterInfo) {

    for (Vector point : points) {
      // calculate point distances for all clusters   
      List<Double> clusterDistanceList = new ArrayList<Double>();
      for (SoftCluster cluster : clusterList) {
        clusterDistanceList.add(clusterer.getMeasure().distance(cluster.getCenter(), point));
      }
      // calculate point pdf for all clusters
      List<Double> clusterPdfList = new ArrayList<Double>();
      for (int i = 0; i < clusterList.size(); i++) {
        double probWeight = clusterer.computeProbWeight(clusterDistanceList.get(i), clusterDistanceList);
        clusterPdfList.add(probWeight);
      }
      // for now just emit the most likely cluster
      int clusterId = -1;
      double clusterPdf = 0;
      for (int i = 0; i < clusterList.size(); i++) {
        // System.out.println("cluster-" + clusters.get(i).getId() + "@ " + ClusterBase.formatVector(center, null));
        double pdf = clusterPdfList.get(i);
        if (pdf > clusterPdf) {
          clusterId = clusterList.get(i).getId();
          clusterPdf = pdf;
        }
      }
      List<WeightedVectorWritable> list = pointClusterInfo.get(clusterId);
      if (list == null) {
        list = new ArrayList<WeightedVectorWritable>();
        pointClusterInfo.put(clusterId, list);
      }
      list.add(new WeightedVectorWritable(clusterPdf, point));
      double totalProb = 0;
      for (int i = 0; i < clusterList.size(); i++) {
        //SoftCluster cluster = clusterList.get(i);
        double probWeight = clusterer.computeProbWeight(clusterDistanceList.get(i), clusterDistanceList);
        totalProb += probWeight;
      }
      assertTrue("total probability", Math.abs(1.0 - totalProb) < 0.0001);
    }

    for (SoftCluster cluster : clusterList) {
      System.out.println(cluster.asFormatString(null));
      List<WeightedVectorWritable> list = pointClusterInfo.get(cluster.getId());
      if (list != null) {
        for (WeightedVectorWritable vector : list) {
          System.out.println("\t" + vector);
        }
      }
    }
  }

  @Test
  public void testReferenceImplementation() throws Exception {
    List<Vector> points = TestKmeansClustering.getPoints(TestKmeansClustering.REFERENCE);
    EuclideanDistanceMeasure measure = new EuclideanDistanceMeasure();
    for (int k = 0; k < points.size(); k++) {
      System.out.println("test k= " + k);

      List<SoftCluster> clusterList = new ArrayList<SoftCluster>();
      // pick k initial cluster centers at random
      for (int i = 0; i < k + 1; i++) {
        Vector vec = tweakValue(points.get(i));
        SoftCluster cluster = new SoftCluster(vec, i, measure);
        // add the center so the centroid will be correct upon output
        //cluster.addPoint(cluster.getCenter(), 1);
        clusterList.add(cluster);
      }
      Map<Integer, List<WeightedVectorWritable>> pointClusterInfo = new HashMap<Integer, List<WeightedVectorWritable>>();
      // run reference FuzzyKmeans algorithm
      List<List<SoftCluster>> clusters = FuzzyKMeansClusterer.clusterPoints(points,
                                                                            clusterList,
                                                                            measure,
                                                                            0.001,
                                                                            2,
                                                                            2);
      computeCluster(points,
                     clusters.get(clusters.size() - 1),
                     new FuzzyKMeansClusterer(measure, 0.001, 2),
                     pointClusterInfo);

      // iterate for each cluster
      int size = 0;
      for (List<WeightedVectorWritable> pts : pointClusterInfo.values()) {
        size += pts.size();
      }
      assertEquals("total size", size, points.size());
    }
  }

  @Test
  public void testFuzzyKMeansSeqJob() throws Exception {
    List<VectorWritable> points = TestKmeansClustering.getPointsWritable(TestKmeansClustering.REFERENCE);

    Path pointsPath = getTestTempDirPath("points");
    Path clustersPath = getTestTempDirPath("clusters");
    Configuration conf = new Configuration();
    ClusteringTestUtils.writePointsToFile(points, new Path(pointsPath, "file1"), fs, conf);

    for (int k = 0; k < points.size(); k++) {
      System.out.println("testKFuzzyKMeansMRJob k= " + k);
      // pick k initial cluster centers at random
      SequenceFile.Writer writer = new SequenceFile.Writer(fs,
                                                           conf,
                                                           new Path(clustersPath, "part-00000"),
                                                           Text.class,
                                                           SoftCluster.class);
      for (int i = 0; i < k + 1; i++) {
        Vector vec = tweakValue(points.get(i).get());

        SoftCluster cluster = new SoftCluster(vec, i, measure);
        // add the center so the centroid will be correct upon output
        cluster.observe(cluster.getCenter(), 1);
        /*
         * writer.write(cluster.getIdentifier() + '\t' + SoftCluster.formatCluster(cluster) + '\n');
         */
        writer.append(new Text(cluster.getIdentifier()), cluster);

      }
      writer.close();

      // now run the Job using the run() command line options.
      Path output = getTestTempDirPath("output");
      /*      FuzzyKMeansDriver.runJob(pointsPath,
                                     clustersPath,
                                     output,
                                     EuclideanDistanceMeasure.class.getName(),
                                     0.001,
                                     2,
                                     k + 1,
                                     2,
                                     false,
                                     true,
                                     0);
      */
      String[] args = {
          optKey(DefaultOptionCreator.INPUT_OPTION), pointsPath.toString(),
          optKey(DefaultOptionCreator.CLUSTERS_IN_OPTION),
          clustersPath.toString(),
          optKey(DefaultOptionCreator.OUTPUT_OPTION),
          output.toString(),
          optKey(DefaultOptionCreator.DISTANCE_MEASURE_OPTION),
          EuclideanDistanceMeasure.class.getName(),
          optKey(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION),
          "0.001",
          optKey(DefaultOptionCreator.MAX_ITERATIONS_OPTION),
          "2",
          optKey(FuzzyKMeansDriver.M_OPTION),
          "2.0",
          optKey(DefaultOptionCreator.CLUSTERING_OPTION),
          optKey(DefaultOptionCreator.EMIT_MOST_LIKELY_OPTION),
          optKey(DefaultOptionCreator.OVERWRITE_OPTION),
          optKey(DefaultOptionCreator.METHOD_OPTION),
          DefaultOptionCreator.SEQUENTIAL_METHOD
      };
      new FuzzyKMeansDriver().run(args);
      long count = HadoopUtil.countRecords(new Path(output, "clusteredPoints/part-m-0"), conf);
      assertTrue(count > 0);
    }

  }

  @Test
  public void testFuzzyKMeansMRJob() throws Exception {
    List<VectorWritable> points = TestKmeansClustering.getPointsWritable(TestKmeansClustering.REFERENCE);

    Path pointsPath = getTestTempDirPath("points");
    Path clustersPath = getTestTempDirPath("clusters");
    Configuration conf = new Configuration();
    ClusteringTestUtils.writePointsToFile(points, new Path(pointsPath, "file1"), fs, conf);

    for (int k = 0; k < points.size(); k++) {
      System.out.println("testKFuzzyKMeansMRJob k= " + k);
      // pick k initial cluster centers at random
      SequenceFile.Writer writer = new SequenceFile.Writer(fs,
                                                           conf,
                                                           new Path(clustersPath, "part-00000"),
                                                           Text.class,
                                                           SoftCluster.class);
      for (int i = 0; i < k + 1; i++) {
        Vector vec = tweakValue(points.get(i).get());

        SoftCluster cluster = new SoftCluster(vec, i, measure);
        // add the center so the centroid will be correct upon output
        cluster.observe(cluster.getCenter(), 1);
        /*
         * writer.write(cluster.getIdentifier() + '\t' + SoftCluster.formatCluster(cluster) + '\n');
         */
        writer.append(new Text(cluster.getIdentifier()), cluster);

      }
      writer.close();

      // now run the Job using the run() command line options.
      Path output = getTestTempDirPath("output");
      /*      FuzzyKMeansDriver.runJob(pointsPath,
                                     clustersPath,
                                     output,
                                     EuclideanDistanceMeasure.class.getName(),
                                     0.001,
                                     2,
                                     k + 1,
                                     2,
                                     false,
                                     true,
                                     0);
      */
      String[] args = {
          optKey(DefaultOptionCreator.INPUT_OPTION),
          pointsPath.toString(),
          optKey(DefaultOptionCreator.CLUSTERS_IN_OPTION),
          clustersPath.toString(),
          optKey(DefaultOptionCreator.OUTPUT_OPTION),
          output.toString(),
          optKey(DefaultOptionCreator.DISTANCE_MEASURE_OPTION),
          EuclideanDistanceMeasure.class.getName(),
          optKey(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION),
          "0.001",
          optKey(DefaultOptionCreator.MAX_ITERATIONS_OPTION),
          "2",
          optKey(FuzzyKMeansDriver.M_OPTION),
          "2.0",
          optKey(DefaultOptionCreator.CLUSTERING_OPTION),
          optKey(DefaultOptionCreator.EMIT_MOST_LIKELY_OPTION),
          optKey(DefaultOptionCreator.OVERWRITE_OPTION)
      };
      ToolRunner.run(new Configuration(), new FuzzyKMeansDriver(), args);
      long count = HadoopUtil.countRecords(new Path(output, "clusteredPoints/part-m-00000"), conf);
      assertTrue(count > 0);
    }

  }

  @Test
  public void testFuzzyKMeansMapper() throws Exception {
    List<VectorWritable> points = TestKmeansClustering.getPointsWritable(TestKmeansClustering.REFERENCE);

    for (int k = 0; k < points.size(); k++) {
      System.out.println("testKFuzzyKMeansMRJob k= " + k);
      // pick k initial cluster centers at random
      Collection<SoftCluster> clusterList = new ArrayList<SoftCluster>();

      for (int i = 0; i < k + 1; i++) {
        Vector vec = tweakValue(points.get(i).get());

        SoftCluster cluster = new SoftCluster(vec, i, measure);
        cluster.observe(cluster.getCenter(), 1);
        clusterList.add(cluster);
      }

      // run mapper
      FuzzyKMeansMapper mapper = new FuzzyKMeansMapper();
      mapper.config(clusterList);
      DistanceMeasure measure = new EuclideanDistanceMeasure();
      Configuration conf = new Configuration();
      conf.set(FuzzyKMeansConfigKeys.DISTANCE_MEASURE_KEY, measure.getClass().getName());
      conf.set(FuzzyKMeansConfigKeys.CLUSTER_CONVERGENCE_KEY, "0.001");
      conf.set(FuzzyKMeansConfigKeys.M_KEY, "2");
      conf.set(FuzzyKMeansConfigKeys.EMIT_MOST_LIKELY_KEY, "true");
      conf.set(FuzzyKMeansConfigKeys.THRESHOLD_KEY, "0");

      DummyRecordWriter<Text, ClusterObservations> mapWriter = new DummyRecordWriter<Text, ClusterObservations>();
      Mapper<WritableComparable<?>, VectorWritable, Text, ClusterObservations>.Context mapContext = DummyRecordWriter
          .build(mapper, conf, mapWriter);
      mapper.setup(mapContext);
      for (VectorWritable point : points) {
        mapper.map(new Text(), point, mapContext);
      }

      // now verify mapper output
      assertEquals("Mapper Keys", k + 1, mapWriter.getData().size());

      Map<Vector, Double> pointTotalProbMap = new HashMap<Vector, Double>();

      for (Text key : mapWriter.getKeys()) {
        // SoftCluster cluster = SoftCluster.decodeCluster(key);
        List<ClusterObservations> values = mapWriter.getValue(key);

        for (ClusterObservations value : values) {
          Double val = pointTotalProbMap.get(value.getS1());
          double probVal = 0.0;
          if (val != null) {
            probVal = val;
          }
          pointTotalProbMap.put(value.getS1(), probVal + value.getS0());
        }
      }
      for (Map.Entry<Vector, Double> entry : pointTotalProbMap.entrySet()) {
        Vector key = entry.getKey();
        double value = round(entry.getValue(), 1);

        assertEquals("total Prob for Point:" + key, 1.0, value, EPSILON);
      }
    }
  }

  @Test
  public void testFuzzyKMeansCombiner() throws Exception {
    List<VectorWritable> points = TestKmeansClustering.getPointsWritable(TestKmeansClustering.REFERENCE);

    for (int k = 0; k < points.size(); k++) {
      System.out.println("testKFuzzyKMeansMRJob k= " + k);
      // pick k initial cluster centers at random
      Collection<SoftCluster> clusterList = new ArrayList<SoftCluster>();

      for (int i = 0; i < k + 1; i++) {
        Vector vec = tweakValue(points.get(i).get());

        SoftCluster cluster = new SoftCluster(vec, i, measure);
        cluster.observe(cluster.getCenter(), 1);
        clusterList.add(cluster);
      }

      // run mapper
      FuzzyKMeansMapper mapper = new FuzzyKMeansMapper();
      mapper.config(clusterList);

      Configuration conf = new Configuration();
      conf.set(FuzzyKMeansConfigKeys.DISTANCE_MEASURE_KEY,
          "org.apache.mahout.common.distance.EuclideanDistanceMeasure");
      conf.set(FuzzyKMeansConfigKeys.CLUSTER_CONVERGENCE_KEY, "0.001");
      conf.set(FuzzyKMeansConfigKeys.M_KEY, "2");
      conf.set(FuzzyKMeansConfigKeys.EMIT_MOST_LIKELY_KEY, "true");
      conf.set(FuzzyKMeansConfigKeys.THRESHOLD_KEY, "0");

      DummyRecordWriter<Text, ClusterObservations> mapWriter = new DummyRecordWriter<Text, ClusterObservations>();
      Mapper<WritableComparable<?>, VectorWritable, Text, ClusterObservations>.Context mapContext =
          DummyRecordWriter.build(mapper, conf, mapWriter);
      mapper.setup(mapContext);
      for (VectorWritable point : points) {
        mapper.map(new Text(), point, mapContext);
      }

      // run combiner
      FuzzyKMeansCombiner combiner = new FuzzyKMeansCombiner();
      DummyRecordWriter<Text, ClusterObservations> combinerWriter = new DummyRecordWriter<Text, ClusterObservations>();
      Reducer<Text, ClusterObservations, Text, ClusterObservations>.Context combinerContext =
          DummyRecordWriter.build(combiner, conf, combinerWriter, Text.class, ClusterObservations.class);
      combiner.setup(combinerContext);
      for (Text key : mapWriter.getKeys()) {
        List<ClusterObservations> values = mapWriter.getValue(key);
        combiner.reduce(new Text(key), values, combinerContext);
      }

      // now verify the combiner output
      assertEquals("Combiner Output", k + 1, combinerWriter.getData().size());

      for (Text key : combinerWriter.getKeys()) {
        List<ClusterObservations> values = combinerWriter.getValue(key);
        assertEquals("too many values", 1, values.size());
      }
    }
  }

  @Test
  public void testFuzzyKMeansReducer() throws Exception {
    List<VectorWritable> points = TestKmeansClustering.getPointsWritable(TestKmeansClustering.REFERENCE);

    for (int k = 0; k < points.size(); k++) {
      System.out.println("testKFuzzyKMeansMRJob k= " + k);
      // pick k initial cluster centers at random
      Collection<SoftCluster> clusterList = new ArrayList<SoftCluster>();

      for (int i = 0; i < k + 1; i++) {
        Vector vec = tweakValue(points.get(i).get());

        SoftCluster cluster = new SoftCluster(vec, i, measure);
        // cluster.addPoint(cluster.getCenter(), 1);
        clusterList.add(cluster);
      }

      // run mapper
      FuzzyKMeansMapper mapper = new FuzzyKMeansMapper();
      mapper.config(clusterList);
      DistanceMeasure measure = new EuclideanDistanceMeasure();
      Configuration conf = new Configuration();
      conf.set(FuzzyKMeansConfigKeys.DISTANCE_MEASURE_KEY, measure.getClass().getName());
      conf.set(FuzzyKMeansConfigKeys.CLUSTER_CONVERGENCE_KEY, "0.001");
      conf.set(FuzzyKMeansConfigKeys.M_KEY, "2");
      conf.set(FuzzyKMeansConfigKeys.EMIT_MOST_LIKELY_KEY, "true");
      conf.set(FuzzyKMeansConfigKeys.THRESHOLD_KEY, "0");

      DummyRecordWriter<Text, ClusterObservations> mapWriter = new DummyRecordWriter<Text, ClusterObservations>();
      Mapper<WritableComparable<?>, VectorWritable, Text, ClusterObservations>.Context mapContext =
          DummyRecordWriter.build(mapper, conf, mapWriter);
      mapper.setup(mapContext);
      for (VectorWritable point : points) {
        mapper.map(new Text(), point, mapContext);
      }

      // run combiner
      FuzzyKMeansCombiner combiner = new FuzzyKMeansCombiner();
      DummyRecordWriter<Text, ClusterObservations> combinerWriter = new DummyRecordWriter<Text, ClusterObservations>();
      Reducer<Text, ClusterObservations, Text, ClusterObservations>.Context combinerContext =
          DummyRecordWriter.build(combiner, conf, combinerWriter, Text.class, ClusterObservations.class);
      combiner.setup(combinerContext);
      for (Text key : mapWriter.getKeys()) {
        List<ClusterObservations> values = mapWriter.getValue(key);
        combiner.reduce(new Text(key), values, combinerContext);
      }

      // run reducer
      FuzzyKMeansReducer reducer = new FuzzyKMeansReducer();
      DummyRecordWriter<Text, SoftCluster> reducerWriter = new DummyRecordWriter<Text, SoftCluster>();
      Reducer<Text, ClusterObservations, Text, SoftCluster>.Context reducerContext =
          DummyRecordWriter.build(reducer, conf, reducerWriter, Text.class, ClusterObservations.class);
      reducer.setup(clusterList, conf);

      for (Text key : combinerWriter.getKeys()) {
        List<ClusterObservations> values = combinerWriter.getValue(key);
        reducer.reduce(new Text(key), values, reducerContext);
      }

      // now verify the reducer output
      assertEquals("Reducer Output", k + 1, combinerWriter.getData().size());

      // compute the reference result after one iteration and compare
      List<SoftCluster> reference = new ArrayList<SoftCluster>();
      for (int i = 0; i < k + 1; i++) {
        Vector vec = tweakValue(points.get(i).get());
        reference.add(new SoftCluster(vec, i, measure));
      }
      Collection<Vector> pointsVectors = new ArrayList<Vector>();
      for (VectorWritable point : points) {
        pointsVectors.add(point.get());
      }

      FuzzyKMeansClusterer clusterer = new FuzzyKMeansClusterer(measure, 0.001, 2);
      FuzzyKMeansClusterer.runFuzzyKMeansIteration(pointsVectors, reference, clusterer);

      for (SoftCluster key : reference) {
        String clusterId = key.getIdentifier();
        List<SoftCluster> values = reducerWriter.getValue(new Text(clusterId));
        SoftCluster cluster = values.get(0);
        System.out.println("ref= " + key.toString() + " cluster= " + cluster);
        cluster.computeParameters();
        assertEquals("key center: " + AbstractCluster.formatVector(key.getCenter(), null) + " does not equal cluster: "
            + AbstractCluster.formatVector(cluster.getCenter(), null), key.getCenter(), cluster.getCenter());
      }
    }
  }

  @Test
  public void testFuzzyKMeansClusterMapper() throws Exception {
    List<VectorWritable> points = TestKmeansClustering.getPointsWritable(TestKmeansClustering.REFERENCE);

    for (int k = 0; k < points.size(); k++) {
      System.out.println("testKFuzzyKMeansMRJob k= " + k);
      // pick k initial cluster centers at random
      Collection<SoftCluster> clusterList = new ArrayList<SoftCluster>();

      for (int i = 0; i < k + 1; i++) {
        Vector vec = tweakValue(points.get(i).get());

        SoftCluster cluster = new SoftCluster(vec, i, measure);
        cluster.observe(cluster.getCenter(), 1);
        clusterList.add(cluster);
      }
      for (SoftCluster softCluster : clusterList) {
        softCluster.computeParameters();
      }

      // run mapper
      FuzzyKMeansMapper mapper = new FuzzyKMeansMapper();
      mapper.config(clusterList);
      DistanceMeasure measure = new EuclideanDistanceMeasure();

      Configuration conf = new Configuration();
      conf.set(FuzzyKMeansConfigKeys.DISTANCE_MEASURE_KEY, measure.getClass().getName());
      conf.set(FuzzyKMeansConfigKeys.CLUSTER_CONVERGENCE_KEY, "0.001");
      conf.set(FuzzyKMeansConfigKeys.M_KEY, "2");
      conf.set(FuzzyKMeansConfigKeys.EMIT_MOST_LIKELY_KEY, "true");
      conf.set(FuzzyKMeansConfigKeys.THRESHOLD_KEY, "0");

      DummyRecordWriter<Text, ClusterObservations> mapWriter = new DummyRecordWriter<Text, ClusterObservations>();
      Mapper<WritableComparable<?>, VectorWritable, Text, ClusterObservations>.Context mapContext =
          DummyRecordWriter.build(mapper, conf, mapWriter);
      mapper.setup(mapContext);
      for (VectorWritable point : points) {
        mapper.map(new Text(), point, mapContext);
      }

      // run combiner
      FuzzyKMeansCombiner combiner = new FuzzyKMeansCombiner();
      DummyRecordWriter<Text, ClusterObservations> combinerWriter = new DummyRecordWriter<Text, ClusterObservations>();
      Reducer<Text, ClusterObservations, Text, ClusterObservations>.Context combinerContext =
          DummyRecordWriter.build(combiner, conf, combinerWriter, Text.class, ClusterObservations.class);
      combiner.setup(combinerContext);
      for (Text key : mapWriter.getKeys()) {
        List<ClusterObservations> values = mapWriter.getValue(key);
        combiner.reduce(new Text(key), values, combinerContext);
      }

      // run reducer
      FuzzyKMeansReducer reducer = new FuzzyKMeansReducer();
      DummyRecordWriter<Text, SoftCluster> reducerWriter = new DummyRecordWriter<Text, SoftCluster>();
      Reducer<Text, ClusterObservations, Text, SoftCluster>.Context reducerContext =
          DummyRecordWriter.build(reducer, conf, reducerWriter, Text.class, ClusterObservations.class);
      reducer.setup(clusterList, conf);

      for (Text key : combinerWriter.getKeys()) {
        List<ClusterObservations> values = combinerWriter.getValue(key);
        reducer.reduce(new Text(key), values, reducerContext);
      }

      // run clusterMapper
      Collection<SoftCluster> reducerClusters = new ArrayList<SoftCluster>();
      for (Text key : reducerWriter.getKeys()) {
        List<SoftCluster> values = reducerWriter.getValue(key);
        reducerClusters.add(values.get(0));
      }
      for (SoftCluster softCluster : reducerClusters) {
        softCluster.computeParameters();
      }

      FuzzyKMeansClusterMapper clusterMapper = new FuzzyKMeansClusterMapper();
      DummyRecordWriter<IntWritable, WeightedVectorWritable> clusterWriter =
          new DummyRecordWriter<IntWritable, WeightedVectorWritable>();
      Mapper<WritableComparable<?>, VectorWritable, IntWritable, WeightedVectorWritable>.Context clusterContext =
          DummyRecordWriter.build(clusterMapper, conf, clusterWriter);
      clusterMapper.setup(reducerClusters, conf);

      for (VectorWritable point : points) {
        clusterMapper.map(new Text(), point, clusterContext);
      }

      // compute the reference result after one iteration and compare
      List<SoftCluster> reference = new ArrayList<SoftCluster>();
      for (int i = 0; i < k + 1; i++) {
        Vector vec = tweakValue(points.get(i).get());
        reference.add(new SoftCluster(vec, i, measure));
      }
      Map<Integer, List<WeightedVectorWritable>> refClusters = new HashMap<Integer, List<WeightedVectorWritable>>();
      Collection<Vector> pointsVectors = new ArrayList<Vector>();
      for (VectorWritable point : points) {
        pointsVectors.add(point.get());
      }

      List<List<SoftCluster>> clusters = FuzzyKMeansClusterer.clusterPoints(pointsVectors,
                                                                            reference,
                                                                            new EuclideanDistanceMeasure(),
                                                                            0.001,
                                                                            2,
                                                                            1);

      computeCluster(pointsVectors, clusters.get(clusters.size() - 1),
                     new FuzzyKMeansClusterer(new EuclideanDistanceMeasure(), 0.001, 2), refClusters);

      // Now compare the clustermapper results with reference implementation
      assertEquals("mapper and reference sizes", refClusters.size(), clusterWriter.getKeys().size());
      for (Map.Entry<Integer, List<WeightedVectorWritable>> entry : refClusters.entrySet()) {
        int key = entry.getKey();
        List<WeightedVectorWritable> value = entry.getValue();
        System.out.println("refClusters=" + value + " mapClusters=" + clusterWriter.getValue(new IntWritable(key)));
        assertEquals("cluster " + key + " sizes", value.size(), clusterWriter.getValue(new IntWritable(key)).size());
      }
      // make sure all points are allocated to a cluster
      int size = 0;
      for (List<WeightedVectorWritable> pts : refClusters.values()) {
        size += pts.size();
      }
      assertEquals("total size", size, points.size());
    }
  }

  @Test
  public void testClusterObservationsSerialization() throws Exception {
    double[] data = { 1.1, 2.2, 3.3 };
    Vector vector = new DenseVector(data);
    ClusterObservations reference = new ClusterObservations(1, 2.0, vector, vector);
    DataOutputBuffer out = new DataOutputBuffer();
    reference.write(out);
    ClusterObservations info = new ClusterObservations();
    DataInputBuffer in = new DataInputBuffer();
    in.reset(out.getData(), out.getLength());
    info.readFields(in);
    assertEquals("probability", reference.getS0(), info.getS0(), EPSILON);
    assertEquals("point total", reference.getS1(), info.getS1());
    assertEquals("combiner", reference.getCombinerState(), info.getCombinerState());
  }

}
TOP

Related Classes of org.apache.mahout.clustering.fuzzykmeans.TestFuzzyKmeansClustering

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.