Package org.apache.mahout.clustering.fuzzykmeans

Source Code of org.apache.mahout.clustering.fuzzykmeans.TestFuzzyKmeansClustering

/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.mahout.clustering.fuzzykmeans;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import junit.framework.TestCase;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.mahout.clustering.kmeans.TestKmeansClustering;
import org.apache.mahout.matrix.Vector;
import org.apache.mahout.utils.DistanceMeasure;
import org.apache.mahout.utils.DummyOutputCollector;
import org.apache.mahout.utils.EuclideanDistanceMeasure;

public class TestFuzzyKmeansClustering extends TestCase {

  private static void rmr(String path) throws Exception {
    File f = new File(path);
    if (f.exists()) {
      if (f.isDirectory()) {
        String[] contents = f.list();
        for (String content : contents) {
          rmr(f.toString() + File.separator + content);
        }
      }
      f.delete();
    }
  }

  @Override
  protected void setUp() throws Exception {
    super.setUp();
    rmr("output");
    rmr("testdata");
  }

  public static double round(double val, int places) {
    long factor = (long) Math.pow(10, places);

    // Shift the decimal the correct number of places
    // to the right.
    val *= factor;

    // Round to the nearest integer.
    long tmp = Math.round(val);

    // Shift the decimal the correct number of places
    // back to the left.
    return (double) tmp / factor;
  }

  public static Vector tweakValue(Vector point) {
    return point.plus(0.1);

  }

  public static void referenceFuzzyKMeans(List<Vector> points,
      List<SoftCluster> clusterList, Map<String, String> pointClusterInfo,
      String distanceMeasureClass, double threshold, int numIter)
      throws Exception {
    ClassLoader ccl = Thread.currentThread().getContextClassLoader();
    Class<?> cl = ccl.loadClass(distanceMeasureClass);

    DistanceMeasure measure = (DistanceMeasure) cl.newInstance();
    SoftCluster.config(measure, threshold);
    boolean converged = false;
    for (int iter = 0; !converged && iter < numIter; iter++) {
      converged = iterateReference(points, clusterList, measure);
    }
    computeCluster(points, clusterList, measure, pointClusterInfo);
  }

  public static boolean iterateReference(List<Vector> points,
      List<SoftCluster> clusterList, DistanceMeasure measure) {
    // for each
    for (Vector point : points) {
      List<Double> clusterDistanceList = new ArrayList<Double>();
      for (SoftCluster cluster : clusterList) {
        clusterDistanceList.add(measure.distance(point, cluster.getCenter()));
      }

      for (int i = 0; i < clusterList.size(); i++) {
        double probWeight = SoftCluster.computeProbWeight(clusterDistanceList
            .get(i), clusterDistanceList);
        clusterList.get(i).addPoint(point,
            Math.pow(probWeight, SoftCluster.getM()));
      }
    }
    boolean converged = true;
    for (SoftCluster cluster : clusterList) {
      if (!cluster.computeConvergence())
        converged = false;
    }
    // update the cluster centers
    if (!converged)
      for (SoftCluster cluster : clusterList)
        cluster.recomputeCenter();
    return converged;

  }

  public static void computeCluster(List<Vector> points,
      List<SoftCluster> clusterList, DistanceMeasure measure,
      Map<String, String> pointClusterInfo) {

    for (Vector point : points) {
      StringBuilder outputValue = new StringBuilder("[");
      List<Double> clusterDistanceList = new ArrayList<Double>();
      for (SoftCluster cluster : clusterList) {
        clusterDistanceList.add(measure.distance(point, cluster.getCenter()));
      }
      for (int i = 0; i < clusterList.size(); i++) {
        double probWeight = SoftCluster.computeProbWeight(clusterDistanceList
            .get(i), clusterDistanceList);
        outputValue.append(clusterList.get(i).getClusterId()).append(':')
            .append(probWeight).append(' ');
      }

      pointClusterInfo.put(point.asFormatString().trim(), outputValue
          .toString().trim()
          + ']');
    }
  }

  public void testReferenceImplementation() throws Exception {
    List<Vector> points = TestKmeansClustering
        .getPoints(TestKmeansClustering.reference);
    for (int k = 0; k < points.size(); k++) {
      System.out.println("test k= " + k);

      List<SoftCluster> clusterList = new ArrayList<SoftCluster>();
      // pick k initial cluster centers at random
      for (int i = 0; i < k + 1; i++) {
        Vector vec = tweakValue(points.get(i));
        SoftCluster cluster = new SoftCluster(vec);
        // add the center so the centroid will be correct upon output
        cluster.addPoint(cluster.getCenter(), 1);

        clusterList.add(cluster);
      }
      Map<String, String> pointClusterInfo = new HashMap<String, String>();
      // run reference FuzzyKmeans algorithm
      referenceFuzzyKMeans(points, clusterList, pointClusterInfo,
          EuclideanDistanceMeasure.class.getName(), 0.001, 2);

      // iterate for each point
      for (String value : pointClusterInfo.values()) {
        String clusterInfoStr = value.substring(1, value.length() - 1);
        String[] clusterInfoList = clusterInfoStr.split(" ");
        assertEquals("Number of clusters", k + 1, clusterInfoList.length);
        double prob = 0.0;
        for (String clusterInfo : clusterInfoList) {
          String[] clusterProb = clusterInfo.split(":");

          double clusterProbVal = Double.parseDouble(clusterProb[1]);
          prob += clusterProbVal;
        }
        prob = round(prob, 1);
        assertEquals(
            "Sum of cluster Membership problability should be equal to=", 1.0,
            prob);
      }
    }
  }

  public void testFuzzyKMeansMRJob() throws Exception {
    List<Vector> points = TestKmeansClustering
        .getPoints(TestKmeansClustering.reference);
    File testData = new File("testdata");
    if (!testData.exists())
      testData.mkdir();
    testData = new File("testdata/points");
    if (!testData.exists())
      testData.mkdir();

    TestKmeansClustering.writePointsToFile(points, "testdata/points/file1");
    TestKmeansClustering.writePointsToFile(points, "testdata/points/file2");

    for (int k = 0; k < points.size(); k++) {
      System.out.println("testKFuzzyKMeansMRJob k= " + k);
      // pick k initial cluster centers at random
      JobConf job = new JobConf(FuzzyKMeansDriver.class);
      FileSystem fs = FileSystem.get(job);
      Path path = new Path("testdata/clusters");
      if (fs.exists(path)) {
        fs.delete(path, true);
      }

      testData = new File("testdata/clusters");
      if (!testData.exists())
        testData.mkdir();

      BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
          new FileOutputStream("testdata/clusters/part-00000"), Charset
              .forName("UTF-8")));

      for (int i = 0; i < k + 1; i++) {
        Vector vec = tweakValue(points.get(i));

        SoftCluster cluster = new SoftCluster(vec);
        // add the center so the centroid will be correct upon output
        cluster.addPoint(cluster.getCenter(), 1);
        writer.write(cluster.getIdentifier() + '\t'
            + SoftCluster.formatCluster(cluster) + '\n');

      }
      writer.flush();
      writer.close();

      JobConf conf = new JobConf(FuzzyKMeansDriver.class);
      Path outPath = new Path("output");
      fs = FileSystem.get(conf);
      if (fs.exists(outPath)) {
        fs.delete(outPath, true);
      }
      fs.mkdirs(outPath);
      // now run the Job     
      FuzzyKMeansDriver.runJob("testdata/points", "testdata/clusters",
          "output", EuclideanDistanceMeasure.class.getName(), 0.001, 2, 1,
          k + 1, 2);     

      // now compare the expected clusters with actual
      File outDir = new File("output/points");
      assertTrue("output dir exists?", outDir.exists());
      String[] outFiles = outDir.list();
//      assertEquals("output dir files?", 4, outFiles.length);
      BufferedReader reader = new BufferedReader(new InputStreamReader(
          new FileInputStream("output/points/part-00000"), Charset
              .forName("UTF-8")));

      while (reader.ready()) {
        String line = reader.readLine();
        String[] lineParts = line.split("\t");
        assertEquals("line parts", 2, lineParts.length);
        String clusterInfoStr = lineParts[1].replace("[", "").replace("]", "");

        String[] clusterInfoList = clusterInfoStr.split(" ");
        assertEquals("Number of clusters", k + 1, clusterInfoList.length);
        double prob = 0.0;
        for (String clusterInfo : clusterInfoList) {
          String[] clusterProb = clusterInfo.split(":");

          double clusterProbVal = Double.parseDouble(clusterProb[1]);
          prob += clusterProbVal;
        }
        prob = round(prob, 1);
        assertEquals(
            "Sum of cluster Membership problability should be equal to=", 1.0,
            prob);
      }

      reader.close();

    }

  }

  public void testFuzzyKMeansMapper() throws Exception {
    List<Vector> points = TestKmeansClustering
        .getPoints(TestKmeansClustering.reference);

    DistanceMeasure measure = new EuclideanDistanceMeasure();
    SoftCluster.config(measure, 0.001);

    for (int k = 0; k < points.size(); k++) {
      System.out.println("testKFuzzyKMeansMRJob k= " + k);
      // pick k initial cluster centers at random
      List<SoftCluster> clusterList = new ArrayList<SoftCluster>();

      for (int i = 0; i < k + 1; i++) {
        Vector vec = tweakValue(points.get(i));

        SoftCluster cluster = new SoftCluster(vec);
        cluster.addPoint(cluster.getCenter(), 1);
        clusterList.add(cluster);
      }

      // run mapper
      FuzzyKMeansMapper mapper = new FuzzyKMeansMapper();
      mapper.config(clusterList);

      DummyOutputCollector<Text, Text> mapCollector = new DummyOutputCollector<Text, Text>();
      for (Vector point : points) {
        mapper.map(new Text(), new Text(point.asFormatString()), mapCollector,
            null);
      }

      // now verify mapper output
      assertEquals("Mapper Keys", k + 1, mapCollector.getData().size());

      Map<String, Double> pointTotalProbMap = new HashMap<String, Double>();

      for (String key : mapCollector.getKeys()) {
        // SoftCluster cluster = SoftCluster.decodeCluster(key);
        List<Text> values = mapCollector.getValue(key);

        for (Text value : values) {
          String pointInfo = value.toString();
          double pointProb = Double.parseDouble(pointInfo.substring(0,
              pointInfo.indexOf(FuzzyKMeansDriver.MAPPER_VALUE_SEPARATOR)));

          String encodedVector = pointInfo.substring(pointInfo
              .indexOf(FuzzyKMeansDriver.MAPPER_VALUE_SEPARATOR) + 1);

          Double val = pointTotalProbMap.get(encodedVector);
          double probVal = 0.0;
          if (val != null) {
            probVal = val;
          }

          pointTotalProbMap.put(encodedVector, probVal + pointProb);
        }
      }

      for (Map.Entry<String, Double> entry : pointTotalProbMap.entrySet()) {
        String key = entry.getKey();
        double value = round(entry.getValue(), 1);

        assertEquals("total Prob for Point:" + key, 1.0, value);
      }
    }
  }

  public void testFuzzyKMeansCombiner() throws Exception {
    List<Vector> points = TestKmeansClustering
        .getPoints(TestKmeansClustering.reference);

    DistanceMeasure measure = new EuclideanDistanceMeasure();
    SoftCluster.config(measure, 0.001);

    for (int k = 0; k < points.size(); k++) {
      System.out.println("testKFuzzyKMeansMRJob k= " + k);
      // pick k initial cluster centers at random
      List<SoftCluster> clusterList = new ArrayList<SoftCluster>();

      for (int i = 0; i < k + 1; i++) {
        Vector vec = tweakValue(points.get(i));

        SoftCluster cluster = new SoftCluster(vec);
        cluster.addPoint(cluster.getCenter(), 1);
        clusterList.add(cluster);
      }

      // run mapper
      FuzzyKMeansMapper mapper = new FuzzyKMeansMapper();
      mapper.config(clusterList);

      DummyOutputCollector<Text, Text> mapCollector = new DummyOutputCollector<Text, Text>();
      for (Vector point : points) {
        mapper.map(new Text(), new Text(point.asFormatString()), mapCollector,
            null);
      }

      // run combiner
      DummyOutputCollector<Text, Text> combinerCollector = new DummyOutputCollector<Text, Text>();
      FuzzyKMeansCombiner combiner = new FuzzyKMeansCombiner();

      for (String key : mapCollector.getKeys()) {

        List<Text> values = mapCollector.getValue(key);
        combiner.reduce(new Text(key), values.iterator(), combinerCollector,
            null);
      }

      // now verify the combiner output
      assertEquals("Combiner Output", k + 1, combinerCollector.getData().size());

      for (String key : combinerCollector.getKeys()) {
        List<Text> values = combinerCollector.getValue(key);
        assertEquals("too many values", 1, values.size());
      }
    }
  }

  public void testFuzzyKMeansReducer() throws Exception {
    List<Vector> points = TestKmeansClustering
        .getPoints(TestKmeansClustering.reference);

    DistanceMeasure measure = new EuclideanDistanceMeasure();
    SoftCluster.config(measure, 0.001);

    for (int k = 0; k < points.size(); k++) {
      System.out.println("testKFuzzyKMeansMRJob k= " + k);
      // pick k initial cluster centers at random
      List<SoftCluster> clusterList = new ArrayList<SoftCluster>();

      for (int i = 0; i < k + 1; i++) {
        Vector vec = tweakValue(points.get(i));

        SoftCluster cluster = new SoftCluster(vec, i);
        // cluster.addPoint(cluster.getCenter(), 1);
        clusterList.add(cluster);
      }

      // run mapper
      FuzzyKMeansMapper mapper = new FuzzyKMeansMapper();
      mapper.config(clusterList);

      DummyOutputCollector<Text, Text> mapCollector = new DummyOutputCollector<Text, Text>();
      for (Vector point : points) {
        mapper.map(new Text(), new Text(point.asFormatString()), mapCollector,
            null);
      }

      // run combiner
      DummyOutputCollector<Text, Text> combinerCollector = new DummyOutputCollector<Text, Text>();
      FuzzyKMeansCombiner combiner = new FuzzyKMeansCombiner();

      for (String key : mapCollector.getKeys()) {

        List<Text> values = mapCollector.getValue(key);
        combiner.reduce(new Text(key), values.iterator(), combinerCollector,
            null);
      }

      // run reducer
      DummyOutputCollector<Text, Text> reducerCollector = new DummyOutputCollector<Text, Text>();
      FuzzyKMeansReducer reducer = new FuzzyKMeansReducer();
      reducer.config(clusterList);

      for (String key : combinerCollector.getKeys()) {
        List<Text> values = combinerCollector.getValue(key);
        reducer
            .reduce(new Text(key), values.iterator(), reducerCollector, null);
      }

      // now verify the reducer output
      assertEquals("Reducer Output", k + 1, combinerCollector.getData().size());

      // compute the reference result after one iteration and compare
      List<SoftCluster> reference = new ArrayList<SoftCluster>();
      for (int i = 0; i < k + 1; i++) {
        Vector vec = tweakValue(points.get(i));
        reference.add(new SoftCluster(vec, i));
      }
      iterateReference(points, reference, measure);
      for (SoftCluster key : reference) {
        String clusterId = key.getIdentifier();
        List<Text> values = reducerCollector.getValue(clusterId);
        SoftCluster cluster = SoftCluster.decodeCluster(values.get(0)
            .toString());
        System.out.println("ref= " + key.toString() + " cluster= "
            + cluster.toString());
        assertEquals(k + " center[" + key + "][0]", key.getCenter().get(0),
            cluster.getCenter().get(0));
        assertEquals(k + " center[" + key + "][1]", key.getCenter().get(1),
            cluster.getCenter().get(1));
      }
    }
  }

  public void testFuzzyKMeansClusterMapper() throws Exception {
    List<Vector> points = TestKmeansClustering
        .getPoints(TestKmeansClustering.reference);

    DistanceMeasure measure = new EuclideanDistanceMeasure();
    SoftCluster.config(measure, 0.001);

    for (int k = 0; k < points.size(); k++) {
      System.out.println("testKFuzzyKMeansMRJob k= " + k);
      // pick k initial cluster centers at random
      List<SoftCluster> clusterList = new ArrayList<SoftCluster>();

      for (int i = 0; i < k + 1; i++) {
        Vector vec = tweakValue(points.get(i));

        SoftCluster cluster = new SoftCluster(vec, i);
        cluster.addPoint(cluster.getCenter(), 1);
        clusterList.add(cluster);
      }

      // run mapper
      FuzzyKMeansMapper mapper = new FuzzyKMeansMapper();
      mapper.config(clusterList);

      DummyOutputCollector<Text, Text> mapCollector = new DummyOutputCollector<Text, Text>();
      for (Vector point : points) {
        mapper.map(new Text(), new Text(point.asFormatString()), mapCollector,
            null);
      }

      // run combiner
      DummyOutputCollector<Text, Text> combinerCollector = new DummyOutputCollector<Text, Text>();
      FuzzyKMeansCombiner combiner = new FuzzyKMeansCombiner();

      for (String key : mapCollector.getKeys()) {

        List<Text> values = mapCollector.getValue(key);
        combiner.reduce(new Text(key), values.iterator(), combinerCollector,
            null);
      }

      // run reducer
      DummyOutputCollector<Text, Text> reducerCollector = new DummyOutputCollector<Text, Text>();
      FuzzyKMeansReducer reducer = new FuzzyKMeansReducer();
      reducer.config(clusterList);

      for (String key : combinerCollector.getKeys()) {
        List<Text> values = combinerCollector.getValue(key);
        reducer
            .reduce(new Text(key), values.iterator(), reducerCollector, null);
      }

      // run clusterMapper
      List<SoftCluster> reducerCluster = new ArrayList<SoftCluster>();

      for (String key : reducerCollector.getKeys()) {
        List<Text> values = reducerCollector.getValue(key);
        reducerCluster.add(SoftCluster.decodeCluster(values.get(0).toString()));
      }

      DummyOutputCollector<Text, Text> clusterMapperCollector = new DummyOutputCollector<Text, Text>();
      FuzzyKMeansClusterMapper clusterMapper = new FuzzyKMeansClusterMapper();
      clusterMapper.config(reducerCluster);
      for (Vector point : points) {
        clusterMapper.map(new Text(), new Text(point.asFormatString()),
            clusterMapperCollector, null);
      }

      // now run for one iteration of referencefuzzykmeans and compare the
      // results
      // compute the reference result after one iteration and compare
      List<SoftCluster> reference = new ArrayList<SoftCluster>();
      for (int i = 0; i < k + 1; i++) {
        Vector vec = tweakValue(points.get(i));
        reference.add(new SoftCluster(vec, i));
      }
      Map<String, String> pointClusterInfo = new HashMap<String, String>();
      referenceFuzzyKMeans(points, reference, pointClusterInfo,
          EuclideanDistanceMeasure.class.getName(), 0.001, 1);

      // Now compare the clustermapper results with reducer
      for (String key : clusterMapperCollector.getKeys()) {
        List<Text> value = clusterMapperCollector.getValue(key);

        String refValue = pointClusterInfo.get(key);
        String clusterInfoStr = refValue.substring(1, refValue.length() - 1);
        String[] refClusterInfoList = clusterInfoStr.split(" ");
        assertEquals("Number of clusters", k + 1, refClusterInfoList.length);
        Map<String, Double> refClusterInfoMap = new HashMap<String, Double>();
        for (String clusterInfo : refClusterInfoList) {
          String[] clusterProb = clusterInfo.split(":");
          double clusterProbVal = Double.parseDouble(clusterProb[1]);
          refClusterInfoMap.put(clusterProb[0], clusterProbVal);
        }

        String[] clusterInfoList = value.get(0).toString().replace("[", "")
            .replace("]", "").split(" ");
        assertEquals("Number of clusters", k + 1, clusterInfoList.length);
        for (String clusterInfo : refClusterInfoList) {
          String[] clusterProb = clusterInfo.split(":");
          double clusterProbVal = Double.parseDouble(clusterProb[1]);
          assertEquals(k + " point:" + key + ": Cluster:" + clusterProb[0],
              refClusterInfoMap.get(clusterProb[0]), clusterProbVal);
        }
      }
    }
  }

}
TOP

Related Classes of org.apache.mahout.clustering.fuzzykmeans.TestFuzzyKmeansClustering

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.