// Package edu.ucla.sspace.clustering

// Source Code of edu.ucla.sspace.clustering.BisectingKMeans

/*
* Copyright 2010 Keith Stevens
*
* This file is part of the S-Space package and is covered under the terms and
* conditions therein.
*
* The S-Space package is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as published
* by the Free Software Foundation and distributed hereunder to you.
*
* THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
* EXPRESS OR IMPLIED ARE MADE.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
* NO REPRESENTATIONS OR WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY
* PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
* WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
* RIGHTS.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

package edu.ucla.sspace.clustering;

import edu.ucla.sspace.matrix.Matrix;
import edu.ucla.sspace.matrix.Matrices;

import edu.ucla.sspace.vector.DoubleVector;

import java.util.ArrayList;
import java.util.List;
import java.util.Properties;

import java.util.logging.Logger;


/**
* An implementation of the Bisecting K-Means algorithm, also known as Repeated
* Bisections.  This implementation is based on the following paper:
*
*   <li style="font-family:Garamond, Georgia, serif">Michael Steinbach,
*   George Karypis, Vipin Kumar.  "A comparison of document clustering
*   techniques," in <i>KDD Workshop on Text Mining</i>, 200</li>
*
* This clustering algorithm improves upon the standard K-Means algorithm by
* taking a data set and repeatedly splitting the data points into two regions.
* Initially all data points are separated into two clusters.  Then, until the
* desired number of clusters are created, the largest cluster is divided using
* K-Means with K equal to 2.  This implementation relies on the {@link
* DirectClustering} implementation.  Any properties passed to this clustering
* method are passed onto the {@link DirectClustering} algorithm, allowing the
* user to set the desired seeding method.
*
* @see KMeansClustering
*
* @author Keith Stevens
*/
public class BisectingKMeans implements Clustering {

    /**
     * Not implemented.
     */
    public Assignments cluster(Matrix dataPoints, Properties props) {
        throw new UnsupportedOperationException(
                "KMeansClustering requires that the " +
                "number of clusters be specified");
    }

    /**
     * {@inheritDoc}
     */
    public Assignments cluster(Matrix dataPoints,
                               int numClusters,
                               Properties props) {
        // Handle a simple base case.
        if (numClusters <= 1) {
            Assignment[] assignments = new Assignment[dataPoints.rows()];
            for (int i = 0; i < assignments.length; ++i)
                assignments[i] = new HardAssignment(0);
            return new Assignments(numClusters, assignments, dataPoints);
        }

        // Create a count of cluster assignments.
        int[] numAssignments = new int[numClusters];

        // Create a list of lists.  The inner list represents the vectors
        // assigned to a particular cluster.  We use this method so that we can
        // easily transform the cluster to a Matrix
        List<List<DoubleVector>> clusters = new ArrayList<List<DoubleVector>>(
                numClusters);
        for (int c = 0; c < numClusters; ++c)
            clusters.add(new ArrayList<DoubleVector>());

        Clustering clustering = new DirectClustering();
        // Make the first bisection.
        Assignment[] assignments =
            clustering.cluster(dataPoints, 2, props).assignments();

        // Count the number of assignments made to each cluster and move the
        // vectors in to the corresponding list.
        for (int i = 0; i < assignments.length; ++i) {
            int assignment = assignments[i].assignments()[0];
            numAssignments[assignment]++;
            clusters.get(assignment).add(dataPoints.getRowVector(i));
        }

        // Generate the numClusters - 2 clusters by finding the largest cluster
        // and bisecting it.  Of the 2 resulting clusters, one will maintain the
        // same cluster index and the other will be given a new cluster index,
        // namely k, the current cluster number.
        for (int k = 2; k < numClusters; k++) {
            // Find the largest cluster.
            int largestSize = 0;
            int largestIndex = 0;
            for (int c = 0; c < numClusters; ++c) {
                if (numAssignments[c] > largestSize) {
                    largestSize = numAssignments[c];
                    largestIndex = c;
                }
            }

            // Get the list of vectors representing the cluster being split and
            // the cluster that will hold the vectors split off from this
            // cluster.
            List<DoubleVector> originalCluster = clusters.get(largestIndex);
            List<DoubleVector> newCluster = clusters.get(k);

            // Split the largest cluster.
            Matrix clusterToSplit = Matrices.asMatrix(originalCluster);
            Assignment[] newAssignments =
                clustering.cluster(clusterToSplit, 2, props).assignments();

            // Clear the lists for cluster being split and the new cluster.
            // Also clear the number of assignments.
            originalCluster.clear();
            newCluster.clear();
            numAssignments[largestIndex] = 0;
            numAssignments[k] = 0;

            // Reassign data points in the largest cluster.  Data points
            // assigned to the 0 cluster maintain their cluster number in the
            // real assignment list.  Data points assigned to cluster 1 get the
            // new cluster number, k. 
            for (int i = 0, j = 0; i < dataPoints.rows(); ++i) {
                if (assignments[i].assignments()[0] == largestIndex) {
                    // Make the assignment for vectors that keep their
                    // assignment.
                    if (newAssignments[j].assignments()[0] == 0) {
                        originalCluster.add(dataPoints.getRowVector(i));
                        numAssignments[largestIndex]++;
                    }
                    // Make the assignment for vectors that have changed their
                    // assignment.
                    else {
                        newCluster.add(dataPoints.getRowVector(i));
                        assignments[i] = new HardAssignment(k);
                        numAssignments[k]++;
                    }
                    j++;
                }
            }
        }
        return new Assignments(numClusters, assignments, dataPoints);
    }

    public String toString() {
        return "BisectingKMeans";
    }
}
// TOP

// Related Classes of edu.ucla.sspace.clustering.BisectingKMeans

// TOP
// Copyright © 2018 www.massapi.com. All rights reserved.
// All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.