new ArrayList<CandidateCluster>(kappa);
for (int r = 0; r < rows; /* no auto-increment */) {
for ( ; facilities.size() <= kappa && r < rows; ++r) {
DoubleVector x = matrix.getRowVector(r);
CandidateCluster closest = null;
// Delta is ultimately assigned the lowest inverse-similarity
// (distance) to any of the current facilities' center of mass
double delta = Double.MAX_VALUE;
for (CandidateCluster y : facilities) {
double similarity =
simFunc.sim(x, y.centerOfMass());
double invSim = invertSim(similarity);
if (invSim < delta) {
delta = invSim;
closest = y;
}
}
// Base case: If this is the first data point and there are no
// other facilities
//
// Or if we surpass the probability of a new event occurring
// (line 6)
if (closest == null || Math.random() < delta / f) {
CandidateCluster fac = new CandidateCluster();
fac.add(r, x);
facilities.add(fac);
}
// Otherwise, add this data point to an existing facility
else {
closest.add(r, x);
}
}
// If we still have data points left to process (line 10)
if (r < rows) {
// Check whether we have more than kappa clusters (line 11).
// Kappa provides the upper bound on the clusters (facilities)
// that are kept at any given time. If there are more, then we
// need to consolidate facilities
while (facilities.size() > kappa) {
f *= beta;
int curNumFacilities = facilities.size();
List<CandidateCluster> consolidated =
new ArrayList<CandidateCluster>(kappa);
consolidated.add(facilities.get(0));
for (int j = 1; j < curNumFacilities; ++j) {
CandidateCluster x = facilities.get(j);
int pointsAssigned = x.size();
// Compute the similarity of this facility to all other
// consolidated facilities. Delta represents the lowest
// inverse-similarity (distance) to another facility.
// See line 17 of the algorithm.
double delta = Double.MAX_VALUE;
CandidateCluster closest = null;
for (CandidateCluster y : consolidated) {
double similarity =
simFunc.sim(x.centerOfMass(), y.centerOfMass());
double invSim = invertSim(similarity);
if (invSim < delta) {
delta = invSim;
closest = y;
}
}
// Use (pointsAssigned * delta / f) as a threshold for
// whether this facility could constitute a new event.
// If a random check is less than it, then we nominate
// this facility to continue.
if (Math.random() < (pointsAssigned * delta) / f) {
consolidated.add(x);
}
// Otherwise, we consolidate the points in this
// community to the closest facility
else {
assert closest != null : "no closest facility";
closest.merge(x);
}
}
verbose(LOGGER, "Consolidated %d facilities down to %d",
facilities.size(), consolidated.size());
facilities = consolidated;
}
}
// Once we have processed all of the items in the stream (line 23 of
// algorithm), reduce the kappa clusters into k clusters.
else {
// Edge case for when we already have fewer facilities than we
// need
if (facilities.size() <= numClusters) {
verbose(LOGGER, "Had fewer facilities, %d, than the " +
"requested number of clusters %d",
facilities.size(), numClusters);
// There's no point in reducing the number of facilities
// further since we're under the desired amount, nor can we
// go back and increase the number of facilities since all
// the data has been seen at this point. Therefore, just
// loop through the candidates and report their assignments.
Assignment[] assignments = new Assignment[rows];
int numFacilities = facilities.size();
for (int j = 0; j < numFacilities; ++j) {
CandidateCluster fac = facilities.get(j);
veryVerbose(LOGGER, "Facility %d had a center of mass at %s",
j, fac.centerOfMass());
int clusterId = j;
IntIterator iter = fac.indices().iterator();
while (iter.hasNext()) {
int row = iter.nextInt();
assignments[row] =
new HardAssignment(clusterId);
}
}
return new Assignments(numClusters, assignments, matrix);
}
else {
verbose(LOGGER, "Had more than %d facilities, " +
"consolidating to %d", facilities.size(),
numClusters);
List<DoubleVector> facilityCentroids =
new ArrayList<DoubleVector>(facilities.size());
int[] weights = new int[facilities.size()];
int i = 0;
for (CandidateCluster fac : facilities) {
facilityCentroids.add(fac.centerOfMass());
weights[i++] = fac.size();
}
// Wrap the facilities centroids in a matrix for convenience
Matrix m = Matrices.asMatrix(facilityCentroids);
// Select the initial seed points for reducing the kappa
// clusters to k using the generalized ORSS selection
// process, which supports data comparisons other than
// Euclidean distance
GeneralizedOrssSeed orss = new GeneralizedOrssSeed(simFunc);
DoubleVector[] centroids = orss.chooseSeeds(numClusters, m);
assert nonNullCentroids(centroids)
: "ORSS seed returned too few centroids";
// This records the assignments of the kappa facilities to
// the k centers. Initially, everything is assigned to the
// same center and iterations repeat until convergence.
int[] facilityAssignments = new int[facilities.size()];
// Using those facilities as starting points, run k-means on
// the facility centroids until no facilities change their
// membership.
int numChanged = 0;
int kmeansIters = 0;
do {
numChanged = 0;
// Recompute the new centroids each time
DoubleVector[] updatedCentroids =
new DoubleVector[numClusters];
for (i = 0; i < updatedCentroids.length; ++i)
updatedCentroids[i] = new DenseVector(cols);
int[] updatedCentroidSizes = new int[numClusters];
double similaritySum = 0;
// For each CandidateCluster find the most similar centroid
i = 0;
for (CandidateCluster fac : facilities) {
int mostSim = -1;
double highestSim = -1;
for (int j = 0; j < centroids.length; ++j) {
// System.out.printf("centroids[%d]: %s%n fac.centroid(): %s%n",
// j, centroids[j],
// fac.centerOfMass());
double sim = simFunc.sim(centroids[j],
fac.centerOfMass());
if (sim > highestSim) {
highestSim = sim;
mostSim = j;
}
}
// For the most similar centroid, update its center
// of mass for the next round with the weighted
// vector
VectorMath.add(updatedCentroids[mostSim],
fac.sum());
updatedCentroidSizes[mostSim] += fac.size();
int curAssignment = facilityAssignments[i];
facilityAssignments[i] = mostSim;
similaritySum += highestSim;
if (curAssignment != mostSim) {
veryVerbose(LOGGER, "Facility %d changed its " +
"centroid from %d to %d",
i, curAssignment, mostSim);
numChanged++;
}
i++;
}
// Once all the facilities have been assigned to one of
// the k-centroids, recompute the centroids by
// normalizing the sum of the weighted vectors according
// the number of points
for (int j = 0; j < updatedCentroids.length; ++j) {
DoubleVector v = updatedCentroids[j];
int size = updatedCentroidSizes[j];
for (int k = 0; k < cols; ++k)
v.set(k, v.get(k) / size);
// Update this centroid for the next round
centroids[j] = v;
}
veryVerbose(LOGGER, "%d centroids swapped their facility",