package de.lmu.ifi.dbs.elki.algorithm.clustering;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
Copyright (C) 2011
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm;
import de.lmu.ifi.dbs.elki.data.Cluster;
import de.lmu.ifi.dbs.elki.data.Clustering;
import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.model.ClusterModel;
import de.lmu.ifi.dbs.elki.data.model.Model;
import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.database.ids.DBID;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.query.DistanceResultPair;
import de.lmu.ifi.dbs.elki.database.query.range.RangeQuery;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction;
import de.lmu.ifi.dbs.elki.distance.distancefunction.EuclideanDistanceFunction;
import de.lmu.ifi.dbs.elki.distance.distancefunction.IndexBasedDistanceFunction;
import de.lmu.ifi.dbs.elki.distance.distancefunction.LocallyWeightedDistanceFunction;
import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance;
import de.lmu.ifi.dbs.elki.distance.distancevalue.DoubleDistance;
import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;
import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ChainedParameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ListParameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DistanceParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
/**
* Provides an abstract algorithm requiring a VarianceAnalysisPreprocessor.
*
* @author Arthur Zimek
* @param <V> the type of NumberVector handled by this Algorithm
*/
public abstract class AbstractProjectedDBSCAN<R extends Clustering<Model>, V extends NumberVector<V, ?>> extends AbstractAlgorithm<R> implements ClusteringAlgorithm<R> {
/**
* Parameter to specify the distance function to determine the distance
* between database objects, must extend
* {@link de.lmu.ifi.dbs.elki.distance.distancefunction.LocallyWeightedDistanceFunction}
* .
* <p>
* Key: {@code -projdbscan.distancefunction}
* </p>
* <p>
* Default value:
* {@link de.lmu.ifi.dbs.elki.distance.distancefunction.LocallyWeightedDistanceFunction}
* </p>
*/
public static final OptionID OUTER_DISTANCE_FUNCTION_ID = OptionID.getOrCreateOptionID("projdbscan.outerdistancefunction", "Distance function to determine the distance between database objects.");
/**
* Parameter distance function
*/
public static final OptionID INNER_DISTANCE_FUNCTION_ID = OptionID.getOrCreateOptionID("projdbscan.distancefunction", "Distance function to determine the neighbors for variance analysis.");
/**
* Parameter to specify the maximum radius of the neighborhood to be
* considered, must be suitable to {@link LocallyWeightedDistanceFunction}.
* <p>
* Key: {@code -projdbscan.epsilon}
* </p>
*/
public static final OptionID EPSILON_ID = OptionID.getOrCreateOptionID("projdbscan.epsilon", "The maximum radius of the neighborhood to be considered.");
/**
* Parameter to specify the intrinsic dimensionality of the clusters to find,
* must be an integer greater than 0.
* <p>
* Key: {@code -projdbscan.lambda}
* </p>
*/
public static final OptionID LAMBDA_ID = OptionID.getOrCreateOptionID("projdbscan.lambda", "The intrinsic dimensionality of the clusters to find.");
/**
* Parameter to specify the threshold for minimum number of points in the
* epsilon-neighborhood of a point, must be an integer greater than 0.
* <p>
* Key: {@code -projdbscan.minpts}
* </p>
*/
public static final OptionID MINPTS_ID = OptionID.getOrCreateOptionID("projdbscan.minpts", "Threshold for minimum number of points in " + "the epsilon-neighborhood of a point.");
/**
* Holds the instance of the distance function specified by
* {@link #INNER_DISTANCE_FUNCTION_ID}.
*/
private LocallyWeightedDistanceFunction<V> distanceFunction;
/**
* Holds the value of {@link #EPSILON_ID}.
*/
protected DoubleDistance epsilon;
/**
* Holds the value of {@link #LAMBDA_ID}.
*/
private int lambda;
/**
* Holds the value of {@link #MINPTS_ID}.
*/
protected int minpts = 1;
/**
* Holds a list of clusters found.
*/
private List<ModifiableDBIDs> resultList;
/**
* Holds a set of noise.
*/
private ModifiableDBIDs noise;
/**
* Holds a set of processed ids.
*/
private ModifiableDBIDs processedIDs;
/**
* Constructor.
*
* @param epsilon Epsilon
* @param minpts MinPts parameter
* @param distanceFunction Outer distance function
* @param lambda Lambda value
*/
public AbstractProjectedDBSCAN(DoubleDistance epsilon, int minpts, LocallyWeightedDistanceFunction<V> distanceFunction, int lambda) {
super();
this.epsilon = epsilon;
this.minpts = minpts;
this.distanceFunction = distanceFunction;
this.lambda = lambda;
}
public Clustering<Model> run(Database database, Relation<V> relation) throws IllegalStateException {
FiniteProgress objprog = getLogger().isVerbose() ? new FiniteProgress("Processing objects", relation.size(), getLogger()) : null;
IndefiniteProgress clusprog = getLogger().isVerbose() ? new IndefiniteProgress("Number of clusters", getLogger()) : null;
resultList = new ArrayList<ModifiableDBIDs>();
noise = DBIDUtil.newHashSet();
processedIDs = DBIDUtil.newHashSet(relation.size());
LocallyWeightedDistanceFunction.Instance<V> distFunc = distanceFunction.instantiate(relation);
RangeQuery<V, DoubleDistance> rangeQuery = database.getRangeQuery(distFunc);
if(relation.size() >= minpts) {
for(DBID id : relation.iterDBIDs()) {
if(!processedIDs.contains(id)) {
expandCluster(distFunc, rangeQuery, id, objprog, clusprog);
if(processedIDs.size() == relation.size() && noise.size() == 0) {
break;
}
}
if(objprog != null && clusprog != null) {
objprog.setProcessed(processedIDs.size(), getLogger());
clusprog.setProcessed(resultList.size(), getLogger());
}
}
}
else {
for(DBID id : relation.iterDBIDs()) {
noise.add(id);
if(objprog != null && clusprog != null) {
objprog.setProcessed(processedIDs.size(), getLogger());
clusprog.setProcessed(resultList.size(), getLogger());
}
}
}
if(objprog != null && clusprog != null) {
objprog.setProcessed(processedIDs.size(), getLogger());
clusprog.setProcessed(resultList.size(), getLogger());
}
Clustering<Model> result = new Clustering<Model>(getLongResultName(), getShortResultName());
for(Iterator<ModifiableDBIDs> resultListIter = resultList.iterator(); resultListIter.hasNext();) {
Cluster<Model> c = new Cluster<Model>(resultListIter.next(), ClusterModel.CLUSTER);
result.addCluster(c);
}
Cluster<Model> n = new Cluster<Model>(noise, true, ClusterModel.CLUSTER);
result.addCluster(n);
if(objprog != null && clusprog != null) {
objprog.setProcessed(processedIDs.size(), getLogger());
clusprog.setProcessed(resultList.size(), getLogger());
}
// Signal that the progress has completed.
if(objprog != null && clusprog != null) {
objprog.ensureCompleted(getLogger());
clusprog.setCompleted(getLogger());
}
return result;
}
/**
* Return the long result name.
*
* @return Long name for result
*/
public abstract String getLongResultName();
/**
* Return the short result name.
*
* @return Short name for result
*/
public abstract String getShortResultName();
/**
* ExpandCluster function of DBSCAN.
*
* @param distFunc Distance query to use
* @param rangeQuery Range query
* @param startObjectID the object id of the database object to start the
* expansion with
* @param objprog the progress object for logging the current status
*/
protected void expandCluster(LocallyWeightedDistanceFunction.Instance<V> distFunc, RangeQuery<V, DoubleDistance> rangeQuery, DBID startObjectID, FiniteProgress objprog, IndefiniteProgress clusprog) {
Integer corrDim = distFunc.getIndex().getLocalProjection(startObjectID).getCorrelationDimension();
if(getLogger().isDebugging()) {
getLogger().debugFine("EXPAND CLUSTER id = " + startObjectID + " " + corrDim + "\n#clusters: " + resultList.size());
}
// euclidean epsilon neighborhood < minpts OR local dimensionality >
// lambda -> noise
if(corrDim == null || corrDim > lambda) {
noise.add(startObjectID);
processedIDs.add(startObjectID);
if(objprog != null && clusprog != null) {
objprog.setProcessed(processedIDs.size(), getLogger());
clusprog.setProcessed(resultList.size(), getLogger());
}
return;
}
// compute weighted epsilon neighborhood
List<DistanceResultPair<DoubleDistance>> seeds = rangeQuery.getRangeForDBID(startObjectID, epsilon);
// neighbors < minPts -> noise
if(seeds.size() < minpts) {
noise.add(startObjectID);
processedIDs.add(startObjectID);
if(objprog != null && clusprog != null) {
objprog.setProcessed(processedIDs.size(), getLogger());
clusprog.setProcessed(resultList.size(), getLogger());
}
return;
}
// try to expand the cluster
ModifiableDBIDs currentCluster = DBIDUtil.newArray();
for(DistanceResultPair<DoubleDistance> seed : seeds) {
DBID nextID = seed.getDBID();
Integer nextID_corrDim = distFunc.getIndex().getLocalProjection(nextID).getCorrelationDimension();
// nextID is not reachable from start object
if(nextID_corrDim > lambda) {
continue;
}
if(!processedIDs.contains(nextID)) {
currentCluster.add(nextID);
processedIDs.add(nextID);
}
else if(noise.contains(nextID)) {
currentCluster.add(nextID);
noise.remove(nextID);
}
}
seeds.remove(0);
while(seeds.size() > 0) {
DBID q = seeds.remove(0).getDBID();
Integer corrDim_q = distFunc.getIndex().getLocalProjection(q).getCorrelationDimension();
// q forms no lambda-dim hyperplane
if(corrDim_q > lambda) {
continue;
}
List<DistanceResultPair<DoubleDistance>> reachables = rangeQuery.getRangeForDBID(q, epsilon);
if(reachables.size() > minpts) {
for(DistanceResultPair<DoubleDistance> r : reachables) {
Integer corrDim_r = distFunc.getIndex().getLocalProjection(r.getDBID()).getCorrelationDimension();
// r is not reachable from q
if(corrDim_r > lambda) {
continue;
}
boolean inNoise = noise.contains(r.getDBID());
boolean unclassified = !processedIDs.contains(r.getDBID());
if(inNoise || unclassified) {
if(unclassified) {
seeds.add(r);
}
currentCluster.add(r.getDBID());
processedIDs.add(r.getDBID());
if(inNoise) {
noise.remove(r.getDBID());
}
if(objprog != null && clusprog != null) {
objprog.setProcessed(processedIDs.size(), getLogger());
int numClusters = currentCluster.size() > minpts ? resultList.size() + 1 : resultList.size();
clusprog.setProcessed(numClusters, getLogger());
}
}
}
}
if(processedIDs.size() == distFunc.getRelation().size() && noise.size() == 0) {
break;
}
}
if(currentCluster.size() >= minpts) {
resultList.add(currentCluster);
}
else {
for(DBID id : currentCluster) {
noise.add(id);
}
noise.add(startObjectID);
processedIDs.add(startObjectID);
}
if(objprog != null && clusprog != null) {
objprog.setProcessed(processedIDs.size(), getLogger());
clusprog.setProcessed(resultList.size(), getLogger());
}
}
@Override
public TypeInformation[] getInputTypeRestriction() {
return TypeUtil.array(distanceFunction.getInputTypeRestriction());
}
/**
* Parameterization class.
*
* @author Erich Schubert
*
* @apiviz.exclude
*/
public static abstract class Parameterizer<V extends NumberVector<V, ?>, D extends Distance<D>> extends AbstractParameterizer {
protected DistanceFunction<V, D> innerdist;
protected D epsilon;
protected LocallyWeightedDistanceFunction<V> outerdist;
protected int minpts = -1;
protected Integer lambda;
protected void configInnerDistance(Parameterization config) {
ObjectParameter<DistanceFunction<V, D>> innerdistP = new ObjectParameter<DistanceFunction<V, D>>(AbstractProjectedDBSCAN.INNER_DISTANCE_FUNCTION_ID, DistanceFunction.class, EuclideanDistanceFunction.class);
if(config.grab(innerdistP)) {
innerdist = innerdistP.instantiateClass(config);
}
}
protected void configEpsilon(Parameterization config, DistanceFunction<V, D> innerdist) {
D distanceParser = innerdist != null ? innerdist.getDistanceFactory() : null;
DistanceParameter<D> epsilonP = new DistanceParameter<D>(EPSILON_ID, distanceParser);
if(config.grab(epsilonP)) {
epsilon = epsilonP.getValue();
}
}
protected void configMinPts(Parameterization config) {
IntParameter minptsP = new IntParameter(MINPTS_ID, new GreaterConstraint(0));
if(config.grab(minptsP)) {
minpts = minptsP.getValue();
}
}
protected void configOuterDistance(Parameterization config, D epsilon, int minpts, Class<?> preprocessorClass, DistanceFunction<V, D> innerdist) {
ObjectParameter<LocallyWeightedDistanceFunction<V>> outerdistP = new ObjectParameter<LocallyWeightedDistanceFunction<V>>(OUTER_DISTANCE_FUNCTION_ID, LocallyWeightedDistanceFunction.class, LocallyWeightedDistanceFunction.class);
if(config.grab(outerdistP)) {
// parameters for the distance function
ListParameterization distanceFunctionParameters = new ListParameterization();
// distanceFunctionParameters.addFlag(PreprocessorHandler.OMIT_PREPROCESSING_ID);
distanceFunctionParameters.addParameter(IndexBasedDistanceFunction.INDEX_ID, preprocessorClass);
distanceFunctionParameters.addParameter(AbstractProjectedDBSCAN.INNER_DISTANCE_FUNCTION_ID, innerdist);
distanceFunctionParameters.addParameter(AbstractProjectedDBSCAN.EPSILON_ID, epsilon);
distanceFunctionParameters.addParameter(AbstractProjectedDBSCAN.MINPTS_ID, minpts);
ChainedParameterization combinedConfig = new ChainedParameterization(distanceFunctionParameters, config);
combinedConfig.errorsTo(config);
outerdist = outerdistP.instantiateClass(combinedConfig);
}
}
protected void configLambda(Parameterization config) {
IntParameter lambdaP = new IntParameter(LAMBDA_ID, new GreaterConstraint(0));
if(config.grab(lambdaP)) {
lambda = lambdaP.getValue();
}
}
}
}