// Source Code of de.lmu.ifi.dbs.elki.algorithm.clustering.SNNClustering$Parameterizer

package de.lmu.ifi.dbs.elki.algorithm.clustering;


/*
 This file is part of ELKI:
 Environment for Developing KDD-Applications Supported by Index-Structures


 Copyright (C) 2011
 Ludwig-Maximilians-Universität München
 Lehr- und Forschungseinheit für Datenbanksysteme
 ELKI Development Team


 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU Affero General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.


 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU Affero General Public License for more details.


 You should have received a copy of the GNU Affero General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */


import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;


import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm;
import de.lmu.ifi.dbs.elki.data.Cluster;
import de.lmu.ifi.dbs.elki.data.Clustering;
import de.lmu.ifi.dbs.elki.data.model.ClusterModel;
import de.lmu.ifi.dbs.elki.data.model.Model;
import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.database.ids.DBID;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.query.similarity.SimilarityQuery;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.distance.distancevalue.IntegerDistance;
import de.lmu.ifi.dbs.elki.distance.similarityfunction.SharedNearestNeighborSimilarityFunction;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;
import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress;
import de.lmu.ifi.dbs.elki.utilities.ClassGenericsUtil;
import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DistanceParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;


/**
 * <p>
 * Shared nearest neighbor clustering.
 * </p>
 * <p>
 * Reference: L. Ertöz, M. Steinbach, V. Kumar: Finding Clusters of Different
 * Sizes, Shapes, and Densities in Noisy, High Dimensional Data. <br>
 * In: Proc. of SIAM Data Mining (SDM), 2003.
 * </p>
 * 
 * @author Arthur Zimek
 * 
 * @apiviz.uses SharedNearestNeighborSimilarityFunction
 * 
 * @param <O> the type of Object the algorithm is applied on
 */
@Title("SNN: Shared Nearest Neighbor Clustering")
@Description("Algorithm to find shared-nearest-neighbors-density-connected sets in a database based on the " + "parameters 'minPts' and 'epsilon' (specifying a volume). " + "These two parameters determine a density threshold for clustering.")
@Reference(authors = "L. Ertöz, M. Steinbach, V. Kumar", title = "Finding Clusters of Different Sizes, Shapes, and Densities in Noisy, High Dimensional Data", booktitle = "Proc. of SIAM Data Mining (SDM), 2003", url = "http://www.siam.org/meetings/sdm03/proceedings/sdm03_05.pdf")
public class SNNClustering<O> extends AbstractAlgorithm<Clustering<Model>> implements ClusteringAlgorithm<Clustering<Model>> {
  /**
   * The logger for this class.
   */
  private static final Logging logger = Logging.getLogger(SNNClustering.class);


  /**
   * Parameter to specify the minimum SNN density, must be an integer greater
   * than 0.
   */
  public static final OptionID EPSILON_ID = OptionID.getOrCreateOptionID("snn.epsilon", "The minimum SNN density.");


  /**
   * Holds the value of {@link #EPSILON_ID}.
   */
  private IntegerDistance epsilon;


  /**
   * Parameter to specify the threshold for minimum number of points in the
   * epsilon-SNN-neighborhood of a point, must be an integer greater than 0.
   */
  public static final OptionID MINPTS_ID = OptionID.getOrCreateOptionID("snn.minpts", "Threshold for minimum number of points in " + "the epsilon-SNN-neighborhood of a point.");


  /**
   * Holds the value of {@link #MINPTS_ID}.
   */
  private int minpts;


  /**
   * Holds a list of clusters found.
   */
  protected List<ModifiableDBIDs> resultList;


  /**
   * Holds a set of noise.
   */
  protected ModifiableDBIDs noise;


  /**
   * Holds a set of processed ids.
   */
  protected ModifiableDBIDs processedIDs;


  /**
   * The similarity function for the shared nearest neighbor similarity.
   */
  private SharedNearestNeighborSimilarityFunction<O> similarityFunction;


  /**
   * Constructor.
   * 
   * @param similarityFunction Similarity function
   * @param epsilon Epsilon
   * @param minpts Minpts
   */
  public SNNClustering(SharedNearestNeighborSimilarityFunction<O> similarityFunction, IntegerDistance epsilon, int minpts) {
    super();
    this.similarityFunction = similarityFunction;
    this.epsilon = epsilon;
    this.minpts = minpts;
  }


  /**
   * Perform SNN clustering
   * 
   * @param database Database
   * @param relation Relation
   * @return Result
   */
  public Clustering<Model> run(Database database, Relation<O> relation) {
    SimilarityQuery<O, IntegerDistance> snnInstance = similarityFunction.instantiate(relation);


    FiniteProgress objprog = logger.isVerbose() ? new FiniteProgress("SNNClustering", relation.size(), logger) : null;
    IndefiniteProgress clusprog = logger.isVerbose() ? new IndefiniteProgress("Number of clusters", logger) : null;
    resultList = new ArrayList<ModifiableDBIDs>();
    noise = DBIDUtil.newHashSet();
    processedIDs = DBIDUtil.newHashSet(relation.size());
    if(relation.size() >= minpts) {
      for(DBID id : snnInstance.getRelation().iterDBIDs()) {
        if(!processedIDs.contains(id)) {
          expandCluster(snnInstance, id, objprog, clusprog);
          if(processedIDs.size() == relation.size() && noise.size() == 0) {
            break;
          }
        }
        if(objprog != null && clusprog != null) {
          objprog.setProcessed(processedIDs.size(), logger);
          clusprog.setProcessed(resultList.size(), logger);
        }
      }
    }
    else {
      for(DBID id : snnInstance.getRelation().iterDBIDs()) {
        noise.add(id);
        if(objprog != null && clusprog != null) {
          objprog.setProcessed(noise.size(), logger);
          clusprog.setProcessed(resultList.size(), logger);
        }
      }
    }
    // Finish progress logging
    if(objprog != null && clusprog != null) {
      objprog.ensureCompleted(logger);
      clusprog.setCompleted(logger);
    }


    Clustering<Model> result = new Clustering<Model>("Shared-Nearest-Neighbor Clustering", "snn-clustering");
    for(Iterator<ModifiableDBIDs> resultListIter = resultList.iterator(); resultListIter.hasNext();) {
      result.addCluster(new Cluster<Model>(resultListIter.next(), ClusterModel.CLUSTER));
    }
    result.addCluster(new Cluster<Model>(noise, true, ClusterModel.CLUSTER));


    return result;
  }


  /**
   * Returns the shared nearest neighbors of the specified query object in the
   * given database.
   * 
   * @param snnInstance shared nearest neighbors
   * @param queryObject the query object
   * @return the shared nearest neighbors of the specified query object in the
   *         given database
   */
  protected List<DBID> findSNNNeighbors(SimilarityQuery<O, IntegerDistance> snnInstance, DBID queryObject) {
    List<DBID> neighbors = new LinkedList<DBID>();
    for(DBID id : snnInstance.getRelation().iterDBIDs()) {
      if(snnInstance.similarity(queryObject, id).compareTo(epsilon) >= 0) {
        neighbors.add(id);
      }
    }
    return neighbors;
  }


  /**
   * DBSCAN-function expandCluster adapted to SNN criterion.
   * <p/>
   * <p/>
   * Border-Objects become members of the first possible cluster.
   * 
   * @param snnInstance shared nearest neighbors
   * @param startObjectID potential seed of a new potential cluster
   * @param objprog the progress object to report about the progress of
   *        clustering
   */
  protected void expandCluster(SimilarityQuery<O, IntegerDistance> snnInstance, DBID startObjectID, FiniteProgress objprog, IndefiniteProgress clusprog) {
    List<DBID> seeds = findSNNNeighbors(snnInstance, startObjectID);


    // startObject is no core-object
    if(seeds.size() < minpts) {
      noise.add(startObjectID);
      processedIDs.add(startObjectID);
      if(objprog != null && clusprog != null) {
        objprog.setProcessed(processedIDs.size(), logger);
        clusprog.setProcessed(resultList.size(), logger);
      }
      return;
    }


    // try to expand the cluster
    ModifiableDBIDs currentCluster = DBIDUtil.newArray();
    for(DBID seed : seeds) {
      if(!processedIDs.contains(seed)) {
        currentCluster.add(seed);
        processedIDs.add(seed);
      }
      else if(noise.contains(seed)) {
        currentCluster.add(seed);
        noise.remove(seed);
      }
    }
    seeds.remove(0);


    while(seeds.size() > 0) {
      DBID o = seeds.remove(0);
      List<DBID> neighborhood = findSNNNeighbors(snnInstance, o);


      if(neighborhood.size() >= minpts) {
        for(DBID p : neighborhood) {
          boolean inNoise = noise.contains(p);
          boolean unclassified = !processedIDs.contains(p);
          if(inNoise || unclassified) {
            if(unclassified) {
              seeds.add(p);
            }
            currentCluster.add(p);
            processedIDs.add(p);
            if(inNoise) {
              noise.remove(p);
            }
          }
        }
      }


      if(objprog != null && clusprog != null) {
        objprog.setProcessed(processedIDs.size(), logger);
        int numClusters = currentCluster.size() > minpts ? resultList.size() + 1 : resultList.size();
        clusprog.setProcessed(numClusters, logger);
      }


      if(processedIDs.size() == snnInstance.getRelation().size() && noise.size() == 0) {
        break;
      }
    }
    if(currentCluster.size() >= minpts) {
      resultList.add(currentCluster);
    }
    else {
      for(DBID id : currentCluster) {
        noise.add(id);
      }
      noise.add(startObjectID);
      processedIDs.add(startObjectID);
    }
  }


  @Override
  public TypeInformation[] getInputTypeRestriction() {
    return TypeUtil.array(similarityFunction.getInputTypeRestriction());
  }


  @Override
  protected Logging getLogger() {
    return logger;
  }


  /**
   * Parameterization class.
   * 
   * @author Erich Schubert
   * 
   * @apiviz.exclude
   * 
   * @param <O> object type
   */
  public static class Parameterizer<O> extends AbstractParameterizer {
    protected IntegerDistance epsilon;


    protected int minpts;


    private SharedNearestNeighborSimilarityFunction<O> similarityFunction;


    @Override
    protected void makeOptions(Parameterization config) {
      super.makeOptions(config);
      Class<SharedNearestNeighborSimilarityFunction<O>> cls = ClassGenericsUtil.uglyCastIntoSubclass(SharedNearestNeighborSimilarityFunction.class);
      similarityFunction = config.tryInstantiate(cls);


      DistanceParameter<IntegerDistance> epsilonP = new DistanceParameter<IntegerDistance>(EPSILON_ID, IntegerDistance.FACTORY);
      if(config.grab(epsilonP)) {
        epsilon = epsilonP.getValue();
      }


      IntParameter minptsP = new IntParameter(MINPTS_ID, new GreaterConstraint(0));
      if(config.grab(minptsP)) {
        minpts = minptsP.getValue();
      }
    }


    @Override
    protected SNNClustering<O> makeInstance() {
      return new SNNClustering<O>(similarityFunction, epsilon, minpts);
    }
  }
}
// Source Code of de.lmu.ifi.dbs.elki.algorithm.clustering.SNNClustering$Parameterizer

// Related Classes of de.lmu.ifi.dbs.elki.algorithm.clustering.SNNClustering$Parameterizer