
Source Code of de.lmu.ifi.dbs.elki.algorithm.clustering.subspace.DiSH

package de.lmu.ifi.dbs.elki.algorithm.clustering.subspace;

/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures

Copyright (C) 2011
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/

import java.util.ArrayList;
import java.util.BitSet;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm;
import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm;
import de.lmu.ifi.dbs.elki.algorithm.clustering.OPTICS;
import de.lmu.ifi.dbs.elki.data.Cluster;
import de.lmu.ifi.dbs.elki.data.Clustering;
import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.Subspace;
import de.lmu.ifi.dbs.elki.data.model.SubspaceModel;
import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.ids.DBID;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.distance.distancefunction.IndexBasedDistanceFunction;
import de.lmu.ifi.dbs.elki.distance.distancefunction.ProxyDistanceFunction;
import de.lmu.ifi.dbs.elki.distance.distancefunction.subspace.DiSHDistanceFunction;
import de.lmu.ifi.dbs.elki.distance.distancevalue.AbstractDistance;
import de.lmu.ifi.dbs.elki.distance.distancevalue.PreferenceVectorBasedCorrelationDistance;
import de.lmu.ifi.dbs.elki.index.preprocessed.preference.DiSHPreferenceVectorIndex;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;
import de.lmu.ifi.dbs.elki.result.optics.ClusterOrderEntry;
import de.lmu.ifi.dbs.elki.result.optics.ClusterOrderResult;
import de.lmu.ifi.dbs.elki.utilities.ClassGenericsUtil;
import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil;
import de.lmu.ifi.dbs.elki.utilities.FormatUtil;
import de.lmu.ifi.dbs.elki.utilities.datastructures.hierarchy.HierarchyReferenceLists;
import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ChainedParameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ListParameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.TrackParameters;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
import de.lmu.ifi.dbs.elki.utilities.pairs.Pair;

/**
* <p>
* Algorithm for detecting subspace hierarchies.
* </p>
* <p>
* Reference: <br>
* E. Achtert, C. Böhm, H.-P. Kriegel, P. Kröger, I. Müller-Gorman, A. Zimek:
* Detection and Visualization of Subspace Cluster Hierarchies. <br>
* In Proc. 12th International Conference on Database Systems for Advanced
* Applications (DASFAA), Bangkok, Thailand, 2007.
* </p>
*
* @author Elke Achtert
*
* @apiviz.uses DiSHPreferenceVectorIndex
* @apiviz.uses DiSHDistanceFunction
* @apiviz.has SubspaceModel
*
* @param <V> the type of NumberVector handled by this Algorithm
*/
@Title("DiSH: Detecting Subspace cluster Hierarchies")
@Description("Algorithm to find hierarchical correlation clusters in subspaces.")
@Reference(authors = "E. Achtert, C. Böhm, H.-P. Kriegel, P. Kröger, I. Müller-Gorman, A. Zimek", title = "Detection and Visualization of Subspace Cluster Hierarchies", booktitle = "Proc. 12th International Conference on Database Systems for Advanced Applications (DASFAA), Bangkok, Thailand, 2007", url = "http://dx.doi.org/10.1007/978-3-540-71703-4_15")
public class DiSH<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clustering<SubspaceModel<V>>> implements ClusteringAlgorithm<Clustering<SubspaceModel<V>>> {
  /**
   * The logger for this class.
   */
  private static final Logging logger = Logging.getLogger(DiSH.class);

  /**
   * Parameter that specifies the maximum radius of the neighborhood to be
   * considered in each dimension for determination of the preference vector,
   * must be a double equal to or greater than 0.
   * <p>
   * Default value: {@code 0.001}
   * </p>
   * <p>
   * Key: {@code -dish.epsilon}
   * </p>
   */
  public static final OptionID EPSILON_ID = OptionID.getOrCreateOptionID("dish.epsilon", "The maximum radius of the neighborhood " + "to be considered in each dimension for determination of " + "the preference vector.");

  /**
   * Parameter that specifies the minimum number of points as a smoothing
   * factor to avoid the single-link-effect, must be an integer greater than 0.
   * <p>
   * Default value: {@code 1}
   * </p>
   * <p>
   * Key: {@code -dish.mu}
   * </p>
   */
  public static final OptionID MU_ID = OptionID.getOrCreateOptionID("dish.mu", "The minimum number of points as a smoothing factor to avoid the single-link effect.");

  /**
   * Holds the value of {@link #EPSILON_ID}.
   */
  private double epsilon;

  /**
   * The distance function we use
   */
  private DiSHDistanceFunction dishDistance;

  /**
   * Parameters that were given to OPTICS
   */
  private Collection<Pair<OptionID, Object>> opticsAlgorithmParameters;

  /**
   * Constructor.
   *
   * @param epsilon Epsilon value
   * @param dishDistance Distance function
   * @param opticsAlgorithmParameters OPTICS parameters
   */
  public DiSH(double epsilon, DiSHDistanceFunction dishDistance, Collection<Pair<OptionID, Object>> opticsAlgorithmParameters) {
    super();
    this.epsilon = epsilon;
    this.dishDistance = dishDistance;
    this.opticsAlgorithmParameters = opticsAlgorithmParameters;
  }

  /**
   * Performs the DiSH algorithm on the given database.
   *
   * @param database the database to run the algorithm on
   * @param relation the relation of number vectors to process
   * @return the hierarchical clustering result
   */
  public Clustering<SubspaceModel<V>> run(Database database, Relation<V> relation) throws IllegalStateException {
    // Instantiate DiSH distance (and thus run the preprocessor)
    if(logger.isVerbose()) {
      logger.verbose("*** Run DiSH preprocessor.");
    }
    DiSHDistanceFunction.Instance<V> dishDistanceQuery = dishDistance.instantiate(relation);
    // Configure and run OPTICS.
    if(logger.isVerbose()) {
      logger.verbose("*** Run OPTICS algorithm.");
    }
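    // The stored OPTICS parameters already fix epsilon to infinity and minpts
    // to mu (see Parameterizer.configOPTICS); only the freshly instantiated
    // DiSH distance query is injected here, wrapped in a proxy distance function.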
    ListParameterization opticsconfig = new ListParameterization(opticsAlgorithmParameters);
    opticsconfig.addParameter(OPTICS.DISTANCE_FUNCTION_ID, ProxyDistanceFunction.proxy(dishDistanceQuery));

    Class<OPTICS<V, PreferenceVectorBasedCorrelationDistance>> cls = ClassGenericsUtil.uglyCastIntoSubclass(OPTICS.class);
    OPTICS<V, PreferenceVectorBasedCorrelationDistance> optics = opticsconfig.tryInstantiate(cls);
    ClusterOrderResult<PreferenceVectorBasedCorrelationDistance> opticsResult = optics.run(database, relation);

    if(logger.isVerbose()) {
      logger.verbose("*** Compute Clusters.");
    }
    return computeClusters(relation, opticsResult, dishDistanceQuery);
  }

  /**
   * Computes the hierarchical clusters according to the cluster order.
   *
   * @param database the relation holding the objects
   * @param clusterOrder the cluster order
   * @param distFunc Distance function
   * @return the hierarchical clustering
   */
  private Clustering<SubspaceModel<V>> computeClusters(Relation<V> database, ClusterOrderResult<PreferenceVectorBasedCorrelationDistance> clusterOrder, DiSHDistanceFunction.Instance<V> distFunc) {
    int dimensionality = DatabaseUtil.dimensionality(database);
    int minpts = dishDistance.getMinpts();

    // extract clusters
    Map<BitSet, List<Pair<BitSet, ArrayModifiableDBIDs>>> clustersMap = extractClusters(database, distFunc, clusterOrder);

    if(logger.isVerbose()) {
      StringBuffer msg = new StringBuffer("Step 1: extract clusters");
      for(List<Pair<BitSet, ArrayModifiableDBIDs>> clusterList : clustersMap.values()) {
        for(Pair<BitSet, ArrayModifiableDBIDs> c : clusterList) {
          msg.append("\n").append(FormatUtil.format(dimensionality, c.first)).append(" ids ").append(c.second.size());
        }
      }
      logger.verbose(msg.toString());
    }

    // check if there are clusters < minpts
    checkClusters(database, distFunc, clustersMap, minpts);
    if(logger.isVerbose()) {
      StringBuffer msg = new StringBuffer("Step 2: check clusters");
      for(List<Pair<BitSet, ArrayModifiableDBIDs>> clusterList : clustersMap.values()) {
        for(Pair<BitSet, ArrayModifiableDBIDs> c : clusterList) {
          msg.append("\n").append(FormatUtil.format(dimensionality, c.first)).append(" ids ").append(c.second.size());
        }
      }
      logger.verbose(msg.toString());
    }

    // sort the clusters
    List<Cluster<SubspaceModel<V>>> clusters = sortClusters(database, clustersMap);
    if(logger.isVerbose()) {
      StringBuffer msg = new StringBuffer("Step 3: sort clusters");
      for(Cluster<SubspaceModel<V>> c : clusters) {
        msg.append("\n").append(FormatUtil.format(dimensionality, c.getModel().getSubspace().getDimensions())).append(" ids ").append(c.size());
      }
      logger.verbose(msg.toString());
    }

    // build the hierarchy
    buildHierarchy(database, distFunc, clusters, dimensionality);
    if(logger.isVerbose()) {
      StringBuffer msg = new StringBuffer("Step 4: build hierarchy");
      for(Cluster<SubspaceModel<V>> c : clusters) {
        msg.append("\n").append(FormatUtil.format(dimensionality, c.getModel().getDimensions())).append(" ids ").append(c.size());
        for(Cluster<SubspaceModel<V>> cluster : c.getParents()) {
          msg.append("\n   parent ").append(cluster);
        }
        for(Cluster<SubspaceModel<V>> cluster : c.getChildren()) {
          msg.append("\n   child ").append(cluster);
        }
      }
      logger.verbose(msg.toString());
    }

    // build result
    Clustering<SubspaceModel<V>> result = new Clustering<SubspaceModel<V>>("DiSH clustering", "dish-clustering");
    for(Cluster<SubspaceModel<V>> c : clusters) {
      if(c.getParents() == null || c.getParents().isEmpty()) {
        result.addCluster(c);
      }
    }
    return result;
  }

  /**
   * Extracts the clusters from the cluster order.
   *
   * @param database the database storing the objects
   * @param distFunc the distance function
   * @param clusterOrder the cluster order to extract the clusters from
   * @return the extracted clusters
   */
  private Map<BitSet, List<Pair<BitSet, ArrayModifiableDBIDs>>> extractClusters(Relation<V> database, DiSHDistanceFunction.Instance<V> distFunc, ClusterOrderResult<PreferenceVectorBasedCorrelationDistance> clusterOrder) {
    FiniteProgress progress = logger.isVerbose() ? new FiniteProgress("Extract Clusters", database.size(), logger) : null;
    int processed = 0;
    Map<BitSet, List<Pair<BitSet, ArrayModifiableDBIDs>>> clustersMap = new HashMap<BitSet, List<Pair<BitSet, ArrayModifiableDBIDs>>>();
    Map<DBID, ClusterOrderEntry<PreferenceVectorBasedCorrelationDistance>> entryMap = new HashMap<DBID, ClusterOrderEntry<PreferenceVectorBasedCorrelationDistance>>();
    Map<DBID, Pair<BitSet, ArrayModifiableDBIDs>> entryToClusterMap = new HashMap<DBID, Pair<BitSet, ArrayModifiableDBIDs>>();
    for(Iterator<ClusterOrderEntry<PreferenceVectorBasedCorrelationDistance>> it = clusterOrder.iterator(); it.hasNext();) {
      ClusterOrderEntry<PreferenceVectorBasedCorrelationDistance> entry = it.next();
      entryMap.put(entry.getID(), entry);

      V object = database.get(entry.getID());
      BitSet preferenceVector = entry.getReachability().getCommonPreferenceVector();

      // get the list of (parallel) clusters for the preference vector
      List<Pair<BitSet, ArrayModifiableDBIDs>> parallelClusters = clustersMap.get(preferenceVector);
      if(parallelClusters == null) {
        parallelClusters = new ArrayList<Pair<BitSet, ArrayModifiableDBIDs>>();
        clustersMap.put(preferenceVector, parallelClusters);
      }

      // look for the proper cluster
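      // The object is merged into an existing parallel cluster only if it has
      // the same correlation value and its weighted distance to the cluster
      // centroid is at most 2 * epsilon; otherwise a new cluster is opened.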
      Pair<BitSet, ArrayModifiableDBIDs> cluster = null;
      for(Pair<BitSet, ArrayModifiableDBIDs> c : parallelClusters) {
        V c_centroid = DatabaseUtil.centroid(database, c.second, c.first);
        PreferenceVectorBasedCorrelationDistance dist = distFunc.correlationDistance(object, c_centroid, preferenceVector, preferenceVector);
        if(dist.getCorrelationValue() == entry.getReachability().getCorrelationValue()) {
          double d = distFunc.weightedDistance(object, c_centroid, dist.getCommonPreferenceVector());
          if(d <= 2 * epsilon) {
            cluster = c;
            break;
          }
        }
      }
      if(cluster == null) {
        cluster = new Pair<BitSet, ArrayModifiableDBIDs>(preferenceVector, DBIDUtil.newArray());
        parallelClusters.add(cluster);
      }
      cluster.second.add(entry.getID());
      entryToClusterMap.put(entry.getID(), cluster);

      if(progress != null) {
        progress.setProcessed(++processed, logger);
      }
    }
    if(progress != null) {
      progress.ensureCompleted(logger);
    }

    if(logger.isDebuggingFiner()) {
      StringBuffer msg = new StringBuffer("Step 0");
      for(List<Pair<BitSet, ArrayModifiableDBIDs>> clusterList : clustersMap.values()) {
        for(Pair<BitSet, ArrayModifiableDBIDs> c : clusterList) {
          msg.append("\n").append(FormatUtil.format(DatabaseUtil.dimensionality(database), c.first)).append(" ids ").append(c.second.size());
        }
      }
      logger.debugFiner(msg.toString());
    }

    // add the predecessor to the cluster
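    // The OPTICS predecessor of the first object of each cluster is moved into
    // that cluster, unless it already lies in a parallel cluster (same
    // preference vector) or has a smaller reachability than the first object.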
    for(BitSet pv : clustersMap.keySet()) {
      List<Pair<BitSet, ArrayModifiableDBIDs>> parallelClusters = clustersMap.get(pv);
      for(Pair<BitSet, ArrayModifiableDBIDs> cluster : parallelClusters) {
        if(cluster.second.isEmpty()) {
          continue;
        }
        DBID firstID = cluster.second.get(0);
        ClusterOrderEntry<PreferenceVectorBasedCorrelationDistance> entry = entryMap.get(firstID);
        DBID predecessorID = entry.getPredecessorID();
        if(predecessorID == null) {
          continue;
        }
        ClusterOrderEntry<PreferenceVectorBasedCorrelationDistance> predecessor = entryMap.get(predecessorID);
        // parallel cluster
        if(predecessor.getReachability().getCommonPreferenceVector().equals(entry.getReachability().getCommonPreferenceVector())) {
          continue;
        }
        if(predecessor.getReachability().compareTo(entry.getReachability()) < 0) {
          continue;
        }

        Pair<BitSet, ArrayModifiableDBIDs> oldCluster = entryToClusterMap.get(predecessorID);
        oldCluster.second.remove(predecessorID);
        cluster.second.add(predecessorID);
        entryToClusterMap.remove(predecessorID);
        entryToClusterMap.put(predecessorID, cluster);
      }
    }

    return clustersMap;
  }

  /**
   * Returns a sorted list of the clusters w.r.t. the subspace dimensionality in
   * descending order.
   *
   * @param database the database storing the objects
   * @param clustersMap the mapping of bit sets to clusters
   * @return a sorted list of the clusters
   */
  private List<Cluster<SubspaceModel<V>>> sortClusters(Relation<V> database, Map<BitSet, List<Pair<BitSet, ArrayModifiableDBIDs>>> clustersMap) {
    final int db_dim = DatabaseUtil.dimensionality(database);
    // int num = 1;
    List<Cluster<SubspaceModel<V>>> clusters = new ArrayList<Cluster<SubspaceModel<V>>>();
    for(BitSet pv : clustersMap.keySet()) {
      List<Pair<BitSet, ArrayModifiableDBIDs>> parallelClusters = clustersMap.get(pv);
      for(int i = 0; i < parallelClusters.size(); i++) {
        Pair<BitSet, ArrayModifiableDBIDs> c = parallelClusters.get(i);
        Cluster<SubspaceModel<V>> cluster = new Cluster<SubspaceModel<V>>(c.second);
        cluster.setModel(new SubspaceModel<V>(new Subspace<V>(c.first), DatabaseUtil.centroid(database, c.second)));
        cluster.setHierarchy(new HierarchyReferenceLists<Cluster<SubspaceModel<V>>>(cluster, new ArrayList<Cluster<SubspaceModel<V>>>(), new ArrayList<Cluster<SubspaceModel<V>>>()));
        // cluster.setName("Cluster_" + num++);
        String subspace = FormatUtil.format(cluster.getModel().getSubspace().getDimensions(), db_dim, "");
        if(parallelClusters.size() > 1) {
          cluster.setName("Cluster_" + subspace + "_" + i);
        }
        else {
          cluster.setName("Cluster_" + subspace);
        }
        clusters.add(cluster);
      }
    }
    // sort the clusters w.r.t. lambda
    Comparator<Cluster<SubspaceModel<V>>> comparator = new Comparator<Cluster<SubspaceModel<V>>>() {
      @Override
      public int compare(Cluster<SubspaceModel<V>> c1, Cluster<SubspaceModel<V>> c2) {
        return c2.getModel().getSubspace().dimensionality() - c1.getModel().getSubspace().dimensionality();
      }

    };
    Collections.sort(clusters, comparator);
    return clusters;
  }

  /**
   * Removes the clusters with size < minpts from the cluster map and adds them
   * to their parents.
   *
   * @param database the database storing the objects
   * @param distFunc the distance function
   * @param clustersMap the map containing the clusters
   * @param minpts MinPts
   */
  private void checkClusters(Relation<V> database, DiSHDistanceFunction.Instance<V> distFunc, Map<BitSet, List<Pair<BitSet, ArrayModifiableDBIDs>>> clustersMap, int minpts) {
    // check if there are clusters < minpts
    // and add them to not assigned
    List<Pair<BitSet, ArrayModifiableDBIDs>> notAssigned = new ArrayList<Pair<BitSet, ArrayModifiableDBIDs>>();
    Map<BitSet, List<Pair<BitSet, ArrayModifiableDBIDs>>> newClustersMap = new HashMap<BitSet, List<Pair<BitSet, ArrayModifiableDBIDs>>>();
    Pair<BitSet, ArrayModifiableDBIDs> noise = new Pair<BitSet, ArrayModifiableDBIDs>(new BitSet(), DBIDUtil.newArray());
    for(BitSet pv : clustersMap.keySet()) {
      // noise
      if(pv.cardinality() == 0) {
        List<Pair<BitSet, ArrayModifiableDBIDs>> parallelClusters = clustersMap.get(pv);
        for(Pair<BitSet, ArrayModifiableDBIDs> c : parallelClusters) {
          noise.second.addDBIDs(c.second);
        }
      }
      // clusters
      else {
        List<Pair<BitSet, ArrayModifiableDBIDs>> parallelClusters = clustersMap.get(pv);
        List<Pair<BitSet, ArrayModifiableDBIDs>> newParallelClusters = new ArrayList<Pair<BitSet, ArrayModifiableDBIDs>>(parallelClusters.size());
        for(Pair<BitSet, ArrayModifiableDBIDs> c : parallelClusters) {
          if(!pv.equals(new BitSet()) && c.second.size() < minpts) {
            notAssigned.add(c);
          }
          else {
            newParallelClusters.add(c);
          }
        }
        newClustersMap.put(pv, newParallelClusters);
      }
    }

    clustersMap.clear();
    clustersMap.putAll(newClustersMap);

    for(Pair<BitSet, ArrayModifiableDBIDs> c : notAssigned) {
      if(c.second.isEmpty()) {
        continue;
      }
      Pair<BitSet, ArrayModifiableDBIDs> parent = findParent(database, distFunc, c, clustersMap);
      if(parent != null) {
        parent.second.addDBIDs(c.second);
      }
      else {
        noise.second.addDBIDs(c.second);
      }
    }

    List<Pair<BitSet, ArrayModifiableDBIDs>> noiseList = new ArrayList<Pair<BitSet, ArrayModifiableDBIDs>>(1);
    noiseList.add(noise);
    clustersMap.put(noise.first, noiseList);
  }

  /**
   * Returns the parent of the specified cluster, if one exists.
   *
   * @param database the database storing the objects
   * @param distFunc the distance function
   * @param child the child to search the parent for
   * @param clustersMap the map containing the clusters
   * @return the parent of the specified cluster, or {@code null} if no parent exists
   */
  private Pair<BitSet, ArrayModifiableDBIDs> findParent(Relation<V> database, DiSHDistanceFunction.Instance<V> distFunc, Pair<BitSet, ArrayModifiableDBIDs> child, Map<BitSet, List<Pair<BitSet, ArrayModifiableDBIDs>>> clustersMap) {
    V child_centroid = DatabaseUtil.centroid(database, child.second, child.first);

    Pair<BitSet, ArrayModifiableDBIDs> result = null;
    int resultCardinality = -1;

    BitSet childPV = child.first;
    int childCardinality = childPV.cardinality();
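    // A candidate parent must have a strictly smaller preference vector that
    // is contained in the child's preference vector; among all candidates
    // whose centroid lies within a weighted distance of 2 * epsilon of the
    // child's centroid, the one with the highest cardinality wins.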
    for(BitSet parentPV : clustersMap.keySet()) {
      int parentCardinality = parentPV.cardinality();
      if(parentCardinality >= childCardinality) {
        continue;
      }
      if(resultCardinality != -1 && parentCardinality <= resultCardinality) {
        continue;
      }

      BitSet pv = (BitSet) childPV.clone();
      pv.and(parentPV);
      if(pv.equals(parentPV)) {
        List<Pair<BitSet, ArrayModifiableDBIDs>> parentList = clustersMap.get(parentPV);
        for(Pair<BitSet, ArrayModifiableDBIDs> parent : parentList) {
          V parent_centroid = DatabaseUtil.centroid(database, parent.second, parentPV);
          double d = distFunc.weightedDistance(child_centroid, parent_centroid, parentPV);
          if(d <= 2 * epsilon) {
            result = parent;
            resultCardinality = parentCardinality;
            break;
          }
        }
      }
    }

    return result;
  }

  /**
   * Builds the cluster hierarchy.
   *
   * @param database the database containing the data objects
   * @param distFunc the distance function
   * @param clusters the sorted list of clusters
   * @param dimensionality the dimensionality of the data
   */
  private void buildHierarchy(Relation<V> database, DiSHDistanceFunction.Instance<V> distFunc, List<Cluster<SubspaceModel<V>>> clusters, int dimensionality) {
    StringBuffer msg = new StringBuffer();
    final int db_dim = DatabaseUtil.dimensionality(database);
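    // The clusters are sorted by decreasing preference vector dimensionality
    // (see sortClusters), so potential parents of c_i, i.e. clusters with a
    // larger correlation dimensionality subspaceDim, always appear later in
    // the list.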

    for(int i = 0; i < clusters.size() - 1; i++) {
      Cluster<SubspaceModel<V>> c_i = clusters.get(i);
      int subspaceDim_i = dimensionality - c_i.getModel().getSubspace().dimensionality();
      V ci_centroid = DatabaseUtil.centroid(database, c_i.getIDs(), c_i.getModel().getDimensions());

      for(int j = i + 1; j < clusters.size(); j++) {
        Cluster<SubspaceModel<V>> c_j = clusters.get(j);
        int subspaceDim_j = dimensionality - c_j.getModel().getSubspace().dimensionality();

        if(subspaceDim_i < subspaceDim_j) {
          if(logger.isDebugging()) {
            msg.append("\n l_i=").append(subspaceDim_i).append(" pv_i=[").append(FormatUtil.format(db_dim, c_i.getModel().getSubspace().getDimensions())).append("]");
            msg.append("\n l_j=").append(subspaceDim_j).append(" pv_j=[").append(FormatUtil.format(db_dim, c_j.getModel().getSubspace().getDimensions())).append("]");
          }

          // noise level reached
          if(c_j.getModel().getSubspace().dimensionality() == 0) {
            // no parent exists yet -> the noise cluster c_j becomes the parent
            if(c_i.getParents().isEmpty()) {
              c_j.getChildren().add(c_i);
              c_i.getParents().add(c_j);
              if(logger.isDebugging()) {
                msg.append("\n [").append(FormatUtil.format(db_dim, c_j.getModel().getSubspace().getDimensions()));
                msg.append("] is parent of [").append(FormatUtil.format(db_dim, c_i.getModel().getSubspace().getDimensions()));
                msg.append("]");
              }
            }
          }
          else {
            V cj_centroid = DatabaseUtil.centroid(database, c_j.getIDs(), c_j.getModel().getDimensions());
            PreferenceVectorBasedCorrelationDistance distance = distFunc.correlationDistance(ci_centroid, cj_centroid, c_i.getModel().getSubspace().getDimensions(), c_j.getModel().getSubspace().getDimensions());
            double d = distFunc.weightedDistance(ci_centroid, cj_centroid, distance.getCommonPreferenceVector());
            if(logger.isDebugging()) {
              msg.append("\n dist = ").append(distance.getCorrelationValue());
            }

            if(distance.getCorrelationValue() == subspaceDim_j) {
              if(logger.isDebugging()) {
                msg.append("\n d = ").append(d);
              }
              if(d <= 2 * epsilon) {
                // no parent exists or c_j is not a parent of the already
                // existing parents
                if(c_i.getParents().isEmpty() || !isParent(database, distFunc, c_j, c_i.getParents())) {
                  c_j.getChildren().add(c_i);
                  c_i.getParents().add(c_j);
                  if(logger.isDebugging()) {
                    msg.append("\n [").append(FormatUtil.format(db_dim, c_j.getModel().getSubspace().getDimensions()));
                    msg.append("] is parent of [");
                    msg.append(FormatUtil.format(db_dim, c_i.getModel().getSubspace().getDimensions()));
                    msg.append("]");
                  }
                }
              }
              else {
                throw new RuntimeException("Should never happen: d = " + d);
              }
            }
          }
        }
      }
    }
    if(logger.isDebugging()) {
      logger.debug(msg.toString());
    }
  }

  /**
   * Returns true if the specified parent cluster is a parent of at least one
   * of the given children clusters.
   *
   * @param database the database containing the objects
   * @param distFunc the distance function for distance computation between the
   *        clusters
   * @param parent the parent to be tested
   * @param children the list of children to be tested
   * @return true if the specified parent cluster is a parent of at least one
   *         of the given children clusters, false otherwise
   */
  private boolean isParent(Relation<V> database, DiSHDistanceFunction.Instance<V> distFunc, Cluster<SubspaceModel<V>> parent, List<Cluster<SubspaceModel<V>>> children) {
    V parent_centroid = DatabaseUtil.centroid(database, parent.getIDs(), parent.getModel().getDimensions());
    int dimensionality = DatabaseUtil.dimensionality(database);
    int subspaceDim_parent = dimensionality - parent.getModel().getSubspace().dimensionality();
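    // 'parent' qualifies as a parent of a child if the correlation distance
    // between their centroids equals the parent's correlation dimensionality
    // (data dimensionality minus the parent's subspace dimensionality).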

    for(Cluster<SubspaceModel<V>> child : children) {
      V child_centroid = DatabaseUtil.centroid(database, child.getIDs(), child.getModel().getDimensions());
      PreferenceVectorBasedCorrelationDistance distance = distFunc.correlationDistance(parent_centroid, child_centroid, parent.getModel().getSubspace().getDimensions(), child.getModel().getSubspace().getDimensions());
      if(distance.getCorrelationValue() == subspaceDim_parent) {
        return true;
      }
    }

    return false;
  }

  @Override
  public TypeInformation[] getInputTypeRestriction() {
    return TypeUtil.array(TypeUtil.NUMBER_VECTOR_FIELD);
  }

  @Override
  protected Logging getLogger() {
    return logger;
  }

  /**
   * Parameterization class.
   *
   * @author Erich Schubert
   *
   * @apiviz.exclude
   */
  public static class Parameterizer<V extends NumberVector<V, ?>> extends AbstractParameterizer {
    protected double epsilon = 0.0;

    protected int mu = 1;

    protected DiSHDistanceFunction dishDistance;

    protected Collection<Pair<OptionID, Object>> opticsO;

    @Override
    protected void makeOptions(Parameterization config) {
      super.makeOptions(config);

      DoubleParameter epsilonP = new DoubleParameter(EPSILON_ID, new GreaterEqualConstraint(0), 0.001);
      if(config.grab(epsilonP)) {
        epsilon = epsilonP.getValue();
      }

      IntParameter muP = new IntParameter(MU_ID, new GreaterConstraint(0), 1);
      if(config.grab(muP)) {
        mu = muP.getValue();
      }

      configDiSHDistance(config, epsilon, mu);

      configOPTICS(config, mu, dishDistance);
    }

    /**
     * Configure and instantiate the DiSH distance function (including its
     * preference vector index).
     *
     * @param config Parameterization
     * @param epsilon Epsilon value
     * @param minpts MinPts value
     */
    public void configDiSHDistance(Parameterization config, double epsilon, int minpts) {
      ListParameterization dishParameters = new ListParameterization();
      dishParameters.addParameter(DiSHDistanceFunction.EPSILON_ID, epsilon);
      dishParameters.addParameter(IndexBasedDistanceFunction.INDEX_ID, DiSHPreferenceVectorIndex.Factory.class);
      dishParameters.addParameter(DiSHPreferenceVectorIndex.Factory.EPSILON_ID, Double.toString(epsilon));
      dishParameters.addParameter(DiSHPreferenceVectorIndex.Factory.MINPTS_ID, minpts);
      ChainedParameterization dishchain = new ChainedParameterization(dishParameters, config);
      dishchain.errorsTo(config);

      dishDistance = dishchain.tryInstantiate(DiSHDistanceFunction.class);
    }

    /**
     * Get the parameters for embedded OPTICS.
     *
     * @param config Parameterization
     * @param minpts MinPts value
     * @param dishDistance DiSH distance function
     */
    public void configOPTICS(Parameterization config, final int minpts, final DiSHDistanceFunction dishDistance) {
      // Configure OPTICS. Tracked parameters
      ListParameterization opticsParameters = new ListParameterization();
      opticsParameters.addParameter(OPTICS.EPSILON_ID, AbstractDistance.INFINITY_PATTERN);
      opticsParameters.addParameter(OPTICS.MINPTS_ID, minpts);
      // Configure OPTICS. Untracked parameters
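      // The distance function is deliberately not tracked: run() later injects
      // the instantiated DiSH distance query via a proxy distance function.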
      ListParameterization opticsUntrackedParameters = new ListParameterization();
      opticsUntrackedParameters.addParameter(OPTICS.DISTANCE_FUNCTION_ID, dishDistance);
      ChainedParameterization optchain = new ChainedParameterization(opticsParameters, config);
      TrackParameters trackpar = new TrackParameters(optchain);

      ChainedParameterization optchain2 = new ChainedParameterization(opticsUntrackedParameters, trackpar);
      optchain2.errorsTo(config);

      // Instantiate OPTICS for parameterization
      optchain2.tryInstantiate(OPTICS.class);
      // store parameters
      opticsO = trackpar.getGivenParameters();
    }

    @Override
    protected DiSH<V> makeInstance() {
      return new DiSH<V>(epsilon, dishDistance, opticsO);
    }
  }
}
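
Below is a minimal usage sketch, not part of the ELKI sources: the parameter values, the DoubleVector element type, and the pre-initialized Database db are illustrative assumptions. It instantiates DiSH through the same parameterization mechanism the class uses internally for OPTICS, then runs it via the generic run(Database) entry point inherited from AbstractAlgorithm.

// Illustrative sketch only: assumes an already-initialized ELKI Database 'db'
// whose relation contains DoubleVector objects, plus the imports used above
// and de.lmu.ifi.dbs.elki.data.DoubleVector.
ListParameterization params = new ListParameterization();
params.addParameter(DiSH.EPSILON_ID, 0.005); // example neighborhood radius per dimension
params.addParameter(DiSH.MU_ID, 50);         // example smoothing factor
Class<DiSH<DoubleVector>> cls = ClassGenericsUtil.uglyCastIntoSubclass(DiSH.class);
DiSH<DoubleVector> dish = params.tryInstantiate(cls);
Clustering<SubspaceModel<DoubleVector>> result = dish.run(db);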