Package de.lmu.ifi.dbs.elki.algorithm.clustering.correlation

Source Code of de.lmu.ifi.dbs.elki.algorithm.clustering.correlation.ERiC

package de.lmu.ifi.dbs.elki.algorithm.clustering.correlation;

/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures

Copyright (C) 2011
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/

import java.util.ArrayList;
import java.util.List;
import java.util.SortedMap;
import java.util.TreeMap;

import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm;
import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm;
import de.lmu.ifi.dbs.elki.algorithm.clustering.DBSCAN;
import de.lmu.ifi.dbs.elki.data.Cluster;
import de.lmu.ifi.dbs.elki.data.Clustering;
import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.model.CorrelationModel;
import de.lmu.ifi.dbs.elki.data.model.DimensionModel;
import de.lmu.ifi.dbs.elki.data.model.Model;
import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction;
import de.lmu.ifi.dbs.elki.distance.distancefunction.ProxyDistanceFunction;
import de.lmu.ifi.dbs.elki.distance.distancefunction.correlation.ERiCDistanceFunction;
import de.lmu.ifi.dbs.elki.distance.distancevalue.BitDistance;
import de.lmu.ifi.dbs.elki.distance.distancevalue.DoubleDistance;
import de.lmu.ifi.dbs.elki.distance.distancevalue.IntegerDistance;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.logging.progress.StepProgress;
import de.lmu.ifi.dbs.elki.math.linearalgebra.pca.FirstNEigenPairFilter;
import de.lmu.ifi.dbs.elki.math.linearalgebra.pca.PCAFilteredResult;
import de.lmu.ifi.dbs.elki.math.linearalgebra.pca.PCAFilteredRunner;
import de.lmu.ifi.dbs.elki.utilities.ClassGenericsUtil;
import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil;
import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.ParameterException;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ListParameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;

/**
* Performs correlation clustering on the data partitioned according to local
* correlation dimensionality and builds a hierarchy of correlation clusters
* that allows multiple inheritance from the clustering result.
* <p>
* Reference: E. Achtert, C. Böhm, H.-P. Kriegel, P. Kröger, and A. Zimek: On
* Exploring Complex Relationships of Correlation Clusters. <br>
* In Proc. 19th International Conference on Scientific and Statistical Database
* Management (SSDBM 2007), Banff, Canada, 2007.
* </p>
*
* @author Elke Achtert
*
* @apiviz.uses COPAC
* @apiviz.uses DBSCAN
* @apiviz.uses ERiCDistanceFunction
* @apiviz.uses FirstNEigenPairFilter
* @apiviz.uses PCAFilteredRunner
* @apiviz.has CorrelationModel
*
* @param <V> the type of NumberVector handled by this Algorithm
*/
// TODO: Re-use PCARunner objects somehow?
@Title("ERiC: Exploring Relationships among Correlation Clusters")
@Description("Performs the DBSCAN algorithm on the data using a special distance function taking into account correlations among attributes and builds " + "a hierarchy that allows multiple inheritance from the correlation clustering result.")
@Reference(authors = "E. Achtert, C. Böhm, H.-P. Kriegel, P. Kröger, and A. Zimek", title = "On Exploring Complex Relationships of Correlation Clusters", booktitle = "Proc. 19th International Conference on Scientific and Statistical Database Management (SSDBM 2007), Banff, Canada, 2007", url = "http://dx.doi.org/10.1109/SSDBM.2007.21")
public class ERiC<V extends NumberVector<V, ?>> extends AbstractAlgorithm<Clustering<CorrelationModel<V>>> implements ClusteringAlgorithm<Clustering<CorrelationModel<V>>> {
  /**
   * The logger for this class.
   */
  private static final Logging logger = Logging.getLogger(ERiC.class);

  /**
   * The COPAC clustering algorithm.
   */
  private COPAC<V, IntegerDistance> copacAlgorithm;

  /**
   * Constructor.
   *
   * @param copacAlgorithm COPAC to use
   */
  public ERiC(COPAC<V, IntegerDistance> copacAlgorithm) {
    super();
    this.copacAlgorithm = copacAlgorithm;
  }

  /**
   * Performs the ERiC algorithm on the given database.
   *
   * @param relation Relation to process
   * @return Clustering result
   */
  public Clustering<CorrelationModel<V>> run(Relation<V> relation) throws IllegalStateException {
    final int dimensionality = DatabaseUtil.dimensionality(relation);

    StepProgress stepprog = logger.isVerbose() ? new StepProgress(3) : null;

    // run COPAC
    if(stepprog != null) {
      stepprog.beginStep(1, "Preprocessing local correlation dimensionalities and partitioning data", logger);
    }
    Clustering<Model> copacResult = copacAlgorithm.run(relation);

    DistanceQuery<V, IntegerDistance> query = copacAlgorithm.getPartitionDistanceQuery();

    // extract correlation clusters
    if(stepprog != null) {
      stepprog.beginStep(2, "Extract correlation clusters", logger);
    }
    SortedMap<Integer, List<Cluster<CorrelationModel<V>>>> clusterMap = extractCorrelationClusters(copacResult, relation, dimensionality);
    if(logger.isDebugging()) {
      StringBuffer msg = new StringBuffer("Step 2: Extract correlation clusters...");
      for(Integer corrDim : clusterMap.keySet()) {
        List<Cluster<CorrelationModel<V>>> correlationClusters = clusterMap.get(corrDim);
        msg.append("\n\ncorrDim ").append(corrDim);
        for(Cluster<CorrelationModel<V>> cluster : correlationClusters) {
          msg.append("\n  cluster ").append(cluster).append(", ids: ").append(cluster.getIDs().size());
          // .append(", level: ").append(cluster.getLevel()).append(", index: ").append(cluster.getLevelIndex());
          // msg.append("\n  basis " +
          // cluster.getPCA().getWeakEigenvectors().toString("    ", NF) +
          // "  ids " + cluster.getIDs().size());
        }
      }
      logger.debugFine(msg.toString());
    }
    if(logger.isVerbose()) {
      int clusters = 0;
      for(List<Cluster<CorrelationModel<V>>> correlationClusters : clusterMap.values()) {
        clusters += correlationClusters.size();
      }
      logger.verbose(clusters + " clusters extracted.");
    }

    // build hierarchy
    if(stepprog != null) {
      stepprog.beginStep(3, "Building hierarchy", logger);
    }
    buildHierarchy(clusterMap, query);
    if(logger.isDebugging()) {
      StringBuffer msg = new StringBuffer("Step 3: Build hierarchy");
      for(Integer corrDim : clusterMap.keySet()) {
        List<Cluster<CorrelationModel<V>>> correlationClusters = clusterMap.get(corrDim);
        for(Cluster<CorrelationModel<V>> cluster : correlationClusters) {
          msg.append("\n  cluster ").append(cluster).append(", ids: ").append(cluster.getIDs().size());
          // .append(", level: ").append(cluster.getLevel()).append(", index: ").append(cluster.getLevelIndex());
          for(int i = 0; i < cluster.getParents().size(); i++) {
            msg.append("\n   parent ").append(cluster.getParents().get(i));
          }
          for(int i = 0; i < cluster.numChildren(); i++) {
            msg.append("\n   child ").append(cluster.getChildren().get(i));
          }
        }
      }
      logger.debugFine(msg.toString());
    }
    if(stepprog != null) {
      stepprog.setCompleted(logger);
    }

    Clustering<CorrelationModel<V>> result = new Clustering<CorrelationModel<V>>("ERiC clustering", "eric-clustering");
    for(Cluster<CorrelationModel<V>> rc : clusterMap.get(clusterMap.lastKey())) {
      result.addCluster(rc);
    }
    return result;
  }

  /**
   * Extracts the correlation clusters and noise from the copac result and
   * returns a mapping of correlation dimension to maps of clusters within this
   * correlation dimension. Each cluster is defined by the basis vectors
   * defining the subspace in which the cluster appears.
   *
   * @param copacResult
   *
   * @param database the database containing the objects
   * @param dimensionality the dimensionality of the feature space
   * @return a mapping of correlation dimension to maps of clusters
   */
  private SortedMap<Integer, List<Cluster<CorrelationModel<V>>>> extractCorrelationClusters(Clustering<Model> copacResult, Relation<V> database, int dimensionality) {
    // result
    SortedMap<Integer, List<Cluster<CorrelationModel<V>>>> clusterMap = new TreeMap<Integer, List<Cluster<CorrelationModel<V>>>>();

    // noise cluster containing all noise objects over all partitions
    Cluster<Model> noise = null;

    // iterate over correlation dimensions
    for(Cluster<Model> clus : copacResult.getAllClusters()) {
      DBIDs group = clus.getIDs();
      if(clus.getModel() != null && clus.getModel() instanceof DimensionModel) {
        int correlationDimension = ((DimensionModel) clus.getModel()).getDimension();

        ListParameterization parameters = pcaParameters(correlationDimension);
        Class<PCAFilteredRunner<V>> cls = ClassGenericsUtil.uglyCastIntoSubclass(PCAFilteredRunner.class);
        PCAFilteredRunner<V> pca = parameters.tryInstantiate(cls);
        for(ParameterException e : parameters.getErrors()) {
          logger.warning("Error in internal parameterization: " + e.getMessage());
        }

        // get cluster list for this dimension.
        List<Cluster<CorrelationModel<V>>> correlationClusters = clusterMap.get(correlationDimension);
        if(correlationClusters == null) {
          correlationClusters = new ArrayList<Cluster<CorrelationModel<V>>>();
          clusterMap.put(correlationDimension, correlationClusters);
        }

        PCAFilteredResult pcares = pca.processIds(group, database);

        V centroid = DatabaseUtil.centroid(database, group);
        Cluster<CorrelationModel<V>> correlationCluster = new Cluster<CorrelationModel<V>>("[" + correlationDimension + "_" + correlationClusters.size() + "]", group, new CorrelationModel<V>(pcares, centroid), new ArrayList<Cluster<CorrelationModel<V>>>(), new ArrayList<Cluster<CorrelationModel<V>>>());
        correlationClusters.add(correlationCluster);
      }
      // partition containing noise
      else if(clus.getModel() != null && clus.isNoise()) {
        if(noise == null) {
          noise = clus;
        }
        else {
          ModifiableDBIDs merged = DBIDUtil.newHashSet(noise.getIDs());
          merged.addAll(clus.getIDs().asCollection());
          noise.setIDs(merged);
        }
      }
      else {
        throw new IllegalStateException("Unexpected group returned: " + clus.getClass().getName());
      }
    }

    if(noise != null && noise.size() > 0) {
      // get cluster list for this dimension.
      List<Cluster<CorrelationModel<V>>> correlationClusters = clusterMap.get(dimensionality);
      if(correlationClusters == null) {
        correlationClusters = new ArrayList<Cluster<CorrelationModel<V>>>();
        clusterMap.put(dimensionality, correlationClusters);
      }
      ListParameterization parameters = pcaParameters(dimensionality);
      Class<PCAFilteredRunner<V>> cls = ClassGenericsUtil.uglyCastIntoSubclass(PCAFilteredRunner.class);
      PCAFilteredRunner<V> pca = parameters.tryInstantiate(cls);
      for(ParameterException e : parameters.getErrors()) {
        logger.warning("Error in internal parameterization: " + e.getMessage());
      }
      PCAFilteredResult pcares = pca.processIds(noise.getIDs(), database);

      V centroid = DatabaseUtil.centroid(database, noise.getIDs());
      Cluster<CorrelationModel<V>> correlationCluster = new Cluster<CorrelationModel<V>>("[noise]", noise.getIDs(), new CorrelationModel<V>(pcares, centroid), new ArrayList<Cluster<CorrelationModel<V>>>(), new ArrayList<Cluster<CorrelationModel<V>>>());
      correlationClusters.add(correlationCluster);
    }

    return clusterMap;
  }

  /**
   * Returns the parameters for the PCA for the specified correlation dimension.
   *
   * @param correlationDimension the correlation dimension
   * @return the parameters for the PCA for the specified correlation dimension
   */
  private ListParameterization pcaParameters(int correlationDimension) {
    ListParameterization parameters = new ListParameterization();

    parameters.addParameter(PCAFilteredRunner.PCA_EIGENPAIR_FILTER, FirstNEigenPairFilter.class);
    parameters.addParameter(FirstNEigenPairFilter.EIGENPAIR_FILTER_N, correlationDimension);

    return parameters;
  }

  private void buildHierarchy(SortedMap<Integer, List<Cluster<CorrelationModel<V>>>> clusterMap, DistanceQuery<V, IntegerDistance> query) throws IllegalStateException {
    StringBuffer msg = new StringBuffer();

    DBSCAN<V, DoubleDistance> dbscan = ClassGenericsUtil.castWithGenericsOrNull(DBSCAN.class, copacAlgorithm.getPartitionAlgorithm(query));
    if(dbscan == null) {
      // TODO: appropriate exception class?
      throw new IllegalArgumentException("ERiC was run without DBSCAN as COPAC algorithm!");
    }
    DistanceFunction<? super V, ?> dfun = ProxyDistanceFunction.unwrapDistance(dbscan.getDistanceFunction());
    ERiCDistanceFunction distanceFunction = ClassGenericsUtil.castWithGenericsOrNull(ERiCDistanceFunction.class, dfun);
    if(distanceFunction == null) {
      // TODO: appropriate exception class?
      throw new IllegalArgumentException("ERiC was run without ERiCDistanceFunction as distance function: got " + dfun.getClass());
    }
    Integer lambda_max = clusterMap.lastKey();

    for(Integer childCorrDim : clusterMap.keySet()) {
      List<Cluster<CorrelationModel<V>>> children = clusterMap.get(childCorrDim);
      SortedMap<Integer, List<Cluster<CorrelationModel<V>>>> parentMap = clusterMap.tailMap(childCorrDim + 1);
      if(logger.isDebugging()) {
        msg.append("\ncorrdim ").append(childCorrDim);
        msg.append("\nparents ").append(parentMap.keySet());
      }

      for(Cluster<CorrelationModel<V>> child : children) {
        for(Integer parentCorrDim : parentMap.keySet()) {
          List<Cluster<CorrelationModel<V>>> parents = parentMap.get(parentCorrDim);
          for(Cluster<CorrelationModel<V>> parent : parents) {
            int subspaceDim_parent = parent.getModel().getPCAResult().getCorrelationDimension();
            if(subspaceDim_parent == lambda_max && child.getParents().isEmpty()) {
              parent.getChildren().add(child);
              child.getParents().add(parent);
              if(logger.isDebugging()) {
                msg.append("\n").append(parent).append(" is parent of ").append(child);
              }
            }
            else {
              BitDistance dist = distanceFunction.distance(parent.getModel().getCentroid(), child.getModel().getCentroid(), parent.getModel().getPCAResult(), child.getModel().getPCAResult());
              if(!dist.bitValue() && (child.getParents().isEmpty() || !isParent(distanceFunction, parent, child.getParents()))) {
                parent.getChildren().add(child);
                child.getParents().add(parent);
                if(logger.isDebugging()) {
                  msg.append("\n").append(parent).append(" is parent of ").append(child);
                }
              }
            }
          }
        }
      }
    }
    if(logger.isDebugging()) {
      logger.debugFine(msg.toString());
    }

  }

  /**
   * Returns true, if the specified parent cluster is a parent of one child of
   * the children clusters.
   *
   * @param distanceFunction the distance function for distance computation
   *        between the clusters
   * @param parent the parent to be tested
   * @param children the list of children to be tested
   * @return true, if the specified parent cluster is a parent of one child of
   *         the children clusters, false otherwise
   */
  private boolean isParent(ERiCDistanceFunction distanceFunction, Cluster<CorrelationModel<V>> parent, List<Cluster<CorrelationModel<V>>> children) {

    StringBuffer msg = new StringBuffer();

    for(Cluster<CorrelationModel<V>> child : children) {
      if(parent.getModel().getPCAResult().getCorrelationDimension() == child.getModel().getPCAResult().getCorrelationDimension()) {
        return false;
      }

      BitDistance dist = distanceFunction.distance(parent.getModel().getCentroid(), child.getModel().getCentroid(), parent.getModel().getPCAResult(), child.getModel().getPCAResult());
      if(logger.isDebugging()) {
        msg.append("\ndist(").append(child).append(" - ").append(parent).append(") = ").append(dist);
      }
      if(!dist.bitValue()) {
        if(logger.isDebugging()) {
          logger.debugFine(msg.toString());
        }
        return true;
      }
    }

    if(logger.isDebugging()) {
      logger.debugFine(msg.toString());
    }
    return false;
  }

  @Override
  public TypeInformation[] getInputTypeRestriction() {
    return TypeUtil.array(TypeUtil.NUMBER_VECTOR_FIELD);
  }

  @Override
  protected Logging getLogger() {
    return logger;
  }
 
  /**
   * Parameterization class.
   *
   * @author Erich Schubert
   *
   * @apiviz.exclude
   */
  public static class Parameterizer<V extends NumberVector<V, ?>> extends AbstractParameterizer {
    /**
     * The COPAC instance to use
     */
    protected COPAC<V, IntegerDistance> copac;

    @SuppressWarnings("unchecked")
    @Override
    protected void makeOptions(Parameterization config) {
      super.makeOptions(config);
      copac = config.tryInstantiate(COPAC.class);
    }

    @Override
    protected ERiC<V> makeInstance() {
      return new ERiC<V>(copac);
    }
  }
}
TOP

Related Classes of de.lmu.ifi.dbs.elki.algorithm.clustering.correlation.ERiC

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.