Package de.lmu.ifi.dbs.elki.algorithm.clustering.correlation

Source Code of de.lmu.ifi.dbs.elki.algorithm.clustering.correlation.COPAC$Parameterizer

package de.lmu.ifi.dbs.elki.algorithm.clustering.correlation;

/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures

Copyright (C) 2011
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/

import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;

import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm;
import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm;
import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm;
import de.lmu.ifi.dbs.elki.data.Cluster;
import de.lmu.ifi.dbs.elki.data.Clustering;
import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.model.ClusterModel;
import de.lmu.ifi.dbs.elki.data.model.DimensionModel;
import de.lmu.ifi.dbs.elki.data.model.Model;
import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.database.ProxyDatabase;
import de.lmu.ifi.dbs.elki.database.ids.DBID;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.distance.distancefunction.FilteredLocalPCABasedDistanceFunction;
import de.lmu.ifi.dbs.elki.distance.distancefunction.IndexBasedDistanceFunction;
import de.lmu.ifi.dbs.elki.distance.distancefunction.LocallyWeightedDistanceFunction;
import de.lmu.ifi.dbs.elki.distance.distancefunction.ProxyDistanceFunction;
import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance;
import de.lmu.ifi.dbs.elki.index.preprocessed.LocalProjectionIndex;
import de.lmu.ifi.dbs.elki.index.preprocessed.LocalProjectionIndex.Factory;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;
import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil;
import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ChainedParameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ListParameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.TrackParameters;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ClassParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
import de.lmu.ifi.dbs.elki.utilities.pairs.Pair;

/**
* Provides the COPAC algorithm, an algorithm to partition a database according
* to the correlation dimension of its objects and to then perform an arbitrary
* clustering algorithm over the partitions.
* <p>
* Reference: Achtert E., Böhm C., Kriegel H.-P., Kröger P., Zimek A.: Robust,
* Complete, and Efficient Correlation Clustering. <br>
* In Proc. 7th SIAM International Conference on Data Mining (SDM'07),
* Minneapolis, MN, 2007
* </p>
*
* @author Arthur Zimek
*
* @apiviz.uses LocalProjectionIndex
* @apiviz.uses FilteredLocalPCABasedDistanceFunction
* @apiviz.has DimensionModel
*
* @param <V> the type of NumberVector handled by this Algorithm
*/
@Title("COPAC: COrrelation PArtition Clustering")
@Description("Partitions a database according to the correlation dimension of its objects and performs " + "a clustering algorithm over the partitions.")
@Reference(authors = "E. Achtert, C. Böhm, H.-P. Kriegel, P. Kröger P., A. Zimek", title = "Robust, Complete, and Efficient Correlation Clustering", booktitle = "Proc. 7th SIAM International Conference on Data Mining (SDM'07), Minneapolis, MN, 2007", url = "http://www.siam.org/proceedings/datamining/2007/dm07_037achtert.pdf")
public class COPAC<V extends NumberVector<V, ?>, D extends Distance<D>> extends AbstractAlgorithm<Clustering<Model>> implements ClusteringAlgorithm<Clustering<Model>> {
  /**
   * The logger for this class.
   */
  private static final Logging logger = Logging.getLogger(COPAC.class);

  /**
   * Parameter to specify the local PCA preprocessor to derive partition
   * criterion, must extend
   * {@link de.lmu.ifi.dbs.elki.index.preprocessed.localpca.AbstractFilteredPCAIndex}.
   * <p>
   * Key: {@code -copac.preprocessor}
   * </p>
   */
  public static final OptionID PREPROCESSOR_ID = OptionID.getOrCreateOptionID("copac.preprocessor", "Local PCA Preprocessor to derive partition criterion.");

  /**
   * Parameter to specify the distance function to use inside the partitions
   * {@link de.lmu.ifi.dbs.elki.distance.distancefunction.AbstractIndexBasedDistanceFunction}
   * .
   * <p>
   * Default value:
   * {@link de.lmu.ifi.dbs.elki.distance.distancefunction.LocallyWeightedDistanceFunction}
   * </p>
   * <p>
   * Key: {@code -copac.partitionDistance}
   * </p>
   */
  public static final OptionID PARTITION_DISTANCE_ID = OptionID.getOrCreateOptionID("copac.partitionDistance", "Distance to use for the inner algorithms.");

  /**
   * Parameter to specify the clustering algorithm to apply to each partition,
   * must extend
   * {@link de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm}.
   * <p>
   * Key: {@code -copac.partitionAlgorithm}
   * </p>
   */
  public static final OptionID PARTITION_ALGORITHM_ID = OptionID.getOrCreateOptionID("copac.partitionAlgorithm", "Clustering algorithm to apply to each partition.");

  /**
   * Holds the instance of the preprocessed distance function
   * {@link #PARTITION_DISTANCE_ID}.
   */
  private FilteredLocalPCABasedDistanceFunction<V, ?, D> partitionDistanceFunction;

  /**
   * Get the algorithm to run on each partition.
   */
  private Class<? extends ClusteringAlgorithm<Clustering<Model>>> partitionAlgorithm;

  /**
   * Holds the parameters of the algorithm to run on each partition.
   */
  private Collection<Pair<OptionID, Object>> partitionAlgorithmParameters;

  /**
   * The last used distance query
   */
  // FIXME: remove this when migrating to a full Factory pattern! This is
  // non-reentrant!
  private FilteredLocalPCABasedDistanceFunction.Instance<V, LocalProjectionIndex<V, ?>, D> partitionDistanceQuery;

  /**
   * Constructor.
   *
   * @param partitionDistanceFunction Distance function
   * @param partitionAlgorithm Algorithm to use on partitions
   * @param partitionAlgorithmParameters Parameters for Algorithm to run on
   *        partitions
   */
  public COPAC(FilteredLocalPCABasedDistanceFunction<V, ?, D> partitionDistanceFunction, Class<? extends ClusteringAlgorithm<Clustering<Model>>> partitionAlgorithm, Collection<Pair<OptionID, Object>> partitionAlgorithmParameters) {
    super();
    this.partitionDistanceFunction = partitionDistanceFunction;
    this.partitionAlgorithm = partitionAlgorithm;
    this.partitionAlgorithmParameters = partitionAlgorithmParameters;
  }

  /**
   * Performs the COPAC algorithm on the given database.
   *
   * @param relation Relation to process
   * @return Clustering result
   */
  @SuppressWarnings("unchecked")
  public Clustering<Model> run(Relation<V> relation) throws IllegalStateException {
    if(logger.isVerbose()) {
      logger.verbose("Running COPAC on db size = " + relation.size() + " with dimensionality = " + DatabaseUtil.dimensionality(relation));
    }

    partitionDistanceQuery = (FilteredLocalPCABasedDistanceFunction.Instance<V, LocalProjectionIndex<V, ?>, D>) partitionDistanceFunction.instantiate(relation);
    LocalProjectionIndex<V, ?> preprocin = partitionDistanceQuery.getIndex();

    // partitioning
    Map<Integer, ModifiableDBIDs> partitionMap = new HashMap<Integer, ModifiableDBIDs>();
    FiniteProgress partitionProgress = logger.isVerbose() ? new FiniteProgress("Partitioning", relation.size(), logger) : null;
    int processed = 1;

    for(DBID id : relation.iterDBIDs()) {
      Integer corrdim = preprocin.getLocalProjection(id).getCorrelationDimension();

      if(!partitionMap.containsKey(corrdim)) {
        partitionMap.put(corrdim, DBIDUtil.newArray());
      }

      partitionMap.get(corrdim).add(id);
      if(partitionProgress != null) {
        partitionProgress.setProcessed(processed++, logger);
      }
    }

    if(partitionProgress != null) {
      partitionProgress.ensureCompleted(logger);
    }
    if(logger.isVerbose()) {
      for(Integer corrDim : partitionMap.keySet()) {
        ModifiableDBIDs list = partitionMap.get(corrDim);
        logger.verbose("Partition [corrDim = " + corrDim + "]: " + list.size() + " objects.");
      }
    }

    // convert for partition algorithm.
    // TODO: do this with DynamicDBIDs instead
    Map<Integer, DBIDs> pmap = new HashMap<Integer, DBIDs>();
    for(Entry<Integer, ModifiableDBIDs> ent : partitionMap.entrySet()) {
      pmap.put(ent.getKey(), ent.getValue());
    }
    // running partition algorithm
    return runPartitionAlgorithm(relation, pmap, partitionDistanceQuery);
  }

  /**
   * Runs the partition algorithm and creates the result.
   *
   * @param relation the database to run this algorithm on
   * @param partitionMap the map of partition IDs to object ids
   * @param query The preprocessor based query function
   */
  private Clustering<Model> runPartitionAlgorithm(Relation<V> relation, Map<Integer, DBIDs> partitionMap, DistanceQuery<V, D> query) {
    Clustering<Model> result = new Clustering<Model>("COPAC clustering", "copac-clustering");

    // TODO: use an extra finite progress for the partitions?
    for(Entry<Integer, DBIDs> pair : partitionMap.entrySet()) {
      // noise partition
      if(pair.getKey() == DatabaseUtil.dimensionality(relation)) {
        // Make a Noise cluster
        result.addCluster(new Cluster<Model>(pair.getValue(), true, ClusterModel.CLUSTER));
      }
      else {
        DBIDs partids = pair.getValue();
        ProxyDatabase proxy = new ProxyDatabase(partids, relation);
       
        ClusteringAlgorithm<Clustering<Model>> partitionAlgorithm = getPartitionAlgorithm(query);
        if(logger.isVerbose()) {
          logger.verbose("Running " + partitionAlgorithm.getClass().getName() + " on partition [corrDim = " + pair.getKey() + "]...");
        }
        Clustering<Model> p = partitionAlgorithm.run(proxy);
        // Re-Wrap resulting Clusters as DimensionModel clusters.
        for(Cluster<Model> clus : p.getAllClusters()) {
          if(clus.isNoise()) {
            result.addCluster(new Cluster<Model>(clus.getIDs(), true, ClusterModel.CLUSTER));
          }
          else {
            result.addCluster(new Cluster<Model>(clus.getIDs(), new DimensionModel(pair.getKey())));
          }
        }
      }
    }
    return result;
  }

  /**
   * Returns the partition algorithm.
   *
   * @return the specified partition algorithm
   */
  public ClusteringAlgorithm<Clustering<Model>> getPartitionAlgorithm(DistanceQuery<V, D> query) {
    ListParameterization reconfig = new ListParameterization(partitionAlgorithmParameters);
    ProxyDistanceFunction<V, D> dist = ProxyDistanceFunction.proxy(query);
    reconfig.addParameter(AbstractDistanceBasedAlgorithm.DISTANCE_FUNCTION_ID, dist);
    ClusteringAlgorithm<Clustering<Model>> instance = reconfig.tryInstantiate(partitionAlgorithm);
    reconfig.failOnErrors();
    return instance;
  }

  /**
   * Get the last used distance query (to expose access to the preprocessor)
   *
   * Used by ERiC. TODO: migrate to factory pattern!
   *
   * @return distance query
   */
  public FilteredLocalPCABasedDistanceFunction.Instance<V, LocalProjectionIndex<V, ?>, D> getPartitionDistanceQuery() {
    return partitionDistanceQuery;
  }

  @Override
  public TypeInformation[] getInputTypeRestriction() {
    return TypeUtil.array(TypeUtil.NUMBER_VECTOR_FIELD);
  }

  @Override
  protected Logging getLogger() {
    return logger;
  }

  /**
   * Parameterization class.
   *
   * @author Erich Schubert
   *
   * @apiviz.exclude
   */
  public static class Parameterizer<V extends NumberVector<V, ?>, D extends Distance<D>> extends AbstractParameterizer {
    protected LocalProjectionIndex.Factory<V, ?> indexI = null;

    protected FilteredLocalPCABasedDistanceFunction<V, ?, D> pdistI = null;

    protected Class<? extends ClusteringAlgorithm<Clustering<Model>>> algC = null;

    protected Collection<Pair<OptionID, Object>> algO = null;

    @Override
    protected void makeOptions(Parameterization config) {
      super.makeOptions(config);
      ClassParameter<Factory<V, ?>> indexP = new ClassParameter<LocalProjectionIndex.Factory<V, ?>>(PREPROCESSOR_ID, LocalProjectionIndex.Factory.class);
      if(config.grab(indexP)) {
        indexI = indexP.instantiateClass(config);
      }

      ObjectParameter<FilteredLocalPCABasedDistanceFunction<V, ?, D>> pdistP = new ObjectParameter<FilteredLocalPCABasedDistanceFunction<V, ?, D>>(PARTITION_DISTANCE_ID, FilteredLocalPCABasedDistanceFunction.class, LocallyWeightedDistanceFunction.class);
      if(config.grab(pdistP)) {
        ListParameterization predefinedDist = new ListParameterization();
        predefinedDist.addParameter(IndexBasedDistanceFunction.INDEX_ID, indexI);
        ChainedParameterization chainDist = new ChainedParameterization(predefinedDist, config);
        chainDist.errorsTo(config);
        pdistI = pdistP.instantiateClass(chainDist);
        predefinedDist.reportInternalParameterizationErrors(config);
      }

      // Parameterize algorithm:
      ClassParameter<ClusteringAlgorithm<Clustering<Model>>> algP = new ClassParameter<ClusteringAlgorithm<Clustering<Model>>>(PARTITION_ALGORITHM_ID, ClusteringAlgorithm.class);
      if(config.grab(algP)) {
        ListParameterization predefined = new ListParameterization();
        predefined.addParameter(AbstractDistanceBasedAlgorithm.DISTANCE_FUNCTION_ID, pdistI);
        TrackParameters trackpar = new TrackParameters(config);
        ChainedParameterization chain = new ChainedParameterization(predefined, trackpar);
        chain.errorsTo(config);
        algP.instantiateClass(chain);
        algC = algP.getValue();
        algO = trackpar.getGivenParameters();
        predefined.reportInternalParameterizationErrors(chain);
      }
    }

    @Override
    protected COPAC<V, D> makeInstance() {
      return new COPAC<V, D>(pdistI, algC, algO);
    }
  }
}
TOP

Related Classes of de.lmu.ifi.dbs.elki.algorithm.clustering.correlation.COPAC$Parameterizer

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.