Source Code of de.lmu.ifi.dbs.elki.algorithm.clustering.correlation.LMCLUS$Parameterizer

package de.lmu.ifi.dbs.elki.algorithm.clustering.correlation;
/*
 This file is part of ELKI:
 Environment for Developing KDD-Applications Supported by Index-Structures


 Copyright (C) 2012
 Ludwig-Maximilians-Universität München
 Lehr- und Forschungseinheit für Datenbanksysteme
 ELKI Development Team


 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU Affero General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.


 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU Affero General Public License for more details.


 You should have received a copy of the GNU Affero General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */


import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Random;


import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm;
import de.lmu.ifi.dbs.elki.data.Cluster;
import de.lmu.ifi.dbs.elki.data.Clustering;
import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.model.Model;
import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.database.ids.DBID;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;
import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress;
import de.lmu.ifi.dbs.elki.math.MeanVariance;
import de.lmu.ifi.dbs.elki.math.histograms.FlexiHistogram;
import de.lmu.ifi.dbs.elki.math.linearalgebra.Matrix;
import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector;
import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
import de.lmu.ifi.dbs.elki.utilities.exceptions.UnableToComplyException;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
import de.lmu.ifi.dbs.elki.utilities.pairs.DoubleObjPair;


/**
 * Linear manifold clustering in high dimensional spaces by stochastic search.
 * 
 * Reference:
 * <p>
 * Robert Haralick, Rave Harpaz<br />
 * Linear manifold clustering in high dimensional spaces by stochastic search<br/>
 * In: Pattern Recognition volume 40, Issue 10
 * </p>
 * 
 * Implementation note: the LMCLUS algorithm seems to lack good stopping
 * criterions. We can't entirely reproduce the good results from the original
 * publication, in particular not on noisy data. But the questionable parts are
 * as in the original publication, associated thesis and published source code.
 * The minimum cluster size however can serve as a hidden stopping criterion.
 * 
 * @author Ernst Waas
 * @author Erich Schubert
 */
@Reference(authors = "Robert Haralick, Rave Harpaz", title = "Linear manifold clustering in high dimensional spaces by stochastic search", booktitle = "Pattern Recognition volume 40, Issue 10", url = "http://dx.doi.org/10.1016/j.patcog.2007.01.020")
public class LMCLUS extends AbstractAlgorithm<Clustering<Model>> {
  /**
   * The logger for this class.
   */
  private static final Logging logger = Logging.getLogger(LMCLUS.class);


  /**
   * Epsilon
   */
  private final static double NOT_FROM_ONE_CLUSTER_PROBABILITY = 0.2;


  /**
   * Histogram resolution
   */
  private final static int BINS = 50;


  /**
   * The current threshold value calculated by the findSeperation Method.
   */
  private final double sensitivityThreshold;


  /**
   * Maximum cluster dimensionality
   */
  private final int maxLMDim;


  /**
   * Minimum cluster size
   */
  private final int minsize;


  /**
   * Number of sampling rounds to find a good split
   */
  private final int samplingLevel;


  /**
   * Constructor.
   * 
   * @param maxdim Maximum dimensionality
   * @param minsize Minimum cluster size
   * @param samplingLevel Sampling level
   * @param sensitivityThreshold Threshold
   */
  public LMCLUS(int maxdim, int minsize, int samplingLevel, double sensitivityThreshold) {
    super();
    this.maxLMDim = maxdim;
    this.minsize = minsize;
    this.samplingLevel = samplingLevel;
    this.sensitivityThreshold = sensitivityThreshold;
  }


  /**
   * The main LMCLUS (Linear manifold clustering algorithm) is processed in this
   * method.
   * 
   * <PRE>
   * The algorithm samples random linear manifolds and tries to find clusters in it.
   * It calculates a distance histogram searches for a threshold and partitions the
   * points in two groups the ones in the cluster and everything else.
   * Then the best fitting linear manifold is searched and registered as a cluster.
   * The process is started over until all points are clustered.
   * The last cluster should contain all the outliers. (or the whole data if no clusters have been found.)
   * For details see {@link LMCLUS}.
   * </PRE>
   * 
   * @param database The database to operate on
   * @param relation Relation
   * @return Clustering result
   * @throws de.lmu.ifi.dbs.elki.utilities.UnableToComplyException
   */
  public Clustering<Model> run(Database database, Relation<NumberVector<?, ?>> relation) throws UnableToComplyException {
    Clustering<Model> ret = new Clustering<Model>("LMCLUS Clustering", "lmclus-clustering");
    FiniteProgress progress = logger.isVerbose() ? new FiniteProgress("Clustered objects", relation.size(), logger) : null;
    IndefiniteProgress cprogress = logger.isVerbose() ? new IndefiniteProgress("Clusters found", logger) : null;
    ModifiableDBIDs unclustered = DBIDUtil.newHashSet(relation.getDBIDs());


    final int maxdim = Math.min(maxLMDim, DatabaseUtil.dimensionality(relation));
    int cnum = 0;
    while(unclustered.size() > minsize) {
      DBIDs current = unclustered;
      int lmDim = 1;
      for(int k = 1; k <= maxdim; k++) {
        // Implementation note: this while loop is from the original publication
        // and the published LMCLUS source code. It doesn't make sense to me -
        // it is lacking a stop criterion other than "cluster is too small" and
        // "cluster is inseparable"! Additionally, there is good criterion for
        // stopping at the appropriate dimensionality either.
        while(true) {
          Separation separation = findSeparation(relation, current, k);
          // logger.verbose("k: " + k + " goodness: " + separation.goodness +
          // " threshold: " + separation.threshold);
          if(separation.goodness <= sensitivityThreshold) {
            break;
          }
          ModifiableDBIDs subset = DBIDUtil.newArray(current.size());
          for(DBID id : current) {
            if(deviation(relation.get(id).getColumnVector().minusEquals(separation.originV), separation.basis) < separation.threshold) {
              subset.add(id);
            }
          }
          // logger.verbose("size:"+subset.size());
          if(subset.size() < minsize) {
            break;
          }
          current = subset;
          lmDim = k;
          // System.out.println("Partition: " + subset.size());
        }
      }
      // No more clusters found
      if(current.size() < minsize || current == unclustered) {
        break;
      }
      // New cluster found
      // TODO: annotate cluster with dimensionality
      final Cluster<Model> cluster = new Cluster<Model>(current);
      cluster.setName("Cluster_" + lmDim + "d_" + cnum);
      cnum++;
      ret.addCluster(cluster);
      // Remove from main working set.
      unclustered.removeDBIDs(current);
      if(progress != null) {
        progress.setProcessed(relation.size() - unclustered.size(), logger);
      }
      if(cprogress != null) {
        cprogress.setProcessed(cnum, logger);
      }
    }
    // Remaining objects are noise
    if(unclustered.size() > 0) {
      ret.addCluster(new Cluster<Model>(unclustered, true));
    }
    if(progress != null) {
      progress.setProcessed(relation.size(), logger);
      progress.ensureCompleted(logger);
    }
    if(cprogress != null) {
      cprogress.setCompleted(logger);
    }
    return ret;
  }


  /**
   * Deviation from a manifold described by beta.
   * 
   * @param delta Delta from origin vector
   * @param beta Manifold
   * @return Deviation score
   */
  private double deviation(Vector delta, Matrix beta) {
    double a = delta.euclideanLength();
    double b = beta.transposeTimes(delta).euclideanLength();
    return Math.sqrt((a * a) - (b * b));
  }


  /**
   * This method samples a number of linear manifolds an tries to determine
   * which the one with the best cluster is.
   * 
   * <PRE>
   * A number of sample points according to the dimension of the linear manifold are taken.
   * The basis (B) and the origin(o) of the manifold are calculated.
   * A distance histogram using  the distance function ||x-o|| -||B^t*(x-o)|| is generated.
   * The best threshold is searched using the elevate threshold function.
   * The overall goodness of the threshold is determined.
   * The process is redone until a specific number of samples is taken.
   * </PRE>
   * 
   * @param relation The vector relation
   * @param currentids Current DBIDs
   * @param dimension the dimension of the linear manifold to sample.
   * @return the overall goodness of the separation. The values origin basis and
   *         threshold are returned indirectly over class variables.
   */
  private Separation findSeparation(Relation<NumberVector<?, ?>> relation, DBIDs currentids, int dimension) {
    Separation separation = new Separation();
    // determine the number of samples needed, to secure that with a specific
    // probability
    // in at least on sample every sampled point is from the same cluster.
    int samples = (int) Math.min(Math.log(NOT_FROM_ONE_CLUSTER_PROBABILITY) / (Math.log(1 - Math.pow((1.0d / samplingLevel), dimension))), (double) currentids.size());
    // System.out.println("Number of samples: " + samples);
    Random r = new Random();
    int remaining_retries = 100;
    for(int i = 1; i <= samples; i++) {
      DBIDs sample = DBIDUtil.randomSample(currentids, dimension + 1, r.nextLong());
      final Iterator<DBID> iter = sample.iterator();
      // Use first as origin
      DBID origin = iter.next();
      Vector originV = relation.get(origin).getColumnVector();
      // Build orthogonal basis from remainder
      Matrix basis;
      {
        List<Vector> vectors = new ArrayList<Vector>(sample.size() - 1);
        while(iter.hasNext()) {
          Vector vec = relation.get(iter.next()).getColumnVector();
          vectors.add(vec.minusEquals(originV));
        }
        // generate orthogonal basis
        basis = generateOrthonormalBasis(vectors);
        if(basis == null) {
          // new sample has to be taken.
          i--;
          remaining_retries--;
          if(remaining_retries < 0) {
            throw new AbortException("Too many retries in sampling, and always a linear dependant data set.");
          }
          continue;
        }
      }
      // Generate and fill a histogram.
      FlexiHistogram<Double, Double> histogram = FlexiHistogram.DoubleSumHistogram(BINS);
      double w = 1.0 / currentids.size();
      for(DBID point : currentids) {
        // Skip sampled points
        if(sample.contains(point)) {
          continue;
        }
        Vector vec = relation.get(point).getColumnVector().minusEquals(originV);
        final double distance = deviation(vec, basis);
        histogram.aggregate(distance, w);
      }
      double[] th = findAndEvaluateThreshold(histogram); // evaluate threshold
      if(th[1] > separation.goodness) {
        separation.goodness = th[1];
        separation.threshold = th[0];
        separation.originV = originV;
        separation.basis = basis;
      }
    }
    return separation;
  }


  /**
   * This Method generates an orthonormal basis from a set of Vectors. It uses
   * the established Gram-Schmidt algorithm for orthonormalisation:
   * 
   * <PRE>
   * u_1 = v_1
   * u_k = v_k -proj_u1(v_k)...proj_u(k-1)(v_k);
   * 
   * Where proj_u(v) = <v,u>/<u,u> *u
   * </PRE>
   * 
   * @param vectors The set of vectors to generate the orthonormal basis from
   * @return the orthonormal basis generated by this method.
   * @throws RuntimeException if the given vectors are not linear independent.
   */
  private Matrix generateOrthonormalBasis(List<Vector> vectors) {
    Vector first = vectors.get(0);
    first = first.times(1.0 / first.euclideanLength());
    Matrix ret = new Matrix(first.getDimensionality(), vectors.size());
    ret.setCol(0, first);
    for(int i = 1; i < vectors.size(); i++) {
      // System.out.println("Matrix:" + ret);
      Vector v_i = vectors.get(i);
      Vector u_i = v_i.copy();
      // System.out.println("Vector " + i + ":" + partialSol);
      for(int j = 0; j < i; j++) {
        Vector v_j = ret.getCol(j);
        double f = v_i.transposeTimes(v_j) / v_j.transposeTimes(v_j);
        if(Double.isNaN(f)) {
          if(logger.isDebuggingFine()) {
            logger.debugFine("Zero vector encountered? " + v_j);
          }
          return null;
        }
        u_i.minusTimesEquals(v_j, f);
      }
      // check if the vectors weren't independent
      final double len_u_i = u_i.euclideanLength();
      if(len_u_i == 0.0) {
        if(logger.isDebuggingFine()) {
          logger.debugFine("Points not independent - no orthonormalization.");
        }
        return null;
      }
      // System.out.println("Vector " + i + ":" + partialSol);
      u_i.timesEquals(1 / len_u_i);
      ret.setCol(i, u_i);
    }
    return ret;
  }


  /**
   * Evaluate the histogram to find a suitable threshold
   * 
   * @param histogram Histogram to evaluate
   * @return Position and goodness
   */
  private double[] findAndEvaluateThreshold(FlexiHistogram<Double, Double> histogram) {
    int n = histogram.getNumBins();
    double[] p1 = new double[n];
    double[] p2 = new double[n];
    double[] mu1 = new double[n];
    double[] mu2 = new double[n];
    double[] sigma1 = new double[n];
    double[] sigma2 = new double[n];
    double[] jt = new double[n];
    // Forward pass
    {
      MeanVariance mv = new MeanVariance();
      Iterator<DoubleObjPair<Double>> forward = histogram.iterator();
      for(int i = 0; forward.hasNext(); i++) {
        DoubleObjPair<Double> pair = forward.next();
        p1[i] = pair.second + ((i > 0) ? p1[i - 1] : 0);
        mv.put(i, pair.second);
        mu1[i] = mv.getMean();
        sigma1[i] = mv.getNaiveStddev();
      }
    }
    // Backwards pass
    {
      MeanVariance mv = new MeanVariance();
      Iterator<DoubleObjPair<Double>> backwards = histogram.reverseIterator();
      for(int j = n - 1; backwards.hasNext(); j--) {
        DoubleObjPair<Double> pair = backwards.next();
        p2[j] = pair.second + ((j + 1 < n) ? p2[j + 1] : 0);
        mv.put(j, pair.second);
        mu2[j] = mv.getMean();
        sigma2[j] = mv.getNaiveStddev();
      }
    }


    for(int i = 0; i < n; i++) {
      jt[i] = 1.0 + 2 * (p1[i] * (Math.log(sigma1[i]) - Math.log(p1[i])) + p2[i] * (Math.log(sigma2[i]) - Math.log(p2[i])));
    }


    int bestpos = -1;
    double bestgoodness = Double.NEGATIVE_INFINITY;


    double devPrev = jt[1] - jt[0];
    for(int i = 1; i < jt.length - 1; i++) {
      double devCur = jt[i + 1] - jt[i];
      // System.out.println(p1[i]);
      // System.out.println(jt[i + 1]);
      // System.out.println(jt[i]);
      // System.out.println(devCur);
      // Local minimum found - calculate depth
      if(devCur >= 0 && devPrev <= 0) {
        double lowestMaxima = Double.POSITIVE_INFINITY;
        for(int j = i - 1; j > 0; j--) {
          if(jt[j - 1] < jt[j]) {
            lowestMaxima = Math.min(lowestMaxima, jt[j]);
            break;
          }
        }
        for(int j = i + 1; j < n - 2; j++) {
          if(jt[j + 1] < jt[j]) {
            lowestMaxima = Math.min(lowestMaxima, jt[j]);
            break;
          }
        }
        double localDepth = lowestMaxima - jt[i];


        final double mud = mu1[i] - mu2[i];
        double discriminability = mud * mud / (sigma1[i] * sigma1[i] + sigma2[i] * sigma2[i]);
        if(Double.isNaN(discriminability)) {
          discriminability = -1;
        }
        double goodness = localDepth * discriminability;
        if(goodness > bestgoodness) {
          bestgoodness = goodness;
          bestpos = i;
        }
      }
      devPrev = devCur;
    }
    return new double[] { histogram.getBinMax(bestpos), bestgoodness };
  }


  @Override
  protected Logging getLogger() {
    return logger;
  }


  @Override
  public TypeInformation[] getInputTypeRestriction() {
    return TypeUtil.array(TypeUtil.NUMBER_VECTOR_FIELD);
  }


  /**
   * Class to represent a linear manifold separation
   * 
   * @author Erich Schubert
   * 
   * @apiviz.exclude
   */
  private static class Separation {
    /**
     * Goodness of separation
     */
    double goodness = Double.NEGATIVE_INFINITY;


    /**
     * Threshold
     */
    double threshold = Double.NEGATIVE_INFINITY;


    /**
     * Basis of manifold
     */
    Matrix basis = null;


    /**
     * Origin vector
     */
    Vector originV = null;
  }


  /**
   * Parameterization class
   * 
   * @author Erich Schubert
   * 
   * @apiviz.exclude
   */
  public static class Parameterizer extends AbstractParameterizer {
    /**
     * Parameter with the maximum dimension to search for
     */
    public static final OptionID MAXDIM_ID = OptionID.getOrCreateOptionID("lmclus.maxdim", "Maximum linear manifold dimension to search.");


    /**
     * Parameter for the minimum cluster size
     */
    public static final OptionID MINSIZE_ID = OptionID.getOrCreateOptionID("lmclus.minsize", "Minimum cluster size to allow.");


    /**
     * Sampling intensity level
     */
    public static final OptionID SAMPLINGL_ID = OptionID.getOrCreateOptionID("lmclus.sampling-level", "A number used to determine how many samples are taken in each search.");


    /**
     * Global significance threshold
     */
    public static final OptionID THRESHOLD_ID = OptionID.getOrCreateOptionID("lmclus.threshold", "Threshold to determine if a cluster was found.");


    /**
     * Maximum dimensionality to search for
     */
    private int maxdim = Integer.MAX_VALUE;


    /**
     * Minimum cluster size.
     */
    private int minsize;


    /**
     * Sampling level
     */
    private int samplingLevel;


    /**
     * Threshold
     */
    private double threshold;


    @Override
    protected void makeOptions(Parameterization config) {
      super.makeOptions(config);
      IntParameter maxLMDimP = new IntParameter(MAXDIM_ID, new GreaterEqualConstraint(1), true);
      if(config.grab(maxLMDimP)) {
        maxdim = maxLMDimP.getValue();
      }
      IntParameter minsizeP = new IntParameter(MINSIZE_ID, new GreaterEqualConstraint(1));
      if(config.grab(minsizeP)) {
        minsize = minsizeP.getValue();
      }
      IntParameter samplingLevelP = new IntParameter(SAMPLINGL_ID, 100);
      if(config.grab(samplingLevelP)) {
        samplingLevel = samplingLevelP.getValue();
      }
      DoubleParameter sensivityThresholdP = new DoubleParameter(THRESHOLD_ID);
      if(config.grab(sensivityThresholdP)) {
        threshold = sensivityThresholdP.getValue();
      }
    }


    @Override
    protected LMCLUS makeInstance() {
      return new LMCLUS(maxdim, minsize, samplingLevel, threshold);
    }
  }
}
Source Code of de.lmu.ifi.dbs.elki.algorithm.clustering.correlation.LMCLUS$Parameterizer

Related Classes of de.lmu.ifi.dbs.elki.algorithm.clustering.correlation.LMCLUS$Parameterizer