Package de.lmu.ifi.dbs.elki.algorithm.clustering.correlation

Source Code of de.lmu.ifi.dbs.elki.algorithm.clustering.correlation.CASH$Parameterizer

package de.lmu.ifi.dbs.elki.algorithm.clustering.correlation;

/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures

Copyright (C) 2011
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/

import java.util.Arrays;
import java.util.List;

import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm;
import de.lmu.ifi.dbs.elki.algorithm.DependencyDerivator;
import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm;
import de.lmu.ifi.dbs.elki.algorithm.clustering.correlation.cash.CASHInterval;
import de.lmu.ifi.dbs.elki.algorithm.clustering.correlation.cash.CASHIntervalSplit;
import de.lmu.ifi.dbs.elki.data.Cluster;
import de.lmu.ifi.dbs.elki.data.Clustering;
import de.lmu.ifi.dbs.elki.data.DoubleVector;
import de.lmu.ifi.dbs.elki.data.HyperBoundingBox;
import de.lmu.ifi.dbs.elki.data.ParameterizationFunction;
import de.lmu.ifi.dbs.elki.data.model.ClusterModel;
import de.lmu.ifi.dbs.elki.data.model.CorrelationAnalysisSolution;
import de.lmu.ifi.dbs.elki.data.model.LinearEquationModel;
import de.lmu.ifi.dbs.elki.data.model.Model;
import de.lmu.ifi.dbs.elki.data.spatial.SpatialUtil;
import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation;
import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.database.ProxyDatabase;
import de.lmu.ifi.dbs.elki.database.QueryUtil;
import de.lmu.ifi.dbs.elki.database.ids.DBID;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.datasource.filter.NonNumericFeaturesException;
import de.lmu.ifi.dbs.elki.distance.distancefunction.WeightedDistanceFunction;
import de.lmu.ifi.dbs.elki.distance.distancevalue.DoubleDistance;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;
import de.lmu.ifi.dbs.elki.math.linearalgebra.LinearEquationSystem;
import de.lmu.ifi.dbs.elki.math.linearalgebra.Matrix;
import de.lmu.ifi.dbs.elki.math.linearalgebra.pca.FirstNEigenPairFilter;
import de.lmu.ifi.dbs.elki.math.linearalgebra.pca.PCAFilteredRunner;
import de.lmu.ifi.dbs.elki.utilities.ClassGenericsUtil;
import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil;
import de.lmu.ifi.dbs.elki.utilities.datastructures.heap.Heap;
import de.lmu.ifi.dbs.elki.utilities.datastructures.heap.IntegerPriorityObject;
import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.exceptions.UnableToComplyException;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.ParameterException;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ListParameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.Flag;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;

/**
* Provides the CASH algorithm, an subspace clustering algorithm based on the
* hough transform.
* <p>
* Reference: E. Achtert, C. Böhm, J. David, P. Kröger, A. Zimek: Robust
* clustering in arbitrarily oriented subspaces. <br>
* In Proc. 8th SIAM Int. Conf. on Data Mining (SDM'08), Atlanta, GA, 2008
* </p>
*
* @author Elke Achtert
*
* @apiviz.has CASHInterval
* @apiviz.uses ParameterizationFunction
* @apiviz.has LinearEquationModel
*/
// todo elke hierarchy (later)
@Title("CASH: Robust clustering in arbitrarily oriented subspaces")
@Description("Subspace clustering algorithm based on the hough transform.")
@Reference(authors = "E. Achtert, C. Böhm, J. David, P. Kröger, A. Zimek", title = "Robust clustering in arbitraily oriented subspaces", booktitle = "Proc. 8th SIAM Int. Conf. on Data Mining (SDM'08), Atlanta, GA, 2008", url = "http://www.siam.org/proceedings/datamining/2008/dm08_69_AchtertBoehmDavidKroegerZimek.pdf")
public class CASH extends AbstractAlgorithm<Clustering<Model>> implements ClusteringAlgorithm<Clustering<Model>> {
  /**
   * The logger for this class.
   */
  private static final Logging logger = Logging.getLogger(CASH.class);

  /**
   * Parameter to specify the threshold for minimum number of points in a
   * cluster, must be an integer greater than 0.
   * <p>
   * Key: {@code -cash.minpts}
   * </p>
   */
  public static final OptionID MINPTS_ID = OptionID.getOrCreateOptionID("cash.minpts", "Threshold for minimum number of points in a cluster.");

  /**
   * Parameter to specify the maximum level for splitting the hypercube, must be
   * an integer greater than 0.
   * <p>
   * Key: {@code -cash.maxlevel}
   * </p>
   */
  public static final OptionID MAXLEVEL_ID = OptionID.getOrCreateOptionID("cash.maxlevel", "The maximum level for splitting the hypercube.");

  /**
   * Parameter to specify the minimum dimensionality of the subspaces to be
   * found, must be an integer greater than 0.
   * <p>
   * Default value: {@code 1}
   * </p>
   * <p>
   * Key: {@code -cash.mindim}
   * </p>
   */
  public static final OptionID MINDIM_ID = OptionID.getOrCreateOptionID("cash.mindim", "The minimum dimensionality of the subspaces to be found.");

  /**
   * Parameter to specify the maximum jitter for distance values, must be a
   * double greater than 0.
   * <p>
   * Key: {@code -cash.jitter}
   * </p>
   */
  public static final OptionID JITTER_ID = OptionID.getOrCreateOptionID("cash.jitter", "The maximum jitter for distance values.");

  /**
   * Flag to indicate that an adjustment of the applied heuristic for choosing
   * an interval is performed after an interval is selected.
   * <p>
   * Key: {@code -cash.adjust}
   * </p>
   */
  public static final OptionID ADJUST_ID = OptionID.getOrCreateOptionID("cash.adjust", "Flag to indicate that an adjustment of the applied heuristic for choosing an interval " + "is performed after an interval is selected.");

  /**
   * Holds the value of {@link #MINPTS_ID}.
   */
  private int minPts;

  /**
   * Holds the value of {@link #MAXLEVEL_ID}.
   */
  private int maxLevel;

  /**
   * Holds the value of {@link #MINDIM_ID}.
   */
  private int minDim;

  /**
   * Holds the value of {@link #JITTER_ID}.
   */
  private double jitter;

  /**
   * Holds the value of {@link #ADJUST_ID}.
   */
  private boolean adjust;

  /**
   * Holds the dimensionality for noise.
   */
  private int noiseDim;

  /**
   * Holds a set of processed ids.
   */
  private ModifiableDBIDs processedIDs;

  /**
   * The entire database
   */
  private Relation<ParameterizationFunction> fulldatabase;

  /**
   * Constructor.
   *
   * @param minPts MinPts parameter
   * @param maxLevel Maximum level
   * @param minDim Minimum dimensionality
   * @param jitter Jitter
   * @param adjust Adjust
   */
  public CASH(int minPts, int maxLevel, int minDim, double jitter, boolean adjust) {
    super();
    this.minPts = minPts;
    this.maxLevel = maxLevel;
    this.minDim = minDim;
    this.jitter = jitter;
    this.adjust = adjust;
  }

  /**
   * Run CASH on the relation.
   *
   * @param database Database
   * @param relation Relation
   * @return Clustering result
   */
  public Clustering<Model> run(Database database, Relation<ParameterizationFunction> relation) {
    this.fulldatabase = relation;
    if(logger.isVerbose()) {
      StringBuffer msg = new StringBuffer();
      msg.append("DB size: ").append(relation.size());
      msg.append("\nmin Dim: ").append(minDim);
      logger.verbose(msg.toString());
    }

    try {
      processedIDs = DBIDUtil.newHashSet(relation.size());
      noiseDim = DatabaseUtil.dimensionality(relation);

      FiniteProgress progress = logger.isVerbose() ? new FiniteProgress("CASH Clustering", relation.size(), logger) : null;
      Clustering<Model> result = doRun(relation, progress);
      if(progress != null) {
        progress.ensureCompleted(logger);
      }

      if(logger.isVerbose()) {
        StringBuffer msg = new StringBuffer();
        for(Cluster<Model> c : result.getAllClusters()) {
          if(c.getModel() instanceof LinearEquationModel) {
            LinearEquationModel s = (LinearEquationModel) c.getModel();
            msg.append("\n Cluster: Dim: " + s.getLes().subspacedim() + " size: " + c.size());
          }
          else {
            msg.append("\n Cluster: " + c.getModel().getClass().getName() + " size: " + c.size());
          }
        }
        logger.verbose(msg.toString());
      }
      return result;
    }
    catch(UnableToComplyException e) {
      throw new IllegalStateException(e);
    }
    catch(ParameterException e) {
      throw new IllegalStateException(e);
    }
    catch(NonNumericFeaturesException e) {
      throw new IllegalStateException(e);
    }
  }

  /**
   * Runs the CASH algorithm on the specified database, this method is
   * recursively called until only noise is left.
   *
   * @param relation the Relation to run the CASH algorithm on
   * @param progress the progress object for verbose messages
   * @return a mapping of subspace dimensionalities to clusters
   * @throws UnableToComplyException if an error according to the database
   *         occurs
   * @throws ParameterException if the parameter setting is wrong
   * @throws NonNumericFeaturesException if non numeric feature vectors are used
   */
  private Clustering<Model> doRun(Relation<ParameterizationFunction> relation, FiniteProgress progress) throws UnableToComplyException, ParameterException, NonNumericFeaturesException {
    Clustering<Model> res = new Clustering<Model>("CASH clustering", "cash-clustering");

    final int dim = DatabaseUtil.dimensionality(relation);

    // init heap
    Heap<IntegerPriorityObject<CASHInterval>> heap = new Heap<IntegerPriorityObject<CASHInterval>>();
    ModifiableDBIDs noiseIDs = DBIDUtil.newHashSet(relation.getDBIDs());
    initHeap(heap, relation, dim, noiseIDs);

    if(logger.isDebugging()) {
      StringBuffer msg = new StringBuffer();
      msg.append("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX");
      msg.append("\nXXXX dim ").append(dim);
      msg.append("\nXXXX database.size ").append(relation.size());
      msg.append("\nXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX");
      logger.debugFine(msg.toString());
    }
    else if(logger.isVerbose()) {
      StringBuffer msg = new StringBuffer();
      msg.append("XXXX dim ").append(dim).append(" database.size ").append(relation.size());
      logger.verbose(msg.toString());
    }

    // get the ''best'' d-dimensional intervals at max level
    while(!heap.isEmpty()) {
      CASHInterval interval = determineNextIntervalAtMaxLevel(heap);
      if(logger.isDebugging()) {
        logger.debugFine("next interval in dim " + dim + ": " + interval);
      }
      else if(logger.isVerbose()) {
        logger.verbose("next interval in dim " + dim + ": " + interval);
      }

      // only noise left
      if(interval == null) {
        break;
      }

      // do a dim-1 dimensional run
      ModifiableDBIDs clusterIDs = DBIDUtil.newHashSet();
      if(dim > minDim + 1) {
        ModifiableDBIDs ids;
        Matrix basis_dim_minus_1;
        if(adjust) {
          ids = DBIDUtil.newHashSet();
          basis_dim_minus_1 = runDerivator(relation, dim, interval, ids);
        }
        else {
          ids = interval.getIDs();
          basis_dim_minus_1 = determineBasis(SpatialUtil.centroid(interval));
        }

        if(ids.size() != 0) {
          MaterializedRelation<ParameterizationFunction> db = buildDB(dim, basis_dim_minus_1, ids, relation);
          // add result of dim-1 to this result
          Clustering<Model> res_dim_minus_1 = doRun(db, progress);
          for(Cluster<Model> cluster : res_dim_minus_1.getAllClusters()) {
            res.addCluster(cluster);
            noiseIDs.removeDBIDs(cluster.getIDs());
            clusterIDs.addDBIDs(cluster.getIDs());
            processedIDs.addDBIDs(cluster.getIDs());
          }
        }
      }
      // dim == minDim
      else {
        LinearEquationSystem les = runDerivator(relation, dim - 1, interval.getIDs());
        Cluster<Model> c = new Cluster<Model>(interval.getIDs(), new LinearEquationModel(les));
        res.addCluster(c);
        noiseIDs.removeDBIDs(interval.getIDs());
        clusterIDs.addDBIDs(interval.getIDs());
        processedIDs.addAll(interval.getIDs());
      }

      // Rebuild heap
      List<IntegerPriorityObject<CASHInterval>> heapVector = heap.toSortedArrayList();
      for(IntegerPriorityObject<CASHInterval> pair : heapVector) {
        CASHInterval currentInterval = pair.getObject();
        currentInterval.removeIDs(clusterIDs);
        if(currentInterval.getIDs().size() >= minPts) {
          heap.add(new IntegerPriorityObject<CASHInterval>(currentInterval.priority(), currentInterval));
        }
      }

      if(progress != null) {
        progress.setProcessed(processedIDs.size(), logger);
      }
    }

    // put noise to clusters
    if(!noiseIDs.isEmpty()) {
      if(dim == noiseDim) {
        Cluster<Model> c = new Cluster<Model>(noiseIDs, true, ClusterModel.CLUSTER);
        res.addCluster(c);
        processedIDs.addAll(noiseIDs);
      }
      else if(noiseIDs.size() >= minPts) {
        LinearEquationSystem les = runDerivator(fulldatabase, dim - 1, noiseIDs);
        Cluster<Model> c = new Cluster<Model>(noiseIDs, true, new LinearEquationModel(les));
        res.addCluster(c);
        processedIDs.addAll(noiseIDs);
      }
    }

    if(logger.isDebugging()) {
      StringBuffer msg = new StringBuffer();
      msg.append("noise fuer dim ").append(dim).append(": ").append(noiseIDs.size());

      for(Cluster<Model> c : res.getAllClusters()) {
        if(c.getModel() instanceof LinearEquationModel) {
          LinearEquationModel s = (LinearEquationModel) c.getModel();
          msg.append("\n Cluster: Dim: " + s.getLes().subspacedim() + " size: " + c.size());
        }
        else {
          msg.append("\n Cluster: " + c.getModel().getClass().getName() + " size: " + c.size());
        }
      }
      logger.debugFine(msg.toString());
    }

    if(progress != null) {
      progress.setProcessed(processedIDs.size(), logger);
    }
    return res;
  }

  /**
   * Initializes the heap with the root intervals.
   *
   * @param heap the heap to be initialized
   * @param relation the database storing the parameterization functions
   * @param dim the dimensionality of the database
   * @param ids the ids of the database
   */
  private void initHeap(Heap<IntegerPriorityObject<CASHInterval>> heap, Relation<ParameterizationFunction> relation, int dim, DBIDs ids) {
    CASHIntervalSplit split = new CASHIntervalSplit(relation, minPts);

    // determine minimum and maximum function value of all functions
    double[] minMax = determineMinMaxDistance(relation, dim);

    double d_min = minMax[0];
    double d_max = minMax[1];
    double dIntervalLength = d_max - d_min;
    int numDIntervals = (int) Math.ceil(dIntervalLength / jitter);
    double dIntervalSize = dIntervalLength / numDIntervals;
    double[] d_mins = new double[numDIntervals];
    double[] d_maxs = new double[numDIntervals];

    if(logger.isDebugging()) {
      StringBuffer msg = new StringBuffer();
      msg.append("d_min ").append(d_min);
      msg.append("\nd_max ").append(d_max);
      msg.append("\nnumDIntervals ").append(numDIntervals);
      msg.append("\ndIntervalSize ").append(dIntervalSize);
      logger.debugFine(msg.toString());
    }
    else if(logger.isVerbose()) {
      StringBuffer msg = new StringBuffer();
      msg.append("d_min ").append(d_min);
      msg.append("\nd_max ").append(d_max);
      msg.append("\nnumDIntervals ").append(numDIntervals);
      msg.append("\ndIntervalSize ").append(dIntervalSize);
      logger.verbose(msg.toString());
    }

    // alpha intervals
    double[] alphaMin = new double[dim - 1];
    double[] alphaMax = new double[dim - 1];
    Arrays.fill(alphaMax, Math.PI);

    for(int i = 0; i < numDIntervals; i++) {
      if(i == 0) {
        d_mins[i] = d_min;
      }
      else {
        d_mins[i] = d_maxs[i - 1];
      }

      if(i < numDIntervals - 1) {
        d_maxs[i] = d_mins[i] + dIntervalSize;
      }
      else {
        d_maxs[i] = d_max - d_mins[i];
      }

      HyperBoundingBox alphaInterval = new HyperBoundingBox(alphaMin, alphaMax);
      ModifiableDBIDs intervalIDs = split.determineIDs(ids, alphaInterval, d_mins[i], d_maxs[i]);
      if(intervalIDs != null && intervalIDs.size() >= minPts) {
        CASHInterval rootInterval = new CASHInterval(alphaMin, alphaMax, split, intervalIDs, 0, 0, d_mins[i], d_maxs[i]);
        heap.add(new IntegerPriorityObject<CASHInterval>(rootInterval.priority(), rootInterval));
      }
    }

    if(logger.isDebuggingFiner()) {
      StringBuffer msg = new StringBuffer();
      msg.append("heap.size ").append(heap.size());
      logger.debugFiner(msg.toString());
    }
  }

  /**
   * Builds a dim-1 dimensional database where the objects are projected into
   * the specified subspace.
   *
   * @param dim the dimensionality of the database
   * @param basis the basis defining the subspace
   * @param ids the ids for the new database
   * @param relation the database storing the parameterization functions
   * @return a dim-1 dimensional database where the objects are projected into
   *         the specified subspace
   * @throws UnableToComplyException if an error according to the database
   *         occurs
   */
  private MaterializedRelation<ParameterizationFunction> buildDB(int dim, Matrix basis, DBIDs ids, Relation<ParameterizationFunction> relation) throws UnableToComplyException {
    ProxyDatabase proxy = new ProxyDatabase(ids);
    VectorFieldTypeInformation<ParameterizationFunction> type = VectorFieldTypeInformation.get(ParameterizationFunction.class, basis.getColumnDimensionality());
    MaterializedRelation<ParameterizationFunction> prep = new MaterializedRelation<ParameterizationFunction>(proxy, type, ids);
    proxy.addRelation(prep);
   
    // Project
    for(DBID id : ids) {
      ParameterizationFunction f = project(basis, relation.get(id));
      prep.set(id, f);
    }

    if(logger.isDebugging()) {
      logger.debugFine("db fuer dim " + (dim - 1) + ": " + ids.size());
    }

    return prep;
  }

  /**
   * Projects the specified parameterization function into the subspace
   * described by the given basis.
   *
   * @param basis the basis defining he subspace
   * @param f the parameterization function to be projected
   * @return the projected parameterization function
   */
  private ParameterizationFunction project(Matrix basis, ParameterizationFunction f) {
    // Matrix m = new Matrix(new
    // double[][]{f.getPointCoordinates()}).times(basis);
    Matrix m = f.getRowVector().times(basis);
    ParameterizationFunction f_t = new ParameterizationFunction(m.getColumnPackedCopy());
    return f_t;
  }

  /**
   * Determines a basis defining a subspace described by the specified alpha
   * values.
   *
   * @param alpha the alpha values
   * @return a basis defining a subspace described by the specified alpha values
   */
  private Matrix determineBasis(double[] alpha) {
    double[] nn = new double[alpha.length + 1];
    for(int i = 0; i < nn.length; i++) {
      double alpha_i = i == alpha.length ? 0 : alpha[i];
      nn[i] = sinusProduct(0, i, alpha) * StrictMath.cos(alpha_i);
    }
    Matrix n = new Matrix(nn, alpha.length + 1);
    return n.completeToOrthonormalBasis();
  }

  /**
   * Computes the product of all sinus values of the specified angles from start
   * to end index.
   *
   * @param start the index to start
   * @param end the index to end
   * @param alpha the array of angles
   * @return the product of all sinus values of the specified angles from start
   *         to end index
   */
  private double sinusProduct(int start, int end, double[] alpha) {
    double result = 1;
    for(int j = start; j < end; j++) {
      result *= StrictMath.sin(alpha[j]);
    }
    return result;
  }

  /**
   * Determines the next ''best'' interval at maximum level, i.e. the next
   * interval containing the most unprocessed objects.
   *
   * @param heap the heap storing the intervals
   * @return the next ''best'' interval at maximum level
   */
  private CASHInterval determineNextIntervalAtMaxLevel(Heap<IntegerPriorityObject<CASHInterval>> heap) {
    CASHInterval next = doDetermineNextIntervalAtMaxLevel(heap);
    // noise path was chosen
    while(next == null) {
      if(heap.isEmpty()) {
        return null;
      }
      next = doDetermineNextIntervalAtMaxLevel(heap);
    }

    return next;
  }

  /**
   * Recursive helper method to determine the next ''best'' interval at maximum
   * level, i.e. the next interval containing the most unprocessed objects
   *
   * @param heap the heap storing the intervals
   * @return the next ''best'' interval at maximum level
   */
  private CASHInterval doDetermineNextIntervalAtMaxLevel(Heap<IntegerPriorityObject<CASHInterval>> heap) {
    CASHInterval interval = heap.poll().getObject();
    int dim = interval.getDimensionality();
    while(true) {
      // max level is reached
      if(interval.getLevel() >= maxLevel && interval.getMaxSplitDimension() == dim) {
        return interval;
      }

      if(heap.size() % 10000 == 0 && logger.isVerbose()) {
        logger.verbose("heap size " + heap.size());
      }

      if(heap.size() >= 40000) {
        logger.warning("Heap size > 40.000!!!");
        heap.clear();
        return null;
      }

      if(logger.isDebuggingFiner()) {
        logger.debugFiner("split " + interval.toString() + " " + interval.getLevel() + "-" + interval.getMaxSplitDimension());
      }
      interval.split();

      // noise
      if(!interval.hasChildren()) {
        return null;
      }

      CASHInterval bestInterval;
      if(interval.getLeftChild() != null && interval.getRightChild() != null) {
        int comp = interval.getLeftChild().compareTo(interval.getRightChild());
        if(comp < 0) {
          bestInterval = interval.getRightChild();
          heap.add(new IntegerPriorityObject<CASHInterval>(interval.getLeftChild().priority(), interval.getLeftChild()));
        }
        else {
          bestInterval = interval.getLeftChild();
          heap.add(new IntegerPriorityObject<CASHInterval>(interval.getRightChild().priority(), interval.getRightChild()));
        }
      }
      else if(interval.getLeftChild() == null) {
        bestInterval = interval.getRightChild();
      }
      else {
        bestInterval = interval.getLeftChild();
      }

      interval = bestInterval;
    }
  }

  /**
   * Determines the minimum and maximum function value of all parameterization
   * functions stored in the specified database.
   *
   * @param relation the database containing the parameterization functions.
   * @param dimensionality the dimensionality of the database
   * @return an array containing the minimum and maximum function value of all
   *         parameterization functions stored in the specified database
   */
  private double[] determineMinMaxDistance(Relation<ParameterizationFunction> relation, int dimensionality) {
    double[] min = new double[dimensionality - 1];
    double[] max = new double[dimensionality - 1];
    Arrays.fill(max, Math.PI);
    HyperBoundingBox box = new HyperBoundingBox(min, max);

    double d_min = Double.POSITIVE_INFINITY;
    double d_max = Double.NEGATIVE_INFINITY;
    for(DBID id : relation.iterDBIDs()) {
      ParameterizationFunction f = relation.get(id);
      HyperBoundingBox minMax = f.determineAlphaMinMax(box);
      double f_min = f.function(SpatialUtil.getMin(minMax));
      double f_max = f.function(SpatialUtil.getMax(minMax));

      d_min = Math.min(d_min, f_min);
      d_max = Math.max(d_max, f_max);
    }
    return new double[] { d_min, d_max };
  }

  /**
   * Runs the derivator on the specified interval and assigns all points having
   * a distance less then the standard deviation of the derivator model to the
   * model to this model.
   *
   * @param relation the database containing the parameterization functions
   * @param interval the interval to build the model
   * @param dim the dimensionality of the database
   * @param ids an empty set to assign the ids
   * @return a basis of the found subspace
   * @throws UnableToComplyException if an error according to the database
   *         occurs
   * @throws ParameterException if the parameter setting is wrong
   */
  private Matrix runDerivator(Relation<ParameterizationFunction> relation, int dim, CASHInterval interval, ModifiableDBIDs ids) throws UnableToComplyException, ParameterException {
    // build database for derivator
    Database derivatorDB = buildDerivatorDB(relation, interval);

    // set the parameters
    ListParameterization parameters = new ListParameterization();
    parameters.addParameter(PCAFilteredRunner.PCA_EIGENPAIR_FILTER, FirstNEigenPairFilter.class.getName());
    parameters.addParameter(FirstNEigenPairFilter.EIGENPAIR_FILTER_N, Integer.toString(dim - 1));
    DependencyDerivator<DoubleVector, DoubleDistance> derivator = null;
    Class<DependencyDerivator<DoubleVector, DoubleDistance>> cls = ClassGenericsUtil.uglyCastIntoSubclass(DependencyDerivator.class);
    derivator = parameters.tryInstantiate(cls);

    CorrelationAnalysisSolution<DoubleVector> model = derivator.run(derivatorDB);

    Matrix weightMatrix = model.getSimilarityMatrix();
    DoubleVector centroid = new DoubleVector(model.getCentroid());
    DistanceQuery<DoubleVector, DoubleDistance> df = QueryUtil.getDistanceQuery(derivatorDB, new WeightedDistanceFunction(weightMatrix));
    DoubleDistance eps = df.getDistanceFactory().parseString("0.25");

    ids.addDBIDs(interval.getIDs());
    // Search for nearby vectors in original database
    for(DBID id : relation.iterDBIDs()) {
      DoubleVector v = new DoubleVector(relation.get(id).getColumnVector().getArrayRef());
      DoubleDistance d = df.distance(v, centroid);
      if(d.compareTo(eps) < 0) {
        ids.add(id);
      }
    }

    Matrix basis = model.getStrongEigenvectors();
    return basis.getMatrix(0, basis.getRowDimensionality() - 1, 0, dim - 2);
  }

  /**
   * Builds a database for the derivator consisting of the ids in the specified
   * interval.
   *
   * @param relation the database storing the parameterization functions
   * @param interval the interval to build the database from
   * @return a database for the derivator consisting of the ids in the specified
   *         interval
   * @throws UnableToComplyException if an error according to the database
   *         occurs
   */
  private Database buildDerivatorDB(Relation<ParameterizationFunction> relation, CASHInterval interval) throws UnableToComplyException {
    DBIDs ids = interval.getIDs();
    ProxyDatabase proxy = new ProxyDatabase(ids);
    int dim = relation.get(ids.iterator().next()).getDimensionality();
    SimpleTypeInformation<DoubleVector> type = new VectorFieldTypeInformation<DoubleVector>(DoubleVector.class, dim, new DoubleVector(new double[dim]));
    MaterializedRelation<DoubleVector> prep = new MaterializedRelation<DoubleVector>(proxy, type, ids);
    proxy.addRelation(prep);

    // Project
    for(DBID id : ids) {
      DoubleVector v = new DoubleVector(relation.get(id).getColumnVector().getArrayRef());
      prep.set(id, v);
    }

    if(logger.isDebugging()) {
      logger.debugFine("db fuer derivator : " + prep.size());
    }

    return proxy;
  }

  /**
   * Runs the derivator on the specified interval and assigns all points having
   * a distance less then the standard deviation of the derivator model to the
   * model to this model.
   *
   * @param relation the database containing the parameterization functions
   * @param ids the ids to build the model
   * @param dimensionality the dimensionality of the subspace
   * @return a basis of the found subspace
   */
  private LinearEquationSystem runDerivator(Relation<ParameterizationFunction> relation, int dimensionality, DBIDs ids) {
    try {
      // build database for derivator
      Database derivatorDB = buildDerivatorDB(relation, ids);

      ListParameterization parameters = new ListParameterization();
      parameters.addParameter(PCAFilteredRunner.PCA_EIGENPAIR_FILTER, FirstNEigenPairFilter.class.getName());
      parameters.addParameter(FirstNEigenPairFilter.EIGENPAIR_FILTER_N, Integer.toString(dimensionality));
      DependencyDerivator<DoubleVector, DoubleDistance> derivator = null;
      Class<DependencyDerivator<DoubleVector, DoubleDistance>> cls = ClassGenericsUtil.uglyCastIntoSubclass(DependencyDerivator.class);
      derivator = parameters.tryInstantiate(cls);

      CorrelationAnalysisSolution<DoubleVector> model = derivator.run(derivatorDB);
      LinearEquationSystem les = model.getNormalizedLinearEquationSystem(null);
      return les;
    }
    catch(UnableToComplyException e) {
      throw new IllegalStateException("Initialization of the database for the derivator failed: " + e);
    }
    catch(NonNumericFeaturesException e) {
      throw new IllegalStateException("Error during normalization" + e);
    }
  }

  /**
   * Builds a database for the derivator consisting of the ids in the specified
   * interval.
   *
   * @param relation the database storing the parameterization functions
   * @param ids the ids to build the database from
   * @return a database for the derivator consisting of the ids in the specified
   *         interval
   * @throws UnableToComplyException if initialization of the database is not
   *         possible
   */
  private Database buildDerivatorDB(Relation<ParameterizationFunction> relation, DBIDs ids) throws UnableToComplyException {
    ProxyDatabase proxy = new ProxyDatabase(ids);
    int dim = relation.get(ids.iterator().next()).getDimensionality();
    SimpleTypeInformation<DoubleVector> type = new VectorFieldTypeInformation<DoubleVector>(DoubleVector.class, dim, new DoubleVector(new double[dim]));
    MaterializedRelation<DoubleVector> prep = new MaterializedRelation<DoubleVector>(proxy, type, ids);
    proxy.addRelation(prep);

    // Project
    for(DBID id : ids) {
      DoubleVector v = new DoubleVector(relation.get(id).getColumnVector().getArrayRef());
      prep.set(id, v);
    }

    return proxy;
  }

  @Override
  public TypeInformation[] getInputTypeRestriction() {
    return TypeUtil.array(VectorFieldTypeInformation.get(ParameterizationFunction.class));
  }

  @Override
  protected Logging getLogger() {
    return logger;
  }

  /**
   * Parameterization class.
   *
   * @author Erich Schubert
   *
   * @apiviz.exclude
   */
  public static class Parameterizer extends AbstractParameterizer {
    protected int minpts;

    protected int maxlevel;

    protected int mindim;

    protected double jitter;

    protected boolean adjust;

    @Override
    protected void makeOptions(Parameterization config) {
      super.makeOptions(config);
      IntParameter minptsP = new IntParameter(MINPTS_ID, new GreaterConstraint(0));
      if(config.grab(minptsP)) {
        minpts = minptsP.getValue();
      }
      IntParameter maxlevelP = new IntParameter(MAXLEVEL_ID, new GreaterConstraint(0));
      if(config.grab(maxlevelP)) {
        maxlevel = maxlevelP.getValue();
      }
      IntParameter mindimP = new IntParameter(MINDIM_ID, new GreaterConstraint(0), 1);
      if(config.grab(mindimP)) {
        mindim = mindimP.getValue();
      }
      DoubleParameter jitterP = new DoubleParameter(JITTER_ID, new GreaterConstraint(0));
      if(config.grab(jitterP)) {
        jitter = jitterP.getValue();
      }
      Flag adjustF = new Flag(ADJUST_ID);
      if(config.grab(adjustF)) {
        adjust = adjustF.getValue();
      }
    }

    @Override
    protected CASH makeInstance() {
      return new CASH(minpts, maxlevel, mindim, jitter, adjust);
    }
  }
}
TOP

Related Classes of de.lmu.ifi.dbs.elki.algorithm.clustering.correlation.CASH$Parameterizer

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.