// Source Code of de.lmu.ifi.dbs.elki.algorithm.outlier.ReferenceBasedOutlierDetection$Parameterizer

package de.lmu.ifi.dbs.elki.algorithm.outlier;


/*
 This file is part of ELKI:
 Environment for Developing KDD-Applications Supported by Index-Structures


 Copyright (C) 2011
 Ludwig-Maximilians-Universität München
 Lehr- und Forschungseinheit für Datenbanksysteme
 ELKI Development Team


 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU Affero General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.


 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU Affero General Public License for more details.


 You should have received a copy of the GNU Affero General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */


import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;


import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm;
import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm;
import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore;
import de.lmu.ifi.dbs.elki.database.ids.DBID;
import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
import de.lmu.ifi.dbs.elki.database.query.DistanceResultPair;
import de.lmu.ifi.dbs.elki.database.query.GenericDistanceResultPair;
import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction;
import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.math.Mean;
import de.lmu.ifi.dbs.elki.result.ReferencePointsResult;
import de.lmu.ifi.dbs.elki.result.outlier.BasicOutlierScoreMeta;
import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult;
import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta;
import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
import de.lmu.ifi.dbs.elki.utilities.referencepoints.GridBasedReferencePoints;
import de.lmu.ifi.dbs.elki.utilities.referencepoints.ReferencePointsHeuristic;


/**
 * <p>
 * provides the Reference-Based Outlier Detection algorithm, an algorithm that
 * computes kNN distances approximately, using reference points.
 * </p>
 * <p>
 * Reference:<br>
 * Y. Pei, O. R. Zaiane, Y. Gao: An Efficient Reference-Based Approach to
 * Outlier Detection in Large Datasets.</br> In: Proc. IEEE Int. Conf. on Data
 * Mining (ICDM'06), Hong Kong, China, 2006.
 * </p>
 * 
 * @author Lisa Reichert
 * @author Erich Schubert
 * 
 * @apiviz.composedOf ReferencePointsHeuristic
 * 
 * @param <V> a type of {@link NumberVector} as a suitable data object for this
 *        algorithm
 * @param <D> the distance type processed
 */
@Title("An Efficient Reference-based Approach to Outlier Detection in Large Datasets")
@Description("Computes kNN distances approximately, using reference points with various reference point strategies.")
@Reference(authors = "Y. Pei, O.R. Zaiane, Y. Gao", title = "An Efficient Reference-based Approach to Outlier Detection in Large Datasets", booktitle = "Proc. 19th IEEE Int. Conf. on Data Engineering (ICDE '03), Bangalore, India, 2003", url = "http://dx.doi.org/10.1109/ICDM.2006.17")
public class ReferenceBasedOutlierDetection<V extends NumberVector<?, ?>, D extends NumberDistance<D, ?>> extends AbstractAlgorithm<OutlierResult> implements OutlierAlgorithm {
  /**
   * The logger for this class.
   */
  private static final Logging logger = Logging.getLogger(ReferenceBasedOutlierDetection.class);


  /**
   * Parameter for the reference points heuristic.
   */
  public static final OptionID REFP_ID = OptionID.getOrCreateOptionID("refod.refp", "The heuristic for finding reference points.");


  /**
   * Parameter to specify the number of nearest neighbors of an object, to be
   * considered for computing its REFOD_SCORE, must be an integer greater than
   * 1.
   */
  public static final OptionID K_ID = OptionID.getOrCreateOptionID("refod.k", "The number of nearest neighbors");


  /**
   * Holds the value of {@link #K_ID}.
   */
  private int k;


  /**
   * Stores the reference point strategy
   */
  private ReferencePointsHeuristic<V> refp;


  /**
   * Distance function to use.
   */
  private DistanceFunction<V, D> distanceFunction;


  /**
   * Constructor with parameters.
   * 
   * @param k k Parameter
   * @param distanceFunction distance function
   * @param refp Reference points heuristic
   */
  public ReferenceBasedOutlierDetection(int k, DistanceFunction<V, D> distanceFunction, ReferencePointsHeuristic<V> refp) {
    super();
    this.k = k;
    this.distanceFunction = distanceFunction;
    this.refp = refp;
  }


  /**
   * Run the algorithm on the given relation.
   * 
   * @param relation Relation to process
   * @return Outlier result
   */
  public OutlierResult run(Relation<V> relation) {
    DistanceQuery<V, D> distFunc = relation.getDatabase().getDistanceQuery(relation, distanceFunction);
    Collection<V> refPoints = refp.getReferencePoints(relation);


    DBIDs ids = relation.getDBIDs();
    // storage of distance/score values.
    WritableDoubleDataStore rbod_score = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_STATIC | DataStoreFactory.HINT_HOT);


    // Compute density estimation:
    {
      // compute density for one reference point, to initialize the first
      // density
      // value for each object, then update
      final Iterator<V> iter = refPoints.iterator();
      if(!iter.hasNext()) {
        throw new AbortException("Cannot compute ROS without reference points!");
      }
      V firstRef = iter.next();
      // compute distance vector for the first reference point
      List<DistanceResultPair<D>> firstReferenceDists = computeDistanceVector(firstRef, relation, distFunc);
      for(int l = 0; l < firstReferenceDists.size(); l++) {
        double density = computeDensity(firstReferenceDists, l);
        // Initial value
        rbod_score.putDouble(firstReferenceDists.get(l).getDBID(), density);
      }
      // compute density values for all remaining reference points
      while(iter.hasNext()) {
        V refPoint = iter.next();
        List<DistanceResultPair<D>> referenceDists = computeDistanceVector(refPoint, relation, distFunc);
        // compute density value for each object
        for(int l = 0; l < referenceDists.size(); l++) {
          double density = computeDensity(referenceDists, l);
          // Update minimum
          if(density < rbod_score.doubleValue(referenceDists.get(l).getDBID())) {
            rbod_score.putDouble(referenceDists.get(l).getDBID(), density);
          }
        }
      }
    }
    // compute maximum density
    double maxDensity = 0.0;
    for(DBID id : relation.iterDBIDs()) {
      double dens = rbod_score.doubleValue(id);
      if(dens > maxDensity) {
        maxDensity = dens;
      }
    }
    // compute ROS
    for(DBID id : relation.iterDBIDs()) {
      double score = 1 - (rbod_score.doubleValue(id) / maxDensity);
      rbod_score.putDouble(id, score);
    }


    // adds reference points to the result. header information for the
    // visualizer to find the reference points in the result
    ReferencePointsResult<V> refp = new ReferencePointsResult<V>("Reference points", "reference-points", refPoints);


    Relation<Double> scoreResult = new MaterializedRelation<Double>("Reference-points Outlier Scores", "reference-outlier", TypeUtil.DOUBLE, rbod_score, relation.getDBIDs());
    OutlierScoreMeta scoreMeta = new BasicOutlierScoreMeta(0.0, 1.0, 0.0, 1.0, 0.0);
    OutlierResult result = new OutlierResult(scoreMeta, scoreResult);
    result.addChildResult(refp);
    return result;
  }


  /**
   * Computes for each object the distance to one reference point. (one
   * dimensional representation of the data set)
   * 
   * @param refPoint Reference Point Feature Vector
   * @param database database to work on
   * @param distFunc Distance function to use
   * @return array containing the distance to one reference point for each
   *         database object and the object id
   */
  protected List<DistanceResultPair<D>> computeDistanceVector(V refPoint, Relation<V> database, DistanceQuery<V, D> distFunc) {
    // TODO: optimize for double distances?
    List<DistanceResultPair<D>> referenceDists = new ArrayList<DistanceResultPair<D>>(database.size());
    for(DBID id : database.iterDBIDs()) {
      final D distance = distFunc.distance(id, refPoint);
      referenceDists.add(new GenericDistanceResultPair<D>(distance, id));
    }
    Collections.sort(referenceDists);
    return referenceDists;
  }


  /**
   * Computes the density of an object. The density of an object is the
   * distances to the k nearest neighbors. Neighbors and distances are computed
   * approximately. (approximation for kNN distance: instead of a normal NN
   * search the NN of an object are those objects that have a similar distance
   * to a reference point. The k- nearest neighbors of an object are those
   * objects that lay close to the object in the reference distance vector)
   * 
   * @param referenceDists vector of the reference distances,
   * @param index index of the current object
   * @return density for one object and reference point
   */
  protected double computeDensity(List<DistanceResultPair<D>> referenceDists, int index) {
    final DistanceResultPair<D> x = referenceDists.get(index);
    final double xDist = x.getDistance().doubleValue();


    int lef = index - 1;
    int rig = index + 1;
    Mean mean = new Mean();
    double lef_d = (lef >= 0) ? referenceDists.get(lef).getDistance().doubleValue() : Double.NEGATIVE_INFINITY;
    double rig_d = (rig < referenceDists.size()) ? referenceDists.get(rig).getDistance().doubleValue() : Double.NEGATIVE_INFINITY;
    while(mean.getCount() < k) {
      if(lef >= 0 && rig < referenceDists.size()) {
        // Prefer n or m?
        if(Math.abs(lef_d - xDist) < Math.abs(rig_d - xDist)) {
          mean.put(Math.abs(lef_d - xDist));
          // Update n
          lef--;
          lef_d = (lef >= 0) ? referenceDists.get(lef).getDistance().doubleValue() : Double.NEGATIVE_INFINITY;
        }
        else {
          mean.put(Math.abs(rig_d - xDist));
          // Update right
          rig++;
          rig_d = (rig < referenceDists.size()) ? referenceDists.get(rig).getDistance().doubleValue() : Double.NEGATIVE_INFINITY;
        }
      }
      else {
        if(lef >= 0) {
          // Choose left, since right is not available.
          mean.put(Math.abs(lef_d - xDist));
          // update left
          lef--;
          lef_d = (lef >= 0) ? referenceDists.get(lef).getDistance().doubleValue() : Double.NEGATIVE_INFINITY;
        }
        else if(rig < referenceDists.size()) {
          // Choose right, since left is not available
          mean.put(Math.abs(rig_d - xDist));
          // Update right
          rig++;
          rig_d = (rig < referenceDists.size()) ? referenceDists.get(rig).getDistance().doubleValue() : Double.NEGATIVE_INFINITY;
        }
        else {
          // Not enough objects in database?
          throw new IndexOutOfBoundsException();
        }
      }
    }


    return 1.0 / mean.getMean();
  }


  @Override
  public TypeInformation[] getInputTypeRestriction() {
    return TypeUtil.array(TypeUtil.NUMBER_VECTOR_FIELD);
  }


  @Override
  protected Logging getLogger() {
    return logger;
  }


  /**
   * Parameterization class.
   * 
   * @author Erich Schubert
   * 
   * @apiviz.exclude
   */
  public static class Parameterizer<V extends NumberVector<?, ?>, D extends NumberDistance<D, ?>> extends AbstractDistanceBasedAlgorithm.Parameterizer<V, D> {
    /**
     * Holds the value of {@link #K_ID}.
     */
    private int k;


    /**
     * Stores the reference point strategy
     */
    private ReferencePointsHeuristic<V> refp;


    @Override
    protected void makeOptions(Parameterization config) {
      super.makeOptions(config);
      final IntParameter pK = new IntParameter(K_ID, new GreaterConstraint(1));
      if(config.grab(pK)) {
        k = pK.getValue();
      }
      final ObjectParameter<ReferencePointsHeuristic<V>> refpP = new ObjectParameter<ReferencePointsHeuristic<V>>(REFP_ID, ReferencePointsHeuristic.class, GridBasedReferencePoints.class);
      if(config.grab(refpP)) {
        refp = refpP.instantiateClass(config);
      }
    }


    @Override
    protected ReferenceBasedOutlierDetection<V, D> makeInstance() {
      return new ReferenceBasedOutlierDetection<V, D>(k, distanceFunction, refp);
    }
  }
}
// Source Code of de.lmu.ifi.dbs.elki.algorithm.outlier.ReferenceBasedOutlierDetection$Parameterizer

// Related Classes of de.lmu.ifi.dbs.elki.algorithm.outlier.ReferenceBasedOutlierDetection$Parameterizer