package de.lmu.ifi.dbs.elki.algorithm.outlier;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
Copyright (C) 2011
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
import java.util.List;
import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm;
import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm;
import de.lmu.ifi.dbs.elki.data.type.CombinedTypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.database.QueryUtil;
import de.lmu.ifi.dbs.elki.database.datastore.DataStore;
import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore;
import de.lmu.ifi.dbs.elki.database.ids.DBID;
import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
import de.lmu.ifi.dbs.elki.database.query.DatabaseQuery;
import de.lmu.ifi.dbs.elki.database.query.DistanceResultPair;
import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
import de.lmu.ifi.dbs.elki.database.query.knn.KNNQuery;
import de.lmu.ifi.dbs.elki.database.query.knn.PreprocessorKNNQuery;
import de.lmu.ifi.dbs.elki.database.query.rknn.RKNNQuery;
import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction;
import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance;
import de.lmu.ifi.dbs.elki.index.preprocessed.knn.MaterializeKNNPreprocessor;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;
import de.lmu.ifi.dbs.elki.logging.progress.StepProgress;
import de.lmu.ifi.dbs.elki.math.DoubleMinMax;
import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult;
import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta;
import de.lmu.ifi.dbs.elki.result.outlier.QuotientOutlierScoreMeta;
import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
import de.lmu.ifi.dbs.elki.utilities.pairs.Pair;
/**
* <p>
* Algorithm to compute density-based local outlier factors in a database based
* on a specified parameter {@link #K_ID} ({@code -lof.k}).
* </p>
*
* <p>
* This implementation deviates from the original LOF publication in that it
* allows the user to use a different distance function for the reachability
* distance and neighborhood determination (although the default is to use the
* same distance function for both).
* </p>
*
* <p>
* The k nearest neighbors are determined using the parameter
* {@link de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm#DISTANCE_FUNCTION_ID}
* , while the reference set used in reachability distance computation is
* configured using {@link #REACHABILITY_DISTANCE_FUNCTION_ID}.
* </p>
*
* <p>
* The original LOF parameter was called "minPts". Since kNN queries
* in ELKI have slightly different semantics - exactly k neighbors are returned
* - we chose to rename the parameter to {@link #K_ID} ({@code -lof.k}) to
* reflect this difference.
* </p>
*
* <p>
* Reference: <br>
* M. M. Breunig, H.-P. Kriegel, R. Ng, J. Sander: LOF: Identifying
* Density-Based Local Outliers. <br>
* In: Proc. 2nd ACM SIGMOD Int. Conf. on Management of Data (SIGMOD'00),
* Dallas, TX, 2000.
* </p>
*
* @author Peer Kröger
* @author Erich Schubert
* @author Elke Achtert
*
* @apiviz.has LOFResult oneway - - computes
* @apiviz.has KNNQuery
*
* @param <O> the type of DatabaseObjects handled by this Algorithm
* @param <D> Distance type
*/
@Title("LOF: Local Outlier Factor")
@Description("Algorithm to compute density-based local outlier factors in a database based on the neighborhood size parameter 'k'")
@Reference(authors = "M. M. Breunig, H.-P. Kriegel, R. Ng, and J. Sander", title = "LOF: Identifying Density-Based Local Outliers", booktitle = "Proc. 2nd ACM SIGMOD Int. Conf. on Management of Data (SIGMOD '00), Dallas, TX, 2000", url = "http://dx.doi.org/10.1145/342009.335388")
public class LOF<O, D extends NumberDistance<D, ?>> extends AbstractAlgorithm<OutlierResult> implements OutlierAlgorithm {
/**
 * The logger for this class.
 */
private static final Logging logger = Logging.getLogger(LOF.class);

/**
 * Option ID of the distance function used to determine the reachability
 * distance between database objects.
 */
public static final OptionID REACHABILITY_DISTANCE_FUNCTION_ID = OptionID.getOrCreateOptionID("lof.reachdistfunction", "Distance function to determine the reachability distance between database objects.");

/**
 * Parameter to specify the number of nearest neighbors of an object to be
 * considered for computing its LOF_SCORE, must be an integer greater than 1.
 */
public static final OptionID K_ID = OptionID.getOrCreateOptionID("lof.k", "The number of nearest neighbors of an object to be considered for computing its LOF_SCORE.");

/**
 * The neighborhood size handed to the kNN queries.
 *
 * Note that this is not necessarily the raw value of {@link #K_ID}: the
 * constructor stores the given k plus one whenever {@link #objectIsInKNN} is
 * false.
 */
protected int k = 2;

/**
 * Neighborhood distance function.
 */
protected DistanceFunction<? super O, D> neighborhoodDistanceFunction;

/**
 * Reachability distance function.
 */
protected DistanceFunction<? super O, D> reachabilityDistanceFunction;

/**
 * Include object itself in kNN neighborhood.
 *
 * In the official LOF publication, the point itself is not considered to be
 * part of its k nearest neighbors.
 *
 * This flag is never reassigned; it is declared final so that the value read
 * by the constructor and the value read by the LRD/LOF computations cannot
 * diverge.
 */
private static final boolean objectIsInKNN = false;
/**
* Constructor.
*
* @param k the value of k
* @param neighborhoodDistanceFunction the neighborhood distance function
* @param reachabilityDistanceFunction the reachability distance function
*/
public LOF(int k, DistanceFunction<? super O, D> neighborhoodDistanceFunction, DistanceFunction<? super O, D> reachabilityDistanceFunction) {
super();
this.k = k + (objectIsInKNN ? 0 : 1);
this.neighborhoodDistanceFunction = neighborhoodDistanceFunction;
this.reachabilityDistanceFunction = reachabilityDistanceFunction;
}
/**
 * Runs the generalized LOF algorithm on the given relation: obtains the two
 * kNN queries via {@link #getKNNQueries}, delegates the computation to
 * {@link #doRunInTime} and returns the outlier result it produced.
 *
 * A three-step progress logger is created only when the class logger is
 * verbose; otherwise {@code null} is passed on.
 *
 * @param relation Data to process
 * @return the outlier result of the LOF computation
 */
public OutlierResult run(Relation<O> relation) {
  final StepProgress stepprog;
  if(logger.isVerbose()) {
    stepprog = new StepProgress("LOF", 3);
  }
  else {
    stepprog = null;
  }
  final Pair<KNNQuery<O, D>, KNNQuery<O, D>> queries = getKNNQueries(relation, stepprog);
  // first = reference neighborhood query, second = reachability query
  final LOFResult<O, D> lofResult = doRunInTime(queries.getFirst(), queries.getSecond(), stepprog);
  return lofResult.getResult();
}
/**
 * Obtains the kNN queries needed by the algorithm.
 *
 * The reachability query is requested first. If the query returned is not a
 * {@link PreprocessorKNNQuery} (this includes {@code null}), a
 * {@link MaterializeKNNPreprocessor} is created, registered as index with the
 * database of the relation, and used to provide the query instead. The
 * reference query re-uses the reachability query when both distance functions
 * are the same instance or equal; otherwise a plain query is requested.
 *
 * @param relation the data
 * @param stepprog the progress logger, may be {@code null}
 * @return pair of (reference neighborhood query, reachability query)
 */
private Pair<KNNQuery<O, D>, KNNQuery<O, D>> getKNNQueries(Relation<O> relation, StepProgress stepprog) {
  // "HEAVY" flag for the reachability query since it is used more than once
  KNNQuery<O, D> reachQuery = QueryUtil.getKNNQuery(relation, reachabilityDistanceFunction, k, DatabaseQuery.HINT_HEAVY_USE, DatabaseQuery.HINT_OPTIMIZED_ONLY, DatabaseQuery.HINT_NO_CACHE);
  final boolean alreadyMaterialized = reachQuery instanceof PreprocessorKNNQuery;
  if(!alreadyMaterialized) {
    // No preprocessor-backed kNN query available - materialize one.
    if(stepprog != null) {
      final String message;
      if(neighborhoodDistanceFunction.equals(reachabilityDistanceFunction)) {
        message = "Materializing neighborhoods w.r.t. reference neighborhood distance function.";
      }
      else {
        message = "Not materializing neighborhoods w.r.t. reference neighborhood distance function, but materializing neighborhoods w.r.t. reachability distance function.";
      }
      stepprog.beginStep(1, message, logger);
    }
    final MaterializeKNNPreprocessor<O, D> materializer = new MaterializeKNNPreprocessor<O, D>(relation, reachabilityDistanceFunction, k);
    relation.getDatabase().addIndex(materializer);
    final DistanceQuery<O, D> reachDistQuery = relation.getDatabase().getDistanceQuery(relation, reachabilityDistanceFunction);
    reachQuery = materializer.getKNNQuery(reachDistQuery, k);
  }

  // The reference query (knnRefer) is only used once.
  final KNNQuery<O, D> referQuery;
  if(neighborhoodDistanceFunction != reachabilityDistanceFunction && !neighborhoodDistanceFunction.equals(reachabilityDistanceFunction)) {
    // Different distance: do not materialize this neighborhood, since it is
    // used only once.
    referQuery = QueryUtil.getKNNQuery(relation, neighborhoodDistanceFunction, k);
  }
  else {
    referQuery = reachQuery;
  }
  return new Pair<KNNQuery<O, D>, KNNQuery<O, D>>(referQuery, reachQuery);
}
/**
 * Performs the Generalized LOF_SCORE algorithm using the given kNN queries
 * and returns a {@link LOF.LOFResult} encapsulating information that may be
 * needed by an OnlineLOF algorithm.
 *
 * The LRDs are computed over the ids of the reachability query's relation,
 * the LOFs over the ids of the reference query's relation.
 *
 * @param kNNRefer the kNN query w.r.t. reference neighborhood distance
 *        function
 * @param kNNReach the kNN query w.r.t. reachability distance function
 * @param stepprog the progress logger, may be {@code null}
 * @return the outlier result together with both queries, the LRDs and the
 *         LOFs
 * @throws AbortException if either of the two queries is {@code null}
 */
protected LOFResult<O, D> doRunInTime(KNNQuery<O, D> kNNRefer, KNNQuery<O, D> kNNReach, StepProgress stepprog) throws IllegalStateException {
  // Both queries are mandatory.
  if(kNNRefer == null) {
    throw new AbortException("No kNN queries supported by database for reference neighborhood distance function.");
  }
  if(kNNReach == null) {
    throw new AbortException("No kNN queries supported by database for reachability distance function.");
  }
  final boolean reportSteps = (stepprog != null);

  // Step 2: local reachability densities
  if(reportSteps) {
    stepprog.beginStep(2, "Computing LRDs.", logger);
  }
  final WritableDataStore<Double> densities = computeLRDs(kNNReach.getRelation().getDBIDs(), kNNReach);

  // Step 3: LOF score of each database object
  if(reportSteps) {
    stepprog.beginStep(3, "Computing LOFs.", logger);
  }
  final Pair<WritableDataStore<Double>, DoubleMinMax> scored = computeLOFs(kNNRefer.getRelation().getDBIDs(), densities, kNNRefer);
  if(reportSteps) {
    stepprog.setCompleted(logger);
  }
  final WritableDataStore<Double> scores = scored.getFirst();
  // observed minimum and maximum, used for normalization
  final DoubleMinMax observed = scored.getSecond();

  // Build result representation.
  final Relation<Double> scoreRelation = new MaterializedRelation<Double>("Local Outlier Factor", "lof-outlier", TypeUtil.DOUBLE, scores, kNNRefer.getRelation().getDBIDs());
  final OutlierScoreMeta meta = new QuotientOutlierScoreMeta(observed.getMin(), observed.getMax(), 0.0, Double.POSITIVE_INFINITY, 1.0);
  final OutlierResult outlierResult = new OutlierResult(meta, scoreRelation);
  return new LOFResult<O, D>(outlierResult, kNNRefer, kNNReach, densities, scores);
}
/**
 * Computes the local reachability density (LRD) of the specified objects.
 *
 * For every object, the reachability distances to its neighbors are summed
 * up, where the reachability distance to a neighbor is the maximum of the
 * distance to that neighbor and the distance of the last entry in the
 * neighbor's own kNN list. The LRD is the number of neighbors that
 * contributed to the sum divided by that sum, or 0.0 if the sum is not
 * positive.
 *
 * The divisor is counted while iterating instead of being derived from the
 * list size, so the result stays correct even if a kNN result does not
 * contain the query object itself.
 *
 * @param ids the ids of the objects
 * @param knnReach the precomputed neighborhood of the objects w.r.t. the
 *        reachability distance
 * @return the LRDs of the objects
 */
protected WritableDataStore<Double> computeLRDs(DBIDs ids, KNNQuery<O, D> knnReach) {
  WritableDataStore<Double> lrds = DataStoreUtil.makeStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, Double.class);
  FiniteProgress lrdsProgress = logger.isVerbose() ? new FiniteProgress("LRD", ids.size(), logger) : null;
  for(DBID id : ids) {
    double sum = 0;
    // number of neighbors that actually contributed to the sum
    int count = 0;
    List<DistanceResultPair<D>> neighbors = knnReach.getKNNForDBID(id, k);
    for(DistanceResultPair<D> neighbor : neighbors) {
      // skip the query object itself unless it counts as its own neighbor
      if(objectIsInKNN || !neighbor.getDBID().equals(id)) {
        List<DistanceResultPair<D>> neighborsNeighbors = knnReach.getKNNForDBID(neighbor.getDBID(), k);
        sum += Math.max(neighbor.getDistance().doubleValue(), neighborsNeighbors.get(neighborsNeighbors.size() - 1).getDistance().doubleValue());
        count++;
      }
    }
    // Avoid division by 0
    Double lrd = (sum > 0) ? count / sum : 0.0;
    lrds.put(id, lrd);
    if(lrdsProgress != null) {
      lrdsProgress.incrementProcessed(logger);
    }
  }
  if(lrdsProgress != null) {
    lrdsProgress.ensureCompleted(logger);
  }
  return lrds;
}
/**
 * Computes the Local outlier factor (LOF) of the specified objects.
 *
 * For an object with positive LRD, the LOF is the average LRD of its
 * neighbors divided by its own LRD. Objects whose LRD is not positive, and
 * objects for which no neighbor contributed, are assigned a LOF of 1.0.
 *
 * The number of neighbors is counted while iterating instead of being
 * derived from the list size, so the average stays correct even if a kNN
 * result does not contain the query object itself, and an empty neighborhood
 * can no longer produce a NaN score.
 *
 * @param ids the ids of the objects
 * @param lrds the LRDs of the objects
 * @param knnRefer the precomputed neighborhood of the objects w.r.t. the
 *        reference distance
 * @return the LOFs of the objects and the observed minimum/maximum LOF
 */
protected Pair<WritableDataStore<Double>, DoubleMinMax> computeLOFs(DBIDs ids, DataStore<Double> lrds, KNNQuery<O, D> knnRefer) {
  WritableDataStore<Double> lofs = DataStoreUtil.makeStorage(ids, DataStoreFactory.HINT_STATIC, Double.class);
  // track the minimum and maximum value for normalization.
  DoubleMinMax lofminmax = new DoubleMinMax();

  FiniteProgress progressLOFs = logger.isVerbose() ? new FiniteProgress("LOF_SCORE for objects", ids.size(), logger) : null;
  for(DBID id : ids) {
    double lrdp = lrds.get(id);
    final Double lof;
    if(lrdp > 0) {
      List<DistanceResultPair<D>> neighbors = knnRefer.getKNNForDBID(id, k);
      double sum = 0;
      // number of neighbors that actually contributed to the sum
      int count = 0;
      for(DistanceResultPair<D> neighbor : neighbors) {
        // skip the query object itself unless it counts as its own neighbor
        if(objectIsInKNN || !neighbor.getDBID().equals(id)) {
          sum += lrds.get(neighbor.getDBID());
          count++;
        }
      }
      // Without any contributing neighbor the quotient is undefined; use the
      // same neutral score as for objects without a positive LRD.
      lof = (count > 0) ? (sum / count) / lrdp : 1.0;
    }
    else {
      lof = 1.0;
    }
    lofs.put(id, lof);
    // update minimum and maximum
    lofminmax.put(lof);

    if(progressLOFs != null) {
      progressLOFs.incrementProcessed(logger);
    }
  }
  if(progressLOFs != null) {
    progressLOFs.ensureCompleted(logger);
  }
  return new Pair<WritableDataStore<Double>, DoubleMinMax>(lofs, lofminmax);
}
/**
 * Returns the input type restriction of this algorithm as a one-element
 * array: the restriction of the reachability distance function if both
 * distance functions are equal, otherwise the combination of the
 * restrictions of the neighborhood and the reachability distance function.
 *
 * @return the input type restriction
 */
@Override
public TypeInformation[] getInputTypeRestriction() {
  if(!reachabilityDistanceFunction.equals(neighborhoodDistanceFunction)) {
    // Two different distance functions: the data must satisfy both.
    final TypeInformation combined = new CombinedTypeInformation(neighborhoodDistanceFunction.getInputTypeRestriction(), reachabilityDistanceFunction.getInputTypeRestriction());
    return TypeUtil.array(combined);
  }
  final TypeInformation single = reachabilityDistanceFunction.getInputTypeRestriction();
  return TypeUtil.array(single);
}
/**
 * Returns the static class logger of {@code LOF}.
 *
 * @return the logger of this class
 */
@Override
protected Logging getLogger() {
return logger;
}
/**
 * Encapsulates information like the neighborhood, the LRD and LOF values of
 * the objects during a run of the {@link LOF} algorithm.
 *
 * The outlier result, the two kNN queries and the two data stores are fixed
 * at construction time. The two RkNN queries are not set by the constructor
 * and remain {@code null} until assigned via their setters.
 *
 * @param <O> the object type of the queries
 * @param <D> the distance type of the queries
 */
public static class LOFResult<O, D extends NumberDistance<D, ?>> {
  /**
   * The result of the run of the {@link LOF} algorithm.
   */
  private final OutlierResult result;

  /**
   * The kNN query w.r.t. the reference neighborhood distance.
   */
  private final KNNQuery<O, D> kNNRefer;

  /**
   * The kNN query w.r.t. the reachability distance.
   */
  private final KNNQuery<O, D> kNNReach;

  /**
   * The RkNN query w.r.t. the reference neighborhood distance; {@code null}
   * until set.
   */
  private RKNNQuery<O, D> rkNNRefer;

  /**
   * The RkNN query w.r.t. the reachability distance; {@code null} until set.
   */
  private RKNNQuery<O, D> rkNNReach;

  /**
   * The LRD values of the objects.
   */
  private final WritableDataStore<Double> lrds;

  /**
   * The LOF values of the objects.
   */
  private final WritableDataStore<Double> lofs;

  /**
   * Encapsulates information generated during a run of the {@link LOF}
   * algorithm.
   *
   * @param result the result of the run of the {@link LOF} algorithm
   * @param kNNRefer the kNN query w.r.t. the reference neighborhood distance
   * @param kNNReach the kNN query w.r.t. the reachability distance
   * @param lrds the LRD values of the objects
   * @param lofs the LOF values of the objects
   */
  public LOFResult(OutlierResult result, KNNQuery<O, D> kNNRefer, KNNQuery<O, D> kNNReach, WritableDataStore<Double> lrds, WritableDataStore<Double> lofs) {
    this.result = result;
    this.kNNRefer = kNNRefer;
    this.kNNReach = kNNReach;
    this.lrds = lrds;
    this.lofs = lofs;
  }

  /**
   * @return the kNN query w.r.t. the reference neighborhood distance
   */
  public KNNQuery<O, D> getKNNRefer() {
    return kNNRefer;
  }

  /**
   * @return the kNN query w.r.t. the reachability distance
   */
  public KNNQuery<O, D> getKNNReach() {
    return kNNReach;
  }

  /**
   * @return the LRD values of the objects
   */
  public WritableDataStore<Double> getLrds() {
    return lrds;
  }

  /**
   * @return the LOF values of the objects
   */
  public WritableDataStore<Double> getLofs() {
    return lofs;
  }

  /**
   * @return the result of the run of the {@link LOF} algorithm
   */
  public OutlierResult getResult() {
    return result;
  }

  /**
   * Sets the RkNN query w.r.t. the reference neighborhood distance.
   *
   * @param rkNNRefer the query to set
   */
  public void setRkNNRefer(RKNNQuery<O, D> rkNNRefer) {
    this.rkNNRefer = rkNNRefer;
  }

  /**
   * @return the RkNN query w.r.t. the reference neighborhood distance, or
   *         {@code null} if none has been set
   */
  public RKNNQuery<O, D> getRkNNRefer() {
    return rkNNRefer;
  }

  /**
   * @return the RkNN query w.r.t. the reachability distance, or {@code null}
   *         if none has been set
   */
  public RKNNQuery<O, D> getRkNNReach() {
    return rkNNReach;
  }

  /**
   * Sets the RkNN query w.r.t. the reachability distance.
   *
   * @param rkNNReach the query to set
   */
  public void setRkNNReach(RKNNQuery<O, D> rkNNReach) {
    this.rkNNReach = rkNNReach;
  }
}
/**
 * Parameterization class: reads the neighborhood size and the (optional)
 * reachability distance function and builds the {@link LOF} instance.
 *
 * @author Erich Schubert
 *
 * @apiviz.exclude
 */
public static class Parameterizer<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBasedAlgorithm.Parameterizer<O, D> {
  /**
   * The neighborhood size to use
   */
  protected int k = 2;

  /**
   * Neighborhood distance function. Not assigned by this class; the
   * inherited {@code distanceFunction} is what gets passed to the algorithm
   * as neighborhood distance.
   */
  protected DistanceFunction<O, D> neighborhoodDistanceFunction = null;

  /**
   * Reachability distance function; stays {@code null} when the
   * corresponding parameter was not grabbed.
   */
  protected DistanceFunction<O, D> reachabilityDistanceFunction = null;

  @Override
  protected void makeOptions(Parameterization config) {
    super.makeOptions(config);

    // neighborhood size, constrained to be greater than 1
    final IntParameter kParam = new IntParameter(K_ID, new GreaterConstraint(1));
    if(config.grab(kParam)) {
      k = kParam.getValue();
    }

    // separate distance function for the reachability distance
    final ObjectParameter<DistanceFunction<O, D>> reachParam = new ObjectParameter<DistanceFunction<O, D>>(REACHABILITY_DISTANCE_FUNCTION_ID, DistanceFunction.class, true);
    if(config.grab(reachParam)) {
      reachabilityDistanceFunction = reachParam.instantiateClass(config);
    }
  }

  @Override
  protected LOF<O, D> makeInstance() {
    if(reachabilityDistanceFunction == null) {
      // Default is to re-use the same distance for both purposes
      return new LOF<O, D>(k, distanceFunction, distanceFunction);
    }
    return new LOF<O, D>(k, distanceFunction, reachabilityDistanceFunction);
  }
}
}