package org.apache.lucene.facet.old;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map.Entry;
import org.apache.lucene.facet.complements.TotalFacetCounts;
import org.apache.lucene.facet.complements.TotalFacetCountsCache;
import org.apache.lucene.facet.params.FacetIndexingParams;
import org.apache.lucene.facet.params.FacetSearchParams;
import org.apache.lucene.facet.partitions.IntermediateFacetResult;
import org.apache.lucene.facet.partitions.PartitionsFacetResultsHandler;
import org.apache.lucene.facet.sampling.Sampler.OverSampledFacetRequest;
import org.apache.lucene.facet.search.CategoryListIterator;
import org.apache.lucene.facet.search.CountFacetRequest;
import org.apache.lucene.facet.search.FacetArrays;
import org.apache.lucene.facet.search.FacetRequest;
import org.apache.lucene.facet.search.FacetRequest.ResultMode;
import org.apache.lucene.facet.search.FacetResult;
import org.apache.lucene.facet.search.FacetsAccumulator;
import org.apache.lucene.facet.search.FacetsAggregator;
import org.apache.lucene.facet.search.FacetsCollector.MatchingDocs;
import org.apache.lucene.facet.search.OrdinalValueResolver;
import org.apache.lucene.facet.search.OrdinalValueResolver.FloatValueResolver;
import org.apache.lucene.facet.search.OrdinalValueResolver.IntValueResolver;
import org.apache.lucene.facet.search.SumScoreFacetRequest;
import org.apache.lucene.facet.search.TaxonomyFacetsAccumulator;
import org.apache.lucene.facet.search.TopKFacetResultsHandler;
import org.apache.lucene.facet.search.TopKInEachNodeHandler;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.facet.util.PartitionsUtils;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.util.IntsRef;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* A {@link FacetsAccumulator} which supports partitions, sampling and
* complement counting.
* <p>
* <b>NOTE:</b> this accumulator still uses the old API and will be removed
* eventually in favor of dedicated accumulators which support the above
 * features over the new {@link FacetsAggregator} API. It provides
* {@link Aggregator} implementations for {@link CountFacetRequest},
* {@link SumScoreFacetRequest} and {@link OverSampledFacetRequest}. If you need
* to use it in conjunction with other facet requests, you should override
* {@link #createAggregator(FacetRequest, FacetArrays)}.
*
* @lucene.experimental
*/
public class OldFacetsAccumulator extends TaxonomyFacetsAccumulator {

  /**
   * Default threshold for using the complements optimization.
   * If accumulating facets for a document set larger than this ratio of the index size then
   * perform the complement optimization.
   * @see #setComplementThreshold(double) for more info on the complements optimization.
   */
  public static final double DEFAULT_COMPLEMENT_THRESHOLD = 0.6;

  /**
   * Passing this to {@link #setComplementThreshold(double)} will disable using complement optimization.
   */
  public static final double DISABLE_COMPLEMENT = Double.POSITIVE_INFINITY; // > 1 actually

  /**
   * Passing this to {@link #setComplementThreshold(double)} will force using complement optimization.
   */
  public static final double FORCE_COMPLEMENT = 0; // <=0

  /** Number of taxonomy ordinals covered by a single partition (derived from the indexing params). */
  protected int partitionSize;

  /** Number of partitions needed to cover the whole taxonomy: ceil(taxonomySize / partitionSize). */
  protected int maxPartitions;

  /** Whether the current accumulation uses complement counting; recomputed on every accumulate() call. */
  protected boolean isUsingComplements;

  // Index-wide total counts, fetched from TotalFacetCountsCache only when complements are in effect.
  private TotalFacetCounts totalFacetCounts;

  // Lock object guarding accumulate() against concurrent invocations; see comment inside accumulate().
  private Object accumulateGuard;

  // Ratio of matching docs to index size above which the complement optimization is applied.
  private double complementThreshold = DEFAULT_COMPLEMENT_THRESHOLD;

  /** Creates {@link FacetArrays} sized for a single taxonomy partition. */
  private static FacetArrays createFacetArrays(FacetSearchParams searchParams, TaxonomyReader taxoReader) {
    return new FacetArrays(PartitionsUtils.partitionSize(searchParams.indexingParams, taxoReader));
  }

  /**
   * Creates an accumulator which allocates its own partition-sized {@link FacetArrays}.
   */
  public OldFacetsAccumulator(FacetSearchParams searchParams, IndexReader indexReader,
      TaxonomyReader taxonomyReader) {
    this(searchParams, indexReader, taxonomyReader, null);
  }

  /**
   * Creates an accumulator over the given index and taxonomy.
   *
   * @param facetArrays arrays to aggregate into, or {@code null} to allocate
   *        partition-sized arrays internally.
   */
  public OldFacetsAccumulator(FacetSearchParams searchParams, IndexReader indexReader,
      TaxonomyReader taxonomyReader, FacetArrays facetArrays) {
    super(searchParams, indexReader, taxonomyReader, facetArrays == null ? createFacetArrays(searchParams, taxonomyReader) : facetArrays);
    // can only be computed later when docids size is known
    isUsingComplements = false;
    partitionSize = PartitionsUtils.partitionSize(searchParams.indexingParams, taxonomyReader);
    maxPartitions = (int) Math.ceil(this.taxonomyReader.getSize() / (double) partitionSize);
    accumulateGuard = new Object();
  }

  // TODO: this should be removed once we clean the API
  /**
   * Accumulates facets for the given matching documents, one taxonomy partition
   * at a time, merging partial results after each partition. If
   * {@link #shouldComplement(ScoredDocIDs)} decides it is worthwhile, counting is
   * performed over the complement document set and later corrected by the
   * cached index-wide total counts.
   *
   * @param docids the matching documents to facet over
   * @return one {@link FacetResult} per facet request (an empty result when a
   *         request produced no intermediate result)
   * @throws IOException on low-level index errors, or if an unexpected error
   *         occurred while computing total counts
   */
  public List<FacetResult> accumulate(ScoredDocIDs docids) throws IOException {

    // synchronize to prevent calling two accumulate()'s at the same time.
    // We decided not to synchronize the method because that might mislead
    // users to feel encouraged to call this method simultaneously.
    synchronized (accumulateGuard) {

      // only now we can compute this
      isUsingComplements = shouldComplement(docids);

      if (isUsingComplements) {
        try {
          totalFacetCounts = TotalFacetCountsCache.getSingleton().getTotalCounts(indexReader, taxonomyReader, searchParams.indexingParams);
          if (totalFacetCounts != null) {
            // complement counting: iterate the docs that did NOT match, and later
            // subtract from the totals (see initArraysByTotalCounts).
            docids = ScoredDocIdsUtils.getComplementSet(docids, indexReader);
          } else {
            isUsingComplements = false;
          }
        } catch (UnsupportedOperationException e) {
          // TODO (Facet): this exception is thrown from TotalCountsKey if the
          // IndexReader used does not support getVersion(). We should re-think
          // this: is this tiny detail worth disabling total counts completely
          // for such readers? Currently, it's not supported by Parallel and
          // MultiReader, which might be problematic for several applications.
          // We could, for example, base our "isCurrent" logic on something else
          // than the reader's version. Need to think more deeply about it.
          isUsingComplements = false;
        } catch (IOException e) {
          // silently fail if for some reason failed to load/save from/to dir
          isUsingComplements = false;
        } catch (Exception e) {
          // give up: this should not happen!
          throw new IOException("PANIC: Got unexpected exception while trying to get/calculate total counts", e);
        }
      }

      // let subclasses (e.g. sampling accumulators) substitute the doc set
      docids = actualDocsToAccumulate(docids);

      HashMap<FacetRequest, IntermediateFacetResult> fr2tmpRes = new HashMap<FacetRequest, IntermediateFacetResult>();

      try {
        for (int part = 0; part < maxPartitions; part++) {

          // fill arrays from category lists
          fillArraysForPartition(docids, facetArrays, part);

          int offset = part * partitionSize;

          // for each partition we go over all requests and handle
          // each, where the request maintains the merged result.
          // In this implementation merges happen after each partition,
          // but other impl could merge only at the end.
          final HashSet<FacetRequest> handledRequests = new HashSet<FacetRequest>();
          for (FacetRequest fr : searchParams.facetRequests) {
            // Handle and merge only facet requests which were not already handled.
            if (handledRequests.add(fr)) {
              PartitionsFacetResultsHandler frHndlr = createFacetResultsHandler(fr, createOrdinalValueResolver(fr));
              IntermediateFacetResult res4fr = frHndlr.fetchPartitionResult(offset);
              IntermediateFacetResult oldRes = fr2tmpRes.get(fr);
              if (oldRes != null) {
                res4fr = frHndlr.mergeResults(oldRes, res4fr);
              }
              fr2tmpRes.put(fr, res4fr);
            }
          }
        }
      } finally {
        // always release the (possibly pooled) arrays, even on failure
        facetArrays.free();
      }

      // gather results from all requests into a list for returning them
      List<FacetResult> res = new ArrayList<FacetResult>();
      for (FacetRequest fr : searchParams.facetRequests) {
        PartitionsFacetResultsHandler frHndlr = createFacetResultsHandler(fr, createOrdinalValueResolver(fr));
        IntermediateFacetResult tmpResult = fr2tmpRes.get(fr);
        if (tmpResult == null) {
          // Add empty FacetResult:
          res.add(emptyResult(taxonomyReader.getOrdinal(fr.categoryPath), fr));
          continue;
        }
        FacetResult facetRes = frHndlr.renderFacetResult(tmpResult);
        // final labeling if allowed (because labeling is a costly operation)
        frHndlr.labelResult(facetRes);
        res.add(facetRes);
      }

      return res;
    }
  }

  /** check if all requests are complementable (only counting requests can use complements) */
  protected boolean mayComplement() {
    for (FacetRequest freq : searchParams.facetRequests) {
      if (!(freq instanceof CountFacetRequest)) {
        return false;
      }
    }
    return true;
  }

  /**
   * Creates the results handler matching the request's {@link ResultMode}:
   * a per-node top-K handler for {@link ResultMode#PER_NODE_IN_TREE}, otherwise
   * a global top-K handler.
   */
  @Override
  public PartitionsFacetResultsHandler createFacetResultsHandler(FacetRequest fr, OrdinalValueResolver resolver) {
    if (fr.getResultMode() == ResultMode.PER_NODE_IN_TREE) {
      return new TopKInEachNodeHandler(taxonomyReader, fr, resolver, facetArrays);
    } else {
      return new TopKFacetResultsHandler(taxonomyReader, fr, resolver, facetArrays);
    }
  }

  /**
   * Set the actual set of documents over which accumulation should take place.
   * <p>
   * Allows to override the set of documents to accumulate for. Invoked just
   * before actual accumulating starts. From this point that set of documents
   * remains unmodified. Default implementation just returns the input
   * unchanged.
   *
   * @param docids
   *          candidate documents to accumulate for
   * @return actual documents to accumulate for
   */
  protected ScoredDocIDs actualDocsToAccumulate(ScoredDocIDs docids) throws IOException {
    return docids;
  }

  /**
   * Check if it is worth to use complements: all requests must be
   * complementable and the matching set must exceed the configured ratio of
   * the index size.
   */
  protected boolean shouldComplement(ScoredDocIDs docids) {
    return mayComplement() && (docids.size() > indexReader.numDocs() * getComplementThreshold()) ;
  }

  /**
   * Creates an {@link OrdinalValueResolver} for the given {@link FacetRequest}.
   * By default this method supports {@link CountFacetRequest} and
   * {@link SumScoreFacetRequest}. You should override if you are using other
   * requests with this accumulator.
   */
  public OrdinalValueResolver createOrdinalValueResolver(FacetRequest fr) {
    if (fr instanceof CountFacetRequest) {
      return new IntValueResolver(facetArrays);
    } else if (fr instanceof SumScoreFacetRequest) {
      return new FloatValueResolver(facetArrays);
    } else if (fr instanceof OverSampledFacetRequest) {
      // delegate to the wrapped (original) request
      return createOrdinalValueResolver(((OverSampledFacetRequest) fr).orig);
    } else {
      throw new IllegalArgumentException("unrecognized FacetRequest " + fr.getClass());
    }
  }

  /**
   * Iterate over the documents for this partition and fill the facet arrays with the correct
   * count/complement count/value.
   */
  private final void fillArraysForPartition(ScoredDocIDs docids, FacetArrays facetArrays, int partition)
      throws IOException {

    if (isUsingComplements) {
      // seed the arrays with the index-wide totals; aggregators then decrement
      // for the (complement) docs being iterated
      initArraysByTotalCounts(facetArrays, partition, docids.size());
    } else {
      facetArrays.free(); // to get a cleared array for this partition
    }

    HashMap<CategoryListIterator, Aggregator> categoryLists = getCategoryListMap(facetArrays, partition);

    IntsRef ordinals = new IntsRef(32); // a reasonable start capacity for most common apps
    for (Entry<CategoryListIterator, Aggregator> entry : categoryLists.entrySet()) {
      final ScoredDocIDsIterator iterator = docids.iterator();
      final CategoryListIterator categoryListIter = entry.getKey();
      final Aggregator aggregator = entry.getValue();
      Iterator<AtomicReaderContext> contexts = indexReader.leaves().iterator();
      AtomicReaderContext current = null;
      int maxDoc = -1;
      // docids iterate in increasing global docID order, so segments are advanced
      // forward-only and never revisited
      while (iterator.next()) {
        int docID = iterator.getDocID();
        if (docID >= maxDoc) {
          boolean iteratorDone = false;
          do { // find the segment which contains this document
            if (!contexts.hasNext()) {
              throw new RuntimeException("ScoredDocIDs contains documents outside this reader's segments !?");
            }
            current = contexts.next();
            maxDoc = current.docBase + current.reader().maxDoc();
            if (docID < maxDoc) { // segment has docs, check if it has categories
              boolean validSegment = categoryListIter.setNextReader(current);
              validSegment &= aggregator.setNextReader(current);
              if (!validSegment) { // if categoryList or aggregtor say it's an invalid segment, skip all docs
                while (docID < maxDoc && iterator.next()) {
                  docID = iterator.getDocID();
                }
                // if we exited the skip loop with docID still inside this segment,
                // the docids iterator itself was exhausted
                if (docID < maxDoc) {
                  iteratorDone = true;
                }
              }
            }
          } while (docID >= maxDoc);
          if (iteratorDone) { // iterator finished, terminate the loop
            break;
          }
        }
        // translate global docID to the segment-local docID expected below
        docID -= current.docBase;
        categoryListIter.getOrdinals(docID, ordinals);
        if (ordinals.length == 0) {
          continue; // document does not have category ordinals
        }
        aggregator.aggregate(docID, iterator.getScore(), ordinals);
      }
    }
  }

  /**
   * Init arrays for partition by total counts, optionally applying a factor.
   *
   * @param facetArrays arrays whose int array is filled with the totals
   * @param partition the partition whose totals are loaded
   * @param nAccumulatedDocs number of docs about to be (complement) accumulated;
   *        used to translate counts so sampled decrements cannot drive them negative
   */
  private final void initArraysByTotalCounts(FacetArrays facetArrays, int partition, int nAccumulatedDocs) {
    int[] intArray = facetArrays.getIntArray();
    totalFacetCounts.fillTotalCountsForPartition(intArray, partition);
    double totalCountsFactor = getTotalCountsFactor();
    // fix total counts, but only if the effect of this would be meaningful.
    if (totalCountsFactor < 0.99999) {
      int delta = nAccumulatedDocs + 1;
      for (int i = 0; i < intArray.length; i++) {
        intArray[i] *= totalCountsFactor;
        // also translate to prevent loss of non-positive values
        // due to complement sampling (ie if sampled docs all decremented a certain category).
        intArray[i] += delta;
      }
    }
  }

  /**
   * Expert: factor by which counts should be multiplied when initializing
   * the count arrays from total counts.
   * Default implementation for this returns 1, which is a no op.
   * @return a factor by which total counts should be multiplied
   */
  protected double getTotalCountsFactor() {
    return 1;
  }

  /**
   * Creates the {@link Aggregator} for the given request: a (complement-)counting
   * aggregator for {@link CountFacetRequest}, a scoring aggregator for
   * {@link SumScoreFacetRequest} (complements unsupported), and delegation to the
   * wrapped request for {@link OverSampledFacetRequest}.
   *
   * @throws IllegalArgumentException if the request type is not supported, or if
   *         complements are enabled for a {@link SumScoreFacetRequest}
   */
  protected Aggregator createAggregator(FacetRequest fr, FacetArrays facetArrays) {
    if (fr instanceof CountFacetRequest) {
      // we rely on that, if needed, result is cleared by arrays!
      int[] a = facetArrays.getIntArray();
      if (isUsingComplements) {
        return new ComplementCountingAggregator(a);
      } else {
        return new CountingAggregator(a);
      }
    } else if (fr instanceof SumScoreFacetRequest) {
      if (isUsingComplements) {
        throw new IllegalArgumentException("complements are not supported by SumScoreFacetRequest");
      } else {
        return new ScoringAggregator(facetArrays.getFloatArray());
      }
    } else if (fr instanceof OverSampledFacetRequest) {
      return createAggregator(((OverSampledFacetRequest) fr).orig, facetArrays);
    } else {
      throw new IllegalArgumentException("unknown Aggregator implementation for request " + fr.getClass());
    }
  }

  /**
   * Create an {@link Aggregator} and a {@link CategoryListIterator} for each
   * and every {@link FacetRequest}. Generating a map, matching each
   * categoryListIterator to its matching aggregator.
   * <p>
   * If two CategoryListIterators are served by the same aggregator, a single
   * aggregator is returned for both.
   *
   * <b>NOTE: </b>If a given category list iterator is needed with two different
   * aggregators (e.g counting and association) - an exception is thrown as this
   * functionality is not supported at this time.
   */
  protected HashMap<CategoryListIterator, Aggregator> getCategoryListMap(FacetArrays facetArrays,
      int partition) throws IOException {

    HashMap<CategoryListIterator, Aggregator> categoryLists = new HashMap<CategoryListIterator, Aggregator>();

    FacetIndexingParams indexingParams = searchParams.indexingParams;
    for (FacetRequest facetRequest : searchParams.facetRequests) {
      Aggregator categoryAggregator = createAggregator(facetRequest, facetArrays);

      CategoryListIterator cli = indexingParams.getCategoryListParams(facetRequest.categoryPath).createCategoryListIterator(partition);

      // get the aggregator
      Aggregator old = categoryLists.put(cli, categoryAggregator);

      if (old != null && !old.equals(categoryAggregator)) {
        // comparison relies on the aggregators' equals(); equal aggregators may share one entry
        throw new RuntimeException("Overriding existing category list with different aggregator");
      }
      // if the aggregator is the same we're covered
    }

    return categoryLists;
  }

  /** Adapts the new-API {@link MatchingDocs} input to the old {@link ScoredDocIDs} entry point. */
  @Override
  public List<FacetResult> accumulate(List<MatchingDocs> matchingDocs) throws IOException {
    return accumulate(new MatchingDocsAsScoredDocIDs(matchingDocs));
  }

  /**
   * Returns the complement threshold.
   * @see #setComplementThreshold(double)
   */
  public double getComplementThreshold() {
    return complementThreshold;
  }

  /**
   * Set the complement threshold.
   * This threshold will dictate whether the complements optimization is applied.
   * The optimization is to count for less documents. It is useful when the same
   * FacetSearchParams are used for varying sets of documents. The first time
   * complements is used the "total counts" are computed - counting for all the
   * documents in the collection. Then, only the complementing set of documents
   * is considered, and used to decrement from the overall counts, thereby
   * walking through less documents, which is faster.
   * <p>
   * For the default settings see {@link #DEFAULT_COMPLEMENT_THRESHOLD}.
   * <p>
   * To force complements in all cases pass {@link #FORCE_COMPLEMENT}.
   * This is mostly useful for testing purposes, as forcing complements when only
   * tiny fraction of available documents match the query does not make sense and
   * would incur performance degradations.
   * <p>
   * To disable complements pass {@link #DISABLE_COMPLEMENT}.
   * @param complementThreshold the complement threshold to set
   * @see #getComplementThreshold()
   */
  public void setComplementThreshold(double complementThreshold) {
    this.complementThreshold = complementThreshold;
  }

  /** Returns true if complements are enabled. */
  public boolean isUsingComplements() {
    return isUsingComplements;
  }

}