package org.apache.lucene.facet.sortedset;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import org.apache.lucene.facet.params.CategoryListParams;
import org.apache.lucene.facet.params.FacetSearchParams;
import org.apache.lucene.facet.search.CountFacetRequest;
import org.apache.lucene.facet.search.FacetArrays;
import org.apache.lucene.facet.search.FacetRequest;
import org.apache.lucene.facet.search.FacetResult;
import org.apache.lucene.facet.search.FacetResultNode;
import org.apache.lucene.facet.search.FacetsAccumulator;
import org.apache.lucene.facet.search.FacetsAggregator;
import org.apache.lucene.facet.search.FacetsCollector.MatchingDocs;
import org.apache.lucene.facet.taxonomy.CategoryPath;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiDocValues.MultiSortedSetDocValues;
import org.apache.lucene.index.MultiDocValues;
import org.apache.lucene.index.ReaderUtil;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.PriorityQueue;
/** A {@link FacetsAccumulator} that uses previously
* indexed {@link SortedSetDocValuesFacetFields} to perform faceting,
* without require a separate taxonomy index. Faceting is
* a bit slower (~25%), and there is added cost on every
* {@link IndexReader} open to create a new {@link
* SortedSetDocValuesReaderState}. Furthermore, this does
* not support hierarchical facets; only flat (dimension +
* label) facets, but it uses quite a bit less RAM to do so. */
public class SortedSetDocValuesAccumulator extends FacetsAccumulator {
final SortedSetDocValuesReaderState state;
final SortedSetDocValues dv;
final String field;
public SortedSetDocValuesAccumulator(FacetSearchParams fsp, SortedSetDocValuesReaderState state) throws IOException {
super(fsp, null, null, new FacetArrays(state.getSize()));
this.state = state;
this.field = state.getField();
dv = state.getDocValues();
// Check params:
for(FacetRequest request : fsp.facetRequests) {
if (!(request instanceof CountFacetRequest)) {
throw new IllegalArgumentException("this collector only supports CountFacetRequest; got " + request);
}
if (request.categoryPath.length != 1) {
throw new IllegalArgumentException("this collector only supports depth 1 CategoryPath; got " + request.categoryPath);
}
if (request.getDepth() != 1) {
throw new IllegalArgumentException("this collector only supports depth=1; got " + request.getDepth());
}
String dim = request.categoryPath.components[0];
SortedSetDocValuesReaderState.OrdRange ordRange = state.getOrdRange(dim);
if (ordRange == null) {
throw new IllegalArgumentException("dim \"" + dim + "\" does not exist");
}
}
}
@Override
public FacetsAggregator getAggregator() {
return new FacetsAggregator() {
@Override
public void aggregate(MatchingDocs matchingDocs, CategoryListParams clp, FacetArrays facetArrays) throws IOException {
AtomicReader reader = matchingDocs.context.reader();
// LUCENE-5090: make sure the provided reader context "matches"
// the top-level reader passed to the
// SortedSetDocValuesReaderState, else cryptic
// AIOOBE can happen:
if (ReaderUtil.getTopLevelContext(matchingDocs.context).reader() != state.origReader) {
throw new IllegalStateException("the SortedSetDocValuesReaderState provided to this class does not match the reader being searched; you must create a new SortedSetDocValuesReaderState every time you open a new IndexReader");
}
SortedSetDocValues segValues = reader.getSortedSetDocValues(field);
if (segValues == null) {
return;
}
final int[] counts = facetArrays.getIntArray();
final int maxDoc = reader.maxDoc();
assert maxDoc == matchingDocs.bits.length();
if (dv instanceof MultiSortedSetDocValues) {
MultiDocValues.OrdinalMap ordinalMap = ((MultiSortedSetDocValues) dv).mapping;
int segOrd = matchingDocs.context.ord;
int numSegOrds = (int) segValues.getValueCount();
if (matchingDocs.totalHits < numSegOrds/10) {
// Remap every ord to global ord as we iterate:
int doc = 0;
while (doc < maxDoc && (doc = matchingDocs.bits.nextSetBit(doc)) != -1) {
segValues.setDocument(doc);
int term = (int) segValues.nextOrd();
while (term != SortedSetDocValues.NO_MORE_ORDS) {
counts[(int) ordinalMap.getGlobalOrd(segOrd, term)]++;
term = (int) segValues.nextOrd();
}
++doc;
}
} else {
// First count in seg-ord space:
final int[] segCounts = new int[numSegOrds];
int doc = 0;
while (doc < maxDoc && (doc = matchingDocs.bits.nextSetBit(doc)) != -1) {
segValues.setDocument(doc);
int term = (int) segValues.nextOrd();
while (term != SortedSetDocValues.NO_MORE_ORDS) {
segCounts[term]++;
term = (int) segValues.nextOrd();
}
++doc;
}
// Then, migrate to global ords:
for(int ord=0;ord<numSegOrds;ord++) {
int count = segCounts[ord];
if (count != 0) {
counts[(int) ordinalMap.getGlobalOrd(segOrd, ord)] += count;
}
}
}
} else {
// No ord mapping (e.g., single segment index):
// just aggregate directly into counts:
int doc = 0;
while (doc < maxDoc && (doc = matchingDocs.bits.nextSetBit(doc)) != -1) {
segValues.setDocument(doc);
int term = (int) segValues.nextOrd();
while (term != SortedSetDocValues.NO_MORE_ORDS) {
counts[term]++;
term = (int) segValues.nextOrd();
}
++doc;
}
}
}
@Override
public void rollupValues(FacetRequest fr, int ordinal, int[] children, int[] siblings, FacetArrays facetArrays) {
// Nothing to do here: we only support flat (dim +
// label) facets, and in accumulate we sum up the
// count for the dimension.
}
@Override
public boolean requiresDocScores() {
return false;
}
};
}
/** Keeps highest count results. */
static class TopCountPQ extends PriorityQueue<FacetResultNode> {
public TopCountPQ(int topN) {
super(topN, false);
}
@Override
protected boolean lessThan(FacetResultNode a, FacetResultNode b) {
if (a.value < b.value) {
return true;
} else if (a.value > b.value) {
return false;
} else {
return a.ordinal > b.ordinal;
}
}
}
@Override
public List<FacetResult> accumulate(List<MatchingDocs> matchingDocs) throws IOException {
FacetsAggregator aggregator = getAggregator();
for (CategoryListParams clp : getCategoryLists()) {
for (MatchingDocs md : matchingDocs) {
aggregator.aggregate(md, clp, facetArrays);
}
}
// compute top-K
List<FacetResult> results = new ArrayList<FacetResult>();
int[] counts = facetArrays.getIntArray();
BytesRef scratch = new BytesRef();
for(FacetRequest request : searchParams.facetRequests) {
String dim = request.categoryPath.components[0];
SortedSetDocValuesReaderState.OrdRange ordRange = state.getOrdRange(dim);
// checked in ctor:
assert ordRange != null;
if (request.numResults >= ordRange.end - ordRange.start + 1) {
// specialize this case, user is interested in all available results
ArrayList<FacetResultNode> nodes = new ArrayList<FacetResultNode>();
int dimCount = 0;
for(int ord=ordRange.start; ord<=ordRange.end; ord++) {
//System.out.println(" ord=" + ord + " count= "+ counts[ord] + " bottomCount=" + bottomCount);
if (counts[ord] != 0) {
dimCount += counts[ord];
FacetResultNode node = new FacetResultNode(ord, counts[ord]);
dv.lookupOrd(ord, scratch);
node.label = new CategoryPath(scratch.utf8ToString().split(state.separatorRegex, 2));
nodes.add(node);
}
}
Collections.sort(nodes, new Comparator<FacetResultNode>() {
@Override
public int compare(FacetResultNode o1, FacetResultNode o2) {
// First by highest count
int value = (int) (o2.value - o1.value);
if (value == 0) {
// ... then by lowest ord:
value = o1.ordinal - o2.ordinal;
}
return value;
}
});
CategoryListParams.OrdinalPolicy op = searchParams.indexingParams.getCategoryListParams(request.categoryPath).getOrdinalPolicy(dim);
if (op == CategoryListParams.OrdinalPolicy.ALL_BUT_DIMENSION) {
dimCount = 0;
}
FacetResultNode rootNode = new FacetResultNode(-1, dimCount);
rootNode.label = new CategoryPath(new String[] {dim});
rootNode.subResults = nodes;
results.add(new FacetResult(request, rootNode, nodes.size()));
continue;
}
TopCountPQ q = new TopCountPQ(request.numResults);
int bottomCount = 0;
//System.out.println("collect");
int dimCount = 0;
int childCount = 0;
FacetResultNode reuse = null;
for(int ord=ordRange.start; ord<=ordRange.end; ord++) {
//System.out.println(" ord=" + ord + " count= "+ counts[ord] + " bottomCount=" + bottomCount);
if (counts[ord] > 0) {
childCount++;
if (counts[ord] > bottomCount) {
dimCount += counts[ord];
//System.out.println(" keep");
if (reuse == null) {
reuse = new FacetResultNode(ord, counts[ord]);
} else {
reuse.ordinal = ord;
reuse.value = counts[ord];
}
reuse = q.insertWithOverflow(reuse);
if (q.size() == request.numResults) {
bottomCount = (int) q.top().value;
//System.out.println(" new bottom=" + bottomCount);
}
}
}
}
CategoryListParams.OrdinalPolicy op = searchParams.indexingParams.getCategoryListParams(request.categoryPath).getOrdinalPolicy(dim);
if (op == CategoryListParams.OrdinalPolicy.ALL_BUT_DIMENSION) {
dimCount = 0;
}
FacetResultNode rootNode = new FacetResultNode(-1, dimCount);
rootNode.label = new CategoryPath(new String[] {dim});
FacetResultNode[] childNodes = new FacetResultNode[q.size()];
for(int i=childNodes.length-1;i>=0;i--) {
childNodes[i] = q.pop();
dv.lookupOrd(childNodes[i].ordinal, scratch);
childNodes[i].label = new CategoryPath(scratch.utf8ToString().split(state.separatorRegex, 2));
}
rootNode.subResults = Arrays.asList(childNodes);
results.add(new FacetResult(request, rootNode, childCount));
}
return results;
}
}