/**
* This software is licensed to you under the Apache License, Version 2.0 (the
* "Apache License").
*
* LinkedIn's contributions are made under the Apache License. If you contribute
* to the Software, the contributions will be deemed to have been made under the
* Apache License, unless you expressly indicate otherwise. Please do not make any
* contributions that would be inconsistent with the Apache License.
*
* You may obtain a copy of the Apache License at http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, this software
* distributed under the Apache License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the Apache
* License for the specific language governing permissions and limitations for the
* software governed under the Apache License.
*
* © 2012 LinkedIn Corp. All Rights Reserved.
*/
package com.senseidb.search.query.filters;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Logger;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import com.browseengine.bobo.api.BoboIndexReader;
import com.browseengine.bobo.api.BrowseSelection;
import com.browseengine.bobo.api.BrowseSelection.ValueOperation;
import com.browseengine.bobo.facets.FacetHandler;
import com.browseengine.bobo.facets.data.FacetDataCache;
import com.browseengine.bobo.facets.data.TermValueList;
import com.browseengine.bobo.facets.filter.EmptyFilter;
import com.browseengine.bobo.facets.filter.RandomAccessFilter;
import com.browseengine.bobo.query.MatchAllDocIdSetIterator;
import com.kamikaze.docidset.impl.AndDocIdSet;
import com.kamikaze.docidset.impl.NotDocIdSet;
import com.kamikaze.docidset.impl.OrDocIdSet;
import com.senseidb.search.facet.UIDFacetHandler;
import com.senseidb.util.Pair;
/**
 * Filter that selects documents by the term values of a single field.
 *
 * <p>The filter is described by a list of included values (combined with AND or OR) and a
 * list of excluded values. When the field is backed by a Bobo facet handler the filter is
 * built from that handler; otherwise it falls back to iterating plain Lucene terms.
 */
public class SenseiTermFilter extends SenseiFilter {
  /** Serialization version identifier. */
  private static final long serialVersionUID = 1L;

  private static final Logger logger = Logger.getLogger(SenseiTermFilter.class);

  /** Name of the field / facet this filter applies to. */
  private final String _name;
  /** Included values; never null (a null constructor argument becomes an empty array). */
  private final String[] _vals;
  /** Excluded values; never null (a null constructor argument becomes an empty array). */
  private final String[] _not;
  /** Whether the included values are ANDed (true) or ORed (false). */
  private final boolean _isAnd;
  /** When true and facet data is available, build the doc id set from Lucene terms instead of the facet filter. */
  private final boolean _noAutoOptimize;

  /**
   * Creates a term filter.
   *
   * @param name           field / facet name
   * @param vals           included values; null is treated as empty
   * @param not            excluded values; null is treated as empty
   * @param isAnd          true to AND the included values, false to OR them; forced to true
   *                       when there are no included values
   * @param noAutoOptimize true to bypass the facet-based filter when facet data is available
   */
  public SenseiTermFilter(String name,String vals[],String[] not,boolean isAnd,boolean noAutoOptimize){
    _name = name;
    _vals = vals != null ? vals : new String[0];
    _not = not != null ? not : new String[0];
    // Bobo silliness: Empty vals means match all, which technically means an AND of an empty set.
    // EXCEPT if nots are also empty, but this is handled below.
    _isAnd = isAnd || _vals.length == 0;
    _noAutoOptimize = noAutoOptimize;
  }

  /**
   * Builds a doc id set over plain Lucene terms, one {@link TermDocIdSetIterator} per value.
   *
   * @param reader index reader the term iterators are opened against
   * @param name   field name of the terms
   * @param vals   term values; may be null
   * @param isAnd  true to intersect the per-value sets, false to union them
   * @return null when {@code vals} is null or empty, the single per-value set when there is
   *         exactly one value, otherwise an AND or OR of all per-value sets
   */
  static DocIdSet buildDefaultDocIdSets(final BoboIndexReader reader,
                                        final String name,
                                        final String[] vals,
                                        boolean isAnd){
    if (vals == null || vals.length == 0) {
      return null;
    }

    List<DocIdSet> perValueSets = new ArrayList<DocIdSet>(vals.length);
    for (final String val : vals) {
      perValueSets.add(new DocIdSet() {
        @Override
        public DocIdSetIterator iterator() throws IOException {
          return new TermDocIdSetIterator(new Term(name, val), reader);
        }
      });
    }

    if (perValueSets.size() == 1) {
      return perValueSets.get(0);
    }
    return isAnd ? new AndDocIdSet(perValueSets) : new OrDocIdSet(perValueSets);
  }

  /**
   * Builds the Lucene-term based doc id set for "included values AND NOT excluded values".
   *
   * <p>Fields that are registered as runtime facet handlers are skipped: a non-cacheable
   * match-all set is returned for them.
   *
   * @param reader index reader
   * @param name   field name
   * @param vals   included values; may be null or empty
   * @param nots   excluded values (always ORed together); may be null or empty
   * @param isAnd  true to AND the included values, false to OR them
   * @return the resulting doc id set; the {@link EmptyFilter} set when both lists are empty
   * @throws IOException if building the empty filter's doc id set fails
   */
  private static DocIdSet buildLuceneDefaultDocIdSet(final BoboIndexReader reader,
                                                     final String name,
                                                     final String[] vals,
                                                     String[] nots,
                                                     boolean isAnd) throws IOException{
    if (reader.getRuntimeFacetHandlerFactoryMap().containsKey(name)) {
      // Skip runtime facet handlers
      return new DocIdSet() {
        @Override
        public boolean isCacheable() {
          return false;
        }

        @Override
        public DocIdSetIterator iterator() throws IOException {
          return new MatchAllDocIdSetIterator(reader);
        }
      };
    }

    DocIdSet positiveSet = null;
    if (vals != null && vals.length > 0) {
      positiveSet = buildDefaultDocIdSets(reader, name, vals, isAnd);
    }
    DocIdSet negativeSet = null;
    if (nots != null && nots.length > 0) {
      negativeSet = buildDefaultDocIdSets(reader, name, nots, false);
    }

    if (positiveSet == null) {
      if (negativeSet == null) {
        return EmptyFilter.getInstance().getRandomAccessDocIdSet(reader);
      }
      // this could be optimized with AndNot in new Kamikaze
      return new NotDocIdSet(negativeSet, reader.maxDoc());
    }
    if (negativeSet == null) {
      return positiveSet;
    }
    DocIdSet[] sets = new DocIdSet[]{positiveSet, new NotDocIdSet(negativeSet, reader.maxDoc())};
    return new AndDocIdSet(Arrays.asList(sets));
  }

  /**
   * Renders a human-readable description of the query plan for debugging.
   *
   * <p>The string is only built when debug logging is enabled; otherwise {@code EMPTY_STRING}
   * is returned. Values that were optimized out are listed after the remaining values and are
   * marked with a trailing {@code '*'}.
   *
   * @param type          label describing the kind of plan
   * @param vals          included values still part of the filter
   * @param nots          excluded values still part of the filter
   * @param optimizedVals included values that were optimized out
   * @param optimizedNots excluded values that were optimized out
   * @return the plan description, or {@code EMPTY_STRING} when debug logging is disabled
   */
  public String planString(String type, String[] vals, String[] nots, List<String> optimizedVals, List<String> optimizedNots) {
    if (!logger.isDebugEnabled()) {
      return EMPTY_STRING;
    }
    StringBuilder plan = new StringBuilder();
    plan.append(_name);
    plan.append(" ");
    plan.append(type);
    plan.append(_isAnd ? " CONTAINS ALL <" : " IN <");
    plan.append(StringUtils.join(vals, ", "));
    appendOptimizedOut(plan, vals, optimizedVals);
    plan.append("> EXCEPT <");
    plan.append(StringUtils.join(nots, ", "));
    // The separator decision must be based on nots here (the original consulted vals by mistake).
    appendOptimizedOut(plan, nots, optimizedNots);
    plan.append(">");
    return plan.toString();
  }

  /**
   * Appends the optimized-out values, each followed by {@code '*'}, to the plan.
   * A leading {@code ", "} is emitted only when {@code kept} already contributed entries.
   * {@code kept} is not inspected at all when {@code optimizedOut} is empty.
   */
  private static void appendOptimizedOut(StringBuilder plan, String[] kept, List<String> optimizedOut) {
    if (optimizedOut.isEmpty()) {
      return;
    }
    boolean needsSeparator = kept.length != 0;
    for (String value : optimizedOut) {
      if (needsSeparator) {
        plan.append(", ");
      }
      plan.append(value);
      plan.append('*');
      needsSeparator = true;
    }
  }

  /**
   * Builds the doc id set, cardinality estimate and plan description for this filter.
   *
   * <p>Strategy, in order:
   * <ol>
   *   <li>no included and no excluded values: match nothing;</li>
   *   <li>no facet handler for the field: Lucene term based set with a random cardinality;</li>
   *   <li>{@link UIDFacetHandler}: facet filter with a cardinality range derived from
   *       {@code maxDoc} and the number of excluded values;</li>
   *   <li>facet data is a {@link FacetDataCache}: values are reordered / pruned by frequency,
   *       trivial match-all / match-none results are short-circuited, and the Lucene term
   *       based set is used instead of the facet filter when auto-optimization is disabled;</li>
   *   <li>otherwise: facet filter, with the cardinality taken from the filter's own
   *       selectivity.</li>
   * </ol>
   *
   * @param reader index reader; must be a {@link BoboIndexReader}
   * @return the doc id set together with its cardinality estimate and plan string
   * @throws IOException           if an underlying doc id set cannot be built
   * @throws IllegalStateException if {@code reader} is not a {@link BoboIndexReader}
   */
  @Override
  public SenseiDocIdSet getSenseiDocIdSet(IndexReader reader) throws IOException {
    if (!(reader instanceof BoboIndexReader)) {
      throw new IllegalStateException("read not instance of "+BoboIndexReader.class);
    }
    BoboIndexReader boboReader = (BoboIndexReader)reader;
    FacetHandler facetHandler = (FacetHandler)boboReader.getFacetHandler(_name);

    // _vals and _not are normalized to non-null arrays by the constructor.
    String[] vals = _vals;
    String[] nots = _not;
    List<String> optimizedVals = new ArrayList<String>(vals.length);
    List<String> optimizedNots = new ArrayList<String>(nots.length);
    int maxDoc = reader.maxDoc();

    if (vals.length == 0 && nots.length == 0) {
      // Bobo madness part 2: no vals and no nots will match nothing, regardless of isAnd.
      return SenseiDocIdSet.buildMatchNone(planString("TRIVIAL", vals, nots, optimizedVals, optimizedNots));
    }

    if (facetHandler == null) {
      if (logger.isDebugEnabled()) {
        logger.debug("not facet support, default to term filter: "+_name);
      }
      DocIdSet docIdSet = buildLuceneDefaultDocIdSet(boboReader, _name, vals, nots, _isAnd);
      // No cardinality since we don't have the facet data and because Lucene's TermDocs is
      // too expensive to justify calling
      return new SenseiDocIdSet(docIdSet, DocIdSetCardinality.random(), planString("NOFACET LUCENE", vals, nots, optimizedVals, optimizedNots));
    }

    // Stays null when no estimate can be derived here; Bobo is asked further down in that case.
    DocIdSetCardinality totalDocIdSetCardinality = null;
    String planType = "FACETED NOFACETDATA";

    if (facetHandler instanceof UIDFacetHandler) {
      planType = "FACET UID";
      if (vals.length != 0) {
        // We *could* look up all the ranges right now and see if there's any one even there. This would greatly
        // speed up empty _uid queries, but I've never seen one of those.
        totalDocIdSetCardinality = DocIdSetCardinality.exactRange(0, 1, maxDoc + 1);
      } else {
        totalDocIdSetCardinality = DocIdSetCardinality.zero();
      }
      if (nots.length != 0) {
        totalDocIdSetCardinality.andWith(DocIdSetCardinality.exactRange(maxDoc + 1 - nots.length, maxDoc + 1, maxDoc + 1));
      }
    } else {
      Object facetDataObj = facetHandler.getFacetData(boboReader);
      if (facetDataObj instanceof FacetDataCache) {
        planType = "FACETED";
        FacetDataCache facetData = (FacetDataCache)facetDataObj;
        TermValueList valArray = facetData.valArray;
        int[] freqs = facetData.freqs;

        // Total cardinality = AND/OR(val1, val2, ...) AND NOT (OR(not1, not2))
        totalDocIdSetCardinality = _isAnd ? DocIdSetCardinality.one() : DocIdSetCardinality.zero();
        vals = getValsByFrequency(vals, freqs, maxDoc, totalDocIdSetCardinality, valArray, optimizedVals, _isAnd);

        DocIdSetCardinality notDocIdSetCardinality = DocIdSetCardinality.zero();
        nots = getValsByFrequency(nots, freqs, maxDoc, notDocIdSetCardinality, valArray, optimizedNots, false);
        notDocIdSetCardinality.invert();
        totalDocIdSetCardinality.andWith(notDocIdSetCardinality);

        // If we optimized it out completely, return trivial sets. This is mostly there to deal with weird
        // semantics for empty-match filters in Bobo.
        if (totalDocIdSetCardinality.isOne()) {
          return SenseiDocIdSet.buildMatchAll(reader, planString("FACET TRIVIAL", vals, nots, optimizedVals, optimizedNots));
        } else if (totalDocIdSetCardinality.isZero()) {
          return SenseiDocIdSet.buildMatchNone(planString("FACET TRIVIAL", vals, nots, optimizedVals, optimizedNots));
        }

        if (_noAutoOptimize) {
          DocIdSet docIdSet = buildLuceneDefaultDocIdSet(boboReader, _name, vals, nots, _isAnd);
          return new SenseiDocIdSet(docIdSet, totalDocIdSetCardinality, planString("DE-OPTIMIZED LUCENE", vals, nots, optimizedVals, optimizedNots));
        }
      }
    }

    // we get to optimize using facets
    BrowseSelection sel = new BrowseSelection(_name);
    sel.setValues(vals);
    if (nots != null) {
      sel.setNotValues(nots);
    }
    sel.setSelectionOperation(_isAnd ? ValueOperation.ValueOperationAnd : ValueOperation.ValueOperationOr);

    RandomAccessFilter filter = facetHandler.buildFilter(sel);
    if (filter == null) {
      filter = EmptyFilter.getInstance();
    }

    // If we don't have a cardinality estimate, ask Bobo.
    if (totalDocIdSetCardinality == null) {
      totalDocIdSetCardinality = DocIdSetCardinality.exact(filter.getFacetSelectivity(boboReader));
      // Zero means 'delete', and I don't trust Bobo enough.
      if (totalDocIdSetCardinality.isZero()) {
        totalDocIdSetCardinality = DocIdSetCardinality.exactRange(0.0, 0.001);
      }
    }
    return new SenseiDocIdSet(filter.getDocIdSet(boboReader), totalDocIdSetCardinality, planString(planType, vals, nots, optimizedVals, optimizedNots));
  }

  /** Orders (value, cardinality) pairs from highest to lowest cardinality. */
  private static final Comparator<Pair<String, DocIdSetCardinality>> DECREASING_CARDINALITY_COMPARATOR = new Comparator<Pair<String, DocIdSetCardinality>>() {
    @Override
    public int compare(Pair<String, DocIdSetCardinality> a, Pair<String, DocIdSetCardinality> b) {
      // Swap the operands rather than negating the result: negation flips the sign
      // incorrectly if compareTo ever returns Integer.MIN_VALUE.
      return b.getSecond().compareTo(a.getSecond());
    }
  };

  /** Orders (value, cardinality) pairs from lowest to highest cardinality. */
  public static final Comparator<Pair<String, DocIdSetCardinality>> INCREASING_CARDINALITY_COMPARATOR = new Comparator<Pair<String, DocIdSetCardinality>> (){
    @Override
    public int compare(Pair<String, DocIdSetCardinality> a, Pair<String, DocIdSetCardinality> b) {
      return a.getSecond().compareTo(b.getSecond());
    }
  };

  /**
   * Get the list of values, sorted by frequency.
   *
   * <p>ANDs will be sorted by increasing frequency, ORs by decreasing.
   * We skip terms in the AND which match all docs. We skip terms in the OR which match no docs.
   * Skipped terms are recorded in {@code optimizedOut}.
   * We update total cardinality as we go, but it's supposed to be initialized to 1 for ANDs, 0 for ORs.
   *
   * <p>Values that are not found in {@code valArray} are dropped without being added to
   * {@code optimizedOut} and without affecting {@code total}.
   * NOTE(review): for an AND this means a value missing from the dictionary is silently
   * ignored instead of forcing an empty result - confirm this is intended.
   *
   * @param vals         values to look up
   * @param freqs        per-term frequencies, indexed like {@code valArray}
   * @param maxDoc       reader's maxDoc; {@code maxDoc + 1} is the denominator of each term's cardinality
   * @param total        running cardinality, updated in place
   * @param valArray     term dictionary used to resolve each value to an index
   * @param optimizedOut receives the values that were skipped as trivial
   * @param isAnd        true for AND semantics, false for OR
   * @return the remaining values (as stored in {@code valArray}), sorted by cardinality
   */
  static String[] getValsByFrequency(String[] vals, int[] freqs, int maxDoc, DocIdSetCardinality total, TermValueList valArray, List<String> optimizedOut, boolean isAnd) {
    List<Pair<String, DocIdSetCardinality>> valsAndFreqs = new ArrayList<Pair<String, DocIdSetCardinality>>(vals.length);

    for (String val : vals) {
      int termIndex = valArray.indexOf(val);
      if (termIndex < 0) {
        continue;
      }
      DocIdSetCardinality docIdSetCardinality = DocIdSetCardinality.exact(((double) freqs[termIndex]) / (maxDoc + 1));
      if (isAnd) {
        if (docIdSetCardinality.isOne()) {
          optimizedOut.add(val);
          continue;
        }
        total.andWith(docIdSetCardinality);
      } else {
        if (docIdSetCardinality.isZero()) {
          optimizedOut.add(val);
          continue;
        }
        total.orWith(docIdSetCardinality);
      }
      valsAndFreqs.add(new Pair<String, DocIdSetCardinality>(valArray.get(termIndex), docIdSetCardinality));
    }

    // Lowest cardinality docs go first to optimize the AND case, last for the OR case.
    Collections.sort(valsAndFreqs, isAnd ? INCREASING_CARDINALITY_COMPARATOR : DECREASING_CARDINALITY_COMPARATOR);

    String[] sortedVals = new String[valsAndFreqs.size()];
    for (int k = 0; k < sortedVals.length; k++) {
      sortedVals[k] = valsAndFreqs.get(k).getFirst();
    }
    return sortedVals;
  }
}