/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.codec.postingsformat;
import org.apache.lucene.codecs.*;
import org.apache.lucene.index.*;
import org.apache.lucene.store.*;
import org.apache.lucene.util.*;
import org.elasticsearch.common.util.BloomFilter;
import org.elasticsearch.index.store.DirectoryUtils;
import org.elasticsearch.index.store.Store;
import java.io.IOException;
import java.util.*;
import java.util.Map.Entry;
/**
* <p>
* A {@link PostingsFormat} useful for low doc-frequency fields such as primary
* keys. Bloom filters are maintained in a ".blm" file which offers "fast-fail"
* for reads in segments known to have no record of the key. A choice of
* delegate PostingsFormat is used to record all other Postings data.
* </p>
* <p>
* This is a special bloom filter version, based on {@link org.elasticsearch.common.util.BloomFilter} and inspired
* by Lucene {@link org.apache.lucene.codecs.bloom.BloomFilteringPostingsFormat}.
* </p>
*/
public final class BloomFilterPostingsFormat extends PostingsFormat {
/** Codec name written into, and verified against, the header of the bloom file. */
public static final String BLOOM_CODEC_NAME = "XBloomFilter"; // the Lucene one is named BloomFilter
/** Oldest format version accepted when the bloom file header is checked. */
public static final int BLOOM_CODEC_VERSION = 1;
/** Format version from which readers verify a codec footer instead of a plain end-of-file check. */
public static final int BLOOM_CODEC_VERSION_CHECKSUM = 2;
/** Format version written into newly created bloom files. */
public static final int BLOOM_CODEC_VERSION_CURRENT = BLOOM_CODEC_VERSION_CHECKSUM;
/**
* Extension of Bloom Filters file
*/
static final String BLOOM_EXTENSION = "blm";
/** Factory asked to create one filter per field at write time; starts out as {@code BloomFilter.Factory.DEFAULT}. */
private BloomFilter.Factory bloomFilterFactory = BloomFilter.Factory.DEFAULT;
/** Format that encodes the actual postings; remains {@code null} when the no-argument constructor is used. */
private PostingsFormat delegatePostingsFormat;
/**
 * Builds a write-capable format that pairs per-field Bloom filters with a
 * delegate format. The filters end up in an extra per-segment file using the
 * "blm" extension, while every other piece of postings data is encoded by the
 * delegate.
 *
 * @param delegatePostingsFormat the format that encodes the regular (non-bloom)
 *                               postings data
 * @param bloomFilterFactory     the {@link BloomFilter.Factory} used to create
 *                               the filter for each field
 */
public BloomFilterPostingsFormat(PostingsFormat delegatePostingsFormat,
                                 BloomFilter.Factory bloomFilterFactory) {
    super(BLOOM_CODEC_NAME);
    this.bloomFilterFactory = bloomFilterFactory;
    this.delegatePostingsFormat = delegatePostingsFormat;
}
/**
 * No-argument constructor used only by core Lucene at read-time via Service
 * Provider instantiation. Application code must not use it for writing: no
 * delegate format is set, so {@link #fieldsConsumer(SegmentWriteState)}
 * rejects the instance.
 */
public BloomFilterPostingsFormat() {
    super(BLOOM_CODEC_NAME);
}
/**
 * Creates the write-side consumer: the delegate format's consumer wrapped so
 * that Bloom filters are collected alongside the postings.
 *
 * @param state the segment write state handed to the delegate and the wrapper
 * @return the wrapping consumer
 * @throws UnsupportedOperationException if this instance has no delegate format
 * @throws IOException                   if the delegate fails to create its consumer
 */
@Override
public BloomFilteredFieldsConsumer fieldsConsumer(SegmentWriteState state)
        throws IOException {
    final PostingsFormat delegate = delegatePostingsFormat;
    if (delegate != null) {
        final FieldsConsumer delegateConsumer = delegate.fieldsConsumer(state);
        return new BloomFilteredFieldsConsumer(delegateConsumer, state, delegate);
    }
    throw new UnsupportedOperationException("Error - " + getClass().getName()
            + " has been constructed without a choice of PostingsFormat");
}
/**
 * Creates the read-side producer for a segment. The delegate format is not
 * taken from this instance; the producer resolves it itself from the
 * segment's bloom file.
 *
 * @param state the segment read state
 * @return a producer wrapping the delegate's producer
 * @throws IOException if the producer cannot be opened
 */
@Override
public BloomFilteredFieldsProducer fieldsProducer(SegmentReadState state)
        throws IOException {
    final BloomFilteredFieldsProducer producer = new BloomFilteredFieldsProducer(state);
    return producer;
}
/**
 * @return the delegate format given at construction time, or {@code null}
 *         when the no-argument constructor was used
 */
public PostingsFormat getDelegate() {
    return this.delegatePostingsFormat;
}
/**
 * Holds the position of one serialized Bloom filter inside the bloom file and
 * deserializes it on the first call to {@link #get()}.
 */
private final class LazyBloomLoader implements Accountable {
    /** File position at which the serialized filter starts. */
    private final long offset;
    /** Private clone of the bloom file input, used for the one-off load. */
    private final IndexInput indexInput;
    /**
     * The loaded filter, or {@code null} until the first {@link #get()}.
     * Declared volatile because it is written under the monitor in
     * {@link #get()} but read without it in {@link #ramBytesUsed()}.
     */
    private volatile BloomFilter filter;

    /**
     * @param offset   file position of the serialized filter
     * @param original input to clone; the original itself is left untouched
     */
    private LazyBloomLoader(long offset, IndexInput original) {
        this.offset = offset;
        this.indexInput = original.clone();
    }

    /**
     * Returns the filter, deserializing it on first use. The cloned input is
     * closed once the load attempt finishes.
     *
     * @throws IOException if seeking or deserializing fails
     */
    synchronized BloomFilter get() throws IOException {
        if (filter == null) {
            try (final IndexInput input = indexInput) {
                input.seek(offset);
                this.filter = BloomFilter.deserialize(input);
            }
        }
        return filter;
    }

    /**
     * @return the size reported by the filter, or 0 while it is not loaded
     */
    @Override
    public long ramBytesUsed() {
        // Read the field once so the null check and the dereference see the same value.
        final BloomFilter current = filter;
        return current == null ? 0L : current.getSizeInBytes();
    }

    @Override
    public Iterable<? extends Accountable> getChildResources() {
        return Collections.singleton(Accountables.namedAccountable("bloom", ramBytesUsed()));
    }
}
/**
 * Read-side producer. It opens the segment's bloom file, resolves the delegate
 * postings format by the name stored there, and wraps the terms of every
 * field that has a registered Bloom filter.
 */
public final class BloomFilteredFieldsProducer extends FieldsProducer {
    private FieldsProducer delegateFieldsProducer;
    /** Lazy filter loaders keyed by field name; stays empty when filters are not loaded. */
    HashMap<String, LazyBloomLoader> bloomsByFieldName = new HashMap<>();
    /** Format version read from the bloom file header. */
    private final int version;
    /** The opened bloom file; owned by this producer and closed in {@link #close()}. */
    private final IndexInput data;

    // for internal use only
    FieldsProducer getDelegate() {
        return delegateFieldsProducer;
    }

    /**
     * Opens the bloom file of the segment and the delegate's producer.
     * Filters are registered only when the store directory exposes a codec
     * service whose {@code isLoadBloomFilter()} returns true; in that case
     * the file tail is verified as well (footer or EOF depending on version).
     *
     * @param state the segment read state
     * @throws IOException if the bloom file or the delegate cannot be read
     */
    public BloomFilteredFieldsProducer(SegmentReadState state)
            throws IOException {
        final String blmName = IndexFileNames.segmentFileName(
                state.segmentInfo.name, state.segmentSuffix, BLOOM_EXTENSION);
        final Directory dir = state.directory;
        final IndexInput rawInput = dir.openInput(blmName, state.context);
        boolean success = false;
        try {
            // Header and metadata are read through a checksumming view of a clone.
            final ChecksumIndexInput checked = new BufferedChecksumIndexInput(rawInput.clone());
            version = CodecUtil.checkHeader(checked, BLOOM_CODEC_NAME, BLOOM_CODEC_VERSION,
                    BLOOM_CODEC_VERSION_CURRENT);
            // Name of the delegate postings format, then the number of stored filters
            final String delegateName = checked.readString();
            final int bloomCount = checked.readInt();

            final Store.StoreDirectory storeDir = DirectoryUtils.getStoreDirectory(state.directory);
            final boolean loadBlooms = storeDir != null
                    && storeDir.codecService() != null
                    && storeDir.codecService().isLoadBloomFilter();

            if (loadBlooms) {
                int remaining = bloomCount;
                while (remaining > 0) {
                    final int fieldNumber = checked.readInt();
                    final FieldInfo info = state.fieldInfos.fieldInfo(fieldNumber);
                    // Remember where the filter starts, then skip over its bytes.
                    final LazyBloomLoader lazy = new LazyBloomLoader(checked.getFilePointer(), rawInput);
                    bloomsByFieldName.put(info.name, lazy);
                    BloomFilter.skipBloom(checked);
                    remaining--;
                }
                if (version < BLOOM_CODEC_VERSION_CHECKSUM) {
                    CodecUtil.checkEOF(checked);
                } else {
                    CodecUtil.checkFooter(checked);
                }
            }

            this.delegateFieldsProducer = PostingsFormat.forName(delegateName)
                    .fieldsProducer(state);
            this.data = rawInput;
            success = true;
        } finally {
            if (!success) {
                // Construction failed: release the file without masking the failure.
                IOUtils.closeWhileHandlingException(rawInput);
            }
        }
    }

    @Override
    public Iterator<String> iterator() {
        return delegateFieldsProducer.iterator();
    }

    /** Closes the bloom file input and the delegate producer. */
    @Override
    public void close() throws IOException {
        IOUtils.close(data, delegateFieldsProducer);
    }

    /**
     * Returns the delegate's terms for the field, wrapped with the field's
     * Bloom filter when one is registered and the delegate has terms.
     */
    @Override
    public Terms terms(String field) throws IOException {
        final LazyBloomLoader loader = bloomsByFieldName.get(field);
        final Terms delegateTerms = delegateFieldsProducer.terms(field);
        if (loader == null || delegateTerms == null) {
            return delegateTerms;
        }
        return new BloomFilteredTerms(delegateTerms, loader.get());
    }

    @Override
    public int size() {
        return delegateFieldsProducer.size();
    }

    /** Sums the delegate's usage and the usage reported by each filter loader. */
    @Override
    public long ramBytesUsed() {
        long total = delegateFieldsProducer.ramBytesUsed();
        for (LazyBloomLoader loader : bloomsByFieldName.values()) {
            total += loader.ramBytesUsed();
        }
        return total;
    }

    @Override
    public Iterable<? extends Accountable> getChildResources() {
        final List<Accountable> children = new ArrayList<>();
        children.addAll(Accountables.namedAccountables("field", bloomsByFieldName));
        if (delegateFieldsProducer != null) {
            children.add(Accountables.namedAccountable("delegate", delegateFieldsProducer));
        }
        return Collections.unmodifiableList(children);
    }

    /**
     * Checks the delegate first, then checksums the whole bloom file when its
     * version carries a footer.
     */
    @Override
    public void checkIntegrity() throws IOException {
        delegateFieldsProducer.checkIntegrity();
        if (version < BLOOM_CODEC_VERSION_CHECKSUM) {
            return;
        }
        CodecUtil.checksumEntireFile(data);
    }

    /** Returns the delegate's merge instance directly, without any Bloom wrapping. */
    @Override
    public FieldsProducer getMergeInstance() throws IOException {
        return delegateFieldsProducer.getMergeInstance();
    }
}
/**
 * {@link Terms} view that hands out enums consulting a Bloom filter before
 * performing an exact seek on the wrapped terms.
 */
public static final class BloomFilteredTerms extends FilterLeafReader.FilterTerms {
    private BloomFilter filter;

    /**
     * @param terms  the terms to wrap
     * @param filter the filter consulted by the enums created from this instance
     */
    public BloomFilteredTerms(Terms terms, BloomFilter filter) {
        super(terms);
        this.filter = filter;
    }

    /** @return the filter passed to the constructor */
    public BloomFilter getFilter() {
        return this.filter;
    }

    /**
     * Returns a filtering enum over the wrapped terms. A {@code reuse} of our
     * own enum type that was built on the same filter instance is reset and
     * returned; otherwise a new enum is created, passing along whatever inner
     * enum can still be offered to the delegate for reuse.
     */
    @Override
    public TermsEnum iterator(TermsEnum reuse) throws IOException {
        TermsEnum innerReuse = reuse;
        // instanceof is false for null, so no separate null check is needed
        if (reuse instanceof BloomFilteredTermsEnum) {
            final BloomFilteredTermsEnum candidate = (BloomFilteredTermsEnum) reuse;
            if (candidate.filter == filter) {
                // same filter: recycle the wrapper and let it recycle its inner enum
                candidate.reset(this.in);
                return candidate;
            }
            // different filter: only the enum it held for reuse can be passed on
            innerReuse = candidate.reuse;
        }
        return new BloomFilteredTermsEnum(this.in, innerReuse, filter);
    }
}
/**
 * {@link TermsEnum} that answers {@link #seekExact(BytesRef)} from a Bloom
 * filter whenever the filter rules the term out, and forwards everything else
 * to an enum pulled lazily from the wrapped {@link Terms}.
 */
static final class BloomFilteredTermsEnum extends TermsEnum {
    private Terms delegateTerms;
    /** Created on first use by {@link #getDelegate()}. */
    private TermsEnum delegateTermsEnum;
    /** Enum offered to the wrapped terms for reuse when the delegate enum is pulled. */
    private TermsEnum reuse;
    private BloomFilter filter;

    public BloomFilteredTermsEnum(Terms other, TermsEnum reuse, BloomFilter filter) {
        this.filter = filter;
        this.reuse = reuse;
        this.delegateTerms = other;
    }

    /**
     * Re-targets this enum at another {@link Terms}. The current delegate enum
     * (possibly {@code null}) becomes the reuse candidate for the next pull.
     */
    void reset(Terms others) {
        this.delegateTerms = others;
        this.reuse = this.delegateTermsEnum;
        this.delegateTermsEnum = null;
    }

    /**
     * Pulls the delegate enum only when it is really needed; this can be a
     * relatively heavy operation depending on the delegate postings format
     * and the underlying directory (clone IndexInput).
     */
    private TermsEnum getDelegate() throws IOException {
        TermsEnum current = delegateTermsEnum;
        if (current == null) {
            current = delegateTerms.iterator(reuse);
            delegateTermsEnum = current;
        }
        return current;
    }

    @Override
    public final BytesRef next() throws IOException {
        return getDelegate().next();
    }

    /**
     * The fail-fast path this class exists for: when the in-memory filter
     * says the term cannot be present, the delegate is never consulted. The
     * filter may give false positives but no false negatives, so a positive
     * answer still goes to the delegate.
     */
    @Override
    public final boolean seekExact(BytesRef text)
            throws IOException {
        return filter.mightContain(text) && getDelegate().seekExact(text);
    }

    @Override
    public final SeekStatus seekCeil(BytesRef text)
            throws IOException {
        return getDelegate().seekCeil(text);
    }

    @Override
    public final void seekExact(long ord) throws IOException {
        getDelegate().seekExact(ord);
    }

    @Override
    public final BytesRef term() throws IOException {
        return getDelegate().term();
    }

    @Override
    public final long ord() throws IOException {
        return getDelegate().ord();
    }

    @Override
    public final int docFreq() throws IOException {
        return getDelegate().docFreq();
    }

    @Override
    public final long totalTermFreq() throws IOException {
        return getDelegate().totalTermFreq();
    }

    @Override
    public DocsAndPositionsEnum docsAndPositions(Bits liveDocs,
            DocsAndPositionsEnum reuse, int flags) throws IOException {
        return getDelegate().docsAndPositions(liveDocs, reuse, flags);
    }

    @Override
    public DocsEnum docs(Bits liveDocs, DocsEnum reuse, int flags)
            throws IOException {
        return getDelegate().docs(liveDocs, reuse, flags);
    }
}
/**
 * Write-side consumer. It forwards all postings to the delegate consumer,
 * builds one Bloom filter per field from the terms it sees, and writes the
 * filters to the segment's bloom file on {@link #close()}.
 */
final class BloomFilteredFieldsConsumer extends FieldsConsumer {
    private final FieldsConsumer delegateFieldsConsumer;
    /** Filters collected by {@link #write(Fields)}, keyed by the field they belong to. */
    private final Map<FieldInfo, BloomFilter> bloomFilters = new HashMap<>();
    private final SegmentWriteState state;

    /**
     * @param fieldsConsumer         consumer of the delegate format; receives all postings
     * @param state                  write state of the segment being flushed or merged
     * @param delegatePostingsFormat not stored here; {@link #close()} reads the name of
     *                               the enclosing format's delegate instead
     */
    public BloomFilteredFieldsConsumer(FieldsConsumer fieldsConsumer,
            SegmentWriteState state, PostingsFormat delegatePostingsFormat) {
        this.delegateFieldsConsumer = fieldsConsumer;
        this.state = state;
    }

    // for internal use only
    FieldsConsumer getDelegate() {
        return delegateFieldsConsumer;
    }

    /**
     * Writes the fields through the delegate, then adds every term that has at
     * least one document to the Bloom filter of its field. A filter is created
     * for a field when its first term is encountered.
     */
    @Override
    public void write(Fields fields) throws IOException {
        // Delegate must write first: it may have opened files
        // on creating the class
        // (e.g. Lucene41PostingsConsumer), and write() will
        // close them; alternatively, if we delayed pulling
        // the fields consumer until here, we could do it
        // afterwards:
        delegateFieldsConsumer.write(fields);

        for (String field : fields) {
            Terms terms = fields.terms(field);
            if (terms == null) {
                continue;
            }
            FieldInfo fieldInfo = state.fieldInfos.fieldInfo(field);
            TermsEnum termsEnum = terms.iterator(null);
            BloomFilter bloomFilter = null;
            DocsEnum docsEnum = null;
            BytesRef term;
            while ((term = termsEnum.next()) != null) {
                if (bloomFilter == null) {
                    bloomFilter = bloomFilterFactory.createFilter(state.segmentInfo.getDocCount());
                    // The map is keyed by FieldInfo, so the check must use the FieldInfo;
                    // checking the field name String could never detect a duplicate.
                    assert bloomFilters.containsKey(fieldInfo) == false;
                    bloomFilters.put(fieldInfo, bloomFilter);
                }
                // Make sure there's at least one doc for this term:
                docsEnum = termsEnum.docs(null, docsEnum, 0);
                if (docsEnum.nextDoc() != DocsEnum.NO_MORE_DOCS) {
                    bloomFilter.put(term);
                }
            }
        }
    }

    /**
     * Closes the delegate consumer, then writes the bloom file: header, name
     * of the delegate postings format, number of filters, one (field number,
     * serialized filter) pair per field, and the footer.
     */
    @Override
    public void close() throws IOException {
        delegateFieldsConsumer.close();

        // Now we are done accumulating values for these fields
        String bloomFileName = IndexFileNames.segmentFileName(
                state.segmentInfo.name, state.segmentSuffix, BLOOM_EXTENSION);
        // try-with-resources keeps a write failure as the primary exception even
        // when closing the output fails as well.
        try (IndexOutput bloomOutput = state.directory.createOutput(bloomFileName, state.context)) {
            CodecUtil.writeHeader(bloomOutput, BLOOM_CODEC_NAME,
                    BLOOM_CODEC_VERSION_CURRENT);
            // remember the name of the postings format we will delegate to
            bloomOutput.writeString(delegatePostingsFormat.getName());
            // First field in the output file is the number of fields+blooms saved
            bloomOutput.writeInt(bloomFilters.size());
            for (Entry<FieldInfo, BloomFilter> entry : bloomFilters.entrySet()) {
                FieldInfo fieldInfo = entry.getKey();
                bloomOutput.writeInt(fieldInfo.number);
                saveAppropriatelySizedBloomFilter(bloomOutput, entry.getValue(), fieldInfo);
            }
            CodecUtil.writeFooter(bloomOutput);
        }
        // We are done with large bitsets so no need to keep them hanging around
        bloomFilters.clear();
    }

    /** Serializes the filter to the output as is; {@code fieldInfo} is currently unused. */
    private void saveAppropriatelySizedBloomFilter(IndexOutput bloomOutput,
            BloomFilter bloomFilter, FieldInfo fieldInfo) throws IOException {
        BloomFilter.serilaize(bloomFilter, bloomOutput);
    }
}
}