Package org.elasticsearch.index.codec.postingsformat

Source Code of org.elasticsearch.index.codec.postingsformat.BloomFilterPostingsFormat

/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*    http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied.  See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.elasticsearch.index.codec.postingsformat;

import org.apache.lucene.codecs.*;
import org.apache.lucene.index.*;
import org.apache.lucene.store.*;
import org.apache.lucene.util.*;
import org.elasticsearch.common.util.BloomFilter;
import org.elasticsearch.index.store.DirectoryUtils;
import org.elasticsearch.index.store.Store;

import java.io.IOException;
import java.util.*;
import java.util.Map.Entry;

/**
* <p>
* A {@link PostingsFormat} useful for low doc-frequency fields such as primary
* keys. Bloom filters are maintained in a ".blm" file which offers "fast-fail"
* for reads in segments known to have no record of the key. A choice of
* delegate PostingsFormat is used to record all other Postings data.
* </p>
* <p>
* This is a special bloom filter version, based on {@link org.elasticsearch.common.util.BloomFilter} and inspired
* by Lucene {@link org.apache.lucene.codecs.bloom.BloomFilteringPostingsFormat}.
* </p>
*/
public final class BloomFilterPostingsFormat extends PostingsFormat {

    public static final String BLOOM_CODEC_NAME = "XBloomFilter"; // the Lucene one is named BloomFilter
    public static final int BLOOM_CODEC_VERSION = 1;
    public static final int BLOOM_CODEC_VERSION_CHECKSUM = 2;
    public static final int BLOOM_CODEC_VERSION_CURRENT = BLOOM_CODEC_VERSION_CHECKSUM;

    /**
     * Extension of Bloom Filters file
     */
    static final String BLOOM_EXTENSION = "blm";

    private BloomFilter.Factory bloomFilterFactory = BloomFilter.Factory.DEFAULT;
    private PostingsFormat delegatePostingsFormat;

    /**
     * Creates Bloom filters for a selection of fields created in the index. This
     * is recorded as a set of Bitsets held as a segment summary in an additional
     * "blm" file. This PostingsFormat delegates to a choice of delegate
     * PostingsFormat for encoding all other postings data.
     *
     * @param delegatePostingsFormat The PostingsFormat that records all the non-bloom filter data i.e.
     *                               postings info.
     * @param bloomFilterFactory     The {@link BloomFilter.Factory} responsible for sizing BloomFilters
     *                               appropriately
     */
    public BloomFilterPostingsFormat(PostingsFormat delegatePostingsFormat,
                                     BloomFilter.Factory bloomFilterFactory) {
        super(BLOOM_CODEC_NAME);
        this.delegatePostingsFormat = delegatePostingsFormat;
        this.bloomFilterFactory = bloomFilterFactory;
    }

    // Used only by core Lucene at read-time via Service Provider instantiation -
    // do not use at Write-time in application code.
    public BloomFilterPostingsFormat() {
        super(BLOOM_CODEC_NAME);
    }

    @Override
    public BloomFilteredFieldsConsumer fieldsConsumer(SegmentWriteState state)
            throws IOException {
        if (delegatePostingsFormat == null) {
            throw new UnsupportedOperationException("Error - " + getClass().getName()
                    + " has been constructed without a choice of PostingsFormat");
        }
        return new BloomFilteredFieldsConsumer(
                delegatePostingsFormat.fieldsConsumer(state), state,
                delegatePostingsFormat);
    }

    @Override
    public BloomFilteredFieldsProducer fieldsProducer(SegmentReadState state)
            throws IOException {
        return new BloomFilteredFieldsProducer(state);
    }

    public PostingsFormat getDelegate() {
        return delegatePostingsFormat;
    }

    private final class LazyBloomLoader implements Accountable {
        private final long offset;
        private final IndexInput indexInput;
        private BloomFilter filter;

        private LazyBloomLoader(long offset, IndexInput origial) {
            this.offset = offset;
            this.indexInput = origial.clone();
        }

        synchronized BloomFilter get() throws IOException {
            if (filter == null) {
                try (final IndexInput input = indexInput) {
                    input.seek(offset);
                    this.filter = BloomFilter.deserialize(input);
                }
            }
            return filter;
        }

        @Override
        public long ramBytesUsed() {
            return filter == null ? 0l : filter.getSizeInBytes();
        }

        @Override
        public Iterable<? extends Accountable> getChildResources() {
            return Collections.singleton(Accountables.namedAccountable("bloom", ramBytesUsed()));
        }
    }

    public final class BloomFilteredFieldsProducer extends FieldsProducer {
        private FieldsProducer delegateFieldsProducer;
        HashMap<String, LazyBloomLoader> bloomsByFieldName = new HashMap<>();
        private final int version;
        private final IndexInput data;

        // for internal use only
        FieldsProducer getDelegate() {
            return delegateFieldsProducer;
        }

        public BloomFilteredFieldsProducer(SegmentReadState state)
                throws IOException {

            final String bloomFileName = IndexFileNames.segmentFileName(
                    state.segmentInfo.name, state.segmentSuffix, BLOOM_EXTENSION);
            final Directory directory = state.directory;
            IndexInput dataInput = directory.openInput(bloomFileName, state.context);
            try {
                ChecksumIndexInput bloomIn = new BufferedChecksumIndexInput(dataInput.clone());
                version = CodecUtil.checkHeader(bloomIn, BLOOM_CODEC_NAME, BLOOM_CODEC_VERSION,
                        BLOOM_CODEC_VERSION_CURRENT);
                // // Load the hash function used in the BloomFilter
                // hashFunction = HashFunction.forName(bloomIn.readString());
                // Load the delegate postings format
               final String delegatePostings = bloomIn
                        .readString();
                int numBlooms = bloomIn.readInt();

                boolean load = false;
                Store.StoreDirectory storeDir = DirectoryUtils.getStoreDirectory(state.directory);
                if (storeDir != null && storeDir.codecService() != null) {
                    load = storeDir.codecService().isLoadBloomFilter();
                }

                if (load) {
                    for (int i = 0; i < numBlooms; i++) {
                        int fieldNum = bloomIn.readInt();
                        FieldInfo fieldInfo = state.fieldInfos.fieldInfo(fieldNum);
                        LazyBloomLoader loader = new LazyBloomLoader(bloomIn.getFilePointer(), dataInput);
                        bloomsByFieldName.put(fieldInfo.name, loader);
                        BloomFilter.skipBloom(bloomIn);
                    }
                    if (version >= BLOOM_CODEC_VERSION_CHECKSUM) {
                        CodecUtil.checkFooter(bloomIn);
                    } else {
                        CodecUtil.checkEOF(bloomIn);
                    }
                }
                this.delegateFieldsProducer = PostingsFormat.forName(delegatePostings)
                        .fieldsProducer(state);
                this.data = dataInput;
                dataInput = null; // null it out such that we don't close it
            } finally {
                IOUtils.closeWhileHandlingException(dataInput);
            }
        }

        @Override
        public Iterator<String> iterator() {
            return delegateFieldsProducer.iterator();
        }

        @Override
        public void close() throws IOException {
            IOUtils.close(data, delegateFieldsProducer);
        }

        @Override
        public Terms terms(String field) throws IOException {
            LazyBloomLoader filter = bloomsByFieldName.get(field);
            if (filter == null) {
                return delegateFieldsProducer.terms(field);
            } else {
                Terms result = delegateFieldsProducer.terms(field);
                if (result == null) {
                    return null;
                }
                return new BloomFilteredTerms(result, filter.get());
            }
        }

        @Override
        public int size() {
            return delegateFieldsProducer.size();
        }

        @Override
        public long ramBytesUsed() {
            long size = delegateFieldsProducer.ramBytesUsed();
            for (LazyBloomLoader bloomFilter : bloomsByFieldName.values()) {
                size += bloomFilter.ramBytesUsed();
            }
            return size;
        }

        @Override
        public Iterable<? extends Accountable> getChildResources() {
            List<Accountable> resources = new ArrayList<>();
            resources.addAll(Accountables.namedAccountables("field", bloomsByFieldName));
            if (delegateFieldsProducer != null) {
                resources.add(Accountables.namedAccountable("delegate", delegateFieldsProducer));
            }
            return Collections.unmodifiableList(resources);
        }

        @Override
        public void checkIntegrity() throws IOException {
            delegateFieldsProducer.checkIntegrity();
            if (version >= BLOOM_CODEC_VERSION_CHECKSUM) {
                CodecUtil.checksumEntireFile(data);
            }
        }

        @Override
        public FieldsProducer getMergeInstance() throws IOException {
            return delegateFieldsProducer.getMergeInstance();
        }
    }

    public static final class BloomFilteredTerms extends FilterLeafReader.FilterTerms {
        private BloomFilter filter;

        public BloomFilteredTerms(Terms terms, BloomFilter filter) {
            super(terms);
            this.filter = filter;
        }

        public BloomFilter getFilter() {
            return filter;
        }

        @Override
        public TermsEnum iterator(TermsEnum reuse) throws IOException {
            TermsEnum result;
            if ((reuse != null) && (reuse instanceof BloomFilteredTermsEnum)) {
                // recycle the existing BloomFilteredTermsEnum by asking the delegate
                // to recycle its contained TermsEnum
                BloomFilteredTermsEnum bfte = (BloomFilteredTermsEnum) reuse;
                if (bfte.filter == filter) {
                    bfte.reset(this.in);
                    return bfte;
                }
                reuse = bfte.reuse;
            }
            // We have been handed something we cannot reuse (either null, wrong
            // class or wrong filter) so allocate a new object
            result = new BloomFilteredTermsEnum(this.in, reuse, filter);
            return result;
        }
    }

    static final class BloomFilteredTermsEnum extends TermsEnum {

        private Terms delegateTerms;
        private TermsEnum delegateTermsEnum;
        private TermsEnum reuse;
        private BloomFilter filter;

        public BloomFilteredTermsEnum(Terms other, TermsEnum reuse, BloomFilter filter) {
            this.delegateTerms = other;
            this.reuse = reuse;
            this.filter = filter;
        }

        void reset(Terms others) {
            reuse = this.delegateTermsEnum;
            this.delegateTermsEnum = null;
            this.delegateTerms = others;
        }

        private TermsEnum getDelegate() throws IOException {
            if (delegateTermsEnum == null) {
                /* pull the iterator only if we really need it -
                 * this can be a relatively heavy operation depending on the
                 * delegate postings format and they underlying directory
                 * (clone IndexInput) */
                delegateTermsEnum = delegateTerms.iterator(reuse);
            }
            return delegateTermsEnum;
        }

        @Override
        public final BytesRef next() throws IOException {
            return getDelegate().next();
        }

        @Override
        public final boolean seekExact(BytesRef text)
                throws IOException {
            // The magical fail-fast speed up that is the entire point of all of
            // this code - save a disk seek if there is a match on an in-memory
            // structure
            // that may occasionally give a false positive but guaranteed no false
            // negatives
            if (!filter.mightContain(text)) {
                return false;
            }
            return getDelegate().seekExact(text);
        }

        @Override
        public final SeekStatus seekCeil(BytesRef text)
                throws IOException {
            return getDelegate().seekCeil(text);
        }

        @Override
        public final void seekExact(long ord) throws IOException {
            getDelegate().seekExact(ord);
        }

        @Override
        public final BytesRef term() throws IOException {
            return getDelegate().term();
        }

        @Override
        public final long ord() throws IOException {
            return getDelegate().ord();
        }

        @Override
        public final int docFreq() throws IOException {
            return getDelegate().docFreq();
        }

        @Override
        public final long totalTermFreq() throws IOException {
            return getDelegate().totalTermFreq();
        }


        @Override
        public DocsAndPositionsEnum docsAndPositions(Bits liveDocs,
                                                     DocsAndPositionsEnum reuse, int flags) throws IOException {
            return getDelegate().docsAndPositions(liveDocs, reuse, flags);
        }

        @Override
        public DocsEnum docs(Bits liveDocs, DocsEnum reuse, int flags)
                throws IOException {
            return getDelegate().docs(liveDocs, reuse, flags);
        }


    }


    final class BloomFilteredFieldsConsumer extends FieldsConsumer {
        private FieldsConsumer delegateFieldsConsumer;
        private Map<FieldInfo, BloomFilter> bloomFilters = new HashMap<>();
        private SegmentWriteState state;

        // private PostingsFormat delegatePostingsFormat;

        public BloomFilteredFieldsConsumer(FieldsConsumer fieldsConsumer,
                                           SegmentWriteState state, PostingsFormat delegatePostingsFormat) {
            this.delegateFieldsConsumer = fieldsConsumer;
            // this.delegatePostingsFormat=delegatePostingsFormat;
            this.state = state;
        }

        // for internal use only
        FieldsConsumer getDelegate() {
            return delegateFieldsConsumer;
        }


        @Override
        public void write(Fields fields) throws IOException {

            // Delegate must write first: it may have opened files
            // on creating the class
            // (e.g. Lucene41PostingsConsumer), and write() will
            // close them; alternatively, if we delayed pulling
            // the fields consumer until here, we could do it
            // afterwards:
            delegateFieldsConsumer.write(fields);

            for(String field : fields) {
                Terms terms = fields.terms(field);
                if (terms == null) {
                    continue;
                }
                FieldInfo fieldInfo = state.fieldInfos.fieldInfo(field);
                TermsEnum termsEnum = terms.iterator(null);

                BloomFilter bloomFilter = null;

                DocsEnum docsEnum = null;
                while (true) {
                    BytesRef term = termsEnum.next();
                    if (term == null) {
                        break;
                    }
                    if (bloomFilter == null) {
                        bloomFilter = bloomFilterFactory.createFilter(state.segmentInfo.getDocCount());
                        assert bloomFilters.containsKey(field) == false;
                        bloomFilters.put(fieldInfo, bloomFilter);
                    }
                    // Make sure there's at least one doc for this term:
                    docsEnum = termsEnum.docs(null, docsEnum, 0);
                    if (docsEnum.nextDoc() != DocsEnum.NO_MORE_DOCS) {
                        bloomFilter.put(term);
                    }
                }
            }
        }

        @Override
        public void close() throws IOException {
            delegateFieldsConsumer.close();
            // Now we are done accumulating values for these fields
            List<Entry<FieldInfo, BloomFilter>> nonSaturatedBlooms = new ArrayList<>();

            for (Entry<FieldInfo, BloomFilter> entry : bloomFilters.entrySet()) {
                nonSaturatedBlooms.add(entry);
            }
            String bloomFileName = IndexFileNames.segmentFileName(
                    state.segmentInfo.name, state.segmentSuffix, BLOOM_EXTENSION);
            IndexOutput bloomOutput = null;
            try {
                bloomOutput = state.directory
                        .createOutput(bloomFileName, state.context);
                CodecUtil.writeHeader(bloomOutput, BLOOM_CODEC_NAME,
                        BLOOM_CODEC_VERSION_CURRENT);
                // remember the name of the postings format we will delegate to
                bloomOutput.writeString(delegatePostingsFormat.getName());

                // First field in the output file is the number of fields+blooms saved
                bloomOutput.writeInt(nonSaturatedBlooms.size());
                for (Entry<FieldInfo, BloomFilter> entry : nonSaturatedBlooms) {
                    FieldInfo fieldInfo = entry.getKey();
                    BloomFilter bloomFilter = entry.getValue();
                    bloomOutput.writeInt(fieldInfo.number);
                    saveAppropriatelySizedBloomFilter(bloomOutput, bloomFilter, fieldInfo);
                }
                CodecUtil.writeFooter(bloomOutput);
            } finally {
                IOUtils.close(bloomOutput);
            }
            //We are done with large bitsets so no need to keep them hanging around
            bloomFilters.clear();
        }

        private void saveAppropriatelySizedBloomFilter(IndexOutput bloomOutput,
                                                       BloomFilter bloomFilter, FieldInfo fieldInfo) throws IOException {
            BloomFilter.serilaize(bloomFilter, bloomOutput);
        }

    }
}
TOP

Related Classes of org.elasticsearch.index.codec.postingsformat.BloomFilterPostingsFormat

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.