Package com.pearson.entech.elasticsearch.search.facet.approx.date.internal

Source Code of com.pearson.entech.elasticsearch.search.facet.approx.date.internal.CountThenEstimateBytes

package com.pearson.entech.elasticsearch.search.facet.approx.date.internal;

// Based on CountThenEstimate.java from ClearSpring's stream-lib package

import static com.pearson.entech.elasticsearch.plugin.approx.BytesRefUtils.process;

import java.io.ByteArrayInputStream;
import java.io.Externalizable;
import java.io.IOException;
import java.io.ObjectInput;
import java.io.ObjectInputStream;
import java.io.ObjectOutput;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.apache.lucene.codecs.bloom.MurmurHash2;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefHash;

import com.clearspring.analytics.stream.cardinality.AdaptiveCounting;
import com.clearspring.analytics.stream.cardinality.CardinalityMergeException;
import com.clearspring.analytics.stream.cardinality.HyperLogLog;
import com.clearspring.analytics.stream.cardinality.HyperLogLogPlus;
import com.clearspring.analytics.stream.cardinality.ICardinality;
import com.clearspring.analytics.stream.cardinality.LinearCounting;
import com.clearspring.analytics.stream.cardinality.LogLog;
import com.clearspring.analytics.util.ExternalizableUtil;
import com.clearspring.analytics.util.IBuilder;
import com.pearson.entech.elasticsearch.plugin.approx.BytesRefUtils.Procedure;

/**
* Exact -> Estimator cardinality counting
* <p/>
* <p>
* Avoids allocating a large block of memory for cardinality estimation until
* a specified "tipping point" cardinality is reached.
* </p>
* <p/>
* Currently supports serialization with LinearCounting or AdaptiveCounting
*/
public class CountThenEstimateBytes implements ICardinality, Externalizable
{

    /**
     *  Hash function used when adding bytesrefs to the estimator, not used in exact counting.
     */
    private static final MurmurHash2 __luceneMurmurHash = MurmurHash2.INSTANCE;

    /** Linear Counting constant for serialization. */
    protected final static byte LC = 1;

    /** Adaptive Counting constant for serialization. */
    protected final static byte AC = 2;

    /** HyperLogLog constant for serialization. */
    protected final static byte HLC = 3;

    /** LogLog constant for serialization. */
    protected final static byte LLC = 4;

    /** HyperLogLog Plus constant for serialization. */
    protected final static byte HLPC = 5;

    /**
     * Cardinality after which exact counting gives way to estimation.
     */
    private int _tippingPoint;

    /**
     * True after switching to estimation.
     */
    protected boolean _tipped = false;

    /**
     * True after compacting the counter storage for exact cardinality.
     * You can't add new data after this has been done, so an exception will be thrown.
     */
    protected boolean _compacted = false;

    /**
     * Factory for instantiating estimator after the tipping point is reached
     */
    protected IBuilder<ICardinality> _builder;

    /**
     * Cardinality estimator: null until tipping point is reached
     */
    protected ICardinality _estimator;

    /**
     * Cardinality counter: null after tipping point is reached
     */
    protected BytesRefHash _counter;

    /**
     * Size of the longest BytesRef that the counter object has seen
     */
    protected int _longestBytesRefSize = 0;

    /**
     * Create a new count-then-estimate cardinality object with the tipping point provided.
     * After this has been reached, the provided builder will be used to create an estimator.
     *
     * @param tippingPoint Cardinality at which exact counting gives way to estimation
     * @param builder      Factory for instantiating estimator after the tipping point is reached
     */
    public CountThenEstimateBytes(final int tippingPoint, final IBuilder<ICardinality> builder) {
        _tippingPoint = tippingPoint;
        _builder = builder;
        if(tippingPoint == 0) {
            _counter = null;
            _estimator = builder.build();
            _tipped = true;
        } else {
            _counter = new BytesRefHash();
        }
    }

    /**
     * Deserialization constructor. Creates a new object from an array of bytes.
     *
     * @param bytes
     * @param tippingPoint Cardinality at which exact counting gives way to estimation
     * @param builder for _estimator to use if there are too many bytes for our liking
     * @throws IOException
     * @throws ClassNotFoundException
     */
    public CountThenEstimateBytes(final byte[] bytes, final int tippingPoint,
            final IBuilder<ICardinality> builder) throws IOException, ClassNotFoundException {
        this(tippingPoint, builder);
        readExternal(new ObjectInputStream(new ByteArrayInputStream(bytes)));

        if(!_tipped && _counter.size() > tippingPoint)
            tip();
    }

    @Override
    public long cardinality() {
        if(_tipped) {
            return _estimator.cardinality();
        }
        return _counter.size();
    }

    @Override
    public boolean offerHashed(final long hashedLong) {
        throw new UnsupportedOperationException();
    }

    @Override
    public boolean offerHashed(final int hashedInt) {
        throw new UnsupportedOperationException();
    }

    /**
     * Add a BytesRef to the underlying counter or estimator, tipping into approx
     * mode if the tipping point has been reached. Calling this when in exact mode,
     * but after merging or seriaizing this CountThenEstimateBytes instance, will
     * result in an IllegalStateException. The supplied BytesRef does not need to
     * be made safe by the calling class, as it will be copied on entry.
     *
     * @param ref the BytesRef to add
     * @return true if an equivalent string of bytes had not previously been offered
     */
    public boolean offerBytesRef(final BytesRef ref) {
        boolean modified = false;
        if(_tipped) {
            // The _estimator just needs the hash of the current bytes of the BytesRef
            modified = _estimator.offerHashed(__luceneMurmurHash.hash(ref));
        } else {
            if(_compacted)
                throw new IllegalStateException("Counter has already been compacted -- cannot add new data");
            if(_counter.add(ref) >= 0) {
                modified = true;
                if(_counter.size() > _tippingPoint) {
                    tip();
                } else {
                    if(ref.length > _longestBytesRefSize)
                        _longestBytesRefSize = ref.length;
                }
            }
        }
        return modified;
    }

    @Override
    public boolean offer(final Object o)
    {
        if(o instanceof BytesRef)
            return offerBytesRef((BytesRef) o);
        else
            return offerBytesRef(new BytesRef(o.toString()));
    }

    @Override
    public int sizeof() {
        if(_tipped)
            return _estimator.sizeof();

        return -1;
    }

    /**
     * Returns the tipping point.
     *
     * @return the number of entries at which this instance switched from exact to approx mode
     */
    public int getTippingPoint() {
        return _tippingPoint;
    }

    /**
     * Switch from exact counting to estimation.
     */
    private void tip() {
        if(!_tipped) {
            _estimator = _builder.build();
            process(_counter, new Procedure() {
                @Override
                public void consume(final BytesRef ref) {
                    _estimator.offerHashed(__luceneMurmurHash.hash(ref));
                }
            });
            _counter = null;
            _longestBytesRefSize = 0;
            _builder = null;
            _tipped = true;
            _compacted = true;
        }
    }

    /**
     * Check the tipped status.
     *
     * @return true if this instance is in approx mode
     */
    public boolean tipped() {
        return _tipped;
    }

    @Override
    public byte[] getBytes() throws IOException {
        return ExternalizableUtil.toBytes(this);
    }

    @SuppressWarnings({ "unchecked", "rawtypes" })
    @Override
    public void readExternal(final ObjectInput in) throws IOException, ClassNotFoundException {
        _tipped = in.readBoolean();
        if(_tipped) {

            final byte type = in.readByte();
            final byte[] bytes = new byte[in.readInt()];
            in.readFully(bytes);

            switch(type)
            {
            case LC:
                _estimator = new LinearCounting(bytes);
                break;
            case AC:
                _estimator = new AdaptiveCounting(bytes);
                break;
            case HLC:
                _estimator = HyperLogLog.Builder.build(bytes);
                break;
            case HLPC:
                _estimator = HyperLogLogPlus.Builder.build(bytes);
                break;
            case LLC:
                _estimator = new LinearCounting(bytes);
                break;
            default:
                throw new IOException("Unrecognized estimator type: " + type);
            }

        } else {

            _tippingPoint = in.readInt();
            _builder = (IBuilder) in.readObject();
            final int count = in.readInt();

            assert (count <= _tippingPoint) : String.format("Invalid serialization: count (%d) > _tippingPoint (%d)", count, _tippingPoint);

            // Just in case some muppet tries to deserialize into an already-used counter
            if(_compacted || _counter.size() > 0) {
                _counter.clear();
                _counter.reinit();
                _compacted = false;
            }

            _longestBytesRefSize = in.readInt();
            final byte[] scratch = new byte[_longestBytesRefSize];
            for(int i = 0; i < count; i++) {
                final int length = in.readInt();
                in.read(scratch, 0, length);
                _counter.add(new BytesRef(scratch, 0, length));
            }
        }
    }

    @Override
    public void writeExternal(final ObjectOutput out) throws IOException {
        out.writeBoolean(_tipped);
        if(_tipped) {

            if(_estimator instanceof LinearCounting)
                out.writeByte(LC);
            else if(_estimator instanceof AdaptiveCounting)
                out.writeByte(AC);
            else if(_estimator instanceof HyperLogLog)
                out.writeByte(HLC);
            else if(_estimator instanceof HyperLogLogPlus)
                out.writeByte(HLPC);
            else if(_estimator instanceof LogLog)
                out.writeByte(LLC);
            else
                throw new IOException("Estimator unsupported for serialization: " + _estimator.getClass().getName());

            final byte[] bytes = _estimator.getBytes();
            out.writeInt(bytes.length);
            out.write(bytes);

        } else {

            out.writeInt(_tippingPoint);
            out.writeObject(_builder);
            out.writeInt(_counter.size());
            out.writeInt(_longestBytesRefSize);

            process(_counter, new Procedure() {
                @Override
                public void consume(final BytesRef ref) throws IOException {
                    out.writeInt(ref.length);
                    out.write(ref.bytes, ref.offset, ref.length);
                }
            });
            _compacted = true;

        }
    }

    @Override
    public ICardinality merge(final ICardinality... estimators) throws CardinalityMergeException {
        if(estimators == null)
            return mergeEstimators(this);

        final CountThenEstimateBytes[] all = Arrays.copyOf(estimators, estimators.length + 1, CountThenEstimateBytes[].class);
        all[all.length - 1] = this;
        return mergeEstimators(all);
    }

    /**
     * Merges estimators to produce an estimator for their combined streams.
     *
     * @param estimators the estimators to merge
     * @return a merged estimator, or null if no estimators were provided
     * @throws CardinalityMergeException
     *          if estimators are not mergeable (all must be CountThenEstimateBytes made with the same builder)
     */
    public static CountThenEstimateBytes mergeEstimators(final CountThenEstimateBytes... estimators) throws CardinalityMergeException
    {
        final CountThenEstimateBytes merged;
        final int numEstimators = (estimators == null) ? 0 : estimators.length;
        if(numEstimators > 0) {
            final List<ICardinality> tipped = new ArrayList<ICardinality>(numEstimators);
            final List<CountThenEstimateBytes> untipped = new ArrayList<CountThenEstimateBytes>(numEstimators);

            for(final CountThenEstimateBytes estimator : estimators) {
                if(estimator._tipped)
                    tipped.add(estimator._estimator);
                else
                    untipped.add(estimator);
            }

            final int untippedSize = untipped.size();
            if(untippedSize > 0) {

                merged = untipped.get(0);
                for(int i = 1; i < untippedSize; i++) {
                    final CountThenEstimateBytes cte = untipped.get(i);
                    process(cte._counter, new Procedure() {
                        @Override
                        public void consume(final BytesRef ref) throws Exception {
                            merged.offerBytesRef(ref);
                        }
                    });
                    cte._compacted = true;
                }

            } else {

                merged = new CountThenEstimateBytes(0, new LinearCounting.Builder(1));
                merged.tip();
                merged._estimator = tipped.remove(0);

            }

            if(!tipped.isEmpty()) {
                if(!merged._tipped)
                    merged.tip();

                merged._estimator = merged._estimator.merge(tipped.toArray(new ICardinality[tipped.size()]));
            }

            return merged;

            // TODO we need to make sure that after the final merge, i.e. in the materialize phase, all remaining memory is cleared
        }
        return null;
    }

    /**
     * Exception thrown to indicate that you have asked to merge two incompatible estimators.
     */
    @SuppressWarnings("serial")
    protected static class CountThenEstimateMergeException extends CardinalityMergeException
    {

        /**
         * Create exception.
         * @param message
         */
        public CountThenEstimateMergeException(final String message)
        {
            super(message);
        }

    }

}
TOP

Related Classes of com.pearson.entech.elasticsearch.search.facet.approx.date.internal.CountThenEstimateBytes

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.