/*******************************************************************************
* Copyright (c) 2013, Salesforce.com, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* Neither the name of Salesforce.com nor the names of its contributors may
* be used to endorse or promote products derived from this software without
* specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
******************************************************************************/
package com.salesforce.phoenix.expression.aggregator;
import java.io.ByteArrayOutputStream;
import java.io.OutputStream;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.io.hfile.Compression;
import org.apache.hadoop.hbase.io.hfile.Compression.Algorithm;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.io.compress.Compressor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.salesforce.hbase.index.util.ImmutableBytesPtr;
import com.salesforce.phoenix.query.QueryServices;
import com.salesforce.phoenix.query.QueryServicesOptions;
import com.salesforce.phoenix.schema.PDataType;
import com.salesforce.phoenix.schema.tuple.Tuple;
import com.salesforce.phoenix.util.ByteUtil;
import com.salesforce.phoenix.util.SizedUtil;
/**
 * Server-side aggregator that aggregates data and tracks every distinct value seen,
 * together with the number of occurrences of each.
 *
 * @author anoopsjohn
 * @since 1.2.1
 */
public class DistinctValueWithCountServerAggregator extends BaseAggregator {
private static final Logger LOG = LoggerFactory.getLogger(DistinctValueWithCountServerAggregator.class);
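    // Estimate of the number of distinct values, used to approximate the map's heap
    // size before any data has been aggregated. See countMapHeapSize().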
public static final int DEFAULT_ESTIMATED_DISTINCT_VALUES = 10000;
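    // First byte of the serialized payload: COMPRESS_MARKER (1) when the remaining
    // bytes are Snappy-compressed, 0 when they are not.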
public static final byte[] COMPRESS_MARKER = new byte[] { (byte)1 };
public static final Algorithm COMPRESS_ALGO = Compression.Algorithm.SNAPPY;
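    // Serialized maps larger than this many bytes are compressed before being returned.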
private int compressThreshold;
private byte[] buffer = null;
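    // Occurrence count of each distinct value, keyed by the value's bytes.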
private Map<ImmutableBytesPtr, Integer> valueVsCount = new HashMap<ImmutableBytesPtr, Integer>();
public DistinctValueWithCountServerAggregator(Configuration conf) {
super(null);
compressThreshold = conf.getInt(QueryServices.DISTINCT_VALUE_COMPRESS_THRESHOLD_ATTRIB,
QueryServicesOptions.DEFAULT_DISTINCT_VALUE_COMPRESS_THRESHOLD);
}
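    // Seeds this aggregator with counts already gathered by a client-side aggregator,
    // e.g. when partially aggregated results are aggregated again on the server.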
public DistinctValueWithCountServerAggregator(Configuration conf, DistinctValueWithCountClientAggregator clientAgg) {
this(conf);
valueVsCount = clientAgg.valueVsCount;
}
@Override
    public void aggregate(Tuple tuple, ImmutableBytesWritable ptr) {
        // Note: the key wraps the bytes ptr currently points at without copying them.
        ImmutableBytesPtr key = new ImmutableBytesPtr(ptr.get(), ptr.getOffset(), ptr.getLength());
        Integer count = this.valueVsCount.get(key);
        if (count == null) {
            this.valueVsCount.put(key, 1);
        } else {
            this.valueVsCount.put(key, count + 1);
        }
}
@Override
public boolean isNullable() {
return false;
}
@Override
public boolean evaluate(Tuple tuple, ImmutableBytesWritable ptr) {
        // This serializes the Map. The format is as follows:
        // Marker byte (1 byte, see COMPRESS_MARKER) +
        // Map size (VInt, i.e. 1 to 5 bytes) +
        // ( key length [VInt, i.e. 1 to 5 bytes] + key bytes + count [VInt, i.e. 1 to 5 bytes] )*
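        // For example, two distinct single-byte values 0x0a (seen 3 times) and 0x0b
        // (seen once) serialize, after the marker byte, as:
        //   0x02 | 0x01 0x0a 0x03 | 0x01 0x0b 0x01
        //   (map size | key length, key, count | key length, key, count)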
int serializationSize = countMapSerializationSize();
buffer = new byte[serializationSize];
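        // Byte 0 is the compress marker; it stays 0 when the payload is left
        // uncompressed, so serialization of the map starts at offset 1.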
int offset = 1;
offset += ByteUtil.vintToBytes(buffer, offset, this.valueVsCount.size());
for (Entry<ImmutableBytesPtr, Integer> entry : this.valueVsCount.entrySet()) {
ImmutableBytesPtr key = entry.getKey();
offset += ByteUtil.vintToBytes(buffer, offset, key.getLength());
System.arraycopy(key.get(), key.getOffset(), buffer, offset, key.getLength());
offset += key.getLength();
offset += ByteUtil.vintToBytes(buffer, offset, entry.getValue().intValue());
}
        if (serializationSize > compressThreshold) {
            // The serialized map is above the threshold: Snappy-compress it, prefixed
            // with the marker byte so the client knows to decompress.
            ByteArrayOutputStream compressedByteStream = new ByteArrayOutputStream();
            Compressor compressor = COMPRESS_ALGO.getCompressor();
            try {
                compressedByteStream.write(COMPRESS_MARKER);
                OutputStream compressionStream = COMPRESS_ALGO.createCompressionStream(
                        compressedByteStream, compressor, 0);
                compressionStream.write(buffer, 1, offset - 1);
                compressionStream.flush();
                ptr.set(compressedByteStream.toByteArray(), 0, compressedByteStream.size());
                return true;
            } catch (Exception e) {
                // Fall through and return the uncompressed bytes instead.
                LOG.error("Exception while Snappy compression of data.", e);
            } finally {
                // Return the pooled compressor.
                COMPRESS_ALGO.returnCompressor(compressor);
            }
        }
ptr.set(buffer, 0, offset);
return true;
}
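    // For reference, a reader of this format (a sketch of what the client side has to
    // do after checking the marker byte and decompressing if needed; 'in' is assumed
    // to be a DataInput over the remaining payload):
    //   int mapSize = WritableUtils.readVInt(in);
    //   for (int i = 0; i < mapSize; i++) {
    //       int keyLen = WritableUtils.readVInt(in);
    //       // read keyLen bytes for the key, then readVInt(in) for its count
    //   }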
    // The number of bytes required to serialize the count map: one byte for the
    // compress marker, a vint for the number of entries, and per entry a vint key
    // length, the key bytes and a vint count. WritableUtils.getVIntSize() matches the
    // vint encoding written by ByteUtil.vintToBytes(), so the size is exact and even
    // very large int values cannot overflow the buffer.
    private int countMapSerializationSize() {
        int size = 1 + WritableUtils.getVIntSize(this.valueVsCount.size());
        for (Entry<ImmutableBytesPtr, Integer> entry : this.valueVsCount.entrySet()) {
            ImmutableBytesPtr key = entry.getKey();
            size += WritableUtils.getVIntSize(key.getLength()) + key.getLength()
                    + WritableUtils.getVIntSize(entry.getValue().intValue());
        }
        return size;
    }
    // The heap size taken by the count map.
    private int countMapHeapSize() {
        int size = 0;
        if (this.valueVsCount.size() > 0) {
            for (ImmutableBytesPtr key : this.valueVsCount.keySet()) {
                size += SizedUtil.MAP_ENTRY_SIZE + // entry overhead
                        key.getLength() + SizedUtil.ARRAY_SIZE + // key bytes
                        Bytes.SIZEOF_INT; // count
            }
        } else {
            // When getSize() is called before any data has been aggregated the map is
            // empty, so we cannot tell the exact heap need. Approximate it using an
            // estimated number of distinct values.
            size = SizedUtil.sizeOfMap(DEFAULT_ESTIMATED_DISTINCT_VALUES,
                    SizedUtil.IMMUTABLE_BYTES_PTR_SIZE, Bytes.SIZEOF_INT);
        }
        return size;
    }
@Override
public final PDataType getDataType() {
return PDataType.VARBINARY;
}
@Override
public void reset() {
valueVsCount = new HashMap<ImmutableBytesPtr, Integer>();
buffer = null;
super.reset();
}
@Override
public String toString() {
return "DISTINCT VALUE vs COUNT";
}
@Override
public int getSize() {
        // TODO make this size correct.
        // This is called when the scanner is opened, before any rows have been seen,
        // so we cannot yet tell the exact size of the map. Aggregators reads the size
        // of every aggregator up front and caches it for later use, and that cached
        // size is used by the grouped unordered scan. Do we need changes in that
        // calculation?
return super.getSize() + SizedUtil.ARRAY_SIZE + countMapHeapSize();
}
}