/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.cassandra.db.context;
import java.nio.ByteBuffer;
import java.security.MessageDigest;
import java.util.ArrayList;
import java.util.List;
import com.google.common.annotations.VisibleForTesting;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.cassandra.db.TypeSizes;
import org.apache.cassandra.db.compaction.CompactionManager;
import org.apache.cassandra.serializers.MarshalException;
import org.apache.cassandra.utils.*;
/**
* An implementation of a partitioned counter context.
*
* A context is primarily a list of tuples (counter id, clock, count) -- called
* shards, with some shards flagged as global or local (with
* special resolution rules in merge()).
*
* The data structure has two parts:
* a) a header containing the lists of global and local shard indexes in the body
* b) a list of shards -- (counter id, logical clock, count) tuples -- (the so-called 'body' below)
*
* The exact layout is:
*             | header  |   body   |
* context :   |--|------|----------|
*               ^    ^
*               |    list of indices in the body list (2*#elt bytes)
*               #elt in rest of header (2 bytes)
*
* Non-negative indices refer to local shards. Global shard indices are encoded as [idx + Short.MIN_VALUE],
* and are thus always negative.
*
* The body layout being:
*
* body:     |----|----|----|----|----|----|....
*              ^    ^    ^    ^    ^    ^
*              |    | count_1 |    | count_2
*              | clock_1      | clock_2
*         counterid_1    counterid_2
*
* The rules when merging two shards with the same counter id are:
* - global + global = keep the shard with the highest logical clock
* - global + local = keep the global one
* - global + remote = keep the global one
* - local + local = sum counts (and logical clocks)
* - local + remote = keep the local one
* - remote + remote = keep the shard with the highest logical clock
*
* For a detailed description of the meaning of a local shard and why the merging
* rules work this way, see CASSANDRA-1938 - specifically the 1938_discussion
* attachment (doesn't cover global shards, see CASSANDRA-4775 for that).
*/
public class CounterContext implements IContext
{
// Size of the leading header field that stores the number of header elements (read/written as a short).
private static final int HEADER_SIZE_LENGTH = TypeSizes.NATIVE.sizeof(Short.MAX_VALUE);
// Size of one header element, i.e. one shard index (read/written as a short).
private static final int HEADER_ELT_LENGTH = TypeSizes.NATIVE.sizeof(Short.MAX_VALUE);
// Size of a shard's logical clock (read/written as a long).
private static final int CLOCK_LENGTH = TypeSizes.NATIVE.sizeof(Long.MAX_VALUE);
// Size of a shard's count (read/written as a long).
private static final int COUNT_LENGTH = TypeSizes.NATIVE.sizeof(Long.MAX_VALUE);
// Size of one complete shard in the body: counter id, then clock, then count.
private static final int STEP_LENGTH = CounterId.LENGTH + CLOCK_LENGTH + COUNT_LENGTH;
// Used by mergeTie() to warn about shards that share id and clock but differ in count.
private static final Logger logger = LoggerFactory.getLogger(CounterContext.class);
// Holder for the singleton: the instance is created when LazyHolder is
// initialized, which happens on first access through instance().
private static class LazyHolder
{
    private static final CounterContext INSTANCE = new CounterContext();
}

/**
 * @return the shared CounterContext singleton
 */
public static CounterContext instance()
{
    return LazyHolder.INSTANCE;
}
/**
 * Builds a context holding exactly one local shard. The shard uses the id
 * returned by {@code CounterId.getLocalId()} and a logical clock of 1.
 *
 * @param count the count to store in the shard
 * @param allocator the allocator that provides the backing buffer
 * @return the newly created context
 */
public ByteBuffer createLocal(long count, Allocator allocator)
{
    final ContextState single = ContextState.allocate(0, 1, 0, allocator);
    single.writeLocal(CounterId.getLocalId(), 1L, count);
    return single.context;
}
/**
 * Builds a context holding exactly one remote shard.
 *
 * @param id the counter id of the shard
 * @param clock the logical clock of the shard
 * @param count the count of the shard
 * @param allocator the allocator that provides the backing buffer
 * @return the newly created context
 */
public ByteBuffer createRemote(CounterId id, long clock, long count, Allocator allocator)
{
    final ContextState single = ContextState.allocate(0, 0, 1, allocator);
    single.writeRemote(id, clock, count);
    return single.context;
}
/**
 * Computes the header length of {@code context} in bytes: the element-count
 * field plus one slot per header element.
 *
 * @param context a counter context
 * @return the number of bytes between the context's position and its first shard
 */
private static int headerLength(ByteBuffer context)
{
    // The stored count is negative once markLocalToBeCleared() has flagged the context,
    // so only its magnitude tells how many header elements follow.
    int elements = Math.abs(context.getShort(context.position()));
    return HEADER_SIZE_LENGTH + elements * HEADER_ELT_LENGTH;
}
/**
 * Compares two counter ids stored in (possibly different) buffers.
 *
 * @param bb1 buffer holding the first id
 * @param pos1 offset of the first id in {@code bb1}
 * @param bb2 buffer holding the second id
 * @param pos2 offset of the second id in {@code bb2}
 * @return the result of {@code ByteBufferUtil.compareSubArrays} over {@code CounterId.LENGTH} bytes;
 *         callers in this class treat zero as "same id" and the sign as the ordering of the ids
 */
private static int compareId(ByteBuffer bb1, int pos1, ByteBuffer bb2, int pos2)
{
return ByteBufferUtil.compareSubArrays(bb1, pos1, bb2, pos2, CounterId.LENGTH);
}
/**
 * Determine the count relationship between two contexts.
 *
 * EQUAL:        Equal set of nodes and every count is equal.
 * GREATER_THAN: Superset of nodes and every count is equal or greater than its corollary.
 * LESS_THAN:    Subset of nodes and every count is equal or less than its corollary.
 * DISJOINT:     Node sets are not equal and/or counts are not all greater or less than.
 *
 * Strategy: walk both (id-sorted) contexts in parallel and compare the logical clocks
 * of matching shards, like a version vector. Every shard, or pair of shards, pushes the
 * result towards GREATER_THAN or LESS_THAN; seeing both directions yields DISJOINT.
 *
 * @param left counter context.
 * @param right counter context.
 * @return the ContextRelationship between the contexts.
 */
public ContextRelationship diff(ByteBuffer left, ByteBuffer right)
{
    ContextRelationship relationship = ContextRelationship.EQUAL;
    ContextState leftState = ContextState.wrap(left);
    ContextState rightState = ContextState.wrap(right);

    while (leftState.hasRemaining() && rightState.hasRemaining())
    {
        // direction in which the shard(s) under the cursors push the overall relationship
        ContextRelationship observed;

        int idOrder = leftState.compareIdTo(rightState);
        if (idOrder < 0)
        {
            // shard only present on the left: only advance the left context
            leftState.moveToNext();
            observed = ContextRelationship.GREATER_THAN;
        }
        else if (idOrder > 0)
        {
            // shard only present on the right: only advance the right context
            rightState.moveToNext();
            observed = ContextRelationship.LESS_THAN;
        }
        else
        {
            long leftClock = leftState.getClock();
            long rightClock = rightState.getClock();
            long leftCount = leftState.getCount();
            long rightCount = rightState.getCount();

            // advance
            leftState.moveToNext();
            rightState.moveToNext();

            if (leftClock == rightClock)
            {
                // Inconsistent shard (see the corresponding code in merge()). We return DISJOINT in this
                // case so that it will be treated as a difference, allowing read-repair to work.
                if (leftCount != rightCount)
                    return ContextRelationship.DISJOINT;

                // identical shards do not change the relationship
                continue;
            }

            boolean leftWins = (leftClock >= 0 && rightClock > 0 && leftClock > rightClock)
                            || (leftClock < 0 && (rightClock > 0 || leftClock < rightClock));
            observed = leftWins ? ContextRelationship.GREATER_THAN : ContextRelationship.LESS_THAN;
        }

        if (relationship == ContextRelationship.EQUAL)
            relationship = observed;
        else if (relationship != observed)
            return ContextRelationship.DISJOINT;
    }

    // check final lengths: leftover shards on one side count as a push in that side's favour
    ContextRelationship trailing;
    if (leftState.hasRemaining())
        trailing = ContextRelationship.GREATER_THAN;
    else if (rightState.hasRemaining())
        trailing = ContextRelationship.LESS_THAN;
    else
        return relationship;

    if (relationship == ContextRelationship.EQUAL)
        return trailing;
    return relationship == trailing ? relationship : ContextRelationship.DISJOINT;
}
/**
 * Return a context w/ an aggregated count for each counter id.
 *
 * This entry point performs a first pass over both contexts to count how many
 * global, local and remote shards the result will hold, allocates a context of
 * that size, then delegates the actual copying to the private merge().
 *
 * @param left counter context.
 * @param right counter context.
 * @param allocator An allocator for the merged value.
 * @return the merged context.
 */
public ByteBuffer merge(ByteBuffer left, ByteBuffer right, Allocator allocator)
{
    int globalCount = 0;
    int localCount = 0;
    int remoteCount = 0;

    ContextState leftState = ContextState.wrap(left);
    ContextState rightState = ContextState.wrap(right);

    while (leftState.hasRemaining() || rightState.hasRemaining())
    {
        // Decide which side(s) contribute the next merged shard. Once one side is
        // exhausted, the other side is consumed on its own.
        int cmp;
        if (!rightState.hasRemaining())
            cmp = -1;
        else if (!leftState.hasRemaining())
            cmp = 1;
        else
            cmp = leftState.compareIdTo(rightState);

        boolean useLeft = cmp <= 0;
        boolean useRight = cmp >= 0;

        // global takes precedence over local, which takes precedence over remote
        if ((useLeft && leftState.isGlobal()) || (useRight && rightState.isGlobal()))
            globalCount++;
        else if ((useLeft && leftState.isLocal()) || (useRight && rightState.isLocal()))
            localCount++;
        else
            remoteCount++;

        if (useLeft)
            leftState.moveToNext();
        if (useRight)
            rightState.moveToNext();
    }

    // rewind both cursors for the copying pass
    leftState.reset();
    rightState.reset();

    ContextState mergedState = ContextState.allocate(globalCount, localCount, remoteCount, allocator);
    return merge(mergedState, leftState, rightState);
}
/**
 * Copying pass of the merge: walks both contexts in id order and writes the
 * resulting shards into {@code mergedState}, which must already be large enough.
 *
 * @param mergedState destination state, positioned on its first shard
 * @param leftState left source state, positioned on its first shard
 * @param rightState right source state, positioned on its first shard
 * @return the context backing {@code mergedState}
 */
private ByteBuffer merge(ContextState mergedState, ContextState leftState, ContextState rightState)
{
    while (leftState.hasRemaining() && rightState.hasRemaining())
    {
        int order = leftState.compareIdTo(rightState);
        if (order < 0)
        {
            // id only on the left
            leftState.copyTo(mergedState);
            leftState.moveToNext();
        }
        else if (order > 0)
        {
            // id only on the right
            rightState.copyTo(mergedState);
            rightState.moveToNext();
        }
        else
        {
            // same id on both sides: apply the resolution rules
            mergeTie(mergedState, leftState, rightState);
            rightState.moveToNext();
            leftState.moveToNext();
        }
    }

    // at most one of the two sources still has shards; copy them verbatim
    for (; leftState.hasRemaining(); leftState.moveToNext())
        leftState.copyTo(mergedState);
    for (; rightState.hasRemaining(); rightState.moveToNext())
        rightState.copyTo(mergedState);

    return mergedState.context;
}
/**
 * Resolves two shards that carry the same counter id and appends the winner
 * (or, for two local shards, their sum) to {@code mergedState}.
 *
 * Neither source state is advanced here; the caller does it.
 *
 * @param mergedState destination state
 * @param leftState left source, positioned on the tied shard
 * @param rightState right source, positioned on the tied shard
 */
private void mergeTie(ContextState mergedState, ContextState leftState, ContextState rightState)
{
    boolean leftGlobal = leftState.isGlobal();
    boolean rightGlobal = rightState.isGlobal();

    if (leftGlobal && rightGlobal)
    {
        long leftClock = leftState.getClock();
        long rightClock = rightState.getClock();

        if (leftClock != rightClock)
        {
            // global + global: highest logical clock wins
            (leftClock > rightClock ? leftState : rightState).copyTo(mergedState);
            return;
        }

        long leftCount = leftState.getCount();
        long rightCount = rightState.getCount();

        // Can happen if an sstable gets lost and disk failure policy is set to 'best effort'
        if (leftCount != rightCount && CompactionManager.isCompactionManager.get())
        {
            logger.warn("invalid global counter shard detected; ({}, {}, {}) and ({}, {}, {}) differ only in "
                        + "count; will pick highest to self-heal on compaction",
                        leftState.getCounterId(), leftClock, leftCount,
                        rightState.getCounterId(), rightClock, rightCount);
        }

        (leftCount > rightCount ? leftState : rightState).copyTo(mergedState);
        return;
    }

    if (leftGlobal || rightGlobal)
    {
        // only one is global - keep that one
        (leftGlobal ? leftState : rightState).copyTo(mergedState);
        return;
    }

    boolean leftLocal = leftState.isLocal();
    boolean rightLocal = rightState.isLocal();

    if (leftLocal && rightLocal)
    {
        // both local - sum
        long clock = leftState.getClock() + rightState.getClock();
        long count = leftState.getCount() + rightState.getCount();
        mergedState.writeLocal(leftState.getCounterId(), clock, count);
        return;
    }

    if (leftLocal || rightLocal)
    {
        // only one is local - keep that one
        (leftLocal ? leftState : rightState).copyTo(mergedState);
        return;
    }

    // both are remote shards
    long leftClock = leftState.getClock();
    long rightClock = rightState.getClock();

    if (leftClock == rightClock)
    {
        // We should never see non-local shards w/ same id+clock but different counts. However, if we do
        // we should "heal" the problem by being deterministic in our selection of shard - and
        // log the occurrence so that the operator will know something is wrong.
        long leftCount = leftState.getCount();
        long rightCount = rightState.getCount();

        if (leftCount != rightCount && CompactionManager.isCompactionManager.get())
        {
            logger.warn("invalid remote counter shard detected; ({}, {}, {}) and ({}, {}, {}) differ only in "
                        + "count; will pick highest to self-heal on compaction",
                        leftState.getCounterId(), leftClock, leftCount,
                        rightState.getCounterId(), rightClock, rightCount);
        }

        (leftCount > rightCount ? leftState : rightState).copyTo(mergedState);
        return;
    }

    boolean keepLeft = (leftClock >= 0 && rightClock > 0 && leftClock >= rightClock)
                    || (leftClock < 0 && (rightClock > 0 || leftClock < rightClock));
    (keepLeft ? leftState : rightState).copyTo(mergedState);
}
/**
 * Human-readable String from context.
 *
 * Shards are rendered as {id, clock, count}, separated by commas and enclosed
 * in square brackets; a global shard is suffixed with '$', a local one with '*'.
 *
 * @param context counter context.
 * @return a human-readable String of the context.
 */
public String toString(ByteBuffer context)
{
    StringBuilder out = new StringBuilder();
    out.append("[");
    for (ContextState shard = ContextState.wrap(context); shard.hasRemaining(); shard.moveToNext())
    {
        // every shard but the first is preceded by a separator
        if (shard.getElementIndex() > 0)
            out.append(",");

        out.append("{")
           .append(shard.getCounterId().toString()).append(", ")
           .append(shard.getClock()).append(", ")
           .append(shard.getCount())
           .append("}");

        if (shard.isGlobal())
            out.append("$");
        else if (shard.isLocal())
            out.append("*");
    }
    return out.append("]").toString();
}
/**
 * Returns the aggregated count across all counter ids.
 *
 * @param context a counter context
 * @return the aggregated count represented by {@code context}
 */
public long total(ByteBuffer context)
{
    // we could use a ContextState but it is easy enough that we avoid the object creation
    final int countOffset = CounterId.LENGTH + CLOCK_LENGTH; // the count follows the id and the clock
    final int end = context.limit();

    long sum = 0L;
    int shardStart = context.position() + headerLength(context);
    while (shardStart < end)
    {
        sum += context.getLong(shardStart + countOffset);
        shardStart += STEP_LENGTH;
    }
    return sum;
}
/**
 * Tells whether {@code context} has been flagged by markLocalToBeCleared().
 *
 * @param context a counter context
 * @return true if the header element count read at the context's position is negative
 */
public boolean shouldClearLocal(ByteBuffer context)
{
    // #elt being negative means we have to clean local shards.
    short elementCount = context.getShort(context.position());
    return elementCount < 0;
}
/**
 * Mark context to delete local references afterward.
 * Marking is done by negating #elt, which preserves the header length
 * and the #elt magnitude so that all local refs can be cleared later.
 *
 * The input is returned untouched when it is already marked, has no header
 * elements, or has no local shard; otherwise a marked copy is returned.
 *
 * @param context a counter context
 * @return context that marked to delete local refs
 */
public ByteBuffer markLocalToBeCleared(ByteBuffer context)
{
    final int start = context.position();
    final short count = context.getShort(start);
    if (count <= 0)
        return context; // already marked or all are remote.

    // skip over the leading global (negative) header elements, looking for a local one
    int scanned = 0;
    while (scanned < count && context.getShort(start + HEADER_SIZE_LENGTH + scanned * HEADER_ELT_LENGTH) < 0)
        scanned++;

    if (scanned == count)
        return context; // all shards are global or remote.

    ByteBuffer marked = ByteBuffer.allocate(context.remaining());
    marked.putShort(marked.position(), (short) -count);
    // everything after the element count is copied verbatim
    ByteBufferUtil.arrayCopy(context,
                             start + HEADER_SIZE_LENGTH,
                             marked,
                             marked.position() + HEADER_SIZE_LENGTH,
                             context.remaining() - HEADER_SIZE_LENGTH);
    return marked;
}
/**
 * Remove all the local of a context (but keep global).
 *
 * The input is returned untouched when it has no header elements or no local
 * shard; otherwise a copy with a shorter header (global elements only) and the
 * same body is returned.
 *
 * @param context a counter context
 * @return a version of {@code context} where no shards are local.
 */
public ByteBuffer clearAllLocal(ByteBuffer context)
{
    final int start = context.position();
    final int count = Math.abs(context.getShort(start));
    if (count == 0)
        return context; // no local or global shards present.

    // collect the header elements that denote global shards (the negative ones)
    short[] globalElts = new short[count];
    int globals = 0;
    for (int i = 0; i < count; i++)
    {
        short elt = context.getShort(start + HEADER_SIZE_LENGTH + i * HEADER_ELT_LENGTH);
        if (elt < 0)
            globalElts[globals++] = elt;
    }

    if (globals == count)
        return context; // no local shards detected.

    // allocate a smaller BB for the cleared context - with no local header elts.
    int droppedHeaderBytes = (count - globals) * HEADER_ELT_LENGTH;
    ByteBuffer cleared = ByteBuffer.allocate(context.remaining() - droppedHeaderBytes);
    final int clearedStart = cleared.position();

    cleared.putShort(clearedStart, (short) globals);
    for (int i = 0; i < globals; i++)
        cleared.putShort(clearedStart + HEADER_SIZE_LENGTH + i * HEADER_ELT_LENGTH, globalElts[i]);

    // the body is copied unchanged right after the new, shorter header
    int origHeaderLength = headerLength(context);
    ByteBufferUtil.arrayCopy(context,
                             start + origHeaderLength,
                             cleared,
                             clearedStart + headerLength(cleared),
                             context.remaining() - origHeaderLength);
    return cleared;
}
/**
 * Checks that {@code context} has a plausible size for a counter context:
 * it must be large enough to hold the header announced by its element count,
 * and the remaining body must be a whole number of shards.
 *
 * Only sizes are checked; the content of the header and of the shards is not inspected.
 *
 * @param context the candidate counter context
 * @throws MarshalException if the context is too short for its header, or if its
 *         body length is not a multiple of the shard size
 */
public void validateContext(ByteBuffer context) throws MarshalException
{
    // The element count must be readable before the header length can be computed; without this
    // check a too-short buffer fails in getShort() with an IndexOutOfBoundsException.
    if (context.remaining() < HEADER_SIZE_LENGTH)
        throw new MarshalException("Invalid size for a counter context");

    int bodyLength = context.remaining() - headerLength(context);

    // A negative body length means the header announces more elements than the buffer holds.
    // It has to be rejected explicitly: % keeps the sign of the dividend, so a negative
    // multiple of STEP_LENGTH would otherwise be accepted.
    if (bodyLength < 0 || bodyLength % STEP_LENGTH != 0)
        throw new MarshalException("Invalid size for a counter context");
}
/**
 * Update a MessageDigest with the content of a context.
 * Note that this skips the header entirely since the header information
 * has local meaning only, while digests are meant for comparison across
 * nodes. This means in particular that we always have:
 *  updateDigest(ctx) == updateDigest(clearAllLocal(ctx))
 *
 * @param message the digest to update
 * @param context a counter context; its position and limit are left untouched
 */
public void updateDigest(MessageDigest message, ByteBuffer context)
{
    int bodyStart = context.position() + headerLength(context);
    // work on a duplicate so that moving the position does not affect the caller's buffer
    ByteBuffer body = context.duplicate();
    body.position(bodyStart);
    message.update(body);
}
/**
 * Checks whether the provided context has a count for the provided
 * CounterId.
 *
 * TODO: since the context is sorted, we could implement a binary search.
 * This is however not called in any critical path and contexts will be
 * fairly small so it doesn't matter much.
 *
 * @param context a counter context
 * @param id the counter id to look for
 * @return true if one of the shards of {@code context} carries {@code id}
 */
public boolean hasCounterId(ByteBuffer context, CounterId id)
{
    // we could use a ContextState but it is easy enough that we avoid the object creation
    int shardStart = context.position() + headerLength(context);
    while (shardStart < context.limit())
    {
        // the counter id is the first field of each shard
        if (id.equals(CounterId.wrap(context, shardStart)))
            return true;
        shardStart += STEP_LENGTH;
    }
    return false;
}
/**
 * Helper class to work on contexts (works by iterating over them).
 * A context being abstractly a list of tuples (counterid, clock, count), a
 * ContextState encapsulates a context and a position on one of the tuples.
 * It also allows creating a new context iteratively.
 *
 * Note: this is intrinsically a private class intended for use by the
 * methods of CounterContext only. It is however public because it is
 * convenient to create handcrafted context for unit tests.
 */
public static class ContextState
{
    /** The wrapped context. */
    public final ByteBuffer context;
    /** Header length of {@link #context} in bytes, computed when the state is created. */
    public final int headerLength;

    // Both offsets are relative to context.position().
    private int headerOffset; // header element that would describe the current shard
    private int bodyOffset;   // start of the current shard
    private boolean currentIsGlobal;
    private boolean currentIsLocal;

    private ContextState(ByteBuffer context)
    {
        this.context = context;
        this.headerLength = headerLength(context);
        rewind();
    }

    /**
     * @return a state positioned on the first shard of {@code context}
     */
    public static ContextState wrap(ByteBuffer context)
    {
        return new ContextState(context);
    }

    /**
     * Allocate a new context big enough for globalCount + localCount + remoteCount elements
     * and return the initial corresponding ContextState.
     */
    public static ContextState allocate(int globalCount, int localCount, int remoteCount, Allocator allocator)
    {
        // only global and local shards get a header element
        int flaggedShards = globalCount + localCount;
        int headerBytes = HEADER_SIZE_LENGTH + flaggedShards * HEADER_ELT_LENGTH;
        int bodyBytes = (flaggedShards + remoteCount) * STEP_LENGTH;

        ByteBuffer buffer = allocator.allocate(headerBytes + bodyBytes);
        buffer.putShort(buffer.position(), (short) flaggedShards);
        return ContextState.wrap(buffer);
    }

    public boolean isGlobal()
    {
        return currentIsGlobal;
    }

    public boolean isLocal()
    {
        return currentIsLocal;
    }

    public boolean isRemote()
    {
        return !currentIsGlobal && !currentIsLocal;
    }

    // Re-reads the global/local flags of the current shard from the header.
    private void updateIsGlobalOrLocal()
    {
        if (headerOffset >= headerLength)
        {
            // header exhausted: every shard from here on is remote
            currentIsGlobal = false;
            currentIsLocal = false;
            return;
        }

        short headerElt = context.getShort(context.position() + headerOffset);
        int index = getElementIndex();
        // global shards are encoded as index + Short.MIN_VALUE, local ones as the bare index
        currentIsGlobal = headerElt == index + Short.MIN_VALUE;
        currentIsLocal = headerElt == index;
    }

    /**
     * @return true while the cursor is positioned on a shard, false once past the last one
     */
    public boolean hasRemaining()
    {
        return bodyOffset < context.remaining();
    }

    /**
     * Moves the cursor to the next shard. The header cursor only moves when the shard
     * being left is global or local, since remote shards have no header element.
     */
    public void moveToNext()
    {
        boolean leavingFlaggedShard = currentIsGlobal || currentIsLocal;
        bodyOffset += STEP_LENGTH;
        if (leavingFlaggedShard)
            headerOffset += HEADER_ELT_LENGTH;
        updateIsGlobalOrLocal();
    }

    /**
     * Copies the current shard, with its global/local flag, at the current position of
     * {@code other} and advances {@code other}. This state is not advanced.
     */
    public void copyTo(ContextState other)
    {
        ByteBufferUtil.arrayCopy(context,
                                 context.position() + bodyOffset,
                                 other.context,
                                 other.context.position() + other.bodyOffset,
                                 STEP_LENGTH);
        other.flagCurrentAndAdvance(currentIsGlobal, currentIsLocal);
    }

    /**
     * Compares the counter id of the current shard with the one of {@code other}'s current shard.
     */
    public int compareIdTo(ContextState other)
    {
        int ownIdStart = context.position() + bodyOffset;
        int otherIdStart = other.context.position() + other.bodyOffset;
        return compareId(context, ownIdStart, other.context, otherIdStart);
    }

    /**
     * Moves the cursor back to the first shard.
     */
    public void reset()
    {
        rewind();
    }

    // Positions both cursors on the first shard; shared by the constructor and reset().
    private void rewind()
    {
        headerOffset = HEADER_SIZE_LENGTH;
        bodyOffset = headerLength;
        updateIsGlobalOrLocal();
    }

    /**
     * @return the zero-based index of the current shard in the body
     */
    public int getElementIndex()
    {
        return (bodyOffset - headerLength) / STEP_LENGTH;
    }

    public CounterId getCounterId()
    {
        return CounterId.wrap(context, context.position() + bodyOffset);
    }

    public long getClock()
    {
        // the clock follows the counter id
        return context.getLong(context.position() + bodyOffset + CounterId.LENGTH);
    }

    public long getCount()
    {
        // the count follows the counter id and the clock
        return context.getLong(context.position() + bodyOffset + CounterId.LENGTH + CLOCK_LENGTH);
    }

    // In 2.0 only used by the unit tests.
    public void writeGlobal(CounterId id, long clock, long count)
    {
        writeElement(id, clock, count, true, false);
    }

    public void writeLocal(CounterId id, long clock, long count)
    {
        writeElement(id, clock, count, false, true);
    }

    public void writeRemote(CounterId id, long clock, long count)
    {
        writeElement(id, clock, count, false, false);
    }

    // Writes a (counter id, clock, count) tuple at the current position, flags it and advances.
    private void writeElement(CounterId id, long clock, long count, boolean isGlobal, boolean isLocal)
    {
        // relative puts on a duplicate leave the position of the wrapped context untouched
        ByteBuffer writer = context.duplicate();
        writer.position(context.position() + bodyOffset);
        writer.put(id.bytes().duplicate());
        writer.putLong(clock);
        writer.putLong(count);

        flagCurrentAndAdvance(isGlobal, isLocal);
    }

    // Records the kind of the current shard (writing its header element if it is global
    // or local) and moves on to the next shard.
    private void flagCurrentAndAdvance(boolean isGlobal, boolean isLocal)
    {
        if (isGlobal)
            context.putShort(context.position() + headerOffset, (short) (getElementIndex() + Short.MIN_VALUE));
        else if (isLocal)
            context.putShort(context.position() + headerOffset, (short) getElementIndex());

        currentIsGlobal = isGlobal;
        currentIsLocal = isLocal;
        moveToNext();
    }
}
}