/**
* Copyright (C) 2009-2013 FoundationDB, LLC
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package com.foundationdb.server.service.metrics;
import com.foundationdb.server.error.AkibanInternalException;
import com.foundationdb.server.service.Service;
import com.foundationdb.server.service.config.ConfigurationService;
import com.foundationdb.server.store.FDBHolder;
import com.foundationdb.Database;
import com.foundationdb.KeyValue;
import com.foundationdb.Range;
import com.foundationdb.Transaction;
import com.foundationdb.async.Function;
import com.foundationdb.async.Future;
import com.foundationdb.subspace.Subspace;
import com.foundationdb.tuple.ByteArrayUtil;
import com.foundationdb.tuple.Tuple2;
import com.google.inject.Inject;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.InetAddress;
import java.util.Arrays;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Deque;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicLong;
/** Metrics service implemented using system keyspace and compatible
* with fdbserver's own metrics.
*/
public class FDBMetricsService implements MetricsService, Service
{
public static final boolean BINARY_STRINGS = true;
public static final String METRIC_KEY = "TDMetric";
public static final String METRIC_CONF_KEY = "TDMetricConf";
public static final String METRIC_CONF_CHANGES_KEY = "TDMetricConfChanges";
public static final String BOOLEAN_TYPE = "Bool";
public static final String LONG_TYPE = "Int64";
public static final String DEFAULT_ID = "0";
public static final String ENABLED_OPTION = "Enabled";
public static final byte[] ENABLED_FALSE = { (byte)0 };
public static final byte[] ENABLED_TRUE = { (byte)1 };
public static final int NLEVELS = 25;
public static final int LEVEL_VALUE_MAX_SIZE = 50000;
public static final double METRIC_LEVEL_DIVISOR = Math.log(4);
public static final String SQL_LAYER_RUNNING_NAME = "SQLLayerRunning";
public static final String CONFIG_CLUSTER_FILE = "fdbsql.fdb.metrics.cluster_file";
public static final String CONFIG_PREFIX = "fdbsql.fdb.metrics.prefix";
public static final String CONFIG_FLUSH_INTERVAL = "fdbsql.fdb.metrics.flush_interval";
private static final Logger logger = LoggerFactory.getLogger(FDBMetricsService.class);
private final ConfigurationService configService;
private final FDBHolder fdbService;
private final ConcurrentHashMap<String,BaseMetricImpl<?>> metrics = new ConcurrentHashMap<>();
private final List<KeyValue> pendingWrites = new ArrayList<>();
private final Random random = new Random();
// Effectively final, since set by start and so before service is
// available to any client thread.
private Database ownDatabase;
private Subspace dataSubspace, confSubspace, confChangesSubspace;
private long timebase;
private String address;
private long flushInterval;
protected Thread backgroundThread;
protected volatile boolean running, anyEnabled, confChanged, metricsConfChanged, metricsDataChanged, backgroundIdle;
private volatile Map<List<String>,byte[]> conf;
private BooleanMetric sqlLayerRunningMetric;
@Inject
public FDBMetricsService(ConfigurationService configService, FDBHolder fdbService) {
this.configService = configService;
this.fdbService = fdbService;
}
static class MetricLevel<T> {
final Deque<MetricLevelValues> values = new ArrayDeque<>();
T lastValue;
long lastTime;
}
static class MetricLevelValues {
final long start;
byte[] bytes;
MetricLevelValues(long start, byte[] bytes) {
this.start = start;
this.bytes = bytes;
}
void append(byte[] moreBytes) {
byte[] newBytes = new byte[bytes.length + moreBytes.length];
System.arraycopy(bytes, 0, newBytes, 0, bytes.length);
System.arraycopy(moreBytes, 0, newBytes, bytes.length, moreBytes.length);
bytes = newBytes;
}
boolean isFull() {
return (bytes.length >= LEVEL_VALUE_MAX_SIZE);
}
}
abstract class BaseMetricImpl<T> implements FDBMetric<T> {
private final String name;
protected volatile boolean enabled, confChanged, valueChanged;
protected long changeTime;
protected final List<MetricLevel<T>> levels = new ArrayList<>(NLEVELS);
protected BaseMetricImpl(String name) {
this.name = name;
for (int i = 0; i < NLEVELS; i++) {
levels.add(new MetricLevel<T>());
}
}
@Override
public String getName() {
return name;
}
@Override
public boolean isEnabled() {
return enabled;
}
@Override
public void setEnabled(boolean enabled) {
FDBMetricsService.this.setEnabled(this, enabled, true);
}
@Override
public Future<List<FDBMetric.Value<T>>> readAllValues(Transaction tr) {
return FDBMetricsService.this.readAllValues(tr, this);
}
protected abstract String getType();
protected abstract byte[] encodeValue();
protected abstract byte[] encodeValue(MetricLevel<T> onto);
protected abstract List<FDBMetric.Value<T>> decodeValues(byte[] bytes);
@Override
public String toString() {
return getType() + " " + name + " = " + getObject();
}
}
class BooleanMetricImpl extends BaseMetricImpl<Boolean> implements BooleanMetric {
private final AtomicBoolean bvalue = new AtomicBoolean();
public BooleanMetricImpl(String name) {
super(name);
}
@Override
public boolean get() {
return bvalue.get();
}
@Override
public void set(boolean value) {
if (!enabled) {
bvalue.set(value);
}
else {
synchronized (this) {
boolean ovalue = bvalue.get();
if (value != ovalue) {
if (bvalue.compareAndSet(ovalue, value)) {
metricChanged(this);
}
}
}
}
}
@Override
public Boolean getObject() {
return get();
}
@Override
public void setObject(Boolean value) {
set(value);
}
@Override
public boolean toggle() {
if (!enabled) {
while (true) {
boolean value = bvalue.get();
boolean newValue = !value;
if (bvalue.compareAndSet(value, newValue)) {
return newValue;
}
}
}
else {
synchronized (this) {
boolean value = !get();
set(value);
return value;
}
}
}
@Override
protected String getType() {
return BOOLEAN_TYPE;
}
@Override
protected byte[] encodeValue() {
return Tuple2.from(combine(changeTime, get())).pack();
}
@Override
protected byte[] encodeValue(MetricLevel<Boolean> onto) {
return Tuple2.from(combine(changeTime, get()) -
combine(onto.lastTime, onto.lastValue))
.pack();
}
@Override
protected List<FDBMetric.Value<Boolean>> decodeValues(byte[] bytes) {
Tuple2 tuple = Tuple2.fromBytes(bytes);
int nvalues = tuple.size();
List<FDBMetric.Value<Boolean>> result = new ArrayList<>(nvalues);
long tvalue = 0;
for (int i = 0; i < nvalues; i++) {
tvalue += tuple.getLong(i);
long time = tvalue >>> 1;
boolean value = ((tvalue & 1) != 0);
result.add(new FDBMetric.Value<Boolean>(time, value));
}
return result;
}
// A timestamp and boolean value are stored in a single long
// with the time shifted over to make room for the bit.
private long combine(long time, boolean value) {
return (time * 2) + (value ? 1 : 0);
}
}
class LongMetricImpl extends BaseMetricImpl<Long> implements LongMetric {
private final AtomicLong lvalue = new AtomicLong();
public LongMetricImpl(String name) {
super(name);
}
@Override
public long get() {
return lvalue.get();
}
@Override
public void set(long value) {
if (!enabled) {
lvalue.set(value);
}
else {
synchronized (this) {
long ovalue = lvalue.get();
if (value != ovalue) {
if (lvalue.compareAndSet(ovalue, value)) {
metricChanged(this);
}
}
}
}
}
@Override
public Long getObject() {
return get();
}
@Override
public void setObject(Long value) {
set(value);
}
@Override
public long increment() {
if (!enabled) {
return lvalue.incrementAndGet();
}
else {
synchronized (this) {
long value = get() + 1;
set(value);
return value;
}
}
}
@Override
public long increment(long amount) {
if (!enabled) {
return lvalue.addAndGet(amount);
}
else {
synchronized (this) {
long value = get() + amount;
set(value);
return value;
}
}
}
@Override
protected String getType() {
return LONG_TYPE;
}
@Override
protected byte[] encodeValue() {
// A timestamp and long value are stored as successive integers.
return Tuple2.from(changeTime, get()).pack();
}
@Override
protected byte[] encodeValue(MetricLevel<Long> onto) {
return Tuple2.from(changeTime - onto.lastTime,
get() - onto.lastValue)
.pack();
}
@Override
protected List<FDBMetric.Value<Long>> decodeValues(byte[] bytes) {
Tuple2 tuple = Tuple2.fromBytes(bytes);
int nvalues = tuple.size() / 2;
List<FDBMetric.Value<Long>> result = new ArrayList<>(nvalues);
long time = 0, value = 0;
for (int i = 0; i < nvalues; i++) {
time += tuple.getLong(i * 2);
value += tuple.getLong(i * 2 + 1);
result.add(new FDBMetric.Value<Long>(time, value));
}
return result;
}
}
/* MetricCollection */
@Override
public BooleanMetric addBooleanMetric(String name) {
BooleanMetricImpl metric = new BooleanMetricImpl(name);
addMetric(metric);
return metric;
}
@Override
public LongMetric addLongMetric(String name) {
LongMetricImpl metric = new LongMetricImpl(name);
addMetric(metric);
return metric;
}
@Override
public <T> void removeMetric(BaseMetric<T> metric) {
metrics.remove(metric.getName(), metric);
}
/* Service */
@Override
public void start() {
String clusterFile = configService.getProperty(CONFIG_CLUSTER_FILE);
if (clusterFile.length() == 0) {
ownDatabase = null;
}
else {
ownDatabase = fdbService.getFDB().open(clusterFile);
}
byte[] prefix;
try {
prefix = configService.getProperty(CONFIG_PREFIX).getBytes("ISO-8859-1");
}
catch (UnsupportedEncodingException ex) {
throw new AkibanInternalException("Error encoding prefix", ex);
}
dataSubspace = new Subspace(tupleFrom(METRIC_KEY), prefix);
confSubspace = new Subspace(tupleFrom(METRIC_CONF_KEY), prefix);
confChangesSubspace = new Subspace(tupleFrom(METRIC_CONF_CHANGES_KEY), prefix);
// NOTE: Java does not expose a nanosecond wallclock timer, like POSIX
// CLOCK_REALTIME, only one like CLOCK_MONOTONIC. (Among other things, 64 bits is
// only 292 years.) There is a tradeoff between using synchronized clocks in a
// multi-node environment and using clocks that NTP doesn't change so that
// small-scale deltas are always accurate. fdbserver metrics choose the former.
// TODO: Consider recomputing this periodically.
timebase = System.currentTimeMillis() * 1000000 - System.nanoTime();
address = "127.0.0.1";
try {
address = InetAddress.getLocalHost().getHostAddress();
}
catch (IOException ex) {
// Ignore
}
address = address + ":" + configService.getProperty("fdbsql.postgres.port");
flushInterval = Long.parseLong(configService.getProperty(CONFIG_FLUSH_INTERVAL));
backgroundThread = new Thread() {
@Override
public void run() {
backgroundThread();
}
};
loadConf();
running = true;
backgroundThread.start();
sqlLayerRunningMetric = addBooleanMetric(SQL_LAYER_RUNNING_NAME);
sqlLayerRunningMetric.set(true);
}
@Override
public void stop() {
// Getting this stored back reliably is problematic because there is no
// systematic notion of orderly shutdown.
sqlLayerRunningMetric.set(false);
running = false;
notifyBackground();
try {
backgroundThread.join(1000);
}
catch (InterruptedException ex) {
backgroundThread.interrupt();
}
if (ownDatabase != null) {
ownDatabase.dispose();
}
metrics.clear();
pendingWrites.clear();
conf.clear();
ownDatabase = null;
dataSubspace = confSubspace = confChangesSubspace = null;
timebase = flushInterval = 0;
address = null;
running = anyEnabled = confChanged = metricsConfChanged = metricsDataChanged = backgroundIdle = false;
}
@Override
public void crash() {
stop();
}
/* For testing */
public void reset() {
metrics.clear();
loadConf();
}
public void completeBackgroundWork() {
do {
notifyBackground();
try {
Thread.sleep(100);
}
catch (InterruptedException ex) {
}
} while (!backgroundIdle || confChanged || metricsConfChanged || metricsDataChanged);
}
public void deleteBooleanMetric(Transaction tr, String name) {
tr.clear(Range.startsWith(dataSubspace.pack(tupleFrom(BOOLEAN_TYPE, name))));
tr.clear(Range.startsWith(confSubspace.pack(tupleFrom(BOOLEAN_TYPE, name))));
}
public void deleteLongMetric(Transaction tr, String name) {
tr.clear(Range.startsWith(dataSubspace.pack(tupleFrom(LONG_TYPE, name))));
tr.clear(Range.startsWith(confSubspace.pack(tupleFrom(LONG_TYPE, name))));
}
/* Internal */
protected Database getDatabase() {
if (ownDatabase != null)
return ownDatabase;
else
return fdbService.getDatabase();
}
protected void addMetric(BaseMetricImpl<?> metric) {
if (metrics.putIfAbsent(metric.getName(), metric) != null) {
throw new IllegalArgumentException("There is already a metric named " + metric.getName());
}
updateEnabled(metric); // Get initial enabled state from conf.
}
protected <T> void metricChanged(BaseMetricImpl<T> metric) {
long lastTime = metric.changeTime;
// NOTE: changeTime is only accurate for metrics that are enabled, and
// to guarantee that it always corresponds to the value, updates to
// enabled metrics are synchronized.
metric.changeTime = timebase + System.nanoTime();
metric.valueChanged = true;
metricsDataChanged = true;
int level;
if ((lastTime == 0) ||
(metric.changeTime <= lastTime)) {
level = 0;
}
else {
// Longer duration -> higher level.
double r = random.nextDouble();
if (r == 0) {
level = NLEVELS-1;
}
else {
level = Math.min(NLEVELS-1, (int)(Math.log((metric.changeTime - lastTime) / r) / METRIC_LEVEL_DIVISOR));
}
}
MetricLevel<T> metricLevel = metric.levels.get(level);
MetricLevelValues values = metricLevel.values.peekLast();
if ((values == null) || values.isFull()) {
if (logger.isTraceEnabled()) {
logger.trace("New level {} entry at {} for {}",
new Object[] {
level, metric.changeTime, metric
});
}
values = new MetricLevelValues(metric.changeTime, metric.encodeValue());
metricLevel.values.addLast(values);
}
else {
if (logger.isTraceEnabled()) {
logger.trace("Adding to level {} entry at {} for {}",
new Object[] {
level, metric.changeTime, metric
});
}
values.append(metric.encodeValue(metricLevel));
}
metricLevel.lastValue = metric.getObject();
metricLevel.lastTime = metric.changeTime;
}
protected void notifyBackground() {
if (Thread.currentThread() != backgroundThread) {
synchronized (backgroundThread) {
backgroundThread.notifyAll();
}
}
}
protected void backgroundThread() {
try {
while (running) {
if (!confChanged && !metricsConfChanged) {
try {
synchronized (backgroundThread) {
backgroundIdle = true;
if (anyEnabled)
backgroundThread.wait(flushInterval);
else
backgroundThread.wait();
backgroundIdle = false;
}
}
catch (InterruptedException ex) {
break;
}
}
if (confChanged) {
logger.debug("Metrics configuration has changed and needs to be loaded.");
confChanged = false;
updateConf();
}
if (metricsConfChanged || metricsDataChanged) {
logger.debug("Metrics have changed and need to be saved.");
metricsConfChanged = metricsDataChanged = false;
writeMetrics();
}
}
}
catch (Exception ex) {
logger.error("Error in metrics background thread", ex);
}
}
protected void updateConf() {
loadConf();
for (BaseMetricImpl<?> metric : metrics.values()) {
if (metric.confChanged) continue; // Pending from this side.
updateEnabled(metric);
}
}
// Load configuration settings from storage.
// We can't tell which are SQL layer metrics, so we get them all.
protected void loadConf() {
conf = getDatabase()
.run(new Function<Transaction,Map<List<String>,byte[]>>() {
@Override
public Map<List<String>,byte[]> apply(Transaction tr) {
return readConf(tr);
}
});
logger.debug("Loaded {}: {}", METRIC_CONF_KEY, conf);
}
protected Map<List<String>,byte[]> readConf(Transaction tr) {
tr.options().setAccessSystemKeys();
byte[] confKey = confSubspace.getKey();
List<KeyValue> kvs = tr.getRange(Range.startsWith(confKey)).asList().get();
Map<List<String>,byte[]> result = new HashMap<>();
for (KeyValue kv : kvs) {
byte[] tupleBytes = new byte[kv.getKey().length - confKey.length];
System.arraycopy(kv.getKey(), confKey.length, tupleBytes, 0, tupleBytes.length);
// TODO: It's a shame that there isn't a fromBytes with index offets.
Tuple2 tuple = Tuple2.fromBytes(tupleBytes);
List<String> list = new ArrayList<>(tuple.size());
for (int i = 0; i < tuple.size(); i++) {
if (BINARY_STRINGS) {
try {
list.add(new String(tuple.getBytes(i), "UTF-8"));
}
catch (UnsupportedEncodingException ex) {
throw new AkibanInternalException("Error decoding binary string", ex);
}
}
else {
list.add(tuple.getString(i));
}
}
result.put(list, kv.getValue());
}
// Initiate a watch (from this same transaction) for changes to the key
// used to signal configuration changes.
tr.watch(confChangesSubspace.getKey()).onReady(new Runnable() {
@Override
public void run() {
confChanged = true;
notifyBackground();
}
});
return result;
}
protected static Tuple2 tupleFrom(Object... keys) {
if (BINARY_STRINGS) {
try {
for (int i = 0; i < keys.length; i++) {
if (keys[i] instanceof String) {
keys[i] = ((String)keys[i]).getBytes("UTF-8");
}
}
}
catch (UnsupportedEncodingException ex) {
throw new AkibanInternalException("Error encoding binary string", ex);
}
}
return Tuple2.from(keys);
}
protected void updateEnabled(BaseMetricImpl<?> metric) {
List<String> keys = Arrays.asList(metric.getType(), metric.getName(),
address, DEFAULT_ID, ENABLED_OPTION);
byte[] enabled = conf.get(keys);
if (enabled == null) {
metric.confChanged = true; // Write conf key for brand new metric.
// Check wildcard enabling.
keys.set(2, "");
enabled = conf.get(keys);
if (enabled != null) {
setEnabled(metric, enabled[0] != 0, false);
}
metricsConfChanged = true;
notifyBackground();
}
else {
setEnabled(metric, enabled[0] != 0, false);
}
}
// Change a metric's enabled flag, either from configuration storage of at
// user's request.
protected void setEnabled(BaseMetricImpl<?> metric, boolean enabled, boolean explicit) {
if (metric.enabled == enabled) return;
boolean notifyBackground = false;
synchronized (metric) {
metric.enabled = enabled;
if (enabled) {
// As though just changed to current value.
metricChanged(metric);
}
if (explicit) {
metric.confChanged = true;
}
}
if (enabled && !anyEnabled) {
anyEnabled = true;
notifyBackground = true; // So goes to sleep instead of indefinite wait.
}
if (explicit) {
metricsConfChanged = true;
notifyBackground = true;
}
if (notifyBackground) {
notifyBackground();
}
}
protected void writeMetrics() {
boolean anyConfChanges = false;
for (BaseMetricImpl<?> metric : metrics.values()) {
if (!metric.confChanged && !metric.valueChanged) continue;
synchronized (metric) {
if (metric.confChanged) {
anyConfChanges = true;
pendingWrites.add(new KeyValue(confSubspace.pack(tupleFrom(metric.getType(), metric.getName(), address, DEFAULT_ID, ENABLED_OPTION)),
metric.enabled ? ENABLED_TRUE : ENABLED_FALSE));
metric.confChanged = false;
}
if (metric.valueChanged) {
pendingWrites.add(new KeyValue(dataSubspace.pack(tupleFrom(metric.getType(), metric.getName(), address, DEFAULT_ID)),
metric.encodeValue()));
metric.valueChanged = false;
}
for (int level = 0; level < NLEVELS; level++) {
MetricLevel<?> metricLevel = metric.levels.get(level);
while (true) {
MetricLevelValues values = metricLevel.values.pollFirst();
if (values == null) break;
pendingWrites.add(new KeyValue(dataSubspace.pack(tupleFrom(metric.getType(), metric.getName(), address, DEFAULT_ID, level, values.start)),
values.bytes));
// Continue to fill (will overwrite key with longer value).
if (metricLevel.values.isEmpty() && !values.isFull()) {
metricLevel.values.addLast(values);
break;
}
}
}
}
}
if (anyConfChanges) {
// Signal a change in configuration. We will respond to this change,
// too, but that seemd harmless and difficult to avoid in a general
// way.
byte[] bytes = new byte[16];
random.nextBytes(bytes);
if (logger.isDebugEnabled()) {
logger.debug("Writing {}: {}", METRIC_CONF_CHANGES_KEY, ByteArrayUtil.printable(bytes));
}
pendingWrites.add(new KeyValue(confChangesSubspace.getKey(), bytes));
}
if (!pendingWrites.isEmpty()) {
getDatabase()
.run(new Function<Transaction,Void>() {
@Override
public Void apply(Transaction tr) {
tr.options().setAccessSystemKeys();
for (KeyValue kv : pendingWrites) {
tr.set(kv.getKey(), kv.getValue());
}
return null;
}
});
pendingWrites.clear();
}
}
protected <T> Future<List<FDBMetric.Value<T>>> readAllValues(Transaction tr, final BaseMetricImpl<T> metric) {
tr.options().setAccessSystemKeys();
return tr.getRange(dataSubspace.pack(tupleFrom(metric.getType(), metric.getName(), address, DEFAULT_ID, 0)),
dataSubspace.pack(tupleFrom(metric.getType(), metric.getName(), address, DEFAULT_ID, NLEVELS)))
.asList()
.map(new Function<List<KeyValue>,List<FDBMetric.Value<T>>> () {
@Override
public List<FDBMetric.Value<T>> apply(List<KeyValue> kvs) {
List<FDBMetric.Value<T>> result = new ArrayList<>();
for (KeyValue kv : kvs) {
result.addAll(metric.decodeValues(kv.getValue()));
}
// Merge all levels.
Collections.sort(result,
new Comparator<FDBMetric.Value<T>> () {
@Override
public int compare(FDBMetric.Value<T> v1,
FDBMetric.Value<T> v2) {
if (v1.time < v2.time)
return -1;
else if (v1.time > v2.time)
return +1;
else
return 0;
}
});
return result;
}
});
}
}