/* This file is part of VoltDB.
* Copyright (C) 2008-2014 VoltDB Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with VoltDB. If not, see <http://www.gnu.org/licenses/>.
*/
package org.voltdb.iv2;
import java.io.File;
import java.io.IOException;
import java.util.ArrayDeque;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Queue;
import java.util.Set;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.RejectedExecutionException;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicReference;
import org.apache.zookeeper_voltpatches.CreateMode;
import org.apache.zookeeper_voltpatches.KeeperException;
import org.apache.zookeeper_voltpatches.WatchedEvent;
import org.apache.zookeeper_voltpatches.Watcher;
import org.apache.zookeeper_voltpatches.ZooDefs.Ids;
import org.apache.zookeeper_voltpatches.ZooKeeper;
import org.json_voltpatches.JSONArray;
import org.json_voltpatches.JSONException;
import org.json_voltpatches.JSONObject;
import org.json_voltpatches.JSONStringer;
import org.voltcore.logging.VoltLogger;
import org.voltcore.messaging.HostMessenger;
import org.voltcore.utils.CoreUtils;
import org.voltcore.utils.Pair;
import org.voltcore.zk.BabySitter;
import org.voltcore.zk.LeaderElector;
import org.voltcore.zk.ZKUtil;
import org.voltdb.Promotable;
import org.voltdb.SnapshotFormat;
import org.voltdb.TheHashinator;
import org.voltdb.VoltDB;
import org.voltdb.VoltZK;
import org.voltdb.catalog.SnapshotSchedule;
import org.voltdb.client.ClientResponse;
import org.voltdb.sysprocs.saverestore.SnapshotUtil;
import org.voltdb.sysprocs.saverestore.SnapshotUtil.SnapshotResponseHandler;
import com.google_voltpatches.common.collect.ImmutableMap;
import com.google_voltpatches.common.collect.ImmutableSortedSet;
import com.google_voltpatches.common.util.concurrent.SettableFuture;
/**
* LeaderAppointer handles centralized appointment of partition leaders across
* the partition. This is primarily so that the leaders can be evenly
* distributed throughout the cluster, reducing bottlenecks (at least at
* startup). As a side-effect, this service also controls the initial startup
* of the cluster, blocking operation until each partition has a k-safe set of
* replicas, each partition has a leader, and the MPI has started.
*/
public class LeaderAppointer implements Promotable
{
private static final VoltLogger tmLog = new VoltLogger("TM");
// Lifecycle states for the appointer.  A fresh cluster moves
// INIT -> CLUSTER_START -> DONE; an appointer promoted to repair an already
// running cluster moves INIT -> DONE (see acceptPromotionImpl()).
private enum AppointerState {
INIT, // Initial start state, used to inhibit ZK callback actions
CLUSTER_START, // indicates that we're doing the initial cluster startup
DONE // indicates normal running conditions, including repair
}
private final HostMessenger m_hostMessenger;
private final ZooKeeper m_zk;
// This should only be accessed through getInitialPartitionCount() on cluster startup.
private final int m_initialPartitionCount;
// Partition id -> BabySitter watching that partition's ZK election dir.
private final Map<Integer, BabySitter> m_partitionWatchers;
// Cache of leaders this appointer has appointed (written by assignLeader()).
private final LeaderCache m_iv2appointees;
// Cache of leaders that have actually published themselves as masters.
private final LeaderCache m_iv2masters;
// Partition id -> the BabySitter callback registered for that partition.
private final Map<Integer, PartitionCallback> m_callbacks;
private final int m_kfactor;
// Cluster topology JSON; consulted for expected replica counts and the
// designated master host of each partition.
private final JSONObject m_topo;
private final MpInitiator m_MPI;
private final AtomicReference<AppointerState> m_state =
new AtomicReference<AppointerState>(AppointerState.INIT);
// Non-null only during CLUSTER_START; completed by m_masterCallback once all
// appointed leaders have published themselves.
private SettableFuture<Object> m_startupLatch = null;
private final boolean m_partitionDetectionEnabled;
// True once we've decided a network partition happened; prevents re-entering
// doPartitionDetectionActivities() while we are already shutting down.
private boolean m_partitionDetected = false;
private boolean m_usingCommandLog = false;
// Flipped by onReplayCompletion(); node failures before this point are fatal.
private final AtomicBoolean m_replayComplete = new AtomicBoolean(false);
// Sink for per-partition replication-shortfall statistics (see isClusterKSafe()).
private final KSafetyStats m_stats;
/*
* Track partitions that are cleaned up during election/promotion etc.
* This eliminates the race where the cleanup occurs while constructing babysitters
* for partitions that end up being removed.
*/
private HashSet<Integer> m_removedPartitionsAtPromotionTime = null;
// Provide a single single-threaded executor service to all the BabySitters for each partition.
// This will guarantee that the ordering of events generated by ZooKeeper is preserved in the
// handling of callbacks in LeaderAppointer.
private final ExecutorService m_es =
CoreUtils.getCachedSingleThreadExecutor("LeaderAppointer-Babysitters", 15000);
// Where the partition-detection snapshot (if any) gets written.
private final SnapshotSchedule m_partSnapshotSchedule;
// Handles responses to the partition-detection snapshot request: retries on a
// failed initiation, and once the snapshot request definitively succeeds or
// fails, shuts the cluster down (partition detection has already triggered by
// the time this snapshot is requested — see doPartitionDetectionActivities()).
private final SnapshotResponseHandler m_snapshotHandler =
new SnapshotResponseHandler() {
@Override
public void handleResponse(ClientResponse resp)
{
if (resp == null) {
VoltDB.crashLocalVoltDB("Received a null response to a snapshot initiation request. " +
"This should be impossible.", true, null);
}
else if (resp.getStatus() != ClientResponse.SUCCESS) {
// Transient failure (e.g. another snapshot in progress): log and retry
// with a fresh timestamped nonce.
tmLog.info("Failed to complete partition detection snapshot, status: " + resp.getStatus() +
", reason: " + resp.getStatusString());
tmLog.info("Retrying partition detection snapshot...");
SnapshotUtil.requestSnapshot(0L,
m_partSnapshotSchedule.getPath(),
m_partSnapshotSchedule.getPrefix() + System.currentTimeMillis(),
true, SnapshotFormat.NATIVE, null, m_snapshotHandler,
true);
}
else if (!SnapshotUtil.didSnapshotRequestSucceed(resp.getResults())) {
VoltDB.crashGlobalVoltDB("Unable to complete partition detection snapshot: " +
resp.getResults()[0], false, null);
}
else {
// Snapshot completed; this doomed survivor set can now shut down.
VoltDB.crashGlobalVoltDB("Partition detection snapshot completed. Shutting down.",
false, null);
}
}
};
/**
 * BabySitter callback for a single partition's election dir.  Tracks the
 * replica set and the current leader across callbacks: during cluster startup
 * it appoints the first leader once the expected number of replicas appears;
 * during normal operation it verifies cluster viability and appoints a
 * replacement leader when the current one vanishes.
 */
private class PartitionCallback extends BabySitter.Callback
{
    final int m_partitionId;
    // Replica HSIds observed on the previous run() invocation; diffed against
    // each new callback to find newly-seen and newly-dead replicas.
    final Set<Long> m_replicas;
    long m_currentLeader;

    /** Constructor used when we know (or think we know) who the leader for this partition is */
    PartitionCallback(int partitionId, long currentLeader)
    {
        this(partitionId);
        // Try to be clever for repair. Create ourselves with the current leader set to
        // whatever is in the LeaderCache, and claim that replica exists, then let the
        // first run() call fix the world.
        m_currentLeader = currentLeader;
        m_replicas.add(currentLeader);
    }

    /** Constructor used at startup when there is no leader */
    PartitionCallback(int partitionId)
    {
        m_partitionId = partitionId;
        // A bit of a hack, but we should never end up with an HSID as Long.MAX_VALUE
        m_currentLeader = Long.MAX_VALUE;
        m_replicas = new HashSet<Long>();
    }

    @Override
    public void run(List<String> children)
    {
        List<Long> updatedHSIds = VoltZK.childrenToReplicaHSIds(children);
        // compute previously unseen HSId set in the callback list
        Set<Long> newHSIds = new HashSet<Long>(updatedHSIds);
        newHSIds.removeAll(m_replicas);
        tmLog.debug("Newly seen replicas: " + CoreUtils.hsIdCollectionToString(newHSIds));
        // compute previously seen but now vanished from the callback list HSId set
        Set<Long> missingHSIds = new HashSet<Long>(m_replicas);
        missingHSIds.removeAll(updatedHSIds);
        tmLog.debug("Newly dead replicas: " + CoreUtils.hsIdCollectionToString(missingHSIds));
        tmLog.debug("Handling babysitter callback for partition " + m_partitionId + ": children: " +
                CoreUtils.hsIdCollectionToString(updatedHSIds));
        if (m_state.get() == AppointerState.CLUSTER_START) {
            // We can't yet tolerate a host failure during startup. Crash it all
            if (missingHSIds.size() > 0) {
                VoltDB.crashGlobalVoltDB("Node failure detected during startup.", false, null);
            }
            // ENG-3166: Eventually we would like to get rid of the extra replicas beyond k_factor,
            // but for now we just look to see how many replicas of this partition we actually expect
            // and gate leader assignment on that many copies showing up.
            int replicaCount = m_kfactor + 1;
            JSONArray parts;
            try {
                parts = m_topo.getJSONArray("partitions");
                for (int p = 0; p < parts.length(); p++) {
                    JSONObject aPartition = parts.getJSONObject(p);
                    int pid = aPartition.getInt("partition_id");
                    if (pid == m_partitionId) {
                        replicaCount = aPartition.getJSONArray("replicas").length();
                    }
                }
            } catch (JSONException e) {
                // Ignore and just assume the normal number of replicas
            }
            if (children.size() == replicaCount) {
                m_currentLeader = assignLeader(m_partitionId, updatedHSIds);
            }
            else {
                // BUGFIX: report the wait against this partition's actual expected
                // replica count (which can differ from m_kfactor + 1 per ENG-3166
                // above), matching the gate condition used to assign the leader.
                tmLog.info("Waiting on " + (replicaCount - children.size()) + " more nodes " +
                        "for k-safety before startup");
            }
        }
        else {
            Set<Integer> hostsOnRing = new HashSet<Integer>();
            // Check for k-safety
            if (!isClusterKSafe(hostsOnRing)) {
                VoltDB.crashGlobalVoltDB("Some partitions have no replicas. Cluster has become unviable.",
                        false, null);
            }
            // Check if replay has completed
            if (m_replayComplete.get() == false) {
                VoltDB.crashGlobalVoltDB("Detected node failure during command log replay. Cluster will shut down.",
                        false, null);
            }
            // Check to see if there's been a possible network partition and we're not already handling it
            if (m_partitionDetectionEnabled && !m_partitionDetected) {
                doPartitionDetectionActivities(hostsOnRing);
            }
            // If we survived the above gauntlet of fail, appoint a new leader for this partition.
            if (missingHSIds.contains(m_currentLeader)) {
                m_currentLeader = assignLeader(m_partitionId, updatedHSIds);
            }
            // If this partition doesn't have a leader yet, and we have new replicas added,
            // elect a leader.
            if (m_currentLeader == Long.MAX_VALUE && !updatedHSIds.isEmpty()) {
                m_currentLeader = assignLeader(m_partitionId, updatedHSIds);
            }
        }
        // Remember this replica set so the next callback can diff against it.
        m_replicas.clear();
        m_replicas.addAll(updatedHSIds);
    }
}
/* We'll use this callback purely for startup so we can discover when all
* the leaders we have appointed have completed their promotions and
* published themselves to Zookeeper */
LeaderCache.Callback m_masterCallback = new LeaderCache.Callback()
{
@Override
public void run(ImmutableMap<Integer, Long> cache) {
Set<Long> currentLeaders = new HashSet<Long>(cache.values());
tmLog.debug("Updated leaders: " + currentLeaders);
if (m_state.get() == AppointerState.CLUSTER_START) {
try {
// Startup is complete once every partition's appointed leader has
// published itself: promote the MPI and release the startup latch.
if (currentLeaders.size() == getInitialPartitionCount()) {
tmLog.debug("Leader appointment complete, promoting MPI and unblocking.");
m_state.set(AppointerState.DONE);
m_MPI.acceptPromotion();
m_startupLatch.set(null);
}
} catch (IllegalAccessException e) {
// This should never happen
VoltDB.crashLocalVoltDB("Failed to get partition count", true, e);
}
}
}
};
/**
 * ZK watcher on the partition election root (VoltZK.leaders_initiators).
 * When new partition dirs appear (e.g. on elastic add), start babysitting
 * them.  The watch is one-shot, so it is re-armed by the getChildren() call
 * inside the handler.
 */
Watcher m_partitionCallback = new Watcher() {
    @Override
    public void process(WatchedEvent event)
    {
        // Hop onto the single-threaded executor so all ZK-driven work in the
        // LeaderAppointer stays serialized.
        m_es.submit(new Runnable() {
            @Override
            public void run()
            {
                try {
                    List<String> children = m_zk.getChildren(VoltZK.leaders_initiators, m_partitionCallback);
                    // BUGFIX: corrected "currenctly" typo in the log message.
                    tmLog.info("Noticed partition change " + children + ", " +
                            "currently watching " + m_partitionWatchers.keySet());
                    for (String child : children) {
                        int pid = LeaderElector.getPartitionFromElectionDir(child);
                        // Only watch partitions we don't already track; the MP
                        // initiator's pseudo-partition never gets a babysitter.
                        if (!m_partitionWatchers.containsKey(pid) && pid != MpInitiator.MP_INIT_PID) {
                            watchPartition(pid, m_es, false);
                        }
                    }
                    tmLog.info("Done " + m_partitionWatchers.keySet());
                } catch (Exception e) {
                    VoltDB.crashLocalVoltDB("Cannot read leader initiator directory", false, e);
                }
            }
        });
    }
};
/**
 * Construct the LeaderAppointer.  Does no ZK work beyond building the leader
 * caches; watchers and caches are only started when this appointer wins
 * election and acceptPromotion() is invoked.
 *
 * @param hm host messenger, source of the ZK handle and live-host ids
 * @param numberOfPartitions initial cluster partition count
 * @param kfactor configured k-safety factor
 * @param partitionDetectionEnabled whether network partition detection is on
 * @param partitionSnapshotSchedule where a partition-detection snapshot goes
 * @param usingCommandLog whether command logging is enabled
 * @param topology cluster topology JSON (replica placement and masters)
 * @param mpi the multi-partition initiator to promote at startup's end
 * @param stats sink for k-safety statistics
 */
public LeaderAppointer(HostMessenger hm, int numberOfPartitions,
int kfactor, boolean partitionDetectionEnabled,
SnapshotSchedule partitionSnapshotSchedule,
boolean usingCommandLog,
JSONObject topology, MpInitiator mpi,
KSafetyStats stats)
{
m_hostMessenger = hm;
m_zk = hm.getZK();
m_kfactor = kfactor;
m_topo = topology;
m_MPI = mpi;
m_initialPartitionCount = numberOfPartitions;
m_callbacks = new HashMap<Integer, PartitionCallback>();
m_partitionWatchers = new HashMap<Integer, BabySitter>();
m_iv2appointees = new LeaderCache(m_zk, VoltZK.iv2appointees);
m_iv2masters = new LeaderCache(m_zk, VoltZK.iv2masters, m_masterCallback);
m_partitionDetectionEnabled = partitionDetectionEnabled;
m_partSnapshotSchedule = partitionSnapshotSchedule;
m_usingCommandLog = usingCommandLog;
m_stats = stats;
// Fail fast if partition detection is on but the snapshot directory is
// missing or unwritable — better to die now than during a real partition.
if (m_partitionDetectionEnabled) {
if (!testPartitionDetectionDirectory(m_partSnapshotSchedule))
{
VoltDB.crashLocalVoltDB("Unable to create partition detection snapshot directory at " +
m_partSnapshotSchedule.getPath(), false, null);
}
}
}
/**
 * Called when this node wins the leader-appointer election.  Runs the real
 * promotion work on the appointer's single-threaded executor and blocks
 * until it finishes (or rethrows its failure).  A rejected submission is
 * tolerated only when the executor has already been shut down.
 */
@Override
public void acceptPromotion() throws InterruptedException, ExecutionException
{
    final SettableFuture<Object> promotionComplete = SettableFuture.create();
    final Runnable promotionTask = new Runnable() {
        @Override
        public void run() {
            try {
                acceptPromotionImpl(promotionComplete);
            } catch (Throwable t) {
                promotionComplete.setException(t);
            }
        }
    };
    try {
        m_es.submit(promotionTask);
        // Block the caller until acceptPromotionImpl completes the future.
        promotionComplete.get();
    } catch (RejectedExecutionException e) {
        if (m_es.isShutdown()) {
            // Shutting down anyway; nothing to promote.
            return;
        }
        throw new RejectedExecutionException(e);
    }
}
/**
 * Body of acceptPromotion(), run on the appointer's single-threaded executor.
 * Determines whether this appointer is starting a fresh cluster (no
 * appointees in ZK), taking over from an appointer that died mid-startup
 * (partial appointee/master sets — fatal), or repairing a running cluster,
 * then wires up the partition babysitters accordingly.  Completes (or fails)
 * {@code blocker} to unblock the caller of acceptPromotion().
 */
private void acceptPromotionImpl(final SettableFuture<Object> blocker) throws InterruptedException, ExecutionException, KeeperException {
// Crank up the leader caches. Use blocking startup so that we'll have valid point-in-time caches later.
m_iv2appointees.start(true);
m_iv2masters.start(true);
ImmutableMap<Integer, Long> appointees = m_iv2appointees.pointInTimeCache();
// Figure out what conditions we assumed leadership under.
if (appointees.size() == 0)
{
tmLog.debug("LeaderAppointer in startup");
m_state.set(AppointerState.CLUSTER_START);
}
//INIT is the default before promotion at runtime. Don't do this startup check
//Let the rest of the promotion run and determine k-safety which is the else block.
else if (m_state.get() == AppointerState.INIT && !VoltDB.instance().isRunning()) {
ImmutableMap<Integer, Long> masters = m_iv2masters.pointInTimeCache();
try {
if ((appointees.size() < getInitialPartitionCount()) ||
(masters.size() < getInitialPartitionCount()) ||
(appointees.size() != masters.size())) {
// If we are promoted and the appointees or masters set is partial, the previous appointer failed
// during startup (at least for now, until we add remove a partition on the fly).
VoltDB.crashGlobalVoltDB("Detected failure during startup, unable to start", false, null);
}
} catch (IllegalAccessException e) {
// This should never happen
VoltDB.crashLocalVoltDB("Failed to get partition count", true, e);
}
}
else {
tmLog.debug("LeaderAppointer in repair");
m_state.set(AppointerState.DONE);
}
if (m_state.get() == AppointerState.CLUSTER_START) {
// Need to block the return of acceptPromotion until after the MPI is promoted. Wait for this latch
// to countdown after appointing all the partition leaders. The
// LeaderCache callback will count it down once it has seen all the
// appointed leaders publish themselves as the actual leaders.
m_startupLatch = SettableFuture.create();
writeKnownLiveNodes(new HashSet<Integer>(m_hostMessenger.getLiveHostIds()));
// Theoretically, the whole try/catch block below can be removed because the leader
// appointer now watches the parent dir for any new partitions. It doesn't have to
// create the partition dirs all at once, it can pick them up one by one as they are
// created. But I'm too afraid to remove this block just before the release,
// so leaving it here till later. - ning
try {
final int initialPartitionCount = getInitialPartitionCount();
for (int i = 0; i < initialPartitionCount; i++) {
LeaderElector.createRootIfNotExist(m_zk,
LeaderElector.electionDirForPartition(i));
// Block here on each partition's initial child read so leader
// assignment can proceed deterministically during startup.
watchPartition(i, m_es, true);
}
} catch (IllegalAccessException e) {
// This should never happen
VoltDB.crashLocalVoltDB("Failed to get partition count on startup", true, e);
}
//Asynchronously wait for this to finish otherwise it deadlocks
//on task that need to run on this thread
m_startupLatch.addListener(new Runnable() {
@Override
public void run() {
try {
// Arm the watch for newly added partitions, then unblock the caller.
m_zk.getChildren(VoltZK.leaders_initiators, m_partitionCallback);
blocker.set(null);
} catch (Throwable t) {
blocker.setException(t);
}
}
},
m_es);
}
else {
// If we're taking over for a failed LeaderAppointer, we know when
// we get here that every partition had a leader at some point in
// time. We'll seed each of the PartitionCallbacks for each
// partition with the HSID of the last published leader. The
// blocking startup of the BabySitter watching that partition will
// call our callback, get the current full set of replicas, and
// appoint a new leader if the seeded one has actually failed
Map<Integer, Long> masters = m_iv2masters.pointInTimeCache();
tmLog.info("LeaderAppointer repairing with master set: " + CoreUtils.hsIdValueMapToString(masters));
//Setting the map to non-null causes the babysitters to populate it when cleaning up partitions
//We are only racing with ourselves in that the creation of a babysitter can trigger callbacks
//that result in partitions being cleaned up. We don't have to worry about some other leader appointer.
//The iteration order of the partitions doesn't matter
m_removedPartitionsAtPromotionTime = new HashSet<Integer>();
for (Entry<Integer, Long> master : masters.entrySet()) {
//Skip processing the partition if it was cleaned up by a babysitter that was previously
//instantiated
if (m_removedPartitionsAtPromotionTime.contains(master.getKey())) {
tmLog.info("During promotion partition " + master.getKey() + " was cleaned up. Skipping.");
continue;
}
int partId = master.getKey();
String dir = LeaderElector.electionDirForPartition(partId);
m_callbacks.put(partId, new PartitionCallback(partId, master.getValue()));
Pair<BabySitter, List<String>> sitterstuff =
BabySitter.blockingFactory(m_zk, dir, m_callbacks.get(partId), m_es);
//We could get this far and then find out that creating this particular
//babysitter triggered cleanup so we need to bail out here as well
if (!m_removedPartitionsAtPromotionTime.contains(master.getKey())) {
m_partitionWatchers.put(partId, sitterstuff.getFirst());
}
}
m_removedPartitionsAtPromotionTime = null;
// just go ahead and promote our MPI
m_MPI.acceptPromotion();
// set up a watcher on the partitions dir so that new partitions will be picked up
m_zk.getChildren(VoltZK.leaders_initiators, m_partitionCallback);
blocker.set(null);
}
}
/**
 * Begin babysitting one partition's ZK election dir.
 *
 * Called only on the elected leader appointer; m_callbacks and
 * m_partitionWatchers are touched only during initialization, promotion,
 * or elastic add node, so no extra synchronization is needed here.
 *
 * @param pid The partition ID
 * @param es The executor service to use to construct the baby sitter
 * @param shouldBlock Whether or not to wait for the initial read of children
 * @throws InterruptedException
 * @throws ExecutionException
 */
void watchPartition(int pid, ExecutorService es, boolean shouldBlock)
    throws InterruptedException, ExecutionException
{
    final String electionDir = LeaderElector.electionDirForPartition(pid);
    final PartitionCallback cb = new PartitionCallback(pid);
    m_callbacks.put(pid, cb);
    // Blocking construction performs the first child read synchronously
    // (startup); the non-blocking variant defers it to the executor.
    final BabySitter sitter = shouldBlock
            ? BabySitter.blockingFactory(m_zk, electionDir, cb, es).getFirst()
            : BabySitter.nonblockingFactory(m_zk, electionDir, cb, es);
    m_partitionWatchers.put(pid, sitter);
}
/**
 * Pick a leader for a partition and publish it through the iv2appointees
 * cache.  During cluster startup the choice follows the topology's "master"
 * host for the partition; on failure-driven reassignment we simply take the
 * first surviving replica.
 *
 * @param partitionId the partition needing a leader
 * @param children current replica HSIds for the partition
 * @return the HSId of the appointed leader
 */
private long assignLeader(int partitionId, List<Long> children)
{
    // masterHostId == -1 means "no preference": fall through to the first
    // replica in the children list.
    int masterHostId = -1;
    if (m_state.get() == AppointerState.CLUSTER_START) {
        // Look up the host the topology designated as this partition's master.
        try {
            JSONArray partitions = m_topo.getJSONArray("partitions");
            for (int ii = 0; ii < partitions.length(); ii++) {
                JSONObject onePartition = partitions.getJSONObject(ii);
                if (onePartition.getInt("partition_id") == partitionId) {
                    masterHostId = onePartition.getInt("master");
                }
            }
        }
        catch (JSONException jse) {
            tmLog.error("Failed to find master for partition " + partitionId + ", defaulting to 0");
            jse.printStackTrace();
            masterHostId = -1; // stupid default
        }
    }
    // Outside of startup we deliberately keep no preference (-1).  Something
    // smarter could rebalance leaders here, but that has little utility until
    // rejoin rebalances replicas as well.

    // Default to the first replica, then prefer any replica that lives on the
    // desired master host.
    long masterHSId = children.get(0);
    for (Long candidate : children) {
        if (CoreUtils.getHostIdFromHSId(candidate) == masterHostId) {
            masterHSId = candidate;
            break;
        }
    }
    tmLog.info("Appointing HSId " + CoreUtils.hsIdToString(masterHSId) + " as leader for partition " +
            partitionId);
    try {
        m_iv2appointees.put(partitionId, masterHSId);
    }
    catch (Exception e) {
        VoltDB.crashLocalVoltDB("Unable to appoint new master for partition " + partitionId, true, e);
    }
    return masterHSId;
}
/**
 * Persist the current set of live host ids to ZK so a future LeaderAppointer
 * can compare pre- and post-failure membership during partition detection.
 * Crashes the local node on any ZK/JSON failure.
 */
private void writeKnownLiveNodes(Set<Integer> liveNodes)
{
    try {
        // VoltZK.createPersistentZKNodes should have created this already;
        // create it here only as a fallback.
        if (m_zk.exists(VoltZK.lastKnownLiveNodes, null) == null) {
            m_zk.create(VoltZK.lastKnownLiveNodes, null, Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
        }
        JSONStringer js = new JSONStringer();
        js.object();
        js.key("liveNodes").array();
        for (Integer host : liveNodes) {
            js.value(host);
        }
        js.endArray();
        js.endObject();
        JSONObject payload = new JSONObject(js.toString());
        tmLog.debug("Writing live nodes to ZK: " + payload.toString(4));
        m_zk.setData(VoltZK.lastKnownLiveNodes, payload.toString(4).getBytes("UTF-8"), -1);
    } catch (Exception e) {
        VoltDB.crashLocalVoltDB("Unable to update known live nodes at ZK path: " +
                VoltZK.lastKnownLiveNodes, true, e);
    }
}
/**
 * Read back the host ids that were alive when writeKnownLiveNodes() last ran.
 * Crashes the local node if the ZK node is missing or unparseable.
 *
 * @return the pre-failure set of live host ids
 */
private Set<Integer> readPriorKnownLiveNodes()
{
    Set<Integer> previouslyAlive = new HashSet<Integer>();
    try {
        byte[] raw = m_zk.getData(VoltZK.lastKnownLiveNodes, false, null);
        String json = new String(raw, "UTF-8");
        tmLog.debug("Read prior known live nodes: " + json);
        JSONArray hostArray = new JSONObject(json).getJSONArray("liveNodes");
        for (int ii = 0; ii < hostArray.length(); ii++) {
            previouslyAlive.add(hostArray.getInt(ii));
        }
    } catch (Exception e) {
        VoltDB.crashLocalVoltDB("Unable to read prior known live nodes at ZK path: " +
                VoltZK.lastKnownLiveNodes, true, e);
    }
    return previouslyAlive;
}
/*
 * Check if the directory specified for the snapshot on partition detection
 * exists, and has permissions set correctly.
 *
 * Returns true when partition detection is disabled (nothing to verify), or
 * when the configured path is an existing, writable directory.
 */
private boolean testPartitionDetectionDirectory(SnapshotSchedule schedule) {
    if (!m_partitionDetectionEnabled) {
        // No partition-detection snapshot will ever be written; nothing to check.
        return true;
    }
    File partitionPath = new File(schedule.getPath());
    if (!partitionPath.exists()) {
        tmLog.error("Directory " + partitionPath + " for partition detection snapshots does not exist");
        return false;
    }
    if (!partitionPath.isDirectory()) {
        tmLog.error("Directory " + partitionPath + " for partition detection snapshots is not a directory");
        return false;
    }
    // Probe writability by creating (and immediately deleting) a scratch file.
    File testFile = new File(partitionPath, Long.toString(System.currentTimeMillis()));
    try {
        testFile.createNewFile();
        testFile.delete();
    } catch (IOException e) {
        // BUGFIX: route the failure cause through the logger (matching the
        // two-arg tmLog.error usage elsewhere) instead of printStackTrace().
        tmLog.error("Could not create a test file in " +
                partitionPath +
                " for partition detection snapshots", e);
        return false;
    }
    return true;
}
/**
 * Given a set of the known host IDs before a fault, and the known host IDs in the
 * post-fault cluster, determine whether or not we think a network partition may have happened.
 * NOTE: this assumes that we have already done the k-safety validation for every partition and already failed
 * if we weren't a viable cluster.
 * ALSO NOTE: not private so it may be unit-tested.
 */
static boolean makePPDDecision(Set<Integer> previousHosts, Set<Integer> currentHosts)
{
    // The tie-breaking ("blessed") host is the lowest host id of the
    // pre-fault cluster.  Any host id added after that snapshot (say, by
    // rejoin) is larger than every survivor, so scanning previousHosts
    // alone is sufficient.
    int blessedHostId = Integer.MAX_VALUE;
    for (Integer hostId : previousHosts) {
        if (hostId < blessedHostId) {
            blessedHostId = hostId;
        }
    }
    boolean blessedHostIdInFailedSet = !currentHosts.contains(blessedHostId);

    // Evaluate PPD triggers.
    boolean partitionDetectionTriggered = false;
    // Exact 50-50 splits: only the half that kept the blessed host survives.
    if (currentHosts.size() * 2 == previousHosts.size()) {
        if (blessedHostIdInFailedSet) {
            tmLog.info("Partition detection triggered for 50/50 cluster failure. " +
                    "This survivor set is shutting down.");
            partitionDetectionTriggered = true;
        }
        else {
            tmLog.info("Partition detected for 50/50 failure. " +
                    "This survivor set is continuing execution.");
        }
    }
    // A strict, viable minority is always a partition.
    if (currentHosts.size() * 2 < previousHosts.size()) {
        tmLog.info("Partition detection triggered. " +
                "This minority survivor set is shutting down.");
        partitionDetectionTriggered = true;
    }
    return partitionDetectionTriggered;
}
/**
 * Run after a node failure: decide (via makePPDDecision) whether this
 * survivor set is on the losing side of a network partition.  If so, crash
 * immediately when command logging can recover us, otherwise request a final
 * snapshot whose completion handler (m_snapshotHandler) shuts the cluster
 * down.  If no partition was detected, persist any membership change so
 * future decisions compare against the post-fault cluster state.
 *
 * @param currentNodes host ids currently hosting on-ring replicas
 */
private void doPartitionDetectionActivities(Set<Integer> currentNodes)
{
// We should never re-enter here once we've decided we're partitioned and doomed
assert(!m_partitionDetected);
Set<Integer> currentHosts = new HashSet<Integer>(currentNodes);
Set<Integer> previousHosts = readPriorKnownLiveNodes();
boolean partitionDetectionTriggered = makePPDDecision(previousHosts, currentHosts);
if (partitionDetectionTriggered) {
m_partitionDetected = true;
if (m_usingCommandLog) {
// Just shut down immediately
VoltDB.crashGlobalVoltDB("Use of command logging detected, no additional database snapshot will " +
"be generated. Please use the 'recover' action to restore the database if necessary.",
false, null);
}
else {
// Take a final snapshot; m_snapshotHandler shuts the cluster down
// (or retries) once the request resolves.
SnapshotUtil.requestSnapshot(0L,
m_partSnapshotSchedule.getPath(),
m_partSnapshotSchedule.getPrefix() + System.currentTimeMillis(),
true, SnapshotFormat.NATIVE, null, m_snapshotHandler,
true);
}
}
// If the cluster host set has changed, then write the new set to ZK
// NOTE: we don't want to update the known live nodes if we've decided that our subcluster is
// dying, otherwise a poorly timed subsequent failure might reverse this decision. Any future promoted
// LeaderAppointer should make their partition detection decision based on the pre-partition cluster state.
else if (!currentHosts.equals(previousHosts)) {
writeKnownLiveNodes(currentNodes);
}
}
/**
 * Verify that every in-use partition still has at least one replica.  As a
 * side effect, collects the host ids of all replicas on the hash ring into
 * {@code hostsOnRing} (consumed by partition detection), records each on-ring
 * partition's replication shortfall into the k-safety stats, and removes
 * replica-less partitions that are no longer on the hash ring rather than
 * treating them as violations.
 *
 * @param hostsOnRing out-param: host ids hosting at least one on-ring replica
 * @return true if no k-safety violation was found
 */
private boolean isClusterKSafe(Set<Integer> hostsOnRing)
{
boolean retval = true;
List<String> partitionDirs = null;
ImmutableSortedSet.Builder<KSafetyStats.StatsPoint> lackingReplication =
ImmutableSortedSet.naturalOrder();
try {
partitionDirs = m_zk.getChildren(VoltZK.leaders_initiators, null);
} catch (Exception e) {
VoltDB.crashLocalVoltDB("Unable to read partitions from ZK", true, e);
}
//Don't fetch the values serially do it asynchronously
Queue<ZKUtil.ByteArrayCallback> dataCallbacks = new ArrayDeque<ZKUtil.ByteArrayCallback>();
Queue<ZKUtil.ChildrenCallback> childrenCallbacks = new ArrayDeque<ZKUtil.ChildrenCallback>();
for (String partitionDir : partitionDirs) {
String dir = ZKUtil.joinZKPath(VoltZK.leaders_initiators, partitionDir);
try {
ZKUtil.ByteArrayCallback callback = new ZKUtil.ByteArrayCallback();
m_zk.getData(dir, false, callback, null);
dataCallbacks.offer(callback);
ZKUtil.ChildrenCallback childrenCallback = new ZKUtil.ChildrenCallback();
m_zk.getChildren(dir, false, childrenCallback, null);
childrenCallbacks.offer(childrenCallback);
} catch (Exception e) {
VoltDB.crashLocalVoltDB("Unable to read replicas in ZK dir: " + dir, true, e);
}
}
final long statTs = System.currentTimeMillis();
// Second pass: drain the callbacks in the same order the requests were
// issued above, so each poll() matches its partitionDir.
for (String partitionDir : partitionDirs) {
int pid = LeaderElector.getPartitionFromElectionDir(partitionDir);
String dir = ZKUtil.joinZKPath(VoltZK.leaders_initiators, partitionDir);
try {
// The data of the partition dir indicates whether the partition has finished
// initializing or not. If not, the replicas may still be in the process of
// adding themselves to the dir. So don't check for k-safety if that's the case.
byte[] partitionState = dataCallbacks.poll().getData();
boolean isInitializing = false;
if (partitionState != null && partitionState.length == 1) {
isInitializing = partitionState[0] == LeaderElector.INITIALIZING;
}
List<String> replicas = childrenCallbacks.poll().getChildren();
// The MP initiator's pseudo-partition is exempt from k-safety checks.
if (pid == MpInitiator.MP_INIT_PID) continue;
final boolean partitionNotOnHashRing = partitionNotOnHashRing(pid);
if (!isInitializing && replicas.isEmpty()) {
//These partitions can fail, just cleanup and remove the partition from the system
if (partitionNotOnHashRing) {
removeAndCleanupPartition(pid);
continue;
}
tmLog.fatal("K-Safety violation: No replicas found for partition: " + pid);
retval = false;
} else if (!partitionNotOnHashRing) {
//Record host ids for all partitions that are on the ring
//so they are considered for partition detection
for (String replica : replicas) {
final String split[] = replica.split("/");
final long hsId = Long.valueOf(split[split.length - 1].split("_")[0]);
final int hostId = CoreUtils.getHostIdFromHSId(hsId);
hostsOnRing.add(hostId);
}
}
if (!isInitializing && !partitionNotOnHashRing) {
// shortfall = how many replicas short of full k-safety this partition is
lackingReplication.add(
new KSafetyStats.StatsPoint(statTs, pid, m_kfactor + 1 - replicas.size())
);
}
}
catch (Exception e) {
VoltDB.crashLocalVoltDB("Unable to read replicas in ZK dir: " + dir, true, e);
}
}
m_stats.setSafetySet(lackingReplication.build());
return retval;
}
/**
 * Stop watching a partition that has been removed (no replicas and no longer
 * on the hash ring) and delete its state from ZK.  ZK deletion failures are
 * logged but not fatal, since the node set is being torn down anyway.
 */
private void removeAndCleanupPartition(int pid) {
    // BUGFIX: corrected "cleanup up" typo in the log message.
    tmLog.info("Removing and cleaning up partition info for partition " + pid);
    if (m_removedPartitionsAtPromotionTime != null) {
        // We're mid-promotion; record the removal so the promotion loop skips
        // this partition instead of racing with the cleanup.
        m_removedPartitionsAtPromotionTime.add(pid);
        tmLog.info("Partition " + pid + " was cleaned up during LeaderAppointer promotion and should be skipped");
    }
    BabySitter sitter = m_partitionWatchers.remove(pid);
    if (sitter != null) {
        sitter.shutdown();
    }
    m_callbacks.remove(pid);
    try {
        ZKUtil.asyncDeleteRecursively(m_zk, ZKUtil.joinZKPath(VoltZK.iv2masters, String.valueOf(pid)));
        ZKUtil.asyncDeleteRecursively(m_zk, ZKUtil.joinZKPath(VoltZK.iv2appointees, String.valueOf(pid)));
        ZKUtil.asyncDeleteRecursively(m_zk, ZKUtil.joinZKPath(VoltZK.leaders_initiators, "partition_" + String.valueOf(pid)));
    } catch (Exception e) {
        tmLog.error("Error removing partition info", e);
    }
}
// A partition is "not on the hash ring" when the hashinator assigns it no
// token ranges (e.g. it is being removed).  The legacy hashinator never
// removes partitions, so the answer there is always false.
private static boolean partitionNotOnHashRing(int pid) {
if (TheHashinator.getConfiguredHashinatorType() == TheHashinator.HashinatorType.LEGACY) return false;
return TheHashinator.getRanges(pid).isEmpty();
}
/**
 * Gets the initial cluster partition count on startup. This can only be called during
 * initialization. Calling this after initialization throws, because the partition count may
 * not reflect the actual partition count in the cluster.
 *
 * @return the partition count the cluster was started with
 * @throws IllegalAccessException if called after startup has completed
 */
private int getInitialPartitionCount() throws IllegalAccessException
{
    final AppointerState stateNow = m_state.get();
    final boolean stillStartingUp =
            stateNow == AppointerState.INIT || stateNow == AppointerState.CLUSTER_START;
    if (!stillStartingUp) {
        throw new IllegalAccessException("Getting cached partition count after cluster " +
                "startup");
    }
    return m_initialPartitionCount;
}
/** Mark command log replay as finished.  Until this is called, any node
 * failure observed by the babysitter callback is treated as fatal for the
 * whole cluster (see PartitionCallback.run()). */
public void onReplayCompletion()
{
m_replayComplete.set(true);
}
/**
 * Shut down the leader caches and all partition babysitters on the
 * appointer's executor, then drain the executor.  Blocks (effectively
 * forever — 356 days) until the queued shutdown work completes.
 */
public void shutdown()
{
    try {
        m_es.execute(new Runnable() {
            @Override
            public void run() {
                try {
                    m_iv2appointees.shutdown();
                    m_iv2masters.shutdown();
                    for (BabySitter watcher : m_partitionWatchers.values()) {
                        watcher.shutdown();
                    }
                } catch (Exception e) {
                    // don't care, we're going down
                }
            }
        });
        m_es.shutdown();
        m_es.awaitTermination(356, TimeUnit.DAYS);
    }
    catch (InterruptedException e) {
        // BUGFIX: restore the interrupt flag so callers/owners of this thread
        // can still observe the interruption.
        Thread.currentThread().interrupt();
        tmLog.warn("Unexpected interrupted exception", e);
    }
}
}