/*
* Copyright 2014 the original author or authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package net.kuujo.copycat.internal.state;
import net.kuujo.copycat.CopycatException;
import net.kuujo.copycat.CopycatState;
import net.kuujo.copycat.cluster.Cluster;
import net.kuujo.copycat.cluster.ClusterConfig;
import net.kuujo.copycat.event.MembershipChangeEvent;
import net.kuujo.copycat.internal.StateMachineExecutor;
import net.kuujo.copycat.internal.log.ConfigurationEntry;
import net.kuujo.copycat.internal.log.OperationEntry;
import net.kuujo.copycat.internal.replication.ClusterReplicator;
import net.kuujo.copycat.internal.replication.Replicator;
import net.kuujo.copycat.protocol.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.Observable;
import java.util.Observer;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.TimeUnit;
/**
* Leader state.<p>
*
* The leader state is assigned to the replica that has assumed
* the leadership role in the cluster through a cluster-wide election.
* The leader's role is to receive operation submissions, log them, and
* replicate state changes to the rest of the cluster. All state changes
* go through the leader for simplicity.
*
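* A rough sketch of the lifecycle (illustrative only; in practice the
* controller is driven by the surrounding state machinery rather than
* invoked directly):<p>
*
* <pre>
* LeaderController leader = new LeaderController();
* leader.init(context);  // apply pending entries, log the config, start heartbeats
* // ... the leader services ping, sync, and submit requests ...
* leader.destroy();      // cancel the ping timer, stop observing the cluster
* </pre>
*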
* @author <a href="http://github.com/kuujo">Jordan Halterman</a>
*/
public class LeaderController extends StateController implements Observer {
private static final Logger LOGGER = LoggerFactory.getLogger(LeaderController.class);
private ScheduledFuture<Void> currentTimer; // heartbeat timer; cancelled in destroy()
private Replicator replicator; // replicates log entries and pings to followers
@Override
CopycatState state() {
return CopycatState.LEADER;
}
@Override
Logger logger() {
return LOGGER;
}
@Override
public void init(StateContext context) {
super.init(context);
replicator = new ClusterReplicator(context);
// When the leader is first elected, it needs to commit any pending operations
// in its log to the state machine and then commit a snapshot to its log.
// This methodology differs slightly from the standard Raft algorithm. Instead
// of storing snapshots in a separate file, we store them as normal log entries.
// Using this methodology, *all* nodes should always have a snapshot as the
// first entry in their log, whether it be a local snapshot or a snapshot that
// was replicated by the leader. This greatly simplifies snapshot management as
// snapshots are simply replicated as a normal part of each node's log.
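// Apply each entry from the last applied index through the end of the log.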
int count = 0;
for (long i = context.lastApplied() + 1; i <= context.log().lastIndex(); i++) {
applyEntry(i);
count++;
}
LOGGER.debug("{} - Applied {} entries to state machine", context.clusterManager().localNode(), count);
// Ensure that the cluster configuration is up-to-date and properly
// replicated by committing the current configuration to the log. This will
// ensure that nodes' cluster configurations are consistent with the leader's.
ConfigurationEntry configEntry = new ConfigurationEntry(context.currentTerm(), context.clusterManager().cluster().config().copy());
long configIndex = context.log().appendEntry(configEntry);
LOGGER.debug("{} - Appended {} to log at index {}", context.clusterManager().localNode(), configEntry, configIndex);
// Start observing the user provided cluster configuration for changes.
// When the cluster configuration changes, changes will be committed to the
// log and replicated according to the Raft specification.
context.cluster().addObserver(this);
LOGGER.debug("{} - Observing {}", context.clusterManager().localNode(), context.cluster());
// Set this replica as the current leader.
context.currentLeader(context.clusterManager().localNode().member().id());
// Set a timer that will be used to periodically synchronize with other nodes
// in the cluster. This timer acts as a heartbeat to ensure this node remains
// the leader.
replicator.pingAll();
LOGGER.debug("{} - Setting ping timer", context.clusterManager().localNode());
setPingTimer();
}
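/**
* Receives change notifications from the observed {@link Cluster} and
* triggers the two-phase cluster configuration change below.
*/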
@Override
@SuppressWarnings("rawtypes")
public void update(Observable o, Object arg) {
clusterChanged(((Cluster) o).config());
}
/**
* Called when the cluster configuration has changed.
*/
@SuppressWarnings({"unchecked", "rawtypes"})
private synchronized void clusterChanged(final ClusterConfig cluster) {
// All cluster configuration changes must go through the leader. In order to
// perform cluster configuration changes, the leader observes the local cluster
// configuration if it is indeed observable. We have to be very careful about
// the order in which cluster configuration changes occur. If two configuration
// changes are taking place at the same time, one can overwrite the other.
// Additionally, if a new cluster configuration immediately overwrites an old
// configuration without first replicating a joint old/new configuration,
// a dual-majority can result, meaning logs will ultimately become out of sync.
// In order to avoid this, we need to perform a two-step configuration change:
// - First log the combined current cluster configuration and new cluster
// configuration. For instance, if a node was added to the cluster, the
// joint configuration is equivalent to the new configuration; if a node
// was removed, it is equivalent to the old configuration.
// - Once the joint cluster configuration has been replicated, log and
// sync the new configuration.
// This two-step process ensures log consistency by ensuring that two majorities
// cannot result from adding and removing too many nodes at once.
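// As an illustrative example: growing a three node cluster {A, B, C} to
// {A, B, C, D, E} first logs and replicates the joint configuration
// {A, B, C, D, E} (the union of the old and new memberships) before logging
// the new configuration itself, so two independent majorities cannot result
// from the change.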
LOGGER.debug("{} - Detected configuration change {}", context.clusterManager().localNode(), cluster);
// First, store a copy of both the current internal cluster configuration and
// the user defined cluster configuration. This ensures that mutable configurations
// are not changed during the reconfiguration process which can be asynchronous.
// Note also that we create a copy of the configuration in order to ensure that
// polymorphic types are properly reconstructed.
final ClusterConfig userConfig = cluster.copy();
final ClusterConfig internalConfig = context.clusterManager().cluster().config().copy();
// If another cluster configuration change is occurring right now, it's possible
// that the two configuration changes could overlap one another. In order to
// avoid this, we wait until all entries up to the current log index have been
// committed before beginning the configuration change. This ensures that any
// previous configuration changes have completed.
LOGGER.debug("{} - Committing all entries for configuration change", context.clusterManager().localNode());
replicator.commitAll().whenComplete((commitIndex, commitError) -> {
// First we need to create a joint old/new cluster configuration entry.
// We copy the internal configuration again for safety from modifications.
final ClusterConfig jointConfig = internalConfig.copy().addRemoteMembers(userConfig.getRemoteMembers());
// Append the joint configuration to the log. This will be replicated to
// followers and applied to their internal cluster managers.
ConfigurationEntry jointConfigEntry = new ConfigurationEntry(context.currentTerm(), jointConfig);
long configIndex = context.log().appendEntry(jointConfigEntry);
LOGGER.debug("{} - Appended {} to log at index {}", context.clusterManager().localNode(), jointConfigEntry, configIndex);
// Immediately after the entry is appended to the log, apply the joint
// configuration. Cluster membership changes do not wait for commitment.
// Since we're using a joint consensus, it's safe to work with all members
// of both the old and new configuration without causing split elections.
context.clusterManager().cluster().update(jointConfig, null);
context.events().membershipChange().handle(new MembershipChangeEvent(jointConfig.getMembers()));
LOGGER.debug("{} - Updated internal cluster configuration {}", context.clusterManager().localNode(), context.clusterManager().cluster());
// Once the cluster is updated, the replicator will be notified and update its
// internal connections. Then we commit the joint configuration and allow
// it to be replicated to all the nodes in the updated cluster.
LOGGER.debug("{} - Committing all entries for configuration change", context.clusterManager().localNode());
replicator.commit(configIndex).whenComplete((commitIndex2, commitError2) -> {
// Now that we've gotten to this point, we know that the combined cluster
// membership has been replicated to a majority of the cluster.
// Append the new user configuration to the log and force all replicas
// to be synchronized.
ConfigurationEntry newConfigEntry = new ConfigurationEntry(context.currentTerm(), userConfig);
long newConfigIndex = context.log().appendEntry(newConfigEntry);
LOGGER.debug("{} - Appended {} to log at index {}", context.clusterManager().localNode(), newConfigEntry, newConfigIndex);
// Again, once we've appended the new configuration to the log, update
// the local internal configuration.
context.clusterManager().cluster().update(userConfig, null);
context.events().membershipChange().handle(new MembershipChangeEvent(userConfig.getMembers()));
LOGGER.debug("{} - Updated internal cluster configuration {}", context.clusterManager().localNode(), context.clusterManager().cluster());
// Note again that when the cluster membership changes, the replicator will
// be notified and remove any replicas that are no longer a part of the cluster.
// Now that the cluster and replicator have been updated, we can commit the
// new configuration.
LOGGER.debug("{} - Committing all entries for configuration change", context.clusterManager().localNode());
replicator.commitAll();
});
});
}
/**
* Sets the ping (heartbeat) timer, which reschedules itself each interval.
*/
private void setPingTimer() {
currentTimer = context.config().getTimerStrategy().schedule(() -> {
replicator.pingAll();
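// Reschedule so heartbeats continue at the configured interval.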
setPingTimer();
}, context.config().getHeartbeatInterval(), TimeUnit.MILLISECONDS);
}
@Override
public CompletableFuture<PingResponse> ping(final PingRequest request) {
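// A ping with a greater term is handled by the parent controller, which
// updates the local term; a lesser term is rejected as stale; an equal term
// means another node holds leadership for this term, so step down to follower.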
if (request.term() > context.currentTerm()) {
return super.ping(request);
} else if (request.term() < context.currentTerm()) {
return CompletableFuture.completedFuture(logResponse(new PingResponse(logRequest(request).id(), context.currentTerm(), false)));
} else {
context.transition(FollowerController.class);
return super.ping(request);
}
}
@Override
public CompletableFuture<SyncResponse> sync(final SyncRequest request) {
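// As with pings, defer greater terms to the parent controller, reject
// lesser terms as stale, and step down to follower if another leader
// shares the current term.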
if (request.term() > context.currentTerm()) {
return super.sync(request);
} else if (request.term() < context.currentTerm()) {
return CompletableFuture.completedFuture(logResponse(new SyncResponse(logRequest(request).id(), context.currentTerm(), false, context.log().lastIndex())));
} else {
context.transition(FollowerController.class);
return super.sync(request);
}
}
@Override
public CompletableFuture<SubmitResponse> submit(final SubmitRequest request) {
logRequest(request);
CompletableFuture<SubmitResponse> future = new CompletableFuture<>();
// Find the named operation.
StateMachineExecutor.Operation operation = context.stateMachineExecutor().getOperation(request.operation());
// If the operation is unknown then immediately fail the returned future.
if (operation == null) {
future.completeExceptionally(new CopycatException("Invalid operation"));
}
// Depending on the operation type, read or write operations may or may not be replicated
// to a quorum based on configuration options. For write operations, if a quorum is
// required then the operation will be replicated. For read operations, if a quorum is
// required then we simply ping a quorum of the cluster to ensure that data is not stale.
else if (operation.isReadOnly()) {
// Users have the option of whether to allow stale data to be returned. By
// default, read quorums are enabled. If read quorums are disabled then we
// simply apply the operation, otherwise we need to ping a quorum of the
// cluster to ensure that data is up-to-date before responding.
if (context.config().isRequireQueryQuorum()) {
long lastIndex = context.log().lastIndex();
LOGGER.debug("{} - Synchronizing logs to index {} for read", context.clusterManager().localNode(), lastIndex);
replicator.ping(lastIndex).whenComplete((index, error) -> {
if (error == null) {
try {
future.complete(logResponse(new SubmitResponse(request.id(), operation.apply(request.args()))));
} catch (Exception e) {
future.completeExceptionally(e);
}
} else {
future.completeExceptionally(error);
}
});
} else {
try {
future.complete(logResponse(new SubmitResponse(request.id(), operation.apply(request.args()))));
} catch (Exception e) {
future.completeExceptionally(e);
}
}
} else {
// For write operations or for operations for which the type is not known, an
// entry must be logged, replicated, and committed prior to applying it
// to the state machine and returning the result.
OperationEntry entry = new OperationEntry(context.currentTerm(), request.operation(), request.args());
final long index = context.log().appendEntry(entry);
LOGGER.debug("{} - Appended {} to log at index {}", context.clusterManager().localNode(), entry, index);
// Write quorums are also optional for the user. The user can optionally
// indicate that write operations should be immediately applied to the state
// machine and the result returned.
if (context.config().isRequireCommandQuorum()) {
// If the replica requires write quorums, we simply set a task to be
// executed once the entry has been replicated to a quorum of the cluster.
LOGGER.debug("{} - Replicating logs up to index {} for write", context.clusterManager().localNode(), index);
replicator.commit(index).whenComplete((resultIndex, error) -> {
if (error == null) {
try {
future.complete(logResponse(new SubmitResponse(request.id(), operation.apply(request.args()))));
} catch (Exception e) {
future.completeExceptionally(e);
} finally {
context.lastApplied(index);
compactLog();
}
} else {
future.completeExceptionally(error);
}
});
} else {
// If write quorums are not required then just apply the entry and return
// the result. We don't need to check application order here since no
// entries written to the log require a quorum, and thus we won't be
// applying any entries out of order.
try {
future.complete(logResponse(new SubmitResponse(request.id(), operation.apply(request.args()))));
} catch (Exception e) {
future.completeExceptionally(e);
} finally {
context.lastApplied(index);
compactLog();
}
}
}
return future;
}
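/**
* Cancels the ping timer and stops observing the cluster configuration.
*/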
@Override
void destroy() {
if (currentTimer != null) {
LOGGER.debug("{} - Cancelling ping timer", context.clusterManager().localNode());
currentTimer.cancel(true);
}
// Stop observing the observable cluster configuration.
context.cluster().deleteObserver(this);
}
@Override
public boolean equals(Object object) {
return object instanceof LeaderController && ((StateController) object).context.equals(context);
}
@Override
public int hashCode() {
int hashCode = 23;
hashCode = 37 * hashCode + context.hashCode();
return hashCode;
}
@Override
public String toString() {
return String.format("LeaderController[context=%s]", context);
}
}