mirror of https://github.com/railgun-rs/actix-raft
Overhaul the election timeout system.
This uses an interval job instead of juggling a rescheduling process. It seems to offer quite a lot more stability. Along with the interval job, we are using std::time::Instants for performing the comparisons against the last received heartbeat. Closes #26
This commit is contained in:
parent
77aff0664e
commit
05af50e005
|
@ -1,5 +1,7 @@
|
|||
# Directory Ignores ##########################################################
|
||||
guide/book
|
||||
target
|
||||
vendor
|
||||
|
||||
# File Ignores ###############################################################
|
||||
**/*.rs.bk
|
||||
|
|
|
@ -2,6 +2,9 @@ changelog
|
|||
=========
|
||||
|
||||
### 0.3
|
||||
#### 0.3.1
|
||||
Overhauled the election timeout mechanism. This uses an interval job instead of juggling a rescheduling process. It seems to offer quite a lot more stability. Along with the interval job, we are using std::time::Instants for performing the comparisons against the last received heartbeat.
|
||||
|
||||
#### 0.3.0
|
||||
Another backwards incompatible change to the `RaftStorage` trait. It is now using associated types to better express the needed trait constraints. These changes were the final bit of work needed to get the entire actix-raft system to work with a synchronous `RaftStorage` impl. Async impls continue to work as they have, though the `RaftStorage` impl block will need to be updated to use the associated types. The recommended pattern is as follows:
|
||||
|
||||
|
|
|
@ -70,7 +70,7 @@ impl<D: AppData, E: AppError, N: RaftNetwork<D>, S: RaftStorage<D, E>> Handler<A
|
|||
}
|
||||
|
||||
// Update election timeout.
|
||||
self.update_election_timeout(ctx);
|
||||
self.update_election_timeout_stamp();
|
||||
|
||||
// Update current term if needed.
|
||||
if self.current_term != msg.term {
|
||||
|
|
|
@ -38,7 +38,7 @@ impl<D: AppData, E: AppError, N: RaftNetwork<D>, S: RaftStorage<D, E>> Handler<I
|
|||
}
|
||||
|
||||
// Update election timeout.
|
||||
self.update_election_timeout(ctx);
|
||||
self.update_election_timeout_stamp();
|
||||
|
||||
// Update current term if needed.
|
||||
if self.current_term != msg.term {
|
||||
|
|
|
@ -12,12 +12,12 @@ mod vote;
|
|||
use std::{
|
||||
collections::BTreeMap,
|
||||
sync::Arc,
|
||||
time::{Duration, SystemTime},
|
||||
time::{Duration, Instant},
|
||||
};
|
||||
|
||||
use actix::prelude::*;
|
||||
use futures::sync::{mpsc};
|
||||
use log::{debug, error};
|
||||
use log::{error};
|
||||
|
||||
use crate::{
|
||||
AppData, AppError, NodeId,
|
||||
|
@ -152,8 +152,8 @@ pub struct Raft<D: AppData, E: AppError, N: RaftNetwork<D>, S: RaftStorage<D, E>
|
|||
|
||||
/// A handle to the election timeout callback.
|
||||
election_timeout: Option<actix::SpawnHandle>,
|
||||
/// The currently scheduled timeout timestamp in millis.
|
||||
election_timeout_stamp: Option<u128>,
|
||||
/// The currently scheduled election timeout.
|
||||
election_timeout_stamp: Option<Instant>,
|
||||
}
|
||||
|
||||
impl<D: AppData, E: AppError, N: RaftNetwork<D>, S: RaftStorage<D, E>> Raft<D, E, N, S> {
|
||||
|
@ -516,12 +516,13 @@ impl<D: AppData, E: AppError, N: RaftNetwork<D>, S: RaftStorage<D, E>> Raft<D, E
|
|||
|
||||
/// Update the election timeout process.
|
||||
///
|
||||
/// This will run the nodes election timeout mechanism to ensure that elections are held if
|
||||
/// too much time passes before hearing from a leader or a candidate.
|
||||
/// This will schedule a new interval job based on the configured election timeout. The
|
||||
/// interval job will check to see if a campaign should be started based on when the last
|
||||
/// heartbeat was received from the Raft leader or a candidate.
|
||||
///
|
||||
/// The election timeout will be updated every time this node receives an RPC from the leader
|
||||
/// as well as any time a candidate node sends a RequestVote RPC. We reset on candidate RPCs
|
||||
/// iff the RPC is a valid vote request.
|
||||
/// The election timeout stamp will be updated every time this node receives an RPC from the
|
||||
/// leader as well as any time a candidate node sends a RequestVote RPC if it is a
|
||||
/// valid vote request.
|
||||
fn update_election_timeout(&mut self, ctx: &mut Context<Self>) {
|
||||
// Don't update if the cluster has this node configured as a non-voter.
|
||||
if !self.membership.contains(&self.id) || self.membership.non_voters.contains(&self.id) {
|
||||
|
@ -531,29 +532,24 @@ impl<D: AppData, E: AppError, N: RaftNetwork<D>, S: RaftStorage<D, E>> Raft<D, E
|
|||
// Cancel any current election timeout before spawning a new one.
|
||||
if let Some(handle) = self.election_timeout.take() {
|
||||
ctx.cancel_future(handle);
|
||||
self.election_timeout_stamp = None;
|
||||
}
|
||||
|
||||
let timeout = Duration::from_millis(self.config.election_timeout_millis);
|
||||
let now = SystemTime::now().duration_since(SystemTime::UNIX_EPOCH).unwrap();
|
||||
self.election_timeout_stamp = Some(now.as_millis() + self.config.election_timeout_millis as u128);
|
||||
self.election_timeout = Some(ctx.run_later(timeout, |act, ctx| {
|
||||
match act.election_timeout_stamp.take() {
|
||||
Some(stamp) if stamp <= SystemTime::now().duration_since(SystemTime::UNIX_EPOCH).unwrap().as_millis() => {
|
||||
if !act.state.is_non_voter() {
|
||||
act.become_candidate(ctx)
|
||||
}
|
||||
self.election_timeout_stamp = Some(Instant::now() + timeout.clone());
|
||||
self.election_timeout = Some(ctx.run_interval(timeout, |act, ctx| {
|
||||
if let Some(stamp) = &act.election_timeout_stamp {
|
||||
if &Instant::now() >= stamp {
|
||||
act.become_candidate(ctx)
|
||||
}
|
||||
Some(stamp) => {
|
||||
debug!("{} invoked election timeout prematurely.", act.id);
|
||||
// If the scheduled timeout is still in the future, put it back.
|
||||
act.election_timeout_stamp = Some(stamp);
|
||||
}
|
||||
None => return,
|
||||
}
|
||||
}));
|
||||
}
|
||||
|
||||
/// Update the election timeout stamp, typically due to receiving a heartbeat from the Raft leader.
|
||||
fn update_election_timeout_stamp(&mut self) {
|
||||
self.election_timeout_stamp = Some(Instant::now() + Duration::from_millis(self.config.election_timeout_millis));
|
||||
}
|
||||
|
||||
/// Update the node's current membership config.
|
||||
///
|
||||
/// NOTE WELL: if a leader is stepping down, it should not call this method, as it will cause
|
||||
|
|
|
@ -68,7 +68,7 @@ impl<D: AppData, E: AppError, N: RaftNetwork<D>, S: RaftStorage<D, E>> Raft<D, E
|
|||
None => {
|
||||
self.voted_for = Some(msg.candidate_id);
|
||||
self.save_hard_state(ctx);
|
||||
self.update_election_timeout(ctx);
|
||||
self.update_election_timeout_stamp();
|
||||
self.become_follower(ctx);
|
||||
Ok(VoteResponse{term: self.current_term, vote_granted: true, is_candidate_unknown: false})
|
||||
},
|
||||
|
|
|
@ -47,7 +47,7 @@ fn client_writes() {
|
|||
// Setup test controller and actions.
|
||||
let mut ctl = RaftTestController::new(network);
|
||||
ctl.register(0, node0.addr.clone()).register(1, node1.addr.clone()).register(2, node2.addr.clone());
|
||||
ctl.start_with_test(4, Box::new(|act, ctx| {
|
||||
ctl.start_with_test(10, Box::new(|act, ctx| {
|
||||
// Get the current leader.
|
||||
ctx.spawn(fut::wrap_future(act.network.send(GetCurrentLeader))
|
||||
.map_err(|_, _: &mut RaftTestController, _| panic!("Failed to get current leader."))
|
||||
|
|
|
@ -47,7 +47,7 @@ fn clustering() {
|
|||
// Setup test controller and actions.
|
||||
let mut ctl = RaftTestController::new(network);
|
||||
ctl.register(0, node0.addr.clone()).register(1, node1.addr.clone()).register(2, node2.addr.clone());
|
||||
ctl.start_with_test(5, Box::new(|act, ctx| {
|
||||
ctl.start_with_test(10, Box::new(|act, ctx| {
|
||||
let (tx0, rx0) = oneshot::channel();
|
||||
act.network.do_send(ExecuteInRaftRouter(Box::new(move |act, _| {
|
||||
let node0: &RaftMetrics = act.metrics.get(&0).unwrap();
|
||||
|
|
|
@ -51,7 +51,7 @@ fn dynamic_membership() {
|
|||
// Setup test controller and actions.
|
||||
let mut ctl = RaftTestController::new(network);
|
||||
ctl.register(0, node0.addr.clone()).register(1, node1.addr.clone()).register(2, node2.addr.clone());
|
||||
ctl.start_with_test(5, Box::new(|act, ctx| {
|
||||
ctl.start_with_test(10, Box::new(|act, ctx| {
|
||||
let task = act.write_data(ctx)
|
||||
// Data has been written to the new cluster, get the ID of the current leader.
|
||||
.and_then(|_, act, _| {
|
||||
|
@ -78,7 +78,7 @@ fn dynamic_membership() {
|
|||
.map(move |_, _, _| leader_id)
|
||||
})
|
||||
// Delay for a bit to ensure we can target a new leader in the test.
|
||||
.and_then(|leader_id, _, _| fut::wrap_future(Delay::new(Instant::now() + Duration::from_secs(3)))
|
||||
.and_then(|leader_id, _, _| fut::wrap_future(Delay::new(Instant::now() + Duration::from_secs(6)))
|
||||
.map_err(|_, _, _| ())
|
||||
.map(move |_, _, _| leader_id))
|
||||
// Remove old node.
|
||||
|
@ -91,7 +91,7 @@ fn dynamic_membership() {
|
|||
})
|
||||
// Write some additional data to the new leader.
|
||||
.and_then(|old_leader_id, act, ctx| act.write_data(ctx).map(move |_, _, _| old_leader_id))
|
||||
.and_then(|old_leader_id, _, _| fut::wrap_future(Delay::new(Instant::now() + Duration::from_secs(3))
|
||||
.and_then(|old_leader_id, _, _| fut::wrap_future(Delay::new(Instant::now() + Duration::from_secs(6))
|
||||
.map_err(|_| ()))
|
||||
.map(move |_, _, _| old_leader_id))
|
||||
|
||||
|
|
|
@ -174,7 +174,7 @@ impl NodeBuilder {
|
|||
let temp_dir = tempdir_in("/tmp").expect("Tempdir to be created without error.");
|
||||
let snapshot_dir = temp_dir.path().to_string_lossy().to_string();
|
||||
let config = Config::build(snapshot_dir.clone())
|
||||
.election_timeout_min(800).election_timeout_max(1000).heartbeat_interval(300)
|
||||
.election_timeout_min(1500).election_timeout_max(2000).heartbeat_interval(150)
|
||||
.metrics_rate(Duration::from_secs(metrics_rate))
|
||||
.snapshot_policy(snapshot_policy).snapshot_max_chunk_size(10000)
|
||||
.validate().expect("Raft config to be created without error.");
|
||||
|
@ -206,7 +206,7 @@ pub fn new_raft_node(id: NodeId, network: Addr<RaftRouter>, members: Vec<NodeId>
|
|||
let temp_dir = tempdir_in("/tmp").expect("Tempdir to be created without error.");
|
||||
let snapshot_dir = temp_dir.path().to_string_lossy().to_string();
|
||||
let config = Config::build(snapshot_dir.clone())
|
||||
.election_timeout_min(800).election_timeout_max(1000).heartbeat_interval(300)
|
||||
.election_timeout_min(1500).election_timeout_max(2000).heartbeat_interval(150)
|
||||
.metrics_rate(Duration::from_secs(metrics_rate))
|
||||
.snapshot_policy(SnapshotPolicy::Disabled).snapshot_max_chunk_size(10000)
|
||||
.validate().expect("Raft config to be created without error.");
|
||||
|
|
|
@ -47,7 +47,7 @@ fn initialization() {
|
|||
// Setup test controller and actions.
|
||||
let mut ctl = RaftTestController::new(network);
|
||||
ctl.register(0, node0.addr.clone()).register(1, node1.addr.clone()).register(2, node2.addr.clone());
|
||||
ctl.start_with_test(5, Box::new(|act, ctx| {
|
||||
ctl.start_with_test(10, Box::new(|act, ctx| {
|
||||
// Assert that all nodes are in NonVoter state with index 0.
|
||||
let task = fut::wrap_future(act.network.send(ExecuteInRaftRouter(Box::new(move |act, _| {
|
||||
for node in act.metrics.values() {
|
||||
|
|
|
@ -42,7 +42,7 @@ fn singlenode() {
|
|||
// Setup test controller and actions.
|
||||
let mut ctl = RaftTestController::new(network);
|
||||
ctl.register(0, node0.addr.clone());
|
||||
ctl.start_with_test(5, Box::new(|act, ctx| {
|
||||
ctl.start_with_test(10, Box::new(|act, ctx| {
|
||||
// Assert that all nodes are in NonVoter state with index 0.
|
||||
let task = fut::wrap_future(act.network.send(ExecuteInRaftRouter(Box::new(move |act, _| {
|
||||
for node in act.metrics.values() {
|
||||
|
|
|
@ -47,7 +47,7 @@ fn snapshotting() {
|
|||
// Setup test controller and actions.
|
||||
let mut ctl = RaftTestController::new(network);
|
||||
ctl.register(0, node0.addr.clone()).register(1, node1.addr.clone()).register(2, node2.addr.clone());
|
||||
ctl.start_with_test(5, Box::new(|act, ctx| {
|
||||
ctl.start_with_test(10, Box::new(|act, ctx| {
|
||||
// Isolate the current leader.
|
||||
let task = act.isolate_leader(ctx)
|
||||
// Wait for new leader to be elected.
|
||||
|
|
Loading…
Reference in New Issue