Prep for 0.5.1 release.

Painfully giving in to rustfmt ...

Update changelog.

Added some CI release automation.
Anthony Dodd 2020-10-05 21:46:44 -05:00
parent 4a7be31abd
commit d9e4691811
32 changed files with 1397 additions and 565 deletions

42
.github/workflows/release-memstore.yaml vendored Normal file
View File

@ -0,0 +1,42 @@
name: Release
on:
push:
tags:
- "memstore-v*"
jobs:
publish_memstore:
runs-on: ubuntu-latest
steps:
- name: Setup | Checkout
uses: actions/checkout@v2
- name: Setup | Rust
uses: actions-rs/toolchain@v1
with:
toolchain: stable
profile: minimal
override: true
- name: Publish | Memstore
run: cd memstore && cargo publish --token ${{ secrets.CRATES_IO_TOKEN }}
release:
needs: publish_memstore
runs-on: ubuntu-latest
steps:
- name: Setup | Checkout
uses: actions/checkout@v2
with:
fetch-depth: 0
- name: Setup | Create Release Log
run: cat CHANGELOG.md | tail -n +7 | head -n 25 > RELEASE_LOG.md
- name: Build | Publish Pre-Release
uses: softprops/action-gh-release@v1
with:
body_path: RELEASE_LOG.md
prerelease: true
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

42
.github/workflows/release-raft.yaml vendored Normal file
View File

@ -0,0 +1,42 @@
name: Release
on:
push:
tags:
- "async-raft-v*"
jobs:
publish_raft:
runs-on: ubuntu-latest
steps:
- name: Setup | Checkout
uses: actions/checkout@v2
- name: Setup | Rust
uses: actions-rs/toolchain@v1
with:
toolchain: stable
profile: minimal
override: true
- name: Publish | Async Raft
run: cd async-raft && cargo publish --token ${{ secrets.CRATES_IO_TOKEN }}
release:
needs: publish_raft
runs-on: ubuntu-latest
steps:
- name: Setup | Checkout
uses: actions/checkout@v2
with:
fetch-depth: 0
- name: Setup | Create Release Log
run: cat CHANGELOG.md | tail -n +7 | head -n 25 > RELEASE_LOG.md
- name: Build | Publish Pre-Release
uses: softprops/action-gh-release@v1
with:
body_path: RELEASE_LOG.md
prerelease: true
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

View File

@ -1,10 +1,14 @@
changelog
=========
This changelog follows the patterns described here: https://keepachangelog.com/en/1.0.0/.
## [unreleased]
## 0.5.1
### changed
- `ChangeConfigError::NodeNotLeader` now returns the ID of the current cluster leader if known.
- Fix off-by-one error in `get_log_entries` during the replication process.
- Added `#[derive(Serialize, Deserialize)]` to `Config`, `ConfigBuilder` & `SnapshotPolicy`.
## 0.5.0
### changed

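A note on the first 0.5.1 entry above: `ChangeConfigError::NodeNotLeader` now carries the current leader's ID when it is known (the Rust diff later in this commit sends `NodeNotLeader(self.current_leader)`), so callers can retry against the reported leader instead of probing nodes. A minimal, self-contained sketch of how a caller might use that; the enum here is a simplified mirror of the variants visible in this commit, not the crate's full error type, and the `handle` function is purely illustrative:

```rust
type NodeId = u64;

// Simplified mirror of the variants shown in this commit; the real error type has more variants.
#[derive(Debug)]
enum ChangeConfigError {
    NodeNotLeader(Option<NodeId>), // as of 0.5.1, carries the current leader's ID if known
    ConfigChangeInProgress,
    Noop,
}

// Hypothetical caller-side handling: retry against the reported leader when one is known.
fn handle(err: ChangeConfigError) {
    match err {
        ChangeConfigError::NodeNotLeader(Some(leader)) => println!("retry the request against node {}", leader),
        ChangeConfigError::NodeNotLeader(None) => println!("leader unknown; wait for an election and retry"),
        other => println!("not a forwarding case: {:?}", other),
    }
}

fn main() {
    handle(ChangeConfigError::NodeNotLeader(Some(2)));
}
```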
View File

@ -2,11 +2,6 @@ CONTRIBUTING
============
This is a Rust project, so [rustup](https://rustup.rs/) is the best place to start.
Check out the `.travis.yml` file to get an idea on how to run tests and the like.
### clippy
Haven't added clippy integration yet, but I am definitely planning on doing so. Don't run rustfmt ...
### the guide
The guide for this project is built using [mdBook](https://rust-lang-nursery.github.io/mdBook/index.html). Review their guide for more details on how to work with mdBook. Here are a few of the pertinents:
@ -23,3 +18,7 @@ mdbook watch
### release checklist
- Any documentation updates should also be reflected in the guide.
- Ensure the changelog is up-to-date.
- Ensure the Cargo.toml version for async-raft or memstore has been updated, depending on which is being released.
- Once the repo is in the desired state, push a tag matching the following pattern: `(async-raft|memstore)-v.+`.
- Once the release CI has finished, navigate to the release page, update the release info, and publish the release.

View File

@ -1,6 +1,6 @@
[package]
name = "async-raft"
version = "0.5.0"
version = "0.5.1"
edition = "2018"
authors = ["Anthony Dodd <Dodd.AnthonyJosiah@gmail.com>"]
categories = ["algorithms", "asynchronous", "data-structures"]

View File

@ -1,7 +1,7 @@
//! Raft runtime configuration.
use rand::{thread_rng, Rng};
use serde::{Serialize, Deserialize};
use serde::{Deserialize, Serialize};
use crate::error::ConfigError;
@ -111,7 +111,7 @@ impl Config {
/// The directory where the log snapshots are to be kept for a Raft node is required and must
/// be specified to start the config builder process.
pub fn build(cluster_name: String) -> ConfigBuilder {
ConfigBuilder{
ConfigBuilder {
cluster_name,
election_timeout_min: None,
election_timeout_max: None,
@ -135,7 +135,8 @@ impl Config {
/// the Raft spec is considered in order to set the appropriate values.
#[derive(Debug, Serialize, Deserialize)]
pub struct ConfigBuilder {
cluster_name: String,
/// The application specific name of this Raft cluster.
pub cluster_name: String,
/// The minimum election timeout, in milliseconds.
pub election_timeout_min: Option<u64>,
/// The maximum election timeout, in milliseconds.
@ -212,7 +213,7 @@ impl ConfigBuilder {
let replication_lag_threshold = self.replication_lag_threshold.unwrap_or(DEFAULT_REPLICATION_LAG_THRESHOLD);
let snapshot_policy = self.snapshot_policy.unwrap_or_else(SnapshotPolicy::default);
let snapshot_max_chunk_size = self.snapshot_max_chunk_size.unwrap_or(DEFAULT_SNAPSHOT_CHUNKSIZE);
Ok(Config{
Ok(Config {
cluster_name: self.cluster_name,
election_timeout_min,
election_timeout_max,
@ -255,7 +256,8 @@ mod tests {
.replication_lag_threshold(100)
.snapshot_max_chunk_size(200)
.snapshot_policy(SnapshotPolicy::LogsSinceLast(10000))
.validate().unwrap();
.validate()
.unwrap();
assert!(cfg.election_timeout_min >= 100);
assert!(cfg.election_timeout_max <= 200);
@ -269,7 +271,9 @@ mod tests {
#[test]
fn test_invalid_election_timeout_config_produces_expected_error() {
let res = Config::build("cluster0".into())
.election_timeout_min(1000).election_timeout_max(700).validate();
.election_timeout_min(1000)
.election_timeout_max(700)
.validate();
assert!(res.is_err());
let err = res.unwrap_err();
assert_eq!(err, ConfigError::InvalidElectionTimeoutMinMax);

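For context on the `Config`/`ConfigBuilder` changes above, here is the builder flow the tests exercise, as a standalone sketch. The chained setters and `validate()` come from the test code in this diff; the `async_raft::Config` re-export and the `serde_json` round-trip (enabled by the new `Serialize`/`Deserialize` derives noted in the changelog) are assumptions of this example:

```rust
use async_raft::Config; // assumed re-export of the config module's Config type

fn main() {
    // Build and validate a runtime config, mirroring the builder calls in the tests above.
    let cfg = Config::build("cluster0".into())
        .election_timeout_min(150)
        .election_timeout_max(300)
        .validate()
        .expect("valid config");

    // New in 0.5.1: Config derives Serialize/Deserialize, so it can be written to disk or
    // shipped between services. serde_json is used here only for illustration.
    let json = serde_json::to_string(&cfg).expect("config serializes");
    println!("{}", json);
}
```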
View File

@ -3,16 +3,16 @@ use std::collections::HashSet;
use futures::future::{FutureExt, TryFutureExt};
use tokio::sync::oneshot;
use crate::{AppData, AppDataResponse, NodeId, RaftNetwork, RaftStorage};
use crate::error::{InitializeError, ChangeConfigError, RaftError};
use crate::raft::{ChangeMembershipTx, ClientWriteRequest, MembershipConfig};
use crate::core::{ConsensusState, LeaderState, NonVoterReplicationState, NonVoterState, State, UpdateCurrentLeader};
use crate::core::client::ClientRequestEntry;
use crate::core::{ConsensusState, LeaderState, NonVoterReplicationState, NonVoterState, State, UpdateCurrentLeader};
use crate::error::{ChangeConfigError, InitializeError, RaftError};
use crate::raft::{ChangeMembershipTx, ClientWriteRequest, MembershipConfig};
use crate::replication::RaftEvent;
use crate::{AppData, AppDataResponse, NodeId, RaftNetwork, RaftStorage};
impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>> NonVoterState<'a, D, R, N, S> {
/// Handle the admin `init_with_config` command.
#[tracing::instrument(level="trace", skip(self))]
#[tracing::instrument(level = "trace", skip(self))]
pub(super) async fn handle_init_with_config(&mut self, mut members: HashSet<NodeId>) -> Result<(), InitializeError> {
if self.core.last_log_index != 0 || self.core.current_term != 0 {
tracing::error!({self.core.last_log_index, self.core.current_term}, "rejecting init_with_config request as last_log_index or current_term is not 0");
@ -26,7 +26,10 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>
// Build a new membership config from given init data & assign it as the new cluster
// membership config in memory only.
self.core.membership = MembershipConfig{members, members_after_consensus: None};
self.core.membership = MembershipConfig {
members,
members_after_consensus: None,
};
// Become a candidate and start campaigning for leadership. If this node is the only node
// in the cluster, then become leader without holding an election. If members len == 1, we
@ -47,13 +50,20 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>
impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>> LeaderState<'a, D, R, N, S> {
/// Add a new node to the cluster as a non-voter, bringing it up-to-speed, and then responding
/// on the given channel.
#[tracing::instrument(level="trace", skip(self, tx))]
#[tracing::instrument(level = "trace", skip(self, tx))]
pub(super) fn add_member(&mut self, target: NodeId, tx: oneshot::Sender<Result<(), ChangeConfigError>>) {
// Ensure the node doesn't already exist in the current config, in the set of new nodes
// already being synced, or in the nodes being removed.
if self.core.membership.members.contains(&target)
|| self.core.membership.members_after_consensus.as_ref().map(|new| new.contains(&target)).unwrap_or(false)
|| self.non_voters.contains_key(&target) {
|| self
.core
.membership
.members_after_consensus
.as_ref()
.map(|new| new.contains(&target))
.unwrap_or(false)
|| self.non_voters.contains_key(&target)
{
tracing::debug!("target node is already a cluster member or is being synced");
let _ = tx.send(Err(ChangeConfigError::Noop));
return;
@ -62,10 +72,17 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>
// Spawn a replication stream for the new member. Track state as a non-voter so that it
// can be updated to be added to the cluster config once it has been brought up-to-date.
let state = self.spawn_replication_stream(target);
self.non_voters.insert(target, NonVoterReplicationState{state, is_ready_to_join: false, tx: Some(tx)});
self.non_voters.insert(
target,
NonVoterReplicationState {
state,
is_ready_to_join: false,
tx: Some(tx),
},
);
}
#[tracing::instrument(level="trace", skip(self, tx))]
#[tracing::instrument(level = "trace", skip(self, tx))]
pub(super) async fn change_membership(&mut self, members: HashSet<NodeId>, tx: ChangeMembershipTx) {
// Ensure cluster will have at least one node.
if members.is_empty() {
@ -76,10 +93,10 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>
// Only allow config updates when currently in a uniform consensus state.
match &self.consensus_state {
ConsensusState::Uniform => (),
ConsensusState::NonVoterSync{..} | ConsensusState::Joint{..} => {
ConsensusState::NonVoterSync { .. } | ConsensusState::Joint { .. } => {
let _ = tx.send(Err(ChangeConfigError::ConfigChangeInProgress));
return;
},
}
}
// Check the proposed config for any new nodes. If ALL new nodes already have replication
@ -89,7 +106,8 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>
// Here, all we do is check to see which nodes still need to be synced, which determines
// whether we can proceed.
let diff = members.difference(&self.core.membership.members).cloned().collect::<Vec<_>>();
let awaiting = diff.into_iter()
let awaiting = diff
.into_iter()
.filter(|new_node| match self.non_voters.get(&new_node) {
Some(node) if node.is_ready_to_join => false,
Some(_) => true,
@ -97,7 +115,14 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>
// Spawn a replication stream for the new member. Track state as a non-voter so that it
// can be updated to be added to the cluster config once it has been brought up-to-date.
let state = self.spawn_replication_stream(*new_node);
self.non_voters.insert(*new_node, NonVoterReplicationState{state, is_ready_to_join: false, tx: None});
self.non_voters.insert(
*new_node,
NonVoterReplicationState {
state,
is_ready_to_join: false,
tx: None,
},
);
true
}
})
@ -105,7 +130,7 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>
// If there are new nodes which need to sync, then we need to wait until they are synced.
// Once they've finished, this routine will be called again to progress further.
if !awaiting.is_empty() {
self.consensus_state = ConsensusState::NonVoterSync{awaiting, members, tx};
self.consensus_state = ConsensusState::NonVoterSync { awaiting, members, tx };
return;
}
@ -113,7 +138,7 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>
if !members.contains(&self.core.id) {
self.is_stepping_down = true;
}
self.consensus_state = ConsensusState::Joint{is_committed: false};
self.consensus_state = ConsensusState::Joint { is_committed: false };
self.core.membership.members_after_consensus = Some(members);
// Propagate the command as any other client request.
@ -138,20 +163,22 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>
let res = rx_cfg_change
.map_err(|_| RaftError::ShuttingDown)
.into_future()
.then(|res| futures::future::ready(match res {
Ok(Ok(_)) => Ok(()),
Ok(Err(err)) => Err(ChangeConfigError::from(err)),
Err(err) => Err(ChangeConfigError::from(err)),
}))
.then(|res| {
futures::future::ready(match res {
Ok(Ok(_)) => Ok(()),
Ok(Err(err)) => Err(ChangeConfigError::from(err)),
Err(err) => Err(ChangeConfigError::from(err)),
})
})
.await;
let _ = tx.send(res);
});
}
/// Handle the commitment of a joint consensus cluster configuration.
#[tracing::instrument(level="trace", skip(self))]
#[tracing::instrument(level = "trace", skip(self))]
pub(super) async fn handle_joint_consensus_committed(&mut self) -> Result<(), RaftError> {
if let ConsensusState::Joint{is_committed, ..} = &mut self.consensus_state {
if let ConsensusState::Joint { is_committed, .. } = &mut self.consensus_state {
*is_committed = true; // Mark as committed.
}
// Only proceed to finalize this joint consensus if there are no remaining nodes being synced.
@ -162,7 +189,7 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>
}
/// Finalize the committed joint consensus.
#[tracing::instrument(level="trace", skip(self))]
#[tracing::instrument(level = "trace", skip(self))]
pub(super) async fn finalize_joint_consensus(&mut self) -> Result<(), RaftError> {
// Only proceed if it is safe to do so.
if !self.consensus_state.is_joint_consensus_safe_to_finalize() {
@ -200,7 +227,7 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>
}
/// Handle the commitment of a uniform consensus cluster configuration.
#[tracing::instrument(level="trace", skip(self))]
#[tracing::instrument(level = "trace", skip(self))]
pub(super) async fn handle_uniform_consensus_committed(&mut self, index: u64) -> Result<(), RaftError> {
// Step down if needed.
if self.is_stepping_down {
@ -214,7 +241,9 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>
// cluster members. All other replication streams which are no longer cluster members, but
// which have not yet replicated this config will be marked for removal.
let membership = &self.core.membership;
let nodes_to_remove: Vec<_> = self.nodes.iter_mut()
let nodes_to_remove: Vec<_> = self
.nodes
.iter_mut()
.filter(|(id, _)| !membership.contains(id))
.filter_map(|(idx, replstate)| {
if replstate.match_index >= index {
@ -223,9 +252,10 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>
replstate.remove_after_commit = Some(index);
None
}
}).collect();
})
.collect();
for node in nodes_to_remove {
tracing::debug!({target=node}, "removing target node from replication pool");
tracing::debug!({ target = node }, "removing target node from replication pool");
if let Some(node) = self.nodes.remove(&node) {
let _ = node.replstream.repltx.send(RaftEvent::Terminate);
}

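The membership-change handlers above walk through three consensus states: the cluster starts `Uniform`, optionally waits in `NonVoterSync` while new nodes are brought up to speed, enters `Joint { is_committed: false }` when the joint config is proposed, marks it committed once that config replicates, and only then finalizes back to a uniform config. A simplified sketch of that gate; the enum mirrors the variant names in this diff but drops the `members`/`tx` payloads the real variants carry:

```rust
// Simplified mirror of ConsensusState from this commit; the real variants carry more data.
#[derive(Debug)]
enum ConsensusState {
    Uniform,
    NonVoterSync { awaiting: Vec<u64> },
    Joint { is_committed: bool },
}

impl ConsensusState {
    /// Joint consensus may only be finalized once the joint config itself has been committed,
    /// matching is_joint_consensus_safe_to_finalize in this diff.
    fn is_joint_consensus_safe_to_finalize(&self) -> bool {
        matches!(self, ConsensusState::Joint { is_committed: true })
    }
}

fn main() {
    let mut state = ConsensusState::Joint { is_committed: false };
    assert!(!state.is_joint_consensus_safe_to_finalize());

    // handle_joint_consensus_committed flips the flag once the joint config replicates.
    if let ConsensusState::Joint { is_committed } = &mut state {
        *is_committed = true;
    }
    assert!(state.is_joint_consensus_safe_to_finalize());
    println!("safe to finalize: {:?}", state);
}
```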
View File

@ -1,7 +1,7 @@
use crate::{AppData, AppDataResponse, RaftNetwork, RaftStorage};
use crate::core::{RaftCore, State, UpdateCurrentLeader};
use crate::error::RaftResult;
use crate::raft::{AppendEntriesRequest, AppendEntriesResponse, ConflictOpt, Entry, EntryPayload};
use crate::core::{RaftCore, State, UpdateCurrentLeader};
use crate::{AppData, AppDataResponse, RaftNetwork, RaftStorage};
impl<D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>> RaftCore<D, R, N, S> {
/// An RPC invoked by the leader to replicate log entries (§5.3); also used as heartbeat (§5.2).
@ -15,7 +15,11 @@ impl<D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>> Ra
// If message's term is less than most recent term, then we do not honor the request.
if msg.term < self.current_term {
tracing::trace!({self.current_term, rpc_term=msg.term}, "AppendEntries RPC term is less than current term");
return Ok(AppendEntriesResponse{term: self.current_term, success: false, conflict_opt: None});
return Ok(AppendEntriesResponse {
term: self.current_term,
success: false,
conflict_opt: None,
});
}
// Update election timeout.
@ -52,7 +56,11 @@ impl<D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>> Ra
if report_metrics {
self.report_metrics();
}
return Ok(AppendEntriesResponse{term: self.current_term, success: true, conflict_opt: None});
return Ok(AppendEntriesResponse {
term: self.current_term,
success: true,
conflict_opt: None,
});
}
// Else, append log entries.
@ -61,7 +69,11 @@ impl<D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>> Ra
if report_metrics {
self.report_metrics();
}
return Ok(AppendEntriesResponse{term: self.current_term, success: true, conflict_opt: None});
return Ok(AppendEntriesResponse {
term: self.current_term,
success: true,
conflict_opt: None,
});
}
/////////////////////////////////////
@ -69,7 +81,11 @@ impl<D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>> Ra
tracing::trace!("begin log consistency check");
// Previous log info doesn't immediately line up, so perform log consistency check and proceed based on its result.
let entries = self.storage.get_log_entries(msg.prev_log_index, msg.prev_log_index).await.map_err(|err| self.map_fatal_storage_error(err))?;
let entries = self
.storage
.get_log_entries(msg.prev_log_index, msg.prev_log_index + 1)
.await
.map_err(|err| self.map_fatal_storage_error(err))?;
let target_entry = match entries.first() {
Some(target_entry) => target_entry,
// The target entry was not found. This can only mean that we don't have the
@ -78,9 +94,13 @@ impl<D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>> Ra
if report_metrics {
self.report_metrics();
}
return Ok(AppendEntriesResponse{
term: self.current_term, success: false,
conflict_opt: Some(ConflictOpt{term: self.last_log_term, index: self.last_log_index}),
return Ok(AppendEntriesResponse {
term: self.current_term,
success: false,
conflict_opt: Some(ConflictOpt {
term: self.last_log_term,
index: self.last_log_index,
}),
});
}
};
@ -90,8 +110,15 @@ impl<D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>> Ra
// We've found a point of agreement with the leader. If we have any logs present
// with an index greater than this, then we must delete them per §5.3.
if self.last_log_index > target_entry.index {
self.storage.delete_logs_from(target_entry.index + 1, None).await.map_err(|err| self.map_fatal_storage_error(err))?;
let membership = self.storage.get_membership_config().await.map_err(|err| self.map_fatal_storage_error(err))?;
self.storage
.delete_logs_from(target_entry.index + 1, None)
.await
.map_err(|err| self.map_fatal_storage_error(err))?;
let membership = self
.storage
.get_membership_config()
.await
.map_err(|err| self.map_fatal_storage_error(err))?;
self.update_membership(membership)?;
}
}
@ -99,15 +126,29 @@ impl<D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>> Ra
// entry of that payload which is still in the target term for conflict optimization.
else {
let start = if msg.prev_log_index >= 50 { msg.prev_log_index - 50 } else { 0 };
let old_entries = self.storage.get_log_entries(start, msg.prev_log_index).await.map_err(|err| self.map_fatal_storage_error(err))?;
let old_entries = self
.storage
.get_log_entries(start, msg.prev_log_index)
.await
.map_err(|err| self.map_fatal_storage_error(err))?;
let opt = match old_entries.iter().find(|entry| entry.term == msg.prev_log_term) {
Some(entry) => Some(ConflictOpt{term: entry.term, index: entry.index}),
None => Some(ConflictOpt{term: self.last_log_term, index: self.last_log_index}),
Some(entry) => Some(ConflictOpt {
term: entry.term,
index: entry.index,
}),
None => Some(ConflictOpt {
term: self.last_log_term,
index: self.last_log_index,
}),
};
if report_metrics {
self.report_metrics();
}
return Ok(AppendEntriesResponse{term: self.current_term, success: false, conflict_opt: opt});
return Ok(AppendEntriesResponse {
term: self.current_term,
success: false,
conflict_opt: opt,
});
}
///////////////////////////////////
@ -119,17 +160,22 @@ impl<D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>> Ra
if report_metrics {
self.report_metrics();
}
Ok(AppendEntriesResponse{term: self.current_term, success: true, conflict_opt: None})
Ok(AppendEntriesResponse {
term: self.current_term,
success: true,
conflict_opt: None,
})
}
/// Append the given entries to the log.
///
/// Configuration changes are also detected and applied here. See `configuration changes`
/// in the raft-essentials.md in this repo.
#[tracing::instrument(level="trace", skip(self, entries))]
#[tracing::instrument(level = "trace", skip(self, entries))]
async fn append_log_entries(&mut self, entries: &[Entry<D>]) -> RaftResult<()> {
// Check the given entries for any config changes and take the most recent.
let last_conf_change = entries.iter()
let last_conf_change = entries
.iter()
.filter_map(|ent| match &ent.payload {
EntryPayload::ConfigChange(conf) => Some(conf),
_ => None,
@ -141,7 +187,10 @@ impl<D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>> Ra
};
// Replicate entries to log (same as append, but in follower mode).
self.storage.replicate_to_log(entries).await.map_err(|err| self.map_fatal_storage_error(err))?;
self.storage
.replicate_to_log(entries)
.await
.map_err(|err| self.map_fatal_storage_error(err))?;
if let Some(entry) = entries.last() {
self.last_log_index = entry.index;
self.last_log_term = entry.term;
@ -150,17 +199,22 @@ impl<D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>> Ra
}
/// Replicate outstanding logs to the state machine if needed.
#[tracing::instrument(level="trace", skip(self, report_metrics))]
#[tracing::instrument(level = "trace", skip(self, report_metrics))]
async fn replicate_to_state_machine_if_needed(&mut self, report_metrics: &mut bool) -> RaftResult<()> {
if self.commit_index > self.last_applied {
// Fetch the series of entries which must be applied to the state machine, and apply them.
let stop = std::cmp::min(self.commit_index, self.last_log_index) + 1;
let entries = self.storage.get_log_entries(self.last_applied + 1, stop).await.map_err(|err| self.map_fatal_storage_error(err))?;
let entries = self
.storage
.get_log_entries(self.last_applied + 1, stop)
.await
.map_err(|err| self.map_fatal_storage_error(err))?;
if let Some(entry) = entries.last() {
self.last_applied = entry.index;
*report_metrics = true;
}
let data_entries: Vec<_> = entries.iter()
let data_entries: Vec<_> = entries
.iter()
.filter_map(|entry| match &entry.payload {
EntryPayload::Normal(inner) => Some((&entry.index, &inner.data)),
_ => None,
@ -169,7 +223,10 @@ impl<D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>> Ra
if data_entries.is_empty() {
return Ok(());
}
self.storage.replicate_to_state_machine(&data_entries).await.map_err(|err| self.map_fatal_storage_error(err))?;
self.storage
.replicate_to_state_machine(&data_entries)
.await
.map_err(|err| self.map_fatal_storage_error(err))?;
// Request async compaction, if needed.
self.trigger_log_compaction_if_needed();

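The change from `get_log_entries(msg.prev_log_index, msg.prev_log_index)` to `get_log_entries(msg.prev_log_index, msg.prev_log_index + 1)` above is the off-by-one fix called out in the changelog. The other call sites in this file pass an exclusive upper bound (for example `stop = min(commit_index, last_log_index) + 1`), so the range is half-open; passing `prev_log_index` twice asked for an empty range. A small self-contained illustration; the slice-backed `get_log_entries` below is a stand-in, not the storage trait's implementation:

```rust
// Stand-in for RaftStorage::get_log_entries with half-open [start, stop) semantics.
// Entries are modeled as bare term numbers for brevity.
fn get_log_entries(log: &[u64], start: u64, stop: u64) -> &[u64] {
    &log[start as usize..stop as usize]
}

fn main() {
    let log = vec![1, 1, 2, 3, 3];
    let prev_log_index = 2u64;

    // Old call: start == stop, so the consistency check always saw an empty result.
    assert!(get_log_entries(&log, prev_log_index, prev_log_index).is_empty());

    // Fixed call: stop = prev_log_index + 1 actually returns the entry at prev_log_index.
    assert_eq!(get_log_entries(&log, prev_log_index, prev_log_index + 1), &[2u64][..]);

    println!("half-open range semantics confirmed");
}
```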
View File

@ -1,18 +1,18 @@
use std::sync::Arc;
use anyhow::anyhow;
use futures::stream::FuturesUnordered;
use futures::future::TryFutureExt;
use futures::stream::FuturesUnordered;
use tokio::stream::StreamExt;
use tokio::sync::oneshot;
use tokio::time::{Duration, timeout};
use tokio::time::{timeout, Duration};
use crate::{AppData, AppDataResponse, RaftNetwork, RaftStorage};
use crate::core::{LeaderState, State};
use crate::error::{ClientReadError, ClientWriteError, RaftError, RaftResult};
use crate::raft::{ClientWriteRequest, ClientWriteResponse, ClientReadResponseTx, ClientWriteResponseTx, Entry, EntryPayload};
use crate::raft::{AppendEntriesRequest};
use crate::raft::AppendEntriesRequest;
use crate::raft::{ClientReadResponseTx, ClientWriteRequest, ClientWriteResponse, ClientWriteResponseTx, Entry, EntryPayload};
use crate::replication::RaftEvent;
use crate::{AppData, AppDataResponse, RaftNetwork, RaftStorage};
/// A wrapper around a ClientRequest which has been transformed into an Entry, along with its response channel.
pub(super) struct ClientRequestEntry<D: AppData, R: AppDataResponse> {
@ -28,7 +28,10 @@ pub(super) struct ClientRequestEntry<D: AppData, R: AppDataResponse> {
impl<D: AppData, R: AppDataResponse> ClientRequestEntry<D, R> {
/// Create a new instance from the raw components of a client request.
pub(crate) fn from_entry<T: Into<ClientOrInternalResponseTx<D, R>>>(entry: Entry<D>, tx: T) -> Self {
Self{entry: Arc::new(entry), tx: tx.into()}
Self {
entry: Arc::new(entry),
tx: tx.into(),
}
}
}
@ -41,7 +44,7 @@ pub enum ClientOrInternalResponseTx<D: AppData, R: AppDataResponse> {
impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>> LeaderState<'a, D, R, N, S> {
/// Commit the initial entry which new leaders are obligated to create when first coming to power, per §8.
#[tracing::instrument(level="trace", skip(self))]
#[tracing::instrument(level = "trace", skip(self))]
pub(super) async fn commit_initial_leader_entry(&mut self) -> RaftResult<()> {
// If the cluster has just formed, and the current index is 0, then commit the current
// config, else a blank payload.
@ -93,17 +96,21 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>
/// consensus. Each request will have a timeout, and we respond once we have a majority
/// agreement from each config group. Most of the time, we will have a single uniform
/// config group.
#[tracing::instrument(level="trace", skip(self, tx))]
#[tracing::instrument(level = "trace", skip(self, tx))]
pub(super) async fn handle_client_read_request(&mut self, tx: ClientReadResponseTx) {
// Setup sentinel values to track when we've received majority confirmation of leadership.
let len_members = self.core.membership.members.len();
let mut c0_confirmed = 0usize;
let c0_needed: usize = if (len_members % 2) == 0 { (len_members/2)-1 } else { len_members/2 };
let c0_needed: usize = if (len_members % 2) == 0 {
(len_members / 2) - 1
} else {
len_members / 2
};
let mut c1_confirmed = 0usize;
let mut c1_needed = 0usize;
if let Some(joint_members) = &self.core.membership.members_after_consensus {
let len = joint_members.len(); // Will never be zero, as we don't allow it when proposing config changes.
c1_needed = if (len % 2) == 0 { (len/2)-1 } else { len/2 };
c1_needed = if (len % 2) == 0 { (len / 2) - 1 } else { len / 2 };
}
// As long as we are not about to step down, then increment for our vote.
@ -111,7 +118,14 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>
if self.core.membership.members.contains(&self.core.id) {
c0_confirmed += 1;
}
if self.core.membership.members_after_consensus.as_ref().map(|members| members.contains(&self.core.id)).unwrap_or(false) {
if self
.core
.membership
.members_after_consensus
.as_ref()
.map(|members| members.contains(&self.core.id))
.unwrap_or(false)
{
c1_confirmed += 1;
}
}
@ -119,7 +133,7 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>
// Spawn parallel requests, all with the standard timeout for heartbeats.
let mut pending = FuturesUnordered::new();
for (id, node) in self.nodes.iter() {
let rpc = AppendEntriesRequest{
let rpc = AppendEntriesRequest {
term: self.core.current_term,
leader_id: self.core.id,
prev_log_index: node.match_index,
@ -136,7 +150,8 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>
Ok(Err(err)) => Err((target, err)),
Err(_timeout) => Err((target, anyhow!("timeout waiting for leadership confirmation"))),
}
}).map_err(move |err| (*id, err));
})
.map_err(move |err| (*id, err));
pending.push(task);
}
@ -149,7 +164,7 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>
continue;
}
Err((target, err)) => {
tracing::error!({target}, "{}", err);
tracing::error!({ target }, "{}", err);
continue;
}
};
@ -164,7 +179,14 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>
if self.core.membership.members.contains(&target) {
c0_confirmed += 1;
}
if self.core.membership.members_after_consensus.as_ref().map(|members| members.contains(&target)).unwrap_or(false) {
if self
.core
.membership
.members_after_consensus
.as_ref()
.map(|members| members.contains(&target))
.unwrap_or(false)
{
c1_confirmed += 1;
}
if c0_confirmed >= c0_needed && c1_confirmed >= c1_needed {
@ -175,13 +197,13 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>
// If we've hit this location, then we've failed to gather needed confirmations due to
// request failures.
let _ = tx.send(Err(ClientReadError::RaftError(
RaftError::RaftNetwork(anyhow!("too many requests failed, could not confirm leadership"))
)));
let _ = tx.send(Err(ClientReadError::RaftError(RaftError::RaftNetwork(anyhow!(
"too many requests failed, could not confirm leadership"
)))));
}
/// Handle client write requests.
#[tracing::instrument(level="trace", skip(self, rpc, tx))]
#[tracing::instrument(level = "trace", skip(self, rpc, tx))]
pub(super) async fn handle_client_write_request(&mut self, rpc: ClientWriteRequest<D>, tx: ClientWriteResponseTx<D, R>) {
let entry = match self.append_payload_to_log(rpc.entry).await {
Ok(entry) => ClientRequestEntry::from_entry(entry, tx),
@ -194,10 +216,18 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>
}
/// Transform the given payload into an entry, assign an index and term, and append the entry to the log.
#[tracing::instrument(level="trace", skip(self, payload))]
#[tracing::instrument(level = "trace", skip(self, payload))]
pub(super) async fn append_payload_to_log(&mut self, payload: EntryPayload<D>) -> RaftResult<Entry<D>> {
let entry = Entry{index: self.core.last_log_index + 1, term: self.core.current_term, payload};
self.core.storage.append_entry_to_log(&entry).await.map_err(|err| self.core.map_fatal_storage_error(err))?;
let entry = Entry {
index: self.core.last_log_index + 1,
term: self.core.current_term,
payload,
};
self.core
.storage
.append_entry_to_log(&entry)
.await
.map_err(|err| self.core.map_fatal_storage_error(err))?;
self.core.last_log_index = entry.index;
Ok(entry)
}
@ -207,7 +237,7 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>
/// NOTE WELL: this routine does not wait for the request to actually finish replication, it
/// merely begins the process. Once the request is committed to the cluster, its response will
/// be generated asynchronously.
#[tracing::instrument(level="trace", skip(self, req))]
#[tracing::instrument(level = "trace", skip(self, req))]
pub(super) async fn replicate_client_request(&mut self, req: ClientRequestEntry<D, R>) {
// Replicate the request if there are other cluster members. The client response will be
// returned elsewhere after the entry has been committed to the cluster.
@ -215,7 +245,7 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>
if !self.nodes.is_empty() {
self.awaiting_committed.push(req);
for node in self.nodes.values() {
let _ = node.replstream.repltx.send(RaftEvent::Replicate{
let _ = node.replstream.repltx.send(RaftEvent::Replicate {
entry: entry_arc.clone(),
commit_index: self.core.commit_index,
});
@ -230,7 +260,7 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>
// Replicate to non-voters.
if !self.non_voters.is_empty() {
for node in self.non_voters.values() {
let _ = node.state.replstream.repltx.send(RaftEvent::Replicate{
let _ = node.state.replstream.repltx.send(RaftEvent::Replicate {
entry: entry_arc.clone(),
commit_index: self.core.commit_index,
});
@ -239,21 +269,22 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>
}
/// Handle the post-commit logic for a client request.
#[tracing::instrument(level="trace", skip(self, req))]
#[tracing::instrument(level = "trace", skip(self, req))]
pub(super) async fn client_request_post_commit(&mut self, req: ClientRequestEntry<D, R>) {
match req.tx {
// If this is a client response channel, then it means that we are dealing with
ClientOrInternalResponseTx::Client(tx) => match &req.entry.payload {
EntryPayload::Normal(inner) => {
match self.apply_entry_to_state_machine(&req.entry.index, &inner.data).await {
Ok(data) => {
let _ = tx.send(Ok(ClientWriteResponse{index: req.entry.index, data}));
}
Err(err) => {
let _ = tx.send(Err(ClientWriteError::RaftError(err)));
}
EntryPayload::Normal(inner) => match self.apply_entry_to_state_machine(&req.entry.index, &inner.data).await {
Ok(data) => {
let _ = tx.send(Ok(ClientWriteResponse {
index: req.entry.index,
data,
}));
}
}
Err(err) => {
let _ = tx.send(Err(ClientWriteError::RaftError(err)));
}
},
_ => {
// Why is this a bug, and why are we shutting down? This is because we can not easily
// encode these constraints in the type system, and client requests should be the only
@ -262,7 +293,7 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>
tracing::error!("critical error in Raft, this is a programming bug, please open an issue");
self.core.set_target_state(State::Shutdown);
}
}
},
ClientOrInternalResponseTx::Internal(tx) => {
self.core.last_applied = req.entry.index;
self.core.report_metrics();
@ -275,7 +306,7 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>
}
/// Apply the given log entry to the state machine.
#[tracing::instrument(level="trace", skip(self, entry))]
#[tracing::instrument(level = "trace", skip(self, entry))]
pub(super) async fn apply_entry_to_state_machine(&mut self, index: &u64, entry: &D) -> RaftResult<R> {
// First, we just ensure that we apply any outstanding up to, but not including, the index
// of the given entry. We need to be able to return the data response from applying this
@ -284,23 +315,38 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>
// Note that this would only ever happen if a node had unapplied logs from before becoming leader.
let expected_next_index = self.core.last_applied + 1;
if index != &expected_next_index {
let entries = self.core.storage.get_log_entries(expected_next_index, *index).await.map_err(|err| self.core.map_fatal_storage_error(err))?;
let entries = self
.core
.storage
.get_log_entries(expected_next_index, *index)
.await
.map_err(|err| self.core.map_fatal_storage_error(err))?;
if let Some(entry) = entries.last() {
self.core.last_applied = entry.index;
}
let data_entries: Vec<_> = entries.iter()
let data_entries: Vec<_> = entries
.iter()
.filter_map(|entry| match &entry.payload {
EntryPayload::Normal(inner) => Some((&entry.index, &inner.data)),
_ => None,
})
.collect();
if !data_entries.is_empty() {
self.core.storage.replicate_to_state_machine(&data_entries).await.map_err(|err| self.core.map_fatal_storage_error(err))?;
self.core
.storage
.replicate_to_state_machine(&data_entries)
.await
.map_err(|err| self.core.map_fatal_storage_error(err))?;
}
}
// Apply this entry to the state machine and return its data response.
let res = self.core.storage.apply_entry_to_state_machine(index, entry).await.map_err(|err| self.core.map_fatal_storage_error(err))?;
let res = self
.core
.storage
.apply_entry_to_state_machine(index, entry)
.await
.map_err(|err| self.core.map_fatal_storage_error(err))?;
self.core.last_applied = *index;
self.core.report_metrics();
Ok(res)

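The tail of `apply_entry_to_state_machine` above applies any outstanding entries strictly before the target index, then the target entry itself, keeping `last_applied` in step. A hedged, self-contained sketch of that catch-up; the `Vec<String>` state machine, string entries, and 1-based indexing are stand-ins for the storage trait:

```rust
// Stand-in for the catch-up in apply_entry_to_state_machine: `log` holds entries 1..=len,
// `sm` is the applied state, and `last_applied` tracks the highest applied index.
fn apply_through(log: &[&str], last_applied: &mut u64, sm: &mut Vec<String>, index: u64) {
    let expected_next_index = *last_applied + 1;
    if index != expected_next_index {
        // Apply the outstanding range [expected_next_index, index) first.
        for i in expected_next_index..index {
            sm.push(log[(i - 1) as usize].to_string());
        }
        *last_applied = index - 1;
    }
    // Now apply the target entry and record it as applied.
    sm.push(log[(index - 1) as usize].to_string());
    *last_applied = index;
}

fn main() {
    let log = ["set x=1", "set y=2", "set z=3", "del y"];
    let (mut last_applied, mut sm) = (1u64, vec!["set x=1".to_string()]);

    // Target index 4 while only index 1 has been applied: entries 2 and 3 are caught up first.
    apply_through(&log, &mut last_applied, &mut sm, 4);
    assert_eq!(last_applied, 4);
    assert_eq!(sm, vec!["set x=1", "set y=2", "set z=3", "del y"]);
}
```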
View File

@ -2,10 +2,10 @@ use std::io::SeekFrom;
use tokio::io::{AsyncSeekExt, AsyncWriteExt};
use crate::{AppData, AppDataResponse, RaftNetwork, RaftStorage};
use crate::core::{State, RaftCore, SnapshotState, UpdateCurrentLeader};
use crate::core::{RaftCore, SnapshotState, State, UpdateCurrentLeader};
use crate::error::RaftResult;
use crate::raft::{InstallSnapshotRequest, InstallSnapshotResponse};
use crate::{AppData, AppDataResponse, RaftNetwork, RaftStorage};
impl<D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>> RaftCore<D, R, N, S> {
/// Invoked by leader to send chunks of a snapshot to a follower (§7).
@ -13,11 +13,11 @@ impl<D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>> Ra
/// Leaders always send chunks in order. It is important to note that, according to the Raft spec,
/// a log may only have one snapshot at any time. As snapshot contents are application specific,
/// the Raft log will only store a pointer to the snapshot file along with the index & term.
#[tracing::instrument(level="trace", skip(self, req))]
#[tracing::instrument(level = "trace", skip(self, req))]
pub(super) async fn handle_install_snapshot_request(&mut self, req: InstallSnapshotRequest) -> RaftResult<InstallSnapshotResponse> {
// If message's term is less than most recent term, then we do not honor the request.
if req.term < self.current_term {
return Ok(InstallSnapshotResponse{term: self.current_term});
return Ok(InstallSnapshotResponse { term: self.current_term });
}
// Update election timeout.
@ -49,43 +49,43 @@ impl<D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>> Ra
// Compare current snapshot state with received RPC and handle as needed.
match self.snapshot_state.take() {
None => Ok(self.begin_installing_snapshot(req).await?),
Some(SnapshotState::Snapshotting{handle, ..}) => {
Some(SnapshotState::Snapshotting { handle, .. }) => {
handle.abort(); // Abort the current compaction in favor of installation from leader.
Ok(self.begin_installing_snapshot(req).await?)
}
Some(SnapshotState::Streaming{snapshot, id, offset}) => Ok(self.continue_installing_snapshot(req, offset, id, snapshot).await?),
Some(SnapshotState::Streaming { snapshot, id, offset }) => Ok(self.continue_installing_snapshot(req, offset, id, snapshot).await?),
}
}
#[tracing::instrument(level="trace", skip(self, req))]
#[tracing::instrument(level = "trace", skip(self, req))]
async fn begin_installing_snapshot(&mut self, req: InstallSnapshotRequest) -> RaftResult<InstallSnapshotResponse> {
// Create a new snapshot and begin writing its contents.
let (id, mut snapshot) = self.storage.create_snapshot().await
.map_err(|err| self.map_fatal_storage_error(err))?;
let (id, mut snapshot) = self.storage.create_snapshot().await.map_err(|err| self.map_fatal_storage_error(err))?;
snapshot.as_mut().write_all(&req.data).await?;
// If this was a small snapshot, and it is already done, then finish up.
if req.done {
self.finalize_snapshot_installation(req, id, snapshot).await?;
return Ok(InstallSnapshotResponse{term: self.current_term});
return Ok(InstallSnapshotResponse { term: self.current_term });
}
// Else, retain snapshot components for later segments & respond.
self.snapshot_state = Some(SnapshotState::Streaming{
self.snapshot_state = Some(SnapshotState::Streaming {
offset: req.data.len() as u64,
id, snapshot,
id,
snapshot,
});
return Ok(InstallSnapshotResponse{term: self.current_term});
return Ok(InstallSnapshotResponse { term: self.current_term });
}
#[tracing::instrument(level="trace", skip(self, req, offset, snapshot))]
#[tracing::instrument(level = "trace", skip(self, req, offset, snapshot))]
async fn continue_installing_snapshot(
&mut self, req: InstallSnapshotRequest, mut offset: u64, id: String, mut snapshot: Box<S::Snapshot>,
) -> RaftResult<InstallSnapshotResponse> {
// Always seek to the target offset if not an exact match.
if req.offset != offset {
if let Err(err) = snapshot.as_mut().seek(SeekFrom::Start(req.offset)).await {
self.snapshot_state = Some(SnapshotState::Streaming{offset, id, snapshot});
self.snapshot_state = Some(SnapshotState::Streaming { offset, id, snapshot });
return Err(err.into());
}
offset = req.offset;
@ -93,7 +93,7 @@ impl<D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>> Ra
// Write the next segment & update offset.
if let Err(err) = snapshot.as_mut().write_all(&req.data).await {
self.snapshot_state = Some(SnapshotState::Streaming{offset, id, snapshot});
self.snapshot_state = Some(SnapshotState::Streaming { offset, id, snapshot });
return Err(err.into());
}
offset += req.data.len() as u64;
@ -102,25 +102,35 @@ impl<D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>> Ra
if req.done {
self.finalize_snapshot_installation(req, id, snapshot).await?;
} else {
self.snapshot_state = Some(SnapshotState::Streaming{offset, id, snapshot});
self.snapshot_state = Some(SnapshotState::Streaming { offset, id, snapshot });
}
return Ok(InstallSnapshotResponse{term: self.current_term});
return Ok(InstallSnapshotResponse { term: self.current_term });
}
/// Finalize the installation of a new snapshot.
///
/// Any errors which come up from this routine will cause the Raft node to go into shutdown.
#[tracing::instrument(level="trace", skip(self, req, snapshot))]
#[tracing::instrument(level = "trace", skip(self, req, snapshot))]
async fn finalize_snapshot_installation(&mut self, req: InstallSnapshotRequest, id: String, mut snapshot: Box<S::Snapshot>) -> RaftResult<()> {
snapshot.as_mut().shutdown().await.map_err(|err| self.map_fatal_storage_error(err.into()))?;
snapshot
.as_mut()
.shutdown()
.await
.map_err(|err| self.map_fatal_storage_error(err.into()))?;
let delete_through = if self.last_log_index > req.last_included_index {
Some(req.last_included_index)
} else {
None
};
self.storage.finalize_snapshot_installation(req.last_included_index, req.last_included_term, delete_through, id, snapshot).await
self.storage
.finalize_snapshot_installation(req.last_included_index, req.last_included_term, delete_through, id, snapshot)
.await
.map_err(|err| self.map_fatal_storage_error(err))?;
let membership = self
.storage
.get_membership_config()
.await
.map_err(|err| self.map_fatal_storage_error(err))?;
let membership = self.storage.get_membership_config().await.map_err(|err| self.map_fatal_storage_error(err))?;
self.update_membership(membership)?;
self.last_log_index = req.last_included_index;
self.last_log_term = req.last_included_term;

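The offset bookkeeping in `continue_installing_snapshot` above, in a hedged, self-contained form: seek to the chunk's declared offset when it does not match the running offset, write the segment, then advance. An in-memory `Cursor<Vec<u8>>` stands in for the storage-provided snapshot writer, and the example chunks are illustrative:

```rust
use std::io::{Cursor, Seek, SeekFrom, Write};

// Illustrative chunk shape; the real InstallSnapshotRequest carries term, leader_id, done, etc.
struct Chunk {
    offset: u64,
    data: Vec<u8>,
}

fn apply_chunk(snapshot: &mut Cursor<Vec<u8>>, current_offset: &mut u64, chunk: &Chunk) -> std::io::Result<()> {
    // Seek to the chunk's offset if it does not line up with where the last write ended.
    if chunk.offset != *current_offset {
        snapshot.seek(SeekFrom::Start(chunk.offset))?;
        *current_offset = chunk.offset;
    }
    // Write the next segment and advance the running offset.
    snapshot.write_all(&chunk.data)?;
    *current_offset += chunk.data.len() as u64;
    Ok(())
}

fn main() -> std::io::Result<()> {
    let mut snapshot = Cursor::new(Vec::new());
    let mut offset = 0u64;

    // The leader re-sends the second segment from offset 3; the seek keeps the data consistent.
    let chunks = [
        Chunk { offset: 0, data: b"abc".to_vec() },
        Chunk { offset: 3, data: b"def".to_vec() },
        Chunk { offset: 3, data: b"DEF".to_vec() },
    ];
    for chunk in &chunks {
        apply_chunk(&mut snapshot, &mut offset, chunk)?;
    }
    assert_eq!(snapshot.into_inner(), b"abcDEF".to_vec());
    Ok(())
}
```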
View File

@ -8,25 +8,25 @@ pub(crate) mod replication;
mod vote;
use std::collections::{BTreeMap, HashSet};
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
use futures::future::{Abortable, AbortHandle};
use futures::future::{AbortHandle, Abortable};
use futures::stream::FuturesOrdered;
use tokio::stream::StreamExt;
use tokio::sync::{broadcast, mpsc, oneshot, watch};
use tokio::task::JoinHandle;
use tokio::time::{Instant, Duration, delay_until};
use tokio::time::{delay_until, Duration, Instant};
use tracing_futures::Instrument;
use crate::{AppData, AppDataResponse, RaftNetwork, RaftStorage, NodeId};
use crate::config::{Config, SnapshotPolicy};
use crate::core::client::ClientRequestEntry;
use crate::error::{ClientReadError, ClientWriteError, ChangeConfigError, InitializeError, RaftError, RaftResult};
use crate::error::{ChangeConfigError, ClientReadError, ClientWriteError, InitializeError, RaftError, RaftResult};
use crate::metrics::RaftMetrics;
use crate::raft::{ChangeMembershipTx, ClientWriteRequest, ClientReadResponseTx, ClientWriteResponseTx, RaftMsg, MembershipConfig};
use crate::replication::{RaftEvent, ReplicationStream, ReplicaEvent};
use crate::raft::{ChangeMembershipTx, ClientReadResponseTx, ClientWriteRequest, ClientWriteResponseTx, MembershipConfig, RaftMsg};
use crate::replication::{RaftEvent, ReplicaEvent, ReplicationStream};
use crate::storage::HardState;
use crate::{AppData, AppDataResponse, NodeId, RaftNetwork, RaftStorage};
/// The core type implementing the Raft protocol.
pub struct RaftCore<D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>> {
@ -104,21 +104,33 @@ pub struct RaftCore<D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftSt
impl<D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>> RaftCore<D, R, N, S> {
pub(crate) fn spawn(
id: NodeId, config: Arc<Config>, network: Arc<N>, storage: Arc<S>,
rx_api: mpsc::UnboundedReceiver<RaftMsg<D, R>>,
tx_metrics: watch::Sender<RaftMetrics>,
needs_shutdown: Arc<AtomicBool>,
id: NodeId, config: Arc<Config>, network: Arc<N>, storage: Arc<S>, rx_api: mpsc::UnboundedReceiver<RaftMsg<D, R>>,
tx_metrics: watch::Sender<RaftMetrics>, needs_shutdown: Arc<AtomicBool>,
) -> JoinHandle<RaftResult<()>> {
let membership = MembershipConfig::new_initial(id); // This is updated from storage in the main loop.
let (tx_compaction, rx_compaction) = mpsc::channel(1);
let this = Self{
id, config, membership, network, storage,
let this = Self {
id,
config,
membership,
network,
storage,
target_state: State::Follower,
commit_index: 0, last_applied: 0, current_term: 0, current_leader: None, voted_for: None,
last_log_index: 0, last_log_term: 0,
snapshot_state: None, snapshot_index: 0,
last_heartbeat: None, next_election_timeout: None,
tx_compaction, rx_compaction, rx_api, tx_metrics,
commit_index: 0,
last_applied: 0,
current_term: 0,
current_leader: None,
voted_for: None,
last_log_index: 0,
last_log_term: 0,
snapshot_state: None,
snapshot_index: 0,
last_heartbeat: None,
next_election_timeout: None,
tx_compaction,
rx_compaction,
rx_api,
tx_metrics,
needs_shutdown,
};
tokio::spawn(this.main())
@ -141,7 +153,12 @@ impl<D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>> Ra
self.commit_index = 0;
// Fetch the most recent snapshot in the system.
if let Some(snapshot) = self.storage.get_current_snapshot().await.map_err(|err| self.map_fatal_storage_error(err))? {
if let Some(snapshot) = self
.storage
.get_current_snapshot()
.await
.map_err(|err| self.map_fatal_storage_error(err))?
{
self.snapshot_index = snapshot.index;
}
@ -177,9 +194,9 @@ impl<D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>> Ra
}
/// Report a metrics payload on the current state of the Raft node.
#[tracing::instrument(level="trace", skip(self))]
#[tracing::instrument(level = "trace", skip(self))]
fn report_metrics(&mut self) {
let res = self.tx_metrics.broadcast(RaftMetrics{
let res = self.tx_metrics.broadcast(RaftMetrics {
id: self.id,
state: self.target_state,
current_term: self.current_term,
@ -194,14 +211,17 @@ impl<D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>> Ra
}
/// Save the Raft node's current hard state to disk.
#[tracing::instrument(level="trace", skip(self))]
#[tracing::instrument(level = "trace", skip(self))]
async fn save_hard_state(&mut self) -> RaftResult<()> {
let hs = HardState{current_term: self.current_term, voted_for: self.voted_for};
let hs = HardState {
current_term: self.current_term,
voted_for: self.voted_for,
};
Ok(self.storage.save_hard_state(&hs).await.map_err(|err| self.map_fatal_storage_error(err))?)
}
/// Update core's target state, ensuring all invariants are upheld.
#[tracing::instrument(level="trace", skip(self))]
#[tracing::instrument(level = "trace", skip(self))]
fn set_target_state(&mut self, target_state: State) {
if target_state == State::Follower && !self.membership.contains(&self.id) {
self.target_state = State::NonVoter;
@ -210,7 +230,7 @@ impl<D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>> Ra
}
/// Get the next election timeout, generating a new value if not set.
#[tracing::instrument(level="trace", skip(self))]
#[tracing::instrument(level = "trace", skip(self))]
fn get_next_election_timeout(&mut self) -> Instant {
match self.next_election_timeout {
Some(inst) => inst,
@ -223,13 +243,13 @@ impl<D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>> Ra
}
/// Set a value for the next election timeout.
#[tracing::instrument(level="trace", skip(self))]
#[tracing::instrument(level = "trace", skip(self))]
fn update_next_election_timeout(&mut self) {
self.next_election_timeout = Some(Instant::now() + Duration::from_millis(self.config.new_rand_election_timeout()));
}
/// Update the value of the `current_leader` property.
#[tracing::instrument(level="trace", skip(self))]
#[tracing::instrument(level = "trace", skip(self))]
fn update_current_leader(&mut self, update: UpdateCurrentLeader) {
match update {
UpdateCurrentLeader::ThisNode => {
@ -240,12 +260,12 @@ impl<D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>> Ra
}
UpdateCurrentLeader::Unknown => {
self.current_leader = None;
},
}
}
}
/// Encapsulate the process of updating the current term, as updating the `voted_for` state must also be updated.
#[tracing::instrument(level="trace", skip(self))]
#[tracing::instrument(level = "trace", skip(self))]
fn update_current_term(&mut self, new_term: u64, voted_for: Option<NodeId>) {
if new_term > self.current_term {
self.current_term = new_term;
@ -258,7 +278,7 @@ impl<D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>> Ra
/// This method assumes that a storage error observed here is non-recoverable. As such, the
/// Raft node will be instructed to stop. If such behavior is not needed, then don't use this
/// interface.
#[tracing::instrument(level="trace", skip(self))]
#[tracing::instrument(level = "trace", skip(self))]
fn map_fatal_storage_error(&mut self, err: anyhow::Error) -> RaftError {
tracing::error!({error=%err, id=self.id}, "fatal storage error, shutting down");
self.set_target_state(State::Shutdown);
@ -266,7 +286,7 @@ impl<D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>> Ra
}
/// Update the node's current membership config & save hard state.
#[tracing::instrument(level="trace", skip(self))]
#[tracing::instrument(level = "trace", skip(self))]
fn update_membership(&mut self, cfg: MembershipConfig) -> RaftResult<()> {
// If the given config does not contain this node's ID, it means one of the following:
//
@ -287,19 +307,19 @@ impl<D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>> Ra
}
/// Update the system's snapshot state based on the given data.
#[tracing::instrument(level="trace", skip(self))]
#[tracing::instrument(level = "trace", skip(self))]
fn update_snapshot_state(&mut self, update: SnapshotUpdate) {
if let SnapshotUpdate::SnapshotComplete(index) = update {
self.snapshot_index = index
}
// If snapshot state is anything other than streaming, then drop it.
if let Some(state @ SnapshotState::Streaming{..}) = self.snapshot_state.take() {
if let Some(state @ SnapshotState::Streaming { .. }) = self.snapshot_state.take() {
self.snapshot_state = Some(state)
}
}
/// Trigger a log compaction (snapshot) job if needed.
#[tracing::instrument(level="trace", skip(self))]
#[tracing::instrument(level = "trace", skip(self))]
pub(self) fn trigger_log_compaction_if_needed(&mut self) {
if self.snapshot_state.is_some() {
return;
@ -320,47 +340,54 @@ impl<D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>> Ra
let (handle, reg) = AbortHandle::new_pair();
let (chan_tx, _) = broadcast::channel(1);
let mut tx_compaction = self.tx_compaction.clone();
self.snapshot_state = Some(SnapshotState::Snapshotting{through: through_index, handle, sender: chan_tx.clone()});
tokio::spawn(async move {
let res = Abortable::new(storage.do_log_compaction(through_index), reg).await;
match res {
Ok(res) => match res {
Ok(snapshot) => {
let _ = tx_compaction.try_send(SnapshotUpdate::SnapshotComplete(snapshot.index));
let _ = chan_tx.send(snapshot.index); // This will always succeed.
}
Err(err) => {
tracing::error!({error=%err}, "error while generating snapshot");
self.snapshot_state = Some(SnapshotState::Snapshotting {
through: through_index,
handle,
sender: chan_tx.clone(),
});
tokio::spawn(
async move {
let res = Abortable::new(storage.do_log_compaction(through_index), reg).await;
match res {
Ok(res) => match res {
Ok(snapshot) => {
let _ = tx_compaction.try_send(SnapshotUpdate::SnapshotComplete(snapshot.index));
let _ = chan_tx.send(snapshot.index); // This will always succeed.
}
Err(err) => {
tracing::error!({error=%err}, "error while generating snapshot");
let _ = tx_compaction.try_send(SnapshotUpdate::SnapshotFailed);
}
},
Err(_aborted) => {
let _ = tx_compaction.try_send(SnapshotUpdate::SnapshotFailed);
}
},
Err(_aborted) => {
let _ = tx_compaction.try_send(SnapshotUpdate::SnapshotFailed);
}
}
}.instrument(tracing::debug_span!("beginning new log compaction process")));
.instrument(tracing::debug_span!("beginning new log compaction process")),
);
}
/// Reject an init config request due to the Raft node being in a state which prohibits the request.
#[tracing::instrument(level="trace", skip(self, tx))]
#[tracing::instrument(level = "trace", skip(self, tx))]
fn reject_init_with_config(&self, tx: oneshot::Sender<Result<(), InitializeError>>) {
let _ = tx.send(Err(InitializeError::NotAllowed));
}
/// Reject a proposed config change request due to the Raft node being in a state which prohibits the request.
#[tracing::instrument(level="trace", skip(self, tx))]
#[tracing::instrument(level = "trace", skip(self, tx))]
fn reject_config_change_not_leader(&self, tx: oneshot::Sender<Result<(), ChangeConfigError>>) {
let _ = tx.send(Err(ChangeConfigError::NodeNotLeader(self.current_leader)));
}
/// Forward the given client write request to the leader.
#[tracing::instrument(level="trace", skip(self, req, tx))]
#[tracing::instrument(level = "trace", skip(self, req, tx))]
fn forward_client_write_request(&self, req: ClientWriteRequest<D>, tx: ClientWriteResponseTx<D, R>) {
let _ = tx.send(Err(ClientWriteError::ForwardToLeader(req, self.current_leader)));
}
/// Forward the given client read request to the leader.
#[tracing::instrument(level="trace", skip(self, tx))]
#[tracing::instrument(level = "trace", skip(self, tx))]
fn forward_client_read_request(&self, tx: ClientReadResponseTx) {
let _ = tx.send(Err(ClientReadError::ForwardToLeader(self.current_leader)));
}
@ -426,22 +453,38 @@ pub enum State {
impl State {
/// Check if currently in non-voter state.
pub fn is_non_voter(&self) -> bool {
if let Self::NonVoter = self { true } else { false }
if let Self::NonVoter = self {
true
} else {
false
}
}
/// Check if currently in follower state.
pub fn is_follower(&self) -> bool {
if let Self::Follower = self { true } else { false }
if let Self::Follower = self {
true
} else {
false
}
}
/// Check if currently in candidate state.
pub fn is_candidate(&self) -> bool {
if let Self::Candidate = self { true } else { false }
if let Self::Candidate = self {
true
} else {
false
}
}
/// Check if currently in leader state.
pub fn is_leader(&self) -> bool {
if let Self::Leader = self { true } else { false }
if let Self::Leader = self {
true
} else {
false
}
}
}
@ -479,15 +522,22 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>
/// Create a new instance.
pub(self) fn new(core: &'a mut RaftCore<D, R, N, S>) -> Self {
let consensus_state = if core.membership.is_in_joint_consensus() {
ConsensusState::Joint{is_committed: false}
ConsensusState::Joint { is_committed: false }
} else {
ConsensusState::Uniform
};
let (replicationtx, replicationrx) = mpsc::unbounded_channel();
Self{
core, nodes: BTreeMap::new(), non_voters: BTreeMap::new(), is_stepping_down: false,
replicationtx, replicationrx, consensus_state, awaiting_committed: Vec::new(),
propose_config_change_cb: None, joint_consensus_cb: FuturesOrdered::new(),
Self {
core,
nodes: BTreeMap::new(),
non_voters: BTreeMap::new(),
is_stepping_down: false,
replicationtx,
replicationrx,
consensus_state,
awaiting_committed: Vec::new(),
propose_config_change_cb: None,
joint_consensus_cb: FuturesOrdered::new(),
uniform_consensus_cb: FuturesOrdered::new(),
}
}
@ -496,7 +546,11 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>
#[tracing::instrument(level="trace", skip(self), fields(id=self.core.id, raft_state="leader"))]
pub(self) async fn run(mut self) -> RaftResult<()> {
// Spawn replication streams.
let targets = self.core.membership.all_nodes().into_iter()
let targets = self
.core
.membership
.all_nodes()
.into_iter()
.filter(|elem| elem != &self.core.id)
.collect::<Vec<_>>();
for target in targets {
@ -523,7 +577,7 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>
}
return Ok(());
}
tokio::select!{
tokio::select! {
Some(msg) = self.core.rx_api.next() => match msg {
RaftMsg::AppendEntries{rpc, tx} => {
let _ = tx.send(self.core.handle_append_entries_request(rpc).await);
@ -629,7 +683,7 @@ impl ConsensusState {
/// 2. the corresponding config for this consensus state has been committed to the cluster.
pub fn is_joint_consensus_safe_to_finalize(&self) -> bool {
match self {
ConsensusState::Joint{is_committed} => *is_committed,
ConsensusState::Joint { is_committed } => *is_committed,
_ => false,
}
}
@ -653,7 +707,13 @@ struct CandidateState<'a, D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S:
impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>> CandidateState<'a, D, R, N, S> {
pub(self) fn new(core: &'a mut RaftCore<D, R, N, S>) -> Self {
Self{core, votes_granted_old: 0, votes_needed_old: 0, votes_granted_new: 0, votes_needed_new: 0}
Self {
core,
votes_granted_old: 0,
votes_needed_old: 0,
votes_granted_new: 0,
votes_needed_new: 0,
}
}
/// Run the candidate loop.
@ -687,7 +747,7 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>
}
let mut timeout_fut = delay_until(self.core.get_next_election_timeout());
tokio::select!{
tokio::select! {
_ = &mut timeout_fut => break, // This election has timed-out. Break to outer loop, which starts a new term.
Some((res, peer)) = pending_votes.recv() => self.handle_vote_response(res, peer).await?,
Some(msg) = self.core.rx_api.next() => match msg {
@ -733,7 +793,7 @@ pub struct FollowerState<'a, D: AppData, R: AppDataResponse, N: RaftNetwork<D>,
impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>> FollowerState<'a, D, R, N, S> {
pub(self) fn new(core: &'a mut RaftCore<D, R, N, S>) -> Self {
Self{core}
Self { core }
}
/// Run the follower loop.
@ -746,7 +806,7 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>
}
let mut election_timeout = delay_until(self.core.get_next_election_timeout()); // Value is updated as heartbeats are received.
tokio::select!{
tokio::select! {
// If an election timeout is hit, then we need to transition to candidate.
_ = &mut election_timeout => self.core.set_target_state(State::Candidate),
Some(msg) = self.core.rx_api.next() => match msg {
@ -791,7 +851,7 @@ pub struct NonVoterState<'a, D: AppData, R: AppDataResponse, N: RaftNetwork<D>,
impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>> NonVoterState<'a, D, R, N, S> {
pub(self) fn new(core: &'a mut RaftCore<D, R, N, S>) -> Self {
Self{core}
Self { core }
}
/// Run the non-voter loop.
@ -802,7 +862,7 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>
if !self.core.target_state.is_non_voter() || self.core.needs_shutdown.load(Ordering::SeqCst) {
return Ok(());
}
tokio::select!{
tokio::select! {
Some(msg) = self.core.rx_api.next() => match msg {
RaftMsg::AppendEntries{rpc, tx} => {
let _ = tx.send(self.core.handle_append_entries_request(rpc).await);


@ -1,22 +1,29 @@
use tokio::sync::oneshot;
use crate::{AppData, AppDataResponse, NodeId, RaftNetwork, RaftStorage};
use crate::config::SnapshotPolicy;
use crate::error::RaftResult;
use crate::core::{ConsensusState, LeaderState, ReplicationState, SnapshotState, State, UpdateCurrentLeader};
use crate::error::RaftResult;
use crate::replication::{RaftEvent, ReplicaEvent, ReplicationStream};
use crate::storage::CurrentSnapshotData;
use crate::{AppData, AppDataResponse, NodeId, RaftNetwork, RaftStorage};
impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>> LeaderState<'a, D, R, N, S> {
/// Spawn a new replication stream returning its replication state handle.
#[tracing::instrument(level="trace", skip(self))]
#[tracing::instrument(level = "trace", skip(self))]
pub(super) fn spawn_replication_stream(&self, target: NodeId) -> ReplicationState<D> {
let replstream = ReplicationStream::new(
self.core.id, target, self.core.current_term, self.core.config.clone(),
self.core.last_log_index, self.core.last_log_term, self.core.commit_index,
self.core.network.clone(), self.core.storage.clone(), self.replicationtx.clone(),
self.core.id,
target,
self.core.current_term,
self.core.config.clone(),
self.core.last_log_index,
self.core.last_log_term,
self.core.commit_index,
self.core.network.clone(),
self.core.storage.clone(),
self.replicationtx.clone(),
);
ReplicationState{
ReplicationState {
match_index: self.core.last_log_index,
match_term: self.core.current_term,
is_at_line_rate: false,
@ -26,13 +33,17 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>
}
/// Handle a replication event coming from one of the replication streams.
#[tracing::instrument(level="trace", skip(self, event))]
#[tracing::instrument(level = "trace", skip(self, event))]
pub(super) async fn handle_replica_event(&mut self, event: ReplicaEvent<S::Snapshot>) {
let res = match event {
ReplicaEvent::RateUpdate{target, is_line_rate} => self.handle_rate_update(target, is_line_rate).await,
ReplicaEvent::RevertToFollower{target, term} => self.handle_revert_to_follower(target, term).await,
ReplicaEvent::UpdateMatchIndex{target, match_index, match_term} => self.handle_update_match_index(target, match_index, match_term).await,
ReplicaEvent::NeedsSnapshot{target, tx} => self.handle_needs_snapshot(target, tx).await,
ReplicaEvent::RateUpdate { target, is_line_rate } => self.handle_rate_update(target, is_line_rate).await,
ReplicaEvent::RevertToFollower { target, term } => self.handle_revert_to_follower(target, term).await,
ReplicaEvent::UpdateMatchIndex {
target,
match_index,
match_term,
} => self.handle_update_match_index(target, match_index, match_term).await,
ReplicaEvent::NeedsSnapshot { target, tx } => self.handle_needs_snapshot(target, tx).await,
ReplicaEvent::Shutdown => {
self.core.set_target_state(State::Shutdown);
return;
@ -44,7 +55,7 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>
}
/// Handle events from replication streams updating their replication rate tracker.
#[tracing::instrument(level="trace", skip(self, target, is_line_rate))]
#[tracing::instrument(level = "trace", skip(self, target, is_line_rate))]
async fn handle_rate_update(&mut self, target: NodeId, is_line_rate: bool) -> RaftResult<()> {
        // Get a handle to the target's replication stat & update it as needed.
if let Some(state) = self.nodes.get_mut(&target) {
@ -62,7 +73,7 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>
}
        // If we are in NonVoterSync state, and this is one of the nodes being awaited, then update.
match std::mem::replace(&mut self.consensus_state, ConsensusState::Uniform) {
ConsensusState::NonVoterSync{mut awaiting, members, tx} => {
ConsensusState::NonVoterSync { mut awaiting, members, tx } => {
awaiting.remove(&target);
if awaiting.is_empty() {
// We are ready to move forward with entering joint consensus.
@ -70,7 +81,7 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>
self.change_membership(members, tx).await;
} else {
// We are still awaiting additional nodes, so replace our original state.
self.consensus_state = ConsensusState::NonVoterSync{awaiting, members, tx};
self.consensus_state = ConsensusState::NonVoterSync { awaiting, members, tx };
}
}
other => self.consensus_state = other, // Set the original value back to what it was.
@ -81,7 +92,7 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>
}
/// Handle events from replication streams for when this node needs to revert to follower state.
#[tracing::instrument(level="trace", skip(self, term))]
#[tracing::instrument(level = "trace", skip(self, term))]
async fn handle_revert_to_follower(&mut self, _: NodeId, term: u64) -> RaftResult<()> {
if term > self.core.current_term {
self.core.update_current_term(term, None);
@ -93,7 +104,7 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>
}
/// Handle events from a replication stream which updates the target node's match index.
#[tracing::instrument(level="trace", skip(self, target, match_index))]
#[tracing::instrument(level = "trace", skip(self, target, match_index))]
async fn handle_update_match_index(&mut self, target: NodeId, match_index: u64, match_term: u64) -> RaftResult<()> {
// If this is a non-voter, then update and return.
if let Some(state) = self.non_voters.get_mut(&target) {
@ -113,7 +124,7 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>
needs_removal = true;
}
}
},
}
_ => return Ok(()), // Node not found.
}
@ -125,7 +136,9 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>
}
// Determine the new commit index of the current membership config nodes.
let mut indices_c0 = self.nodes.iter()
let mut indices_c0 = self
.nodes
.iter()
.filter(|(id, _)| self.core.membership.members.contains(id))
.map(|(_, node)| node.match_index)
.collect::<Vec<_>>();
@ -137,7 +150,9 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>
// If we are in joint consensus, then calculate the new commit index of the new membership config nodes.
let mut commit_index_c1 = commit_index_c0; // Defaults to just matching C0.
if let Some(members) = &self.core.membership.members_after_consensus {
let indices_c1 = self.nodes.iter()
let indices_c1 = self
.nodes
.iter()
.filter(|(id, _)| members.contains(id))
.map(|(_, node)| node.match_index)
.collect();
@ -152,14 +167,21 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>
// Update all replication streams based on new commit index.
for node in self.nodes.values() {
let _ = node.replstream.repltx.send(RaftEvent::UpdateCommitIndex{commit_index: self.core.commit_index});
let _ = node.replstream.repltx.send(RaftEvent::UpdateCommitIndex {
commit_index: self.core.commit_index,
});
}
for node in self.non_voters.values() {
let _ = node.state.replstream.repltx.send(RaftEvent::UpdateCommitIndex{commit_index: self.core.commit_index});
let _ = node.state.replstream.repltx.send(RaftEvent::UpdateCommitIndex {
commit_index: self.core.commit_index,
});
}
// Check if there are any pending requests which need to be processed.
let filter = self.awaiting_committed.iter().enumerate()
let filter = self
.awaiting_committed
.iter()
.enumerate()
.take_while(|(_idx, elem)| elem.entry.index <= self.core.commit_index)
.last()
.map(|(idx, _)| idx);
@ -175,7 +197,7 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>
}
/// Handle events from replication streams requesting for snapshot info.
#[tracing::instrument(level="trace", skip(self, tx))]
#[tracing::instrument(level = "trace", skip(self, tx))]
async fn handle_needs_snapshot(&mut self, _: NodeId, tx: oneshot::Sender<CurrentSnapshotData<S::Snapshot>>) -> RaftResult<()> {
// Ensure snapshotting is configured, else do nothing.
let threshold = match &self.core.config.snapshot_policy {
@ -183,7 +205,11 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>
};
// Check for existence of current snapshot.
let current_snapshot_opt = self.core.storage.get_current_snapshot().await
let current_snapshot_opt = self
.core
.storage
.get_current_snapshot()
.await
.map_err(|err| self.core.map_fatal_storage_error(err))?;
if let Some(snapshot) = current_snapshot_opt {
// If snapshot exists, ensure its distance from the leader's last log index is <= half
@ -198,13 +224,13 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>
// completion (or cancellation), and respond to the replication stream. The repl stream
        // will wait for the completion and will then send another request to fetch the finished snapshot.
// Else we just drop any other state and continue. Leaders never enter `Streaming` state.
if let Some(SnapshotState::Snapshotting{through, handle, sender}) = self.core.snapshot_state.take() {
if let Some(SnapshotState::Snapshotting { through, handle, sender }) = self.core.snapshot_state.take() {
let mut chan = sender.subscribe();
tokio::spawn(async move {
let _ = chan.recv().await;
drop(tx);
});
self.core.snapshot_state = Some(SnapshotState::Snapshotting{through, handle, sender});
self.core.snapshot_state = Some(SnapshotState::Snapshotting { through, handle, sender });
return Ok(());
}
@ -240,7 +266,7 @@ fn calculate_new_commit_index(mut entries: Vec<u64>, current_commit: u64) -> u64
// Calculate offset which will give the majority slice of high-end.
entries.sort();
let offset = if (len % 2) == 0 { (len/2)-1 } else { len/2 };
let offset = if (len % 2) == 0 { (len / 2) - 1 } else { len / 2 };
let new_val = entries.get(offset).unwrap_or(&current_commit);
if new_val < &current_commit {
current_commit
@ -252,7 +278,11 @@ fn calculate_new_commit_index(mut entries: Vec<u64>, current_commit: u64) -> u64
/// Check if the given snapshot data is within half of the configured threshold.
fn snapshot_is_within_half_of_threshold(snapshot_last_index: &u64, last_log_index: &u64, threshold: &u64) -> bool {
// Calculate distance from actor's last log index.
let distance_from_line = if snapshot_last_index > last_log_index { 0u64 } else { last_log_index - snapshot_last_index }; // Guard against underflow.
let distance_from_line = if snapshot_last_index > last_log_index {
0u64
} else {
last_log_index - snapshot_last_index
}; // Guard against underflow.
let half_of_threshold = threshold / 2;
distance_from_line <= half_of_threshold
}
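// Worked example (illustrative numbers) that could live in the tests module
// below: with a snapshot at index 500, the leader's last log at index 700 and a
// `LogsSinceLast(500)` policy, the distance is 200 and half the threshold is
// 250, so the existing snapshot is still considered close enough to reuse.
#[cfg(test)]
#[test]
fn snapshot_threshold_worked_example() {
    assert!(snapshot_is_within_half_of_threshold(&500, &700, &500)); // distance 200 <= 250
    assert!(!snapshot_is_within_half_of_threshold(&100, &700, &500)); // distance 600 > 250
}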
@ -277,7 +307,7 @@ mod tests {
let res = snapshot_is_within_half_of_threshold($snapshot_last_index, $last_log, $thresh);
assert_eq!(res, $exp)
}
}
};
}
test_snapshot_is_within_half_of_threshold!({
@ -311,37 +341,19 @@ mod tests {
entries.sort();
assert_eq!(output, $expected, "Sorted values: {:?}", entries);
}
}
};
}
test_calculate_new_commit_index!(
basic_values,
10, 5, vec![20, 5, 0, 15, 10]
);
test_calculate_new_commit_index!(basic_values, 10, 5, vec![20, 5, 0, 15, 10]);
test_calculate_new_commit_index!(
len_zero_should_return_current_commit,
20, 20, vec![]
);
test_calculate_new_commit_index!(len_zero_should_return_current_commit, 20, 20, vec![]);
test_calculate_new_commit_index!(
len_one_where_greater_than_current,
100, 0, vec![100]
);
test_calculate_new_commit_index!(len_one_where_greater_than_current, 100, 0, vec![100]);
test_calculate_new_commit_index!(
len_one_where_less_than_current,
100, 100, vec![50]
);
test_calculate_new_commit_index!(len_one_where_less_than_current, 100, 100, vec![50]);
test_calculate_new_commit_index!(
even_number_of_nodes,
0, 0, vec![0, 100, 0, 100, 0, 100]
);
test_calculate_new_commit_index!(even_number_of_nodes, 0, 0, vec![0, 100, 0, 100, 0, 100]);
test_calculate_new_commit_index!(
majority_wins,
100, 0, vec![0, 100, 0, 100, 0, 100, 100]
);
test_calculate_new_commit_index!(majority_wins, 100, 0, vec![0, 100, 0, 100, 0, 100, 100]);
}
}


@ -1,22 +1,25 @@
use tokio::time::Instant;
use tokio::sync::mpsc;
use tokio::time::Instant;
use tracing_futures::Instrument;
use crate::{AppData, AppDataResponse, NodeId, RaftNetwork, RaftStorage};
use crate::error::RaftResult;
use crate::core::{CandidateState, RaftCore, State, UpdateCurrentLeader};
use crate::error::RaftResult;
use crate::raft::{VoteRequest, VoteResponse};
use crate::{AppData, AppDataResponse, NodeId, RaftNetwork, RaftStorage};
impl<D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>> RaftCore<D, R, N, S> {
/// An RPC invoked by candidates to gather votes (§5.2).
///
/// See `receiver implementation: RequestVote RPC` in raft-essentials.md in this repo.
#[tracing::instrument(level="trace", skip(self, msg))]
#[tracing::instrument(level = "trace", skip(self, msg))]
pub(super) async fn handle_vote_request(&mut self, msg: VoteRequest) -> RaftResult<VoteResponse> {
        // If the candidate's current term is less than this node's current term, reject.
if msg.term < self.current_term {
tracing::trace!({candidate=msg.candidate_id, self.current_term, rpc_term=msg.term}, "RequestVote RPC term is less than current term");
return Ok(VoteResponse{term: self.current_term, vote_granted: false});
return Ok(VoteResponse {
term: self.current_term,
vote_granted: false,
});
}
// Do not respond to the request if we've received a heartbeat within the election timeout minimum.
@ -24,8 +27,14 @@ impl<D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>> Ra
let now = Instant::now();
let delta = now.duration_since(*inst);
if self.config.election_timeout_min >= (delta.as_millis() as u64) {
tracing::trace!({candidate=msg.candidate_id}, "rejecting vote request received within election timeout minimum");
return Ok(VoteResponse{term: self.current_term, vote_granted: false});
tracing::trace!(
{ candidate = msg.candidate_id },
"rejecting vote request received within election timeout minimum"
);
return Ok(VoteResponse {
term: self.current_term,
vote_granted: false,
});
}
}
@ -43,18 +52,28 @@ impl<D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>> Ra
        // If the candidate's log is not at least as up-to-date as this node's log, then reject.
let client_is_uptodate = (msg.last_log_term >= self.last_log_term) && (msg.last_log_index >= self.last_log_index);
if !client_is_uptodate {
tracing::trace!({candidate=msg.candidate_id}, "rejecting vote request as candidate's log is not up-to-date");
return Ok(VoteResponse{term: self.current_term, vote_granted: false});
tracing::trace!(
{ candidate = msg.candidate_id },
"rejecting vote request as candidate's log is not up-to-date"
);
return Ok(VoteResponse {
term: self.current_term,
vote_granted: false,
});
}
// Candidate's log is up-to-date so handle voting conditions.
match &self.voted_for {
// This node has already voted for the candidate.
Some(candidate_id) if candidate_id == &msg.candidate_id => {
Ok(VoteResponse{term: self.current_term, vote_granted: true})
}
Some(candidate_id) if candidate_id == &msg.candidate_id => Ok(VoteResponse {
term: self.current_term,
vote_granted: true,
}),
// This node has already voted for a different candidate.
Some(_) => Ok(VoteResponse{term: self.current_term, vote_granted: false}),
Some(_) => Ok(VoteResponse {
term: self.current_term,
vote_granted: false,
}),
// This node has not yet voted for the current term, so vote for the candidate.
None => {
self.voted_for = Some(msg.candidate_id);
@ -62,15 +81,18 @@ impl<D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>> Ra
self.update_next_election_timeout();
self.save_hard_state().await?;
tracing::trace!({candidate=msg.candidate_id, msg.term}, "voted for candidate");
Ok(VoteResponse{term: self.current_term, vote_granted: true})
},
Ok(VoteResponse {
term: self.current_term,
vote_granted: true,
})
}
}
}
}
impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>> CandidateState<'a, D, R, N, S> {
/// Handle response from a vote request sent to a peer.
#[tracing::instrument(level="trace", skip(self, res, target))]
#[tracing::instrument(level = "trace", skip(self, res, target))]
pub(super) async fn handle_vote_response(&mut self, res: VoteResponse, target: NodeId) -> RaftResult<()> {
// If peer's term is greater than current term, revert to follower state.
if res.term > self.core.current_term {
@ -89,7 +111,14 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>
self.votes_granted_old += 1;
}
// Handle vote responses from members of C1 config group.
if self.core.membership.members_after_consensus.as_ref().map(|members| members.contains(&target)).unwrap_or(false) {
if self
.core
.membership
.members_after_consensus
.as_ref()
.map(|members| members.contains(&target))
.unwrap_or(false)
{
self.votes_granted_new += 1;
}
        // If we've received enough votes from both config groups, then transition to leader state.
@ -105,21 +134,24 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>
}
/// Spawn parallel vote requests to all cluster members.
#[tracing::instrument(level="trace", skip(self))]
#[tracing::instrument(level = "trace", skip(self))]
pub(super) fn spawn_parallel_vote_requests(&self) -> mpsc::Receiver<(VoteResponse, NodeId)> {
let all_members = self.core.membership.all_nodes();
let (tx, rx) = mpsc::channel(all_members.len());
for member in all_members.into_iter().filter(|member| member != &self.core.id) {
let rpc = VoteRequest::new(self.core.current_term, self.core.id, self.core.last_log_index, self.core.last_log_term);
let (network, mut tx_inner) = (self.core.network.clone(), tx.clone());
let _ = tokio::spawn(async move {
match network.vote(member, rpc).await {
Ok(res) => {
let _ = tx_inner.send((res, member)).await;
let _ = tokio::spawn(
async move {
match network.vote(member, rpc).await {
Ok(res) => {
let _ = tx_inner.send((res, member)).await;
}
Err(err) => tracing::error!({error=%err, peer=member}, "error while requesting vote from peer"),
}
Err(err) => tracing::error!({error=%err, peer=member}, "error while requesting vote from peer"),
}
}.instrument(tracing::trace_span!("requesting vote from peer", target=member)));
.instrument(tracing::trace_span!("requesting vote from peer", target = member)),
);
}
rx
}


@ -2,8 +2,8 @@
use thiserror::Error;
use crate::{AppData, NodeId};
use crate::raft::ClientWriteRequest;
use crate::{AppData, NodeId};
/// A result type where the error variant is always a `RaftError`.
pub type RaftResult<T> = std::result::Result<T, RaftError>;


@ -1,23 +1,23 @@
#![cfg_attr(feature="docinclude", feature(external_doc))]
#![cfg_attr(feature="docinclude", doc(include="../README.md"))]
#![cfg_attr(feature = "docinclude", feature(external_doc))]
#![cfg_attr(feature = "docinclude", doc(include = "../README.md"))]
pub mod config;
mod core;
pub mod error;
pub mod metrics;
pub mod network;
mod replication;
pub mod raft;
mod replication;
pub mod storage;
use std::fmt::Debug;
use serde::{Serialize, de::DeserializeOwned};
use serde::{de::DeserializeOwned, Serialize};
pub use crate::{
config::{Config, ConfigBuilder, SnapshotPolicy},
core::State,
error::{ClientWriteError, ConfigError, InitializeError, ChangeConfigError, RaftError},
error::{ChangeConfigError, ClientWriteError, ConfigError, InitializeError, RaftError},
metrics::RaftMetrics,
network::RaftNetwork,
raft::Raft,


@ -7,9 +7,9 @@
//! Metrics are observed on a running Raft node via the `Raft::metrics()` method, which will
//! return a stream of metrics.
use crate::NodeId;
use crate::core::State;
use crate::raft::MembershipConfig;
use crate::NodeId;
/// A set of metrics describing the current state of a Raft node.
#[derive(Clone, Debug, PartialEq, Eq)]
@ -33,6 +33,14 @@ pub struct RaftMetrics {
impl RaftMetrics {
pub(crate) fn new_initial(id: NodeId) -> Self {
let membership_config = MembershipConfig::new_initial(id);
Self{id, state: State::Follower, current_term: 0, last_log_index: 0, last_applied: 0, current_leader: None, membership_config}
Self {
id,
state: State::Follower,
current_term: 0,
last_log_index: 0,
last_applied: 0,
current_leader: None,
membership_config,
}
}
}
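// Illustrative only: a point-in-time read of the metrics handle described in
// the module docs above. This sketch assumes the handle behaves like the
// `tokio::sync::watch::Receiver<RaftMetrics>` used internally; adapt it to the
// actual return type of `Raft::metrics()`.
fn current_leader_of(metrics: &tokio::sync::watch::Receiver<RaftMetrics>) -> Option<NodeId> {
    metrics.borrow().current_leader
}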


@ -3,10 +3,10 @@
use anyhow::Result;
use async_trait::async_trait;
use crate::{AppData, NodeId};
use crate::raft::{AppendEntriesRequest, AppendEntriesResponse};
use crate::raft::{InstallSnapshotRequest, InstallSnapshotResponse};
use crate::raft::{VoteRequest, VoteResponse};
use crate::{AppData, NodeId};
/// A trait defining the interface for a Raft network between cluster members.
///
@ -14,8 +14,8 @@ use crate::raft::{VoteRequest, VoteResponse};
/// for details and discussion on this trait and how to implement it.
#[async_trait]
pub trait RaftNetwork<D>: Send + Sync + 'static
where
D: AppData,
where
D: AppData,
{
/// Send an AppendEntries RPC to the target Raft node (§5).
async fn append_entries(&self, target: NodeId, rpc: AppendEntriesRequest<D>) -> Result<AppendEntriesResponse>;


@ -1,18 +1,18 @@
//! Public Raft interface and data types.
use std::collections::HashSet;
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
use serde::{Serialize, Deserialize};
use serde::{Deserialize, Serialize};
use tokio::sync::{mpsc, oneshot, watch};
use tokio::task::JoinHandle;
use crate::{AppData, AppDataResponse, NodeId, RaftNetwork, RaftStorage};
use crate::config::Config;
use crate::error::{ClientReadError, ClientWriteError, ChangeConfigError, InitializeError, RaftError, RaftResult};
use crate::metrics::RaftMetrics;
use crate::core::RaftCore;
use crate::error::{ChangeConfigError, ClientReadError, ClientWriteError, InitializeError, RaftError, RaftResult};
use crate::metrics::RaftMetrics;
use crate::{AppData, AppDataResponse, NodeId, RaftNetwork, RaftStorage};
/// The Raft API.
///
@ -63,14 +63,14 @@ impl<D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>> Ra
let (tx_api, rx_api) = mpsc::unbounded_channel();
let (tx_metrics, rx_metrics) = watch::channel(RaftMetrics::new_initial(id));
let needs_shutdown = Arc::new(AtomicBool::new(false));
let raft_handle = RaftCore::spawn(
id, config, network, storage,
rx_api, tx_metrics,
needs_shutdown.clone(),
);
Self{
tx_api, rx_metrics, raft_handle, needs_shutdown,
marker_n: std::marker::PhantomData, marker_s: std::marker::PhantomData,
let raft_handle = RaftCore::spawn(id, config, network, storage, rx_api, tx_metrics, needs_shutdown.clone());
Self {
tx_api,
rx_metrics,
raft_handle,
needs_shutdown,
marker_n: std::marker::PhantomData,
marker_s: std::marker::PhantomData,
}
}
@ -78,20 +78,22 @@ impl<D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>> Ra
///
/// These RPCs are sent by the cluster leader to replicate log entries (§5.3), and are also
/// used as heartbeats (§5.2).
#[tracing::instrument(level="debug", skip(self, rpc))]
#[tracing::instrument(level = "debug", skip(self, rpc))]
pub async fn append_entries(&self, rpc: AppendEntriesRequest<D>) -> Result<AppendEntriesResponse, RaftError> {
let (tx, rx) = oneshot::channel();
self.tx_api.send(RaftMsg::AppendEntries{rpc, tx}).map_err(|_| RaftError::ShuttingDown)?;
self.tx_api
.send(RaftMsg::AppendEntries { rpc, tx })
.map_err(|_| RaftError::ShuttingDown)?;
Ok(rx.await.map_err(|_| RaftError::ShuttingDown).and_then(|res| res)?)
}
/// Submit a VoteRequest (RequestVote in the spec) RPC to this Raft node.
///
/// These RPCs are sent by cluster peers which are in candidate state attempting to gather votes (§5.2).
#[tracing::instrument(level="debug", skip(self, rpc))]
#[tracing::instrument(level = "debug", skip(self, rpc))]
pub async fn vote(&self, rpc: VoteRequest) -> Result<VoteResponse, RaftError> {
let (tx, rx) = oneshot::channel();
self.tx_api.send(RaftMsg::RequestVote{rpc, tx}).map_err(|_| RaftError::ShuttingDown)?;
self.tx_api.send(RaftMsg::RequestVote { rpc, tx }).map_err(|_| RaftError::ShuttingDown)?;
Ok(rx.await.map_err(|_| RaftError::ShuttingDown).and_then(|res| res)?)
}
@ -99,10 +101,12 @@ impl<D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>> Ra
///
/// These RPCs are sent by the cluster leader in order to bring a new node or a slow node up-to-speed
/// with the leader (§7).
#[tracing::instrument(level="debug", skip(self, rpc))]
#[tracing::instrument(level = "debug", skip(self, rpc))]
pub async fn install_snapshot(&self, rpc: InstallSnapshotRequest) -> Result<InstallSnapshotResponse, RaftError> {
let (tx, rx) = oneshot::channel();
self.tx_api.send(RaftMsg::InstallSnapshot{rpc, tx}).map_err(|_| RaftError::ShuttingDown)?;
self.tx_api
.send(RaftMsg::InstallSnapshot { rpc, tx })
.map_err(|_| RaftError::ShuttingDown)?;
Ok(rx.await.map_err(|_| RaftError::ShuttingDown).and_then(|res| res)?)
}
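    // Illustrative only: on the receiving side, the application's RPC server is
    // expected to deserialize each incoming request and hand it to the matching
    // method above. The `IncomingRpc` enum below is an assumption made for this
    // sketch; it is not a crate type.
    //
    // enum IncomingRpc<D: AppData> {
    //     AppendEntries(AppendEntriesRequest<D>),
    //     Vote(VoteRequest),
    //     InstallSnapshot(InstallSnapshotRequest),
    // }
    //
    // async fn handle_incoming<D, R, N, S>(raft: &Raft<D, R, N, S>, rpc: IncomingRpc<D>) -> Result<(), RaftError>
    // where
    //     D: AppData,
    //     R: AppDataResponse,
    //     N: RaftNetwork<D>,
    //     S: RaftStorage<D, R>,
    // {
    //     match rpc {
    //         // In a real server each response would be serialized and sent back to the caller.
    //         IncomingRpc::AppendEntries(req) => drop(raft.append_entries(req).await?),
    //         IncomingRpc::Vote(req) => drop(raft.vote(req).await?),
    //         IncomingRpc::InstallSnapshot(req) => drop(raft.install_snapshot(req).await?),
    //     }
    //     Ok(())
    // }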
@ -110,11 +114,16 @@ impl<D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>> Ra
///
    /// The actual read operation itself is up to the application; this method just ensures that
/// the read will not be stale.
#[tracing::instrument(level="debug", skip(self))]
#[tracing::instrument(level = "debug", skip(self))]
pub async fn client_read(&self) -> Result<(), ClientReadError> {
let (tx, rx) = oneshot::channel();
self.tx_api.send(RaftMsg::ClientReadRequest{tx}).map_err(|_| ClientReadError::RaftError(RaftError::ShuttingDown))?;
Ok(rx.await.map_err(|_| ClientReadError::RaftError(RaftError::ShuttingDown)).and_then(|res| res)?)
self.tx_api
.send(RaftMsg::ClientReadRequest { tx })
.map_err(|_| ClientReadError::RaftError(RaftError::ShuttingDown))?;
Ok(rx
.await
.map_err(|_| ClientReadError::RaftError(RaftError::ShuttingDown))
.and_then(|res| res)?)
}
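    // Illustrative only: guarding an application read with `client_read` so that
    // data served from the local state machine is not stale. The actual state
    // machine access is application-defined and elided here.
    //
    // async fn linearizable_read<D, R, N, S>(raft: &Raft<D, R, N, S>) -> Result<(), ClientReadError>
    // where
    //     D: AppData,
    //     R: AppDataResponse,
    //     N: RaftNetwork<D>,
    //     S: RaftStorage<D, R>,
    // {
    //     raft.client_read().await?; // errs with ForwardToLeader if this node is not the leader
    //     // ... safe to read from the application's state machine here ...
    //     Ok(())
    // }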
/// Submit a mutating client request to Raft to update the state of the system (§5.1).
@ -134,11 +143,16 @@ impl<D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>> Ra
///
/// These are application specific requirements, and must be implemented by the application which is
/// being built on top of Raft.
#[tracing::instrument(level="debug", skip(self, rpc))]
#[tracing::instrument(level = "debug", skip(self, rpc))]
pub async fn client_write(&self, rpc: ClientWriteRequest<D>) -> Result<ClientWriteResponse<R>, ClientWriteError<D>> {
let (tx, rx) = oneshot::channel();
self.tx_api.send(RaftMsg::ClientWriteRequest{rpc, tx}).map_err(|_| ClientWriteError::RaftError(RaftError::ShuttingDown))?;
Ok(rx.await.map_err(|_| ClientWriteError::RaftError(RaftError::ShuttingDown)).and_then(|res| res)?)
self.tx_api
.send(RaftMsg::ClientWriteRequest { rpc, tx })
.map_err(|_| ClientWriteError::RaftError(RaftError::ShuttingDown))?;
Ok(rx
.await
.map_err(|_| ClientWriteError::RaftError(RaftError::ShuttingDown))
.and_then(|res| res)?)
}
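    // Illustrative only: wrapping an application command in a `ClientWriteRequest`
    // and submitting it through `client_write` above. On success the committed log
    // index is returned to the caller.
    //
    // async fn submit_command<D, R, N, S>(raft: &Raft<D, R, N, S>, cmd: D) -> Result<u64, ClientWriteError<D>>
    // where
    //     D: AppData,
    //     R: AppDataResponse,
    //     N: RaftNetwork<D>,
    //     S: RaftStorage<D, R>,
    // {
    //     let res = raft.client_write(ClientWriteRequest::new(cmd)).await?;
    //     Ok(res.index)
    // }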
/// Initialize a pristine Raft node with the given config.
@ -169,11 +183,16 @@ impl<D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>> Ra
/// Every member of the cluster should perform these actions. This routine is race-condition
/// free, and Raft guarantees that the first node to become the cluster leader will propagate
/// only its own config.
#[tracing::instrument(level="debug", skip(self))]
#[tracing::instrument(level = "debug", skip(self))]
pub async fn initialize(&self, members: HashSet<NodeId>) -> Result<(), InitializeError> {
let (tx, rx) = oneshot::channel();
self.tx_api.send(RaftMsg::Initialize{members, tx}).map_err(|_| RaftError::ShuttingDown)?;
Ok(rx.await.map_err(|_| InitializeError::RaftError(RaftError::ShuttingDown)).and_then(|res| res)?)
self.tx_api
.send(RaftMsg::Initialize { members, tx })
.map_err(|_| RaftError::ShuttingDown)?;
Ok(rx
.await
.map_err(|_| InitializeError::RaftError(RaftError::ShuttingDown))
.and_then(|res| res)?)
}
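    // Illustrative only: bootstrapping a brand new three-node cluster. The node
    // IDs used here are assumptions for the sketch; every member should run this
    // same routine once all nodes are online.
    //
    // async fn bootstrap_cluster<D, R, N, S>(raft: &Raft<D, R, N, S>) -> Result<(), InitializeError>
    // where
    //     D: AppData,
    //     R: AppDataResponse,
    //     N: RaftNetwork<D>,
    //     S: RaftStorage<D, R>,
    // {
    //     let members: HashSet<NodeId> = vec![0, 1, 2].into_iter().collect();
    //     raft.initialize(members).await
    // }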
/// Synchronize a new Raft node, bringing it up-to-speed (§6).
@ -188,11 +207,14 @@ impl<D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>> Ra
/// application to then call `change_membership` once all of the new nodes are synced.
///
/// If this Raft node is not the cluster leader, then this call will fail.
#[tracing::instrument(level="debug", skip(self))]
#[tracing::instrument(level = "debug", skip(self))]
pub async fn add_non_voter(&self, id: NodeId) -> Result<(), ChangeConfigError> {
let (tx, rx) = oneshot::channel();
self.tx_api.send(RaftMsg::AddNonVoter{id, tx}).map_err(|_| RaftError::ShuttingDown)?;
Ok(rx.await.map_err(|_| ChangeConfigError::RaftError(RaftError::ShuttingDown)).and_then(|res| res)?)
self.tx_api.send(RaftMsg::AddNonVoter { id, tx }).map_err(|_| RaftError::ShuttingDown)?;
Ok(rx
.await
.map_err(|_| ChangeConfigError::RaftError(RaftError::ShuttingDown))
.and_then(|res| res)?)
}
/// Propose a cluster configuration change (§6).
@ -206,11 +228,16 @@ impl<D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>> Ra
///
/// If this Raft node is not the cluster leader, then the proposed configuration change will be
/// rejected.
#[tracing::instrument(level="debug", skip(self))]
#[tracing::instrument(level = "debug", skip(self))]
pub async fn change_membership(&self, members: HashSet<NodeId>) -> Result<(), ChangeConfigError> {
let (tx, rx) = oneshot::channel();
self.tx_api.send(RaftMsg::ChangeMembership{members, tx}).map_err(|_| RaftError::ShuttingDown)?;
Ok(rx.await.map_err(|_| ChangeConfigError::RaftError(RaftError::ShuttingDown)).and_then(|res| res)?)
self.tx_api
.send(RaftMsg::ChangeMembership { members, tx })
.map_err(|_| RaftError::ShuttingDown)?;
Ok(rx
.await
.map_err(|_| ChangeConfigError::RaftError(RaftError::ShuttingDown))
.and_then(|res| res)?)
}
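    // Illustrative only: the two-step flow described above for growing a cluster.
    // New nodes are first synced as non-voters, then a single `change_membership`
    // call drives the cluster through joint consensus into the new configuration.
    //
    // async fn add_voting_members<D, R, N, S>(
    //     raft: &Raft<D, R, N, S>,
    //     mut target_membership: HashSet<NodeId>,
    //     new_nodes: &[NodeId],
    // ) -> Result<(), ChangeConfigError>
    // where
    //     D: AppData,
    //     R: AppDataResponse,
    //     N: RaftNetwork<D>,
    //     S: RaftStorage<D, R>,
    // {
    //     for &id in new_nodes {
    //         raft.add_non_voter(id).await?; // start syncing the new node as a non-voter
    //         target_membership.insert(id);
    //     }
    //     raft.change_membership(target_membership).await
    // }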
/// Get a handle to the metrics channel.
@ -282,7 +309,7 @@ pub struct AppendEntriesRequest<D: AppData> {
///
/// This may be empty when the leader is sending heartbeats. Entries
/// are batched for efficiency.
#[serde(bound="D: AppData")]
#[serde(bound = "D: AppData")]
pub entries: Vec<Entry<D>>,
/// The leader's commit index.
pub leader_commit: u64,
@ -325,7 +352,7 @@ pub struct Entry<D: AppData> {
/// This entry's index.
pub index: u64,
/// This entry's payload.
#[serde(bound="D: AppData")]
#[serde(bound = "D: AppData")]
pub payload: EntryPayload<D>,
}
@ -342,7 +369,11 @@ impl<D: AppData> Entry<D> {
/// The cluster membership config which is contained in the snapshot, which will always be the
/// latest membership covered by the snapshot.
pub fn new_snapshot_pointer(index: u64, term: u64, id: String, membership: MembershipConfig) -> Self {
Entry{term, index, payload: EntryPayload::SnapshotPointer(EntrySnapshotPointer{id, membership})}
Entry {
term,
index,
payload: EntryPayload::SnapshotPointer(EntrySnapshotPointer { id, membership }),
}
}
}
@ -352,7 +383,7 @@ pub enum EntryPayload<D: AppData> {
/// An empty payload committed by a new cluster leader.
Blank,
/// A normal log entry.
#[serde(bound="D: AppData")]
#[serde(bound = "D: AppData")]
Normal(EntryNormal<D>),
/// A config change log entry.
ConfigChange(EntryConfigChange),
@ -364,7 +395,7 @@ pub enum EntryPayload<D: AppData> {
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct EntryNormal<D: AppData> {
/// The contents of this entry.
#[serde(bound="D: AppData")]
#[serde(bound = "D: AppData")]
pub data: D,
}
@ -416,11 +447,12 @@ impl MembershipConfig {
///
/// When in joint consensus, this will check both config groups.
pub fn contains(&self, x: &NodeId) -> bool {
self.members.contains(x) || if let Some(members) = &self.members_after_consensus {
members.contains(x)
} else {
false
}
self.members.contains(x)
|| if let Some(members) = &self.members_after_consensus {
members.contains(x)
} else {
false
}
}
/// Check to see if the config is currently in joint consensus.
@ -432,7 +464,10 @@ impl MembershipConfig {
pub fn new_initial(id: NodeId) -> Self {
let mut members = HashSet::new();
members.insert(id);
Self{members, members_after_consensus: None}
Self {
members,
members_after_consensus: None,
}
}
}
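// Illustrative test sketch: during joint consensus, `contains` accepts members
// of either config group.
#[cfg(test)]
#[test]
fn contains_checks_both_groups_during_joint_consensus() {
    let mut cfg = MembershipConfig::new_initial(1);
    assert!(cfg.contains(&1) && !cfg.contains(&3));
    cfg.members_after_consensus = Some(vec![2, 3].into_iter().collect());
    assert!(cfg.contains(&3)); // node 3 is only in the post-consensus group
}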
@ -455,7 +490,12 @@ pub struct VoteRequest {
impl VoteRequest {
/// Create a new instance.
pub fn new(term: u64, candidate_id: u64, last_log_index: u64, last_log_term: u64) -> Self {
Self{term, candidate_id, last_log_index, last_log_term}
Self {
term,
candidate_id,
last_log_index,
last_log_term,
}
}
}
@ -507,24 +547,24 @@ pub struct InstallSnapshotResponse {
#[derive(Debug, Serialize, Deserialize)]
pub struct ClientWriteRequest<D: AppData> {
/// The application specific contents of this client request.
#[serde(bound="D: AppData")]
#[serde(bound = "D: AppData")]
pub(crate) entry: EntryPayload<D>,
}
impl<D: AppData> ClientWriteRequest<D> {
/// Create a new client payload instance with a normal entry type.
pub fn new(entry: D) -> Self {
Self::new_base(EntryPayload::Normal(EntryNormal{data: entry}))
Self::new_base(EntryPayload::Normal(EntryNormal { data: entry }))
}
/// Create a new instance.
pub(crate) fn new_base(entry: EntryPayload<D>) -> Self {
Self{entry}
Self { entry }
}
/// Generate a new payload holding a config change.
pub(crate) fn new_config(membership: MembershipConfig) -> Self {
Self::new_base(EntryPayload::ConfigChange(EntryConfigChange{membership}))
Self::new_base(EntryPayload::ConfigChange(EntryConfigChange { membership }))
}
/// Generate a new blank payload.
@ -541,6 +581,6 @@ pub struct ClientWriteResponse<R: AppDataResponse> {
/// The log index of the successfully processed client request.
pub index: u64,
/// Application specific response data.
#[serde(bound="R: AppDataResponse")]
#[serde(bound = "R: AppDataResponse")]
pub data: R,
}


@ -7,13 +7,13 @@ use tokio::io::{AsyncRead, AsyncReadExt, AsyncSeek, AsyncSeekExt};
use tokio::stream::StreamExt;
use tokio::sync::{mpsc, oneshot};
use tokio::task::JoinHandle;
use tokio::time::{Duration, Interval, interval, timeout};
use tokio::time::{interval, timeout, Duration, Interval};
use crate::{AppData, AppDataResponse, NodeId, RaftNetwork, RaftStorage};
use crate::config::{Config, SnapshotPolicy};
use crate::error::RaftResult;
use crate::raft::{AppendEntriesRequest, Entry, EntryPayload, InstallSnapshotRequest};
use crate::storage::CurrentSnapshotData;
use crate::{AppData, AppDataResponse, NodeId, RaftNetwork, RaftStorage};
/// The public handle to a spawned replication stream.
pub(crate) struct ReplicationStream<D: AppData> {
@ -26,13 +26,20 @@ pub(crate) struct ReplicationStream<D: AppData> {
impl<D: AppData> ReplicationStream<D> {
/// Create a new replication stream for the target peer.
pub(crate) fn new<R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>>(
id: NodeId, target: NodeId, term: u64, config: Arc<Config>,
last_log_index: u64, last_log_term: u64, commit_index: u64,
network: Arc<N>, storage: Arc<S>, replicationtx: mpsc::UnboundedSender<ReplicaEvent<S::Snapshot>>,
id: NodeId, target: NodeId, term: u64, config: Arc<Config>, last_log_index: u64, last_log_term: u64, commit_index: u64, network: Arc<N>,
storage: Arc<S>, replicationtx: mpsc::UnboundedSender<ReplicaEvent<S::Snapshot>>,
) -> Self {
ReplicationCore::spawn(
id, target, term, config, last_log_index, last_log_term, commit_index,
network, storage, replicationtx,
id,
target,
term,
config,
last_log_index,
last_log_term,
commit_index,
network,
storage,
replicationtx,
)
}
}
@ -45,7 +52,6 @@ impl<D: AppData> ReplicationStream<D> {
struct ReplicationCore<D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>> {
//////////////////////////////////////////////////////////////////////////
// Static Fields /////////////////////////////////////////////////////////
/// The ID of this Raft node.
id: NodeId,
/// The ID of the target Raft node which replication events are to be sent to.
@ -68,7 +74,6 @@ struct ReplicationCore<D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: Raf
//////////////////////////////////////////////////////////////////////////
// Dynamic Fields ////////////////////////////////////////////////////////
/// The target state of this replication stream.
target_state: TargetReplState,
@ -126,23 +131,36 @@ struct ReplicationCore<D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: Raf
impl<D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>> ReplicationCore<D, R, N, S> {
/// Spawn a new replication task for the target node.
pub(self) fn spawn(
id: NodeId, target: NodeId, term: u64, config: Arc<Config>,
last_log_index: u64, last_log_term: u64, commit_index: u64,
network: Arc<N>, storage: Arc<S>, rafttx: mpsc::UnboundedSender<ReplicaEvent<S::Snapshot>>,
id: NodeId, target: NodeId, term: u64, config: Arc<Config>, last_log_index: u64, last_log_term: u64, commit_index: u64, network: Arc<N>,
storage: Arc<S>, rafttx: mpsc::UnboundedSender<ReplicaEvent<S::Snapshot>>,
) -> ReplicationStream<D> {
let (raftrx_tx, raftrx) = mpsc::unbounded_channel();
let heartbeat_timeout = Duration::from_millis(config.heartbeat_interval);
let max_payload_entries = config.max_payload_entries as usize;
let this = Self{
id, target, term, network, storage, config, max_payload_entries,
let this = Self {
id,
target,
term,
network,
storage,
config,
max_payload_entries,
marker_r: std::marker::PhantomData,
target_state: TargetReplState::Lagging, last_log_index, commit_index,
next_index: last_log_index + 1, match_index: last_log_index, match_term: last_log_term,
rafttx, raftrx, heartbeat: interval(heartbeat_timeout), heartbeat_timeout,
replication_buffer: Vec::new(), outbound_buffer: Vec::new(),
target_state: TargetReplState::Lagging,
last_log_index,
commit_index,
next_index: last_log_index + 1,
match_index: last_log_index,
match_term: last_log_term,
rafttx,
raftrx,
heartbeat: interval(heartbeat_timeout),
heartbeat_timeout,
replication_buffer: Vec::new(),
outbound_buffer: Vec::new(),
};
let handle = tokio::spawn(this.main());
ReplicationStream{handle, repltx: raftrx_tx}
ReplicationStream { handle, repltx: raftrx_tx }
}
#[tracing::instrument(level="trace", skip(self), fields(id=self.id, target=self.target, cluster=%self.config.cluster_name))]
@ -165,24 +183,30 @@ impl<D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>> Re
///
    /// This request will time out if no response is received within the
/// configured heartbeat interval.
#[tracing::instrument(level="trace", skip(self))]
#[tracing::instrument(level = "trace", skip(self))]
async fn send_append_entries(&mut self) {
// Attempt to fill the send buffer from the replication buffer.
if self.outbound_buffer.is_empty() {
let repl_len = self.replication_buffer.len();
if repl_len > 0 {
let chunk_size = if repl_len < self.max_payload_entries { repl_len } else { self.max_payload_entries };
self.outbound_buffer.extend(
self.replication_buffer.drain(..chunk_size)
.map(OutboundEntry::Arc));
let chunk_size = if repl_len < self.max_payload_entries {
repl_len
} else {
self.max_payload_entries
};
self.outbound_buffer
.extend(self.replication_buffer.drain(..chunk_size).map(OutboundEntry::Arc));
}
}
// Build the heartbeat frame to be sent to the follower.
let payload = AppendEntriesRequest{
term: self.term, leader_id: self.id,
prev_log_index: self.match_index, prev_log_term: self.match_term,
leader_commit: self.commit_index, entries: self.outbound_buffer.iter().map(|entry| entry.as_ref().clone()).collect(),
let payload = AppendEntriesRequest {
term: self.term,
leader_id: self.id,
prev_log_index: self.match_index,
prev_log_term: self.match_term,
leader_commit: self.commit_index,
entries: self.outbound_buffer.iter().map(|entry| entry.as_ref().clone()).collect(),
};
// Send the payload.
@ -193,11 +217,11 @@ impl<D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>> Re
tracing::error!({error=%err}, "error sending AppendEntries RPC to target");
return;
}
}
},
Err(err) => {
tracing::error!({error=%err}, "timeout while sending AppendEntries RPC to target");
return;
},
}
};
let last_index_and_term = match self.outbound_buffer.last() {
Some(last) => Some((last.as_ref().index, last.as_ref().term)),
@ -213,7 +237,11 @@ impl<D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>> Re
self.next_index = index + 1; // This should always be the next expected index.
self.match_index = index;
self.match_term = term;
let _ = self.rafttx.send(ReplicaEvent::UpdateMatchIndex{target: self.target, match_index: index, match_term: term});
let _ = self.rafttx.send(ReplicaEvent::UpdateMatchIndex {
target: self.target,
match_index: index,
match_term: term,
});
// If running at line rate, and our buffered outbound requests have accumulated too
// much, we need to purge and transition to a lagging state. The target is not able to
@ -227,8 +255,11 @@ impl<D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>> Re
// Replication was not successful, if a newer term has been returned, revert to follower.
if res.term > self.term {
tracing::trace!({res.term}, "append entries failed, reverting to follower");
let _ = self.rafttx.send(ReplicaEvent::RevertToFollower{target: self.target, term: res.term});
tracing::trace!({ res.term }, "append entries failed, reverting to follower");
let _ = self.rafttx.send(ReplicaEvent::RevertToFollower {
target: self.target,
term: res.term,
});
self.target_state = TargetReplState::Shutdown;
return;
}
@ -249,14 +280,21 @@ impl<D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>> Re
// it will never exist. So instead, we just return, and accept the conflict data.
if conflict.index == 0 {
self.target_state = TargetReplState::Lagging;
let _ = self.rafttx.send(ReplicaEvent::UpdateMatchIndex{
target: self.target, match_index: self.match_index, match_term: self.match_term,
let _ = self.rafttx.send(ReplicaEvent::UpdateMatchIndex {
target: self.target,
match_index: self.match_index,
match_term: self.match_term,
});
return;
}
// Fetch the entry at conflict index and use the term specified there.
match self.storage.get_log_entries(conflict.index, conflict.index + 1).await.map(|entries| entries.get(0).map(|entry| entry.term)) {
match self
.storage
.get_log_entries(conflict.index, conflict.index + 1)
.await
.map(|entries| entries.get(0).map(|entry| entry.term))
{
Ok(Some(term)) => {
self.match_term = term; // If we have the specified log, ensure we use its term.
}
@ -264,8 +302,10 @@ impl<D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>> Re
// This condition would only ever be reached if the log has been removed due to
// log compaction (barring critical storage failure), so transition to snapshotting.
self.target_state = TargetReplState::Snapshotting;
let _ = self.rafttx.send(ReplicaEvent::UpdateMatchIndex{
target: self.target, match_index: self.match_index, match_term: self.match_term,
let _ = self.rafttx.send(ReplicaEvent::UpdateMatchIndex {
target: self.target,
match_index: self.match_index,
match_term: self.match_term,
});
return;
}
@ -278,8 +318,10 @@ impl<D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>> Re
};
// Check snapshot policy and handle conflict as needed.
let _ = self.rafttx.send(ReplicaEvent::UpdateMatchIndex{
target: self.target, match_index: self.match_index, match_term: self.match_term,
let _ = self.rafttx.send(ReplicaEvent::UpdateMatchIndex {
target: self.target,
match_index: self.match_index,
match_term: self.match_term,
});
match &self.config.snapshot_policy {
SnapshotPolicy::LogsSinceLast(threshold) => {
@ -299,7 +341,7 @@ impl<D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>> Re
/// Perform a check to see if this replication stream is lagging behind far enough that a
/// snapshot is warranted.
#[tracing::instrument(level="trace", skip(self))]
#[tracing::instrument(level = "trace", skip(self))]
pub(self) fn needs_snapshot(&self) -> bool {
match &self.config.snapshot_policy {
SnapshotPolicy::LogsSinceLast(threshold) => {
@ -330,10 +372,10 @@ impl<D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>> Re
};
// Process the event.
match event {
RaftEvent::UpdateCommitIndex{commit_index} => {
RaftEvent::UpdateCommitIndex { commit_index } => {
self.commit_index = commit_index;
}
RaftEvent::Replicate{entry, commit_index} => {
RaftEvent::Replicate { entry, commit_index } => {
self.commit_index = commit_index;
self.last_log_index = entry.index;
if self.target_state == TargetReplState::LineRate {
@ -408,10 +450,11 @@ pub(crate) enum RaftEvent<D: AppData> {
/// An event coming from a replication stream.
pub(crate) enum ReplicaEvent<S>
where S: AsyncRead + AsyncSeek + Send + Unpin + 'static,
where
S: AsyncRead + AsyncSeek + Send + Unpin + 'static,
{
/// An event representing an update to the replication rate of a replication stream.
RateUpdate{
RateUpdate {
/// The ID of the Raft node to which this event relates.
target: NodeId,
/// A flag indicating if the corresponding target node is replicating at line rate.
@ -422,7 +465,7 @@ pub(crate) enum ReplicaEvent<S>
is_line_rate: bool,
},
/// An event from a replication stream which updates the target node's match index.
UpdateMatchIndex{
UpdateMatchIndex {
/// The ID of the target node for which the match index is to be updated.
target: NodeId,
/// The index of the most recent log known to have been successfully replicated on the target.
@ -431,14 +474,14 @@ pub(crate) enum ReplicaEvent<S>
match_term: u64,
},
/// An event indicating that the Raft node needs to revert to follower state.
RevertToFollower{
RevertToFollower {
/// The ID of the target node from which the new term was observed.
target: NodeId,
/// The new term observed.
term: u64,
},
/// An event from a replication stream requesting snapshot info.
NeedsSnapshot{
NeedsSnapshot {
/// The ID of the target node from which the event was sent.
target: NodeId,
/// The response channel for delivering the snapshot data.
@ -460,12 +503,15 @@ struct LineRateState<'a, D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: R
impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>> LineRateState<'a, D, R, N, S> {
/// Create a new instance.
pub fn new(core: &'a mut ReplicationCore<D, R, N, S>) -> Self {
Self{core}
Self { core }
}
#[tracing::instrument(level="trace", skip(self), fields(state="line-rate"))]
#[tracing::instrument(level = "trace", skip(self), fields(state = "line-rate"))]
pub async fn run(mut self) {
let event = ReplicaEvent::RateUpdate{target: self.core.target, is_line_rate: true};
let event = ReplicaEvent::RateUpdate {
target: self.core.target,
is_line_rate: true,
};
let _ = self.core.rafttx.send(event);
loop {
if self.core.target_state != TargetReplState::LineRate {
@ -473,7 +519,11 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>
}
// We always prioritize draining our buffers first.
let next_buf_index = self.core.outbound_buffer.first().map(|entry| entry.as_ref().index)
let next_buf_index = self
.core
.outbound_buffer
.first()
.map(|entry| entry.as_ref().index)
.or_else(|| self.core.replication_buffer.first().map(|entry| entry.index));
if let Some(index) = next_buf_index {
// Ensure that our buffered data matches up with `next_index`. When transitioning to
@ -490,7 +540,7 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>
self.core.send_append_entries().await;
continue;
}
tokio::select!{
tokio::select! {
_ = self.core.heartbeat.next() => self.core.send_append_entries().await,
event = self.core.raftrx.next() => match event {
Some(event) => self.core.drain_raftrx(event),
@ -501,7 +551,7 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>
}
/// Ensure there are no gaps in the outbound buffer due to transition from lagging.
#[tracing::instrument(level="trace", skip(self))]
#[tracing::instrument(level = "trace", skip(self))]
async fn frontload_outbound_buffer(&mut self, start: u64, stop: u64) {
let entries = match self.core.storage.get_log_entries(start, stop).await {
Ok(entries) => entries,
@ -536,12 +586,15 @@ struct LaggingState<'a, D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: Ra
impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>> LaggingState<'a, D, R, N, S> {
/// Create a new instance.
pub fn new(core: &'a mut ReplicationCore<D, R, N, S>) -> Self {
Self{core}
Self { core }
}
#[tracing::instrument(level="trace", skip(self), fields(state="lagging"))]
#[tracing::instrument(level = "trace", skip(self), fields(state = "lagging"))]
pub async fn run(mut self) {
let event = ReplicaEvent::RateUpdate{target: self.core.target, is_line_rate: false};
let event = ReplicaEvent::RateUpdate {
target: self.core.target,
is_line_rate: false,
};
let _ = self.core.rafttx.send(event);
self.core.replication_buffer.clear();
self.core.outbound_buffer.clear();
@ -581,7 +634,7 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>
}
/// Prep the outbound buffer with the next payload of entries to append.
#[tracing::instrument(level="trace", skip(self))]
#[tracing::instrument(level = "trace", skip(self))]
async fn prep_outbound_buffer_from_storage(&mut self) {
// If the send buffer is empty, we need to fill it.
if self.core.outbound_buffer.is_empty() {
@ -633,12 +686,19 @@ struct SnapshottingState<'a, D: AppData, R: AppDataResponse, N: RaftNetwork<D>,
impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>> SnapshottingState<'a, D, R, N, S> {
/// Create a new instance.
pub fn new(core: &'a mut ReplicationCore<D, R, N, S>) -> Self {
Self{core, snapshot: None, snapshot_fetch_rx: None}
Self {
core,
snapshot: None,
snapshot_fetch_rx: None,
}
}
#[tracing::instrument(level="trace", skip(self), fields(state="snapshotting"))]
#[tracing::instrument(level = "trace", skip(self), fields(state = "snapshotting"))]
pub async fn run(mut self) {
let event = ReplicaEvent::RateUpdate{target: self.core.target, is_line_rate: false};
let event = ReplicaEvent::RateUpdate {
target: self.core.target,
is_line_rate: false,
};
let _ = self.core.rafttx.send(event);
self.core.replication_buffer.clear();
self.core.outbound_buffer.clear();
@ -651,7 +711,10 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>
// If we don't have any of the components we need, fetch the current snapshot.
if self.snapshot.is_none() && self.snapshot_fetch_rx.is_none() {
let (tx, rx) = oneshot::channel();
let _ = self.core.rafttx.send(ReplicaEvent::NeedsSnapshot{target: self.core.target, tx});
let _ = self.core.rafttx.send(ReplicaEvent::NeedsSnapshot {
target: self.core.target,
tx,
});
self.snapshot_fetch_rx = Some(rx);
}
@ -676,10 +739,10 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>
///
    /// If an error comes up during processing, this routine should simply be called again after
/// issuing a new request to the storage layer.
#[tracing::instrument(level="trace", skip(self, rx))]
#[tracing::instrument(level = "trace", skip(self, rx))]
async fn wait_for_snapshot(&mut self, mut rx: oneshot::Receiver<CurrentSnapshotData<S::Snapshot>>) {
loop {
tokio::select!{
tokio::select! {
_ = self.core.heartbeat.next() => self.core.send_append_entries().await,
event = self.core.raftrx.next() => match event {
Some(event) => self.core.drain_raftrx(event),
@ -701,7 +764,7 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>
}
}
#[tracing::instrument(level="trace", skip(self, snapshot))]
#[tracing::instrument(level = "trace", skip(self, snapshot))]
async fn stream_snapshot(&mut self, mut snapshot: CurrentSnapshotData<S::Snapshot>) -> RaftResult<()> {
let mut offset = 0;
self.core.last_log_index = snapshot.index;
@ -714,11 +777,14 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>
snapshot.snapshot.seek(SeekFrom::Start(offset)).await?;
let nread = snapshot.snapshot.read_buf(&mut buf).await?;
let done = nread == 0; // If bytes read == 0, then we're done.
let req = InstallSnapshotRequest{
term: self.core.term, leader_id: self.core.id,
let req = InstallSnapshotRequest {
term: self.core.term,
leader_id: self.core.id,
last_included_index: snapshot.index,
last_included_term: snapshot.term,
offset, data: Vec::from(&buf[..nread]), done,
offset,
data: Vec::from(&buf[..nread]),
done,
};
buf.clear();
@ -740,7 +806,10 @@ impl<'a, D: AppData, R: AppDataResponse, N: RaftNetwork<D>, S: RaftStorage<D, R>
// Handle response conditions.
if res.term > self.core.term {
let _ = self.core.rafttx.send(ReplicaEvent::RevertToFollower{target: self.core.target, term: res.term});
let _ = self.core.rafttx.send(ReplicaEvent::RevertToFollower {
target: self.core.target,
term: res.term,
});
self.core.target_state = TargetReplState::Shutdown;
return Ok(());
}


@ -2,15 +2,16 @@
use anyhow::Result;
use async_trait::async_trait;
use serde::{Serialize, Deserialize};
use serde::{Deserialize, Serialize};
use tokio::io::{AsyncRead, AsyncSeek, AsyncWrite};
use crate::{AppData, AppDataResponse, NodeId};
use crate::raft::{Entry, MembershipConfig};
use crate::{AppData, AppDataResponse, NodeId};
/// The data associated with the current snapshot.
pub struct CurrentSnapshotData<S>
where S: AsyncRead + AsyncSeek + Send + Unpin + 'static,
where
S: AsyncRead + AsyncSeek + Send + Unpin + 'static,
{
/// The snapshot entry's term.
pub term: u64,
@ -56,9 +57,14 @@ impl InitialState {
/// ### `id`
/// The ID of the Raft node.
pub fn new_initial(id: NodeId) -> Self {
Self{
last_log_index: 0, last_log_term: 0, last_applied_log: 0,
hard_state: HardState{current_term: 0, voted_for: None},
Self {
last_log_index: 0,
last_log_term: 0,
last_applied_log: 0,
hard_state: HardState {
current_term: 0,
voted_for: None,
},
membership: MembershipConfig::new_initial(id),
}
}
@ -70,9 +76,9 @@ impl InitialState {
/// for details and discussion on this trait and how to implement it.
#[async_trait]
pub trait RaftStorage<D, R>: Send + Sync + 'static
where
D: AppData,
R: AppDataResponse,
where
D: AppData,
R: AppDataResponse,
{
/// The storage engine's associated type used for exposing a snapshot for reading & writing.
type Snapshot: AsyncRead + AsyncWrite + AsyncSeek + Send + Unpin + 'static;
@ -184,8 +190,7 @@ pub trait RaftStorage<D, R>: Send + Sync + 'static
/// `AsyncWriteExt.shutdown()` method will have been called, so no additional writes should be
/// made to the snapshot.
async fn finalize_snapshot_installation(
&self, index: u64, term: u64, delete_through: Option<u64>,
id: String, snapshot: Box<Self::Snapshot>,
&self, index: u64, term: u64, delete_through: Option<u64>, id: String, snapshot: Box<Self::Snapshot>,
) -> Result<()>;
/// Get a readable handle to the current snapshot, along with its metadata.


@ -18,7 +18,7 @@ use fixtures::RaftRouter;
/// - call the client_read interface on the followers, and assert failure.
///
/// RUST_LOG=async_raft,memstore,client_reads=trace cargo test -p async-raft --test client_reads
#[tokio::test(core_threads=4)]
#[tokio::test(core_threads = 4)]
async fn client_reads() -> Result<()> {
fixtures::init_tracing();
@ -42,7 +42,10 @@ async fn client_reads() -> Result<()> {
// Get the ID of the leader, and assert that client_read succeeds.
let leader = router.leader().await.expect("leader not found");
assert_eq!(leader, 0, "expected leader to be node 0, got {}", leader);
router.client_read(leader).await.expect(&format!("expected client_read to succeed for cluster leader {}", leader));
router
.client_read(leader)
.await
.unwrap_or_else(|_| panic!("expected client_read to succeed for cluster leader {}", leader));
router.client_read(1).await.expect_err("expected client_read on follower node 1 to fail");
router.client_read(2).await.expect_err("expected client_read on follower node 2 to fail");
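For context, `client_read` is the linearizable-read guard this test exercises. In application code the call looks roughly like the sketch below, assuming a `Raft` handle named `raft`; the comments on error handling are assumptions.

// Sketch: only serve a read once the Raft handle confirms this node is still leader.
match raft.client_read().await {
    Ok(()) => {
        // Safe to serve the read from local state.
    }
    Err(err) => {
        // Not the leader (or no quorum); forward the request or reject it.
        tracing::warn!({error=%err}, "rejecting read");
    }
}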
View File
@ -4,8 +4,8 @@ use std::sync::Arc;
use std::time::Duration;
use anyhow::Result;
use async_raft::Config;
use async_raft::raft::MembershipConfig;
use async_raft::Config;
use futures::prelude::*;
use maplit::hashset;
use tokio::time::delay_for;
@ -21,7 +21,7 @@ use fixtures::RaftRouter;
/// - assert that the cluster stayed stable and has all of the expected data.
///
/// RUST_LOG=async_raft,memstore,client_writes=trace cargo test -p async-raft --test client_writes
#[tokio::test(core_threads=4)]
#[tokio::test(core_threads = 4)]
async fn client_writes() -> Result<()> {
fixtures::init_tracing();
@ -51,10 +51,25 @@ async fn client_writes() -> Result<()> {
clients.push(router.client_request_many(leader, "3", 1000));
clients.push(router.client_request_many(leader, "4", 1000));
clients.push(router.client_request_many(leader, "5", 1000));
while let Some(_) = clients.next().await { }
while clients.next().await.is_some() {}
delay_for(Duration::from_secs(5)).await; // Ensure enough time is given for replication (this is WAY more than enough).
router.assert_stable_cluster(Some(1), Some(6001)).await; // The extra 1 is from the leader's initial commit entry.
router.assert_storage_state(1, 6001, Some(0), 6001, Some(((5000..5100).into(), 1, MembershipConfig{members: hashset![0, 1, 2], members_after_consensus: None}))).await;
router
.assert_storage_state(
1,
6001,
Some(0),
6001,
Some((
(5000..5100).into(),
1,
MembershipConfig {
members: hashset![0, 1, 2],
members_after_consensus: None,
},
)),
)
.await;
Ok(())
}
View File
@ -4,8 +4,8 @@ use std::sync::Arc;
use std::time::Duration;
use anyhow::Result;
use async_raft::{Config, SnapshotPolicy};
use async_raft::raft::MembershipConfig;
use async_raft::{Config, SnapshotPolicy};
use maplit::hashset;
use tokio::time::delay_for;
@ -20,15 +20,17 @@ use fixtures::RaftRouter;
/// - add new nodes and assert that they receive the snapshot.
///
/// RUST_LOG=async_raft,memstore,compaction=trace cargo test -p async-raft --test compaction
#[tokio::test(core_threads=4)]
#[tokio::test(core_threads = 4)]
async fn compaction() -> Result<()> {
fixtures::init_tracing();
// Setup test dependencies.
let config = Arc::new(Config::build("test".into())
.snapshot_policy(SnapshotPolicy::LogsSinceLast(500))
.validate()
.expect("failed to build Raft config"));
let config = Arc::new(
Config::build("test".into())
.snapshot_policy(SnapshotPolicy::LogsSinceLast(500))
.validate()
.expect("failed to build Raft config"),
);
let router = Arc::new(RaftRouter::new(config.clone()));
router.new_raft_node(0).await;
@ -46,15 +48,48 @@ async fn compaction() -> Result<()> {
router.client_request_many(0, "0", 499).await; // Puts us exactly at the configured snapshot policy threshold.
delay_for(Duration::from_secs(5)).await; // Wait to ensure there is enough time for a snapshot to be built (this is way more than enough).
router.assert_stable_cluster(Some(1), Some(500)).await;
router.assert_storage_state(1, 500, Some(0), 500, Some((500.into(), 1, MembershipConfig{members: hashset![0], members_after_consensus: None}))).await;
router
.assert_storage_state(
1,
500,
Some(0),
500,
Some((
500.into(),
1,
MembershipConfig {
members: hashset![0],
members_after_consensus: None,
},
)),
)
.await;
// Add a new node and assert that it received the same snapshot.
router.new_raft_node(1).await;
router.add_non_voter(0, 1).await.expect("failed to add new node as non-voter");
router.change_membership(0, hashset![0, 1]).await.expect("failed to modify cluster membership");
router
.change_membership(0, hashset![0, 1])
.await
.expect("failed to modify cluster membership");
delay_for(Duration::from_secs(5)).await; // Wait to ensure metrics are updated (this is way more than enough).
router.assert_stable_cluster(Some(1), Some(502)).await; // We expect index to be 500 + 2 (joint & uniform config change entries).
router.assert_storage_state(1, 502, None, 500, Some((500.into(), 1, MembershipConfig{members: hashset![0u64], members_after_consensus: None}))).await;
router
.assert_storage_state(
1,
502,
None,
500,
Some((
500.into(),
1,
MembershipConfig {
members: hashset![0u64],
members_after_consensus: None,
},
)),
)
.await;
// -------------------------------- ^^^^ this value is None because non-voters do not vote.
Ok(())
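The builder chain reformatted above is the standard way this crate's `Config` is assembled; as a stand-alone sketch (the cluster name and threshold are arbitrary):

// Sketch: build and validate a Raft Config with a log-compaction policy.
let config = Arc::new(
    Config::build("my-cluster".into())
        .snapshot_policy(SnapshotPolicy::LogsSinceLast(5000))
        .validate()
        .expect("failed to build Raft config"),
);

`validate()` is the step that turns the builder into a usable `Config`, returning an error if the settings do not hold together, which is why the tests wrap it in `expect`.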
View File
@ -22,7 +22,7 @@ use fixtures::RaftRouter;
/// - restore the isolated node and assert that it becomes a follower.
///
/// RUST_LOG=async_raft,memstore,dynamic_membership=trace cargo test -p async-raft --test dynamic_membership
#[tokio::test(core_threads=6)]
#[tokio::test(core_threads = 6)]
async fn dynamic_membership() -> Result<()> {
fixtures::init_tracing();
View File
@ -7,15 +7,15 @@ use std::sync::Arc;
use anyhow::{anyhow, Result};
use async_raft::async_trait::async_trait;
use async_raft::{Config, NodeId, Raft, RaftMetrics, RaftNetwork, State};
use async_raft::error::{ChangeConfigError, ClientReadError, ClientWriteError};
use async_raft::raft::ClientWriteRequest;
use async_raft::raft::MembershipConfig;
use async_raft::raft::{AppendEntriesRequest, AppendEntriesResponse};
use async_raft::raft::{InstallSnapshotRequest, InstallSnapshotResponse};
use async_raft::raft::{VoteRequest, VoteResponse};
use async_raft::raft::ClientWriteRequest;
use async_raft::raft::MembershipConfig;
use async_raft::storage::RaftStorage;
use memstore::{MemStore, ClientRequest as MemClientRequest, ClientResponse as MemClientResponse};
use async_raft::{Config, NodeId, Raft, RaftMetrics, RaftNetwork, State};
use memstore::{ClientRequest as MemClientRequest, ClientResponse as MemClientResponse, MemStore};
use tokio::sync::RwLock;
use tracing_subscriber::prelude::*;
@ -49,7 +49,11 @@ pub struct RaftRouter {
impl RaftRouter {
/// Create a new instance.
pub fn new(config: Arc<Config>) -> Self {
Self{config, routing_table: Default::default(), isolated_nodes: Default::default()}
Self {
config,
routing_table: Default::default(),
isolated_nodes: Default::default(),
}
}
/// Create and register a new Raft node bearing the given ID.
@ -70,7 +74,7 @@ impl RaftRouter {
/// Initialize all nodes based on the config in the routing table.
pub async fn initialize_from_single_node(&self, node: NodeId) -> Result<()> {
tracing::info!({node}, "initializing cluster from single node");
tracing::info!({ node }, "initializing cluster from single node");
let rt = self.routing_table.read().await;
let members: HashSet<NodeId> = rt.keys().cloned().collect();
rt.get(&node)
@ -82,7 +86,7 @@ impl RaftRouter {
}
/// Isolate the network of the specified node.
#[tracing::instrument(level="debug", skip(self))]
#[tracing::instrument(level = "debug", skip(self))]
pub async fn isolate_node(&self, id: NodeId) {
self.isolated_nodes.write().await.insert(id);
}
@ -100,19 +104,21 @@ impl RaftRouter {
/// Get the ID of the current leader.
pub async fn leader(&self) -> Option<NodeId> {
let isolated = self.isolated_nodes.read().await;
self.latest_metrics().await.into_iter().find_map(|node| if node.current_leader == Some(node.id) {
if isolated.contains(&node.id) {
None
self.latest_metrics().await.into_iter().find_map(|node| {
if node.current_leader == Some(node.id) {
if isolated.contains(&node.id) {
None
} else {
Some(node.id)
}
} else {
Some(node.id)
None
}
} else {
None
})
}
/// Restore the network of the specified node.
#[tracing::instrument(level="debug", skip(self))]
#[tracing::instrument(level = "debug", skip(self))]
pub async fn restore_node(&self, id: NodeId) {
let mut nodes = self.isolated_nodes.write().await;
nodes.remove(&id);
@ -120,26 +126,30 @@ impl RaftRouter {
pub async fn add_non_voter(&self, leader: NodeId, target: NodeId) -> Result<(), ChangeConfigError> {
let rt = self.routing_table.read().await;
let node = rt.get(&leader).expect(&format!("node with ID {} does not exist", leader));
let node = rt.get(&leader).unwrap_or_else(|| panic!("node with ID {} does not exist", leader));
node.0.add_non_voter(target).await
}
pub async fn change_membership(&self, leader: NodeId, members: HashSet<NodeId>) -> Result<(), ChangeConfigError> {
let rt = self.routing_table.read().await;
let node = rt.get(&leader).expect(&format!("node with ID {} does not exist", leader));
let node = rt.get(&leader).unwrap_or_else(|| panic!("node with ID {} does not exist", leader));
node.0.change_membership(members).await
}
/// Send a client read request to the target node.
pub async fn client_read(&self, target: NodeId) -> Result<(), ClientReadError> {
let rt = self.routing_table.read().await;
let node = rt.get(&target).expect(&format!("node with ID {} does not exist", target));
let node = rt.get(&target).unwrap_or_else(|| panic!("node with ID {} does not exist", target));
node.0.client_read().await
}
/// Send a client request to the target node, causing test failure on error.
pub async fn client_request(&self, target: NodeId, client_id: &str, serial: u64) {
let req = MemClientRequest{client: client_id.into(), serial, status: format!("request-{}", serial)};
let req = MemClientRequest {
client: client_id.into(),
serial,
status: format!("request-{}", serial),
};
if let Err(err) = self.send_client_request(target, req).await {
tracing::error!({error=%err}, "error from client request");
panic!(err)
@ -153,9 +163,13 @@ impl RaftRouter {
}
}
async fn send_client_request(&self, target: NodeId, req: MemClientRequest) -> std::result::Result<MemClientResponse, ClientWriteError<MemClientRequest>> {
async fn send_client_request(
&self, target: NodeId, req: MemClientRequest,
) -> std::result::Result<MemClientResponse, ClientWriteError<MemClientRequest>> {
let rt = self.routing_table.read().await;
let node = rt.get(&target).expect(&format!("node '{}' does not exist in routing table", target));
let node = rt
.get(&target)
.unwrap_or_else(|| panic!("node '{}' does not exist in routing table", target));
node.0.client_write(ClientWriteRequest::new(req)).await.map(|res| res.data)
}
@ -169,11 +183,23 @@ impl RaftRouter {
assert!(node.current_leader.is_none(), "node {} has a current leader, expected none", node.id);
assert_eq!(node.state, State::NonVoter, "node is in state {:?}, expected NonVoter", node.state);
assert_eq!(node.current_term, 0, "node {} has term {}, expected 0", node.id, node.current_term);
assert_eq!(node.last_applied, 0, "node {} has last_applied {}, expected 0", node.id, node.last_applied);
assert_eq!(node.last_log_index, 0, "node {} has last_log_index {}, expected 0", node.id, node.last_log_index);
assert_eq!(
node.last_applied, 0,
"node {} has last_applied {}, expected 0",
node.id, node.last_applied
);
assert_eq!(
node.last_log_index, 0,
"node {} has last_log_index {}, expected 0",
node.id, node.last_log_index
);
let members = node.membership_config.members.iter().collect::<Vec<_>>();
assert_eq!(members, vec![&node.id], "node {0} has membership {1:?}, expected [{0}]", node.id, members);
assert!(node.membership_config.members_after_consensus.is_none(), "node {} is in joint consensus, expected uniform consensus", node.id);
assert!(
node.membership_config.members_after_consensus.is_none(),
"node {} is in joint consensus, expected uniform consensus",
node.id
);
}
}
@ -189,21 +215,24 @@ impl RaftRouter {
let isolated = self.isolated_nodes.read().await;
let nodes = self.latest_metrics().await;
let non_isolated_nodes: Vec<_> = nodes.iter()
.filter(|node| !isolated.contains(&node.id))
.collect();
let leader = nodes.iter()
let non_isolated_nodes: Vec<_> = nodes.iter().filter(|node| !isolated.contains(&node.id)).collect();
let leader = nodes
.iter()
.filter(|node| !isolated.contains(&node.id))
.find(|node| node.state == State::Leader)
.expect("expected to find a cluster leader");
let followers: Vec<_> = nodes.iter()
let followers: Vec<_> = nodes
.iter()
.filter(|node| !isolated.contains(&node.id))
.filter(|node| node.state == State::Follower)
.collect();
assert_eq!(followers.len() + 1, non_isolated_nodes.len(),
assert_eq!(
followers.len() + 1,
non_isolated_nodes.len(),
"expected all nodes to be followers with one leader, got 1 leader and {} followers, expected {} followers",
followers.len(), non_isolated_nodes.len() - 1,
followers.len(),
non_isolated_nodes.len() - 1,
);
let expected_term = match expected_term {
Some(term) => term,
@ -215,46 +244,116 @@ impl RaftRouter {
};
let all_nodes = nodes.iter().map(|node| node.id).collect::<Vec<_>>();
for node in non_isolated_nodes.iter() {
assert_eq!(node.current_leader, Some(leader.id), "node {} has leader {:?}, expected {}", node.id, node.current_leader, leader.id);
assert_eq!(node.current_term, expected_term, "node {} has term {}, expected {}", node.id, node.current_term, expected_term);
assert_eq!(node.last_applied, expected_last_log, "node {} has last_applied {}, expected {}", node.id, node.last_applied, expected_last_log);
assert_eq!(node.last_log_index, expected_last_log, "node {} has last_log_index {}, expected {}", node.id, node.last_log_index, expected_last_log);
assert_eq!(
node.current_leader,
Some(leader.id),
"node {} has leader {:?}, expected {}",
node.id,
node.current_leader,
leader.id
);
assert_eq!(
node.current_term, expected_term,
"node {} has term {}, expected {}",
node.id, node.current_term, expected_term
);
assert_eq!(
node.last_applied, expected_last_log,
"node {} has last_applied {}, expected {}",
node.id, node.last_applied, expected_last_log
);
assert_eq!(
node.last_log_index, expected_last_log,
"node {} has last_log_index {}, expected {}",
node.id, node.last_log_index, expected_last_log
);
let mut members = node.membership_config.members.iter().cloned().collect::<Vec<_>>();
members.sort();
assert_eq!(members, all_nodes, "node {} has membership {:?}, expected {:?}", node.id, members, all_nodes);
assert!(node.membership_config.members_after_consensus.is_none(), "node {} was not in uniform consensus state", node.id);
assert_eq!(
members, all_nodes,
"node {} has membership {:?}, expected {:?}",
node.id, members, all_nodes
);
assert!(
node.membership_config.members_after_consensus.is_none(),
"node {} was not in uniform consensus state",
node.id
);
}
}
/// Assert against the state of the storage system per node in the cluster.
pub async fn assert_storage_state(
&self, expect_term: u64, expect_last_log: u64, expect_voted_for: Option<u64>,
expect_sm_last_applied_log: u64,
&self, expect_term: u64, expect_last_log: u64, expect_voted_for: Option<u64>, expect_sm_last_applied_log: u64,
expect_snapshot: Option<(ValueTest<u64>, u64, MembershipConfig)>,
) {
let rt = self.routing_table.read().await;
for (id, (_node, storage)) in rt.iter() {
let log = storage.get_log().await;
let last_log = log.keys().last().expect(&format!("no last log found for node {}", id));
assert_eq!(last_log, &expect_last_log, "expected node {} to have last_log {}, got {}", id, expect_last_log, last_log);
let hs = storage.read_hard_state().await.clone().expect(&format!("no hardstate found for node {}", id));
assert_eq!(hs.current_term, expect_term, "expected node {} to have term {}, got {}", id, expect_term, hs.current_term);
let last_log = log.keys().last().unwrap_or_else(|| panic!("no last log found for node {}", id));
assert_eq!(
last_log, &expect_last_log,
"expected node {} to have last_log {}, got {}",
id, expect_last_log, last_log
);
let hs = storage
.read_hard_state()
.await
.clone()
.unwrap_or_else(|| panic!("no hardstate found for node {}", id));
assert_eq!(
hs.current_term, expect_term,
"expected node {} to have term {}, got {}",
id, expect_term, hs.current_term
);
if let Some(voted_for) = &expect_voted_for {
assert_eq!(hs.voted_for.as_ref(), Some(voted_for), "expected node {} to have voted for {}, got {:?}", id, voted_for, hs.voted_for);
assert_eq!(
hs.voted_for.as_ref(),
Some(voted_for),
"expected node {} to have voted for {}, got {:?}",
id,
voted_for,
hs.voted_for
);
}
if let Some((index_test, term, cfg)) = &expect_snapshot {
let snap = storage.get_current_snapshot().await
.map_err(|err| panic!("{}", err)).unwrap()
.expect(&format!("no snapshot present for node {}", id));
let snap = storage
.get_current_snapshot()
.await
.map_err(|err| panic!("{}", err))
.unwrap()
.unwrap_or_else(|| panic!("no snapshot present for node {}", id));
match index_test {
ValueTest::Exact(index) => assert_eq!(&snap.index, index, "expected node {} to have snapshot with index {}, got {}", id, index, snap.index),
ValueTest::Range(range) => assert!(range.contains(&snap.index), "expected node {} to have snapshot within range {:?}, got {}", id, range, snap.index),
ValueTest::Exact(index) => assert_eq!(
&snap.index, index,
"expected node {} to have snapshot with index {}, got {}",
id, index, snap.index
),
ValueTest::Range(range) => assert!(
range.contains(&snap.index),
"expected node {} to have snapshot within range {:?}, got {}",
id,
range,
snap.index
),
}
assert_eq!(&snap.term, term, "expected node {} to have snapshot with term {}, got {}", id, term, snap.term);
assert_eq!(&snap.membership, cfg, "expected node {} to have membership config {:?}, got {:?}", id, cfg, snap.membership);
assert_eq!(
&snap.term, term,
"expected node {} to have snapshot with term {}, got {}",
id, term, snap.term
);
assert_eq!(
&snap.membership, cfg,
"expected node {} to have membership config {:?}, got {:?}",
id, cfg, snap.membership
);
}
let sm = storage.get_state_machine().await;
assert_eq!(&sm.last_applied_log, &expect_sm_last_applied_log, "expected node {} to have state machine last_applied_log {}, got {}", id, expect_sm_last_applied_log, sm.last_applied_log);
assert_eq!(
&sm.last_applied_log, &expect_sm_last_applied_log,
"expected node {} to have state machine last_applied_log {}, got {}",
id, expect_sm_last_applied_log, sm.last_applied_log
);
}
}
}
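All of the router's write helpers above funnel through `client_write`. Outside the test harness the same call looks roughly like the sketch below, assuming a `Raft` handle named `raft`; the request values and the retry comment are assumptions.

// Sketch: submit one application request through the Raft handle.
let req = MemClientRequest {
    client: "client-1".into(),
    serial: 7,
    status: "request-7".into(),
};
match raft.client_write(ClientWriteRequest::new(req)).await {
    Ok(resp) => {
        // resp.data carries the application's response once the entry is applied.
    }
    Err(err) => {
        // Typically this node is not the leader; retry against the reported leader.
        tracing::error!({error=%err}, "client write failed");
    }
}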
View File
@ -21,7 +21,7 @@ use fixtures::RaftRouter;
/// followers have successfully replicated the payload.
///
/// RUST_LOG=async_raft,memstore,initialization=trace cargo test -p async-raft --test initialization
#[tokio::test(core_threads=4)]
#[tokio::test(core_threads = 4)]
async fn initialization() -> Result<()> {
fixtures::init_tracing();
View File
@ -20,7 +20,7 @@ use fixtures::RaftRouter;
/// - asserts that the leader was able to successfully commit its initial payload.
///
/// RUST_LOG=async_raft,memstore,singlenode=trace cargo test -p async-raft --test singlenode
#[tokio::test(core_threads=4)]
#[tokio::test(core_threads = 4)]
async fn singlenode() -> Result<()> {
fixtures::init_tracing();
View File
@ -20,7 +20,7 @@ use fixtures::RaftRouter;
/// after the config change is committed.
///
/// RUST_LOG=async_raft,memstore,stepdown=trace cargo test -p async-raft --test stepdown
#[tokio::test(core_threads=5)]
#[tokio::test(core_threads = 5)]
async fn stepdown() -> Result<()> {
fixtures::init_tracing();
@ -50,14 +50,35 @@ async fn stepdown() -> Result<()> {
// Assert on the state of the old leader.
{
let metrics = router.latest_metrics().await.into_iter().find(|node| node.id == 0)
let metrics = router
.latest_metrics()
.await
.into_iter()
.find(|node| node.id == 0)
.expect("expected to find metrics on original leader node");
let cfg = metrics.membership_config;
assert!(metrics.state != State::Leader, "expected old leader to have stepped down");
assert_eq!(metrics.current_term, 1, "expected old leader to still be in first term, got {}", metrics.current_term);
assert_eq!(metrics.last_log_index, 3, "expected old leader to have last log index of 3, got {}", metrics.last_log_index);
assert_eq!(metrics.last_applied, 3, "expected old leader to have last applied of 3, got {}", metrics.last_applied);
assert_eq!(cfg.members, hashset![1, 2, 3], "expected old leader to have membership of [1, 2, 3], got {:?}", cfg.members);
assert_eq!(
metrics.current_term, 1,
"expected old leader to still be in first term, got {}",
metrics.current_term
);
assert_eq!(
metrics.last_log_index, 3,
"expected old leader to have last log index of 3, got {}",
metrics.last_log_index
);
assert_eq!(
metrics.last_applied, 3,
"expected old leader to have last applied of 3, got {}",
metrics.last_applied
);
assert_eq!(
cfg.members,
hashset![1, 2, 3],
"expected old leader to have membership of [1, 2, 3], got {:?}",
cfg.members
);
assert!(cfg.members_after_consensus.is_none(), "expected old leader to be out of joint consensus");
}
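The step-down under test is the result of handing the current leader a membership that no longer contains it. A sketch of that call (the exact invocation sits outside this hunk), with node IDs matching this test's assertions:

// Sketch: ask the current leader (node 0 here) for a membership that excludes it.
router
    .change_membership(0, hashset![1, 2, 3])
    .await
    .expect("failed to modify cluster membership");
// Once the joint and uniform config entries commit, node 0 must step down.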
View File
@ -1 +1,2 @@
too-many-arguments-threshold = 10
cognitive-complexity-threshold = 25
View File
@ -1,5 +1,5 @@
#![cfg_attr(feature="docinclude", feature(external_doc))]
#![cfg_attr(feature="docinclude", doc(include="../README.md"))]
#![cfg_attr(feature = "docinclude", feature(external_doc))]
#![cfg_attr(feature = "docinclude", doc(include = "../README.md"))]
#[cfg(test)]
mod test;
@ -9,10 +9,10 @@ use std::io::Cursor;
use anyhow::Result;
use async_raft::async_trait::async_trait;
use async_raft::{AppData, AppDataResponse, NodeId, RaftStorage};
use async_raft::raft::{Entry, EntryPayload, MembershipConfig};
use async_raft::storage::{CurrentSnapshotData, HardState, InitialState};
use serde::{Serialize, Deserialize};
use async_raft::{AppData, AppDataResponse, NodeId, RaftStorage};
use serde::{Deserialize, Serialize};
use tokio::sync::RwLock;
use tokio::sync::{RwLockReadGuard, RwLockWriteGuard};
@ -94,23 +94,32 @@ impl MemStore {
let sm = RwLock::new(MemStoreStateMachine::default());
let hs = RwLock::new(None);
let current_snapshot = RwLock::new(None);
Self{id, log, sm, hs, current_snapshot}
Self {
id,
log,
sm,
hs,
current_snapshot,
}
}
/// Create a new `MemStore` instance with some existing state (for testing).
#[cfg(test)]
pub fn new_with_state(
id: NodeId,
log: BTreeMap<u64, Entry<ClientRequest>>,
sm: MemStoreStateMachine,
hs: Option<HardState>,
id: NodeId, log: BTreeMap<u64, Entry<ClientRequest>>, sm: MemStoreStateMachine, hs: Option<HardState>,
current_snapshot: Option<MemStoreSnapshot>,
) -> Self {
let log = RwLock::new(log);
let sm = RwLock::new(sm);
let hs = RwLock::new(hs);
let current_snapshot = RwLock::new(current_snapshot);
Self{id, log, sm, hs, current_snapshot}
Self {
id,
log,
sm,
hs,
current_snapshot,
}
}
/// Get a handle to the log for testing purposes.
@ -133,7 +142,7 @@ impl MemStore {
impl RaftStorage<ClientRequest, ClientResponse> for MemStore {
type Snapshot = Cursor<Vec<u8>>;
#[tracing::instrument(level="trace", skip(self))]
#[tracing::instrument(level = "trace", skip(self))]
async fn get_membership_config(&self) -> Result<MembershipConfig> {
let log = self.log.read().await;
let cfg_opt = log.values().rev().find_map(|entry| match &entry.payload {
@ -147,7 +156,7 @@ impl RaftStorage<ClientRequest, ClientResponse> for MemStore {
})
}
#[tracing::instrument(level="trace", skip(self))]
#[tracing::instrument(level = "trace", skip(self))]
async fn get_initial_state(&self) -> Result<InitialState> {
let membership = self.get_membership_config().await?;
let mut hs = self.hs.write().await;
@ -160,7 +169,7 @@ impl RaftStorage<ClientRequest, ClientResponse> for MemStore {
None => (0, 0),
};
let last_applied_log = sm.last_applied_log;
return Ok(InitialState{
return Ok(InitialState {
last_log_index,
last_log_term,
last_applied_log,
@ -176,13 +185,13 @@ impl RaftStorage<ClientRequest, ClientResponse> for MemStore {
}
}
#[tracing::instrument(level="trace", skip(self, hs))]
#[tracing::instrument(level = "trace", skip(self, hs))]
async fn save_hard_state(&self, hs: &HardState) -> Result<()> {
*self.hs.write().await = Some(hs.clone());
Ok(())
}
#[tracing::instrument(level="trace", skip(self))]
#[tracing::instrument(level = "trace", skip(self))]
async fn get_log_entries(&self, start: u64, stop: u64) -> Result<Vec<Entry<ClientRequest>>> {
// Invalid request, return empty vec.
if start > stop {
@ -193,7 +202,7 @@ impl RaftStorage<ClientRequest, ClientResponse> for MemStore {
Ok(log.range(start..stop).map(|(_, val)| val.clone()).collect())
}
#[tracing::instrument(level="trace", skip(self))]
#[tracing::instrument(level = "trace", skip(self))]
async fn delete_logs_from(&self, start: u64, stop: Option<u64>) -> Result<()> {
if stop.as_ref().map(|stop| &start > stop).unwrap_or(false) {
tracing::error!("invalid request, start > stop");
@ -213,14 +222,14 @@ impl RaftStorage<ClientRequest, ClientResponse> for MemStore {
Ok(())
}
#[tracing::instrument(level="trace", skip(self, entry))]
#[tracing::instrument(level = "trace", skip(self, entry))]
async fn append_entry_to_log(&self, entry: &Entry<ClientRequest>) -> Result<()> {
let mut log = self.log.write().await;
log.insert(entry.index, entry.clone());
Ok(())
}
#[tracing::instrument(level="trace", skip(self, entries))]
#[tracing::instrument(level = "trace", skip(self, entries))]
async fn replicate_to_log(&self, entries: &[Entry<ClientRequest>]) -> Result<()> {
let mut log = self.log.write().await;
for entry in entries {
@ -229,13 +238,13 @@ impl RaftStorage<ClientRequest, ClientResponse> for MemStore {
Ok(())
}
#[tracing::instrument(level="trace", skip(self, data))]
#[tracing::instrument(level = "trace", skip(self, data))]
async fn apply_entry_to_state_machine(&self, index: &u64, data: &ClientRequest) -> Result<ClientResponse> {
let mut sm = self.sm.write().await;
sm.last_applied_log = *index;
if let Some((serial, res)) = sm.client_serial_responses.get(&data.client) {
if serial == &data.serial {
return Ok(ClientResponse(Ok(res.clone())))
return Ok(ClientResponse(Ok(res.clone())));
}
}
let previous = sm.client_status.insert(data.client.clone(), data.status.clone());
@ -243,7 +252,7 @@ impl RaftStorage<ClientRequest, ClientResponse> for MemStore {
Ok(ClientResponse(Ok(previous)))
}
#[tracing::instrument(level="trace", skip(self, entries))]
#[tracing::instrument(level = "trace", skip(self, entries))]
async fn replicate_to_state_machine(&self, entries: &[(&u64, &ClientRequest)]) -> Result<()> {
let mut sm = self.sm.write().await;
for (index, data) in entries {
@ -259,7 +268,7 @@ impl RaftStorage<ClientRequest, ClientResponse> for MemStore {
Ok(())
}
#[tracing::instrument(level="trace", skip(self))]
#[tracing::instrument(level = "trace", skip(self))]
async fn do_log_compaction(&self, through: u64) -> Result<CurrentSnapshotData<Self::Snapshot>> {
let data;
{
@ -272,7 +281,9 @@ impl RaftStorage<ClientRequest, ClientResponse> for MemStore {
{
// Go backwards through the log to find the most recent membership config <= the `through` index.
let log = self.log.read().await;
membership_config = log.values().rev()
membership_config = log
.values()
.rev()
.skip_while(|entry| entry.index > through)
.find_map(|entry| match &entry.payload {
EntryPayload::ConfigChange(cfg) => Some(cfg.membership.clone()),
@ -286,30 +297,42 @@ impl RaftStorage<ClientRequest, ClientResponse> for MemStore {
{
let mut log = self.log.write().await;
let mut current_snapshot = self.current_snapshot.write().await;
term = log.get(&through).map(|entry| entry.term).ok_or_else(|| anyhow::anyhow!(ERR_INCONSISTENT_LOG))?;
term = log
.get(&through)
.map(|entry| entry.term)
.ok_or_else(|| anyhow::anyhow!(ERR_INCONSISTENT_LOG))?;
*log = log.split_off(&through);
log.insert(through, Entry::new_snapshot_pointer(through, term, "".into(), membership_config.clone()));
let snapshot = MemStoreSnapshot{index: through, term, membership: membership_config.clone(), data};
let snapshot = MemStoreSnapshot {
index: through,
term,
membership: membership_config.clone(),
data,
};
snapshot_bytes = serde_json::to_vec(&snapshot)?;
*current_snapshot = Some(snapshot);
} // Release log & snapshot write locks.
tracing::trace!({snapshot_size=snapshot_bytes.len()}, "log compaction complete");
Ok(CurrentSnapshotData{
term, index: through, membership: membership_config.clone(),
tracing::trace!({ snapshot_size = snapshot_bytes.len() }, "log compaction complete");
Ok(CurrentSnapshotData {
term,
index: through,
membership: membership_config.clone(),
snapshot: Box::new(Cursor::new(snapshot_bytes)),
})
}
#[tracing::instrument(level="trace", skip(self))]
#[tracing::instrument(level = "trace", skip(self))]
async fn create_snapshot(&self) -> Result<(String, Box<Self::Snapshot>)> {
Ok((String::from(""), Box::new(Cursor::new(Vec::new())))) // Snapshot IDs are insignificant to this storage engine.
}
#[tracing::instrument(level="trace", skip(self, snapshot))]
async fn finalize_snapshot_installation(&self, index: u64, term: u64, delete_through: Option<u64>, id: String, snapshot: Box<Self::Snapshot>) -> Result<()> {
tracing::trace!({snapshot_size=snapshot.get_ref().len()}, "decoding snapshot for installation");
#[tracing::instrument(level = "trace", skip(self, snapshot))]
async fn finalize_snapshot_installation(
&self, index: u64, term: u64, delete_through: Option<u64>, id: String, snapshot: Box<Self::Snapshot>,
) -> Result<()> {
tracing::trace!({ snapshot_size = snapshot.get_ref().len() }, "decoding snapshot for installation");
let raw = serde_json::to_string_pretty(snapshot.get_ref().as_slice())?;
println!("JSON SNAP:\n{}", raw);
let new_snapshot: MemStoreSnapshot = serde_json::from_slice(snapshot.get_ref().as_slice())?;
@ -317,7 +340,9 @@ impl RaftStorage<ClientRequest, ClientResponse> for MemStore {
{
// Go backwards through the log to find the most recent membership config <= the `through` index.
let mut log = self.log.write().await;
let membership_config = log.values().rev()
let membership_config = log
.values()
.rev()
.skip_while(|entry| entry.index > index)
.find_map(|entry| match &entry.payload {
EntryPayload::ConfigChange(cfg) => Some(cfg.membership.clone()),
@ -347,12 +372,12 @@ impl RaftStorage<ClientRequest, ClientResponse> for MemStore {
Ok(())
}
#[tracing::instrument(level="trace", skip(self))]
#[tracing::instrument(level = "trace", skip(self))]
async fn get_current_snapshot(&self) -> Result<Option<CurrentSnapshotData<Self::Snapshot>>> {
match &*self.current_snapshot.read().await {
Some(snapshot) => {
let reader = serde_json::to_vec(&snapshot)?;
Ok(Some(CurrentSnapshotData{
Ok(Some(CurrentSnapshotData {
index: snapshot.index,
term: snapshot.term,
membership: snapshot.membership.clone(),
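Taken together, the compaction and snapshot hunks above let a `MemStore` be exercised end to end in a few lines. Below is a sketch in the style of this crate's tests; the indices and blank payloads are placeholders, not an existing test.

// Sketch: populate a log, compact it, then read the snapshot back.
#[tokio::test]
async fn snapshot_round_trip_sketch() -> Result<()> {
    let store = MemStore::new(0);
    for index in 1..=10u64 {
        store.append_entry_to_log(&Entry { term: 1, index, payload: EntryPayload::Blank }).await?;
    }
    let snapshot = store.do_log_compaction(10).await?; // compacts the log through index 10
    assert_eq!(snapshot.index, 10);
    assert!(store.get_current_snapshot().await?.is_some());
    Ok(())
}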
View File
@ -13,7 +13,10 @@ async fn test_get_membership_config_default() -> Result<()> {
let store = MemStore::new(NODE_ID);
let membership = store.get_membership_config().await?;
assert_eq!(membership.members.len(), 1, "expected members len of 1");
assert!(membership.members_after_consensus.is_none(), "expected None for default members_after_consensus");
assert!(
membership.members_after_consensus.is_none(),
"expected None for default members_after_consensus"
);
Ok(())
}
@ -24,11 +27,24 @@ async fn test_get_membership_config_with_previous_state() -> Result<()> {
members.insert(1);
members.insert(2);
members.insert(3);
log.insert(1, Entry{term: 1, index: 1, payload: EntryPayload::ConfigChange(EntryConfigChange{
membership: MembershipConfig{members: members.clone(), members_after_consensus: None}
})});
log.insert(
1,
Entry {
term: 1,
index: 1,
payload: EntryPayload::ConfigChange(EntryConfigChange {
membership: MembershipConfig {
members: members.clone(),
members_after_consensus: None,
},
}),
},
);
let sm = MemStoreStateMachine::default();
let hs = HardState{current_term: 1, voted_for: Some(NODE_ID)};
let hs = HardState {
current_term: 1,
voted_for: Some(NODE_ID),
};
let store = MemStore::new_with_state(NODE_ID, log, sm, Some(hs.clone()), None);
let initial = store.get_membership_config().await?;
@ -44,7 +60,10 @@ async fn test_get_membership_config_with_previous_state() -> Result<()> {
#[tokio::test]
async fn test_get_initial_state_default() -> Result<()> {
let store = MemStore::new(NODE_ID);
let expected_hs = HardState{current_term: 0, voted_for: None};
let expected_hs = HardState {
current_term: 0,
voted_for: None,
};
let expected_membership = MembershipConfig::new_initial(NODE_ID);
let initial = store.get_initial_state().await?;
@ -60,10 +79,20 @@ async fn test_get_initial_state_default() -> Result<()> {
#[tokio::test]
async fn test_get_initial_state_with_previous_state() -> Result<()> {
let mut log = BTreeMap::new();
log.insert(1, Entry{term: 1, index: 1, payload: EntryPayload::Blank});
log.insert(
1,
Entry {
term: 1,
index: 1,
payload: EntryPayload::Blank,
},
);
let mut sm = MemStoreStateMachine::default();
sm.last_applied_log = 1; // Just stubbed in for testing.
let hs = HardState{current_term: 1, voted_for: Some(NODE_ID)};
let hs = HardState {
current_term: 1,
voted_for: Some(NODE_ID),
};
let store = MemStore::new_with_state(NODE_ID, log, sm, Some(hs.clone()), None);
let initial = store.get_initial_state().await?;
@ -81,13 +110,19 @@ async fn test_get_initial_state_with_previous_state() -> Result<()> {
#[tokio::test]
async fn test_save_hard_state() -> Result<()> {
let store = MemStore::new(NODE_ID);
let new_hs = HardState{current_term: 100, voted_for: Some(NODE_ID)};
let new_hs = HardState {
current_term: 100,
voted_for: Some(NODE_ID),
};
let initial = store.get_initial_state().await?;
store.save_hard_state(&new_hs).await?;
let post = store.get_initial_state().await?;
assert_ne!(initial.hard_state, post.hard_state, "hard state was expected to be different after update");
assert_ne!(
initial.hard_state, post.hard_state,
"hard state was expected to be different after update"
);
Ok(())
}
@ -173,7 +208,13 @@ async fn test_delete_logs_from_deletes_only_target_logs() -> Result<()> {
async fn test_append_entry_to_log() -> Result<()> {
let store = default_store_with_logs();
store.append_entry_to_log(&Entry{term: 2, index: 10, payload: EntryPayload::Blank}).await?;
store
.append_entry_to_log(&Entry {
term: 2,
index: 10,
payload: EntryPayload::Blank,
})
.await?;
let log = store.get_log().await;
assert_eq!(log.len(), 10, "expected 10 entries to exist in the log");
@ -189,7 +230,13 @@ async fn test_append_entry_to_log() -> Result<()> {
async fn test_replicate_to_log() -> Result<()> {
let store = default_store_with_logs();
store.replicate_to_log(&[Entry{term: 1, index: 11, payload: EntryPayload::Blank}]).await?;
store
.replicate_to_log(&[Entry {
term: 1,
index: 11,
payload: EntryPayload::Blank,
}])
.await?;
let log = store.get_log().await;
assert_eq!(log.len(), 11, "expected 11 entries to exist in the log");
@ -205,11 +252,23 @@ async fn test_replicate_to_log() -> Result<()> {
async fn test_apply_entry_to_state_machine() -> Result<()> {
let store = default_store_with_logs();
store.apply_entry_to_state_machine(&1, &ClientRequest{client: "0".into(), serial: 0, status: "lit".into()}).await?;
store
.apply_entry_to_state_machine(
&1,
&ClientRequest {
client: "0".into(),
serial: 0,
status: "lit".into(),
},
)
.await?;
let sm = store.get_state_machine().await;
assert_eq!(sm.last_applied_log, 1, "expected last_applied_log to be 1, got {}", sm.last_applied_log);
let client_serial = sm.client_serial_responses.get("0").expect("expected entry to exist in client_serial_responses");
let client_serial = sm
.client_serial_responses
.get("0")
.expect("expected entry to exist in client_serial_responses");
assert_eq!(client_serial.0, 0, "unexpected client serial response");
assert_eq!(client_serial.1, None, "unexpected client serial response");
let client_status = sm.client_status.get("0").expect("expected entry to exist in client_status");
@ -224,22 +283,36 @@ async fn test_apply_entry_to_state_machine() -> Result<()> {
async fn test_replicate_to_state_machine() -> Result<()> {
let store = default_store_with_logs();
let req0 = ClientRequest{client: "1".into(), serial: 0, status: "old".into()};
let req1 = ClientRequest{client: "1".into(), serial: 1, status: "new".into()};
let req2 = ClientRequest{client: "2".into(), serial: 0, status: "other".into()};
let entries = vec![
(&1u64, &req0),
(&2u64, &req1),
(&3u64, &req2),
];
let req0 = ClientRequest {
client: "1".into(),
serial: 0,
status: "old".into(),
};
let req1 = ClientRequest {
client: "1".into(),
serial: 1,
status: "new".into(),
};
let req2 = ClientRequest {
client: "2".into(),
serial: 0,
status: "other".into(),
};
let entries = vec![(&1u64, &req0), (&2u64, &req1), (&3u64, &req2)];
store.replicate_to_state_machine(&entries).await?;
let sm = store.get_state_machine().await;
assert_eq!(sm.last_applied_log, 3, "expected last_applied_log to be 3, got {}", sm.last_applied_log);
let client_serial1 = sm.client_serial_responses.get("1").expect("expected entry to exist in client_serial_responses for client 1");
let client_serial1 = sm
.client_serial_responses
.get("1")
.expect("expected entry to exist in client_serial_responses for client 1");
assert_eq!(client_serial1.0, 1, "unexpected client serial response");
assert_eq!(client_serial1.1, Some(String::from("old")), "unexpected client serial response");
let client_serial2 = sm.client_serial_responses.get("2").expect("expected entry to exist in client_serial_responses for client 2");
let client_serial2 = sm
.client_serial_responses
.get("2")
.expect("expected entry to exist in client_serial_responses for client 2");
assert_eq!(client_serial2.0, 0, "unexpected client serial response");
assert_eq!(client_serial2.1, None, "unexpected client serial response");
let client_status1 = sm.client_status.get("1").expect("expected entry to exist in client_status for client 1");
@ -254,17 +327,90 @@ async fn test_replicate_to_state_machine() -> Result<()> {
fn default_store_with_logs() -> MemStore {
let mut log = BTreeMap::new();
log.insert(1, Entry{term: 1, index: 1, payload: EntryPayload::Blank});
log.insert(2, Entry{term: 1, index: 2, payload: EntryPayload::Blank});
log.insert(3, Entry{term: 1, index: 3, payload: EntryPayload::Blank});
log.insert(4, Entry{term: 1, index: 4, payload: EntryPayload::Blank});
log.insert(5, Entry{term: 1, index: 5, payload: EntryPayload::Blank});
log.insert(6, Entry{term: 1, index: 6, payload: EntryPayload::Blank});
log.insert(7, Entry{term: 1, index: 7, payload: EntryPayload::Blank});
log.insert(8, Entry{term: 1, index: 8, payload: EntryPayload::Blank});
log.insert(9, Entry{term: 1, index: 9, payload: EntryPayload::Blank});
log.insert(10, Entry{term: 1, index: 10, payload: EntryPayload::Blank});
log.insert(
1,
Entry {
term: 1,
index: 1,
payload: EntryPayload::Blank,
},
);
log.insert(
2,
Entry {
term: 1,
index: 2,
payload: EntryPayload::Blank,
},
);
log.insert(
3,
Entry {
term: 1,
index: 3,
payload: EntryPayload::Blank,
},
);
log.insert(
4,
Entry {
term: 1,
index: 4,
payload: EntryPayload::Blank,
},
);
log.insert(
5,
Entry {
term: 1,
index: 5,
payload: EntryPayload::Blank,
},
);
log.insert(
6,
Entry {
term: 1,
index: 6,
payload: EntryPayload::Blank,
},
);
log.insert(
7,
Entry {
term: 1,
index: 7,
payload: EntryPayload::Blank,
},
);
log.insert(
8,
Entry {
term: 1,
index: 8,
payload: EntryPayload::Blank,
},
);
log.insert(
9,
Entry {
term: 1,
index: 9,
payload: EntryPayload::Blank,
},
);
log.insert(
10,
Entry {
term: 1,
index: 10,
payload: EntryPayload::Blank,
},
);
let sm = MemStoreStateMachine::default();
let hs = HardState{current_term: 1, voted_for: Some(NODE_ID)};
MemStore::new_with_state(NODE_ID, log, sm, Some(hs.clone()), None)
let hs = HardState {
current_term: 1,
voted_for: Some(NODE_ID),
};
MemStore::new_with_state(NODE_ID, log, sm, Some(hs), None)
}
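As an aside, the ten reformatted `log.insert` calls above are a spot where a loop would keep the fixture short under the new style. A possible tightening, not part of this commit:

// Sketch: equivalent fixture construction with a single iterator chain.
let log: BTreeMap<u64, Entry<ClientRequest>> = (1..=10)
    .map(|index| (index, Entry { term: 1, index, payload: EntryPayload::Blank }))
    .collect();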
27
rustfmt.toml Normal file
View File
@ -0,0 +1,27 @@
unstable_features = true
edition = "2018"
comment_width = 100
fn_args_layout = "Compressed"
max_width = 150
use_small_heuristics = "Default"
use_try_shorthand = true
# pre-unstable
chain_width = 75
single_line_if_else_max_width = 75
space_around_attr_eq = false
struct_lit_width = 50
# unstable
condense_wildcard_suffixes = true
format_code_in_doc_comments = true
format_strings = true
match_block_trailing_comma = false
normalize_comments = true
normalize_doc_attributes = true
reorder_impl_items = true
struct_lit_single_line = true
trailing_comma = "Vertical"
use_field_init_shorthand = true
wrap_comments = true