actix-raft/async-raft/src/replication/mod.rs

//! Replication stream.

use std::io::SeekFrom;
use std::sync::Arc;

use tokio::io::{AsyncRead, AsyncReadExt, AsyncSeek, AsyncSeekExt};
use tokio::stream::StreamExt;
use tokio::sync::{mpsc, oneshot};
use tokio::task::JoinHandle;
use tokio::time::{Duration, Interval, interval, timeout};

use crate::{AppData, AppDataResponse, AppError, NodeId, RaftNetwork, RaftStorage};
use crate::config::{Config, SnapshotPolicy};
use crate::error::RaftResult;
use crate::raft::{AppendEntriesRequest, Entry, EntryPayload, InstallSnapshotRequest};
use crate::storage::CurrentSnapshotData;

/// The public handle to a spawned replication stream.
///
/// Bundles the join handle of the spawned replication task with the sending half of the
/// channel on which that task receives `RaftEvent`s.
pub(crate) struct ReplicationStream<D: AppData> {
    /// The spawn handle of the `ReplicationCore` task.
    pub handle: JoinHandle<()>,
    /// The channel used for communicating with the replication task.
    pub repltx: mpsc::UnboundedSender<RaftEvent<D>>,
}

impl<D: AppData> ReplicationStream<D> {
    /// Create a new replication stream for the target peer.
    ///
    /// All arguments are forwarded untouched to `ReplicationCore::spawn`, which starts the
    /// replication task and hands back the handle returned here.
    pub(crate) fn new<R, E, N, S>(
        id: NodeId,
        target: NodeId,
        term: u64,
        config: Arc<Config>,
        last_log_index: u64,
        last_log_term: u64,
        commit_index: u64,
        network: Arc<N>,
        storage: Arc<S>,
        replicationtx: mpsc::UnboundedSender<ReplicaEvent<S::Snapshot>>,
    ) -> Self
    where
        R: AppDataResponse,
        E: AppError,
        N: RaftNetwork<D, E>,
        S: RaftStorage<D, R, E>,
    {
        ReplicationCore::<D, R, E, N, S>::spawn(
            id,
            target,
            term,
            config,
            last_log_index,
            last_log_term,
            commit_index,
            network,
            storage,
            replicationtx,
        )
    }
}

/// A task responsible for sending replication events to a target follower in the Raft cluster.
///
/// NOTE: we do not stack replication requests to targets because this could result in
/// out-of-order delivery. We always buffer until we receive a success response, then send the
/// next payload from the buffer.
struct ReplicationCore<D: AppData, R: AppDataResponse, E: AppError, N: RaftNetwork<D, E>, S: RaftStorage<D, R, E>> {
    //////////////////////////////////////////////////////////////////////////
    // Static Fields /////////////////////////////////////////////////////////

    /// The ID of this Raft node.
    id: NodeId,
    /// The ID of the target Raft node which replication events are to be sent to.
    target: NodeId,
    /// The current term, which will never change during the lifetime of this task.
    term: u64,
    /// A channel for sending events to the Raft node.
    rafttx: mpsc::UnboundedSender<ReplicaEvent<S::Snapshot>>,
    /// A channel for receiving events from the Raft node.
    raftrx: mpsc::UnboundedReceiver<RaftEvent<D>>,
    /// The `RaftNetwork` interface.
    network: Arc<N>,
    /// The `RaftStorage` interface.
    storage: Arc<S>,
    /// The Raft node's runtime config.
    config: Arc<Config>,
    /// The configured max payload entries, simply as a usize.
    max_payload_entries: usize,
    /// Marker tying the otherwise unused `R` type parameter to this struct.
    marker_r: std::marker::PhantomData<R>,
    /// Marker tying the otherwise unused `E` type parameter to this struct.
    marker_e: std::marker::PhantomData<E>,

    //////////////////////////////////////////////////////////////////////////
    // Dynamic Fields ////////////////////////////////////////////////////////

    /// The target state of this replication stream.
    target_state: TargetReplState,

    /// The index of the log entry to most recently be appended to the log by the leader.
    last_log_index: u64,
    /// The index of the highest log entry which is known to be committed in the cluster.
    commit_index: u64,

    /// The index of the next log to send.
    ///
    /// This is initialized to leader's last log index + 1. Per the Raft protocol spec,
    /// this value may be decremented as new nodes enter the cluster and need to catch-up per the
    /// log consistency check.
    ///
    /// If a follower's log is inconsistent with the leader's, the AppendEntries consistency check
    /// will fail in the next AppendEntries RPC. After a rejection, the leader decrements
    /// `next_index` and retries the AppendEntries RPC. Eventually `next_index` will reach a point
    /// where the leader and follower logs match. When this happens, AppendEntries will succeed,
    /// which removes any conflicting entries in the follower's log and appends entries from the
    /// leader's log (if any). Once AppendEntries succeeds, the follower's log is consistent with
    /// the leader's, and it will remain that way for the rest of the term.
    ///
    /// This Raft implementation also uses a _conflict optimization_ pattern for reducing the
    /// number of RPCs which need to be sent back and forth between a peer which is lagging
    /// behind. This is defined in §5.3.
    next_index: u64,
    /// The last known index to be successfully replicated on the target.
    ///
    /// This will be initialized to the leader's last_log_index, and will be updated as
    /// replication proceeds.
    match_index: u64,
    /// The term of the last known index to be successfully replicated on the target.
    ///
    /// This will be initialized to the leader's last_log_term, and will be updated as
    /// replication proceeds.
    match_term: u64,

    /// A buffer of data to replicate to the target follower.
    ///
    /// The buffered payload here will be expanded as more replication commands come in from the
    /// Raft node. Data from this buffer will flow into the `outbound_buffer` in chunks.
    replication_buffer: Vec<Arc<Entry<D>>>,
    /// A buffer of data which is being sent to the follower.
    ///
    /// Data in this buffer comes directly from the `replication_buffer` in chunks, and will
    /// remain here until it is confirmed that the payload has been successfully received by the
    /// target node. This allows for retransmission of payloads in the face of transient errors.
    outbound_buffer: Vec<OutboundEntry<D>>,
    /// The heartbeat interval for ensuring that heartbeats are always delivered in a timely fashion.
    heartbeat: Interval,
    /// The timeout duration for heartbeats.
    ///
    /// Derived from the configured heartbeat interval; also used as the timeout for
    /// InstallSnapshot RPCs.
    heartbeat_timeout: Duration,
}

impl<D: AppData, R: AppDataResponse, E: AppError, N: RaftNetwork<D, E>, S: RaftStorage<D, R, E>> ReplicationCore<D, R, E, N, S> {
    /// Spawn a new replication task for the target node.
    ///
    /// The task starts out in the lagging state with `next_index` set to one past the given
    /// `last_log_index`. The returned `ReplicationStream` carries the task's join handle and the
    /// sender used to feed it `RaftEvent`s.
    pub(self) fn spawn(
        id: NodeId, target: NodeId, term: u64, config: Arc<Config>,
        last_log_index: u64, last_log_term: u64, commit_index: u64,
        network: Arc<N>, storage: Arc<S>, rafttx: mpsc::UnboundedSender<ReplicaEvent<S::Snapshot>>,
    ) -> ReplicationStream<D> {
        // Read what we need out of the config before it is moved into the core.
        let heartbeat_timeout = Duration::from_millis(config.heartbeat_interval);
        let max_payload_entries = config.max_payload_entries as usize;
        let (repltx, raftrx) = mpsc::unbounded_channel();

        let core = Self {
            // Static fields.
            id,
            target,
            term,
            rafttx,
            raftrx,
            network,
            storage,
            config,
            max_payload_entries,
            marker_r: std::marker::PhantomData,
            marker_e: std::marker::PhantomData,
            // Dynamic fields.
            target_state: TargetReplState::Lagging,
            last_log_index,
            commit_index,
            next_index: last_log_index + 1,
            match_index: last_log_index,
            match_term: last_log_term,
            replication_buffer: Vec::new(),
            outbound_buffer: Vec::new(),
            heartbeat: interval(heartbeat_timeout),
            heartbeat_timeout,
        };

        ReplicationStream { handle: tokio::spawn(core.main()), repltx }
    }

    /// The main loop of the replication task.
    ///
    /// Sends one initial AppendEntries RPC, then repeatedly runs the handler matching the
    /// current target state until the state becomes `Shutdown`.
    #[tracing::instrument(level="trace", skip(self), fields(id=self.id, target=self.target))]
    async fn main(mut self) {
        // Kick things off with a heartbeat.
        self.send_append_entries().await;

        // Each state handler returns once it has changed `target_state`; dispatch again.
        loop {
            match self.target_state {
                TargetReplState::Shutdown => break,
                TargetReplState::Snapshotting => SnapshottingState::new(&mut self).run().await,
                TargetReplState::Lagging => LaggingState::new(&mut self).run().await,
                TargetReplState::LineRate => LineRateState::new(&mut self).run().await,
            }
        }
    }

    /// Send an AppendEntries RPC to the target.
    ///
    /// This request will timeout if no response is received within the
    /// configured heartbeat interval.
    #[tracing::instrument(level="trace", skip(self))]
    async fn send_append_entries(&mut self) {
        // Attempt to fill the send buffer from the replication buffer.
        if self.outbound_buffer.len() == 0 {
            let repl_len = self.replication_buffer.len();
            if repl_len > 0 {
                let chunk_size = if repl_len < self.max_payload_entries { repl_len } else { self.max_payload_entries };
                self.outbound_buffer.extend(
                    self.replication_buffer.drain(..=chunk_size)
                    .map(|entry| OutboundEntry::Arc(entry)));
            }
        }

        // Build the heartbeat frame to be sent to the follower.
        let payload = AppendEntriesRequest{
            term: self.term, leader_id: self.id,
            prev_log_index: self.match_index, prev_log_term: self.match_term,
            leader_commit: self.commit_index, entries: self.outbound_buffer.iter().map(|entry| entry.as_ref().clone()).collect(),
        };

        // Send the payload.
        let res = match timeout(self.heartbeat_timeout, self.network.append_entries(self.target, payload)).await {
            Ok(outer_res) => match outer_res {
                Ok(res) => res,
                Err(err) => {
                    tracing::error!({error=%err}, "error sending AppendEntries RPC to target");
                    return;
                }
            }
            Err(err) => {
                tracing::error!({error=%err}, "timeout while sending AppendEntries RPC to target");
                return;
            },
        };
        let last_index_and_term = match self.outbound_buffer.last() {
            Some(last) => Some((last.as_ref().index, last.as_ref().term)),
            None => None,
        };
        self.outbound_buffer.clear(); // Once we've successfully sent a payload of entries, don't send them again.

        // Handle success conditions.
        if res.success {
            // If this was a proper replication event (last index & term were provided), then update state.
            if let Some((index, term)) = last_index_and_term {
                self.next_index = index + 1; // This should always be the next expected index.
                self.match_index = index;
                self.match_term = term;
                let _ = self.rafttx.send(ReplicaEvent::UpdateMatchIndex{target: self.target, match_index: index});

                // If running at line rate, and our buffered outbound requests have accumulated too
                // much, we need to purge and transition to a lagging state. The target is not able to
                // replicate data fast enough.
                if &(&self.last_log_index - &self.match_index) > &self.config.replication_lag_threshold {
                    self.target_state = TargetReplState::Lagging;
                }
            }
            return;
        }

        // Replication was not successful, if a newer term has been returned, revert to follower.
        if &res.term > &self.term {
            let _ = self.rafttx.send(ReplicaEvent::RevertToFollower{target: self.target, term: res.term});
            self.target_state = TargetReplState::Shutdown;
            return;
        }

        // Replication was not successful, handle conflict optimization record, else decrement `next_index`.
        if let Some(conflict) = res.conflict_opt {
            // If the returned conflict opt index is greater than last_log_index, then this is a
            // logical error, and no action should be taken. This represents a replication failure.
            if &conflict.index > &self.last_log_index {
                return;
            }

            // Fetch the entry at conflict index and use the term specified there.
            match self.storage.get_log_entries(conflict.index, conflict.index).await.map(|entries| entries.iter().nth(0).map(|entry| entry.term)) {
                Ok(Some(term)) => {
                    self.next_index = conflict.index + 1;
                    self.match_index = conflict.index;
                    self.match_term = term;
                }
                Ok(None) => {
                    // This condition would only ever be reached if the log has been removed due to
                    // log compaction (barring critical storage failure), so transition to snapshotting.
                    self.target_state = TargetReplState::Snapshotting;
                    return;
                }
                Err(err) => {
                    tracing::error!({error=%err}, "error fetching log entry due to returned AppendEntries RPC conflict_opt");
                    let _ = self.rafttx.send(ReplicaEvent::Shutdown);
                    self.target_state = TargetReplState::Shutdown;
                    return;
                }
            };

            // Check snapshot policy and handle conflict as needed.
            match &self.config.snapshot_policy {
                SnapshotPolicy::LogsSinceLast(threshold) => {
                    let diff = &self.last_log_index - &conflict.index; // NOTE WELL: underflow is guarded against above.
                    if &diff >= threshold {
                        // Follower is far behind and needs to receive an InstallSnapshot RPC.
                        self.target_state = TargetReplState::Snapshotting;
                        return;
                    }
                    // Follower is behind, but not too far behind to receive an InstallSnapshot RPC.
                    self.target_state = TargetReplState::Lagging;
                    return;
                }
            }
        }

        self.next_index = if self.next_index > 0 { self.next_index - 1} else { 0 }; // Guard against underflow.
        self.target_state = TargetReplState::Lagging;
    }

    /// Report whether the target trails the commit index by enough entries that a snapshot
    /// should be sent instead of individual log entries.
    ///
    /// Returns `true` only when `commit_index` is strictly ahead of `match_index` and the gap
    /// is at least the threshold of the configured snapshot policy.
    pub(self) fn needs_snapshot(&self) -> bool {
        match &self.config.snapshot_policy {
            SnapshotPolicy::LogsSinceLast(threshold) => {
                // `checked_sub` yields `None` when the target is ahead of the commit index.
                match self.commit_index.checked_sub(self.match_index) {
                    Some(lag) => lag > 0 && lag >= *threshold,
                    None => false,
                }
            }
        }
    }

    /// Fully drain the channel coming in from the Raft node.
    ///
    /// Processes `first`, then every further event which can be received without waiting.
    /// Processing stops early when a `Terminate` event is seen, at which point the target state
    /// has been set to `Shutdown`.
    pub(self) fn drain_raftrx(&mut self, first: RaftEvent<D>) {
        let mut pending = Some(first);
        while let Some(event) = pending {
            match event {
                RaftEvent::Terminate => {
                    self.target_state = TargetReplState::Shutdown;
                    return;
                }
                RaftEvent::UpdateCommitIndex{commit_index} => self.commit_index = commit_index,
                RaftEvent::Replicate{entry, commit_index} => {
                    self.commit_index = commit_index;
                    self.last_log_index = entry.index;
                    // Entries are only buffered while running at line rate.
                    if self.target_state == TargetReplState::LineRate {
                        self.replication_buffer.push(entry);
                    }
                }
            }
            // Pick up the next event if one is immediately available, else we are done.
            pending = self.raftrx.try_recv().ok();
        }
    }
}

/// A type which wraps two possible forms of an outbound entry for replication.
///
/// Both forms can be viewed as a plain `&Entry<D>` through the `AsRef` implementation.
enum OutboundEntry<D: AppData> {
    /// An entry owned by an Arc, hot off the replication stream from the Raft leader.
    Arc(Arc<Entry<D>>),
    /// An entry which was fetched directly from storage.
    Raw(Entry<D>),
}

impl<D: AppData> AsRef<Entry<D>> for OutboundEntry<D> {
    fn as_ref(&self) -> &Entry<D> {
        match self {
            Self::Arc(inner) => inner.as_ref(),
            Self::Raw(inner) => inner,
        }
    }
}

//////////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////

/// The state of the replication stream.
///
/// The replication task's main loop dispatches on this value and exits once it is `Shutdown`.
#[derive(Eq, PartialEq)]
enum TargetReplState {
    /// The replication stream is running at line rate.
    LineRate,
    /// The replication stream is lagging behind.
    Lagging,
    /// The replication stream is streaming a snapshot over to the target node.
    Snapshotting,
    /// The replication stream is shutting down.
    Shutdown,
}

/// An event from the Raft node.
pub(crate) enum RaftEvent<D: AppData> {
    /// A message from Raft carrying a new entry to replicate to the target.
    Replicate {
        /// The new entry which needs to be replicated.
        ///
        /// This entry will always be the most recent entry to have been appended to the log, so its
        /// index is the new last_log_index value.
        entry: Arc<Entry<D>>,
        /// The index of the highest log entry which is known to be committed in the cluster.
        commit_index: u64,
    },
    /// A message from Raft indicating a new commit index value.
    UpdateCommitIndex {
        /// The index of the highest log entry which is known to be committed in the cluster.
        commit_index: u64,
    },
    /// A message from Raft instructing the replication stream to shut down.
    Terminate,
}

/// An event coming from a replication stream.
pub(crate) enum ReplicaEvent<S>
    where S: AsyncRead + AsyncSeek + Send + Unpin + 'static,
{
    /// An event representing an update to the replication rate of a replication stream.
    RateUpdate{
        /// The ID of the Raft node to which this event relates.
        target: NodeId,
        /// A flag indicating if the corresponding target node is replicating at line rate.
        ///
        /// When replicating at line rate, the replication stream will receive log entries to
        /// replicate as soon as they are ready. When not running at line rate, the Raft node will
        /// only send over metadata without entries to replicate.
        is_line_rate: bool,
    },
    /// An event from a replication stream which updates the target node's match index.
    UpdateMatchIndex{
        /// The ID of the target node for which the match index is to be updated.
        target: NodeId,
        /// The index of the most recent log known to have been successfully replicated on the target.
        match_index: u64,
    },
    /// An event indicating that the Raft node needs to revert to follower state.
    RevertToFollower{
        /// The ID of the target node from which the new term was observed.
        target: NodeId,
        /// The new term observed.
        term: u64,
    },
    /// An event from a replication stream requesting snapshot info.
    NeedsSnapshot{
        /// The ID of the target node from which the event was sent.
        target: NodeId,
        /// The response channel for delivering the snapshot data.
        tx: oneshot::Sender<CurrentSnapshotData<S>>,
    },
    /// Some critical error has taken place, and Raft needs to shutdown.
    Shutdown,
}

//////////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////

/// LineRate specific state.
///
/// A short-lived wrapper around the replication core, created each time the main loop
/// dispatches to the line-rate handler.
struct LineRateState<'a, D: AppData, R: AppDataResponse, E: AppError, N: RaftNetwork<D, E>, S: RaftStorage<D, R, E>> {
    /// An exclusive handle to the replication core.
    core: &'a mut ReplicationCore<D, R, E, N, S>,
}

impl<'a, D: AppData, R: AppDataResponse, E: AppError, N: RaftNetwork<D, E>, S: RaftStorage<D, R, E>> LineRateState<'a, D, R, E, N, S> {
    /// Create a new instance.
    pub fn new(core: &'a mut ReplicationCore<D, R, E, N, S>) -> Self {
        Self{core}
    }

    #[tracing::instrument(level="trace", skip(self), fields(state="line-rate"))]
    pub async fn run(self) {
        let event = ReplicaEvent::RateUpdate{target: self.core.target, is_line_rate: true};
        let _ = self.core.rafttx.send(event);
        loop {
            if &self.core.target_state != &TargetReplState::LineRate {
                return;
            }
            // We always prioritize draining our buffers first.
            if !self.core.outbound_buffer.is_empty() || !self.core.replication_buffer.is_empty() {
                self.core.send_append_entries().await;
                continue;
            }
            tokio::select!{
                _ = self.core.heartbeat.next() => self.core.send_append_entries().await,
                event = self.core.raftrx.next() => match event {
                    Some(event) => self.core.drain_raftrx(event),
                    None => self.core.target_state = TargetReplState::Shutdown,
                }
            }
        }
    }
}

//////////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////

/// Lagging specific state.
///
/// A short-lived wrapper around the replication core, created each time the main loop
/// dispatches to the lagging handler.
struct LaggingState<'a, D: AppData, R: AppDataResponse, E: AppError, N: RaftNetwork<D, E>, S: RaftStorage<D, R, E>> {
    /// An exclusive handle to the replication core.
    core: &'a mut ReplicationCore<D, R, E, N, S>,
}

impl<'a, D: AppData, R: AppDataResponse, E: AppError, N: RaftNetwork<D, E>, S: RaftStorage<D, R, E>> LaggingState<'a, D, R, E, N, S> {
    /// Create a new instance.
    pub fn new(core: &'a mut ReplicationCore<D, R, E, N, S>) -> Self {
        Self{core}
    }

    /// Run the lagging handler until the target state is no longer `Lagging`.
    ///
    /// Announces the non-line-rate status to the Raft node and discards both buffers. Each
    /// pass then loads the next payload of entries from storage and sends it to the target,
    /// until the target has caught up with the commit index (transition to `LineRate`), is far
    /// enough behind to warrant a snapshot (transition to `Snapshotting`), or a shutdown is
    /// requested.
    #[tracing::instrument(level="trace", skip(self), fields(state="lagging"))]
    pub async fn run(mut self) {
        let event = ReplicaEvent::RateUpdate{target: self.core.target, is_line_rate: false};
        let _ = self.core.rafttx.send(event);
        self.core.replication_buffer.clear();
        self.core.outbound_buffer.clear();
        loop {
            if self.core.target_state != TargetReplState::Lagging {
                return;
            }
            // If this stream is far enough behind, then transition to snapshotting state.
            if self.core.needs_snapshot() {
                self.core.target_state = TargetReplState::Snapshotting;
                return;
            }

            // Prep entries from storage and send them off for replication.
            if self.is_up_to_speed() {
                self.core.target_state = TargetReplState::LineRate;
                return;
            }
            self.prep_outbound_buffer_from_storage().await;

            // Prepping the buffer may have requested a shutdown (storage error) or a snapshot
            // (snapshot pointer found in the log). Honor that right away: sending another RPC
            // here could fail and overwrite the requested state with `Lagging`.
            match self.core.target_state {
                TargetReplState::Shutdown | TargetReplState::Snapshotting => return,
                TargetReplState::Lagging | TargetReplState::LineRate => {}
            }

            self.core.send_append_entries().await;
            if self.is_up_to_speed() {
                self.core.target_state = TargetReplState::LineRate;
                return;
            }

            // Check raft channel to ensure we are staying up-to-date, then loop.
            if let Ok(event) = self.core.raftrx.try_recv() {
                self.core.drain_raftrx(event);
            }
        }
    }

    /// Report whether the next index to send is already past the commit index.
    fn is_up_to_speed(&self) -> bool {
        self.core.commit_index < self.core.next_index
    }

    /// Load the next payload of entries from storage into the outbound buffer.
    ///
    /// Does nothing when the outbound buffer still holds entries. Otherwise entries starting at
    /// `next_index` are fetched, bounded by the commit index and the configured max payload
    /// size. A storage error requests a shutdown; a snapshot pointer among the fetched entries
    /// requests a transition to snapshotting. In both cases the buffer is left empty.
    #[tracing::instrument(level="trace", skip(self))]
    async fn prep_outbound_buffer_from_storage(&mut self) {
        // A previously prepared payload is still waiting to be delivered.
        if !self.core.outbound_buffer.is_empty() {
            return;
        }

        // Underflow is guarded against in the `is_up_to_speed` check in the outer loop.
        let start_idx = self.core.next_index;
        let max_entries = self.core.config.max_payload_entries;
        let distance_behind = self.core.commit_index - start_idx;

        // Pick the stop index of the fetch; +1 in both cases so the stop value is included.
        let stop_idx = if distance_behind > max_entries {
            start_idx + max_entries + 1
        } else {
            // This payload reaches the commit index, so the stream will be running at line
            // rate once it is replicated. The lagging loop keeps going until it next cycles.
            self.core.target_state = TargetReplState::LineRate;
            let _ = self.core.rafttx.send(ReplicaEvent::RateUpdate{target: self.core.target, is_line_rate: true});
            self.core.commit_index + 1
        };

        let entries = match self.core.storage.get_log_entries(start_idx, stop_idx).await {
            Ok(entries) => entries,
            Err(err) => {
                tracing::error!({error=%err}, "error fetching logs from storage");
                let _ = self.core.rafttx.send(ReplicaEvent::Shutdown);
                self.core.target_state = TargetReplState::Shutdown;
                return;
            }
        };

        // A snapshot pointer in the fetched range means the target needs a snapshot instead.
        let has_snapshot_pointer = entries.iter().any(|entry| match entry.payload {
            EntryPayload::SnapshotPointer(_) => true,
            _ => false,
        });
        if has_snapshot_pointer {
            self.core.target_state = TargetReplState::Snapshotting;
            return;
        }

        self.core.outbound_buffer.extend(entries.into_iter().map(OutboundEntry::Raw));
    }
}

//////////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////

/// Snapshotting specific state.
///
/// A short-lived wrapper around the replication core, created each time the main loop
/// dispatches to the snapshotting handler.
struct SnapshottingState<'a, D: AppData, R: AppDataResponse, E: AppError, N: RaftNetwork<D, E>, S: RaftStorage<D, R, E>> {
    /// An exclusive handle to the replication core.
    core: &'a mut ReplicationCore<D, R, E, N, S>,
    /// The snapshot which has been received and is waiting to be streamed, if any.
    snapshot: Option<CurrentSnapshotData<S::Snapshot>>,
    /// The receiving end of an outstanding snapshot request, if any.
    snapshot_fetch_rx: Option<oneshot::Receiver<CurrentSnapshotData<S::Snapshot>>>,
}

impl<'a, D: AppData, R: AppDataResponse, E: AppError, N: RaftNetwork<D, E>, S: RaftStorage<D, R, E>> SnapshottingState<'a, D, R, E, N, S> {
    /// Create a new instance.
    pub fn new(core: &'a mut ReplicationCore<D, R, E, N, S>) -> Self {
        Self{core, snapshot: None, snapshot_fetch_rx: None}
    }

    /// Run the snapshotting handler until the target state is no longer `Snapshotting`.
    ///
    /// Announces the non-line-rate status to the Raft node and discards both buffers. Each
    /// pass then requests a snapshot from the Raft node if none is held or pending, waits for
    /// a pending request, or streams the snapshot which has been received.
    #[tracing::instrument(level="trace", skip(self), fields(state="snapshotting"))]
    pub async fn run(mut self) {
        let _ = self.core.rafttx.send(ReplicaEvent::RateUpdate{target: self.core.target, is_line_rate: false});
        self.core.outbound_buffer.clear();
        self.core.replication_buffer.clear();

        while self.core.target_state == TargetReplState::Snapshotting {
            // With neither a snapshot nor an outstanding request, ask the Raft node for one.
            let needs_request = self.snapshot.is_none() && self.snapshot_fetch_rx.is_none();
            if needs_request {
                let (tx, rx) = oneshot::channel();
                let _ = self.core.rafttx.send(ReplicaEvent::NeedsSnapshot{target: self.core.target, tx});
                self.snapshot_fetch_rx = Some(rx);
            }

            // A request is outstanding: wait for its response, heartbeating in the meantime.
            if let Some(pending_rx) = self.snapshot_fetch_rx.take() {
                self.wait_for_snapshot(pending_rx).await;
                continue;
            }

            // A snapshot is available: stream it over to the target.
            if let Some(snapshot) = self.snapshot.take() {
                if let Err(err) = self.stream_snapshot(snapshot).await {
                    tracing::error!({error=%err}, "error streaming snapshot to target");
                }
            }
        }
    }

    /// Wait for the requested snapshot to arrive on `rx`.
    ///
    /// While waiting, heartbeats keep being sent and incoming Raft events are processed. The
    /// routine returns once a snapshot has been stored on `self`, once the response channel
    /// has closed, or once the Raft event channel has closed (which requests a shutdown).
    ///
    /// If no snapshot was obtained, this routine should simply be called again after issuing
    /// a new snapshot request.
    #[tracing::instrument(level="trace", skip(self, rx))]
    async fn wait_for_snapshot(&mut self, mut rx: oneshot::Receiver<CurrentSnapshotData<S::Snapshot>>) {
        loop {
            tokio::select!{
                _ = self.core.heartbeat.next() => self.core.send_append_entries().await,
                next_event = self.core.raftrx.next() => {
                    if let Some(event) = next_event {
                        self.core.drain_raftrx(event);
                    } else {
                        self.core.target_state = TargetReplState::Shutdown;
                        return;
                    }
                },
                response = &mut rx => {
                    // Channels may close for various acceptable reasons, so a closed channel
                    // simply ends the wait without a snapshot.
                    if let Ok(snapshot) = response {
                        self.snapshot = Some(snapshot);
                    }
                    return;
                },
            }
        }
    }

    /// Stream the given snapshot to the target as a series of InstallSnapshot RPCs.
    ///
    /// The snapshot is read in chunks of at most `snapshot_max_chunk_size` bytes. A chunk is
    /// re-sent until the target responds to it; the final RPC carries no data and has `done`
    /// set. Once that final RPC has been answered, the stream transitions to the lagging state.
    ///
    /// Returns early with `Ok(())` if the target reports a newer term (the Raft node is told
    /// to revert to follower) or if a shutdown was requested while streaming.
    ///
    /// # Errors
    /// Returns an error if seeking or reading the snapshot fails.
    #[tracing::instrument(level="trace", skip(self, snapshot))]
    async fn stream_snapshot(&mut self, mut snapshot: CurrentSnapshotData<S::Snapshot>) -> RaftResult<(), E> {
        // Never use a zero-length read buffer: a read into it would report 0 bytes and be
        // mistaken for the end of the snapshot.
        let chunk_size = (self.core.config.snapshot_max_chunk_size as usize).max(1);
        let mut offset = 0;
        // Keep the `next_index == match_index + 1` relationship maintained everywhere else, so
        // that replication resumes right after the snapshot.
        self.core.next_index = snapshot.index + 1;
        self.core.match_index = snapshot.index;
        self.core.match_term = snapshot.term;
        loop {
            // Build the RPC. The buffer must have a non-zero *length* (not just capacity) for
            // `read` to place any bytes into it; it is trimmed to what was actually read.
            snapshot.snapshot.seek(SeekFrom::Start(offset)).await?;
            let mut data = vec![0u8; chunk_size];
            let nread = snapshot.snapshot.read(&mut data).await?;
            data.truncate(nread);
            let done = nread == 0; // If bytes read == 0, then we're done.
            let req = InstallSnapshotRequest{
                term: self.core.term, leader_id: self.core.id,
                last_included_index: snapshot.index,
                last_included_term: snapshot.term,
                offset, data, done,
            };

            // Send the RPC over to the target. On failure the same chunk is retried on the
            // next pass, as `offset` is left untouched.
            let res = match timeout(self.core.heartbeat_timeout, self.core.network.install_snapshot(self.core.target, req)).await {
                Ok(Ok(res)) => Some(res),
                Ok(Err(err)) => {
                    tracing::error!({error=%err}, "error sending InstallSnapshot RPC to target");
                    None
                }
                Err(err) => {
                    tracing::error!({error=%err}, "timeout while sending InstallSnapshot RPC to target");
                    None
                }
            };

            if let Some(res) = res {
                // Handle response conditions.
                if res.term > self.core.term {
                    let _ = self.core.rafttx.send(ReplicaEvent::RevertToFollower{target: self.core.target, term: res.term});
                    self.core.target_state = TargetReplState::Shutdown;
                    return Ok(());
                }

                // If we just sent the final chunk of the snapshot, then transition to lagging state.
                if done {
                    self.core.target_state = TargetReplState::Lagging;
                    return Ok(());
                }

                // Everything is good, so update offset for sending the next chunk.
                offset += nread as u64;
            }

            // Check raft channel to ensure we are staying up-to-date, then loop. This also runs
            // after a failed RPC so that a termination request is noticed while retrying.
            if let Ok(event) = self.core.raftrx.try_recv() {
                self.core.drain_raftrx(event);
            }
            // A shutdown requested while streaming must not be overwritten by the transition
            // to lagging at the end of the stream.
            if self.core.target_state == TargetReplState::Shutdown {
                return Ok(());
            }
        }
    }
}