rustls/ci-bench/src/main.rs

use std::collections::HashMap;
use std::fs::{self, File};
use std::hint::black_box;
use std::io::{self, BufRead, BufReader, Write};
use std::mem;
use std::os::fd::{AsRawFd, FromRawFd};
use std::path::{Path, PathBuf};
use std::sync::Arc;
use std::time::Instant;

use anyhow::Context;
use async_trait::async_trait;
use clap::{Parser, Subcommand, ValueEnum};
use fxhash::FxHashMap;
use itertools::Itertools;
use rayon::iter::Either;
use rayon::prelude::*;
use rustls::client::Resumption;
use rustls::crypto::{aws_lc_rs, ring};
use rustls::server::{NoServerSessionStorage, ServerSessionMemoryCache, WebPkiClientVerifier};
use rustls::{
    CipherSuite, ClientConfig, ClientConnection, ProtocolVersion, RootCertStore, ServerConfig,
    ServerConnection,
};

use crate::benchmark::{
    get_reported_instr_count, validate_benchmarks, Benchmark, BenchmarkKind, BenchmarkParams,
    ResumptionKind,
};
use crate::cachegrind::CachegrindRunner;
use crate::util::async_io::{self, AsyncRead, AsyncWrite};
use crate::util::transport::{
    read_handshake_message, read_plaintext_to_end_bounded, send_handshake_message,
    write_all_plaintext_bounded,
};
use crate::util::KeyType;

mod benchmark;
mod cachegrind;
mod util;

/// The size in bytes of the plaintext sent in the transfer benchmark
const TRANSFER_PLAINTEXT_SIZE: usize = 1024 * 1024 * 10; // 10 MB

/// The amount of times a resumed handshake should be executed during benchmarking.
///
/// Handshakes with session resumption execute a very small amount of instructions (less than 200_000
/// for some parameters), so a small difference in instructions accounts for a high difference in
/// percentage (making the benchmark more sensitive to noise, because differences as low as 500
/// instructions already raise a flag). Running the handshake multiple times gives additional weight
/// to the instructions involved in the handshake, and less weight to noisy one-time setup code.
///
/// More specifically, great part of the noise in resumed handshakes comes from the usage of
/// [`rustls::client::ClientSessionMemoryCache`] and [`rustls::server::ServerSessionMemoryCache`],
/// which rely on a randomized `HashMap` under the hood (you can check for yourself by that
/// `HashMap` by a `FxHashMap`, which brings the noise down to acceptable levels in a single run).
const RESUMED_HANDSHAKE_RUNS: usize = 30;

/// The name of the file where the instruction counts are stored after a `run-all` run
const ICOUNTS_FILENAME: &str = "icounts.csv";

/// Default size in bytes for internal buffers (256 KB)
const DEFAULT_BUFFER_SIZE: usize = 262144;

#[derive(Parser)]
#[command(about)]
pub struct Cli {
    #[command(subcommand)]
    pub command: Command,
}

#[derive(Subcommand)]
pub enum Command {
    /// Run all benchmarks and print the measured CPU instruction counts in CSV format
    RunAll {
        #[arg(short, long, default_value = "target/ci-bench")]
        output_dir: PathBuf,
    },
    /// Run a single benchmark at the provided index (used by the bench runner to start each benchmark in its own process)
    RunSingle { index: u32, side: Side },
    /// Run all benchmarks in walltime mode and print the measured timings in CSV format
    Walltime {
        #[arg(short, long)]
        iterations_per_scenario: usize,
    },
    /// Compare the results from two previous benchmark runs and print a user-friendly markdown overview
    Compare {
        /// Path to the directory with the results of a previous `run-all` execution
        baseline_dir: PathBuf,
        /// Path to the directory with the results of a previous `run-all` execution
        candidate_dir: PathBuf,
    },
}

#[derive(Copy, Clone, ValueEnum)]
pub enum Side {
    Server,
    Client,
}

impl Side {
    /// Returns the string representation of the side
    pub fn as_str(self) -> &'static str {
        match self {
            Side::Client => "client",
            Side::Server => "server",
        }
    }
}

fn main() -> anyhow::Result<()> {
    let benchmarks = all_benchmarks()?;

    let cli = Cli::parse();
    match cli.command {
        Command::RunAll { output_dir } => {
            let executable = std::env::args().next().unwrap();
            let results = run_all(executable, output_dir.clone(), &benchmarks)?;

            // Output results in CSV (note: not using a library here to avoid extra dependencies)
            let mut csv_file = File::create(output_dir.join(ICOUNTS_FILENAME))
                .context("cannot create output csv file")?;
            for (name, instr_count) in results {
                writeln!(csv_file, "{name},{instr_count}")?;
            }
        }
        Command::RunSingle { index, side } => {
            // `u32::MAX` is used as a signal to do nothing and return. By "running" an empty
            // benchmark we can measure the startup overhead.
            if index == u32::MAX {
                return Ok(());
            }

            let bench = benchmarks
                .get(index as usize)
                .ok_or(anyhow::anyhow!("Benchmark not found: {index}"))?;

            let stdin_lock = io::stdin().lock();
            let stdout_lock = io::stdout().lock();

            // `StdinLock` and `StdoutLock` are buffered, which makes the instruction counts less
            // deterministic (the growth of the internal buffers varies across runs, causing
            // differences of hundreds of instructions). To counter this, we do the actual io
            // operations through `File`, which is unbuffered. The `stdin_lock` and `stdout_lock`
            // variables are kept around to ensure exclusive access.

            // safety: the file descriptor is valid and we have exclusive access to it for the
            // duration of the lock
            let mut stdin = unsafe { File::from_raw_fd(stdin_lock.as_raw_fd()) };
            let mut stdout = unsafe { File::from_raw_fd(stdout_lock.as_raw_fd()) };

            let handshake_buf = &mut [0u8; DEFAULT_BUFFER_SIZE];
            let resumption_kind = bench.kind.resumption_kind();
            let io = StepperIo {
                reader: &mut stdin,
                writer: &mut stdout,
                handshake_buf,
            };
            async_io::block_on_single_poll(async {
                match side {
                    Side::Server => {
                        run_bench(
                            ServerSideStepper {
                                io,
                                config: ServerSideStepper::make_config(
                                    &bench.params,
                                    resumption_kind,
                                ),
                            },
                            bench.kind,
                        )
                        .await
                    }
                    Side::Client => {
                        run_bench(
                            ClientSideStepper {
                                io,
                                resumption_kind,
                                config: ClientSideStepper::make_config(
                                    &bench.params,
                                    resumption_kind,
                                ),
                            },
                            bench.kind,
                        )
                        .await
                    }
                }
            })
            .with_context(|| format!("{} crashed for {} side", bench.name(), side.as_str()))?;

            // Prevent stdin / stdout from being closed
            mem::forget(stdin);
            mem::forget(stdout);
        }
        Command::Walltime {
            iterations_per_scenario,
        } => {
            let mut timings = vec![Vec::with_capacity(iterations_per_scenario); benchmarks.len()];
            for _ in 0..iterations_per_scenario {
                for (i, bench) in benchmarks.iter().enumerate() {
                    let start = Instant::now();

                    // The variables below are used to initialize the client and server configs. We
                    // let them go through `black_box` to ensure the optimizer doesn't take
                    // advantage of knowing both the client and the server side of the
                    // configuration.
                    let resumption_kind = black_box(bench.kind.resumption_kind());
                    let params = black_box(&bench.params);

                    let (mut client_writer, mut server_reader) =
                        async_io::async_pipe(DEFAULT_BUFFER_SIZE);
                    let (mut server_writer, mut client_reader) =
                        async_io::async_pipe(DEFAULT_BUFFER_SIZE);

                    let server_side = async move {
                        let handshake_buf = &mut [0u8; DEFAULT_BUFFER_SIZE];
                        run_bench(
                            ServerSideStepper {
                                io: StepperIo {
                                    reader: &mut server_reader,
                                    writer: &mut server_writer,
                                    handshake_buf,
                                },
                                config: ServerSideStepper::make_config(params, resumption_kind),
                            },
                            bench.kind,
                        )
                        .await
                    };

                    let client_side = async move {
                        let handshake_buf = &mut [0u8; DEFAULT_BUFFER_SIZE];
                        run_bench(
                            ClientSideStepper {
                                io: StepperIo {
                                    reader: &mut client_reader,
                                    writer: &mut client_writer,
                                    handshake_buf,
                                },
                                resumption_kind,
                                config: ClientSideStepper::make_config(params, resumption_kind),
                            },
                            bench.kind,
                        )
                        .await
                    };

                    let (client_result, server_result) =
                        async_io::block_on_concurrent(client_side, server_side);
                    client_result
                        .with_context(|| format!("client side of {} crashed", bench.name()))?;
                    server_result
                        .with_context(|| format!("server side of {} crashed", bench.name()))?;

                    timings[i].push(start.elapsed());
                }
            }

            // Output the results
            for (i, bench_timings) in timings.into_iter().enumerate() {
                print!("{}", benchmarks[i].name());
                for timing in bench_timings {
                    print!(",{}", timing.as_nanos())
                }
                println!();
            }
        }
        Command::Compare {
            baseline_dir,
            candidate_dir,
        } => {
            let baseline = read_results(&baseline_dir.join(ICOUNTS_FILENAME))?;
            let candidate = read_results(&candidate_dir.join(ICOUNTS_FILENAME))?;
            let result = compare_results(&baseline_dir, &candidate_dir, &baseline, &candidate)?;
            print_report(&result);
        }
    }

    Ok(())
}

/// Returns all benchmarks
fn all_benchmarks() -> anyhow::Result<Vec<Benchmark>> {
    let mut benchmarks = Vec::new();
    for param in all_benchmarks_params() {
        add_benchmark_group(&mut benchmarks, param);
    }

    validate_benchmarks(&benchmarks)?;
    Ok(benchmarks)
}

/// The benchmark params to use for each group of benchmarks
fn all_benchmarks_params() -> Vec<BenchmarkParams> {
    let mut all = Vec::new();

    for (provider, suites, ticketer, provider_name) in [
        (
            ring::default_provider(),
            ring::ALL_CIPHER_SUITES,
            &(ring_ticketer as fn() -> Arc<dyn rustls::server::ProducesTickets>),
            "ring",
        ),
        (
            aws_lc_rs::default_provider(),
            aws_lc_rs::ALL_CIPHER_SUITES,
            &(aws_lc_rs_ticketer as fn() -> Arc<dyn rustls::server::ProducesTickets>),
            "aws_lc_rs",
        ),
    ] {
        for (key_type, suite_name, version, name) in [
            (
                KeyType::Rsa2048,
                CipherSuite::TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,
                &rustls::version::TLS12,
                "1.2_rsa_aes",
            ),
            (
                KeyType::Rsa2048,
                CipherSuite::TLS13_AES_128_GCM_SHA256,
                &rustls::version::TLS13,
                "1.3_rsa_aes",
            ),
            (
                KeyType::EcdsaP256,
                CipherSuite::TLS13_AES_128_GCM_SHA256,
                &rustls::version::TLS13,
                "1.3_ecdsap256_aes",
            ),
            (
                KeyType::EcdsaP384,
                CipherSuite::TLS13_AES_128_GCM_SHA256,
                &rustls::version::TLS13,
                "1.3_ecdsap384_aes",
            ),
            (
                KeyType::Rsa2048,
                CipherSuite::TLS13_CHACHA20_POLY1305_SHA256,
                &rustls::version::TLS13,
                "1.3_rsa_chacha",
            ),
            (
                KeyType::EcdsaP256,
                CipherSuite::TLS13_CHACHA20_POLY1305_SHA256,
                &rustls::version::TLS13,
                "1.3_ecdsap256_chacha",
            ),
            (
                KeyType::EcdsaP384,
                CipherSuite::TLS13_CHACHA20_POLY1305_SHA256,
                &rustls::version::TLS13,
                "1.3_ecdsap384_chacha",
            ),
        ] {
            all.push(BenchmarkParams::new(
                provider.clone(),
                ticketer,
                key_type,
                find_suite(suites, suite_name),
                version,
                format!("{provider_name}_{name}"),
            ));
        }
    }

    all
}

fn find_suite(
    all: &[rustls::SupportedCipherSuite],
    name: CipherSuite,
) -> rustls::SupportedCipherSuite {
    *all.iter()
        .find(|suite| suite.suite() == name)
        .unwrap_or_else(|| panic!("cannot find cipher suite {name:?}"))
}

fn ring_ticketer() -> Arc<dyn rustls::server::ProducesTickets> {
    ring::Ticketer::new().unwrap()
}

fn aws_lc_rs_ticketer() -> Arc<dyn rustls::server::ProducesTickets> {
    aws_lc_rs::Ticketer::new().unwrap()
}

/// Adds a group of benchmarks for the specified parameters
///
/// The benchmarks in the group are:
///
/// - Handshake without resumption
/// - Handshake with session id resumption
/// - Handshake with ticket resumption
/// - Transfer a 1MB data stream from the server to the client
fn add_benchmark_group(benchmarks: &mut Vec<Benchmark>, params: BenchmarkParams) {
    let params_label = params.label.clone();

    // Create handshake benchmarks for all resumption kinds
    for &resumption_param in ResumptionKind::ALL {
        let handshake_bench = Benchmark::new(
            format!("handshake_{}_{params_label}", resumption_param.label()),
            BenchmarkKind::Handshake(resumption_param),
            params.clone(),
        );

        let handshake_bench = if resumption_param != ResumptionKind::No {
            // Since resumed handshakes include a first non-resumed handshake, we need to subtract
            // the non-resumed handshake's instructions
            handshake_bench
                .exclude_setup_instructions(format!("handshake_no_resume_{params_label}"))
        } else {
            handshake_bench
        };

        benchmarks.push(handshake_bench);
    }

    // Benchmark data transfer
    benchmarks.push(
        Benchmark::new(
            format!("transfer_no_resume_{params_label}"),
            BenchmarkKind::Transfer,
            params.clone(),
        )
        .exclude_setup_instructions(format!("handshake_no_resume_{params_label}")),
    );
}

/// Run all the provided benches under cachegrind to retrieve their instruction count
pub fn run_all(
    executable: String,
    output_dir: PathBuf,
    benches: &[Benchmark],
) -> anyhow::Result<Vec<(String, u64)>> {
    // Run the benchmarks in parallel
    let cachegrind = CachegrindRunner::new(executable, output_dir)?;
    let results: Vec<_> = benches
        .par_iter()
        .enumerate()
        .map(|(i, bench)| (bench, cachegrind.run_bench(i as u32, bench)))
        .collect();

    // Report possible errors
    let (errors, results): (Vec<_>, FxHashMap<_, _>) =
        results
            .into_iter()
            .partition_map(|(bench, result)| match result {
                Err(_) => Either::Left(()),
                Ok(instr_counts) => Either::Right((bench.name(), instr_counts)),
            });
    if !errors.is_empty() {
        // Note: there is no need to explicitly report the names of each crashed benchmark, because
        // names and other details are automatically printed to stderr by the child process upon
        // crashing
        anyhow::bail!("One or more benchmarks crashed");
    }

    // Gather results keeping the original order of the benchmarks
    let mut measurements = Vec::new();
    for bench in benches {
        let instr_counts = get_reported_instr_count(bench, &results);
        measurements.push((bench.name_with_side(Side::Server), instr_counts.server));
        measurements.push((bench.name_with_side(Side::Client), instr_counts.client));
    }

    Ok(measurements)
}

/// Drives the different steps in a benchmark.
///
/// See [`run_bench`] for specific details on how it is used.
#[async_trait(?Send)]
trait BenchStepper {
    type Endpoint;

    async fn handshake(&mut self) -> anyhow::Result<Self::Endpoint>;
    async fn sync_before_resumed_handshake(&mut self) -> anyhow::Result<()>;
    async fn transmit_data(&mut self, endpoint: &mut Self::Endpoint) -> anyhow::Result<()>;
}

/// Stepper fields necessary for IO
struct StepperIo<'a> {
    reader: &'a mut dyn AsyncRead,
    writer: &'a mut dyn AsyncWrite,
    handshake_buf: &'a mut [u8],
}

/// A benchmark stepper for the client-side of the connection
struct ClientSideStepper<'a> {
    io: StepperIo<'a>,
    resumption_kind: ResumptionKind,
    config: Arc<ClientConfig>,
}

impl ClientSideStepper<'_> {
    fn make_config(params: &BenchmarkParams, resume: ResumptionKind) -> Arc<ClientConfig> {
        assert_eq!(params.ciphersuite.version(), params.version);
        let mut root_store = RootCertStore::empty();
        let mut rootbuf =
            io::BufReader::new(fs::File::open(params.key_type.path_for("ca.cert")).unwrap());
        root_store.add_parsable_certificates(
            rustls_pemfile::certs(&mut rootbuf).map(|result| result.unwrap()),
        );

        let mut cfg = ClientConfig::builder_with_provider(
            rustls::crypto::CryptoProvider {
                cipher_suites: vec![params.ciphersuite],
                ..params.provider.clone()
            }
            .into(),
        )
        .with_protocol_versions(&[params.version])
        .unwrap()
        .with_root_certificates(root_store)
        .with_no_client_auth();

        if resume != ResumptionKind::No {
            cfg.resumption = Resumption::in_memory_sessions(128);
        } else {
            cfg.resumption = Resumption::disabled();
        }

        Arc::new(cfg)
    }
}

#[async_trait(?Send)]
impl BenchStepper for ClientSideStepper<'_> {
    type Endpoint = ClientConnection;

    async fn handshake(&mut self) -> anyhow::Result<Self::Endpoint> {
        let server_name = "localhost".try_into().unwrap();
        let mut client = ClientConnection::new(self.config.clone(), server_name).unwrap();
        client.set_buffer_limit(None);

        loop {
            send_handshake_message(&mut client, self.io.writer, self.io.handshake_buf).await?;
            if !client.is_handshaking() && !client.wants_write() {
                break;
            }
            read_handshake_message(&mut client, self.io.reader, self.io.handshake_buf).await?;
        }

        // Session ids and tickets are no longer part of the handshake in TLS 1.3, so we need to
        // explicitly receive them from the server
        if self.resumption_kind != ResumptionKind::No
            && client.protocol_version().unwrap() == ProtocolVersion::TLSv1_3
        {
            read_handshake_message(&mut client, self.io.reader, self.io.handshake_buf).await?;
        }

        Ok(client)
    }

    async fn sync_before_resumed_handshake(&mut self) -> anyhow::Result<()> {
        // The client syncs by receiving a single byte (we assert that it matches the `42` byte sent
        // by the server, just to be sure)
        let buf = &mut [0];
        self.io.reader.read_exact(buf).await?;
        assert_eq!(buf[0], 42);
        Ok(())
    }

    async fn transmit_data(&mut self, endpoint: &mut Self::Endpoint) -> anyhow::Result<()> {
        let total_plaintext_read = read_plaintext_to_end_bounded(endpoint, self.io.reader).await?;
        assert_eq!(total_plaintext_read, TRANSFER_PLAINTEXT_SIZE);
        Ok(())
    }
}

/// A benchmark stepper for the server-side of the connection
struct ServerSideStepper<'a> {
    io: StepperIo<'a>,
    config: Arc<ServerConfig>,
}

impl ServerSideStepper<'_> {
    fn make_config(params: &BenchmarkParams, resume: ResumptionKind) -> Arc<ServerConfig> {
        assert_eq!(params.ciphersuite.version(), params.version);

        let mut cfg = ServerConfig::builder_with_provider(params.provider.clone().into())
            .with_protocol_versions(&[params.version])
            .unwrap()
            .with_client_cert_verifier(WebPkiClientVerifier::no_client_auth())
            .with_single_cert(params.key_type.get_chain(), params.key_type.get_key())
            .expect("bad certs/private key?");

        if resume == ResumptionKind::SessionId {
            cfg.session_storage = ServerSessionMemoryCache::new(128);
        } else if resume == ResumptionKind::Tickets {
            cfg.ticketer = (params.ticketer)();
        } else {
            cfg.session_storage = Arc::new(NoServerSessionStorage {});
        }

        Arc::new(cfg)
    }
}

#[async_trait(?Send)]
impl BenchStepper for ServerSideStepper<'_> {
    type Endpoint = ServerConnection;

    async fn handshake(&mut self) -> anyhow::Result<Self::Endpoint> {
        let mut server = ServerConnection::new(self.config.clone()).unwrap();
        server.set_buffer_limit(None);

        while server.is_handshaking() {
            read_handshake_message(&mut server, self.io.reader, self.io.handshake_buf).await?;
            send_handshake_message(&mut server, self.io.writer, self.io.handshake_buf).await?;
        }

        Ok(server)
    }

    async fn sync_before_resumed_handshake(&mut self) -> anyhow::Result<()> {
        // The server syncs by sending a single byte
        self.io.writer.write_all(&[42]).await?;
        self.io.writer.flush().await?;
        Ok(())
    }

    async fn transmit_data(&mut self, endpoint: &mut Self::Endpoint) -> anyhow::Result<()> {
        write_all_plaintext_bounded(endpoint, self.io.writer, TRANSFER_PLAINTEXT_SIZE).await?;
        Ok(())
    }
}

/// Runs the benchmark using the provided stepper
async fn run_bench<T: BenchStepper>(mut stepper: T, kind: BenchmarkKind) -> anyhow::Result<()> {
    let mut endpoint = stepper.handshake().await?;

    match kind {
        BenchmarkKind::Handshake(ResumptionKind::No) => {
            // Nothing else to do here, since the handshake already happened
            black_box(endpoint);
        }
        BenchmarkKind::Handshake(_) => {
            // The handshake performed above was non-resumed, because the client didn't have a
            // session ID / ticket; from now on we can perform resumed handshakes. We do it multiple
            // times, for reasons explained in the comments to `RESUMED_HANDSHAKE_RUNS`.
            for _ in 0..RESUMED_HANDSHAKE_RUNS {
                // Wait for the endpoints to sync (i.e. the server must have discarded the previous
                // connection and be ready for a new handshake, otherwise the client will start a
                // handshake before the server is ready and the bytes will be fed to the old
                // connection!)
                stepper
                    .sync_before_resumed_handshake()
                    .await?;
                stepper.handshake().await?;
            }
        }
        BenchmarkKind::Transfer => {
            stepper
                .transmit_data(&mut endpoint)
                .await?;
        }
    }

    Ok(())
}

/// The results of a comparison between two `run-all` executions
struct CompareResult {
    /// Results for benchmark scenarios we know are fairly deterministic.
    ///
    /// The string is a detailed diff between the instruction counts obtained from cachegrind.
    diffs: Vec<(Diff, String)>,
    /// Results for benchmark scenarios we know are extremely non-deterministic
    known_noisy: Vec<Diff>,
    /// Benchmark scenarios present in the candidate but missing in the baseline
    missing_in_baseline: Vec<String>,
}

/// Contains information about instruction counts and their difference for a specific scenario
#[derive(Clone)]
struct Diff {
    scenario: String,
    baseline: u64,
    candidate: u64,
    diff: i64,
    diff_ratio: f64,
}

/// Reads the (benchmark, instruction count) pairs from previous CSV output
fn read_results(path: &Path) -> anyhow::Result<HashMap<String, u64>> {
    let file = File::open(path).context(format!(
        "CSV file for comparison not found: {}",
        path.display()
    ))?;

    let mut measurements = HashMap::new();
    for line in BufReader::new(file).lines() {
        let line = line.context("Unable to read results from CSV file")?;
        let line = line.trim();
        let mut parts = line.split(',');
        measurements.insert(
            parts
                .next()
                .ok_or(anyhow::anyhow!("CSV is wrongly formatted"))?
                .to_string(),
            parts
                .next()
                .ok_or(anyhow::anyhow!("CSV is wrongly formatted"))?
                .parse()
                .context("Unable to parse instruction count from CSV")?,
        );
    }

    Ok(measurements)
}

/// Returns an internal representation of the comparison between the baseline and the candidate
/// measurements
fn compare_results(
    baseline_dir: &Path,
    candidate_dir: &Path,
    baseline: &HashMap<String, u64>,
    candidate: &HashMap<String, u64>,
) -> anyhow::Result<CompareResult> {
    let mut diffs = Vec::new();
    let mut missing = Vec::new();
    let mut known_noisy = Vec::new();

    for (scenario, &instr_count) in candidate {
        let Some(&baseline_instr_count) = baseline.get(scenario) else {
            missing.push(scenario.clone());
            continue;
        };

        let diff = instr_count as i64 - baseline_instr_count as i64;
        let diff_ratio = diff as f64 / baseline_instr_count as f64;
        let diff = Diff {
            scenario: scenario.clone(),
            baseline: baseline_instr_count,
            candidate: instr_count,
            diff,
            diff_ratio,
        };

        match is_known_noisy(scenario) {
            true => known_noisy.push(diff),
            false => diffs.push(diff),
        };
    }

    diffs.sort_by(|diff1, diff2| {
        diff2
            .diff_ratio
            .abs()
            .total_cmp(&diff1.diff_ratio.abs())
    });

    let mut diffs_with_cachegrind_diff = Vec::new();
    for diff in diffs {
        let detailed_diff = cachegrind::diff(baseline_dir, candidate_dir, &diff.scenario)?;
        diffs_with_cachegrind_diff.push((diff, detailed_diff));
    }

    Ok(CompareResult {
        diffs: diffs_with_cachegrind_diff,
        missing_in_baseline: missing,
        known_noisy,
    })
}

fn is_known_noisy(scenario_name: &str) -> bool {
    // aws-lc-rs RSA key validation is non-deterministic, and expensive in relative terms for
    // "cheaper" tests, and only done for server-side tests.  Exclude these tests
    // from comparison.
    //
    // Better solutions for this include:
    // - https://github.com/rustls/rustls/issues/1494: exclude key validation in these tests.
    //   Key validation is benchmarked separately elsewhere, and mostly amortised into
    //   insignificance in real-world scenarios.
    // - Find a way to make aws-lc-rs deterministic, such as by replacing its RNG with a
    //   test-only one.
    scenario_name.contains("_aws_lc_rs_")
        && scenario_name.contains("_rsa_")
        && scenario_name.ends_with("_server")
}

/// Prints a report of the comparison to stdout, using GitHub-flavored markdown
fn print_report(result: &CompareResult) {
    println!("# Benchmark results");

    if !result.missing_in_baseline.is_empty() {
        println!("### ⚠️ Warning: missing benchmarks");
        println!();
        println!("The following benchmark scenarios are present in the candidate but not in the baseline:");
        println!();
        for scenario in &result.missing_in_baseline {
            println!("* {scenario}");
        }
    }

    println!("## Instruction count differences");
    if result.diffs.is_empty() {
        println!("_There are no instruction count differences_");
    } else {
        table(
            result
                .diffs
                .iter()
                .map(|(diff, _)| diff),
            true,
        );
        println!("<details>");
        println!("<summary>Details per scenario</summary>\n");
        for (diff, detailed_diff) in &result.diffs {
            println!("#### {}", diff.scenario);
            println!("```");
            println!("{detailed_diff}");
            println!("```");
        }
        println!("</details>\n")
    }

    if !result.known_noisy.is_empty() {
        println!("### ‼️ Caution: ignored noisy benchmarks");
        println!("<details>");
        println!("<summary>Click to expand</summary>\n");
        table(result.known_noisy.iter(), false);
        println!("</details>\n")
    }
}

/// Renders the diffs as a markdown table
fn table<'a>(diffs: impl Iterator<Item = &'a Diff>, emoji_feedback: bool) {
    println!("| Scenario | Baseline | Candidate | Diff |");
    println!("| --- | ---: | ---: | ---: |");
    for diff in diffs {
        let emoji = match emoji_feedback {
            true if diff.diff > 0 => "⚠️ ",
            true if diff.diff < 0 => "✅ ",
            _ => "",
        };

        println!(
            "| {} | {} | {} | {}{} ({:.2}%) |",
            diff.scenario,
            diff.baseline,
            diff.candidate,
            emoji,
            diff.diff,
            diff.diff_ratio * 100.0
        )
    }
}

#[cfg(not(target_env = "msvc"))]
#[global_allocator]
static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;