sled/src/heap.rs


use std::cmp::Reverse;
use std::collections::BinaryHeap;
use std::fmt;
use std::fs;
use std::io;
use std::num::NonZeroU64;
use std::path::{Path, PathBuf};
use std::sync::atomic::{AtomicPtr, AtomicU64, Ordering};
use std::sync::{Arc, Mutex};
use concurrent_map::Minimum;
use crossbeam_queue::SegQueue;
use ebr::{Ebr, Guard};
use fault_injection::{annotate, fallible, maybe};
use fnv::FnvHashSet;
use fs2::FileExt as _;
use pagetable::PageTable;
use rayon::prelude::*;
use crate::metadata_store::MetadataStore;
use crate::{CollectionId, NodeId};
const WARN: &str = "DO_NOT_PUT_YOUR_FILES_HERE";
const N_SLABS: usize = 78;
const SLAB_SIZES: [usize; N_SLABS] = [
    64,          // 0x40
    80,          // 0x50
    96,          // 0x60
    112,         // 0x70
    128,         // 0x80
    160,         // 0xa0
    192,         // 0xc0
    224,         // 0xe0
    256,         // 0x100
    320,         // 0x140
    384,         // 0x180
    448,         // 0x1c0
    512,         // 0x200
    640,         // 0x280
    768,         // 0x300
    896,         // 0x380
    1024,        // 0x400
    1280,        // 0x500
    1536,        // 0x600
    1792,        // 0x700
    2048,        // 0x800
    2560,        // 0xa00
    3072,        // 0xc00
    3584,        // 0xe00
    4096,        // 0x1000
    5120,        // 0x1400
    6144,        // 0x1800
    7168,        // 0x1c00
    8192,        // 0x2000
    10240,       // 0x2800
    12288,       // 0x3000
    14336,       // 0x3800
    16384,       // 0x4000
    20480,       // 0x5000
    24576,       // 0x6000
    28672,       // 0x7000
    32768,       // 0x8000
    40960,       // 0xa000
    49152,       // 0xc000
    57344,       // 0xe000
    65536,       // 0x10000
    98304,       // 0x18000
    131072,      // 0x20000
    163840,      // 0x28000
    196608,      // 0x30000
    262144,      // 0x40000
    393216,      // 0x60000
    524288,      // 0x80000
    786432,      // 0xc0000
    1048576,     // 0x100000
    1572864,     // 0x180000
    2097152,     // 0x200000
    3145728,     // 0x300000
    4194304,     // 0x400000
    6291456,     // 0x600000
    8388608,     // 0x800000
    12582912,    // 0xc00000
    16777216,    // 0x1000000
    25165824,    // 0x1800000
    33554432,    // 0x2000000
    50331648,    // 0x3000000
    67108864,    // 0x4000000
    100663296,   // 0x6000000
    134217728,   // 0x8000000
    201326592,   // 0xc000000
    268435456,   // 0x10000000
    402653184,   // 0x18000000
    536870912,   // 0x20000000
    805306368,   // 0x30000000
    1073741824,  // 0x40000000
    1610612736,  // 0x60000000
    2147483648,  // 0x80000000
    3221225472,  // 0xc0000000
    4294967296,  // 0x100000000
    6442450944,  // 0x180000000
    8589934592,  // 0x200000000
    12884901888, // 0x300000000
    17179869184, // 0x400000000
];
const fn overhead_for_size(size: usize) -> usize {
    if size + 5 <= u8::MAX as usize {
        // crc32 + 1 byte frame
        5
    } else if size + 6 <= u16::MAX as usize {
        // crc32 + 2 byte frame
        6
    } else if size + 8 <= u32::MAX as usize {
        // crc32 + 4 byte frame
        8
    } else {
        // crc32 + 8 byte frame
        12
    }
}
fn slab_for_size(size: usize) -> u8 {
    for (idx, &slot_size) in SLAB_SIZES.iter().enumerate() {
        // Slab::write sizes its length frame from the slot size, not the payload
        // size, so check the fit against the overhead this slot will actually use.
        let frame_overhead = match slot_size {
            s if s <= u8::MAX as usize => 5,
            s if s <= u16::MAX as usize => 6,
            s if s <= u32::MAX as usize => 8,
            _ => 12,
        };
        if slot_size >= size + frame_overhead {
            return idx as u8;
        }
    }
    u8::MAX
}
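
// Test-only sanity sketch (not part of the original file): for a handful of
// example payload sizes, the slab chosen by slab_for_size must be able to hold
// the payload plus the length-frame + crc32 overhead that Slab::write will use
// for that slot size. The size list is arbitrary and includes the u16 frame
// boundary value 65_529.
#[cfg(test)]
mod slab_sizing_sketch {
    use super::*;

    #[test]
    fn chosen_slab_fits_payload_and_frame() {
        for &size in &[0usize, 1, 59, 64, 250, 251, 4096, 65_529, 1 << 20] {
            let idx = slab_for_size(size);
            let slot = SLAB_SIZES[usize::from(idx)];
            // recompute the frame overhead exactly as Slab::write chooses it
            let frame = if slot <= u8::MAX as usize {
                5
            } else if slot <= u16::MAX as usize {
                6
            } else if slot <= u32::MAX as usize {
                8
            } else {
                12
            };
            assert!(slot >= size + frame);
        }
    }
}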
pub use inline_array::InlineArray;
#[derive(Debug, Clone)]
pub struct Stats {}
#[derive(Debug, Clone)]
pub struct Config {
    pub path: PathBuf,
}
pub(crate) fn recover<P: AsRef<Path>>(
    storage_directory: P,
) -> io::Result<(Heap, Vec<(NodeId, CollectionId, InlineArray)>)> {
    Heap::recover(&Config { path: storage_directory.as_ref().into() })
}
/// A (slab, slot) pair. The slot is stored as 7 big-endian bytes so that,
/// together with `slab_id + 1` in the high byte, the whole address packs into
/// a NonZeroU64 for storage in the page table.
struct SlabAddress {
    slab_id: u8,
    slab_slot: [u8; 7],
}

impl SlabAddress {
    fn from_slab_slot(slab: u8, slot: u64) -> SlabAddress {
        let slot_bytes = slot.to_be_bytes();
        assert_eq!(slot_bytes[0], 0);
        SlabAddress {
            slab_id: slab,
            slab_slot: slot_bytes[1..].try_into().unwrap(),
        }
    }

    fn slot(&self) -> u64 {
        u64::from_be_bytes([
            0,
            self.slab_slot[0],
            self.slab_slot[1],
            self.slab_slot[2],
            self.slab_slot[3],
            self.slab_slot[4],
            self.slab_slot[5],
            self.slab_slot[6],
        ])
    }
}
impl From<NonZeroU64> for SlabAddress {
    fn from(i: NonZeroU64) -> SlabAddress {
        let i = i.get();
        let bytes = i.to_be_bytes();
        SlabAddress {
            slab_id: bytes[0] - 1,
            slab_slot: bytes[1..].try_into().unwrap(),
        }
    }
}

impl From<SlabAddress> for NonZeroU64 {
    fn from(sa: SlabAddress) -> NonZeroU64 {
        NonZeroU64::new(u64::from_be_bytes([
            sa.slab_id + 1,
            sa.slab_slot[0],
            sa.slab_slot[1],
            sa.slab_slot[2],
            sa.slab_slot[3],
            sa.slab_slot[4],
            sa.slab_slot[5],
            sa.slab_slot[6],
        ]))
        .unwrap()
    }
}
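
// Illustration (test-only, not part of the original file): a SlabAddress
// survives a round trip through the packed NonZeroU64 form used in the page
// table. The slab id and slot values here are arbitrary.
#[cfg(test)]
mod slab_address_sketch {
    use super::*;

    #[test]
    fn round_trips_through_nonzero_u64() {
        let original = SlabAddress::from_slab_slot(3, 0x0102_0304_0506);
        let (slab_id, slot) = (original.slab_id, original.slot());

        let packed: NonZeroU64 = original.into();
        let unpacked = SlabAddress::from(packed);

        assert_eq!(unpacked.slab_id, slab_id);
        assert_eq!(unpacked.slot(), slot);
    }
}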
#[derive(Default, Debug)]
struct Allocator {
    free_and_pending: Mutex<BinaryHeap<Reverse<u64>>>,
    free_queue: SegQueue<u64>,
    next_to_allocate: AtomicU64,
}

impl Allocator {
    /// Rebuild the allocator from the set of ids found to be live during
    /// recovery: every id below the maximum allocated id that is not in the
    /// set becomes immediately reusable.
    fn from_allocated(allocated: &FnvHashSet<u64>) -> Allocator {
        let mut heap = BinaryHeap::<Reverse<u64>>::default();
        let max = allocated.iter().copied().max();

        for i in 0..max.unwrap_or(0) {
            if !allocated.contains(&i) {
                heap.push(Reverse(i));
            }
        }

        Allocator {
            free_and_pending: Mutex::new(heap),
            free_queue: SegQueue::default(),
            next_to_allocate: max.map(|m| m + 1).unwrap_or(0).into(),
        }
    }

    /// Hand out the lowest free id, folding in any ids that were freed while
    /// the mutex was contended, and minting a fresh id if none are free.
    fn allocate(&self) -> u64 {
        let mut free = self.free_and_pending.lock().unwrap();
        while let Some(free_id) = self.free_queue.pop() {
            free.push(Reverse(free_id));
        }
        let pop_attempt = free.pop();

        if let Some(id) = pop_attempt {
            id.0
        } else {
            self.next_to_allocate.fetch_add(1, Ordering::Release)
        }
    }

    /// Return an id to the free pool without blocking if the mutex is held;
    /// queued ids are folded back in on the next allocate or free call.
    fn free(&self, id: u64) {
        if let Ok(mut free) = self.free_and_pending.try_lock() {
            while let Some(free_id) = self.free_queue.pop() {
                free.push(Reverse(free_id));
            }
            free.push(Reverse(id));
        } else {
            self.free_queue.push(id);
        }
    }
}
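
// Behavior sketch (test-only, not part of the original file): the allocator
// hands out the lowest free id first and rebuilds its free list from the set
// of ids observed as live during recovery. The id values are arbitrary.
#[cfg(test)]
mod allocator_sketch {
    use super::*;

    #[test]
    fn reuses_lowest_freed_id_first() {
        let a = Allocator::default();
        assert_eq!(a.allocate(), 0);
        assert_eq!(a.allocate(), 1);
        assert_eq!(a.allocate(), 2);

        a.free(1);

        // 1 is the lowest free id, so it is handed out again before 3 is minted
        assert_eq!(a.allocate(), 1);
        assert_eq!(a.allocate(), 3);
    }

    #[test]
    fn rebuilds_free_list_from_allocated_set() {
        let allocated: FnvHashSet<u64> = [0, 2, 5].into_iter().collect();
        let a = Allocator::from_allocated(&allocated);

        // holes below the maximum allocated id come back first, then fresh ids
        assert_eq!(a.allocate(), 1);
        assert_eq!(a.allocate(), 3);
        assert_eq!(a.allocate(), 4);
        assert_eq!(a.allocate(), 6);
    }
}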
#[cfg(unix)]
mod sys_io {
    use std::io;
    use std::os::unix::fs::FileExt;

    use super::*;

    pub fn read_exact_at<F: FileExt>(
        file: &F,
        buf: &mut [u8],
        offset: u64,
    ) -> io::Result<()> {
        maybe!(file.read_exact_at(buf, offset))
    }

    pub fn write_all_at<F: FileExt>(
        file: &F,
        buf: &[u8],
        offset: u64,
    ) -> io::Result<()> {
        maybe!(file.write_all_at(buf, offset))
    }
}
#[cfg(windows)]
mod sys_io {
    use std::os::windows::fs::FileExt;

    use super::*;

    pub fn read_exact_at<F: FileExt>(
        file: &F,
        mut buf: &mut [u8],
        mut offset: u64,
    ) -> io::Result<()> {
        while !buf.is_empty() {
            match maybe!(file.seek_read(buf, offset)) {
                Ok(0) => break,
                Ok(n) => {
                    let tmp = buf;
                    buf = &mut tmp[n..];
                    offset += n as u64;
                }
                Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {}
                Err(e) => return Err(annotate!(e)),
            }
        }
        if !buf.is_empty() {
            Err(annotate!(io::Error::new(
                io::ErrorKind::UnexpectedEof,
                "failed to fill whole buffer"
            )))
        } else {
            Ok(())
        }
    }

    pub fn write_all_at<F: FileExt>(
        file: &F,
        mut buf: &[u8],
        mut offset: u64,
    ) -> io::Result<()> {
        while !buf.is_empty() {
            match maybe!(file.seek_write(buf, offset)) {
                Ok(0) => {
                    return Err(annotate!(io::Error::new(
                        io::ErrorKind::WriteZero,
                        "failed to write whole buffer",
                    )));
                }
                Ok(n) => {
                    buf = &buf[n..];
                    offset += n as u64;
                }
                Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {}
                Err(e) => return Err(annotate!(e)),
            }
        }
        Ok(())
    }
}
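
// Portability sketch (test-only, not part of the original file): both platform
// backends expose the same positional read/write pair, so a write at an
// arbitrary offset can be read back without touching a shared file cursor.
// The temp file name is made up for the test.
#[cfg(test)]
mod sys_io_sketch {
    use super::*;

    #[test]
    fn positional_write_then_read() -> io::Result<()> {
        let path = std::env::temp_dir().join("sled_heap_sys_io_sketch");
        let file = fs::OpenOptions::new()
            .create(true)
            .read(true)
            .write(true)
            .open(&path)?;

        sys_io::write_all_at(&file, b"positional", 128)?;

        let mut buf = [0u8; 10];
        sys_io::read_exact_at(&file, &mut buf, 128)?;
        assert_eq!(&buf, b"positional");

        let _ = fs::remove_file(&path);
        Ok(())
    }
}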
#[derive(Debug)]
struct Slab {
    file: fs::File,
    slot_size: usize,
    slot_allocator: Arc<Allocator>,
}
impl Slab {
    fn maintenance(&self) -> io::Result<usize> {
        // TODO compact
        Ok(0)
    }
    fn read(
        &self,
        slot: u64,
        _guard: &mut Guard<'_, DeferredFree, 1>,
    ) -> io::Result<Vec<u8>> {
        // zero-initialize the slot buffer instead of calling set_len on
        // uninitialized capacity, which would expose uninitialized memory
        let mut data = vec![0u8; self.slot_size];

        let whence = self.slot_size as u64 * slot;

        sys_io::read_exact_at(&self.file, &mut data, whence)?;

        let hash_actual: [u8; 4] =
            crc32fast::hash(&data[..self.slot_size - 4]).to_le_bytes();
        let hash_expected = &data[self.slot_size - 4..];

        if hash_expected != hash_actual {
            return Err(annotate!(io::Error::new(
                io::ErrorKind::InvalidData,
                "crc mismatch - data corruption detected"
            )));
        }

        let len: usize = if self.slot_size <= u8::MAX as usize {
            // crc32 + 1 byte frame
            usize::from(data[self.slot_size - 5])
        } else if self.slot_size <= u16::MAX as usize {
            // crc32 + 2 byte frame
            let mut size_bytes: [u8; 2] = [0; 2];
            size_bytes
                .copy_from_slice(&data[self.slot_size - 6..self.slot_size - 4]);
            usize::from(u16::from_le_bytes(size_bytes))
        } else if self.slot_size <= u32::MAX as usize {
            // crc32 + 4 byte frame
            let mut size_bytes: [u8; 4] = [0; 4];
            size_bytes
                .copy_from_slice(&data[self.slot_size - 8..self.slot_size - 4]);
            usize::try_from(u32::from_le_bytes(size_bytes)).unwrap()
        } else {
            // crc32 + 8 byte frame
            let mut size_bytes: [u8; 8] = [0; 8];
            size_bytes.copy_from_slice(
                &data[self.slot_size - 12..self.slot_size - 4],
            );
            usize::try_from(u64::from_le_bytes(size_bytes)).unwrap()
        };

        data.truncate(len);

        Ok(data)
    }
    fn write(&self, slot: u64, mut data: Vec<u8>) -> io::Result<()> {
        let len = data.len();
        assert!(len + overhead_for_size(data.len()) <= self.slot_size);
        data.resize(self.slot_size, 0);

        if self.slot_size <= u8::MAX as usize {
            // crc32 + 1 byte frame
            data[self.slot_size - 5] = u8::try_from(len).unwrap();
        } else if self.slot_size <= u16::MAX as usize {
            // crc32 + 2 byte frame
            let size_bytes: [u8; 2] = u16::try_from(len).unwrap().to_le_bytes();
            data[self.slot_size - 6..self.slot_size - 4]
                .copy_from_slice(&size_bytes);
        } else if self.slot_size <= u32::MAX as usize {
            // crc32 + 4 byte frame
            let size_bytes: [u8; 4] = u32::try_from(len).unwrap().to_le_bytes();
            data[self.slot_size - 8..self.slot_size - 4]
                .copy_from_slice(&size_bytes);
        } else {
            // crc32 + 8 byte frame
            let size_bytes: [u8; 8] = u64::try_from(len).unwrap().to_le_bytes();
            data[self.slot_size - 12..self.slot_size - 4]
                .copy_from_slice(&size_bytes);
        }

        let hash: [u8; 4] =
            crc32fast::hash(&data[..self.slot_size - 4]).to_le_bytes();
        data[self.slot_size - 4..].copy_from_slice(&hash);

        let whence = self.slot_size as u64 * slot;

        sys_io::write_all_at(&self.file, &data, whence)
    }
}
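
// Round-trip sketch (test-only, not part of the original file): a payload
// written into a slot comes back intact, exercising the on-slot layout of
// payload | zero padding | little-endian length | crc32. The temp file name
// and slot index are made up for the test.
#[cfg(test)]
mod slot_frame_sketch {
    use super::*;

    #[test]
    fn slot_write_then_read_round_trips() -> io::Result<()> {
        let path = std::env::temp_dir().join("sled_heap_slot_frame_sketch");
        let file = fs::OpenOptions::new()
            .create(true)
            .read(true)
            .write(true)
            .open(&path)?;

        let slab = Slab {
            file,
            // 64-byte slots use the 1-byte length frame
            slot_size: SLAB_SIZES[0],
            slot_allocator: Arc::new(Allocator::default()),
        };

        let ebr: Ebr<DeferredFree, 1> = Ebr::default();
        let mut guard = ebr.pin();

        let payload = b"hello heap".to_vec();
        slab.write(3, payload.clone())?;
        assert_eq!(slab.read(3, &mut guard)?, payload);

        let _ = fs::remove_file(&path);
        Ok(())
    }
}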
struct DeferredFree {
    allocator: Arc<Allocator>,
    freed_slot: u64,
}

impl Drop for DeferredFree {
    fn drop(&mut self) {
        self.allocator.free(self.freed_slot)
    }
}
fn set_error(
    global_error: &AtomicPtr<(io::ErrorKind, String)>,
    error: &io::Error,
) {
    let kind = error.kind();
    let reason = error.to_string();

    let boxed = Box::new((kind, reason));
    let ptr = Box::into_raw(boxed);

    if global_error
        .compare_exchange(
            std::ptr::null_mut(),
            ptr,
            Ordering::SeqCst,
            Ordering::SeqCst,
        )
        .is_err()
    {
        // global fatal error already installed, drop this one
        unsafe {
            drop(Box::from_raw(ptr));
        }
    }
}
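
// Sketch (test-only, not part of the original file): only the first error is
// installed in the global error slot; later errors are dropped so the original
// failure cause is preserved for callers of check_error. The installed box is
// left in place here, as in the code above.
#[cfg(test)]
mod global_error_sketch {
    use super::*;

    #[test]
    fn first_error_wins() {
        let global = AtomicPtr::<(io::ErrorKind, String)>::default();

        set_error(&global, &io::Error::new(io::ErrorKind::Other, "first"));
        set_error(&global, &io::Error::new(io::ErrorKind::Other, "second"));

        let installed = unsafe { &*global.load(Ordering::Acquire) };
        assert_eq!(installed.0, io::ErrorKind::Other);
        assert_eq!(installed.1, "first");
    }
}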
#[derive(Debug)]
pub(crate) enum Update {
    Store {
        node_id: NodeId,
        collection_id: CollectionId,
        metadata: InlineArray,
        data: Vec<u8>,
    },
    Free {
        node_id: NodeId,
        collection_id: CollectionId,
    },
}

#[derive(Debug, PartialOrd, Ord, PartialEq, Eq)]
pub(crate) enum UpdateMetadata {
    Store {
        node_id: NodeId,
        collection_id: CollectionId,
        metadata: InlineArray,
        location: NonZeroU64,
    },
    Free {
        node_id: NodeId,
        collection_id: CollectionId,
    },
}

impl UpdateMetadata {
    pub fn node_id(&self) -> NodeId {
        match self {
            UpdateMetadata::Store { node_id, .. }
            | UpdateMetadata::Free { node_id, .. } => *node_id,
        }
    }
}
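
// Ordering sketch (test-only, not part of the original file): the derived Ord
// on UpdateMetadata sorts Store entries before Free entries and then by
// node_id, which is the order write_batch produces when it sorts the metadata
// batch before handing it to the metadata store. The ids and locations are
// arbitrary, and constructing NodeId from a raw u64 is assumed to be visible
// to this module, as the `.0` field reads elsewhere in this file suggest.
#[cfg(test)]
mod update_metadata_order_sketch {
    use super::*;

    #[test]
    fn sorts_stores_before_frees_then_by_node_id() {
        let free = UpdateMetadata::Free {
            node_id: NodeId(1),
            collection_id: CollectionId::MIN,
        };
        let store_hi = UpdateMetadata::Store {
            node_id: NodeId(9),
            collection_id: CollectionId::MIN,
            metadata: InlineArray::from(&b""[..]),
            location: NonZeroU64::new(1).unwrap(),
        };
        let store_lo = UpdateMetadata::Store {
            node_id: NodeId(2),
            collection_id: CollectionId::MIN,
            metadata: InlineArray::from(&b""[..]),
            location: NonZeroU64::new(1).unwrap(),
        };

        let mut batch = vec![free, store_hi, store_lo];
        batch.sort_unstable();

        let sorted_ids: Vec<NodeId> =
            batch.iter().map(UpdateMetadata::node_id).collect();
        assert_eq!(sorted_ids, vec![NodeId(2), NodeId(9), NodeId(1)]);
    }
}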
#[derive(Clone)]
pub(crate) struct Heap {
    config: Config,
    slabs: Arc<[Slab; N_SLABS]>,
    pt: PageTable<AtomicU64>,
    object_id_allocator: Arc<Allocator>,
    metadata_store: Arc<MetadataStore>,
    free_ebr: Ebr<DeferredFree, 1>,
    global_error: Arc<AtomicPtr<(io::ErrorKind, String)>>,
    #[allow(unused)]
    directory_lock: Arc<fs::File>,
}

impl fmt::Debug for Heap {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.debug_struct("Heap")
            .field("config", &self.config.path)
            .field("stats", &self.stats())
            .finish()
    }
}
impl Heap {
    pub fn get_global_error_arc(
        &self,
    ) -> Arc<AtomicPtr<(io::ErrorKind, String)>> {
        self.global_error.clone()
    }

    fn check_error(&self) -> io::Result<()> {
        let err_ptr: *const (io::ErrorKind, String) =
            self.global_error.load(Ordering::Acquire);

        if err_ptr.is_null() {
            Ok(())
        } else {
            let deref: &(io::ErrorKind, String) = unsafe { &*err_ptr };
            Err(io::Error::new(deref.0, deref.1.clone()))
        }
    }

    fn set_error(&self, error: &io::Error) {
        set_error(&self.global_error, error);
    }
    pub fn recover(
        config: &Config,
    ) -> io::Result<(Heap, Vec<(NodeId, CollectionId, InlineArray)>)> {
        log::trace!("recovering Heap at {:?}", config.path);
        let slabs_dir = config.path.join("slabs");

        // initialize directories if not present
        for p in [&config.path, &slabs_dir] {
            if let Err(e) = fs::read_dir(p) {
                if e.kind() == io::ErrorKind::NotFound {
                    fallible!(fs::create_dir_all(p));
                }
            }
        }

        let _ = fs::File::create(config.path.join(WARN));

        let mut file_lock_opts = fs::OpenOptions::new();
        file_lock_opts.create(false).read(false).write(false);

        let directory_lock = fallible!(fs::File::open(&config.path));
        fallible!(directory_lock.try_lock_exclusive());

        let (metadata_store, recovered_metadata) =
            MetadataStore::recover(config.path.join("metadata"))?;

        let pt = PageTable::<AtomicU64>::default();
        let mut user_data =
            Vec::<(NodeId, CollectionId, InlineArray)>::with_capacity(
                recovered_metadata.len(),
            );
        let mut node_ids: FnvHashSet<u64> = Default::default();
        let mut slots_per_slab: [FnvHashSet<u64>; N_SLABS] =
            core::array::from_fn(|_| Default::default());

        for update_metadata in recovered_metadata {
            match update_metadata {
                UpdateMetadata::Store {
                    node_id,
                    collection_id,
                    location,
                    metadata,
                } => {
                    node_ids.insert(node_id.0);
                    let slab_address = SlabAddress::from(location);
                    slots_per_slab[slab_address.slab_id as usize]
                        .insert(slab_address.slot());
                    pt.get(node_id.0).store(location.get(), Ordering::Relaxed);
                    user_data.push((node_id, collection_id, metadata.clone()));
                }
                UpdateMetadata::Free { .. } => {
                    unreachable!()
                }
            }
        }

        let mut slabs = vec![];
        let mut slab_opts = fs::OpenOptions::new();
        slab_opts.create(true).read(true).write(true);
        for i in 0..N_SLABS {
            let slot_size = SLAB_SIZES[i];
            let slab_path = slabs_dir.join(format!("{}", slot_size));
            let file = fallible!(slab_opts.open(slab_path));

            slabs.push(Slab {
                slot_size,
                file,
                slot_allocator: Arc::new(Allocator::from_allocated(
                    &slots_per_slab[i],
                )),
            })
        }

        log::info!("recovery of Heap at {:?} complete", config.path);

        Ok((
            Heap {
                slabs: Arc::new(slabs.try_into().unwrap()),
                config: config.clone(),
                object_id_allocator: Arc::new(Allocator::from_allocated(
                    &node_ids,
                )),
                pt,
                global_error: metadata_store.get_global_error_arc(),
                metadata_store: Arc::new(metadata_store),
                directory_lock: Arc::new(directory_lock),
                free_ebr: Ebr::default(),
            },
            user_data,
        ))
    }
    pub fn maintenance(&self) -> io::Result<usize> {
        for slab in self.slabs.iter() {
            slab.maintenance()?;
        }
        Ok(0)
    }

    pub fn stats(&self) -> Stats {
        Stats {}
    }
    pub fn read(&self, object_id: u64) -> io::Result<Vec<u8>> {
        self.check_error()?;

        let mut trace_spin = false;
        let mut guard = self.free_ebr.pin();

        let slab_address = loop {
            let location_u64 = self.pt.get(object_id).load(Ordering::Acquire);

            if let Some(nzu) = NonZeroU64::new(location_u64) {
                break SlabAddress::from(nzu);
            } else {
                if !trace_spin {
                    log::warn!("spinning for paged-out object to be persisted");
                    trace_spin = true;
                }
                std::thread::yield_now();
            }
        };

        let slab = &self.slabs[usize::from(slab_address.slab_id)];

        match slab.read(slab_address.slot(), &mut guard) {
            Ok(bytes) => Ok(bytes),
            Err(e) => {
                self.set_error(&e);
                Err(e)
            }
        }
    }
    pub fn write_batch(&self, batch: Vec<Update>) -> io::Result<()> {
        self.check_error()?;
        let mut guard = self.free_ebr.pin();

        let slabs = &self.slabs;

        let map_closure = |update: Update| match update {
            Update::Store { node_id, collection_id, metadata, data } => {
                let slab_id = slab_for_size(data.len());
                let slab = &slabs[usize::from(slab_id)];
                let slot = slab.slot_allocator.allocate();
                let new_location = SlabAddress::from_slab_slot(slab_id, slot);
                let new_location_nzu: NonZeroU64 = new_location.into();

                let complete_durability_pipeline =
                    maybe!(slab.write(slot, data));
                if let Err(e) = complete_durability_pipeline {
                    // can immediately free the slot: its metadata was never
                    // persisted, so no reader can observe this location
                    slab.slot_allocator.free(slot);
                    return Err(e);
                }

                Ok(UpdateMetadata::Store {
                    node_id,
                    collection_id,
                    metadata,
                    location: new_location_nzu,
                })
            }
            Update::Free { node_id, collection_id } => {
                Ok(UpdateMetadata::Free { node_id, collection_id })
            }
        };

        let metadata_batch_res: io::Result<Vec<UpdateMetadata>> =
            batch.into_par_iter().map(map_closure).collect();

        let metadata_batch = match metadata_batch_res {
            Ok(mut mb) => {
                // TODO evaluate the benefit-to-cost ratio of this sort
                mb.par_sort_unstable();
                mb
            }
            Err(e) => {
                self.set_error(&e);
                return Err(e);
            }
        };

        // make metadata durable
        if let Err(e) = self.metadata_store.insert_batch(&metadata_batch) {
            self.set_error(&e);
            return Err(e);
        }

        // reclaim previous disk locations for future writes
        for update_metadata in metadata_batch {
            let (node_id, new_location) = match update_metadata {
                UpdateMetadata::Store { node_id, location, .. } => {
                    (node_id, location.get())
                }
                UpdateMetadata::Free { node_id, .. } => (node_id, 0),
            };

            let last_u64 =
                self.pt.get(node_id.0).swap(new_location, Ordering::Release);

            if let Some(nzu) = NonZeroU64::new(last_u64) {
                let last_address = SlabAddress::from(nzu);

                guard.defer_drop(DeferredFree {
                    allocator: self.slabs[usize::from(last_address.slab_id)]
                        .slot_allocator
                        .clone(),
                    freed_slot: last_address.slot(),
                });
            }
        }

        Ok(())
    }
    pub fn allocate_object_id(&self) -> u64 {
        self.object_id_allocator.allocate()
    }

    pub fn free(&self, node_id: NodeId) -> io::Result<()> {
        let mut guard = self.free_ebr.pin();
        if let Err(e) =
            self.metadata_store.insert_batch(&[UpdateMetadata::Free {
                node_id,
                collection_id: CollectionId::MIN,
            }])
        {
            self.set_error(&e);
            return Err(e);
        }

        let last_u64 = self.pt.get(node_id.0).swap(0, Ordering::Release);

        if let Some(nzu) = NonZeroU64::new(last_u64) {
            let last_address = SlabAddress::from(nzu);

            guard.defer_drop(DeferredFree {
                allocator: self.slabs[usize::from(last_address.slab_id)]
                    .slot_allocator
                    .clone(),
                freed_slot: last_address.slot(),
            });
        }

        Ok(())
    }
}
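
// End-to-end usage sketch (test-only, not part of the original file): recover
// a fresh Heap in a scratch directory, store one object, read it back, then
// free it. The directory name is made up, and constructing NodeId from a raw
// u64 is assumed to be visible to this module, as the `.0` field reads above
// suggest.
#[cfg(test)]
mod heap_usage_sketch {
    use super::*;

    #[test]
    fn store_read_free_round_trip() -> io::Result<()> {
        let path = std::env::temp_dir().join("sled_heap_usage_sketch");
        let _ = fs::remove_dir_all(&path);

        let (heap, recovered) = recover(&path)?;
        assert!(recovered.is_empty());

        let object_id = heap.allocate_object_id();
        heap.write_batch(vec![Update::Store {
            node_id: NodeId(object_id),
            collection_id: CollectionId::MIN,
            metadata: InlineArray::from(&b"metadata"[..]),
            data: b"payload bytes".to_vec(),
        }])?;

        assert_eq!(heap.read(object_id)?, b"payload bytes".to_vec());

        heap.free(NodeId(object_id))?;

        let _ = fs::remove_dir_all(&path);
        Ok(())
    }
}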