// oxbow/crates/oxbow-lambda-shared/src/lib.rs
/**
 * oxbow-lambda-shared contains common helper functions and utilities for all oxbow-related lambdas
*/
use aws_lambda_events::s3::{S3Event, S3EventRecord, S3Object};
use aws_lambda_events::sqs::SqsEvent;
use chrono::prelude::*;
use deltalake::{DeltaResult, ObjectMeta, Path};
use std::collections::HashMap;
/**
 * Return wholly new [`S3EventRecord`] objects with their [`S3Object`] `url_decoded_key`
 * properly filled in.
 *
 * For whatever reason `aws_lambda_events` does not properly handle this.
 *
 * Records whose keys reference objects under `_delta_log/` (such as checkpoint files) are
 * filtered out of the returned set.
*/
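/// Illustrative sketch (not compiled; the record contents are hypothetical), showing a
/// URL-encoded key being decoded:
///
/// ```ignore
/// // A record whose key is "prefix/c2%3Dfoo0/part-0.parquet" comes back with its
/// // url_decoded_key set to "prefix/c2=foo0/part-0.parquet"
/// let decoded = records_with_url_decoded_keys(&event.records);
/// assert_eq!(
///     Some("prefix/c2=foo0/part-0.parquet".to_string()),
///     decoded[0].s3.object.url_decoded_key
/// );
/// ```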
pub fn records_with_url_decoded_keys(records: &[S3EventRecord]) -> Vec<S3EventRecord> {
use urlencoding::decode;
records
.iter()
.filter(|record| match &record.s3.object.key {
None => true,
Some(key) => !key.contains("_delta_log"),
})
.map(|record| {
let mut replacement = record.clone();
if let Some(key) = &replacement.s3.object.key {
if let Ok(decoded_key) = decode(key) {
replacement.s3.object.url_decoded_key = Some(decoded_key.into_owned());
}
}
replacement
})
.collect()
}
/// Struct to keep track of the table modifications that need to be made based on
/// [S3EventRecord] objects.
#[derive(Debug, Clone, Default)]
pub struct TableMods {
pub adds: Vec<ObjectMeta>,
pub removes: Vec<ObjectMeta>,
}
/**
 * Group the objects from the notification based on the delta tables they belong to.
 *
 * There's a possibility that an S3 bucket notification will have objects mixed in which are
 * destined for different delta tables. Rather than re-opening/loading the table for each object
 * as we iterate the records, we can group them based on the delta table and then create the
 * appropriate transactions.
*/
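/// Illustrative sketch (not compiled; the bucket and prefix names are hypothetical), showing
/// how the grouped modifications are keyed by the inferred table URL:
///
/// ```ignore
/// let groupings = objects_by_table(&records_with_url_decoded_keys(&event.records));
/// if let Some(mods) = groupings.get("s3://example-bucket/some/prefix") {
///     println!("{} adds and {} removes for this table", mods.adds.len(), mods.removes.len());
/// }
/// ```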
pub fn objects_by_table(records: &[S3EventRecord]) -> HashMap<String, TableMods> {
let mut mods = HashMap::new();
for record in records.iter() {
if let Some(bucket) = &record.s3.bucket.name {
            // Assumes the record has already been passed through records_with_url_decoded_keys()
            // so that url_decoded_key is populated
            let decoded_key = record
                .s3
                .object
                .url_decoded_key
                .as_ref()
                .expect("The S3 object should have a url_decoded_key set");
            let log_path = infer_log_path_from(decoded_key);
let om = into_object_meta(&record.s3.object, Some(&log_path));
let key = format!("s3://{}/{}", bucket, log_path);
            let objects = mods.entry(key).or_default();
            if let Some(event_name) = &record.event_name {
                if event_name.starts_with("ObjectCreated") {
                    objects.adds.push(om);
                } else if event_name == "ObjectRemoved:Delete" {
                    objects.removes.push(om);
                }
            }
}
}
mods
}
/**
* Infer the log path from the given object path.
*
* The location of `_delta_log/` can technically be _anywhere_ but for convention's
* sake oxbow will attempt to put the `_delta_log/` some place predictable to ensure that
* `add` actions in the log can use relative file paths for newly added objects
*/
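/// # Example
///
/// A minimal example (assuming the crate is importable as `oxbow_lambda_shared`); the first
/// hive-style partition segment (`key=value`) marks where the table's data files begin, so the
/// prefix before it is returned as the table root:
///
/// ```
/// # use oxbow_lambda_shared::infer_log_path_from;
/// assert_eq!("some/prefix", infer_log_path_from("some/prefix/alpha.parquet"));
/// assert_eq!("some/path", infer_log_path_from("some/path/ds=2023-05-05/beta.parquet"));
/// ```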
pub fn infer_log_path_from(path: &str) -> String {
use std::path::{Component, Path};
let mut root = vec![];
for component in Path::new(path)
.parent()
.expect("Failed to get parent() of path")
.components()
{
if let Component::Normal(os_str) = component {
if let Some(segment) = os_str.to_str() {
/*
* If a segment has what looks like a hive-style partition, bail and call that the root of
* the delta table
*/
                if segment.contains('=') {
break;
}
root.push(segment);
}
}
}
root.join("/")
}
/// A simple structure to make deserializing test events for identification easier
///
/// See <https://github.com/buoyant-data/oxbow/issues/8>
#[derive(serde::Deserialize)]
#[serde(rename_all = "PascalCase")]
struct TestEvent {
event: String,
}
/// Convert the given [aws_lambda_events::sqs::SqsEvent] to a collection of
/// [aws_lambda_events::s3::S3EventRecord] entities. This is mostly useful for handling S3 Bucket
/// Notifications which have been passed into SQS
///
/// In the case where the [aws_lambda_events::sqs::SqsEvent] contains an `s3:TestEvent` which is
/// fired when S3 Bucket Notifications are first enabled, the event will be ignored to avoid
/// errors in the processing pipeline
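///
/// A minimal sketch of the intended flow (not compiled; `sqs_event` is a hypothetical
/// [aws_lambda_events::sqs::SqsEvent] received by a lambda handler):
///
/// ```ignore
/// let records = s3_from_sqs(sqs_event)?;
/// let tables = objects_by_table(&records_with_url_decoded_keys(&records));
/// ```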
pub fn s3_from_sqs(event: SqsEvent) -> DeltaResult<Vec<S3EventRecord>> {
let mut records = vec![];
for record in event.records.iter() {
/* each record is an SqsMessage */
if let Some(body) = &record.body {
match serde_json::from_str::<S3Event>(body) {
Ok(s3event) => {
                    records.extend(s3event.records);
}
Err(err) => {
                    // if we cannot deserialize and the event is an s3:TestEvent, then the
                    // message should simply be ignored
let test_event = serde_json::from_str::<TestEvent>(body);
// Early exit with the original error if we cannot parse the JSON at all
if test_event.is_err() {
return Err(err.into());
}
// Ignore the error on deserialization if the event ends up being an S3
// TestEvent which is fired when bucket notifications are originally configured
if "s3:TestEvent" != test_event.unwrap().event {
return Err(err.into());
}
}
};
}
}
Ok(records)
}
/**
* Convert an [`S3Object`] into an [`ObjectMeta`] for use in the creation of Delta transactions
*
* This is a _lossy_ conversion since the two structs do not share the same set of information,
* therefore this conversion is really only taking the path of the object and the size
*/
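/// Illustrative sketch (not compiled; the values mirror the tests below):
///
/// ```ignore
/// let meta = into_object_meta(&s3object, Some("some/path/to/a/prefix"));
/// assert_eq!(Path::from("alpha.parquet"), meta.location);
/// assert_eq!(1024, meta.size);
/// ```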
fn into_object_meta(s3object: &S3Object, prune_prefix: Option<&str>) -> ObjectMeta {
    let location = s3object.url_decoded_key.clone().unwrap_or_default();
let location = match prune_prefix {
Some(prune) => Path::from(location.strip_prefix(prune).unwrap_or(&location)),
None => Path::from(location),
};
ObjectMeta {
size: s3object.size.unwrap_or(0) as usize,
last_modified: Utc::now(),
e_tag: None,
location,
}
}
#[cfg(test)]
mod tests {
use super::*;
use aws_lambda_events::sqs::SqsMessage;
#[test]
fn infer_log_path_from_object() {
let object = "some/path/to/a/prefix/alpha.parquet";
let expected = "some/path/to/a/prefix";
assert_eq!(expected, infer_log_path_from(object));
}
#[test]
fn s3event_object_to_objectmeta() {
let s3object = S3Object {
key: Some("some/path/to/a/prefix/alpha.parquet".into()),
size: Some(1024),
url_decoded_key: Some("some/path/to/a/prefix/alpha.parquet".into()),
version_id: None,
e_tag: None,
sequencer: None,
};
let expected = deltalake::ObjectMeta {
location: deltalake::Path::from("some/path/to/a/prefix/alpha.parquet"),
last_modified: Utc::now(),
size: 1024,
e_tag: None,
};
let result = into_object_meta(&s3object, None);
assert_eq!(expected.location, result.location);
assert_eq!(expected.size, result.size);
}
#[test]
fn into_object_meta_with_prefix() {
let s3object = S3Object {
key: Some("some/path/to/a/prefix/alpha.parquet".into()),
size: Some(1024),
url_decoded_key: Some("some/path/to/a/prefix/alpha.parquet".into()),
version_id: None,
e_tag: None,
sequencer: None,
};
let expected = deltalake::ObjectMeta {
location: deltalake::Path::from("alpha.parquet"),
last_modified: Utc::now(),
e_tag: None,
size: 1024,
};
let result = into_object_meta(&s3object, Some("some/path/to/a/prefix"));
assert_eq!(expected.location, result.location);
assert_eq!(expected.size, result.size);
}
/**
* It is valid to have a bucket totally dedicated to the delta table such that there is no
* prefix
*/
#[test]
fn infer_log_path_from_object_at_root() {
let object = "some.parquet";
let expected = "";
assert_eq!(expected, infer_log_path_from(object));
}
#[test]
fn infer_log_path_from_hive_partitioned_object() {
let object = "some/path/ds=2023-05-05/site=delta.io/beta.parquet";
let expected = "some/path";
assert_eq!(expected, infer_log_path_from(object));
}
#[test]
fn test_records_with_url_decoded_keys() {
let buf = std::fs::read_to_string("../../tests/data/s3-event-multiple-urlencoded.json")
.expect("Failed to read file");
let event: S3Event = serde_json::from_str(&buf).expect("Failed to parse");
assert_eq!(3, event.records.len());
let records = records_with_url_decoded_keys(&event.records);
assert_eq!(event.records.len(), records.len());
}
#[test]
fn test_records_with_url_decoded_keys_checkpoint_parquets() {
let buf = std::fs::read_to_string("../../tests/data/s3-event-multiple.json")
.expect("Failed to read file");
let event: S3Event = serde_json::from_str(&buf).expect("Failed to parse");
assert_eq!(4, event.records.len());
let records = records_with_url_decoded_keys(&event.records);
        // The checkpoint file should be removed
assert_eq!(3, records.len());
}
/**
 * The keys coming off of the S3Object will be url encoded, and for hive style partitioning
* that needs to be undone.
*
* In theory S3Object does have `url_decoded_key` but in production testing this was always
* None.
*/
#[test]
fn into_object_meta_urlencoded() {
let key = "databases/deltatbl-partitioned/c2%3Dfoo0/part-00000-2bcc9ff6-0551-4401-bd22-d361a60627e3.c000.snappy.parquet";
let s3object = S3Object {
key: Some(key.into()),
size: Some(1024),
url_decoded_key: Some("databases/deltatbl-partitioned/c2=foo0/part-00000-2bcc9ff6-0551-4401-bd22-d361a60627e3.c000.snappy.parquet".into()),
version_id: None,
e_tag: None,
sequencer: None,
};
let expected = deltalake::ObjectMeta {
location: deltalake::Path::from(
"c2=foo0/part-00000-2bcc9ff6-0551-4401-bd22-d361a60627e3.c000.snappy.parquet",
),
last_modified: Utc::now(),
size: 1024,
e_tag: None,
};
let result = into_object_meta(&s3object, Some("databases/deltatbl-partitioned"));
assert_eq!(expected.location, result.location);
assert_eq!(expected.size, result.size);
}
#[test]
fn group_objects_to_tables() {
let buf = std::fs::read_to_string("../../tests/data/s3-event-multiple.json")
.expect("Failed to read file");
let event: S3Event = serde_json::from_str(&buf).expect("Failed to parse");
assert_eq!(4, event.records.len());
let groupings = objects_by_table(&records_with_url_decoded_keys(&event.records));
assert_eq!(2, groupings.keys().len());
let table_one = groupings
.get("s3://example-bucket/some/first-prefix")
.expect("Failed to get the first table");
assert_eq!(
1,
table_one.adds.len(),
"Shoulid only be one object in table one"
);
let table_two = groupings
.get("s3://example-bucket/some/prefix")
.expect("Failed to get the second table");
assert_eq!(
2,
table_two.adds.len(),
"Shoulid only be two objects in table two"
);
}
#[test]
fn test_s3_from_sqs() {
let buf = std::fs::read_to_string("../../tests/data/s3-event-multiple.json")
.expect("Failed to read file");
let message = SqsMessage {
body: Some(buf),
..Default::default()
};
let event = SqsEvent {
records: vec![message],
};
let events = s3_from_sqs(event).expect("Failed to get events");
assert_eq!(4, events.len(), "Unexpected number of entries");
}
#[test]
fn test_s3_from_sqs_with_delete() {
let buf = r#"{"Records":[{"eventVersion":"2.1","eventSource":"aws:s3","awsRegion":"us-west-2","eventTime":"2023-12-18T00:22:24.292Z","eventName":"ObjectRemoved:Delete","userIdentity":{"principalId":"A16S3A764ZBGJN"},"requestParameters":{"sourceIPAddress":"76.218.225.124"},"responseElements":{"x-amz-request-id":"CWK6W9YANZBH6SK4","x-amz-id-2":"H7P6nIKhchv9soZ4pnX0GsAj3zqqdrShFddk4kX9UpSbC2C5FL9XNvNtSxtTD1Nt0ZtTnREeZIMqO1IsSpkebocjUTRJkumh"},"s3":{"s3SchemaVersion":"1.0","configurationId":"test-delete","bucket":{"name":"oxbow-simple","ownerIdentity":{"principalId":"A16S3A764ZBGJN"},"arn":"arn:aws:s3:::oxbow-simple"},"object":{"key":"gcs-export/ds%3D2023-12-12/testing_oxbow-partitioned2_ds%3D2023-12-12_000000000000.parquet","sequencer":"00657F90C047858AE9"}}}]}"#;
let message = SqsMessage {
body: Some(buf.into()),
..Default::default()
};
let event = SqsEvent {
records: vec![message],
};
let events = s3_from_sqs(event).expect("Failed to get events");
assert_eq!(1, events.len(), "Unexpected number of entries");
let records = records_with_url_decoded_keys(&events);
let tables = objects_by_table(records.as_slice());
if let Some(mods) = tables.get("s3://oxbow-simple/gcs-export") {
assert_eq!(
mods.removes.len(),
1,
"Should have recorded a removes table modification"
);
} else {
            panic!("Failed to find the right key on {tables:?}");
}
}
#[test]
fn test_s3_from_sqs_with_invalid() {
let message = SqsMessage {
body: Some("This ain't no valid JSON".into()),
..Default::default()
};
let event = SqsEvent {
records: vec![message],
};
let response = s3_from_sqs(event);
assert!(
response.is_err(),
"Should have returned an error trying to deserialize"
);
}
#[test]
fn test_s3_from_sqs_with_test_event() {
let buf = std::fs::read_to_string("../../tests/data/s3-test-event.json")
.expect("Failed to read file");
let message = SqsMessage {
body: Some(buf),
..Default::default()
};
let event = SqsEvent {
records: vec![message],
};
let response = s3_from_sqs(event);
assert!(
response.is_ok(),
"Should have treated a test event like a no-op: {response:?}"
);
}
}