Sketching out the design and implementation of Delta S3 Loader
There is still more to think about but this is a reasonable checkpoint to commit and push for others to look at.
This commit is contained in:
parent
88264d83c9
commit
c20997b197
|
@ -30,6 +30,15 @@ dependencies = [
|
||||||
"alloc-no-stdlib",
|
"alloc-no-stdlib",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "ansi_term"
|
||||||
|
version = "0.11.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "ee49baf6cb617b853aa8d93bf420db2383fab46d314482ca2803b40d5fde979b"
|
||||||
|
dependencies = [
|
||||||
|
"winapi",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "anyhow"
|
name = "anyhow"
|
||||||
version = "1.0.40"
|
version = "1.0.40"
|
||||||
|
@ -257,6 +266,21 @@ dependencies = [
|
||||||
"winapi",
|
"winapi",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "clap"
|
||||||
|
version = "2.33.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "37e58ac78573c40708d45522f0d80fa2f01cc4f9b4e2bf749807255454312002"
|
||||||
|
dependencies = [
|
||||||
|
"ansi_term",
|
||||||
|
"atty",
|
||||||
|
"bitflags",
|
||||||
|
"strsim 0.8.0",
|
||||||
|
"textwrap 0.11.0",
|
||||||
|
"unicode-width",
|
||||||
|
"vec_map",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "clap"
|
name = "clap"
|
||||||
version = "3.0.0-beta.2"
|
version = "3.0.0-beta.2"
|
||||||
|
@ -269,9 +293,9 @@ dependencies = [
|
||||||
"indexmap",
|
"indexmap",
|
||||||
"lazy_static",
|
"lazy_static",
|
||||||
"os_str_bytes",
|
"os_str_bytes",
|
||||||
"strsim",
|
"strsim 0.10.0",
|
||||||
"termcolor",
|
"termcolor",
|
||||||
"textwrap",
|
"textwrap 0.12.1",
|
||||||
"unicode-width",
|
"unicode-width",
|
||||||
"vec_map",
|
"vec_map",
|
||||||
]
|
]
|
||||||
|
@ -364,12 +388,16 @@ version = "0.1.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"arrow",
|
"arrow",
|
||||||
"aws_lambda_events",
|
"aws_lambda_events",
|
||||||
|
"clap 2.33.3",
|
||||||
"deltalake",
|
"deltalake",
|
||||||
"lambda_runtime",
|
"lambda_runtime",
|
||||||
"log",
|
"log",
|
||||||
"parquet",
|
"parquet",
|
||||||
"pretty_env_logger",
|
"pretty_env_logger",
|
||||||
"regex",
|
"regex",
|
||||||
|
"rusoto_core",
|
||||||
|
"rusoto_credential",
|
||||||
|
"rusoto_s3",
|
||||||
"serde",
|
"serde",
|
||||||
"serde_json",
|
"serde_json",
|
||||||
"serde_regex",
|
"serde_regex",
|
||||||
|
@ -388,7 +416,7 @@ dependencies = [
|
||||||
"bytes 1.0.1",
|
"bytes 1.0.1",
|
||||||
"cfg-if",
|
"cfg-if",
|
||||||
"chrono",
|
"chrono",
|
||||||
"clap",
|
"clap 3.0.0-beta.2",
|
||||||
"env_logger 0.8.3",
|
"env_logger 0.8.3",
|
||||||
"errno",
|
"errno",
|
||||||
"futures",
|
"futures",
|
||||||
|
@ -1817,6 +1845,12 @@ version = "0.1.5"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "213701ba3370744dcd1a12960caa4843b3d68b4d1c0a5d575e0d65b2ee9d16c0"
|
checksum = "213701ba3370744dcd1a12960caa4843b3d68b4d1c0a5d575e0d65b2ee9d16c0"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "strsim"
|
||||||
|
version = "0.8.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "strsim"
|
name = "strsim"
|
||||||
version = "0.10.0"
|
version = "0.10.0"
|
||||||
|
@ -1863,6 +1897,15 @@ dependencies = [
|
||||||
"winapi-util",
|
"winapi-util",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "textwrap"
|
||||||
|
version = "0.11.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060"
|
||||||
|
dependencies = [
|
||||||
|
"unicode-width",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "textwrap"
|
name = "textwrap"
|
||||||
version = "0.12.1"
|
version = "0.12.1"
|
||||||
|
|
|
@ -6,11 +6,15 @@ edition = "2018"
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
arrow = "4"
|
arrow = "4"
|
||||||
|
clap = "2"
|
||||||
deltalake = { git = "https://github.com/rtyler/delta-rs", branch = "mixing-in-deltalake_ext", features = ["s3"] }
|
deltalake = { git = "https://github.com/rtyler/delta-rs", branch = "mixing-in-deltalake_ext", features = ["s3"] }
|
||||||
log = "0.4"
|
log = "0.4"
|
||||||
parquet = "4"
|
parquet = "4"
|
||||||
pretty_env_logger = "0.4"
|
pretty_env_logger = "0.4"
|
||||||
regex = "1"
|
regex = "1"
|
||||||
|
rusoto_core = "*"
|
||||||
|
rusoto_credential = "*"
|
||||||
|
rusoto_s3 = "*"
|
||||||
serde = { version = "1", features = ["rc", "derive"]}
|
serde = { version = "1", features = ["rc", "derive"]}
|
||||||
serde_json = "1"
|
serde_json = "1"
|
||||||
serde_yaml = "0.8"
|
serde_yaml = "0.8"
|
||||||
|
|
126
README.adoc
126
README.adoc
|
@ -1,13 +1,133 @@
|
||||||
= Delta S3 Loader
|
ifdef::env-github[]
|
||||||
|
:tip-caption: :bulb:
|
||||||
|
:note-caption: :information_source:
|
||||||
|
:important-caption: :heavy_exclamation_mark:
|
||||||
|
:caution-caption: :fire:
|
||||||
|
:warning-caption: :warning:
|
||||||
|
endif::[]
|
||||||
|
|
||||||
|
:toc: macro
|
||||||
|
|
||||||
|
= Delta S3 Loader
|
||||||
|
|
||||||
Delta S3 Loader is a project to quickly and cheaply bring JSON files added to
|
Delta S3 Loader is a project to quickly and cheaply bring JSON files added to
|
||||||
S3 buckets into Delta Lake. This can be highly useful for legacy or external
|
S3 buckets into Delta Lake. This can be highly useful for legacy or external
|
||||||
processes which rely on uploading JSON to an S3 bucket and cannot be properly
|
processes which rely on uploading JSON to an S3 bucket and cannot be properly
|
||||||
updated to write directly to link:https://delta.io[Delta Lake].
|
updated to write directly to link:https://delta.io[Delta Lake].
|
||||||
|
|
||||||
Delta S3 Loader can be built into a stnadalone binary or an
|
toc::[]
|
||||||
link:https://aws.amazon.com/lambda/[AWS Lambda].
|
|
||||||
|
== Modes
|
||||||
|
|
||||||
|
Delta S3 Loader can be built into a standalone binary or an
|
||||||
|
link:https://aws.amazon.com/lambda/[AWS Lambda]. While both modes are
|
||||||
|
functionally identical they have different configuration requirements as
|
||||||
|
mentioned below.
|
||||||
|
|
||||||
|
=== Standalone
|
||||||
|
|
||||||
|
A standalone instance of Delta S3 Loader requires:
|
||||||
|
|
||||||
|
* Destination Delta table path
|
||||||
|
* SQS queue ARN
|
||||||
|
|
||||||
|
=== Lambda
|
||||||
|
|
||||||
|
When deployed with AWS Lambda, the Lambda function should be configured with an
|
||||||
|
AWS SQS trigger. This causes AWS to manage the queue on behalf of Delta S3
|
||||||
|
Loader. Learn more in the
|
||||||
|
link:https://docs.aws.amazon.com/AWSSimpleQueueService/latest/SQSDeveloperGuide/sqs-configure-lambda-function-trigger.html[AWS
|
||||||
|
Lambda trigger documentation].
|
||||||
|
|
||||||
|
== Design
|
||||||
|
|
||||||
|
This project is designed to work in a Lambda or packaged up into a container.
|
||||||
|
It relies on
|
||||||
|
link:https://docs.aws.amazon.com/AmazonS3/latest/userguide/NotificationHowTo.html[S3
|
||||||
|
Event Notifications] which are delivered into an
|
||||||
|
link:http://aws.amazon.com/sqs/[Amazon SQS] queue. The S3 Event Notifications
|
||||||
|
should be configured to funnel events to a single SQS queue per table. The
|
||||||
|
Delta S3 Loader will take **all** messages from a single queue and insert those
|
||||||
|
into a single table.
|
||||||
|
|
||||||
|
Additionally, for source buckets which have _multiple_ types of data in them you may use link:https://docs.aws.amazon.com/AmazonS3/latest/userguide/notification-how-to-filtering.html[filtering on event notifications] to specify different object prefixes, etc.
|
||||||
|
For example, in a bucket named `audit_logs` bucket that has data prefixed with:
|
||||||
|
|
||||||
|
* `databricks/workspaceId=123/*.json`
|
||||||
|
* `tableau/*.json`
|
||||||
|
* `admin_console/domain=github.com/*.json`
|
||||||
|
|
||||||
|
A deployment of Delta S3 Loader to _only_ process the `admin_console` events
|
||||||
|
into an Delta table would require the following event configuration:
|
||||||
|
|
||||||
|
.SQS Event Configuration
|
||||||
|
[source,xml]
|
||||||
|
----
|
||||||
|
<NotificationConfiguration>
|
||||||
|
<QueueConfiguration>
|
||||||
|
<Id>1</Id>
|
||||||
|
<Filter>
|
||||||
|
<S3Key>
|
||||||
|
<FilterRule>
|
||||||
|
<Name>prefix</Name>
|
||||||
|
<Value>admin_console/</Value>
|
||||||
|
</FilterRule>
|
||||||
|
<FilterRule>
|
||||||
|
<Name>suffix</Name>
|
||||||
|
<Value>json</Value>
|
||||||
|
</FilterRule>
|
||||||
|
</S3Key>
|
||||||
|
</Filter>
|
||||||
|
<Queue>arn:aws:sqs:us-west-2:444455556666:admin_console_audit_queue</Queue>
|
||||||
|
<Event>s3:ObjectCreated:Put</Event>
|
||||||
|
</QueueConfiguration>
|
||||||
|
</NotificationConfiguration>
|
||||||
|
----
|
||||||
|
|
||||||
|
[CAUTION]
|
||||||
|
====
|
||||||
|
Always use different source and destination S3 buckets to avoid infinite loops!
|
||||||
|
====
|
||||||
|
|
||||||
|
|
||||||
|
A standalone Delta S3 Loader invocation for the above queue might look something like:
|
||||||
|
|
||||||
|
|
||||||
|
[source,bash]
|
||||||
|
----
|
||||||
|
delta-s3-loader -t s3://warehouse/audit_logs_raw/databricks \ # <1>
|
||||||
|
-p domain \ # <2>
|
||||||
|
-q "arn:aws:sqs:us-west-2:444455556666:admin_console_audit_queue" # <3>
|
||||||
|
----
|
||||||
|
<1> Specify a destination Delta Lake table path in S3.
|
||||||
|
<2> Annotate the partition columns to help the loader partition data properly.
|
||||||
|
<3> Specify the input SQS queue by ARN
|
||||||
|
|
||||||
|
|
||||||
|
== Environment Variables
|
||||||
|
|
||||||
|
When running in an AWS Lambda, Delta S3 Loader should be configured solely with environment variables. In a standalone mode the daemon can be configured with command line options _or_ environment variables
|
||||||
|
|
||||||
|
|
||||||
|
|===
|
||||||
|
| Name | Required | Description
|
||||||
|
|
||||||
|
| `RUST_LOG`
|
||||||
|
| No
|
||||||
|
| Define the log level for the process: `error`, `warn`, `info`, `debug`.
|
||||||
|
|
||||||
|
|===
|
||||||
|
|
||||||
|
|
||||||
|
=== Authentication/Authorization
|
||||||
|
|
||||||
|
Delta S3 Loader assumes that the right AWS environment variables, such as
|
||||||
|
`AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` are defined in the environment.
|
||||||
|
Under the hood the Delta S3 Loader is not responsible for
|
||||||
|
authentication/authorization so please consult the
|
||||||
|
link:https://github.com/rusoto/rusoto/blob/master/AWS-CREDENTIALS.md[Rusoto AWS
|
||||||
|
credentials documentation] for more information.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
== Related work
|
== Related work
|
||||||
|
|
22
config.yml
22
config.yml
|
@ -1,22 +0,0 @@
|
||||||
# This is an example configuration for the delta-s3-loader lambda
|
|
||||||
---
|
|
||||||
sources:
|
|
||||||
# This entry maps the existing S3 structure into a delta table named
|
|
||||||
# `logs.audit_logs` which will be date stamp partitioned
|
|
||||||
#
|
|
||||||
# └── simple
|
|
||||||
# └── date=2021-04-16
|
|
||||||
# └── auditlog-1234.json
|
|
||||||
# The name of the bucket should _just_ be a name of the bucket since this
|
|
||||||
# will come from an AWS Lambda event
|
|
||||||
- bucket: 'raw'
|
|
||||||
# Everything in the simple/ prefix will be considered part of this table
|
|
||||||
# the prefix may be a regular expression that is compatible with the Rust
|
|
||||||
# regex crate
|
|
||||||
prefix: '^simple'
|
|
||||||
# When specifying partitions, the source data is expected to be parttioned,
|
|
||||||
# as it is in this example
|
|
||||||
partitions:
|
|
||||||
- 'date'
|
|
||||||
# The table path is the destination for the written delta
|
|
||||||
tablepath: 's3://delta/audit_logs'
|
|
|
@ -1,57 +0,0 @@
|
||||||
/**
|
|
||||||
* The config module is largely responsible for just reading a configuration yaml file in.
|
|
||||||
*/
|
|
||||||
use regex::Regex;
|
|
||||||
use serde::Deserialize;
|
|
||||||
use std::path::PathBuf;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Root configuration
|
|
||||||
*/
|
|
||||||
#[derive(Clone, Debug, Deserialize)]
|
|
||||||
pub struct Config {
|
|
||||||
pub sources: Vec<Source>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Config {
|
|
||||||
pub fn new(sources: Vec<Source>) -> Config {
|
|
||||||
Config { sources }
|
|
||||||
}
|
|
||||||
/**
|
|
||||||
* from_file will attempt to load a configuration from the given path
|
|
||||||
*
|
|
||||||
* The path should be to a properly formatted YAML file:
|
|
||||||
*/
|
|
||||||
pub fn from_file(path: &PathBuf) -> Result<Config, std::io::Error> {
|
|
||||||
use std::fs::File;
|
|
||||||
use std::io::{Error, ErrorKind};
|
|
||||||
|
|
||||||
let reader = File::open(path)?;
|
|
||||||
|
|
||||||
serde_yaml::from_reader(reader).map_err(|e| Error::new(ErrorKind::Other, e))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* A source maps an existing S3 structure into a named Delta table
|
|
||||||
*/
|
|
||||||
#[derive(Clone, Debug, Deserialize)]
|
|
||||||
pub struct Source {
|
|
||||||
pub bucket: String,
|
|
||||||
#[serde(with = "serde_regex")]
|
|
||||||
pub prefix: Regex,
|
|
||||||
pub partitions: Vec<String>,
|
|
||||||
pub tablepath: String,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[cfg(test)]
|
|
||||||
mod tests {
|
|
||||||
use super::*;
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_load_file() -> Result<(), std::io::Error> {
|
|
||||||
let config = Config::from_file(&PathBuf::from("./config.yml"))?;
|
|
||||||
assert_eq!(config.sources.len(), 1);
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
}
|
|
39
src/main.rs
39
src/main.rs
|
@ -1,14 +1,12 @@
|
||||||
/*
|
/*
|
||||||
* The bulk of the application
|
* The bulk of the application
|
||||||
*/
|
*/
|
||||||
|
use clap::{App, Arg};
|
||||||
use log::*;
|
use log::*;
|
||||||
use std::path::PathBuf;
|
|
||||||
|
|
||||||
mod config;
|
|
||||||
mod writer;
|
|
||||||
#[cfg(lambda)]
|
#[cfg(lambda)]
|
||||||
mod lambda;
|
mod lambda;
|
||||||
|
mod writer;
|
||||||
|
|
||||||
fn preboot() {
|
fn preboot() {
|
||||||
#[cfg(debug_assertions)]
|
#[cfg(debug_assertions)]
|
||||||
|
@ -18,6 +16,38 @@ fn preboot() {
|
||||||
"Initializing delta-s3-loader v{}",
|
"Initializing delta-s3-loader v{}",
|
||||||
env!["CARGO_PKG_VERSION"]
|
env!["CARGO_PKG_VERSION"]
|
||||||
);
|
);
|
||||||
|
let _matches = App::new("delta-s3-loader")
|
||||||
|
.version(env!["CARGO_PKG_VERSION"])
|
||||||
|
.author("rtyler@brokenco.de")
|
||||||
|
.about("Watch S3 buckets for new data to load into a Delta Table")
|
||||||
|
.arg(
|
||||||
|
Arg::with_name("table")
|
||||||
|
.required(true)
|
||||||
|
.short("t")
|
||||||
|
.long("table")
|
||||||
|
.env("TABLE_PATH")
|
||||||
|
.value_name("TABLE_PATH")
|
||||||
|
.help("Sets the destination Delta Table for the ingested data")
|
||||||
|
.takes_value(true),
|
||||||
|
)
|
||||||
|
.arg(
|
||||||
|
Arg::with_name("partitions")
|
||||||
|
.short("p")
|
||||||
|
.long("partitions")
|
||||||
|
.env("CSV_PARTITIONS")
|
||||||
|
.value_name("CSV_PARTITIONS")
|
||||||
|
.help("Ordered list of partition from the source data, e.g. year,month")
|
||||||
|
.takes_value(true),
|
||||||
|
)
|
||||||
|
.arg(
|
||||||
|
Arg::with_name("SQS queue ARN")
|
||||||
|
.short("q")
|
||||||
|
.long("queue")
|
||||||
|
.value_name("QUEUE_ARN")
|
||||||
|
.help("ARN of the SQS queue to consume, *required* in standalone mode")
|
||||||
|
.takes_value(true),
|
||||||
|
)
|
||||||
|
.get_matches();
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(not(lambda))]
|
#[cfg(not(lambda))]
|
||||||
|
@ -25,7 +55,6 @@ fn main() {
|
||||||
preboot();
|
preboot();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
#[cfg(lambda)]
|
#[cfg(lambda)]
|
||||||
#[tokio::main]
|
#[tokio::main]
|
||||||
async fn main() -> Result<(), lambda_runtime::Error> {
|
async fn main() -> Result<(), lambda_runtime::Error> {
|
||||||
|
|
Loading…
Reference in New Issue