Checkpointing some work on the lambda side of things, processing inbound events
At this point I'm still spending a lot of time reading through some kafka-delta-ingest and arrow code to figure out exactly how I want to do schema inference and writes.
This commit is contained in:
parent
bf1fb592e3
commit
a795083129
|
@ -112,6 +112,22 @@ version = "1.0.1"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a"
|
checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "aws_lambda_events"
|
||||||
|
version = "0.4.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "663ef110325f68726f4bc51d12c610b9700f70e8bb8a55279cdeca01b12b5491"
|
||||||
|
dependencies = [
|
||||||
|
"base64 0.12.3",
|
||||||
|
"bytes 0.5.6",
|
||||||
|
"chrono",
|
||||||
|
"http",
|
||||||
|
"http-serde",
|
||||||
|
"serde",
|
||||||
|
"serde_derive",
|
||||||
|
"serde_json",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "base-x"
|
name = "base-x"
|
||||||
version = "0.2.8"
|
version = "0.2.8"
|
||||||
|
@ -190,6 +206,15 @@ version = "1.4.3"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610"
|
checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "bytes"
|
||||||
|
version = "0.5.6"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "0e4cec68f03f32e44924783795810fa50a7035d8c8ebe78580ad7e6c703fba38"
|
||||||
|
dependencies = [
|
||||||
|
"serde",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "bytes"
|
name = "bytes"
|
||||||
version = "1.0.1"
|
version = "1.0.1"
|
||||||
|
@ -336,11 +361,15 @@ dependencies = [
|
||||||
name = "delta-s3-loader"
|
name = "delta-s3-loader"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
|
"arrow",
|
||||||
|
"aws_lambda_events",
|
||||||
"deltalake",
|
"deltalake",
|
||||||
"lambda_runtime",
|
"lambda_runtime",
|
||||||
"log",
|
"log",
|
||||||
|
"parquet",
|
||||||
"pretty_env_logger",
|
"pretty_env_logger",
|
||||||
"serde",
|
"serde",
|
||||||
|
"serde_json",
|
||||||
"tokio",
|
"tokio",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
@ -352,7 +381,7 @@ dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"arrow",
|
"arrow",
|
||||||
"async-trait",
|
"async-trait",
|
||||||
"bytes",
|
"bytes 1.0.1",
|
||||||
"cfg-if",
|
"cfg-if",
|
||||||
"chrono",
|
"chrono",
|
||||||
"clap",
|
"clap",
|
||||||
|
@ -705,7 +734,7 @@ version = "0.2.4"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "527e8c9ac747e28542699a951517aa9a6945af506cd1f2e1b53a576c17b6cc11"
|
checksum = "527e8c9ac747e28542699a951517aa9a6945af506cd1f2e1b53a576c17b6cc11"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"bytes",
|
"bytes 1.0.1",
|
||||||
"fnv",
|
"fnv",
|
||||||
"itoa",
|
"itoa",
|
||||||
]
|
]
|
||||||
|
@ -716,11 +745,21 @@ version = "0.4.1"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "5dfb77c123b4e2f72a2069aeae0b4b4949cc7e966df277813fc16347e7549737"
|
checksum = "5dfb77c123b4e2f72a2069aeae0b4b4949cc7e966df277813fc16347e7549737"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"bytes",
|
"bytes 1.0.1",
|
||||||
"http",
|
"http",
|
||||||
"pin-project-lite",
|
"pin-project-lite",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "http-serde"
|
||||||
|
version = "1.0.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "aaa9564ce1decf49edcbd2b8f4f732843b4df64eabb8dcfcf0085ff34dbc76a2"
|
||||||
|
dependencies = [
|
||||||
|
"http",
|
||||||
|
"serde",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "httparse"
|
name = "httparse"
|
||||||
version = "1.3.6"
|
version = "1.3.6"
|
||||||
|
@ -754,7 +793,7 @@ version = "0.14.5"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "8bf09f61b52cfcf4c00de50df88ae423d6c02354e385a86341133b5338630ad1"
|
checksum = "8bf09f61b52cfcf4c00de50df88ae423d6c02354e385a86341133b5338630ad1"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"bytes",
|
"bytes 1.0.1",
|
||||||
"futures-channel",
|
"futures-channel",
|
||||||
"futures-core",
|
"futures-core",
|
||||||
"futures-util",
|
"futures-util",
|
||||||
|
@ -777,7 +816,7 @@ version = "0.5.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "d6183ddfa99b85da61a140bea0efc93fdf56ceaa041b37d553518030827f9905"
|
checksum = "d6183ddfa99b85da61a140bea0efc93fdf56ceaa041b37d553518030827f9905"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"bytes",
|
"bytes 1.0.1",
|
||||||
"hyper",
|
"hyper",
|
||||||
"native-tls",
|
"native-tls",
|
||||||
"tokio",
|
"tokio",
|
||||||
|
@ -833,7 +872,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "f9b08856997d11ca8122121b26b17a27ef1dce689d71ccd754e051f2417aebdd"
|
checksum = "f9b08856997d11ca8122121b26b17a27ef1dce689d71ccd754e051f2417aebdd"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"async-stream",
|
"async-stream",
|
||||||
"bytes",
|
"bytes 1.0.1",
|
||||||
"futures",
|
"futures",
|
||||||
"http",
|
"http",
|
||||||
"hyper",
|
"hyper",
|
||||||
|
@ -1414,7 +1453,7 @@ checksum = "02aff20978970d47630f08de5f0d04799497818d16cafee5aec90c4b4d0806cf"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"async-trait",
|
"async-trait",
|
||||||
"base64 0.13.0",
|
"base64 0.13.0",
|
||||||
"bytes",
|
"bytes 1.0.1",
|
||||||
"crc32fast",
|
"crc32fast",
|
||||||
"futures",
|
"futures",
|
||||||
"http",
|
"http",
|
||||||
|
@ -1456,7 +1495,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "abc3f56f14ccf91f880b9a9c2d0556d8523e8c155041c54db155b384a1dd1119"
|
checksum = "abc3f56f14ccf91f880b9a9c2d0556d8523e8c155041c54db155b384a1dd1119"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"async-trait",
|
"async-trait",
|
||||||
"bytes",
|
"bytes 1.0.1",
|
||||||
"futures",
|
"futures",
|
||||||
"rusoto_core",
|
"rusoto_core",
|
||||||
"xml-rs",
|
"xml-rs",
|
||||||
|
@ -1469,7 +1508,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "5486e6b1673ab3e0ba1ded284fb444845fe1b7f41d13989a54dd60f62a7b2baa"
|
checksum = "5486e6b1673ab3e0ba1ded284fb444845fe1b7f41d13989a54dd60f62a7b2baa"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"base64 0.13.0",
|
"base64 0.13.0",
|
||||||
"bytes",
|
"bytes 1.0.1",
|
||||||
"futures",
|
"futures",
|
||||||
"hex",
|
"hex",
|
||||||
"hmac",
|
"hmac",
|
||||||
|
@ -1494,7 +1533,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "2f93005e0c3b9e40a424b50ca71886d2445cc19bb6cdac3ac84c2daff482eb59"
|
checksum = "2f93005e0c3b9e40a424b50ca71886d2445cc19bb6cdac3ac84c2daff482eb59"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"async-trait",
|
"async-trait",
|
||||||
"bytes",
|
"bytes 1.0.1",
|
||||||
"chrono",
|
"chrono",
|
||||||
"futures",
|
"futures",
|
||||||
"rusoto_core",
|
"rusoto_core",
|
||||||
|
@ -1920,7 +1959,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "83f0c8e7c0addab50b663055baf787d0af7f413a46e6e7fb9559a4e4db7137a5"
|
checksum = "83f0c8e7c0addab50b663055baf787d0af7f413a46e6e7fb9559a4e4db7137a5"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"autocfg",
|
"autocfg",
|
||||||
"bytes",
|
"bytes 1.0.1",
|
||||||
"libc",
|
"libc",
|
||||||
"memchr",
|
"memchr",
|
||||||
"mio",
|
"mio",
|
||||||
|
|
|
@ -5,9 +5,13 @@ authors = ["R. Tyler Croy <rtyler@brokenco.de>"]
|
||||||
edition = "2018"
|
edition = "2018"
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
|
arrow = { git = "https://github.com/apache/arrow.git", rev = "05b36567bd8216bec71b796fe3bb6811c71abbec" }
|
||||||
|
aws_lambda_events = "0.4"
|
||||||
deltalake = { git = "https://github.com/delta-io/delta-rs", branch = "main", features = ["s3"] }
|
deltalake = { git = "https://github.com/delta-io/delta-rs", branch = "main", features = ["s3"] }
|
||||||
lambda_runtime = "0.3"
|
lambda_runtime = "0.3"
|
||||||
log = "0.4"
|
log = "0.4"
|
||||||
|
parquet = { git = "https://github.com/apache/arrow.git", rev = "05b36567bd8216bec71b796fe3bb6811c71abbec" }
|
||||||
pretty_env_logger = "0.4"
|
pretty_env_logger = "0.4"
|
||||||
tokio = { version = "1.0", features = ["macros"]}
|
tokio = { version = "1.0", features = ["macros"]}
|
||||||
serde = { version = "1", features = ["rc", "derive"]}
|
serde = { version = "1", features = ["rc", "derive"]}
|
||||||
|
serde_json = "1"
|
||||||
|
|
108
src/main.rs
108
src/main.rs
|
@ -2,49 +2,97 @@
|
||||||
* The bulk of the application
|
* The bulk of the application
|
||||||
*/
|
*/
|
||||||
|
|
||||||
use deltalake::*;
|
use aws_lambda_events::event::s3::S3Event;
|
||||||
use lambda_runtime::{handler_fn, Context, Error};
|
use lambda_runtime::{handler_fn, Context, Error};
|
||||||
use log::*;
|
use log::*;
|
||||||
use serde::{Deserialize, Serialize};
|
|
||||||
|
|
||||||
/// This is also a made-up example. Requests come into the runtime as unicode
|
mod writer;
|
||||||
/// strings in json format, which can map to any structure that implements `serde::Deserialize`
|
|
||||||
/// The runtime pays no attention to the contents of the request payload.
|
|
||||||
#[derive(Deserialize)]
|
|
||||||
struct Request {
|
|
||||||
command: String,
|
|
||||||
}
|
|
||||||
|
|
||||||
/// This is a made-up example of what a response structure may look like.
|
|
||||||
/// There is no restriction on what it can be. The runtime requires responses
|
|
||||||
/// to be serialized into json. The runtime pays no attention
|
|
||||||
/// to the contents of the response payload.
|
|
||||||
#[derive(Serialize)]
|
|
||||||
struct Response {
|
|
||||||
req_id: String,
|
|
||||||
msg: String,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[tokio::main]
|
#[tokio::main]
|
||||||
async fn main() -> Result<(), Error> {
|
async fn main() -> Result<(), Error> {
|
||||||
pretty_env_logger::init();
|
pretty_env_logger::init();
|
||||||
info!("Initializing delta-s3-loader v{}", env!["CARGO_PKG_VERSION"]);
|
info!("Initializing delta-s3-loader v{}", env!["CARGO_PKG_VERSION"]);
|
||||||
|
|
||||||
let func = handler_fn(my_handler);
|
let func = handler_fn(s3_event_handler);
|
||||||
lambda_runtime::run(func).await?;
|
lambda_runtime::run(func).await?;
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn my_handler(event: Request, ctx: Context) -> Result<Response, Error> {
|
/**
|
||||||
// extract some useful info from the request
|
* The s3_event_handler will be invoked with an S3Event which will need to be iterated upon and
|
||||||
let command = event.command;
|
* each S3EventRecord processed:
|
||||||
|
* <https://docs.aws.amazon.com/lambda/latest/dg/with-s3.html>
|
||||||
|
*/
|
||||||
|
async fn s3_event_handler(event: S3Event, _ctx: Context) -> Result<String, Error> {
|
||||||
|
|
||||||
// prepare the response
|
for record in event.records {
|
||||||
let resp = Response {
|
if let Some(ref name) = record.event_name {
|
||||||
req_id: ctx.request_id,
|
trace!("Processing an event named: {}", name);
|
||||||
msg: format!("Command {} executed.", command),
|
/*
|
||||||
};
|
* The only events that delta-s3-loader is interested in are new PUTs which
|
||||||
|
* indicate a new file must be processed.
|
||||||
|
*/
|
||||||
|
if name == "ObjectCreated:Put" {
|
||||||
|
trace!("Processing record: {:?}", record);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
warn!("Received a record without a name: {:?}", record);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// return `Response` (it will be serialized to JSON automatically by the runtime)
|
// Since this was triggered asynchronously, no need for a real response
|
||||||
Ok(resp)
|
Ok("{}".to_string())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Deserializes a sample S3 notification containing a single
    /// `ObjectCreated:Put` record, hands it to the handler, and checks that
    /// the placeholder `{}` response comes back.
    #[tokio::test]
    async fn test_s3_event_handler() {
        let payload = r#"
            {
              "Records": [
                {
                  "eventVersion": "2.1",
                  "eventSource": "aws:s3",
                  "awsRegion": "us-east-2",
                  "eventTime": "2019-09-03T19:37:27.192Z",
                  "eventName": "ObjectCreated:Put",
                  "userIdentity": {
                    "principalId": "AWS:AIDAINPONIXQXHT3IKHL2"
                  },
                  "requestParameters": {
                    "sourceIPAddress": "205.255.255.255"
                  },
                  "responseElements": {
                    "x-amz-request-id": "D82B88E5F771F645",
                    "x-amz-id-2": "vlR7PnpV2Ce81l0PRw6jlUpck7Jo5ZsQjryTjKlc5aLWGVHPZLj5NeC6qMa0emYBDXOo6QBU0Wo="
                  },
                  "s3": {
                    "s3SchemaVersion": "1.0",
                    "configurationId": "828aa6fc-f7b5-4305-8584-487c791949c1",
                    "bucket": {
                      "name": "lambda-artifacts-deafc19498e3f2df",
                      "ownerIdentity": {
                        "principalId": "A3I5XTEXAMAI3E"
                      },
                      "arn": "arn:aws:s3:::lambda-artifacts-deafc19498e3f2df"
                    },
                    "object": {
                      "key": "b21b84d653bb07b05b1e6b33684dc11b",
                      "size": 1305107,
                      "eTag": "b21b84d653bb07b05b1e6b33684dc11b",
                      "sequencer": "0C0F6F405D6ED209E1"
                    }
                  }
                }
              ]
            }"#;

        // Parse the fixture into the strongly typed event the handler expects.
        let parsed: S3Event =
            serde_json::from_str(payload).expect("Failed to deserialize event");

        // Drive the handler to completion with a default invocation context.
        let invocation = s3_event_handler(parsed, Context::default());
        let response = invocation.await.expect("Failed to run event handler");

        assert_eq!("{}", response);
    }
}
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,56 @@
|
||||||
|
/**
|
||||||
|
* The writer module contains the important code for actually writing to a Delta Lake table
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
use arrow::record_batch::RecordBatch;
|
||||||
|
use serde_json::Value;
|
||||||
|
|
||||||
|
/// Errors that can occur in the writer module.
///
/// `Debug`, `Display` and `std::error::Error` are implemented so the type can
/// sit on the `Err` side of a `Result` and still work with `expect`, `{:?}`
/// formatting, `?`, and boxed-error return types.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum WriterError {
    /// Catch-all failure that carries no further detail.
    Generic,
}

impl std::fmt::Display for WriterError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Exhaustive match on purpose: adding a variant forces a message to be written.
        match self {
            WriterError::Generic => f.write_str("generic writer error"),
        }
    }
}

impl std::error::Error for WriterError {}
|
||||||
|
|
||||||
|
fn json_to_batch(json: Vec<Value>) { //-> Result<RecordBatch, WriterError> {
|
||||||
|
use arrow::json::reader::*;
|
||||||
|
|
||||||
|
// infer_json_schema_from_iterator is weird in that it expects each value to be wrapped in a
|
||||||
|
// Result
|
||||||
|
let schema = infer_json_schema_from_iterator(
|
||||||
|
json.into_iter().map(|v| Ok(v)));
|
||||||
|
|
||||||
|
println!("schema: {:#?}", schema);
|
||||||
|
|
||||||
|
//Err(WriterError::Generic)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Ignored placeholder that only constructs a storage backend for `./data`.
    #[ignore]
    #[test]
    fn demo() {
        // Underscore-prefixed: the backend is built but not used yet.
        let _delta = deltalake::get_backend_for_uri("./data");
    }

    /// Feeds two flat JSON objects through `json_to_batch`; the test passes
    /// as long as the call does not panic.
    #[test]
    fn json_to_arrow_success() {
        let value: Vec<serde_json::Value> = serde_json::from_str(
            r#"
            [
                {
                    "action" : "commit",
                    "actor" : "rtyler"
                },
                {
                    "action" : "update",
                    "actor" : "rtyler"
                }
            ]
            "#,
        )
        .expect("Failed to create JSON");

        // `json_to_batch` returns nothing yet, so there is no value to assert on.
        // The previous unconditional `assert!(false)` made this test fail on every
        // run and has been removed.
        // TODO: assert `result.is_ok()` once the function returns a `Result`.
        json_to_batch(value);
    }
}
|
Loading…
Reference in New Issue