Update the README for some details about using the function

This commit is contained in:
R Tyler Croy 2023-03-26 22:45:09 -07:00
parent 65d34abb9c
commit 939989c0a2
3 changed files with 92 additions and 119 deletions

153
Cargo.lock generated
View File

@ -56,8 +56,9 @@ dependencies = [
[[package]]
name = "arrow"
version = "36.0.0"
source = "git+https://github.com/apache/arrow-rs?branch=master#9bd2bae586ed5b0edfd699f89a0855d79f61b611"
version = "33.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f3724c874f1517cf898cd1c3ad18ab5071edf893c48e73139ab1e16cf0f2affe"
dependencies = [
"ahash",
"arrow-arith",
@ -77,8 +78,9 @@ dependencies = [
[[package]]
name = "arrow-arith"
version = "36.0.0"
source = "git+https://github.com/apache/arrow-rs?branch=master#9bd2bae586ed5b0edfd699f89a0855d79f61b611"
version = "33.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e958823b8383ca14d0a2e973de478dd7674cd9f72837f8c41c132a0fda6a4e5e"
dependencies = [
"arrow-array",
"arrow-buffer",
@ -91,8 +93,9 @@ dependencies = [
[[package]]
name = "arrow-array"
version = "36.0.0"
source = "git+https://github.com/apache/arrow-rs?branch=master#9bd2bae586ed5b0edfd699f89a0855d79f61b611"
version = "33.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "db670eab50e76654065b5aed930f4367101fcddcb2223802007d1e0b4d5a2579"
dependencies = [
"ahash",
"arrow-buffer",
@ -106,8 +109,9 @@ dependencies = [
[[package]]
name = "arrow-buffer"
version = "36.0.0"
source = "git+https://github.com/apache/arrow-rs?branch=master#9bd2bae586ed5b0edfd699f89a0855d79f61b611"
version = "33.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9f0e01c931882448c0407bd32311a624b9f099739e94e786af68adc97016b5f2"
dependencies = [
"half",
"num",
@ -115,8 +119,9 @@ dependencies = [
[[package]]
name = "arrow-cast"
version = "36.0.0"
source = "git+https://github.com/apache/arrow-rs?branch=master#9bd2bae586ed5b0edfd699f89a0855d79f61b611"
version = "33.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4bf35d78836c93f80d9362f3ccb47ff5e2c5ecfc270ff42cdf1ef80334961d44"
dependencies = [
"arrow-array",
"arrow-buffer",
@ -130,8 +135,9 @@ dependencies = [
[[package]]
name = "arrow-csv"
version = "36.0.0"
source = "git+https://github.com/apache/arrow-rs?branch=master#9bd2bae586ed5b0edfd699f89a0855d79f61b611"
version = "33.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0a6aa7c2531d89d01fed8c469a9b1bf97132a0bdf70b4724fe4bbb4537a50880"
dependencies = [
"arrow-array",
"arrow-buffer",
@ -148,8 +154,9 @@ dependencies = [
[[package]]
name = "arrow-data"
version = "36.0.0"
source = "git+https://github.com/apache/arrow-rs?branch=master#9bd2bae586ed5b0edfd699f89a0855d79f61b611"
version = "33.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ea50db4d1e1e4c2da2bfdea7b6d2722eef64267d5ab680d815f7ae42428057f5"
dependencies = [
"arrow-buffer",
"arrow-schema",
@ -159,8 +166,9 @@ dependencies = [
[[package]]
name = "arrow-ipc"
version = "36.0.0"
source = "git+https://github.com/apache/arrow-rs?branch=master#9bd2bae586ed5b0edfd699f89a0855d79f61b611"
version = "33.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a4042fe6585155d1ec28a8e4937ec901a3ca7a19a22b9f6cd3f551b935cd84f5"
dependencies = [
"arrow-array",
"arrow-buffer",
@ -172,8 +180,9 @@ dependencies = [
[[package]]
name = "arrow-json"
version = "36.0.0"
source = "git+https://github.com/apache/arrow-rs?branch=master#9bd2bae586ed5b0edfd699f89a0855d79f61b611"
version = "33.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7c907c4ab4f26970a3719dc06e78e8054a01d0c96da3664d23b941e201b33d2b"
dependencies = [
"arrow-array",
"arrow-buffer",
@ -190,22 +199,23 @@ dependencies = [
[[package]]
name = "arrow-ord"
version = "36.0.0"
source = "git+https://github.com/apache/arrow-rs?branch=master#9bd2bae586ed5b0edfd699f89a0855d79f61b611"
version = "33.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e131b447242a32129efc7932f58ed8931b42f35d8701c1a08f9f524da13b1d3c"
dependencies = [
"arrow-array",
"arrow-buffer",
"arrow-data",
"arrow-schema",
"arrow-select",
"half",
"num",
]
[[package]]
name = "arrow-row"
version = "36.0.0"
source = "git+https://github.com/apache/arrow-rs?branch=master#9bd2bae586ed5b0edfd699f89a0855d79f61b611"
version = "33.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b591ef70d76f4ac28dd7666093295fece0e5f9298f49af51ea49c001e1635bb6"
dependencies = [
"ahash",
"arrow-array",
@ -218,13 +228,15 @@ dependencies = [
[[package]]
name = "arrow-schema"
version = "36.0.0"
source = "git+https://github.com/apache/arrow-rs?branch=master#9bd2bae586ed5b0edfd699f89a0855d79f61b611"
version = "33.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eb327717d87eb94be5eff3b0cb8987f54059d343ee5235abf7f143c85f54cfc8"
[[package]]
name = "arrow-select"
version = "36.0.0"
source = "git+https://github.com/apache/arrow-rs?branch=master#9bd2bae586ed5b0edfd699f89a0855d79f61b611"
version = "33.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "79d3c389d1cea86793934f31594f914c8547d82e91e3411d4833ad0aac3266a7"
dependencies = [
"arrow-array",
"arrow-buffer",
@ -235,8 +247,9 @@ dependencies = [
[[package]]
name = "arrow-string"
version = "36.0.0"
source = "git+https://github.com/apache/arrow-rs?branch=master#9bd2bae586ed5b0edfd699f89a0855d79f61b611"
version = "33.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "30ee67790496dd310ddbf5096870324431e89aa76453e010020ac29b1184d356"
dependencies = [
"arrow-array",
"arrow-buffer",
@ -933,7 +946,7 @@ dependencies = [
[[package]]
name = "deltalake"
version = "0.8.0"
source = "git+https://github.com/rtyler/delta-rs?branch=testing-map-types-with-partitions#ddf70f11f7258435bf8406126668512929b4b534"
source = "git+https://github.com/delta-io/delta-rs?branch=main#0277e092b8fae5f4c536ea65df6593999850b079"
dependencies = [
"arrow",
"async-trait",
@ -941,7 +954,7 @@ dependencies = [
"cfg-if",
"chrono",
"dynamodb_lock",
"errno 0.3.0",
"errno",
"futures",
"glibc_version",
"itertools",
@ -1018,7 +1031,7 @@ checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10"
[[package]]
name = "dynamodb_lock"
version = "0.4.3"
source = "git+https://github.com/rtyler/delta-rs?branch=testing-map-types-with-partitions#ddf70f11f7258435bf8406126668512929b4b534"
source = "git+https://github.com/delta-io/delta-rs?branch=main#0277e092b8fae5f4c536ea65df6593999850b079"
dependencies = [
"async-trait",
"log",
@ -1058,17 +1071,6 @@ dependencies = [
"termcolor",
]
[[package]]
name = "errno"
version = "0.2.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f639046355ee4f37944e44f60642c6f3a7efa3cf6b78c78a0d989a8ce6c396a1"
dependencies = [
"errno-dragonfly",
"libc",
"winapi",
]
[[package]]
name = "errno"
version = "0.3.0"
@ -1247,7 +1249,7 @@ dependencies = [
[[package]]
name = "glibc_version"
version = "0.1.2"
source = "git+https://github.com/rtyler/delta-rs?branch=testing-map-types-with-partitions#ddf70f11f7258435bf8406126668512929b4b534"
source = "git+https://github.com/delta-io/delta-rs?branch=main#0277e092b8fae5f4c536ea65df6593999850b079"
dependencies = [
"regex",
]
@ -1317,12 +1319,6 @@ dependencies = [
"libc",
]
[[package]]
name = "hermit-abi"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fed44880c466736ef9a5c5b5facefb5ed0785676d0c02d612db14e54f0d84286"
[[package]]
name = "hex"
version = "0.4.3"
@ -1523,17 +1519,6 @@ version = "3.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02"
[[package]]
name = "io-lifetimes"
version = "1.0.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09270fd4fa1111bc614ed2246c7ef56239a3063d5be0d1ec3b589c505d400aeb"
dependencies = [
"hermit-abi 0.3.1",
"libc",
"windows-sys 0.45.0",
]
[[package]]
name = "ipnet"
version = "2.7.1"
@ -1587,7 +1572,6 @@ dependencies = [
"rusoto_s3",
"serde",
"serde_json",
"tempfile",
"tokio",
]
@ -1714,12 +1698,6 @@ dependencies = [
"cc",
]
[[package]]
name = "linux-raw-sys"
version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f051f77a7c8e6957c0696eac88f26b0117e54f52d3fc682ab19397a8812846a4"
[[package]]
name = "lock_api"
version = "0.4.9"
@ -1899,7 +1877,8 @@ dependencies = [
[[package]]
name = "object_store"
version = "0.5.5"
source = "git+https://github.com/apache/arrow-rs?branch=master#9bd2bae586ed5b0edfd699f89a0855d79f61b611"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e1ea8f683b4f89a64181393742c041520a1a87e9775e6b4c0dd5a3281af05fc6"
dependencies = [
"async-trait",
"aws-config",
@ -1983,8 +1962,9 @@ dependencies = [
[[package]]
name = "parquet"
version = "36.0.0"
source = "git+https://github.com/apache/arrow-rs?branch=master#9bd2bae586ed5b0edfd699f89a0855d79f61b611"
version = "33.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b1b076829801167d889795cd1957989055543430fa1469cb1f6e32b789bfc764"
dependencies = [
"ahash",
"arrow-array",
@ -2114,9 +2094,9 @@ checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0"
[[package]]
name = "quick-xml"
version = "0.28.1"
version = "0.27.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e5c1a97b1bc42b1d550bfb48d4262153fe400a12bab1511821736f7eac76d7e2"
checksum = "ffc053f057dd768a56f62cd7e434c42c831d296968997e9ac1f76ea7c2d14c41"
dependencies = [
"memchr",
"serde",
@ -2443,20 +2423,6 @@ dependencies = [
"semver",
]
[[package]]
name = "rustix"
version = "0.36.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "db4165c9963ab29e422d6c26fbc1d37f15bace6b2810221f9d925023480fcf0e"
dependencies = [
"bitflags",
"errno 0.2.8",
"io-lifetimes",
"libc",
"linux-raw-sys",
"windows-sys 0.45.0",
]
[[package]]
name = "rustls"
version = "0.19.1"
@ -2815,19 +2781,6 @@ dependencies = [
"unicode-ident",
]
[[package]]
name = "tempfile"
version = "3.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "af18f7ae1acd354b992402e9ec5864359d693cd8a79dcbef59f76891701c1e95"
dependencies = [
"cfg-if",
"fastrand",
"redox_syscall",
"rustix",
"windows-sys 0.42.0",
]
[[package]]
name = "termcolor"
version = "1.2.0"

View File

@ -1,11 +1,12 @@
[package]
name = "lambda-delta-optimize"
version = "0.1.0"
# Cargo's manifest field is `authors` (an array of strings); a singular
# `author` key is not recognized and is reported as an unused manifest key.
authors = ["R Tyler Croy <rtyler@brokenco.de>"]
edition = "2021"
[dependencies]
aws_lambda_events = { version = "0.7" }
deltalake = { git = "https://github.com/rtyler/delta-rs", branch = "testing-map-types-with-partitions", features = ["arrow", "parquet", "s3"]}
deltalake = { git = "https://github.com/delta-io/delta-rs", branch = "main", features = ["arrow", "parquet", "s3"]}
lambda_runtime = { version = "0.7" }
log = "0.4"
pretty_env_logger = "0.4"
@ -16,9 +17,6 @@ serde = { version = "*", features = ["rc"]}
serde_json = "1"
tokio = { version = "1", features = ["macros"] }
[dev-dependencies]
tempfile = "*"
[profile.release]
panic = "abort"
lto = true

View File

@ -1,37 +1,59 @@
= Optimizing Lambda
= Delta Optimize Lambda
This Lambda function exists to optimize the given table on a regular basis.
This Lambda function can be used with a periodic trigger to optimize a
configured link:https://delta.io[Delta Lake] table. Consult the `deployment.tf`
file for an example of how to provision the function in AWS.
== Infrastructure requirements
== Building
In order to deploy this in AWS Lambda, it must first be built with the `cargo lambda` command line tool, e.g.:
Building and testing the Lambda can be done with cargo: `cargo build` and `cargo test`.
In order to deploy this in AWS Lambda, it must first be built with the `cargo
lambda` command line tool, e.g.:
[source,bash]
----
cargo lambda build --release --output zip
cargo lambda build --release --output-format zip
----
This will produce the file: `target/lambda/http-to-delta/bootstrap.zip`
This will produce the file: `target/lambda/lambda-delta-optimize/bootstrap.zip`
== Infrastructure
The `deployment.tf` file contains the necessary Terraform to provision the
function, a DynamoDB table for locking, and IAM permissions. This Terraform
does *not* provision an S3 bucket to optimize.
After configuring the necessary authentication for Terraform, the following
steps can be used to provision:
[source,bash]
----
cargo lambda build --release --output-format zip
terraform init
terraform plan
terraform apply
----
[NOTE]
====
Terraform configures the Lambda to run with the smallest amount of memory allowed. This may not be sufficient for larger tables.
====
=== Environment variables
The following environment variables must be set for the function to run properly:
|===
| Name | Value | Notes
| `DATALAKE_LOCATION`
| `s3://my-bucket-name/databases/bronze/http`
| The `s3://` URL of the desired bucket to be written, with the prefix for the specific table this function should write to such as in the example value.
| The `s3://` URL of the desired table to optimize.
| `AWS_S3_LOCKING_PROVIDER`
| `dynamodb`
| This instructs the `deltalake` crate to use DynamoDB for locking to provide consistent writes into S3.
|===
=== AWS configuration
In addition to setting up the Lambda function with the custom runtime, an S3 bucket for writing Delta records to must be created. The execution role for the Lambda must have access to perform S3 operations on that bucket, as well as access DynamoDB.
Create a DynamoDB table named `delta_rs_lock_table` with the partition key of
`key`. This will ensure consistent writes to S3 among multiple Lambda
functions.