Update the README for some details about using the function
This commit is contained in:
parent
65d34abb9c
commit
939989c0a2
|
@ -56,8 +56,9 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "arrow"
|
||||
version = "36.0.0"
|
||||
source = "git+https://github.com/apache/arrow-rs?branch=master#9bd2bae586ed5b0edfd699f89a0855d79f61b611"
|
||||
version = "33.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f3724c874f1517cf898cd1c3ad18ab5071edf893c48e73139ab1e16cf0f2affe"
|
||||
dependencies = [
|
||||
"ahash",
|
||||
"arrow-arith",
|
||||
|
@ -77,8 +78,9 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "arrow-arith"
|
||||
version = "36.0.0"
|
||||
source = "git+https://github.com/apache/arrow-rs?branch=master#9bd2bae586ed5b0edfd699f89a0855d79f61b611"
|
||||
version = "33.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e958823b8383ca14d0a2e973de478dd7674cd9f72837f8c41c132a0fda6a4e5e"
|
||||
dependencies = [
|
||||
"arrow-array",
|
||||
"arrow-buffer",
|
||||
|
@ -91,8 +93,9 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "arrow-array"
|
||||
version = "36.0.0"
|
||||
source = "git+https://github.com/apache/arrow-rs?branch=master#9bd2bae586ed5b0edfd699f89a0855d79f61b611"
|
||||
version = "33.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "db670eab50e76654065b5aed930f4367101fcddcb2223802007d1e0b4d5a2579"
|
||||
dependencies = [
|
||||
"ahash",
|
||||
"arrow-buffer",
|
||||
|
@ -106,8 +109,9 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "arrow-buffer"
|
||||
version = "36.0.0"
|
||||
source = "git+https://github.com/apache/arrow-rs?branch=master#9bd2bae586ed5b0edfd699f89a0855d79f61b611"
|
||||
version = "33.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9f0e01c931882448c0407bd32311a624b9f099739e94e786af68adc97016b5f2"
|
||||
dependencies = [
|
||||
"half",
|
||||
"num",
|
||||
|
@ -115,8 +119,9 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "arrow-cast"
|
||||
version = "36.0.0"
|
||||
source = "git+https://github.com/apache/arrow-rs?branch=master#9bd2bae586ed5b0edfd699f89a0855d79f61b611"
|
||||
version = "33.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4bf35d78836c93f80d9362f3ccb47ff5e2c5ecfc270ff42cdf1ef80334961d44"
|
||||
dependencies = [
|
||||
"arrow-array",
|
||||
"arrow-buffer",
|
||||
|
@ -130,8 +135,9 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "arrow-csv"
|
||||
version = "36.0.0"
|
||||
source = "git+https://github.com/apache/arrow-rs?branch=master#9bd2bae586ed5b0edfd699f89a0855d79f61b611"
|
||||
version = "33.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0a6aa7c2531d89d01fed8c469a9b1bf97132a0bdf70b4724fe4bbb4537a50880"
|
||||
dependencies = [
|
||||
"arrow-array",
|
||||
"arrow-buffer",
|
||||
|
@ -148,8 +154,9 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "arrow-data"
|
||||
version = "36.0.0"
|
||||
source = "git+https://github.com/apache/arrow-rs?branch=master#9bd2bae586ed5b0edfd699f89a0855d79f61b611"
|
||||
version = "33.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ea50db4d1e1e4c2da2bfdea7b6d2722eef64267d5ab680d815f7ae42428057f5"
|
||||
dependencies = [
|
||||
"arrow-buffer",
|
||||
"arrow-schema",
|
||||
|
@ -159,8 +166,9 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "arrow-ipc"
|
||||
version = "36.0.0"
|
||||
source = "git+https://github.com/apache/arrow-rs?branch=master#9bd2bae586ed5b0edfd699f89a0855d79f61b611"
|
||||
version = "33.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a4042fe6585155d1ec28a8e4937ec901a3ca7a19a22b9f6cd3f551b935cd84f5"
|
||||
dependencies = [
|
||||
"arrow-array",
|
||||
"arrow-buffer",
|
||||
|
@ -172,8 +180,9 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "arrow-json"
|
||||
version = "36.0.0"
|
||||
source = "git+https://github.com/apache/arrow-rs?branch=master#9bd2bae586ed5b0edfd699f89a0855d79f61b611"
|
||||
version = "33.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7c907c4ab4f26970a3719dc06e78e8054a01d0c96da3664d23b941e201b33d2b"
|
||||
dependencies = [
|
||||
"arrow-array",
|
||||
"arrow-buffer",
|
||||
|
@ -190,22 +199,23 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "arrow-ord"
|
||||
version = "36.0.0"
|
||||
source = "git+https://github.com/apache/arrow-rs?branch=master#9bd2bae586ed5b0edfd699f89a0855d79f61b611"
|
||||
version = "33.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e131b447242a32129efc7932f58ed8931b42f35d8701c1a08f9f524da13b1d3c"
|
||||
dependencies = [
|
||||
"arrow-array",
|
||||
"arrow-buffer",
|
||||
"arrow-data",
|
||||
"arrow-schema",
|
||||
"arrow-select",
|
||||
"half",
|
||||
"num",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "arrow-row"
|
||||
version = "36.0.0"
|
||||
source = "git+https://github.com/apache/arrow-rs?branch=master#9bd2bae586ed5b0edfd699f89a0855d79f61b611"
|
||||
version = "33.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b591ef70d76f4ac28dd7666093295fece0e5f9298f49af51ea49c001e1635bb6"
|
||||
dependencies = [
|
||||
"ahash",
|
||||
"arrow-array",
|
||||
|
@ -218,13 +228,15 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "arrow-schema"
|
||||
version = "36.0.0"
|
||||
source = "git+https://github.com/apache/arrow-rs?branch=master#9bd2bae586ed5b0edfd699f89a0855d79f61b611"
|
||||
version = "33.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "eb327717d87eb94be5eff3b0cb8987f54059d343ee5235abf7f143c85f54cfc8"
|
||||
|
||||
[[package]]
|
||||
name = "arrow-select"
|
||||
version = "36.0.0"
|
||||
source = "git+https://github.com/apache/arrow-rs?branch=master#9bd2bae586ed5b0edfd699f89a0855d79f61b611"
|
||||
version = "33.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "79d3c389d1cea86793934f31594f914c8547d82e91e3411d4833ad0aac3266a7"
|
||||
dependencies = [
|
||||
"arrow-array",
|
||||
"arrow-buffer",
|
||||
|
@ -235,8 +247,9 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "arrow-string"
|
||||
version = "36.0.0"
|
||||
source = "git+https://github.com/apache/arrow-rs?branch=master#9bd2bae586ed5b0edfd699f89a0855d79f61b611"
|
||||
version = "33.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "30ee67790496dd310ddbf5096870324431e89aa76453e010020ac29b1184d356"
|
||||
dependencies = [
|
||||
"arrow-array",
|
||||
"arrow-buffer",
|
||||
|
@ -933,7 +946,7 @@ dependencies = [
|
|||
[[package]]
|
||||
name = "deltalake"
|
||||
version = "0.8.0"
|
||||
source = "git+https://github.com/rtyler/delta-rs?branch=testing-map-types-with-partitions#ddf70f11f7258435bf8406126668512929b4b534"
|
||||
source = "git+https://github.com/delta-io/delta-rs?branch=main#0277e092b8fae5f4c536ea65df6593999850b079"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"async-trait",
|
||||
|
@ -941,7 +954,7 @@ dependencies = [
|
|||
"cfg-if",
|
||||
"chrono",
|
||||
"dynamodb_lock",
|
||||
"errno 0.3.0",
|
||||
"errno",
|
||||
"futures",
|
||||
"glibc_version",
|
||||
"itertools",
|
||||
|
@ -1018,7 +1031,7 @@ checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10"
|
|||
[[package]]
|
||||
name = "dynamodb_lock"
|
||||
version = "0.4.3"
|
||||
source = "git+https://github.com/rtyler/delta-rs?branch=testing-map-types-with-partitions#ddf70f11f7258435bf8406126668512929b4b534"
|
||||
source = "git+https://github.com/delta-io/delta-rs?branch=main#0277e092b8fae5f4c536ea65df6593999850b079"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"log",
|
||||
|
@ -1058,17 +1071,6 @@ dependencies = [
|
|||
"termcolor",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "errno"
|
||||
version = "0.2.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f639046355ee4f37944e44f60642c6f3a7efa3cf6b78c78a0d989a8ce6c396a1"
|
||||
dependencies = [
|
||||
"errno-dragonfly",
|
||||
"libc",
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "errno"
|
||||
version = "0.3.0"
|
||||
|
@ -1247,7 +1249,7 @@ dependencies = [
|
|||
[[package]]
|
||||
name = "glibc_version"
|
||||
version = "0.1.2"
|
||||
source = "git+https://github.com/rtyler/delta-rs?branch=testing-map-types-with-partitions#ddf70f11f7258435bf8406126668512929b4b534"
|
||||
source = "git+https://github.com/delta-io/delta-rs?branch=main#0277e092b8fae5f4c536ea65df6593999850b079"
|
||||
dependencies = [
|
||||
"regex",
|
||||
]
|
||||
|
@ -1317,12 +1319,6 @@ dependencies = [
|
|||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hermit-abi"
|
||||
version = "0.3.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fed44880c466736ef9a5c5b5facefb5ed0785676d0c02d612db14e54f0d84286"
|
||||
|
||||
[[package]]
|
||||
name = "hex"
|
||||
version = "0.4.3"
|
||||
|
@ -1523,17 +1519,6 @@ version = "3.0.4"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02"
|
||||
|
||||
[[package]]
|
||||
name = "io-lifetimes"
|
||||
version = "1.0.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "09270fd4fa1111bc614ed2246c7ef56239a3063d5be0d1ec3b589c505d400aeb"
|
||||
dependencies = [
|
||||
"hermit-abi 0.3.1",
|
||||
"libc",
|
||||
"windows-sys 0.45.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ipnet"
|
||||
version = "2.7.1"
|
||||
|
@ -1587,7 +1572,6 @@ dependencies = [
|
|||
"rusoto_s3",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"tempfile",
|
||||
"tokio",
|
||||
]
|
||||
|
||||
|
@ -1714,12 +1698,6 @@ dependencies = [
|
|||
"cc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "linux-raw-sys"
|
||||
version = "0.1.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f051f77a7c8e6957c0696eac88f26b0117e54f52d3fc682ab19397a8812846a4"
|
||||
|
||||
[[package]]
|
||||
name = "lock_api"
|
||||
version = "0.4.9"
|
||||
|
@ -1899,7 +1877,8 @@ dependencies = [
|
|||
[[package]]
|
||||
name = "object_store"
|
||||
version = "0.5.5"
|
||||
source = "git+https://github.com/apache/arrow-rs?branch=master#9bd2bae586ed5b0edfd699f89a0855d79f61b611"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e1ea8f683b4f89a64181393742c041520a1a87e9775e6b4c0dd5a3281af05fc6"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"aws-config",
|
||||
|
@ -1983,8 +1962,9 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "parquet"
|
||||
version = "36.0.0"
|
||||
source = "git+https://github.com/apache/arrow-rs?branch=master#9bd2bae586ed5b0edfd699f89a0855d79f61b611"
|
||||
version = "33.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b1b076829801167d889795cd1957989055543430fa1469cb1f6e32b789bfc764"
|
||||
dependencies = [
|
||||
"ahash",
|
||||
"arrow-array",
|
||||
|
@ -2114,9 +2094,9 @@ checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0"
|
|||
|
||||
[[package]]
|
||||
name = "quick-xml"
|
||||
version = "0.28.1"
|
||||
version = "0.27.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e5c1a97b1bc42b1d550bfb48d4262153fe400a12bab1511821736f7eac76d7e2"
|
||||
checksum = "ffc053f057dd768a56f62cd7e434c42c831d296968997e9ac1f76ea7c2d14c41"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
"serde",
|
||||
|
@ -2443,20 +2423,6 @@ dependencies = [
|
|||
"semver",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustix"
|
||||
version = "0.36.11"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "db4165c9963ab29e422d6c26fbc1d37f15bace6b2810221f9d925023480fcf0e"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"errno 0.2.8",
|
||||
"io-lifetimes",
|
||||
"libc",
|
||||
"linux-raw-sys",
|
||||
"windows-sys 0.45.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustls"
|
||||
version = "0.19.1"
|
||||
|
@ -2815,19 +2781,6 @@ dependencies = [
|
|||
"unicode-ident",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tempfile"
|
||||
version = "3.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "af18f7ae1acd354b992402e9ec5864359d693cd8a79dcbef59f76891701c1e95"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"fastrand",
|
||||
"redox_syscall",
|
||||
"rustix",
|
||||
"windows-sys 0.42.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "termcolor"
|
||||
version = "1.2.0"
|
||||
|
|
|
@ -1,11 +1,12 @@
|
|||
[package]
|
||||
name = "lambda-delta-optimize"
|
||||
version = "0.1.0"
|
||||
author = "R Tyler Croy <rtyler@brokenco.de>"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
aws_lambda_events = { version = "0.7" }
|
||||
deltalake = { git = "https://github.com/rtyler/delta-rs", branch = "testing-map-types-with-partitions", features = ["arrow", "parquet", "s3"]}
|
||||
deltalake = { git = "https://github.com/delta-io/delta-rs", branch = "main", features = ["arrow", "parquet", "s3"]}
|
||||
lambda_runtime = { version = "0.7" }
|
||||
log = "0.4"
|
||||
pretty_env_logger = "0.4"
|
||||
|
@ -16,9 +17,6 @@ serde = { version = "*", features = ["rc"]}
|
|||
serde_json = "1"
|
||||
tokio = { version = "1", features = ["macros"] }
|
||||
|
||||
[dev-dependencies]
|
||||
tempfile = "*"
|
||||
|
||||
[profile.release]
|
||||
panic = "abort"
|
||||
lto = true
|
||||
|
|
52
README.adoc
52
README.adoc
|
@ -1,37 +1,59 @@
|
|||
= Optimizing Lambda
|
||||
= Delta Optimize Lambda
|
||||
|
||||
This Lambda function exists to optimize the given table on a regular basis.
|
||||
This Lambda function can be used with a periodic trigger to optimize a
|
||||
configured link:https://delta.io[Delta Lake] table. Consult the `deployment.tf`
|
||||
file for an example of how to provision the function in AWS.
|
||||
|
||||
== Infrastructure requirements
|
||||
== Building
|
||||
|
||||
In order to deploy this in AWS Lambda, it must first be built with the `cargo lambda` command line tool, e.g.:
|
||||
Building and testing the Lambda can be done with cargo: `cargo build` and `cargo test`.
|
||||
|
||||
In order to deploy this in AWS Lambda, it must first be built with the `cargo
|
||||
lambda` command line tool, e.g.:
|
||||
|
||||
[source,bash]
|
||||
----
|
||||
cargo lambda build --release --output zip
|
||||
cargo lambda build --release --output-format zip
|
||||
----
|
||||
|
||||
This will produce the file: `target/lambda/http-to-delta/bootstrap.zip`
|
||||
This will produce the file: `target/lambda/lambda-delta-optimize/bootstrap.zip`
|
||||
|
||||
== Infrastructure
|
||||
|
||||
The `deployment.tf` file contains the necessary Terraform to provision the
|
||||
function, a DynamoDB table for locking, and IAM permissions. This Terraform
|
||||
does *not* provision an S3 bucket to optimize.
|
||||
|
||||
After configuring the necessary authentication for Terraform, the following
|
||||
steps can be used to provision:
|
||||
|
||||
[source,bash]
|
||||
----
|
||||
cargo lambda build --release --output-format zip
|
||||
terraform init
|
||||
terraform plan
|
||||
terraform apply
|
||||
----
|
||||
|
||||
[NOTE]
|
||||
====
|
||||
Terraform configures the Lambda to run with the smallest amount of memory allowed. This may not be sufficient for larger tables.
|
||||
====
|
||||
|
||||
=== Environment variables
|
||||
|
||||
The following environment variables must be set for the function to run properly:
|
||||
|
||||
|===
|
||||
| Name | Value | Notes
|
||||
|
||||
| `DATALAKE_LOCATION`
|
||||
| `s3://my-bucket-name/databases/bronze/http`
|
||||
| The `s3://` URL of the desired bucket to be written, with the prefix for the specific table this function should write to such as in the example value.
|
||||
| The `s3://` URL of the desired table to optimize.
|
||||
|
||||
|
||||
| `AWS_S3_LOCKING_PROVIDER`
|
||||
| `dynamodb`
|
||||
| This instructs the `deltalake` crate to use DynamoDB for locking to provide consistent writes into S3.
|
||||
|
||||
|===
|
||||
|
||||
=== AWS configuration
|
||||
|
||||
In addition to setting up the Lambda function with the custom runtime, an S3 bucket to which Delta records are written must be created. The execution role for the Lambda must have access to perform S3 operations on that bucket, as well as access to DynamoDB.
|
||||
|
||||
Create a DynamoDB table named `delta_rs_lock_table` with the partition key of
|
||||
`key`. This will ensure consistent writes to S3 among multiple Lambda
|
||||
functions.
|
||||
|
|
Loading…
Reference in New Issue