Introduce the OPTIMIZE_DS environment variable to scope optimize

The only supported value at the moment is "yesterday", which restricts
optimization to the `ds` partition for the prior day (UTC).
This commit is contained in:
R Tyler Croy 2023-03-30 23:04:52 -07:00
parent d815329c9f
commit 45ec8d5d5d
4 changed files with 186 additions and 76 deletions

223
Cargo.lock generated
View File

@ -290,7 +290,7 @@ checksum = "b9ccdd8f2a161be9bd5c023df56f1b2a0bd1d83872ae53b71a84a12c9bf6e842"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.10",
"syn 2.0.12",
]
[[package]]
@ -332,7 +332,7 @@ dependencies = [
"http",
"hyper",
"ring",
"time",
"time 0.3.20",
"tokio",
"tower",
"tracing",
@ -464,7 +464,7 @@ dependencies = [
"percent-encoding",
"regex",
"sha2 0.10.6",
"time",
"time 0.3.20",
"tracing",
]
@ -568,7 +568,7 @@ dependencies = [
"itoa",
"num-integer",
"ryu",
"time",
"time 0.3.20",
]
[[package]]
@ -736,9 +736,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4e3c5919066adf22df73762e50cffcde3a758f2a848b113b586d1f86728b673b"
dependencies = [
"iana-time-zone",
"js-sys",
"num-integer",
"num-traits",
"serde",
"time 0.1.45",
"wasm-bindgen",
"winapi",
]
@ -888,7 +891,7 @@ dependencies = [
"proc-macro2",
"quote",
"scratch",
"syn 2.0.10",
"syn 2.0.12",
]
[[package]]
@ -905,7 +908,7 @@ checksum = "2345488264226bf682893e25de0769f3360aac9957980ec49361b083ddaa5bc5"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.10",
"syn 2.0.12",
]
[[package]]
@ -946,7 +949,7 @@ dependencies = [
[[package]]
name = "deltalake"
version = "0.8.0"
source = "git+https://github.com/delta-io/delta-rs?branch=main#0277e092b8fae5f4c536ea65df6593999850b079"
source = "git+https://github.com/delta-io/delta-rs?branch=main#d9920aaf6c730fda2878c2271ed396a138243bb1"
dependencies = [
"arrow",
"async-trait",
@ -1031,7 +1034,7 @@ checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10"
[[package]]
name = "dynamodb_lock"
version = "0.4.3"
source = "git+https://github.com/delta-io/delta-rs?branch=main#0277e092b8fae5f4c536ea65df6593999850b079"
source = "git+https://github.com/delta-io/delta-rs?branch=main#d9920aaf6c730fda2878c2271ed396a138243bb1"
dependencies = [
"async-trait",
"log",
@ -1138,9 +1141,9 @@ dependencies = [
[[package]]
name = "futures"
version = "0.3.27"
version = "0.3.28"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "531ac96c6ff5fd7c62263c5e3c67a603af4fcaee2e1a0ae5565ba3a11e69e549"
checksum = "23342abe12aba583913b2e62f22225ff9c950774065e4bfb61a19cd9770fec40"
dependencies = [
"futures-channel",
"futures-core",
@ -1153,9 +1156,9 @@ dependencies = [
[[package]]
name = "futures-channel"
version = "0.3.27"
version = "0.3.28"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "164713a5a0dcc3e7b4b1ed7d3b433cabc18025386f9339346e8daf15963cf7ac"
checksum = "955518d47e09b25bbebc7a18df10b81f0c766eaf4c4f1cccef2fca5f2a4fb5f2"
dependencies = [
"futures-core",
"futures-sink",
@ -1163,15 +1166,15 @@ dependencies = [
[[package]]
name = "futures-core"
version = "0.3.27"
version = "0.3.28"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "86d7a0c1aa76363dac491de0ee99faf6941128376f1cf96f07db7603b7de69dd"
checksum = "4bca583b7e26f571124fe5b7561d49cb2868d79116cfa0eefce955557c6fee8c"
[[package]]
name = "futures-executor"
version = "0.3.27"
version = "0.3.28"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1997dd9df74cdac935c76252744c1ed5794fac083242ea4fe77ef3ed60ba0f83"
checksum = "ccecee823288125bd88b4d7f565c9e58e41858e47ab72e8ea2d64e93624386e0"
dependencies = [
"futures-core",
"futures-task",
@ -1180,38 +1183,38 @@ dependencies = [
[[package]]
name = "futures-io"
version = "0.3.27"
version = "0.3.28"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "89d422fa3cbe3b40dca574ab087abb5bc98258ea57eea3fd6f1fa7162c778b91"
checksum = "4fff74096e71ed47f8e023204cfd0aa1289cd54ae5430a9523be060cdb849964"
[[package]]
name = "futures-macro"
version = "0.3.27"
version = "0.3.28"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3eb14ed937631bd8b8b8977f2c198443447a8355b6e3ca599f38c975e5a963b6"
checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72"
dependencies = [
"proc-macro2",
"quote",
"syn 1.0.109",
"syn 2.0.12",
]
[[package]]
name = "futures-sink"
version = "0.3.27"
version = "0.3.28"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ec93083a4aecafb2a80a885c9de1f0ccae9dbd32c2bb54b0c3a65690e0b8d2f2"
checksum = "f43be4fe21a13b9781a69afa4985b0f6ee0e1afab2c6f454a8cf30e2b2237b6e"
[[package]]
name = "futures-task"
version = "0.3.27"
version = "0.3.28"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fd65540d33b37b16542a0438c12e6aeead10d4ac5d05bd3f805b8f35ab592879"
checksum = "76d3d132be6c0e6aa1534069c705a74a5997a356c0dc2f86a47765e5617c5b65"
[[package]]
name = "futures-util"
version = "0.3.27"
version = "0.3.28"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3ef6b17e481503ec85211fed8f39d1970f128935ca1f814cd32ac4a6842e84ab"
checksum = "26b01e40b772d54cf6c6d721c1d1abd0647a0106a12ecaa1c186273392a69533"
dependencies = [
"futures-channel",
"futures-core",
@ -1227,9 +1230,9 @@ dependencies = [
[[package]]
name = "generic-array"
version = "0.14.6"
version = "0.14.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bff49e947297f3312447abdca79f45f4738097cc82b06e72054d2223f601f1b9"
checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a"
dependencies = [
"typenum",
"version_check",
@ -1243,13 +1246,13 @@ checksum = "c05aeb6a22b8f62540c194aac980f2115af067bfe15a0734d7277a768d396b31"
dependencies = [
"cfg-if",
"libc",
"wasi",
"wasi 0.11.0+wasi-snapshot-preview1",
]
[[package]]
name = "glibc_version"
version = "0.1.2"
source = "git+https://github.com/delta-io/delta-rs?branch=main#0277e092b8fae5f4c536ea65df6593999850b079"
source = "git+https://github.com/delta-io/delta-rs?branch=main#d9920aaf6c730fda2878c2271ed396a138243bb1"
dependencies = [
"regex",
]
@ -1455,9 +1458,9 @@ dependencies = [
[[package]]
name = "iana-time-zone"
version = "0.1.54"
version = "0.1.55"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0c17cc76786e99f8d2f055c11159e7f0091c42474dcc3189fbab96072e873e6d"
checksum = "716f12fbcfac6ffab0a5e9ec51d0a0ff70503742bb2dc7b99396394c9dc323f0"
dependencies = [
"android_system_properties",
"core-foundation-sys",
@ -1521,9 +1524,9 @@ checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02"
[[package]]
name = "ipnet"
version = "2.7.1"
version = "2.7.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "30e22bd8629359895450b59ea7a776c850561b96a3b1d31321c1949d9e6c9146"
checksum = "12b6ee2129af8d4fb011108c73d99a1b83a85977f23b82460c0ae2e25bb4b57f"
[[package]]
name = "itertools"
@ -1563,6 +1566,7 @@ name = "lambda-delta-optimize"
version = "0.1.0"
dependencies = [
"aws_lambda_events",
"chrono",
"deltalake",
"lambda_runtime",
"log",
@ -1783,7 +1787,7 @@ checksum = "5b9d9a46eff5b4ff64b45a9e316a6d1e0bc719ef429cbec4dc630684212bfdf9"
dependencies = [
"libc",
"log",
"wasi",
"wasi 0.11.0+wasi-snapshot-preview1",
"windows-sys 0.45.0",
]
@ -2180,9 +2184,9 @@ checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1"
[[package]]
name = "reqwest"
version = "0.11.15"
version = "0.11.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0ba30cc2c0cd02af1222ed216ba659cdb2f879dfe3181852fe7c50b1d0005949"
checksum = "27b71749df584b7f4cac2c426c127a7c785a5106cc98f7a8feb044115f0fa254"
dependencies = [
"base64 0.21.0",
"bytes",
@ -2574,29 +2578,29 @@ checksum = "e6b44e8fc93a14e66336d230954dda83d18b4605ccace8fe09bc7514a71ad0bc"
[[package]]
name = "serde"
version = "1.0.158"
version = "1.0.159"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "771d4d9c4163ee138805e12c710dd365e4f44be8be0503cb1bb9eb989425d9c9"
checksum = "3c04e8343c3daeec41f58990b9d77068df31209f2af111e059e9fe9646693065"
dependencies = [
"serde_derive",
]
[[package]]
name = "serde_derive"
version = "1.0.158"
version = "1.0.159"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e801c1712f48475582b7696ac71e0ca34ebb30e09338425384269d9717c62cad"
checksum = "4c614d17805b093df4b147b51339e7e44bf05ef59fba1e45d83500bcfb4d8585"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.10",
"syn 2.0.12",
]
[[package]]
name = "serde_json"
version = "1.0.94"
version = "1.0.95"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1c533a59c9d8a93a09c6ab31f0fd5e5f4dd1b8fc9434804029839884765d04ea"
checksum = "d721eca97ac802aa7777b701877c8004d950fc142651367300d21c1cc0194744"
dependencies = [
"itoa",
"ryu",
@ -2628,7 +2632,7 @@ dependencies = [
"serde",
"serde_json",
"serde_with_macros",
"time",
"time 0.3.20",
]
[[package]]
@ -2772,9 +2776,9 @@ dependencies = [
[[package]]
name = "syn"
version = "2.0.10"
version = "2.0.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5aad1363ed6d37b84299588d62d3a7d95b5a5c2d9aad5c85609fda12afaa1f40"
checksum = "79d9531f94112cfc3e4c8f5f02cb2b58f72c97b7efd85f70203cc6d8efda5927"
dependencies = [
"proc-macro2",
"quote",
@ -2807,7 +2811,7 @@ checksum = "f9456a42c5b0d803c8cd86e73dd7cc9edd429499f37a3550d286d5e86720569f"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.10",
"syn 2.0.12",
]
[[package]]
@ -2821,6 +2825,17 @@ dependencies = [
"ordered-float",
]
[[package]]
name = "time"
version = "0.1.45"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1b797afad3f312d1c66a56d11d0316f916356d11bd158fbc6ca6389ff6bf805a"
dependencies = [
"libc",
"wasi 0.10.0+wasi-snapshot-preview1",
"winapi",
]
[[package]]
name = "time"
version = "0.3.20"
@ -2874,14 +2889,13 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
[[package]]
name = "tokio"
version = "1.26.0"
version = "1.27.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "03201d01c3c27a29c8a5cee5b55a93ddae1ccf6f08f65365c2c918f8c1b76f64"
checksum = "d0de47a4eecbe11f498978a9b29d792f0d2692d1dd003650c24c76510e3bc001"
dependencies = [
"autocfg",
"bytes",
"libc",
"memchr",
"mio",
"num_cpus",
"parking_lot",
@ -2894,13 +2908,13 @@ dependencies = [
[[package]]
name = "tokio-macros"
version = "1.8.2"
version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d266c00fde287f55d3f1c3e96c500c362a2b8c695076ec180f27918820bc6df8"
checksum = "61a573bdc87985e9d6ddeed1b3d864e8a302c847e40d647746df2f1de209d1ce"
dependencies = [
"proc-macro2",
"quote",
"syn 1.0.109",
"syn 2.0.12",
]
[[package]]
@ -3125,6 +3139,12 @@ dependencies = [
"try-lock",
]
[[package]]
name = "wasi"
version = "0.10.0+wasi-snapshot-preview1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f"
[[package]]
name = "wasi"
version = "0.11.0+wasi-snapshot-preview1"
@ -3282,11 +3302,11 @@ checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
[[package]]
name = "windows"
version = "0.46.0"
version = "0.47.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cdacb41e6a96a052c6cb63a144f24900236121c6f63f4f8219fef5977ecb0c25"
checksum = "2649ff315bee4c98757f15dac226efe3d81927adbb6e882084bb1ee3e0c330a7"
dependencies = [
"windows-targets",
"windows-targets 0.47.0",
]
[[package]]
@ -3295,13 +3315,13 @@ version = "0.42.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a3e1820f08b8513f676f7ab6c1f99ff312fb97b553d30ff4dd86f9f15728aa7"
dependencies = [
"windows_aarch64_gnullvm",
"windows_aarch64_msvc",
"windows_i686_gnu",
"windows_i686_msvc",
"windows_x86_64_gnu",
"windows_x86_64_gnullvm",
"windows_x86_64_msvc",
"windows_aarch64_gnullvm 0.42.2",
"windows_aarch64_msvc 0.42.2",
"windows_i686_gnu 0.42.2",
"windows_i686_msvc 0.42.2",
"windows_x86_64_gnu 0.42.2",
"windows_x86_64_gnullvm 0.42.2",
"windows_x86_64_msvc 0.42.2",
]
[[package]]
@ -3310,7 +3330,7 @@ version = "0.45.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0"
dependencies = [
"windows-targets",
"windows-targets 0.42.2",
]
[[package]]
@ -3319,13 +3339,28 @@ version = "0.42.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e5180c00cd44c9b1c88adb3693291f1cd93605ded80c250a75d472756b4d071"
dependencies = [
"windows_aarch64_gnullvm",
"windows_aarch64_msvc",
"windows_i686_gnu",
"windows_i686_msvc",
"windows_x86_64_gnu",
"windows_x86_64_gnullvm",
"windows_x86_64_msvc",
"windows_aarch64_gnullvm 0.42.2",
"windows_aarch64_msvc 0.42.2",
"windows_i686_gnu 0.42.2",
"windows_i686_msvc 0.42.2",
"windows_x86_64_gnu 0.42.2",
"windows_x86_64_gnullvm 0.42.2",
"windows_x86_64_msvc 0.42.2",
]
[[package]]
name = "windows-targets"
version = "0.47.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2f8996d3f43b4b2d44327cd71b7b0efd1284ab60e6e9d0e8b630e18555d87d3e"
dependencies = [
"windows_aarch64_gnullvm 0.47.0",
"windows_aarch64_msvc 0.47.0",
"windows_i686_gnu 0.47.0",
"windows_i686_msvc 0.47.0",
"windows_x86_64_gnu 0.47.0",
"windows_x86_64_gnullvm 0.47.0",
"windows_x86_64_msvc 0.47.0",
]
[[package]]
@ -3334,42 +3369,84 @@ version = "0.42.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8"
[[package]]
name = "windows_aarch64_gnullvm"
version = "0.47.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "831d567d53d4f3cb1db332b68e6e2b6260228eb4d99a777d8b2e8ed794027c90"
[[package]]
name = "windows_aarch64_msvc"
version = "0.42.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43"
[[package]]
name = "windows_aarch64_msvc"
version = "0.47.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6a42d54a417c60ce4f0e31661eed628f0fa5aca73448c093ec4d45fab4c51cdf"
[[package]]
name = "windows_i686_gnu"
version = "0.42.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f"
[[package]]
name = "windows_i686_gnu"
version = "0.47.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c1925beafdbb22201a53a483db861a5644123157c1c3cee83323a2ed565d71e3"
[[package]]
name = "windows_i686_msvc"
version = "0.42.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060"
[[package]]
name = "windows_i686_msvc"
version = "0.47.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3a8ef8f2f1711b223947d9b69b596cf5a4e452c930fb58b6fc3fdae7d0ec6b31"
[[package]]
name = "windows_x86_64_gnu"
version = "0.42.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36"
[[package]]
name = "windows_x86_64_gnu"
version = "0.47.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7acaa0c2cf0d2ef99b61c308a0c3dbae430a51b7345dedec470bd8f53f5a3642"
[[package]]
name = "windows_x86_64_gnullvm"
version = "0.42.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3"
[[package]]
name = "windows_x86_64_gnullvm"
version = "0.47.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e5a0628f71be1d11e17ca4a0e9e15b3a5180f6fbf1c2d55e3ba3f850378052c1"
[[package]]
name = "windows_x86_64_msvc"
version = "0.42.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0"
[[package]]
name = "windows_x86_64_msvc"
version = "0.47.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9d6e62c256dc6d40b8c8707df17df8d774e60e39db723675241e7c15e910bce7"
[[package]]
name = "winreg"
version = "0.10.1"

View File

@ -1,11 +1,13 @@
[package]
name = "lambda-delta-optimize"
description = "Simple AWS Lambda for optimizing Delta tables"
version = "0.1.0"
author = "R Tyler Croy <rtyler@brokenco.de>"
authors = ["R Tyler Croy <rtyler@brokenco.de>"]
edition = "2021"
[dependencies]
aws_lambda_events = { version = "0.7" }
chrono = "*"
deltalake = { git = "https://github.com/delta-io/delta-rs", branch = "main", features = ["arrow", "parquet", "s3"]}
lambda_runtime = { version = "0.7" }
log = "0.4"

View File

@ -67,6 +67,10 @@ The following environment variables must be set for the function to run properly
| `dynamodb`
| This instructs the `deltalake` crate to use DynamoDB for locking to provide consistent writes into s3.
| `OPTIMIZE_DS`
| `yesterday`
| Only apply optimizations to the `ds` partition (`YYYY-mm-dd`). The `yesterday` value selects the previous day in UTC.
|===
== Licensing

View File

@ -20,14 +20,41 @@ async fn main() -> Result<(), Error> {
lambda_runtime::run(func).await
}
/// Recognized value of the `OPTIMIZE_DS` environment variable: when set to
/// this, the handler adds a `ds = <previous UTC day, formatted %Y-%m-%d>`
/// partition filter before running optimize.
const OPTIMIZE_DS_YESTERDAY: &str = "yesterday";
/*
* Lambda function handler
*/
async fn func(event: LambdaEvent<CloudWatchEvent>) -> Result<Value, Error> {
async fn func<'a>(event: LambdaEvent<CloudWatchEvent>) -> Result<Value, Error> {
use deltalake::PartitionFilter;
use chrono::*;
debug!("CloudWatch event: {:?}", event);
let location = std::env::var("DATALAKE_LOCATION")?;
let table = deltalake::open_table(&location).await?;
let (table, metrics) = OptimizeBuilder::new(table.object_store(), table.state).await?;
let mut filters = vec![];
// This variable only exists to provide a long enough lifetime for
// any PartitionFilter values created
let mut _ds = String::new();
if let Ok(ds_filter) = std::env::var("OPTIMIZE_DS") {
match ds_filter.as_str() {
OPTIMIZE_DS_YESTERDAY => {
if let Some(yesterday) = Utc::now().checked_sub_days(Days::new(1)) {
_ds = format!("{}", yesterday.format("%Y-%m-%d"));
let partition = ("ds", "=", _ds.as_str());
info!("Optimizing with partition: {:?}", partition);
filters.push(PartitionFilter::try_from(partition)?);
}
},
unknown => warn!("Unknown value of OPTIMIZE_DS: {}", unknown),
}
}
let (table, metrics) = OptimizeBuilder::new(table.object_store(), table.state)
.with_filters(filters.as_slice())
.await?;
debug!("table: optimize: {:?}", table);
info!("table: metrics: {:?}", metrics);