diff --git a/Cargo.lock b/Cargo.lock index 26f07400..9cdca7e8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -11,18 +11,24 @@ dependencies = [ "gimli", ] +[[package]] +name = "adler2" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" + [[package]] name = "ahash" -version = "0.8.12" +version = "0.8.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" +checksum = "77c3a9648d43b9cd48db467b3f87fdd6e146bcc88ab0180006cef2179fe11d01" dependencies = [ "cfg-if", "const-random", - "getrandom 0.3.4", + "getrandom 0.2.16", "once_cell", "version_check", - "zerocopy", + "zerocopy 0.7.35", ] [[package]] @@ -34,6 +40,21 @@ dependencies = [ "memchr", ] +[[package]] +name = "alloc-no-stdlib" +version = "2.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc7bb162ec39d46ab1ca8c77bf72e890535becd1751bb45f64c597edb4c8c6b3" + +[[package]] +name = "alloc-stdlib" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94fb8275041c72129eb51b7d0322c29b8387a0386127718b096429201a5d6ece" +dependencies = [ + "alloc-no-stdlib", +] + [[package]] name = "allocator-api2" version = "0.2.21" @@ -111,12 +132,68 @@ version = "1.0.100" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a23eb6b1614318a8071c9b2521f36b424b2c83db5eb3a0fead4a6c0809af6e61" +[[package]] +name = "ar_archive_writer" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7eb93bbb63b9c227414f6eb3a0adfddca591a8ce1e9b60661bb08969b87e340b" +dependencies = [ + "object", +] + [[package]] name = "arbitrary" version = "1.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c3d036a3c4ab069c7b410a2ce876bd74808d2d0888a82667669f8e783a898bf1" +[[package]] +name = 
"arrayref" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76a2e8124351fda1ef8aaaa3bbd7ebbcb486bbcd4225aca0aa0d84bb2db8fecb" + +[[package]] +name = "arrayvec" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" + +[[package]] +name = "arrow" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3f15b4c6b148206ff3a2b35002e08929c2462467b62b9c02036d9c34f9ef994" +dependencies = [ + "arrow-arith", + "arrow-array 55.2.0", + "arrow-buffer 55.2.0", + "arrow-cast 55.2.0", + "arrow-csv", + "arrow-data 55.2.0", + "arrow-ipc 55.2.0", + "arrow-json 55.2.0 (registry+https://github.com/rust-lang/crates.io-index)", + "arrow-ord", + "arrow-row", + "arrow-schema 55.2.0", + "arrow-select 55.2.0", + "arrow-string", +] + +[[package]] +name = "arrow-arith" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30feb679425110209ae35c3fbf82404a39a4c0436bb3ec36164d8bffed2a4ce4" +dependencies = [ + "arrow-array 55.2.0", + "arrow-buffer 55.2.0", + "arrow-data 55.2.0", + "arrow-schema 55.2.0", + "chrono", + "num", +] + [[package]] name = "arrow-array" version = "52.2.0" @@ -124,15 +201,32 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "16f4a9468c882dc66862cef4e1fd8423d47e67972377d85d80e022786427768c" dependencies = [ "ahash", - "arrow-buffer", - "arrow-data", - "arrow-schema", + "arrow-buffer 52.2.0", + "arrow-data 52.2.0", + "arrow-schema 52.2.0", "chrono", "half", "hashbrown 0.14.5", "num", ] +[[package]] +name = "arrow-array" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70732f04d285d49054a48b72c54f791bb3424abae92d27aafdf776c98af161c8" +dependencies = [ + "ahash", + "arrow-buffer 55.2.0", + "arrow-data 55.2.0", + "arrow-schema 55.2.0", + "chrono", + 
"chrono-tz", + "half", + "hashbrown 0.15.5", + "num", +] + [[package]] name = "arrow-buffer" version = "52.2.0" @@ -144,34 +238,93 @@ dependencies = [ "num", ] +[[package]] +name = "arrow-buffer" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "169b1d5d6cb390dd92ce582b06b23815c7953e9dfaaea75556e89d890d19993d" +dependencies = [ + "bytes", + "half", + "num", +] + [[package]] name = "arrow-cast" version = "52.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "da26719e76b81d8bc3faad1d4dbdc1bcc10d14704e63dc17fc9f3e7e1e567c8e" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "arrow-select", + "arrow-array 52.2.0", + "arrow-buffer 52.2.0", + "arrow-data 52.2.0", + "arrow-schema 52.2.0", + "arrow-select 52.2.0", + "atoi", + "base64", + "chrono", + "half", + "lexical-core 0.8.5", + "num", + "ryu", +] + +[[package]] +name = "arrow-cast" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4f12eccc3e1c05a766cafb31f6a60a46c2f8efec9b74c6e0648766d30686af8" +dependencies = [ + "arrow-array 55.2.0", + "arrow-buffer 55.2.0", + "arrow-data 55.2.0", + "arrow-schema 55.2.0", + "arrow-select 55.2.0", "atoi", "base64", "chrono", + "comfy-table", "half", - "lexical-core", + "lexical-core 1.0.6", "num", "ryu", ] +[[package]] +name = "arrow-csv" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "012c9fef3f4a11573b2c74aec53712ff9fdae4a95f4ce452d1bbf088ee00f06b" +dependencies = [ + "arrow-array 55.2.0", + "arrow-cast 55.2.0", + "arrow-schema 55.2.0", + "chrono", + "csv", + "csv-core", + "regex", +] + [[package]] name = "arrow-data" version = "52.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dd9d6f18c65ef7a2573ab498c374d8ae364b4a4edf67105357491c031f716ca5" dependencies = [ - "arrow-buffer", - "arrow-schema", + "arrow-buffer 52.2.0", + 
"arrow-schema 52.2.0", + "half", + "num", +] + +[[package]] +name = "arrow-data" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8de1ce212d803199684b658fc4ba55fb2d7e87b213de5af415308d2fee3619c2" +dependencies = [ + "arrow-buffer 55.2.0", + "arrow-schema 55.2.0", "half", "num", ] @@ -182,12 +335,96 @@ version = "52.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e786e1cdd952205d9a8afc69397b317cfbb6e0095e445c69cda7e8da5c1eeb0f" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-cast", - "arrow-data", - "arrow-schema", - "flatbuffers", + "arrow-array 52.2.0", + "arrow-buffer 52.2.0", + "arrow-cast 52.2.0", + "arrow-data 52.2.0", + "arrow-schema 52.2.0", + "flatbuffers 24.12.23", +] + +[[package]] +name = "arrow-ipc" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9ea5967e8b2af39aff5d9de2197df16e305f47f404781d3230b2dc672da5d92" +dependencies = [ + "arrow-array 55.2.0", + "arrow-buffer 55.2.0", + "arrow-data 55.2.0", + "arrow-schema 55.2.0", + "flatbuffers 25.12.19", + "lz4_flex", +] + +[[package]] +name = "arrow-json" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5709d974c4ea5be96d900c01576c7c0b99705f4a3eec343648cb1ca863988a9c" +dependencies = [ + "arrow-array 55.2.0", + "arrow-buffer 55.2.0", + "arrow-cast 55.2.0", + "arrow-data 55.2.0", + "arrow-schema 55.2.0", + "chrono", + "half", + "indexmap 2.12.1", + "lexical-core 1.0.6", + "memchr", + "num", + "serde", + "serde_json", + "simdutf8", +] + +[[package]] +name = "arrow-json" +version = "55.2.0" +source = "git+https://github.com/ArroyoSystems/arrow-rs?branch=55.2.0%2Fjson#d31f8d8f97c6e1394b52927cd8c23c14fec6ba16" +dependencies = [ + "arrow-array 55.2.0", + "arrow-buffer 55.2.0", + "arrow-cast 55.2.0", + "arrow-data 55.2.0", + "arrow-schema 55.2.0", + "base64", + "chrono", + "half", + "indexmap 2.12.1", + "lexical-core 
1.0.6", + "memchr", + "num", + "serde", + "serde_json", + "simdutf8", +] + +[[package]] +name = "arrow-ord" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6506e3a059e3be23023f587f79c82ef0bcf6d293587e3272d20f2d30b969b5a7" +dependencies = [ + "arrow-array 55.2.0", + "arrow-buffer 55.2.0", + "arrow-data 55.2.0", + "arrow-schema 55.2.0", + "arrow-select 55.2.0", +] + +[[package]] +name = "arrow-row" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52bf7393166beaf79b4bed9bfdf19e97472af32ce5b6b48169d321518a08cae2" +dependencies = [ + "arrow-array 55.2.0", + "arrow-buffer 55.2.0", + "arrow-data 55.2.0", + "arrow-schema 55.2.0", + "half", ] [[package]] @@ -196,6 +433,16 @@ version = "52.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9e972cd1ff4a4ccd22f86d3e53e835c2ed92e0eea6a3e8eadb72b4f1ac802cf8" +[[package]] +name = "arrow-schema" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af7686986a3bf2254c9fb130c623cdcb2f8e1f15763e7c71c310f0834da3d292" +dependencies = [ + "serde", + "serde_json", +] + [[package]] name = "arrow-select" version = "52.2.0" @@ -203,11 +450,59 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "600bae05d43483d216fb3494f8c32fdbefd8aa4e1de237e790dbb3d9f44690a3" dependencies = [ "ahash", - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", + "arrow-array 52.2.0", + "arrow-buffer 52.2.0", + "arrow-data 52.2.0", + "arrow-schema 52.2.0", + "num", +] + +[[package]] +name = "arrow-select" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dd2b45757d6a2373faa3352d02ff5b54b098f5e21dccebc45a21806bc34501e5" +dependencies = [ + "ahash", + "arrow-array 55.2.0", + "arrow-buffer 55.2.0", + "arrow-data 55.2.0", + "arrow-schema 55.2.0", + "num", +] + +[[package]] +name = "arrow-string" +version 
= "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0377d532850babb4d927a06294314b316e23311503ed580ec6ce6a0158f49d40" +dependencies = [ + "arrow-array 55.2.0", + "arrow-buffer 55.2.0", + "arrow-data 55.2.0", + "arrow-schema 55.2.0", + "arrow-select 55.2.0", + "memchr", "num", + "regex", + "regex-syntax", +] + +[[package]] +name = "async-compression" +version = "0.4.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06575e6a9673580f52661c92107baabffbf41e2141373441cbcdc47cb733003c" +dependencies = [ + "bzip2", + "flate2", + "futures-core", + "memchr", + "pin-project-lite", + "tokio", + "xz2", + "zstd", + "zstd-safe", ] [[package]] @@ -317,13 +612,37 @@ version = "0.22.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" +[[package]] +name = "bigdecimal" +version = "0.4.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4d6867f1565b3aad85681f1015055b087fcfd840d6aeee6eee7f2da317603695" +dependencies = [ + "autocfg", + "libm", + "num-bigint", + "num-integer", + "num-traits", +] + [[package]] name = "bincode" -version = "1.3.3" +version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad" +checksum = "36eaf5d7b090263e8150820482d5d93cd964a81e4019913c972f4edcc6edb740" dependencies = [ + "bincode_derive", "serde", + "unty", +] + +[[package]] +name = "bincode_derive" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf95709a440f45e986983918d0e8a1f30a9b1df04918fc828670606804ac3c09" +dependencies = [ + "virtue", ] [[package]] @@ -386,6 +705,29 @@ dependencies = [ "typenum", ] +[[package]] +name = "blake2" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"46502ad458c9a52b69d4d4d32775c788b7a1b85e8bc9d482d92250fc0e3f8efe" +dependencies = [ + "digest", +] + +[[package]] +name = "blake3" +version = "1.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2468ef7d57b3fb7e16b576e8377cdbde2320c60e1491e961d11da40fc4f02a2d" +dependencies = [ + "arrayref", + "arrayvec", + "cc", + "cfg-if", + "constant_time_eq", + "cpufeatures", +] + [[package]] name = "block-buffer" version = "0.10.4" @@ -395,6 +737,27 @@ dependencies = [ "generic-array", ] +[[package]] +name = "brotli" +version = "8.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4bd8b9603c7aa97359dbd97ecf258968c95f3adddd6db2f7e7a5bef101c84560" +dependencies = [ + "alloc-no-stdlib", + "alloc-stdlib", + "brotli-decompressor", +] + +[[package]] +name = "brotli-decompressor" +version = "5.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "874bb8112abecc98cbd6d81ea4fa7e94fb9449648c93cc89aa40c81c24d7de03" +dependencies = [ + "alloc-no-stdlib", + "alloc-stdlib", +] + [[package]] name = "bumpalo" version = "3.19.1" @@ -404,11 +767,32 @@ dependencies = [ "allocator-api2", ] +[[package]] +name = "bytecount" +version = "0.6.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "175812e0be2bccb6abe50bb8d566126198344f707e304f45c648fd8f2cc0365e" + +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + [[package]] name = "bytes" -version = "1.11.0" +version = "1.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33" + +[[package]] +name = "bzip2" +version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b35204fbdc0b3f4446b89fc1ac2cf84a8a68971995d0bf2e925ec7cd960f9cb3" +checksum = 
"49ecfb22d906f800d4fe833b6282cf4dc1c298f5057ca0b5445e5c209735ca47" +dependencies = [ + "bzip2-sys", +] [[package]] name = "bzip2-sys" @@ -420,6 +804,15 @@ dependencies = [ "pkg-config", ] +[[package]] +name = "camino" +version = "1.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e629a66d692cb9ff1a1c664e41771b3dcaf961985a9774c0eb0bd1b51cf60a48" +dependencies = [ + "serde_core", +] + [[package]] name = "cap-fs-ext" version = "3.4.5" @@ -469,7 +862,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d8144c22e24bbcf26ade86cb6501a0916c46b7e4787abdb0045a467eb1645a1d" dependencies = [ "ambient-authority", - "rand", + "rand 0.8.5", ] [[package]] @@ -498,6 +891,28 @@ dependencies = [ "winx", ] +[[package]] +name = "cargo-platform" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e35af189006b9c0f00a064685c727031e3ed2d8020f7ba284d78cc2671bd36ea" +dependencies = [ + "serde", +] + +[[package]] +name = "cargo_metadata" +version = "0.14.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4acbb09d9ee8e23699b9634375c72795d095bf268439da88562cf9b501f181fa" +dependencies = [ + "camino", + "cargo-platform", + "semver", + "serde", + "serde_json", +] + [[package]] name = "cc" version = "1.2.51" @@ -538,10 +953,22 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "145052bdd345b87320e369255277e3fb5152762ad123a901ef5c262dd38fe8d2" dependencies = [ "iana-time-zone", + "js-sys", "num-traits", + "wasm-bindgen", "windows-link", ] +[[package]] +name = "chrono-tz" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6139a8597ed92cf816dfb33f5dd6cf0bb93a6adc938f11039f371bc5bcd26c3" +dependencies = [ + "chrono", + "phf", +] + [[package]] name = "clang-sys" version = "1.8.1" @@ -581,7 +1008,7 @@ version = "4.5.49" source = "registry+https://github.com/rust-lang/crates.io-index" 
checksum = "2a0b5487afeab2deb2ff4e03a807ad1a03ac532ff5a2cee5d86884440c7f7671" dependencies = [ - "heck", + "heck 0.5.0", "proc-macro2", "quote", "syn", @@ -657,6 +1084,12 @@ dependencies = [ "tiny-keccak", ] +[[package]] +name = "constant_time_eq" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d52eff69cd5e647efe296129160853a42795992097e8af39800e1060caeea9b" + [[package]] name = "core-foundation-sys" version = "0.8.7" @@ -757,7 +1190,7 @@ dependencies = [ "cranelift-assembler-x64-meta", "cranelift-codegen-shared", "cranelift-srcgen", - "heck", + "heck 0.5.0", "pulley-interpreter", ] @@ -904,6 +1337,649 @@ dependencies = [ "typenum", ] +[[package]] +name = "csv" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52cd9d68cf7efc6ddfaaee42e7288d3a99d613d4b50f76ce9827ae0c6e14f938" +dependencies = [ + "csv-core", + "itoa", + "ryu", + "serde_core", +] + +[[package]] +name = "csv-core" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "704a3c26996a80471189265814dbc2c257598b96b8a7feae2d31ace646bb9782" +dependencies = [ + "memchr", +] + +[[package]] +name = "dashmap" +version = "5.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "978747c1d849a7d2ee5e8adc0159961c48fb7e5db2f06af6723b80123bb53856" +dependencies = [ + "cfg-if", + "hashbrown 0.14.5", + "lock_api", + "once_cell", + "parking_lot_core", +] + +[[package]] +name = "dashmap" +version = "6.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5041cc499144891f3790297212f32a74fb938e5136a14943f338ef9e0ae276cf" +dependencies = [ + "cfg-if", + "crossbeam-utils", + "hashbrown 0.14.5", + "lock_api", + "once_cell", + "parking_lot_core", +] + +[[package]] +name = "datafusion" +version = "48.0.1" +source = "git+https://github.com/ArroyoSystems/arrow-datafusion?branch=48.0.1%2Farroyo#916b45f5c28d94765ae4a6393c5e126b2ea55e1c" 
+dependencies = [ + "arrow", + "arrow-ipc 55.2.0", + "arrow-schema 55.2.0", + "async-trait", + "bytes", + "bzip2", + "chrono", + "datafusion-catalog", + "datafusion-catalog-listing", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", + "datafusion-datasource-csv", + "datafusion-datasource-json", + "datafusion-datasource-parquet", + "datafusion-execution", + "datafusion-expr", + "datafusion-expr-common", + "datafusion-functions", + "datafusion-functions-aggregate", + "datafusion-functions-nested", + "datafusion-functions-table", + "datafusion-functions-window", + "datafusion-optimizer", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-optimizer", + "datafusion-physical-plan", + "datafusion-session", + "datafusion-sql", + "flate2", + "futures", + "itertools 0.14.0", + "log", + "object_store", + "parking_lot", + "parquet 55.2.0 (registry+https://github.com/rust-lang/crates.io-index)", + "rand 0.9.2", + "regex", + "sqlparser", + "tempfile", + "tokio", + "url", + "uuid", + "xz2", + "zstd", +] + +[[package]] +name = "datafusion-catalog" +version = "48.0.1" +source = "git+https://github.com/ArroyoSystems/arrow-datafusion?branch=48.0.1%2Farroyo#916b45f5c28d94765ae4a6393c5e126b2ea55e1c" +dependencies = [ + "arrow", + "async-trait", + "dashmap 6.1.0", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-plan", + "datafusion-session", + "datafusion-sql", + "futures", + "itertools 0.14.0", + "log", + "object_store", + "parking_lot", + "tokio", +] + +[[package]] +name = "datafusion-catalog-listing" +version = "48.0.1" +source = "git+https://github.com/ArroyoSystems/arrow-datafusion?branch=48.0.1%2Farroyo#916b45f5c28d94765ae4a6393c5e126b2ea55e1c" +dependencies = [ + "arrow", + "async-trait", + "datafusion-catalog", + "datafusion-common", + "datafusion-datasource", + 
"datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-session", + "futures", + "log", + "object_store", + "tokio", +] + +[[package]] +name = "datafusion-common" +version = "48.0.1" +source = "git+https://github.com/ArroyoSystems/arrow-datafusion?branch=48.0.1%2Farroyo#916b45f5c28d94765ae4a6393c5e126b2ea55e1c" +dependencies = [ + "ahash", + "arrow", + "arrow-ipc 55.2.0", + "base64", + "half", + "hashbrown 0.14.5", + "indexmap 2.12.1", + "libc", + "log", + "object_store", + "parquet 55.2.0 (registry+https://github.com/rust-lang/crates.io-index)", + "paste", + "recursive", + "sqlparser", + "tokio", + "web-time", +] + +[[package]] +name = "datafusion-common-runtime" +version = "48.0.1" +source = "git+https://github.com/ArroyoSystems/arrow-datafusion?branch=48.0.1%2Farroyo#916b45f5c28d94765ae4a6393c5e126b2ea55e1c" +dependencies = [ + "futures", + "log", + "tokio", +] + +[[package]] +name = "datafusion-datasource" +version = "48.0.1" +source = "git+https://github.com/ArroyoSystems/arrow-datafusion?branch=48.0.1%2Farroyo#916b45f5c28d94765ae4a6393c5e126b2ea55e1c" +dependencies = [ + "arrow", + "async-compression", + "async-trait", + "bytes", + "bzip2", + "chrono", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-session", + "flate2", + "futures", + "glob", + "itertools 0.14.0", + "log", + "object_store", + "parquet 55.2.0 (registry+https://github.com/rust-lang/crates.io-index)", + "rand 0.9.2", + "tempfile", + "tokio", + "tokio-util", + "url", + "xz2", + "zstd", +] + +[[package]] +name = "datafusion-datasource-csv" +version = "48.0.1" +source = "git+https://github.com/ArroyoSystems/arrow-datafusion?branch=48.0.1%2Farroyo#916b45f5c28d94765ae4a6393c5e126b2ea55e1c" +dependencies = [ + "arrow", + "async-trait", + 
"bytes", + "datafusion-catalog", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-session", + "futures", + "object_store", + "regex", + "tokio", +] + +[[package]] +name = "datafusion-datasource-json" +version = "48.0.1" +source = "git+https://github.com/ArroyoSystems/arrow-datafusion?branch=48.0.1%2Farroyo#916b45f5c28d94765ae4a6393c5e126b2ea55e1c" +dependencies = [ + "arrow", + "async-trait", + "bytes", + "datafusion-catalog", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-session", + "futures", + "object_store", + "serde_json", + "tokio", +] + +[[package]] +name = "datafusion-datasource-parquet" +version = "48.0.1" +source = "git+https://github.com/ArroyoSystems/arrow-datafusion?branch=48.0.1%2Farroyo#916b45f5c28d94765ae4a6393c5e126b2ea55e1c" +dependencies = [ + "arrow", + "async-trait", + "bytes", + "datafusion-catalog", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-functions-aggregate", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-optimizer", + "datafusion-physical-plan", + "datafusion-session", + "futures", + "itertools 0.14.0", + "log", + "object_store", + "parking_lot", + "parquet 55.2.0 (registry+https://github.com/rust-lang/crates.io-index)", + "rand 0.9.2", + "tokio", +] + +[[package]] +name = "datafusion-doc" +version = "48.0.1" +source = "git+https://github.com/ArroyoSystems/arrow-datafusion?branch=48.0.1%2Farroyo#916b45f5c28d94765ae4a6393c5e126b2ea55e1c" + +[[package]] +name = "datafusion-execution" +version = "48.0.1" +source = 
"git+https://github.com/ArroyoSystems/arrow-datafusion?branch=48.0.1%2Farroyo#916b45f5c28d94765ae4a6393c5e126b2ea55e1c" +dependencies = [ + "arrow", + "dashmap 6.1.0", + "datafusion-common", + "datafusion-expr", + "futures", + "log", + "object_store", + "parking_lot", + "rand 0.9.2", + "tempfile", + "url", +] + +[[package]] +name = "datafusion-expr" +version = "48.0.1" +source = "git+https://github.com/ArroyoSystems/arrow-datafusion?branch=48.0.1%2Farroyo#916b45f5c28d94765ae4a6393c5e126b2ea55e1c" +dependencies = [ + "arrow", + "chrono", + "datafusion-common", + "datafusion-doc", + "datafusion-expr-common", + "datafusion-functions-aggregate-common", + "datafusion-functions-window-common", + "datafusion-physical-expr-common", + "indexmap 2.12.1", + "paste", + "recursive", + "serde_json", + "sqlparser", +] + +[[package]] +name = "datafusion-expr-common" +version = "48.0.1" +source = "git+https://github.com/ArroyoSystems/arrow-datafusion?branch=48.0.1%2Farroyo#916b45f5c28d94765ae4a6393c5e126b2ea55e1c" +dependencies = [ + "arrow", + "datafusion-common", + "indexmap 2.12.1", + "itertools 0.14.0", + "paste", +] + +[[package]] +name = "datafusion-functions" +version = "48.0.1" +source = "git+https://github.com/ArroyoSystems/arrow-datafusion?branch=48.0.1%2Farroyo#916b45f5c28d94765ae4a6393c5e126b2ea55e1c" +dependencies = [ + "arrow", + "arrow-buffer 55.2.0", + "base64", + "blake2", + "blake3", + "chrono", + "datafusion-common", + "datafusion-doc", + "datafusion-execution", + "datafusion-expr", + "datafusion-expr-common", + "datafusion-macros", + "hex", + "itertools 0.14.0", + "log", + "md-5", + "rand 0.9.2", + "regex", + "sha2", + "unicode-segmentation", + "uuid", +] + +[[package]] +name = "datafusion-functions-aggregate" +version = "48.0.1" +source = "git+https://github.com/ArroyoSystems/arrow-datafusion?branch=48.0.1%2Farroyo#916b45f5c28d94765ae4a6393c5e126b2ea55e1c" +dependencies = [ + "ahash", + "arrow", + "datafusion-common", + "datafusion-doc", + 
"datafusion-execution", + "datafusion-expr", + "datafusion-functions-aggregate-common", + "datafusion-macros", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "half", + "log", + "paste", +] + +[[package]] +name = "datafusion-functions-aggregate-common" +version = "48.0.1" +source = "git+https://github.com/ArroyoSystems/arrow-datafusion?branch=48.0.1%2Farroyo#916b45f5c28d94765ae4a6393c5e126b2ea55e1c" +dependencies = [ + "ahash", + "arrow", + "datafusion-common", + "datafusion-expr-common", + "datafusion-physical-expr-common", +] + +[[package]] +name = "datafusion-functions-nested" +version = "48.0.1" +source = "git+https://github.com/ArroyoSystems/arrow-datafusion?branch=48.0.1%2Farroyo#916b45f5c28d94765ae4a6393c5e126b2ea55e1c" +dependencies = [ + "arrow", + "arrow-ord", + "datafusion-common", + "datafusion-doc", + "datafusion-execution", + "datafusion-expr", + "datafusion-functions", + "datafusion-functions-aggregate", + "datafusion-macros", + "datafusion-physical-expr-common", + "itertools 0.14.0", + "log", + "paste", +] + +[[package]] +name = "datafusion-functions-table" +version = "48.0.1" +source = "git+https://github.com/ArroyoSystems/arrow-datafusion?branch=48.0.1%2Farroyo#916b45f5c28d94765ae4a6393c5e126b2ea55e1c" +dependencies = [ + "arrow", + "async-trait", + "datafusion-catalog", + "datafusion-common", + "datafusion-expr", + "datafusion-physical-plan", + "parking_lot", + "paste", +] + +[[package]] +name = "datafusion-functions-window" +version = "48.0.1" +source = "git+https://github.com/ArroyoSystems/arrow-datafusion?branch=48.0.1%2Farroyo#916b45f5c28d94765ae4a6393c5e126b2ea55e1c" +dependencies = [ + "arrow", + "datafusion-common", + "datafusion-doc", + "datafusion-expr", + "datafusion-functions-window-common", + "datafusion-macros", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "log", + "paste", +] + +[[package]] +name = "datafusion-functions-window-common" +version = "48.0.1" +source = 
"git+https://github.com/ArroyoSystems/arrow-datafusion?branch=48.0.1%2Farroyo#916b45f5c28d94765ae4a6393c5e126b2ea55e1c" +dependencies = [ + "datafusion-common", + "datafusion-physical-expr-common", +] + +[[package]] +name = "datafusion-macros" +version = "48.0.1" +source = "git+https://github.com/ArroyoSystems/arrow-datafusion?branch=48.0.1%2Farroyo#916b45f5c28d94765ae4a6393c5e126b2ea55e1c" +dependencies = [ + "datafusion-expr", + "quote", + "syn", +] + +[[package]] +name = "datafusion-optimizer" +version = "48.0.1" +source = "git+https://github.com/ArroyoSystems/arrow-datafusion?branch=48.0.1%2Farroyo#916b45f5c28d94765ae4a6393c5e126b2ea55e1c" +dependencies = [ + "arrow", + "chrono", + "datafusion-common", + "datafusion-expr", + "datafusion-physical-expr", + "indexmap 2.12.1", + "itertools 0.14.0", + "log", + "recursive", + "regex", + "regex-syntax", +] + +[[package]] +name = "datafusion-physical-expr" +version = "48.0.1" +source = "git+https://github.com/ArroyoSystems/arrow-datafusion?branch=48.0.1%2Farroyo#916b45f5c28d94765ae4a6393c5e126b2ea55e1c" +dependencies = [ + "ahash", + "arrow", + "datafusion-common", + "datafusion-expr", + "datafusion-expr-common", + "datafusion-functions-aggregate-common", + "datafusion-physical-expr-common", + "half", + "hashbrown 0.14.5", + "indexmap 2.12.1", + "itertools 0.14.0", + "log", + "paste", + "petgraph 0.8.3", +] + +[[package]] +name = "datafusion-physical-expr-common" +version = "48.0.1" +source = "git+https://github.com/ArroyoSystems/arrow-datafusion?branch=48.0.1%2Farroyo#916b45f5c28d94765ae4a6393c5e126b2ea55e1c" +dependencies = [ + "ahash", + "arrow", + "datafusion-common", + "datafusion-expr-common", + "hashbrown 0.14.5", + "itertools 0.14.0", +] + +[[package]] +name = "datafusion-physical-optimizer" +version = "48.0.1" +source = "git+https://github.com/ArroyoSystems/arrow-datafusion?branch=48.0.1%2Farroyo#916b45f5c28d94765ae4a6393c5e126b2ea55e1c" +dependencies = [ + "arrow", + "datafusion-common", + 
"datafusion-execution", + "datafusion-expr", + "datafusion-expr-common", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "itertools 0.14.0", + "log", + "recursive", +] + +[[package]] +name = "datafusion-physical-plan" +version = "48.0.1" +source = "git+https://github.com/ArroyoSystems/arrow-datafusion?branch=48.0.1%2Farroyo#916b45f5c28d94765ae4a6393c5e126b2ea55e1c" +dependencies = [ + "ahash", + "arrow", + "arrow-ord", + "arrow-schema 55.2.0", + "async-trait", + "chrono", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-execution", + "datafusion-expr", + "datafusion-functions-window-common", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "futures", + "half", + "hashbrown 0.14.5", + "indexmap 2.12.1", + "itertools 0.14.0", + "log", + "parking_lot", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "datafusion-proto" +version = "48.0.1" +source = "git+https://github.com/ArroyoSystems/arrow-datafusion?branch=48.0.1%2Farroyo#916b45f5c28d94765ae4a6393c5e126b2ea55e1c" +dependencies = [ + "arrow", + "chrono", + "datafusion", + "datafusion-common", + "datafusion-expr", + "datafusion-proto-common", + "object_store", + "prost", +] + +[[package]] +name = "datafusion-proto-common" +version = "48.0.1" +source = "git+https://github.com/ArroyoSystems/arrow-datafusion?branch=48.0.1%2Farroyo#916b45f5c28d94765ae4a6393c5e126b2ea55e1c" +dependencies = [ + "arrow", + "datafusion-common", + "prost", +] + +[[package]] +name = "datafusion-session" +version = "48.0.1" +source = "git+https://github.com/ArroyoSystems/arrow-datafusion?branch=48.0.1%2Farroyo#916b45f5c28d94765ae4a6393c5e126b2ea55e1c" +dependencies = [ + "arrow", + "async-trait", + "dashmap 6.1.0", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-plan", + "datafusion-sql", + "futures", + "itertools 0.14.0", + "log", + 
"object_store", + "parking_lot", + "tokio", +] + +[[package]] +name = "datafusion-sql" +version = "48.0.1" +source = "git+https://github.com/ArroyoSystems/arrow-datafusion?branch=48.0.1%2Farroyo#916b45f5c28d94765ae4a6393c5e126b2ea55e1c" +dependencies = [ + "arrow", + "bigdecimal", + "datafusion-common", + "datafusion-expr", + "indexmap 2.12.1", + "log", + "recursive", + "regex", + "sqlparser", +] + [[package]] name = "debugid" version = "0.8.0" @@ -930,6 +2006,7 @@ checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" dependencies = [ "block-buffer", "crypto-common", + "subtle", ] [[package]] @@ -985,6 +2062,12 @@ dependencies = [ "shared_child", ] +[[package]] +name = "dyn-clone" +version = "1.0.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0881ea181b1df73ff77ffaaf9c7544ecc11e82fba9b5f27b262a3c73a332555" + [[package]] name = "either" version = "1.15.0" @@ -1047,6 +2130,15 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "error-chain" +version = "0.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d2f06b9cac1506ece98fe3231e3cc9c4410ec3d5b1f24ae1c8946f0742cdefc" +dependencies = [ + "version_check", +] + [[package]] name = "error-code" version = "3.3.2" @@ -1104,6 +2196,27 @@ dependencies = [ "rustc_version", ] +[[package]] +name = "flatbuffers" +version = "25.12.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35f6839d7b3b98adde531effaf34f0c2badc6f4735d26fe74709d8e513a96ef3" +dependencies = [ + "bitflags 2.10.0", + "rustc_version", +] + +[[package]] +name = "flate2" +version = "1.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "843fba2746e448b37e26a819579957415c8cef339bf08564fe8b7ddbd959573c" +dependencies = [ + "crc32fast", + "miniz_oxide", + "zlib-rs", +] + [[package]] name = "fnv" version = "1.0.7" @@ -1140,28 +2253,51 @@ dependencies = [ name = "function-stream" version = "0.6.0" 
dependencies = [ + "ahash", "anyhow", - "arrow-array", - "arrow-ipc", - "arrow-schema", + "arrow", + "arrow-array 55.2.0", + "arrow-ipc 55.2.0", + "arrow-json 55.2.0 (git+https://github.com/ArroyoSystems/arrow-rs?branch=55.2.0%2Fjson)", + "arrow-schema 55.2.0", "async-trait", "base64", "bincode", - "clap", + "chrono", "crossbeam-channel", + "datafusion", + "datafusion-common", + "datafusion-execution", + "datafusion-expr", + "datafusion-functions", + "datafusion-functions-window", + "datafusion-physical-expr", + "datafusion-physical-plan", + "datafusion-proto", + "futures", + "governor", + "hex", + "itertools 0.14.0", "log", "lru", + "mini-moka", "num_cpus", "parking_lot", - "pest", - "pest_derive", + "parquet 55.2.0 (git+https://github.com/ArroyoSystems/arrow-rs?branch=55.2.0%2Fparquet)", + "petgraph 0.7.1", "proctitle", + "prost", "protocol", + "rand 0.8.5", "rdkafka", "rocksdb", "serde", "serde_json", + "serde_json_path", "serde_yaml", + "sha2", + "sqlparser", + "strum", "thiserror 2.0.17", "tokio", "tokio-stream", @@ -1169,24 +2305,25 @@ dependencies = [ "tracing", "tracing-appender", "tracing-subscriber", + "typify", + "unicase", "uuid", "wasmtime", "wasmtime-wasi", + "xxhash-rust", ] [[package]] name = "function-stream-cli" version = "0.1.0" dependencies = [ - "arrow-array", - "arrow-ipc", - "arrow-schema", + "arrow-array 52.2.0", + "arrow-ipc 52.2.0", + "arrow-schema 52.2.0", "clap", "comfy-table", - "function-stream", "protocol", "rustyline", - "thiserror 2.0.17", "tokio", "tonic", ] @@ -1199,6 +2336,7 @@ checksum = "65bc07b1a8bc7c85c5f2e110c476c7389b4554ba72af57d8445ea63a576b0876" dependencies = [ "futures-channel", "futures-core", + "futures-executor", "futures-io", "futures-sink", "futures-task", @@ -1221,12 +2359,34 @@ version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e" +[[package]] +name = "futures-executor" +version = "0.3.31" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e28d1d997f585e54aebc3f97d39e72338912123a67330d723fdbb564d646c9f" +dependencies = [ + "futures-core", + "futures-task", + "futures-util", +] + [[package]] name = "futures-io" version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6" +[[package]] +name = "futures-macro" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "futures-sink" version = "0.3.31" @@ -1239,6 +2399,12 @@ version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988" +[[package]] +name = "futures-timer" +version = "3.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f288b0a4f20f9a56b5d1da57e2227c661b7b16168e2f72365f57b63326e29b24" + [[package]] name = "futures-util" version = "0.3.31" @@ -1248,6 +2414,7 @@ dependencies = [ "futures-channel", "futures-core", "futures-io", + "futures-macro", "futures-sink", "futures-task", "memchr", @@ -1298,9 +2465,11 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" dependencies = [ "cfg-if", + "js-sys", "libc", "r-efi", "wasip2", + "wasm-bindgen", ] [[package]] @@ -1320,6 +2489,29 @@ version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280" +[[package]] +name = "governor" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be93b4ec2e4710b04d9264c0c7350cdd62a8c20e5e4ac732552ebb8f0debe8eb" +dependencies = [ + "cfg-if", + "dashmap 
6.1.0", + "futures-sink", + "futures-timer", + "futures-util", + "getrandom 0.3.4", + "no-std-compat", + "nonzero_ext", + "parking_lot", + "portable-atomic", + "quanta", + "rand 0.9.2", + "smallvec", + "spinning_top", + "web-time", +] + [[package]] name = "h2" version = "0.4.12" @@ -1348,7 +2540,7 @@ dependencies = [ "cfg-if", "crunchy", "num-traits", - "zerocopy", + "zerocopy 0.8.31", ] [[package]] @@ -1357,11 +2549,24 @@ version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" +[[package]] +name = "hashbrown" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43a3c133739dddd0d2990f9a4bdf8eb4b21ef50e4851ca85ab661199821d510e" +dependencies = [ + "ahash", +] + [[package]] name = "hashbrown" version = "0.14.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" +dependencies = [ + "ahash", + "allocator-api2", +] [[package]] name = "hashbrown" @@ -1379,7 +2584,13 @@ dependencies = [ name = "hashbrown" version = "0.16.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" +checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" + +[[package]] +name = "heck" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" [[package]] name = "heck" @@ -1393,6 +2604,12 @@ version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c" +[[package]] +name = "hex" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" + 
[[package]] name = "home" version = "0.5.12" @@ -1649,7 +2866,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "af1955a75fa080c677d3972822ec4bad316169ab1cfc6c257a942c2265dbe5fe" dependencies = [ "bitmaps", - "rand_core", + "rand_core 0.6.4", "rand_xoshiro", "sized-chunks", "typenum", @@ -1678,6 +2895,21 @@ dependencies = [ "serde_core", ] +[[package]] +name = "integer-encoding" +version = "3.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" + +[[package]] +name = "inventory" +version = "0.3.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "009ae045c87e7082cb72dab0ccd01ae075dd00141ddc108f43a0ea150a9e7227" +dependencies = [ + "rustversion", +] + [[package]] name = "io-extras" version = "0.18.4" @@ -1811,11 +3043,24 @@ version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2cde5de06e8d4c2faabc400238f9ae1c74d5412d03a7bd067645ccbc47070e46" dependencies = [ - "lexical-parse-float", - "lexical-parse-integer", - "lexical-util", - "lexical-write-float", - "lexical-write-integer", + "lexical-parse-float 0.8.5", + "lexical-parse-integer 0.8.6", + "lexical-util 0.8.5", + "lexical-write-float 0.8.5", + "lexical-write-integer 0.8.5", +] + +[[package]] +name = "lexical-core" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d8d125a277f807e55a77304455eb7b1cb52f2b18c143b60e766c120bd64a594" +dependencies = [ + "lexical-parse-float 1.0.6", + "lexical-parse-integer 1.0.6", + "lexical-util 1.0.7", + "lexical-write-float 1.0.6", + "lexical-write-integer 1.0.6", ] [[package]] @@ -1824,21 +3069,40 @@ version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "683b3a5ebd0130b8fb52ba0bdc718cc56815b6a097e28ae5a6997d0ad17dc05f" dependencies = [ - "lexical-parse-integer", - "lexical-util", + 
"lexical-parse-integer 0.8.6", + "lexical-util 0.8.5", "static_assertions", ] +[[package]] +name = "lexical-parse-float" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52a9f232fbd6f550bc0137dcb5f99ab674071ac2d690ac69704593cb4abbea56" +dependencies = [ + "lexical-parse-integer 1.0.6", + "lexical-util 1.0.7", +] + [[package]] name = "lexical-parse-integer" version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6d0994485ed0c312f6d965766754ea177d07f9c00c9b82a5ee62ed5b47945ee9" dependencies = [ - "lexical-util", + "lexical-util 0.8.5", "static_assertions", ] +[[package]] +name = "lexical-parse-integer" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a7a039f8fb9c19c996cd7b2fcce303c1b2874fe1aca544edc85c4a5f8489b34" +dependencies = [ + "lexical-util 1.0.7", +] + [[package]] name = "lexical-util" version = "0.8.5" @@ -1848,27 +3112,52 @@ dependencies = [ "static_assertions", ] +[[package]] +name = "lexical-util" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2604dd126bb14f13fb5d1bd6a66155079cb9fa655b37f875b3a742c705dbed17" + [[package]] name = "lexical-write-float" version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "accabaa1c4581f05a3923d1b4cfd124c329352288b7b9da09e766b0668116862" dependencies = [ - "lexical-util", - "lexical-write-integer", + "lexical-util 0.8.5", + "lexical-write-integer 0.8.5", "static_assertions", ] +[[package]] +name = "lexical-write-float" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50c438c87c013188d415fbabbb1dceb44249ab81664efbd31b14ae55dabb6361" +dependencies = [ + "lexical-util 1.0.7", + "lexical-write-integer 1.0.6", +] + [[package]] name = "lexical-write-integer" version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"e1b6f3d1f4422866b68192d62f77bc5c700bee84f3069f2469d7bc8c77852446" dependencies = [ - "lexical-util", + "lexical-util 0.8.5", "static_assertions", ] +[[package]] +name = "lexical-write-integer" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "409851a618475d2d5796377cad353802345cba92c867d9fbcde9cf4eac4e14df" +dependencies = [ + "lexical-util 1.0.7", +] + [[package]] name = "libc" version = "0.2.179" @@ -1987,6 +3276,26 @@ dependencies = [ "libc", ] +[[package]] +name = "lz4_flex" +version = "0.11.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "373f5eceeeab7925e0c1098212f2fbc4d416adec9d35051a6ab251e824c1854a" +dependencies = [ + "twox-hash", +] + +[[package]] +name = "lzma-sys" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fda04ab3764e6cde78b9974eec4f779acaba7c4e84b36eca3cf77c581b85d27" +dependencies = [ + "cc", + "libc", + "pkg-config", +] + [[package]] name = "mach2" version = "0.4.3" @@ -2017,6 +3326,16 @@ version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4facc753ae494aeb6e3c22f839b158aebd4f9270f55cd3c79906c45476c47ab4" +[[package]] +name = "md-5" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d89e7ee0cfbedfc4da3340218492196241d89eefb6dab27de5df917a6d2e78cf" +dependencies = [ + "cfg-if", + "digest", +] + [[package]] name = "memchr" version = "2.7.6" @@ -2038,12 +3357,37 @@ version = "0.3.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" +[[package]] +name = "mini-moka" +version = "0.10.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c325dfab65f261f386debee8b0969da215b3fa0037e74c8a1234db7ba986d803" +dependencies = [ + "crossbeam-channel", + "crossbeam-utils", + "dashmap 5.5.3", + "skeptic", + "smallvec", + 
"tagptr", + "triomphe", +] + [[package]] name = "minimal-lexical" version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" +[[package]] +name = "miniz_oxide" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316" +dependencies = [ + "adler2", + "simd-adler32", +] + [[package]] name = "mio" version = "1.1.1" @@ -2082,6 +3426,12 @@ dependencies = [ "libc", ] +[[package]] +name = "no-std-compat" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b93853da6d84c2e3c7d730d6473e8817692dd89be387eb01b94d7f108ecb5b8c" + [[package]] name = "nom" version = "7.1.3" @@ -2092,6 +3442,12 @@ dependencies = [ "minimal-lexical", ] +[[package]] +name = "nonzero_ext" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38bf9645c8b145698bb0b18a4637dcacbc421ea49bef2317e4fd8065a387cf21" + [[package]] name = "nu-ansi-term" version = "0.50.3" @@ -2225,6 +3581,30 @@ dependencies = [ "memchr", ] +[[package]] +name = "object_store" +version = "0.12.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fbfbfff40aeccab00ec8a910b57ca8ecf4319b335c542f2edcd19dd25a1e2a00" +dependencies = [ + "async-trait", + "bytes", + "chrono", + "futures", + "http", + "humantime", + "itertools 0.14.0", + "parking_lot", + "percent-encoding", + "thiserror 2.0.17", + "tokio", + "tracing", + "url", + "walkdir", + "wasm-bindgen-futures", + "web-time", +] + [[package]] name = "once_cell" version = "1.21.3" @@ -2249,6 +3629,15 @@ dependencies = [ "vcpkg", ] +[[package]] +name = "ordered-float" +version = "2.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68f19d67e5a2795c94e73e0bb1cc1a7edeb2e28efd39e2e1c9b7a40c1108b11c" +dependencies = [ + "num-traits", +] + 
[[package]] name = "os_pipe" version = "1.2.3" @@ -2282,6 +3671,80 @@ dependencies = [ "windows-link", ] +[[package]] +name = "parquet" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b17da4150748086bd43352bc77372efa9b6e3dbd06a04831d2a98c041c225cfa" +dependencies = [ + "ahash", + "arrow-array 55.2.0", + "arrow-buffer 55.2.0", + "arrow-cast 55.2.0", + "arrow-data 55.2.0", + "arrow-ipc 55.2.0", + "arrow-schema 55.2.0", + "arrow-select 55.2.0", + "base64", + "brotli", + "bytes", + "chrono", + "flate2", + "futures", + "half", + "hashbrown 0.15.5", + "lz4_flex", + "num", + "num-bigint", + "object_store", + "paste", + "seq-macro", + "simdutf8", + "snap", + "thrift", + "tokio", + "twox-hash", + "zstd", +] + +[[package]] +name = "parquet" +version = "55.2.0" +source = "git+https://github.com/ArroyoSystems/arrow-rs?branch=55.2.0%2Fparquet#d1d2dd8edf673cddc79ba6403dc6508263a2ddda" +dependencies = [ + "ahash", + "arrow-array 55.2.0", + "arrow-buffer 55.2.0", + "arrow-cast 55.2.0", + "arrow-data 55.2.0", + "arrow-ipc 55.2.0", + "arrow-schema 55.2.0", + "arrow-select 55.2.0", + "base64", + "brotli", + "bytes", + "chrono", + "flate2", + "half", + "hashbrown 0.15.5", + "lz4_flex", + "num", + "num-bigint", + "paste", + "seq-macro", + "simdutf8", + "snap", + "thrift", + "twox-hash", + "zstd", +] + +[[package]] +name = "paste" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" + [[package]] name = "peeking_take_while" version = "0.1.2" @@ -2295,66 +3758,53 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220" [[package]] -name = "pest" -version = "2.8.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c9eb05c21a464ea704b53158d358a31e6425db2f63a1a7312268b05fe2b75f7" -dependencies = [ - "memchr", - 
"ucd-trie", -] - -[[package]] -name = "pest_derive" -version = "2.8.5" +name = "petgraph" +version = "0.6.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68f9dbced329c441fa79d80472764b1a2c7e57123553b8519b36663a2fb234ed" +checksum = "b4c5cc86750666a3ed20bdaf5ca2a0344f9c67674cae0515bec2da16fbaa47db" dependencies = [ - "pest", - "pest_generator", + "fixedbitset 0.4.2", + "indexmap 2.12.1", ] [[package]] -name = "pest_generator" -version = "2.8.5" +name = "petgraph" +version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3bb96d5051a78f44f43c8f712d8e810adb0ebf923fc9ed2655a7f66f63ba8ee5" +checksum = "3672b37090dbd86368a4145bc067582552b29c27377cad4e0a306c97f9bd7772" dependencies = [ - "pest", - "pest_meta", - "proc-macro2", - "quote", - "syn", + "fixedbitset 0.5.7", + "indexmap 2.12.1", ] [[package]] -name = "pest_meta" -version = "2.8.5" +name = "petgraph" +version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "602113b5b5e8621770cfd490cfd90b9f84ab29bd2b0e49ad83eb6d186cef2365" +checksum = "8701b58ea97060d5e5b155d383a69952a60943f0e6dfe30b04c287beb0b27455" dependencies = [ - "pest", - "sha2", + "fixedbitset 0.5.7", + "hashbrown 0.15.5", + "indexmap 2.12.1", + "serde", ] [[package]] -name = "petgraph" -version = "0.6.5" +name = "phf" +version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4c5cc86750666a3ed20bdaf5ca2a0344f9c67674cae0515bec2da16fbaa47db" +checksum = "913273894cec178f401a31ec4b656318d95473527be05c0752cc41cdc32be8b7" dependencies = [ - "fixedbitset 0.4.2", - "indexmap 2.12.1", + "phf_shared", ] [[package]] -name = "petgraph" -version = "0.7.1" +name = "phf_shared" +version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3672b37090dbd86368a4145bc067582552b29c27377cad4e0a306c97f9bd7772" +checksum = "06005508882fb681fd97892ecff4b7fd0fee13ef1aa569f8695dae7ab9099981" 
dependencies = [ - "fixedbitset 0.5.7", - "indexmap 2.12.1", + "siphasher", ] [[package]] @@ -2395,6 +3845,12 @@ version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" +[[package]] +name = "portable-atomic" +version = "1.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49" + [[package]] name = "postcard" version = "1.1.3" @@ -2428,7 +3884,7 @@ version = "0.2.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" dependencies = [ - "zerocopy", + "zerocopy 0.8.31", ] [[package]] @@ -2486,7 +3942,7 @@ version = "0.13.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "be769465445e8c1474e9c5dac2018218498557af32d9ed057325ec9a41ae81bf" dependencies = [ - "heck", + "heck 0.5.0", "itertools 0.14.0", "log", "multimap", @@ -2529,10 +3985,32 @@ dependencies = [ "env_logger", "log", "prost", + "serde", "tonic", "tonic-build", ] +[[package]] +name = "psm" +version = "0.1.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3852766467df634d74f0b2d7819bf8dc483a0eb2e3b0f50f756f9cfe8b0d18d8" +dependencies = [ + "ar_archive_writer", + "cc", +] + +[[package]] +name = "pulldown-cmark" +version = "0.9.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57206b407293d2bcd3af849ce869d52068623f19e1b5ff8e8778e3309439682b" +dependencies = [ + "bitflags 2.10.0", + "memchr", + "unicase", +] + [[package]] name = "pulley-interpreter" version = "41.0.3" @@ -2556,6 +4034,21 @@ dependencies = [ "syn", ] +[[package]] +name = "quanta" +version = "0.12.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3ab5a9d756f0d97bdc89019bd2e4ea098cf9cde50ee7564dde6b81ccc8f06c7" +dependencies = [ + 
"crossbeam-utils", + "libc", + "once_cell", + "raw-cpuid", + "wasi", + "web-sys", + "winapi", +] + [[package]] name = "quote" version = "1.0.42" @@ -2588,8 +4081,18 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" dependencies = [ "libc", - "rand_chacha", - "rand_core", + "rand_chacha 0.3.1", + "rand_core 0.6.4", +] + +[[package]] +name = "rand" +version = "0.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1" +dependencies = [ + "rand_chacha 0.9.0", + "rand_core 0.9.5", ] [[package]] @@ -2599,7 +4102,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" dependencies = [ "ppv-lite86", - "rand_core", + "rand_core 0.6.4", +] + +[[package]] +name = "rand_chacha" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" +dependencies = [ + "ppv-lite86", + "rand_core 0.9.5", ] [[package]] @@ -2611,13 +4124,31 @@ dependencies = [ "getrandom 0.2.16", ] +[[package]] +name = "rand_core" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76afc826de14238e6e8c374ddcc1fa19e374fd8dd986b0d2af0d02377261d83c" +dependencies = [ + "getrandom 0.3.4", +] + [[package]] name = "rand_xoshiro" version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6f97cdb2a36ed4183de61b2f824cc45c9f1037f28afe0a322e9fff4c108b5aaa" dependencies = [ - "rand_core", + "rand_core 0.6.4", +] + +[[package]] +name = "raw-cpuid" +version = "11.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "498cd0dc59d73224351ee52a95fee0f1a617a2eae0e7d9d720cc622c73a54186" +dependencies = [ + "bitflags 2.10.0", ] [[package]] 
@@ -2673,6 +4204,26 @@ dependencies = [ "sasl2-sys", ] +[[package]] +name = "recursive" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0786a43debb760f491b1bc0269fe5e84155353c67482b9e60d0cfb596054b43e" +dependencies = [ + "recursive-proc-macro-impl", + "stacker", +] + +[[package]] +name = "recursive-proc-macro-impl" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b" +dependencies = [ + "quote", + "syn", +] + [[package]] name = "redox_syscall" version = "0.5.18" @@ -2735,7 +4286,17 @@ dependencies = [ name = "regex-syntax" version = "0.8.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58" +checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58" + +[[package]] +name = "regress" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "82a9ecfa0cb04d0b04dddb99b8ccf4f66bc8dfd23df694b398570bd8ae3a50fb" +dependencies = [ + "hashbrown 0.13.2", + "memchr", +] [[package]] name = "rocksdb" @@ -2844,6 +4405,15 @@ version = "1.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a50f4cf475b65d88e057964e0e9bb1f0aa9bbb2036dc65c64596b42932536984" +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + [[package]] name = "sasl2-sys" version = "0.1.22+2.1.28" @@ -2856,6 +4426,30 @@ dependencies = [ "pkg-config", ] +[[package]] +name = "schemars" +version = "0.8.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fbf2ae1b8bc8e02df939598064d22402220cd5bbcca1c76f7d6a310974d5615" +dependencies = [ + "dyn-clone", + "schemars_derive", 
+ "serde", + "serde_json", +] + +[[package]] +name = "schemars_derive" +version = "0.8.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32e265784ad618884abaea0600a9adf15393368d840e0222d101a072f3f7534d" +dependencies = [ + "proc-macro2", + "quote", + "serde_derive_internals", + "syn", +] + [[package]] name = "scopeguard" version = "1.2.0" @@ -2872,6 +4466,12 @@ dependencies = [ "serde_core", ] +[[package]] +name = "seq-macro" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bc711410fbe7399f390ca1c3b60ad0f53f80e95c5eb935e52268a0e2cd49acc" + [[package]] name = "serde" version = "1.0.228" @@ -2902,6 +4502,17 @@ dependencies = [ "syn", ] +[[package]] +name = "serde_derive_internals" +version = "0.29.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "18d26a20a969b9e3fdf2fc2d9f21eda6c40e2de84c9408bb5d3b05d499aae711" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "serde_json" version = "1.0.148" @@ -2915,6 +4526,56 @@ dependencies = [ "zmij", ] +[[package]] +name = "serde_json_path" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b992cea3194eea663ba99a042d61cea4bd1872da37021af56f6a37e0359b9d33" +dependencies = [ + "inventory", + "nom", + "regex", + "serde", + "serde_json", + "serde_json_path_core", + "serde_json_path_macros", + "thiserror 2.0.17", +] + +[[package]] +name = "serde_json_path_core" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dde67d8dfe7d4967b5a95e247d4148368ddd1e753e500adb34b3ffe40c6bc1bc" +dependencies = [ + "inventory", + "serde", + "serde_json", + "thiserror 2.0.17", +] + +[[package]] +name = "serde_json_path_macros" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "517acfa7f77ddaf5c43d5f119c44a683774e130b4247b7d3210f8924506cfac8" +dependencies = [ + "inventory", 
+ "serde_json_path_core", + "serde_json_path_macros_internal", +] + +[[package]] +name = "serde_json_path_macros_internal" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aafbefbe175fa9bf03ca83ef89beecff7d2a95aaacd5732325b90ac8c3bd7b90" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "serde_spanned" version = "1.0.4" @@ -2924,6 +4585,18 @@ dependencies = [ "serde_core", ] +[[package]] +name = "serde_tokenstream" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7c49585c52c01f13c5c2ebb333f14f6885d76daa768d8a037d28017ec538c69" +dependencies = [ + "proc-macro2", + "quote", + "serde", + "syn", +] + [[package]] name = "serde_yaml" version = "0.9.34+deprecated" @@ -3005,6 +4678,24 @@ dependencies = [ "libc", ] +[[package]] +name = "simd-adler32" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e320a6c5ad31d271ad523dcf3ad13e2767ad8b1cb8f047f75a8aeaf8da139da2" + +[[package]] +name = "simdutf8" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3a9fe34e3e7a50316060351f37187a3f546bce95496156754b601a5fa71b76e" + +[[package]] +name = "siphasher" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2aa850e253778c88a04c3d7323b043aeda9d3e30d5971937c1855769763678e" + [[package]] name = "sized-chunks" version = "0.6.5" @@ -3015,6 +4706,21 @@ dependencies = [ "typenum", ] +[[package]] +name = "skeptic" +version = "0.13.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16d23b015676c90a0f01c197bfdc786c20342c73a0afdda9025adb0bc42940a8" +dependencies = [ + "bytecount", + "cargo_metadata", + "error-chain", + "glob", + "pulldown-cmark", + "tempfile", + "walkdir", +] + [[package]] name = "slab" version = "0.4.11" @@ -3030,6 +4736,12 @@ dependencies = [ "serde", ] +[[package]] +name = "snap" 
+version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b6b67fb9a61334225b5b790716f609cd58395f895b3fe8b328786812a40bc3b" + [[package]] name = "socket2" version = "0.5.10" @@ -3050,12 +4762,54 @@ dependencies = [ "windows-sys 0.60.2", ] +[[package]] +name = "spinning_top" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d96d2d1d716fb500937168cc09353ffdc7a012be8475ac7308e1bdf0e3923300" +dependencies = [ + "lock_api", +] + +[[package]] +name = "sqlparser" +version = "0.55.0" +source = "git+https://github.com/FunctionStream/sqlparser-rs?branch=0.6.0%2Ffunction-sql-parser#9783cf9e3e6b61c763f78bcdd460e85edec22250" +dependencies = [ + "log", + "recursive", + "sqlparser_derive", +] + +[[package]] +name = "sqlparser_derive" +version = "0.3.0" +source = "git+https://github.com/FunctionStream/sqlparser-rs?branch=0.6.0%2Ffunction-sql-parser#9783cf9e3e6b61c763f78bcdd460e85edec22250" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "stable_deref_trait" version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" +[[package]] +name = "stacker" +version = "0.1.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d74a23609d509411d10e2176dc2a4346e3b4aea2e7b1869f19fdedbc71c013" +dependencies = [ + "cc", + "cfg-if", + "libc", + "psm", + "windows-sys 0.59.0", +] + [[package]] name = "static_assertions" version = "1.1.0" @@ -3068,6 +4822,34 @@ version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" +[[package]] +name = "strum" +version = "0.26.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fec0f0aef304996cf250b31b5a10dee7980c85da9d759361292b8bca5a18f06" +dependencies = [ + "strum_macros", +] + 
+[[package]] +name = "strum_macros" +version = "0.26.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be" +dependencies = [ + "heck 0.5.0", + "proc-macro2", + "quote", + "rustversion", + "syn", +] + +[[package]] +name = "subtle" +version = "2.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" + [[package]] name = "syn" version = "2.0.113" @@ -3112,6 +4894,12 @@ dependencies = [ "winx", ] +[[package]] +name = "tagptr" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b2093cf4c8eb1e67749a6762251bc9cd836b6fc171623bd0a9d324d37af2417" + [[package]] name = "target-lexicon" version = "0.13.4" @@ -3189,6 +4977,17 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "thrift" +version = "0.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e54bc85fc7faa8bc175c4bab5b92ba8d9a3ce893d0e9f42cc455c8ab16a9e09" +dependencies = [ + "byteorder", + "integer-encoding", + "ordered-float", +] + [[package]] name = "time" version = "0.3.44" @@ -3397,7 +5196,7 @@ dependencies = [ "indexmap 1.9.3", "pin-project", "pin-project-lite", - "rand", + "rand 0.8.5", "slab", "tokio", "tokio-util", @@ -3519,12 +5318,24 @@ dependencies = [ "tracing-serde", ] +[[package]] +name = "triomphe" +version = "0.1.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dd69c5aa8f924c7519d6372789a74eac5b94fb0f8fcf0d4a97eb0bfc3e785f39" + [[package]] name = "try-lock" version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" +[[package]] +name = "twox-hash" +version = "2.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"9ea3136b675547379c4bd395ca6b938e5ad3c3d20fad76e7fe85f9e0d011419c" + [[package]] name = "typenum" version = "1.19.0" @@ -3532,10 +5343,51 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb" [[package]] -name = "ucd-trie" -version = "0.1.7" +name = "typify" +version = "0.0.13" +source = "git+https://github.com/ArroyoSystems/typify.git?branch=arroyo#d14b6fc016bf9d63618d8b43b4d74a648980737b" +dependencies = [ + "typify-impl", + "typify-macro", +] + +[[package]] +name = "typify-impl" +version = "0.0.13" +source = "git+https://github.com/ArroyoSystems/typify.git?branch=arroyo#d14b6fc016bf9d63618d8b43b4d74a648980737b" +dependencies = [ + "heck 0.4.1", + "log", + "proc-macro2", + "quote", + "regress", + "schemars", + "serde_json", + "syn", + "thiserror 1.0.69", + "unicode-ident", +] + +[[package]] +name = "typify-macro" +version = "0.0.13" +source = "git+https://github.com/ArroyoSystems/typify.git?branch=arroyo#d14b6fc016bf9d63618d8b43b4d74a648980737b" +dependencies = [ + "proc-macro2", + "quote", + "schemars", + "serde", + "serde_json", + "serde_tokenstream", + "syn", + "typify-impl", +] + +[[package]] +name = "unicase" +version = "2.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2896d95c02a80c6d6a5d6e953d479f5ddf2dfdb6a244441010e373ac0fb88971" +checksum = "dbc4bc3a9f746d862c45cb89d705aa10f187bb96c76001afab07a0d35ce60142" [[package]] name = "unicode-ident" @@ -3573,6 +5425,12 @@ version = "0.2.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "673aac59facbab8a9007c7f6108d11f63b603f7cabff99fabf650fea5c32b861" +[[package]] +name = "unty" +version = "0.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d49784317cd0d1ee7ec5c716dd598ec5b4483ea832a2dced265471cc0f690ae" + [[package]] name = "url" version = "2.5.7" @@ -3626,6 +5484,22 @@ version = "0.9.5" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" +[[package]] +name = "virtue" +version = "0.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "051eb1abcf10076295e815102942cc58f9d5e3b4560e46e53c21e8ff6f3af7b1" + +[[package]] +name = "walkdir" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" +dependencies = [ + "same-file", + "winapi-util", +] + [[package]] name = "want" version = "0.3.1" @@ -3663,6 +5537,19 @@ dependencies = [ "wasm-bindgen-shared", ] +[[package]] +name = "wasm-bindgen-futures" +version = "0.4.56" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "836d9622d604feee9e5de25ac10e3ea5f2d65b41eac0d9ce72eb5deae707ce7c" +dependencies = [ + "cfg-if", + "js-sys", + "once_cell", + "wasm-bindgen", + "web-sys", +] + [[package]] name = "wasm-bindgen-macro" version = "0.2.106" @@ -3702,7 +5589,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "af801b6f36459023eaec63fdbaedad2fd5a4ab7dc74ecc110a8b5d375c5775e4" dependencies = [ "anyhow", - "heck", + "heck 0.5.0", "im-rc", "indexmap 2.12.1", "log", @@ -4005,7 +5892,7 @@ checksum = "87acbd416227cdd279565ba49e57cf7f08d112657c3b3f39b70250acdfd094fe" dependencies = [ "anyhow", "bitflags 2.10.0", - "heck", + "heck 0.5.0", "indexmap 2.12.1", "wit-parser", ] @@ -4085,6 +5972,26 @@ dependencies = [ "wast 243.0.0", ] +[[package]] +name = "web-sys" +version = "0.3.83" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b32828d774c412041098d182a8b38b16ea816958e07cf40eec2bc080ae137ac" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "web-time" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + [[package]] name = "wiggle" version = "41.0.3" @@ -4106,7 +6013,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "57f773d51c1696bd7d028aa35c884d9fc58f48d79a1176dfbad6c908de314235" dependencies = [ "anyhow", - "heck", + "heck 0.5.0", "proc-macro2", "quote", "syn", @@ -4461,6 +6368,21 @@ version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9" +[[package]] +name = "xxhash-rust" +version = "0.8.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fdd20c5420375476fbd4394763288da7eb0cc0b8c11deed431a91562af7335d3" + +[[package]] +name = "xz2" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "388c44dc09d76f1536602ead6d325eb532f5c122f17782bd57fb47baeeb767e2" +dependencies = [ + "lzma-sys", +] + [[package]] name = "yoke" version = "0.8.1" @@ -4484,13 +6406,33 @@ dependencies = [ "synstructure", ] +[[package]] +name = "zerocopy" +version = "0.7.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0" +dependencies = [ + "zerocopy-derive 0.7.35", +] + [[package]] name = "zerocopy" version = "0.8.31" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fd74ec98b9250adb3ca554bdde269adf631549f51d8a8f8f0a10b50f1cb298c3" dependencies = [ - "zerocopy-derive", + "zerocopy-derive 0.8.31", +] + +[[package]] +name = "zerocopy-derive" +version = "0.7.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" +dependencies = [ + "proc-macro2", + "quote", + "syn", ] [[package]] @@ -4558,6 +6500,12 @@ dependencies = [ "syn", ] +[[package]] +name = "zlib-rs" 
+version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3be3d40e40a133f9c916ee3f9f4fa2d9d63435b5fbe1bfc6d9dae0aa0ada1513" + [[package]] name = "zmij" version = "1.0.10" diff --git a/Cargo.toml b/Cargo.toml index 4b855aa9..7c49d04c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -35,25 +35,57 @@ tonic = { version = "0.12", features = ["default"] } async-trait = "0.1" num_cpus = "1.0" protocol = { path = "./protocol" } +prost = "0.13" rdkafka = { version = "0.38", features = ["cmake-build", "ssl", "gssapi"] } crossbeam-channel = "0.5" -pest = "2.7" -pest_derive = "2.7" -clap = { version = "4.5", features = ["derive"] } wasmtime = { version = "41.0.3", features = ["component-model", "async"] } base64 = "0.22" wasmtime-wasi = "41.0.3" rocksdb = { version = "0.21", features = ["multi-threaded-cf", "lz4"] } -bincode = "1.3" +bincode = { version = "2", features = ["serde"] } +chrono = "0.4" tokio-stream = "0.1.18" lru = "0.12" parking_lot = "0.12" -arrow-array = "52" -arrow-ipc = "52" -arrow-schema = "52" +arrow = { version = "55", default-features = false } +arrow-array = "55" +arrow-ipc = "55" +arrow-schema = { version = "55", features = ["serde"] } +futures = "0.3" +serde_json_path = "0.7" +xxhash-rust = { version = "0.8", features = ["xxh3"] } proctitle = "0.1" +unicase = "2.7" +petgraph = "0.7" +rand = { version = "0.8", features = ["small_rng"] } +itertools = "0.14" +strum = { version = "0.26", features = ["derive"] } + +typify = { git = 'https://github.com/ArroyoSystems/typify.git', branch = 'arroyo' } +parquet = {git = 'https://github.com/ArroyoSystems/arrow-rs', branch = '55.2.0/parquet'} +arrow-json = {git = 'https://github.com/ArroyoSystems/arrow-rs', branch = '55.2.0/json'} +datafusion = {git = 'https://github.com/ArroyoSystems/arrow-datafusion', branch = '48.0.1/arroyo'} +datafusion-common = {git = 'https://github.com/ArroyoSystems/arrow-datafusion', branch = '48.0.1/arroyo'} +datafusion-execution = {git = 
'https://github.com/ArroyoSystems/arrow-datafusion', branch = '48.0.1/arroyo'} +datafusion-expr = {git = 'https://github.com/ArroyoSystems/arrow-datafusion', branch = '48.0.1/arroyo'} +datafusion-physical-expr = {git = 'https://github.com/ArroyoSystems/arrow-datafusion', branch = '48.0.1/arroyo'} +datafusion-physical-plan = {git = 'https://github.com/ArroyoSystems/arrow-datafusion', branch = '48.0.1/arroyo'} +datafusion-proto = {git = 'https://github.com/ArroyoSystems/arrow-datafusion', branch = '48.0.1/arroyo'} +datafusion-functions = {git = 'https://github.com/ArroyoSystems/arrow-datafusion', branch = '48.0.1/arroyo'} +datafusion-functions-window = {git = 'https://github.com/ArroyoSystems/arrow-datafusion', branch = '48.0.1/arroyo'} + +sqlparser = { git = "https://github.com/FunctionStream/sqlparser-rs", branch = "0.6.0/function-sql-parser" } + +ahash = "0.8" +governor = "0.8.0" +mini-moka = "0.10" +sha2 = "0.10" +hex = "0.4" [features] default = ["incremental-cache", "python"] incremental-cache = ["wasmtime/incremental-cache"] python = [] + +[patch."https://github.com/ArroyoSystems/sqlparser-rs"] +sqlparser = { git = "https://github.com/FunctionStream/sqlparser-rs", branch = "0.6.0/function-sql-parser" } diff --git a/README-zh.md b/README-zh.md index b1d68eac..a15bfcc5 100644 --- a/README-zh.md +++ b/README-zh.md @@ -23,7 +23,7 @@ [中文](README-zh.md) | [English](README.md) -**Function Stream** 是一个基于 Rust 构建的高性能、事件驱动的流处理框架。它提供了一个模块化的运行时,用于编排编译为 **WebAssembly (WASM)** 的 Serverless 风格处理函数,支持使用 **Go、Python 和 Rust** 编写函数。 +**Function Stream** 是一个基于 Rust 构建的高性能、事件驱动的流处理框架。它提供了一个模块化的运行时,用于编排编译为 **WebAssembly (WASM)** 的 Serverless 风格处理函数,支持使用 **Go、Python 和 Rust** 编写函数。同时内置 **Streaming SQL** 引擎,可通过纯声明式 SQL 构建实时数据管道 — 包括时间窗口聚合、多流关联和持续 ETL。 ## 目录 @@ -46,6 +46,7 @@ ## 核心特性 +- **Streaming SQL 引擎**:使用纯 SQL 构建实时管道 — 注册数据源(`CREATE TABLE`)、启动持续计算(`CREATE STREAMING TABLE ... 
AS SELECT`)、管理生命周期(`SHOW` / `DROP`)。支持滚动窗口、滑动窗口、窗口关联等丰富语义。 - **事件驱动的 WASM 运行时**:以接近原生的性能和沙箱隔离的方式执行多语言函数(Go、Python、Rust)。 - **持久化状态管理**:内置支持基于 RocksDB 的状态存储,用于有状态流处理。 - **SQL 驱动的 CLI**:使用类 SQL 命令进行作业管理和流检测的交互式 REPL。 @@ -200,14 +201,16 @@ function-stream-/ ## 文档 -| 文档 | 描述 | -|------------------------------------------------------|---------------| -| [服务端配置与运维指南](docs/server-configuration-zh.md) | 服务端配置与运维操作 | -| [Function 任务配置规范](docs/function-configuration-zh.md) | 任务定义规范 | -| [SQL CLI 交互式管理指南](docs/sql-cli-guide-zh.md) | 交互式管理指南 | -| [Function 管理与开发指南](docs/function-development-zh.md) | 管理与开发指南 | -| [Go SDK 开发与交互指南](docs/Go-SDK/go-sdk-guide-zh.md) | Go SDK 指南 | -| [Python SDK 开发与交互指南](docs/Python-SDK/python-sdk-guide-zh.md) | Python SDK 指南 | +| 文档 | 描述 | +|------------------------------------------------------------------------|--------------------------| +| [Streaming SQL 使用指南](docs/streaming-sql-guide-zh.md) | 声明式 SQL 实时流处理指南 | +| [连接器、格式与类型参考](docs/connectors-and-formats-zh.md) | 支持的 Source/Sink、格式与数据类型 | +| [服务端配置与运维指南](docs/server-configuration-zh.md) | 服务端配置与运维操作 | +| [Function 任务配置规范](docs/function-configuration-zh.md) | 任务定义规范 | +| [SQL CLI 交互式管理指南](docs/sql-cli-guide-zh.md) | 交互式管理指南 | +| [Function 管理与开发指南](docs/function-development-zh.md) | 管理与开发指南 | +| [Go SDK 开发与交互指南](docs/Go-SDK/go-sdk-guide-zh.md) | Go SDK 指南 | +| [Python SDK 开发与交互指南](docs/Python-SDK/python-sdk-guide-zh.md) | Python SDK 指南 | ## 配置 diff --git a/README.md b/README.md index 51a69de1..f74bee33 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,7 @@ [中文](README-zh.md) | [English](README.md) -**Function Stream** is a high-performance, event-driven stream processing framework built in Rust. It provides a modular runtime to orchestrate serverless-style processing functions compiled to **WebAssembly (WASM)**, supporting functions written in **Go, Python, and Rust**. +**Function Stream** is a high-performance, event-driven stream processing framework built in Rust. 
It provides a modular runtime to orchestrate serverless-style processing functions compiled to **WebAssembly (WASM)**, supporting functions written in **Go, Python, and Rust**. It also features a **Streaming SQL** engine that lets you build real-time data pipelines — including time-windowed aggregations, multi-stream joins, and continuous ETL — using pure declarative SQL. ## Table of Contents @@ -46,6 +46,7 @@ ## Key Features +* **Streaming SQL Engine**: Build real-time pipelines with pure SQL — register sources (`CREATE TABLE`), launch continuous computations (`CREATE STREAMING TABLE ... AS SELECT`), and manage lifecycle (`SHOW` / `DROP`). Supports tumbling windows, hopping windows, window joins, and more. * **Event-Driven WASM Runtime**: Executes polyglot functions (Go, Python, Rust) with near-native performance and sandboxed isolation. * **Durable State Management**: Built-in support for RocksDB-backed state stores for stateful stream processing. * **SQL-Powered CLI**: Interactive REPL for job management and stream inspection using SQL-like commands. 
@@ -199,14 +200,16 @@ We provide a robust shell script to manage the server process, capable of handli ## Documentation -| Document | Description | -|----------------------------------------------------------|-----------------------------------| -| [Server Configuration](docs/server-configuration.md) | Server Configuration & Operations | -| [Function Configuration](docs/function-configuration.md) | Task Definition Specification | -| [SQL CLI Guide](docs/sql-cli-guide.md) | Interactive Management Guide | -| [Function Development](docs/function-development.md) | Management & Development Guide | -| [Go SDK Guide](docs/Go-SDK/go-sdk-guide.md) | Go SDK Guide | -| [Python SDK Guide](docs/Python-SDK/python-sdk-guide.md) | Python SDK Guide | +| Document | Description | +|----------------------------------------------------------------|-------------------------------------------------| +| [Streaming SQL Guide](docs/streaming-sql-guide.md) | Declarative SQL for Real-Time Stream Processing | +| [Connectors, Formats & Types](docs/connectors-and-formats.md) | Supported Sources, Sinks, Formats & Data Types | +| [Server Configuration](docs/server-configuration.md) | Server Configuration & Operations | +| [Function Configuration](docs/function-configuration.md) | Task Definition Specification | +| [SQL CLI Guide](docs/sql-cli-guide.md) | Interactive Management Guide | +| [Function Development](docs/function-development.md) | Management & Development Guide | +| [Go SDK Guide](docs/Go-SDK/go-sdk-guide.md) | Go SDK Guide | +| [Python SDK Guide](docs/Python-SDK/python-sdk-guide.md) | Python SDK Guide | ## Configuration diff --git a/cli/cli/Cargo.toml b/cli/cli/Cargo.toml index 72352995..3c05d6b4 100644 --- a/cli/cli/Cargo.toml +++ b/cli/cli/Cargo.toml @@ -12,10 +12,8 @@ arrow-array = "52" arrow-ipc = "52" arrow-schema = "52" comfy-table = "7" -function-stream = { path = "../../" } protocol = { path = "../../protocol" } clap = { version = "4.5", features = ["derive"] } -thiserror = "2" 
tokio = { version = "1.0", features = ["full", "signal"] } tonic = { version = "0.12", features = ["default"] } rustyline = { version = "14.0", features = ["with-dirs"] } diff --git a/cli/cli/src/repl.rs b/cli/cli/src/repl.rs index 7f8087b3..8c3882b2 100644 --- a/cli/cli/src/repl.rs +++ b/cli/cli/src/repl.rs @@ -20,26 +20,62 @@ use comfy_table::{Attribute, Cell, Color, ContentArrangement, Table, TableCompon use protocol::cli::{function_stream_service_client::FunctionStreamServiceClient, SqlRequest}; use rustyline::error::ReadlineError; use rustyline::{Config, DefaultEditor, EditMode}; +use std::fmt; use std::io::{self, Cursor, Write}; use std::sync::Arc; use tokio::sync::Mutex; use tonic::Request; -#[derive(Debug, thiserror::Error)] +/// CLI errors. +/// +/// **Important:** [`tonic::Status`] must not be formatted with `{}` — its [`fmt::Display`] dumps +/// `details` / `metadata` (e.g. HTTP headers). Only [`tonic::Status::message`] is stored in +/// [`ReplError::Rpc`]. +#[derive(Debug)] pub enum ReplError { - #[error("RPC error: {0}")] - Rpc(Box<tonic::Status>), - #[error("Connection failed: {0}")] + Rpc(String), Connection(String), - #[error("Internal error: {0}")] Internal(String), - #[error("IO error: {0}")] - Io(#[from] io::Error), + Io(io::Error), +} + +impl fmt::Display for ReplError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + ReplError::Rpc(s) => f.write_str(s), + ReplError::Connection(s) => f.write_str(s), + ReplError::Internal(s) => write!(f, "Internal error: {s}"), + ReplError::Io(e) => write!(f, "IO error: {e}"), + } + } +} + +impl std::error::Error for ReplError { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + match self { + ReplError::Io(e) => Some(e), + _ => None, + } + } +} + +impl From<io::Error> for ReplError { + fn from(e: io::Error) -> Self { + ReplError::Io(e) + } } impl From<tonic::Status> for ReplError { fn from(s: tonic::Status) -> Self { - ReplError::Rpc(Box::new(s)) + let msg = s.message(); + if msg.is_empty() { +
ReplError::Rpc(format!( + "gRPC {} (server returned no message)", + s.code() + )) + } else { + ReplError::Rpc(msg.to_string()) + } } } diff --git a/conf/config.yaml b/conf/config.yaml index 3f19493d..9d0f625e 100644 --- a/conf/config.yaml +++ b/conf/config.yaml @@ -117,3 +117,10 @@ task_storage: # Maximum bytes for level base in bytes (optional) max_bytes_for_level_base: 268435456 + +# Stream table catalog (SQL: CREATE TABLE connector sources, SHOW TABLES, SHOW CREATE TABLE). +# When persist is true (default), metadata is stored under RocksDB at db_path (default: data/stream_catalog) +# and reloaded after process restart. Set persist: false only for tests/ephemeral nodes. +stream_catalog: + persist: true + # db_path: data/stream_catalog diff --git a/docs/connectors-and-formats-zh.md b/docs/connectors-and-formats-zh.md new file mode 100644 index 00000000..8f25a7dc --- /dev/null +++ b/docs/connectors-and-formats-zh.md @@ -0,0 +1,197 @@ + + +# 连接器、数据格式与 SQL 类型参考 + +[中文](connectors-and-formats-zh.md) | [English](connectors-and-formats.md) + +本文档是 Function Stream Streaming SQL 引擎所支持的连接器(Source / Sink)、序列化格式以及 SQL 数据类型的权威参考。 + +--- + +## 目录 + +- [1. 连接器 (Connector)](#1-连接器-connector) + - [1.1 Kafka Source(数据源)](#11-kafka-source数据源) + - [1.2 Kafka Sink(数据汇)](#12-kafka-sink数据汇) +- [2. 数据格式 (Format)](#2-数据格式-format) +- [3. SQL 数据类型](#3-sql-数据类型) +- [4. 完整示例](#4-完整示例) + +--- + +## 1. 
连接器 (Connector) + +当前 Function Stream 支持 **Kafka** 作为生产可用的连接器,同时可作为数据源(Source)和数据汇(Sink)。 + +### 1.1 Kafka Source(数据源) + +Kafka Source 从一个或多个 Kafka Topic 分区读取消息。在 `CREATE TABLE` 中使用以注册输入流。 + +**必填属性:** + +| 属性 | 说明 | 示例 | +|------|------|------| +| `connector` | 必须为 `kafka`。 | `'kafka'` | +| `topic` | 要消费的 Kafka Topic。 | `'raw_events'` | +| `format` | 消息的序列化格式。 | `'json'` | +| `bootstrap.servers` | Kafka Broker 地址列表,逗号分隔。 | `'broker1:9092,broker2:9092'` | + +**示例:** + +```sql +CREATE TABLE page_views ( + user_id VARCHAR, + page_url VARCHAR, + view_time TIMESTAMP NOT NULL, + WATERMARK FOR view_time AS view_time - INTERVAL '3' SECOND +) WITH ( + 'connector' = 'kafka', + 'topic' = 'page_views', + 'format' = 'json', + 'bootstrap.servers' = 'localhost:9092' +); +``` + +### 1.2 Kafka Sink(数据汇) + +Kafka Sink 将计算结果写入 Kafka Topic。在 `CREATE STREAMING TABLE` 的 `WITH` 子句中配置。 + +**必填属性:** + +| 属性 | 说明 | 示例 | +|------|------|------| +| `connector` | 必须为 `kafka`。 | `'kafka'` | +| `topic` | 要写入的 Kafka Topic。 | `'sink_results'` | +| `format` | 输出消息的序列化格式。 | `'json'` | +| `bootstrap.servers` | Kafka Broker 地址列表。 | `'broker1:9092'` | + +**示例:** + +```sql +CREATE STREAMING TABLE enriched_clicks WITH ( + 'connector' = 'kafka', + 'topic' = 'enriched_clicks', + 'format' = 'json', + 'bootstrap.servers' = 'localhost:9092' +) AS +SELECT click_id, user_id, click_time +FROM ad_clicks; +``` + +--- + +## 2. 数据格式 (Format) + +当前唯一支持的序列化格式是 **JSON**。每条 Kafka 消息应为一个自描述的 JSON 对象,其字段直接映射到 `CREATE TABLE` 中定义的列。 + +在 `WITH` 子句中设置 `'format' = 'json'`(省略时也默认为 JSON)。 + +--- + +## 3. 
SQL 数据类型 + +以下是 `CREATE TABLE` 列定义中支持的 SQL 数据类型: + +### 数值类型 + +| SQL 类型 | 别名 | Arrow 类型 | 说明 | +|----------|------|-----------|------| +| `BOOLEAN` | `BOOL` | Boolean | 布尔值。 | +| `TINYINT` | — | Int8 | 8 位有符号整数。 | +| `SMALLINT` | `INT2` | Int16 | 16 位有符号整数。 | +| `INT` | `INTEGER`、`INT4` | Int32 | 32 位有符号整数。 | +| `BIGINT` | `INT8` | Int64 | 64 位有符号整数。 | +| `TINYINT UNSIGNED` | — | UInt8 | 8 位无符号整数。 | +| `SMALLINT UNSIGNED` | `INT2 UNSIGNED` | UInt16 | 16 位无符号整数。 | +| `INT UNSIGNED` | `INT4 UNSIGNED` | UInt32 | 32 位无符号整数。 | +| `BIGINT UNSIGNED` | `INT8 UNSIGNED` | UInt64 | 64 位无符号整数。 | +| `FLOAT` | `REAL`、`FLOAT4` | Float32 | 32 位 IEEE 754 浮点数。 | +| `DOUBLE` | `DOUBLE PRECISION`、`FLOAT8` | Float64 | 64 位 IEEE 754 浮点数。 | +| `DECIMAL(p, s)` | `NUMERIC(p, s)` | Decimal128 | 定点小数。精度 1–38,标度 <= 精度。 | + +### 字符串与二进制类型 + +| SQL 类型 | 别名 | Arrow 类型 | 说明 | +|----------|------|-----------|------| +| `VARCHAR` | `TEXT`、`STRING`、`CHAR` | Utf8 | 可变长度 UTF-8 字符串。 | +| `BYTEA` | — | Binary | 可变长度字节数组。 | +| `JSON` | — | Utf8(JSON 扩展) | 带有 FunctionStream 扩展元数据的 JSON 类型字符串。 | + +### 日期与时间类型 + +| SQL 类型 | Arrow 类型 | 说明 | +|----------|-----------|------| +| `TIMESTAMP` | Timestamp(Nanosecond) | 不含时区的日期时间(纳秒精度)。 | +| `TIMESTAMP(0)` | Timestamp(Second) | 秒精度。 | +| `TIMESTAMP(3)` | Timestamp(Millisecond) | 毫秒精度。 | +| `TIMESTAMP(6)` | Timestamp(Microsecond) | 微秒精度。 | +| `TIMESTAMP(9)` | Timestamp(Nanosecond) | 纳秒精度(与 `TIMESTAMP` 相同)。 | +| `DATE` | Date32 | 日历日期(年、月、日)。 | +| `DATETIME` | Timestamp(Nanosecond) | `TIMESTAMP` 的别名。 | +| `TIME` | Time64(Nanosecond) | 不含时区的时刻。 | +| `INTERVAL` | Interval(MonthDayNano) | 时间间隔 / 持续时间。 | + +### 复合类型 + +| SQL 类型 | Arrow 类型 | 说明 | +|----------|-----------|------| +| `STRUCT` | Struct | 命名组合字段。 | +| `ARRAY` | List | 相同类型元素的有序列表。也支持 `element_type[]` 语法。 | + +--- + +## 4. 
完整示例 + +以下是一个结合 Kafka Source、Kafka Sink、JSON 格式和多种 SQL 数据类型的完整示例: + +```sql +-- Source:从 Kafka 读取用户活动事件 +CREATE TABLE user_activity ( + event_id VARCHAR, + user_id BIGINT, + action VARCHAR, + amount DECIMAL(10, 2), + tags ARRAY, + event_time TIMESTAMP NOT NULL, + WATERMARK FOR event_time AS event_time - INTERVAL '5' SECOND +) WITH ( + 'connector' = 'kafka', + 'topic' = 'user_activity', + 'format' = 'json', + 'bootstrap.servers' = 'localhost:9092' +); + +-- Sink:1 分钟滚动窗口聚合 +CREATE STREAMING TABLE activity_stats_1m WITH ( + 'connector' = 'kafka', + 'topic' = 'activity_stats_1m', + 'format' = 'json', + 'bootstrap.servers' = 'localhost:9092' +) AS +SELECT + TUMBLE(INTERVAL '1' MINUTE) AS time_window, + action, + COUNT(*) AS event_count, + SUM(amount) AS total_amount +FROM user_activity +GROUP BY 1, action; +``` diff --git a/docs/connectors-and-formats.md b/docs/connectors-and-formats.md new file mode 100644 index 00000000..46d0d964 --- /dev/null +++ b/docs/connectors-and-formats.md @@ -0,0 +1,197 @@ + + +# Connectors, Formats & Data Types + +[中文](connectors-and-formats-zh.md) | [English](connectors-and-formats.md) + +This document is the authoritative reference for connectors (sources & sinks), serialization formats, and SQL data types supported by Function Stream's Streaming SQL engine. + +--- + +## Table of Contents + +- [1. Connectors](#1-connectors) + - [1.1 Kafka (Source)](#11-kafka-source) + - [1.2 Kafka (Sink)](#12-kafka-sink) +- [2. Data Format](#2-data-format) +- [3. SQL Data Types](#3-sql-data-types) +- [4. Full Example](#4-full-example) + +--- + +## 1. Connectors + +Currently Function Stream supports **Kafka** as the production-ready connector for both source (ingestion) and sink (egress). + +### 1.1 Kafka (Source) + +A Kafka source reads records from one or more Kafka topic partitions. Use it in `CREATE TABLE` to register an input stream. 
+ +**Required Properties:** + +| Property | Description | Example | +|----------|-------------|---------| +| `connector` | Must be `kafka`. | `'kafka'` | +| `topic` | Kafka topic to consume from. | `'raw_events'` | +| `format` | Serialization format of messages. | `'json'` | +| `bootstrap.servers` | Comma-separated list of Kafka broker addresses. | `'broker1:9092,broker2:9092'` | + +**Example:** + +```sql +CREATE TABLE page_views ( + user_id VARCHAR, + page_url VARCHAR, + view_time TIMESTAMP NOT NULL, + WATERMARK FOR view_time AS view_time - INTERVAL '3' SECOND +) WITH ( + 'connector' = 'kafka', + 'topic' = 'page_views', + 'format' = 'json', + 'bootstrap.servers' = 'localhost:9092' +); +``` + +### 1.2 Kafka (Sink) + +A Kafka sink writes records into a Kafka topic. It is configured in the `WITH` clause of a `CREATE STREAMING TABLE` statement. + +**Required Properties:** + +| Property | Description | Example | +|----------|-------------|---------| +| `connector` | Must be `kafka`. | `'kafka'` | +| `topic` | Kafka topic to write to. | `'sink_results'` | +| `format` | Serialization format of output messages. | `'json'` | +| `bootstrap.servers` | Comma-separated Kafka broker addresses. | `'broker1:9092'` | + +**Example:** + +```sql +CREATE STREAMING TABLE enriched_clicks WITH ( + 'connector' = 'kafka', + 'topic' = 'enriched_clicks', + 'format' = 'json', + 'bootstrap.servers' = 'localhost:9092' +) AS +SELECT click_id, user_id, click_time +FROM ad_clicks; +``` + +--- + +## 2. Data Format + +Currently the only supported serialization format is **JSON**. Each Kafka message is expected to be a self-describing JSON object whose fields map directly to the columns defined in `CREATE TABLE`. + +Set `'format' = 'json'` in the `WITH` clause (this is also the default when omitted). + +--- + +## 3. 
SQL Data Types + +The following SQL data types are supported in `CREATE TABLE` column definitions: + +### Numeric Types + +| SQL Type | Aliases | Arrow Type | Description | +|----------|---------|------------|-------------| +| `BOOLEAN` | `BOOL` | Boolean | True / false. | +| `TINYINT` | — | Int8 | 8-bit signed integer. | +| `SMALLINT` | `INT2` | Int16 | 16-bit signed integer. | +| `INT` | `INTEGER`, `INT4` | Int32 | 32-bit signed integer. | +| `BIGINT` | `INT8` | Int64 | 64-bit signed integer. | +| `TINYINT UNSIGNED` | — | UInt8 | 8-bit unsigned integer. | +| `SMALLINT UNSIGNED` | `INT2 UNSIGNED` | UInt16 | 16-bit unsigned integer. | +| `INT UNSIGNED` | `INT4 UNSIGNED` | UInt32 | 32-bit unsigned integer. | +| `BIGINT UNSIGNED` | `INT8 UNSIGNED` | UInt64 | 64-bit unsigned integer. | +| `FLOAT` | `REAL`, `FLOAT4` | Float32 | 32-bit IEEE 754 floating point. | +| `DOUBLE` | `DOUBLE PRECISION`, `FLOAT8` | Float64 | 64-bit IEEE 754 floating point. | +| `DECIMAL(p, s)` | `NUMERIC(p, s)` | Decimal128 | Fixed-point decimal. Precision 1–38, scale <= precision. | + +### String & Binary Types + +| SQL Type | Aliases | Arrow Type | Description | +|----------|---------|------------|-------------| +| `VARCHAR` | `TEXT`, `STRING`, `CHAR` | Utf8 | Variable-length UTF-8 string. | +| `BYTEA` | — | Binary | Variable-length byte array. | +| `JSON` | — | Utf8 (JSON extension) | JSON-typed string with FunctionStream extension metadata. | + +### Date & Time Types + +| SQL Type | Arrow Type | Description | +|----------|------------|-------------| +| `TIMESTAMP` | Timestamp(Nanosecond) | Date and time without timezone (nanosecond precision). | +| `TIMESTAMP(0)` | Timestamp(Second) | Second precision. | +| `TIMESTAMP(3)` | Timestamp(Millisecond) | Millisecond precision. | +| `TIMESTAMP(6)` | Timestamp(Microsecond) | Microsecond precision. | +| `TIMESTAMP(9)` | Timestamp(Nanosecond) | Nanosecond precision (same as `TIMESTAMP`). | +| `DATE` | Date32 | Calendar date (year, month, day). 
| +| `DATETIME` | Timestamp(Nanosecond) | Alias for `TIMESTAMP`. | +| `TIME` | Time64(Nanosecond) | Time of day without timezone. | +| `INTERVAL` | Interval(MonthDayNano) | Time duration / interval. | + +### Composite Types + +| SQL Type | Arrow Type | Description | +|----------|------------|-------------| +| `STRUCT` | Struct | Named composite fields. | +| `ARRAY` | List | Ordered list of elements of the same type. Also supports `element_type[]` syntax. | + +--- + +## 4. Full Example + +Below is a complete example combining a Kafka source, a Kafka sink, JSON format, and various SQL data types: + +```sql +-- Source: user activity events from Kafka +CREATE TABLE user_activity ( + event_id VARCHAR, + user_id BIGINT, + action VARCHAR, + amount DECIMAL(10, 2), + tags ARRAY, + event_time TIMESTAMP NOT NULL, + WATERMARK FOR event_time AS event_time - INTERVAL '5' SECOND +) WITH ( + 'connector' = 'kafka', + 'topic' = 'user_activity', + 'format' = 'json', + 'bootstrap.servers' = 'localhost:9092' +); + +-- Sink: 1-minute tumbling window aggregation +CREATE STREAMING TABLE activity_stats_1m WITH ( + 'connector' = 'kafka', + 'topic' = 'activity_stats_1m', + 'format' = 'json', + 'bootstrap.servers' = 'localhost:9092' +) AS +SELECT + TUMBLE(INTERVAL '1' MINUTE) AS time_window, + action, + COUNT(*) AS event_count, + SUM(amount) AS total_amount +FROM user_activity +GROUP BY 1, action; +``` diff --git a/docs/sql-cli-guide-zh.md b/docs/sql-cli-guide-zh.md index 8352dea1..bff05932 100644 --- a/docs/sql-cli-guide-zh.md +++ b/docs/sql-cli-guide-zh.md @@ -129,7 +129,69 @@ DROP FUNCTION go_processor_demo; --- -## 三、REPL 内建辅助指令 +## 三、Streaming SQL:TABLE 与 STREAMING TABLE + +除了 Function 管理之外,CLI 还支持一整套 **Streaming SQL** 命令,用于声明数据源和构建实时管道。完整示例请参阅 [Streaming SQL 使用指南](streaming-sql-guide-zh.md)。 + +### 3.1 注册数据源:CREATE TABLE + +声明外部数据源(如 Kafka),包含 Schema、事件时间和水位线策略。此操作仅创建**静态目录条目**,不消耗计算资源。 + +```sql +CREATE TABLE ad_impressions ( + impression_id VARCHAR, + ad_id BIGINT, + campaign_id 
BIGINT, + user_id VARCHAR, + impression_time TIMESTAMP NOT NULL, + WATERMARK FOR impression_time AS impression_time - INTERVAL '2' SECOND +) WITH ( + 'connector' = 'kafka', + 'topic' = 'raw_ad_impressions', + 'format' = 'json', + 'bootstrap.servers' = 'localhost:9092' +); +``` + +### 3.2 创建流计算管道:CREATE STREAMING TABLE + +使用 CTAS 语法启动持续运行的分布式计算管道。结果以纯追加模式写入目标连接器。 + +```sql +CREATE STREAMING TABLE metric_tumble_impressions_1m WITH ( + 'connector' = 'kafka', + 'topic' = 'sink_impressions_1m', + 'format' = 'json', + 'bootstrap.servers' = 'localhost:9092' +) AS +SELECT + TUMBLE(INTERVAL '1' MINUTE) AS time_window, + campaign_id, + COUNT(*) AS total_impressions +FROM ad_impressions +GROUP BY 1, campaign_id; +``` + +### 3.3 查看与监控 + +| 命令 | 说明 | +|------|------| +| `SHOW TABLES` | 列出所有已注册的数据源表。 | +| `SHOW CREATE TABLE ` | 显示某张表的建表 DDL。 | +| `SHOW STREAMING TABLES` | 列出所有正在运行的流计算管道及其状态。 | +| `SHOW CREATE STREAMING TABLE ` | 查看某条管道的物理执行拓扑图(ASCII 格式)。 | + +### 3.4 销毁流计算管道:DROP STREAMING TABLE + +停止并释放某条流计算管道的所有资源: + +```sql +DROP STREAMING TABLE metric_tumble_impressions_1m; +``` + +--- + +## 四、REPL 内建辅助指令 在 `function-stream>` 提示符下,支持以下便捷指令: @@ -141,7 +203,7 @@ DROP FUNCTION go_processor_demo; --- -## 四、技术约束与注意事项 +## 五、技术约束与注意事项 - **路径隔离**:SQL CLI 本身不负责上传文件。function_path 指向的文件必须预先存在于**服务端机器**的磁盘上。若需远程上传打包,请使用 Python SDK。 - **Python 函数限制**:由于 Python 函数涉及动态依赖分析与代码打包,目前**不支持**通过 SQL CLI 创建,仅能通过 CLI 进行 START / STOP / SHOW 等生命周期管理。 diff --git a/docs/sql-cli-guide.md b/docs/sql-cli-guide.md index be42a37e..a7f36a88 100644 --- a/docs/sql-cli-guide.md +++ b/docs/sql-cli-guide.md @@ -129,7 +129,69 @@ DROP FUNCTION go_processor_demo; --- -## 3. REPL Built-in Auxiliary Commands +## 3. Streaming SQL: TABLE & STREAMING TABLE + +In addition to Function management, the CLI supports a full set of **Streaming SQL** commands for declaring data sources and building real-time pipelines. For a comprehensive guide with examples, see [Streaming SQL Guide](streaming-sql-guide.md). 
+ +### 3.1 Register Data Source: CREATE TABLE + +Declare an external data source (e.g. Kafka) with schema, event time, and watermark strategy. This creates a **static catalog entry** that consumes no compute resources. + +```sql +CREATE TABLE ad_impressions ( + impression_id VARCHAR, + ad_id BIGINT, + campaign_id BIGINT, + user_id VARCHAR, + impression_time TIMESTAMP NOT NULL, + WATERMARK FOR impression_time AS impression_time - INTERVAL '2' SECOND +) WITH ( + 'connector' = 'kafka', + 'topic' = 'raw_ad_impressions', + 'format' = 'json', + 'bootstrap.servers' = 'localhost:9092' +); +``` + +### 3.2 Create Streaming Pipeline: CREATE STREAMING TABLE + +Launch a continuous, distributed compute pipeline using CTAS syntax. Results are written to the target connector in append-only mode. + +```sql +CREATE STREAMING TABLE metric_tumble_impressions_1m WITH ( + 'connector' = 'kafka', + 'topic' = 'sink_impressions_1m', + 'format' = 'json', + 'bootstrap.servers' = 'localhost:9092' +) AS +SELECT + TUMBLE(INTERVAL '1' MINUTE) AS time_window, + campaign_id, + COUNT(*) AS total_impressions +FROM ad_impressions +GROUP BY 1, campaign_id; +``` + +### 3.3 Inspect & Monitor + +| Command | Description | +|---------|-------------| +| `SHOW TABLES` | List all registered source tables. | +| `SHOW CREATE TABLE ` | Display the DDL of a registered table. | +| `SHOW STREAMING TABLES` | List all running streaming pipelines with status. | +| `SHOW CREATE STREAMING TABLE ` | Inspect the physical execution graph (ASCII topology). | + +### 3.4 Destroy Streaming Pipeline: DROP STREAMING TABLE + +Stop and release all resources for a streaming pipeline: + +```sql +DROP STREAMING TABLE metric_tumble_impressions_1m; +``` + +--- + +## 4. REPL Built-in Auxiliary Commands At the `function-stream>` prompt, the following convenient commands are supported: @@ -141,7 +203,7 @@ At the `function-stream>` prompt, the following convenient commands are supporte --- -## 4. Technical Constraints and Notes +## 5. 
Technical Constraints and Notes - **Path Isolation**: The SQL CLI itself is not responsible for uploading files. The file pointed to by function_path must pre-exist on the **Server machine's** disk. If remote upload packaging is required, please use the Python SDK. - **Python Function Limitations**: Since Python functions involve dynamic dependency analysis and code packaging, they are currently **not supported** for creation via SQL CLI; only lifecycle management such as START / STOP / SHOW via CLI is supported. diff --git a/docs/streaming-sql-guide-zh.md b/docs/streaming-sql-guide-zh.md new file mode 100644 index 00000000..98842614 --- /dev/null +++ b/docs/streaming-sql-guide-zh.md @@ -0,0 +1,284 @@ + + +# Streaming SQL 使用指南 + +[中文](streaming-sql-guide-zh.md) | [English](streaming-sql-guide.md) + +Function Stream 提供了声明式 SQL 接口来构建实时流处理管道。通过 Streaming SQL,您可以轻松应对无界数据流(Unbounded Data)的摄取、时间窗口聚合、流式关联以及任务生命周期管理 — 无需编写任何命令式代码。 + +--- + +## 目录 + +- [核心概念](#核心概念) +- [第一部分:注册数据源 (TABLE)](#第一部分注册数据源-table) +- [第二部分:构建实时 Pipeline (STREAMING TABLE)](#第二部分构建实时-pipeline-streaming-table) + - [滚动窗口 (Tumbling Window)](#场景-1滚动窗口-tumbling-window) + - [滑动窗口 (Hopping Window)](#场景-2滑动窗口-hopping-window) + - [会话窗口 (Session Window)](#场景-3会话窗口-session-window) + - [窗口双流关联 (Window Join)](#场景-4窗口双流关联-window-join) +- [第三部分:生命周期与流任务管理](#第三部分生命周期与流任务管理) + - [数据源管理](#1-数据源与元数据管理) + - [Pipeline 监控](#2-实时-pipeline-监控与排障) + - [停止与释放](#3-安全停止与释放资源) +- [SQL 语法速查表](#sql-语法速查表) + +--- + +## 核心概念 + +| 概念 | SQL 关键字 | 说明 | +|------|-----------|------| +| **TABLE** | `CREATE TABLE` | 系统目录(Catalog)中的静态逻辑定义。只记录外部数据源的连接信息、格式和 Schema,不消耗任何计算资源。 | +| **STREAMING TABLE** | `CREATE STREAMING TABLE ... 
AS SELECT` | 持续运行的物理数据管道。引擎会在后台拉起真实的分布式计算任务,并将结果以纯追加(Append-only)方式持续写入外部系统。 | +| **事件时间 (Event Time)** | `WATERMARK FOR ` | 引擎内部用于推进时间进度的时间戳列。 | +| **水位线 (Watermark)** | `AS - INTERVAL ...` | 对迟到乱序数据的容忍度。超过水位线的事件将被丢弃。 | + +> 支持的连接器、数据格式和 SQL 数据类型的完整参考,请参阅 [连接器、格式与类型参考](connectors-and-formats-zh.md)。 + +--- + +## 第一部分:注册数据源 (TABLE) + +`TABLE` 是系统目录(Catalog)中的静态逻辑定义。它只记录外部数据源(如 Kafka)的连接信息、格式和 Schema,**不消耗任何计算资源**。 + +在流计算中,我们必须为输入流指定**事件时间(Event Time)**和**水位线(Watermark)**,以此作为引擎内部推进时间、触发计算的唯一依据。 + +### 示例:注册广告曝光流与点击流 + +```sql +-- 1. 注册广告曝光流 +CREATE TABLE ad_impressions ( + impression_id VARCHAR, + ad_id BIGINT, + campaign_id BIGINT, + user_id VARCHAR, + impression_time TIMESTAMP NOT NULL, + -- 核心:将 impression_time 设为事件时间,并容忍最多 2 秒的数据迟到乱序 + WATERMARK FOR impression_time AS impression_time - INTERVAL '2' SECOND +) WITH ( + 'connector' = 'kafka', + 'topic' = 'raw_ad_impressions', + 'format' = 'json', + 'bootstrap.servers' = 'localhost:9092' +); + +-- 2. 注册广告点击流 +CREATE TABLE ad_clicks ( + click_id VARCHAR, + impression_id VARCHAR, + ad_id BIGINT, + click_time TIMESTAMP NOT NULL, + WATERMARK FOR click_time AS click_time - INTERVAL '5' SECOND +) WITH ( + 'connector' = 'kafka', + 'topic' = 'raw_ad_clicks', + 'format' = 'json', + 'bootstrap.servers' = 'localhost:9092' +); +``` + +**关键要素:** + +- `WATERMARK FOR <列> AS <列> - INTERVAL '' SECOND`:声明事件时间列以及允许的最大乱序延迟。 +- `WITH (...)`:连接器属性 — 类型、Topic、格式、Broker 地址。 + +--- + +## 第二部分:构建实时 Pipeline (STREAMING TABLE) + +`STREAMING TABLE` 是持续运行的物理数据管道。使用 `CREATE STREAMING TABLE ... 
AS SELECT`(CTAS)语法,引擎会在后台拉起真实的分布式计算任务,并将结果以**纯追加(Append-only)**的方式持续写入外部系统。 + +### 场景 1:滚动窗口 (Tumbling Window) + +将时间切分为互不重叠的固定窗口。 + +```sql +-- 需求:每 1 分钟统计一次各广告计划的曝光总量 +CREATE STREAMING TABLE metric_tumble_impressions_1m WITH ( + 'connector' = 'kafka', + 'topic' = 'sink_impressions_1m', + 'format' = 'json', + 'bootstrap.servers' = 'localhost:9092' +) AS +SELECT + TUMBLE(INTERVAL '1' MINUTE) AS time_window, + campaign_id, + COUNT(*) AS total_impressions +FROM ad_impressions +GROUP BY + 1, -- 指代 SELECT 中的第一个字段 (time_window) + campaign_id; +``` + +### 场景 2:滑动窗口 (Hopping Window) + +窗口之间存在重叠,用于平滑趋势监控。 + +```sql +-- 需求:统计过去 10 分钟内各广告的独立访客数(UV),每 1 分钟刷新一次 +CREATE STREAMING TABLE metric_hop_uv_10m WITH ( + 'connector' = 'kafka', + 'topic' = 'sink_uv_10m_step_1m', + 'format' = 'json', + 'bootstrap.servers' = 'localhost:9092' +) AS +SELECT + HOP(INTERVAL '1' MINUTE, INTERVAL '10' MINUTE) AS time_window, + ad_id, + COUNT(DISTINCT user_id) AS unique_users +FROM ad_impressions +GROUP BY + 1, + ad_id; +``` + +### 场景 3:会话窗口 (Session Window) + +会话窗口根据指定的不活跃间隔(Gap)对事件进行分组。如果在 Gap 时间内没有新事件到达,窗口关闭并输出结果。会话窗口非常适合用户行为会话分析。 + +```sql +-- 需求:按用户检测广告曝光会话,30 秒无活动则会话结束 +CREATE STREAMING TABLE metric_session_impressions WITH ( + 'connector' = 'kafka', + 'topic' = 'sink_session_impressions', + 'format' = 'json', + 'bootstrap.servers' = 'localhost:9092' +) AS +SELECT + SESSION(INTERVAL '30' SECOND) AS time_window, + user_id, + COUNT(*) AS impressions_in_session +FROM ad_impressions +GROUP BY + 1, + user_id; +``` + +### 场景 4:窗口双流关联 (Window Join) + +将两条流在完全相同的时间窗口内进行等值关联。因为状态限定在窗口内,水位线越过窗口后状态会自动清理,绝不发生内存泄漏(OOM)。 + +```sql +-- 需求:精确计算 5 分钟级别的点击率 (CTR) +CREATE STREAMING TABLE metric_window_join_ctr_5m WITH ( + 'connector' = 'kafka', + 'topic' = 'sink_ctr_5m', + 'format' = 'json', + 'bootstrap.servers' = 'localhost:9092' +) AS +SELECT + imp.time_window, + imp.ad_id, + imp.impressions, + COALESCE(clk.clicks, 0) AS clicks +FROM ( + SELECT TUMBLE(INTERVAL '5' MINUTE) AS time_window, ad_id, COUNT(*) AS 
impressions + FROM ad_impressions + GROUP BY 1, ad_id +) imp +LEFT JOIN ( + SELECT TUMBLE(INTERVAL '5' MINUTE) AS time_window, ad_id, COUNT(*) AS clicks + FROM ad_clicks + GROUP BY 1, ad_id +) clk +ON imp.time_window = clk.time_window AND imp.ad_id = clk.ad_id; +``` + +> **要求:**关联条件**必须**包含相同的时间窗口列,以确保状态有界。 + +--- + +## 第三部分:生命周期与流任务管理 + +Function Stream 提供了一套完整的运维指令,帮助您管理元数据目录、排查物理执行图以及销毁流计算任务。 + +### 1. 数据源与元数据管理 + +**查看所有已注册的数据源表:** + +```sql +SHOW TABLES; +``` + +列出当前 Catalog 中的所有静态表定义及其对应的 Event Time 与 Watermark 策略。 + +**查看原始建表语句(DDL):** + +```sql +SHOW CREATE TABLE ad_clicks; +``` + +用于导出或排查某张表的底层连接参数(如 Kafka Topic、Format 等)。 + +### 2. 实时 Pipeline 监控与排障 + +**查看当前运行的计算流:** + +```sql +SHOW STREAMING TABLES; +``` + +输出字段说明: + +| 字段 | 说明 | +|------|------| +| `job_id` | 计算流的名称(如 `metric_tumble_impressions_1m`)。 | +| `status` | 当前生命周期状态(如 `RUNNING`、`FAILED`)。 | +| `pipeline_count` | 该任务在底层被拆分成的并行算子链数量。 | +| `uptime` | 任务已持续运行的时长。 | + +**洞察物理执行拓扑 (Execution Graph):** + +```sql +SHOW CREATE STREAMING TABLE metric_tumble_impressions_1m; +``` + +这是 Function Stream 极其强大的排障指令。它会以 ASCII 格式打印出一条 SQL 是如何在底层被转化为真实分布式计算图的: + +- `[Source]` — 从连接器读取数据。 +- `[Operator] ExpressionWatermark` — 注入水位线。 +- `[Shuffle]` — 重分布网络数据。 +- `[Operator] TumblingWindowAggregate` — 执行真正的窗口聚合。 +- `[Sink] ConnectorSink` — 将结果发往目标连接器(如 Kafka)。 + +### 3. 安全停止与释放资源 + +当某个实时大屏活动结束,或者您需要更新计算逻辑时,必须显式销毁旧的流任务: + +```sql +DROP STREAMING TABLE metric_tumble_impressions_1m; +``` + +--- + +## SQL 语法速查表 + +| 语句 | 说明 | +|------|------| +| `CREATE TABLE ... WITH (...)` | 注册外部数据源,声明 Schema、事件时间和水位线。 | +| `CREATE STREAMING TABLE ... WITH (...) 
AS SELECT ...` | 创建并启动持续运行的流计算管道。 | +| `SHOW TABLES` | 列出所有已注册的数据源表。 | +| `SHOW CREATE TABLE ` | 显示某张表的建表 DDL。 | +| `SHOW STREAMING TABLES` | 列出所有正在运行的流计算管道及其状态。 | +| `SHOW CREATE STREAMING TABLE ` | 查看某条管道的物理执行拓扑图。 | +| `DROP STREAMING TABLE ` | 销毁流计算管道并释放所有资源。 | diff --git a/docs/streaming-sql-guide.md b/docs/streaming-sql-guide.md new file mode 100644 index 00000000..cafaf887 --- /dev/null +++ b/docs/streaming-sql-guide.md @@ -0,0 +1,283 @@ + + +# Streaming SQL Guide + +[中文](streaming-sql-guide-zh.md) | [English](streaming-sql-guide.md) + +Function Stream provides a declarative SQL interface for building real-time stream processing pipelines. With Streaming SQL you can ingest unbounded data streams, perform time-windowed aggregations, join multiple streams, and manage pipeline lifecycles — all without writing imperative code. + +--- + +## Table of Contents + +- [Core Concepts](#core-concepts) +- [Part 1: Registering Data Sources (TABLE)](#part-1-registering-data-sources-table) +- [Part 2: Building Real-Time Pipelines (STREAMING TABLE)](#part-2-building-real-time-pipelines-streaming-table) + - [Tumbling Window](#scenario-1-tumbling-window) + - [Hopping Window](#scenario-2-hopping-window) + - [Session Window](#scenario-3-session-window) + - [Window Join](#scenario-4-window-join) +- [Part 3: Lifecycle & Pipeline Management](#part-3-lifecycle--pipeline-management) + - [Data Source Management](#1-data-source--metadata-management) + - [Pipeline Monitoring](#2-real-time-pipeline-monitoring--troubleshooting) + - [Stopping & Cleanup](#3-safe-shutdown--resource-release) +- [SQL Reference Summary](#sql-reference-summary) + +--- + +## Core Concepts + +| Concept | SQL Keyword | Description | +|---------|-------------|-------------| +| **TABLE** | `CREATE TABLE` | A static logical definition in the catalog. Records external source connection info, format, and schema. Consumes no compute resources. | +| **STREAMING TABLE** | `CREATE STREAMING TABLE ... 
AS SELECT` | A physically running data pipeline. The engine allocates distributed compute tasks and continuously writes results to external systems in append-only mode. | +| **Event Time** | `WATERMARK FOR ` | The timestamp column used by the engine to track the progression of time within a stream. | +| **Watermark** | `AS - INTERVAL ...` | A tolerance for late-arriving, out-of-order data. Events arriving after the watermark are dropped. | + +> For the full reference on supported connectors, data formats, and SQL data types, see [Connectors, Formats & Data Types](connectors-and-formats.md). + +--- + +## Part 1: Registering Data Sources (TABLE) + +A `TABLE` is a static logical definition in the system catalog. It only records the connection information (e.g. Kafka broker, topic), data format, and schema of an external data source. **It does not consume any compute resources.** + +In stream processing, you must specify an **Event Time** column and a **Watermark** strategy for each input stream. The engine uses these as the sole basis for advancing time and triggering computations. + +### Example: Register an Ad-Impressions Stream and a Clicks Stream + +```sql +-- 1. Register the ad-impressions stream +CREATE TABLE ad_impressions ( + impression_id VARCHAR, + ad_id BIGINT, + campaign_id BIGINT, + user_id VARCHAR, + impression_time TIMESTAMP NOT NULL, + WATERMARK FOR impression_time AS impression_time - INTERVAL '2' SECOND +) WITH ( + 'connector' = 'kafka', + 'topic' = 'raw_ad_impressions', + 'format' = 'json', + 'bootstrap.servers' = 'localhost:9092' +); + +-- 2. 
Register the ad-clicks stream +CREATE TABLE ad_clicks ( + click_id VARCHAR, + impression_id VARCHAR, + ad_id BIGINT, + click_time TIMESTAMP NOT NULL, + WATERMARK FOR click_time AS click_time - INTERVAL '5' SECOND +) WITH ( + 'connector' = 'kafka', + 'topic' = 'raw_ad_clicks', + 'format' = 'json', + 'bootstrap.servers' = 'localhost:9092' +); +``` + +**Key elements:** + +- `WATERMARK FOR AS - INTERVAL '' SECOND`: declares the event-time column and the maximum tolerated out-of-order delay. +- `WITH (...)`: connector properties — type, topic, format, and broker address. + +--- + +## Part 2: Building Real-Time Pipelines (STREAMING TABLE) + +A `STREAMING TABLE` is a continuously running physical data pipeline. Using the `CREATE STREAMING TABLE ... AS SELECT` (CTAS) syntax, the engine launches real distributed compute tasks in the background and continuously writes results to an external system in **append-only** mode. + +### Scenario 1: Tumbling Window + +Divides time into fixed, non-overlapping windows. + +```sql +-- Count total impressions per campaign every 1 minute +CREATE STREAMING TABLE metric_tumble_impressions_1m WITH ( + 'connector' = 'kafka', + 'topic' = 'sink_impressions_1m', + 'format' = 'json', + 'bootstrap.servers' = 'localhost:9092' +) AS +SELECT + TUMBLE(INTERVAL '1' MINUTE) AS time_window, + campaign_id, + COUNT(*) AS total_impressions +FROM ad_impressions +GROUP BY + 1, + campaign_id; +``` + +### Scenario 2: Hopping Window + +Windows overlap, useful for smoothed trend monitoring. 
+ +```sql +-- Count distinct visitors (UV) per ad over the last 10 minutes, refreshed every 1 minute +CREATE STREAMING TABLE metric_hop_uv_10m WITH ( + 'connector' = 'kafka', + 'topic' = 'sink_uv_10m_step_1m', + 'format' = 'json', + 'bootstrap.servers' = 'localhost:9092' +) AS +SELECT + HOP(INTERVAL '1' MINUTE, INTERVAL '10' MINUTE) AS time_window, + ad_id, + COUNT(DISTINCT user_id) AS unique_users +FROM ad_impressions +GROUP BY + 1, + ad_id; +``` + +### Scenario 3: Session Window + +A session window groups events that arrive within a specified gap of inactivity. If no new event arrives within the gap duration, the window closes and emits results. Session windows are ideal for user-session analysis. + +```sql +-- Detect ad-impression sessions per user; a session ends after 30 seconds of inactivity +CREATE STREAMING TABLE metric_session_impressions WITH ( + 'connector' = 'kafka', + 'topic' = 'sink_session_impressions', + 'format' = 'json', + 'bootstrap.servers' = 'localhost:9092' +) AS +SELECT + SESSION(INTERVAL '30' SECOND) AS time_window, + user_id, + COUNT(*) AS impressions_in_session +FROM ad_impressions +GROUP BY + 1, + user_id; +``` + +### Scenario 4: Window Join + +Join two streams within exactly the same time window. Because state is bounded by the window, memory is automatically reclaimed once the watermark advances past the window boundary — eliminating the risk of OOM. 
+ +```sql +-- Calculate 5-minute click-through rate (CTR) +CREATE STREAMING TABLE metric_window_join_ctr_5m WITH ( + 'connector' = 'kafka', + 'topic' = 'sink_ctr_5m', + 'format' = 'json', + 'bootstrap.servers' = 'localhost:9092' +) AS +SELECT + imp.time_window, + imp.ad_id, + imp.impressions, + COALESCE(clk.clicks, 0) AS clicks +FROM ( + SELECT TUMBLE(INTERVAL '5' MINUTE) AS time_window, ad_id, COUNT(*) AS impressions + FROM ad_impressions + GROUP BY 1, ad_id +) imp +LEFT JOIN ( + SELECT TUMBLE(INTERVAL '5' MINUTE) AS time_window, ad_id, COUNT(*) AS clicks + FROM ad_clicks + GROUP BY 1, ad_id +) clk +ON imp.time_window = clk.time_window AND imp.ad_id = clk.ad_id; +``` + +> **Requirement:** The join condition **must** include the same time-window column to ensure bounded state. + +--- + +## Part 3: Lifecycle & Pipeline Management + +Function Stream provides a complete set of operational commands for managing the metadata catalog, inspecting physical execution graphs, and destroying streaming pipelines. + +### 1. Data Source & Metadata Management + +**List all registered source tables:** + +```sql +SHOW TABLES; +``` + +Lists all static table definitions in the current catalog along with their Event Time and Watermark strategies. + +**Show the original DDL of a table:** + +```sql +SHOW CREATE TABLE ad_clicks; +``` + +Useful for exporting or auditing the underlying connection parameters (Kafka topic, format, etc.). + +### 2. Real-Time Pipeline Monitoring & Troubleshooting + +**List all running streaming pipelines:** + +```sql +SHOW STREAMING TABLES; +``` + +Output columns: + +| Column | Description | +|--------|-------------| +| `job_id` | Pipeline name (e.g. `metric_tumble_impressions_1m`). | +| `status` | Lifecycle state (`RUNNING`, `FAILED`, etc.). | +| `pipeline_count` | Number of parallel operator chains the engine split the job into. | +| `uptime` | How long the pipeline has been running. 
| + +**Inspect the physical execution topology:** + +```sql +SHOW CREATE STREAMING TABLE metric_tumble_impressions_1m; +``` + +This prints an ASCII representation of how the SQL was translated into a distributed execution graph: + +- `[Source]` — reads from the connector. +- `[Operator] ExpressionWatermark` — injects watermarks. +- `[Shuffle]` — redistributes data across the network. +- `[Operator] TumblingWindowAggregate` — performs the actual windowed aggregation. +- `[Sink] ConnectorSink` — writes results to the target connector (e.g. Kafka). + +### 3. Safe Shutdown & Resource Release + +When a campaign ends or you need to update the pipeline logic, explicitly destroy the old streaming pipeline: + +```sql +DROP STREAMING TABLE metric_tumble_impressions_1m; +``` + +--- + +## SQL Reference Summary + +| Statement | Description | +|-----------|-------------| +| `CREATE TABLE ... WITH (...)` | Register an external data source with schema, event time, and watermark. | +| `CREATE STREAMING TABLE ... WITH (...) AS SELECT ...` | Create and launch a continuous streaming pipeline. | +| `SHOW TABLES` | List all registered source tables. | +| `SHOW CREATE TABLE ` | Display the DDL of a registered table. | +| `SHOW STREAMING TABLES` | List all running streaming pipelines with status. | +| `SHOW CREATE STREAMING TABLE ` | Inspect the physical execution graph of a pipeline. | +| `DROP STREAMING TABLE ` | Destroy a streaming pipeline and release all resources. 
| diff --git a/protocol/Cargo.toml b/protocol/Cargo.toml index fde9de52..5fa7d0f0 100644 --- a/protocol/Cargo.toml +++ b/protocol/Cargo.toml @@ -9,6 +9,7 @@ repository = "https://github.com/your-username/rust-function-stream" [dependencies] prost = "0.13" tonic = { version = "0.12", features = ["default"] } +serde = { version = "1.0", features = ["derive"] } log = "0.4" [build-dependencies] diff --git a/protocol/build.rs b/protocol/build.rs index 17e77d30..d3943f53 100644 --- a/protocol/build.rs +++ b/protocol/build.rs @@ -10,54 +10,65 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::path::Path; +use std::path::{Path, PathBuf}; fn main() -> Result<(), Box> { - // Initialize logger for build script env_logger::init(); - // Create output directories in the protocol package directory - // Use CARGO_MANIFEST_DIR to get the package root directory let manifest_dir = std::env::var("CARGO_MANIFEST_DIR")?; let out_dir = Path::new(&manifest_dir).join("generated"); - let proto_file = Path::new(&manifest_dir).join("proto/function_stream.proto"); - - // Note: Cargo doesn't directly support cleaning custom directories via cargo clean. - // The generated directory will be automatically regenerated on each build if needed. - // To clean it manually, use: ./clean.sh or make clean or rm -rf protocol/generated log::info!("Generated code will be placed in: {}", out_dir.display()); - log::info!("Proto file: {}", proto_file.display()); - // Create output directories let cli_dir = out_dir.join("cli"); let service_dir = out_dir.join("service"); std::fs::create_dir_all(&cli_dir)?; std::fs::create_dir_all(&service_dir)?; - log::info!( - "Created output directories: {} and {}", - cli_dir.display(), - service_dir.display() - ); - // Generate code for CLI - only client code needed + // 1. 
function_stream.proto → CLI (client) and Service (server) tonic_build::configure() .out_dir(&cli_dir) - .build_client(true) // Enable client code generation - .build_server(false) // Disable server code generation + .build_client(true) + .build_server(false) .compile_protos(&["proto/function_stream.proto"], &["proto"])?; - // Generate code for Service - only server code needed tonic_build::configure() .out_dir(&service_dir) - .build_client(false) // Disable client code generation - .build_server(true) // Enable server code generation + .build_client(false) + .build_server(true) .compile_protos(&["proto/function_stream.proto"], &["proto"])?; + let api_dir = out_dir.join("api"); + std::fs::create_dir_all(&api_dir)?; + + let descriptor_path = + PathBuf::from(std::env::var("OUT_DIR").unwrap()).join("fs_api_descriptor.bin"); + + tonic_build::configure() + .out_dir(&api_dir) + .protoc_arg("--experimental_allow_proto3_optional") + .file_descriptor_set_path(&descriptor_path) + .type_attribute(".", "#[derive(serde::Serialize, serde::Deserialize)]") + .type_attribute(".", "#[serde(rename_all = \"camelCase\")]") + .build_client(false) + .build_server(false) + .compile_protos(&["proto/fs_api.proto"], &["proto"])?; + + let storage_dir = out_dir.join("storage"); + std::fs::create_dir_all(&storage_dir)?; + tonic_build::configure() + .out_dir(&storage_dir) + .protoc_arg("--experimental_allow_proto3_optional") + .build_client(false) + .build_server(false) + .compile_protos(&["proto/storage.proto"], &["proto"])?; + log::info!("Protocol Buffers code generated successfully"); println!("cargo:rustc-env=PROTO_GEN_DIR={}", out_dir.display()); - println!("cargo:rerun-if-changed={}", proto_file.display()); + println!("cargo:rerun-if-changed=proto/function_stream.proto"); + println!("cargo:rerun-if-changed=proto/fs_api.proto"); + println!("cargo:rerun-if-changed=proto/storage.proto"); Ok(()) } diff --git a/protocol/proto/fs_api.proto b/protocol/proto/fs_api.proto new file mode 100644 index 
00000000..1f578ffe --- /dev/null +++ b/protocol/proto/fs_api.proto @@ -0,0 +1,423 @@ +// Licensed under the Apache License, Version 2.0 +// Adapted from Arroyo's api.proto for FunctionStream + +syntax = "proto3"; +package fs_api; + +// ─────────────────────── Operators ─────────────────────── + +message ConnectorOp { + string connector = 1; + reserved 2; // removed: map config_map + optional FsSchema fs_schema = 3; + string name = 4; + string description = 5; + + oneof config { + KafkaSourceConfig kafka_source = 6; + KafkaSinkConfig kafka_sink = 7; + GenericConnectorConfig generic = 8; + } +} + +// ─────────────────────── Kafka Connector Configs ─────────────────────── + +message KafkaSourceConfig { + string topic = 1; + string bootstrap_servers = 2; + optional string group_id = 3; + optional string group_id_prefix = 4; + KafkaOffsetMode offset_mode = 5; + KafkaReadMode read_mode = 6; + KafkaAuthConfig auth = 7; + map client_configs = 8; + FormatConfig format = 9; + BadDataPolicy bad_data_policy = 10; + uint32 rate_limit_msgs_per_sec = 11; + optional string value_subject = 12; +} + +message KafkaSinkConfig { + string topic = 1; + string bootstrap_servers = 2; + KafkaSinkCommitMode commit_mode = 3; + optional string key_field = 4; + optional string timestamp_field = 5; + KafkaAuthConfig auth = 6; + map client_configs = 7; + FormatConfig format = 8; + optional string value_subject = 9; +} + +// Fallback for non-Kafka connectors that are not yet strongly typed. 
+message GenericConnectorConfig { + map properties = 1; +} + +// ─────────────────────── Kafka Auth ─────────────────────── + +message KafkaAuthConfig { + oneof auth { + KafkaAuthNone none = 1; + KafkaAuthSasl sasl = 2; + KafkaAuthAwsMskIam aws_msk_iam = 3; + } +} + +message KafkaAuthNone {} + +message KafkaAuthSasl { + string protocol = 1; + string mechanism = 2; + string username = 3; + string password = 4; +} + +message KafkaAuthAwsMskIam { + string region = 1; +} + +// ─────────────────────── Format & Data-Quality ─────────────────────── + +message FormatConfig { + oneof format { + JsonFormatConfig json = 1; + RawStringFormatConfig raw_string = 2; + RawBytesFormatConfig raw_bytes = 3; + } +} + +message JsonFormatConfig { + TimestampFormatProto timestamp_format = 1; + DecimalEncodingProto decimal_encoding = 2; + bool include_schema = 3; + bool confluent_schema_registry = 4; + optional uint32 schema_id = 5; + bool debezium = 6; + bool unstructured = 7; +} + +message RawStringFormatConfig {} +message RawBytesFormatConfig {} + +// ─────────────────────── Kafka Enums ─────────────────────── + +enum TimestampFormatProto { + TIMESTAMP_RFC3339 = 0; + TIMESTAMP_UNIX_MILLIS = 1; +} + +enum DecimalEncodingProto { + DECIMAL_NUMBER = 0; + DECIMAL_STRING = 1; + DECIMAL_BYTES = 2; +} + +enum BadDataPolicy { + BAD_DATA_FAIL = 0; + BAD_DATA_DROP = 1; +} + +enum KafkaOffsetMode { + KAFKA_OFFSET_EARLIEST = 0; + KAFKA_OFFSET_LATEST = 1; + KAFKA_OFFSET_GROUP = 2; +} + +enum KafkaReadMode { + KAFKA_READ_DEFAULT = 0; + KAFKA_READ_COMMITTED = 1; + KAFKA_READ_UNCOMMITTED = 2; +} + +enum KafkaSinkCommitMode { + KAFKA_SINK_AT_LEAST_ONCE = 0; + KAFKA_SINK_EXACTLY_ONCE = 1; +} + +message ValuePlanOperator { + string name = 1; + bytes physical_plan = 2; +} + +message KeyPlanOperator { + string name = 1; + bytes physical_plan = 2; + repeated uint64 key_fields = 3; +} + +message ProjectionOperator { + string name = 1; + FsSchema input_schema = 2; + FsSchema output_schema = 3; + repeated bytes 
exprs = 4; +} + +message TumblingWindowAggregateOperator { + string name = 1; + uint64 width_micros = 2; + bytes binning_function = 3; + FsSchema input_schema = 4; + FsSchema partial_schema = 5; + bytes partial_aggregation_plan = 6; + bytes final_aggregation_plan = 7; + optional bytes final_projection = 8; +} + +message SlidingWindowAggregateOperator { + string name = 1; + uint64 width_micros = 2; + uint64 slide_micros = 3; + bytes binning_function = 4; + FsSchema input_schema = 5; + FsSchema partial_schema = 6; + bytes partial_aggregation_plan = 7; + bytes final_aggregation_plan = 8; + bytes final_projection = 9; +} + +message SessionWindowAggregateOperator { + string name = 1; + uint64 gap_micros = 2; + string window_field_name = 3; + uint64 window_index = 4; + FsSchema input_schema = 5; + FsSchema unkeyed_aggregate_schema = 6; + bytes partial_aggregation_plan = 7; + bytes final_aggregation_plan = 8; +} + +message JoinOperator { + string name = 1; + FsSchema left_schema = 2; + FsSchema right_schema = 3; + FsSchema output_schema = 4; + bytes join_plan = 5; + optional uint64 ttl_micros = 6; +} + +message LookupJoinCondition { + bytes left_expr = 1; + string right_key = 2; +} + +message LookupJoinOperator { + FsSchema input_schema = 1; + FsSchema lookup_schema = 2; + ConnectorOp connector = 3; + repeated LookupJoinCondition key_exprs = 4; + JoinType join_type = 5; + optional uint64 ttl_micros = 6; + optional uint64 max_capacity_bytes = 7; +} + +message WindowFunctionOperator { + string name = 1; + FsSchema input_schema = 2; + bytes binning_function = 3; + bytes window_function_plan = 4; +} + +enum AsyncUdfOrdering { + UNORDERED = 0; + ORDERED = 1; +} + +message AsyncUdfOperator { + string name = 1; + DylibUdfConfig udf = 2; + repeated bytes arg_exprs = 3; + repeated bytes final_exprs = 4; + AsyncUdfOrdering ordering = 5; + uint32 max_concurrency = 6; + uint64 timeout_micros = 7; +} + +message UpdatingAggregateOperator { + string name = 1; + FsSchema input_schema = 
2; + FsSchema final_schema = 3; + bytes aggregate_exec = 5; + bytes metadata_expr = 6; + uint64 flush_interval_micros = 7; + uint64 ttl_micros = 8; +} + +// ─────────────────────── Watermark ─────────────────────── + +message ExpressionWatermarkConfig { + uint64 period_micros = 1; + optional uint64 idle_time_micros = 2; + FsSchema input_schema = 3; + bytes expression = 4; +} + +// ─────────────────────── Windows ─────────────────────── + +message Window { + oneof window { + SlidingWindow sliding_window = 2; + TumblingWindow tumbling_window = 3; + InstantWindow instant_window = 4; + SessionWindow session_window = 5; + } +} + +message SlidingWindow { + uint64 size_micros = 1; + uint64 slide_micros = 2; +} + +message TumblingWindow { + uint64 size_micros = 1; +} + +message InstantWindow {} + +message SessionWindow { + uint64 gap_micros = 1; +} + +// ─────────────────────── Enums ─────────────────────── + +enum JoinType { + INNER = 0; + LEFT = 1; + RIGHT = 2; + FULL = 3; +} + +enum OffsetMode { + EARLIEST = 0; + LATEST = 1; +} + +enum EdgeType { + UNUSED = 0; + FORWARD = 1; + SHUFFLE = 2; + LEFT_JOIN = 3; + RIGHT_JOIN = 4; +} + +// ─────────────────── Physical Extension Nodes ─────────────────── + +message MemExecNode { + string table_name = 1; + string schema = 2; // json-encoded +} + +message UnnestExecNode { + string schema = 1; // json-encoded +} + +message DebeziumDecodeNode { + string schema = 1; // json-encoded + repeated uint64 primary_keys = 2; +} + +message DebeziumEncodeNode { + string schema = 1; // json-encoded +} + +message FsExecNode { + oneof node { + MemExecNode mem_exec = 1; + UnnestExecNode unnest_exec = 2; + DebeziumDecodeNode debezium_decode = 3; + DebeziumEncodeNode debezium_encode = 4; + } +} + +// ─────────────────── Checkpoints ─────────────────── + +enum TaskCheckpointEventType { + ALIGNMENT_STARTED = 0; + CHECKPOINT_STARTED = 1; + CHECKPOINT_OPERATOR_SETUP_FINISHED = 2; + CHECKPOINT_SYNC_FINISHED = 3; + CHECKPOINT_PRE_COMMIT = 4; +} + 
+message TaskCheckpointEvent { + uint64 time = 1; + TaskCheckpointEventType event_type = 2; +} + +message TaskCheckpointDetail { + uint32 subtask_index = 1; + uint64 start_time = 2; + optional uint64 finish_time = 3; + optional uint64 bytes = 4; + repeated TaskCheckpointEvent events = 5; +} + +message OperatorCheckpointDetail { + string operator_id = 1; + uint64 start_time = 2; + optional uint64 finish_time = 3; + bool has_state = 4; + optional uint64 started_metadata_write = 6; + map tasks = 5; +} + +// ─────────────────── UDF Config ─────────────────── + +message DylibUdfConfig { + string dylib_path = 1; + repeated bytes arg_types = 2; + bytes return_type = 3; + bool aggregate = 4; + bool is_async = 5; +} + +message PythonUdfConfig { + string name = 1; + repeated bytes arg_types = 2; + bytes return_type = 3; + string definition = 4; +} + +message FsProgramConfig { + map udf_dylibs = 1; + map python_udfs = 2; +} + +// ─────────────────── Arrow Program ─────────────────── + +message FsProgram { + repeated FsNode nodes = 1; + repeated FsEdge edges = 2; + FsProgramConfig program_config = 3; +} + +message FsSchema { + string arrow_schema = 1; // json-encoded Arrow Schema + uint32 timestamp_index = 2; + repeated uint32 key_indices = 3; + bool has_keys = 4; + repeated uint32 routing_key_indices = 5; + bool has_routing_keys = 6; +} + +message ChainedOperator { + string operator_id = 1; + string operator_name = 2; + bytes operator_config = 3; +} + +message FsNode { + int32 node_index = 1; + uint32 node_id = 2; + uint32 parallelism = 3; + string description = 4; + repeated ChainedOperator operators = 5; + repeated FsSchema edges = 6; +} + +message FsEdge { + int32 source = 1; + int32 target = 2; + FsSchema schema = 4; + EdgeType edge_type = 5; +} diff --git a/protocol/proto/storage.proto b/protocol/proto/storage.proto new file mode 100644 index 00000000..f107d472 --- /dev/null +++ b/protocol/proto/storage.proto @@ -0,0 +1,107 @@ +// Licensed under the Apache License, 
Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// +// All durable / persisted payloads for FunctionStream (single source of truth for storage wire format). +// - Stream table catalog (MetaStore KV) +// - Task rows (RocksDB task_meta / task_payload; values may be prefixed — see runtime codec) + +syntax = "proto3"; + +package function_stream.storage; + +// ============================================================================= +// Catalog table storage (coordinator SQL catalog) +// ============================================================================= + +// Top-level persisted record for one catalog table. +message TableDefinition { + string table_name = 1; + int64 updated_at_millis = 2; + oneof table_type { + // Connector-backed ingestion/egress table definition. + CatalogSourceTable connector_table = 3; + // Connector-backed lookup table definition. + CatalogSourceTable lookup_table = 5; + } +} + +// Shared connector-backed table payload for connector/lookup entries. +message CatalogSourceTable { + bytes arrow_schema_ipc = 1; + optional string event_time_field = 2; + optional string watermark_field = 3; + // Original CREATE TABLE ... WITH ('k'='v', ...) pairs — single source of truth. + map with_options = 4; + // Canonical connector identifier (e.g. kafka, postgres-cdc). + string connector = 5; + reserved 6; // removed: string opaque_config (JSON blob no longer needed) + // Human-readable note from DDL (ConnectorOp.description). + string description = 7; +} + +// ============================================================================= +// Streaming table storage (CREATE STREAMING TABLE persistence) +// ============================================================================= + +// Persisted record for one streaming table (CREATE STREAMING TABLE). +// On restart, the engine re-submits each record to JobManager to resume the pipeline. 
+message StreamingTableDefinition { + string table_name = 1; + int64 created_at_millis = 2; + // Serialized function_stream.api.FsProgram — the full execution graph. + // Stored as opaque bytes to avoid coupling storage schema with runtime API protos. + bytes fs_program_bytes = 3; + string comment = 4; +} + +// ============================================================================= +// Task storage (RocksDB metadata + module payload) +// ============================================================================= + +// Lifecycle state persisted for task recovery. New enum values MUST be appended +// with new numbers (never renumber) for forward compatibility. +enum ComponentStateKind { + COMPONENT_STATE_KIND_UNSPECIFIED = 0; + UNINITIALIZED = 1; + INITIALIZED = 2; + STARTING = 3; + RUNNING = 4; + CHECKPOINTING = 5; + STOPPING = 6; + STOPPED = 7; + CLOSING = 8; + CLOSED = 9; + ERROR = 10; +} + +message ComponentStateProto { + ComponentStateKind kind = 1; + // Set when kind == ERROR + string error_message = 2; +} + +// Stored in CF task_meta (after magic prefix FSP1). +message TaskMetadataProto { + string task_type = 1; + ComponentStateProto state = 2; + uint64 created_at = 3; + optional uint64 checkpoint_id = 4; +} + +message TaskModuleWasm { + bytes wasm_binary = 1; +} + +message TaskModulePython { + string class_name = 1; + string module_path = 2; + optional bytes embedded_code = 3; +} + +// Stored in CF task_payload (after magic prefix FSP1). +message TaskModulePayloadProto { + oneof payload { + TaskModuleWasm wasm = 1; + TaskModulePython python = 2; + } +} diff --git a/protocol/src/lib.rs b/protocol/src/lib.rs index b0c6da06..d1bdfff9 100644 --- a/protocol/src/lib.rs +++ b/protocol/src/lib.rs @@ -10,25 +10,39 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-// Protocol Buffers protocol definitions for function stream -// This module exports the generated Protocol Buffers code +// ─────────────── FunctionStream Service (original) ─────────────── -// CLI module - exports client code #[path = "../generated/cli/function_stream.rs"] pub mod cli; -// Service module - exports server code #[path = "../generated/service/function_stream.rs"] pub mod service; -// Re-export commonly used types from both modules -// Data structures are the same in both, so we can re-export from either pub use cli::function_stream_service_client; - -// Re-export client-specific types pub use cli::function_stream_service_client::FunctionStreamServiceClient; - -// Re-export server-specific types pub use service::function_stream_service_server::{ FunctionStreamService, FunctionStreamServiceServer, }; + +// ─────────────── Streaming Pipeline API (fs_api.proto) ─────────────── + +pub mod grpc { + /// Serde-annotated API types for streaming operators, schemas, programs. + #[allow(clippy::all)] + pub mod api { + include!("../generated/api/fs_api.rs"); + } +} + +/// File descriptor set for fs_api.proto (for gRPC reflection / REST gateway). +pub const FS_API_FILE_DESCRIPTOR_SET: &[u8] = + tonic::include_file_descriptor_set!("fs_api_descriptor"); + +// ─────────────── Durable storage (storage.proto: catalog + task rows) ─────────────── + +/// Prost types for persisted stream catalog and task storage (`proto/storage.proto`). +pub mod storage { + #![allow(clippy::all)] + #![allow(warnings)] + include!("../generated/storage/function_stream.storage.rs"); +} diff --git a/src/common/fs_schema.rs b/src/common/fs_schema.rs new file mode 100644 index 00000000..4229b957 --- /dev/null +++ b/src/common/fs_schema.rs @@ -0,0 +1,456 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! FunctionStream table/stream schema: Arrow [`Schema`] plus timestamp index and optional key columns. +//! +//! [`Schema`]: datafusion::arrow::datatypes::Schema + +use datafusion::arrow::array::builder::{ArrayBuilder, make_builder}; +use datafusion::arrow::array::{RecordBatch, TimestampNanosecondArray}; +use datafusion::arrow::datatypes::{DataType, Field, FieldRef, Schema, SchemaBuilder, TimeUnit}; +use datafusion::arrow::error::ArrowError; +use datafusion::common::{DataFusionError, Result as DFResult}; +use std::sync::Arc; +use std::time::SystemTime; +use arrow::compute::{filter_record_batch, lexsort_to_indices, partition, take, SortColumn}; +use arrow::compute::kernels::cmp::gt_eq; +use arrow::compute::kernels::numeric::div; +use arrow::row::SortField; +use arrow_array::{PrimitiveArray, UInt64Array}; +use arrow_array::types::UInt64Type; +use protocol::grpc::api; +use super::{to_nanos, TIMESTAMP_FIELD}; +use std::ops::Range; +use crate::common::converter::Converter; + +pub type FsSchemaRef = Arc; + +#[derive(Debug, Clone, Eq, PartialEq, Hash)] +pub struct FsSchema { + pub schema: Arc, + pub timestamp_index: usize, + key_indices: Option>, + /// If defined, these indices are used for routing (i.e., which subtask gets which piece of data) + routing_key_indices: Option>, +} + +impl TryFrom for FsSchema { + type Error = DataFusionError; + fn try_from(schema_proto: api::FsSchema) -> Result { + let schema: Schema = serde_json::from_str(&schema_proto.arrow_schema) + .map_err(|e| DataFusionError::Plan(format!("Invalid arrow schema: {e}")))?; + let 
timestamp_index = schema_proto.timestamp_index as usize; + + let key_indices = schema_proto.has_keys.then(|| { + schema_proto + .key_indices + .into_iter() + .map(|index| index as usize) + .collect() + }); + + let routing_key_indices = schema_proto.has_routing_keys.then(|| { + schema_proto + .routing_key_indices + .into_iter() + .map(|index| index as usize) + .collect() + }); + + Ok(Self { + schema: Arc::new(schema), + timestamp_index, + key_indices, + routing_key_indices, + }) + } +} + +impl From for api::FsSchema { + fn from(schema: FsSchema) -> Self { + let arrow_schema = serde_json::to_string(schema.schema.as_ref()).unwrap(); + let timestamp_index = schema.timestamp_index as u32; + + let has_keys = schema.key_indices.is_some(); + let key_indices = schema + .key_indices + .map(|ks| ks.into_iter().map(|index| index as u32).collect()) + .unwrap_or_default(); + + let has_routing_keys = schema.routing_key_indices.is_some(); + let routing_key_indices = schema + .routing_key_indices + .map(|ks| ks.into_iter().map(|index| index as u32).collect()) + .unwrap_or_default(); + + Self { + arrow_schema, + timestamp_index, + key_indices, + has_keys, + routing_key_indices, + has_routing_keys, + } + } +} + +impl FsSchema { + pub fn new( + schema: Arc, + timestamp_index: usize, + key_indices: Option>, + routing_key_indices: Option>, + ) -> Self { + Self { + schema, + timestamp_index, + key_indices, + routing_key_indices, + } + } + pub fn new_unkeyed(schema: Arc, timestamp_index: usize) -> Self { + Self { + schema, + timestamp_index, + key_indices: None, + routing_key_indices: None, + } + } + pub fn new_keyed(schema: Arc, timestamp_index: usize, key_indices: Vec) -> Self { + Self { + schema, + timestamp_index, + key_indices: Some(key_indices), + routing_key_indices: None, + } + } + + pub fn from_fields(mut fields: Vec) -> Self { + if !fields.iter().any(|f| f.name() == TIMESTAMP_FIELD) { + fields.push(Field::new( + TIMESTAMP_FIELD, + DataType::Timestamp(TimeUnit::Nanosecond, None), 
+ false, + )); + } + + Self::from_schema_keys(Arc::new(Schema::new(fields)), vec![]).unwrap() + } + + pub fn from_schema_unkeyed(schema: Arc) -> DFResult { + let timestamp_index = schema + .column_with_name(TIMESTAMP_FIELD) + .ok_or_else(|| { + DataFusionError::Plan(format!( + "no {TIMESTAMP_FIELD} field in schema, schema is {schema:?}" + )) + })? + .0; + + Ok(Self { + schema, + timestamp_index, + key_indices: None, + routing_key_indices: None, + }) + } + + pub fn from_schema_keys(schema: Arc, key_indices: Vec) -> DFResult { + let timestamp_index = schema + .column_with_name(TIMESTAMP_FIELD) + .ok_or_else(|| { + DataFusionError::Plan(format!( + "no {TIMESTAMP_FIELD} field in schema, schema is {schema:?}" + )) + })? + .0; + + Ok(Self { + schema, + timestamp_index, + key_indices: Some(key_indices), + routing_key_indices: None, + }) + } + + pub fn schema_without_timestamp(&self) -> Schema { + let mut builder = SchemaBuilder::from(self.schema.fields()); + builder.remove(self.timestamp_index); + builder.finish() + } + + pub fn remove_timestamp_column(&self, batch: &mut RecordBatch) { + batch.remove_column(self.timestamp_index); + } + + pub fn builders(&self) -> Vec> { + self.schema + .fields + .iter() + .map(|f| make_builder(f.data_type(), 8)) + .collect() + } + + pub fn timestamp_column<'a>(&self, batch: &'a RecordBatch) -> &'a TimestampNanosecondArray { + batch + .column(self.timestamp_index) + .as_any() + .downcast_ref::() + .unwrap() + } + + pub fn has_routing_keys(&self) -> bool { + self.routing_keys().map(|k| !k.is_empty()).unwrap_or(false) + } + + pub fn routing_keys(&self) -> Option<&Vec> { + self.routing_key_indices + .as_ref() + .or(self.key_indices.as_ref()) + } + + pub fn storage_keys(&self) -> Option<&Vec> { + self.key_indices.as_ref() + } + + pub fn filter_by_time( + &self, + batch: RecordBatch, + cutoff: Option, + ) -> Result { + let Some(cutoff) = cutoff else { + // no watermark, so we just return the same batch. 
+ return Ok(batch); + }; + // filter out late data + let timestamp_column = batch + .column(self.timestamp_index) + .as_any() + .downcast_ref::() + .ok_or_else(|| ArrowError::CastError( + format!("failed to downcast column {} of {:?} to timestamp. Schema is supposed to be {:?}", + self.timestamp_index, batch, self.schema)))?; + let cutoff_scalar = TimestampNanosecondArray::new_scalar(to_nanos(cutoff) as i64); + let on_time = gt_eq(timestamp_column, &cutoff_scalar)?; + filter_record_batch(&batch, &on_time) + } + + pub fn sort_columns(&self, batch: &RecordBatch, with_timestamp: bool) -> Vec { + let mut columns = vec![]; + if let Some(keys) = &self.key_indices { + columns.extend(keys.iter().map(|index| SortColumn { + values: batch.column(*index).clone(), + options: None, + })); + } + if with_timestamp { + columns.push(SortColumn { + values: batch.column(self.timestamp_index).clone(), + options: None, + }); + } + columns + } + + pub fn sort_fields(&self, with_timestamp: bool) -> Vec { + let mut sort_fields = vec![]; + if let Some(keys) = &self.key_indices { + sort_fields.extend(keys.iter()); + } + if with_timestamp { + sort_fields.push(self.timestamp_index); + } + self.sort_fields_by_indices(&sort_fields) + } + + fn sort_fields_by_indices(&self, indices: &[usize]) -> Vec { + indices + .iter() + .map(|index| SortField::new(self.schema.field(*index).data_type().clone())) + .collect() + } + + pub fn converter(&self, with_timestamp: bool) -> Result { + Converter::new(self.sort_fields(with_timestamp)) + } + + pub fn value_converter( + &self, + with_timestamp: bool, + generation_index: usize, + ) -> Result { + match &self.key_indices { + None => { + let mut indices = (0..self.schema.fields().len()).collect::>(); + indices.remove(generation_index); + if !with_timestamp { + indices.remove(self.timestamp_index); + } + Converter::new(self.sort_fields_by_indices(&indices)) + } + Some(keys) => { + let indices = (0..self.schema.fields().len()) + .filter(|index| { + 
!keys.contains(index) + && (with_timestamp || *index != self.timestamp_index) + && *index != generation_index + }) + .collect::>(); + Converter::new(self.sort_fields_by_indices(&indices)) + } + } + } + + pub fn value_indices(&self, with_timestamp: bool) -> Vec { + let field_count = self.schema.fields().len(); + match &self.key_indices { + None => { + let mut indices = (0..field_count).collect::>(); + + if !with_timestamp { + indices.remove(self.timestamp_index); + } + indices + } + Some(keys) => (0..field_count) + .filter(|index| { + !keys.contains(index) && (with_timestamp || *index != self.timestamp_index) + }) + .collect::>(), + } + } + + pub fn sort( + &self, + batch: RecordBatch, + with_timestamp: bool, + ) -> Result { + if self.key_indices.is_none() && !with_timestamp { + return Ok(batch); + } + let sort_columns = self.sort_columns(&batch, with_timestamp); + let sort_indices = lexsort_to_indices(&sort_columns, None).expect("should be able to sort"); + let columns = batch + .columns() + .iter() + .map(|c| take(c, &sort_indices, None).unwrap()) + .collect(); + + RecordBatch::try_new(batch.schema(), columns) + } + + pub fn partition( + &self, + batch: &RecordBatch, + with_timestamp: bool, + ) -> Result>, ArrowError> { + if self.key_indices.is_none() && !with_timestamp { + #[allow(clippy::single_range_in_vec_init)] + return Ok(vec![0..batch.num_rows()]); + } + + let mut partition_columns = vec![]; + + if let Some(keys) = &self.routing_keys() { + partition_columns.extend(keys.iter().map(|index| batch.column(*index).clone())); + } + if with_timestamp { + partition_columns.push(batch.column(self.timestamp_index).clone()); + } + + Ok(partition(&partition_columns)?.ranges()) + } + + pub fn unkeyed_batch(&self, batch: &RecordBatch) -> Result { + if self.key_indices.is_none() { + return Ok(batch.clone()); + } + let columns: Vec<_> = (0..batch.num_columns()) + .filter(|index| !self.key_indices.as_ref().unwrap().contains(index)) + .collect(); + batch.project(&columns) + } 
+ + pub fn schema_without_keys(&self) -> Result { + if self.key_indices.is_none() { + return Ok(self.clone()); + } + let key_indices = self.key_indices.as_ref().unwrap(); + let unkeyed_schema = Schema::new( + self.schema + .fields() + .iter() + .enumerate() + .filter(|(index, _field)| !key_indices.contains(index)) + .map(|(_, field)| field.as_ref().clone()) + .collect::>(), + ); + let timestamp_index = unkeyed_schema.index_of(TIMESTAMP_FIELD)?; + Ok(Self { + schema: Arc::new(unkeyed_schema), + timestamp_index, + key_indices: None, + routing_key_indices: None, + }) + } + + pub fn with_fields(&self, fields: Vec) -> Result { + let schema = Arc::new(Schema::new_with_metadata( + fields, + self.schema.metadata.clone(), + )); + + let timestamp_index = schema.index_of(TIMESTAMP_FIELD)?; + let max_index = *[&self.key_indices, &self.routing_key_indices] + .iter() + .map(|indices| indices.as_ref().and_then(|k| k.iter().max())) + .max() + .flatten() + .unwrap_or(&0); + + if schema.fields.len() - 1 < max_index { + return Err(ArrowError::InvalidArgumentError(format!( + "expected at least {} fields, but were only {}", + max_index + 1, + schema.fields.len() + ))); + } + + Ok(Self { + schema, + timestamp_index, + key_indices: self.key_indices.clone(), + routing_key_indices: self.routing_key_indices.clone(), + }) + } + + pub fn with_additional_fields( + &self, + new_fields: impl Iterator, + ) -> Result { + let mut fields = self.schema.fields.to_vec(); + fields.extend(new_fields.map(Arc::new)); + + self.with_fields(fields) + } +} + +pub fn server_for_hash_array( + hash: &PrimitiveArray, + n: usize, +) -> Result, ArrowError> { + let range_size = u64::MAX / (n as u64) + 1; + let range_scalar = UInt64Array::new_scalar(range_size); + let division = div(hash, &range_scalar)?; + let result: &PrimitiveArray = division.as_any().downcast_ref().unwrap(); + Ok(result.clone()) +} diff --git a/src/common/mod.rs b/src/common/mod.rs new file mode 100644 index 00000000..e0eb8d7a --- /dev/null +++ 
b/src/common/mod.rs @@ -0,0 +1,72 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Shared core types and constants for FunctionStream (`crate::common`). +//! +//! Used by the runtime, SQL planner, coordinator, and other subsystems — +//! analogous to `arroyo-types` + `arroyo-rpc` in Arroyo. + +pub mod arrow_ext; +pub mod control; +pub mod date; +pub mod debezium; +pub mod fs_schema; +pub mod errors; +pub mod formats; +pub mod hash; +pub mod message; +pub mod operator_config; +pub mod task_info; +pub mod time_utils; +pub mod worker; +mod converter; + +// ── Re-exports from existing modules ── +pub use arrow_ext::{DisplayAsSql, FsExtensionType, GetArrowSchema, GetArrowType}; +pub use date::{DatePart, DateTruncPrecision}; +pub use debezium::{Debezium, DebeziumOp, UpdatingData}; +pub use hash::{range_for_server, server_for_hash, HASH_SEEDS}; +pub use message::{ArrowMessage, CheckpointBarrier, SignalMessage, Watermark}; +pub use task_info::{ChainInfo, TaskInfo}; +pub use time_utils::{from_micros, from_millis, from_nanos, to_micros, to_millis, to_nanos}; +pub use worker::{MachineId, WorkerId}; + +// ── Re-exports from new modules ── +pub use control::{ + CheckpointCompleted, CheckpointEvent, CompactionResult, ControlMessage, ControlResp, + ErrorDomain, RetryHint, StopMode, TaskCheckpointEventType, TaskError, +}; +pub use fs_schema::{FsSchema, FsSchemaRef}; +pub use errors::DataflowError; +pub use formats::{BadData, Format, Framing, JsonFormat}; +pub use 
operator_config::MetadataField; + +// ── Well-known column names ── +pub const TIMESTAMP_FIELD: &str = "_timestamp"; +pub const UPDATING_META_FIELD: &str = "_updating_meta"; + +// ── Environment variables ── +pub const JOB_ID_ENV: &str = "JOB_ID"; +pub const RUN_ID_ENV: &str = "RUN_ID"; + +// ── Metric names ── +pub const MESSAGES_RECV: &str = "fs_worker_messages_recv"; +pub const MESSAGES_SENT: &str = "fs_worker_messages_sent"; +pub const BYTES_RECV: &str = "fs_worker_bytes_recv"; +pub const BYTES_SENT: &str = "fs_worker_bytes_sent"; +pub const BATCHES_RECV: &str = "fs_worker_batches_recv"; +pub const BATCHES_SENT: &str = "fs_worker_batches_sent"; +pub const TX_QUEUE_SIZE: &str = "fs_worker_tx_queue_size"; +pub const TX_QUEUE_REM: &str = "fs_worker_tx_queue_rem"; +pub const DESERIALIZATION_ERRORS: &str = "fs_worker_deserialization_errors"; + +pub const LOOKUP_KEY_INDEX_FIELD: &str = "__lookup_key_index"; diff --git a/src/config/global_config.rs b/src/config/global_config.rs index b4f92edd..c76bf4b0 100644 --- a/src/config/global_config.rs +++ b/src/config/global_config.rs @@ -19,6 +19,13 @@ use crate::config::python_config::PythonConfig; use crate::config::service_config::ServiceConfig; use crate::config::wasm_config::WasmConfig; +#[derive(Debug, Clone, Serialize, Deserialize, Default)] +pub struct StreamingConfig { + /// Maximum heap memory (in bytes) available to the streaming runtime's memory pool. + /// Defaults to 256 MiB when absent. 
+ pub max_memory_bytes: Option, +} + #[derive(Debug, Clone, Serialize, Deserialize, Default)] pub struct GlobalConfig { pub service: ServiceConfig, @@ -31,6 +38,10 @@ pub struct GlobalConfig { pub state_storage: crate::config::storage::StateStorageConfig, #[serde(default)] pub task_storage: crate::config::storage::TaskStorageConfig, + #[serde(default)] + pub streaming: StreamingConfig, + #[serde(default)] + pub stream_catalog: crate::config::storage::StreamCatalogConfig, } impl GlobalConfig { diff --git a/src/config/storage.rs b/src/config/storage.rs index e5186648..28396d7d 100644 --- a/src/config/storage.rs +++ b/src/config/storage.rs @@ -118,3 +118,27 @@ impl Default for TaskStorageConfig { } } } + +/// Stream table catalog (`CREATE TABLE` / `SHOW TABLES`) storage. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct StreamCatalogConfig { + /// When `false`, the catalog is in-memory only and is **lost on process restart**. + #[serde(default = "default_stream_catalog_persist")] + pub persist: bool, + /// RocksDB directory for persisted catalog. Default: `{data_dir}/stream_catalog`. 
+ #[serde(default)] + pub db_path: Option, +} + +fn default_stream_catalog_persist() -> bool { + true +} + +impl Default for StreamCatalogConfig { + fn default() -> Self { + Self { + persist: default_stream_catalog_persist(), + db_path: None, + } + } +} diff --git a/src/coordinator/analyze/analyzer.rs b/src/coordinator/analyze/analyzer.rs index 30552191..878a9481 100644 --- a/src/coordinator/analyze/analyzer.rs +++ b/src/coordinator/analyze/analyzer.rs @@ -13,8 +13,11 @@ use super::Analysis; use crate::coordinator::execution_context::ExecutionContext; use crate::coordinator::statement::{ - CreateFunction, CreatePythonFunction, DropFunction, ShowFunctions, StartFunction, Statement, - StatementVisitor, StatementVisitorContext, StatementVisitorResult, StopFunction, + CreateFunction, CreatePythonFunction, CreateTable, DropFunction, + DropStreamingTableStatement, DropTableStatement, ShowCatalogTables, + ShowCreateStreamingTable, ShowCreateTable, ShowFunctions, ShowStreamingTables, + StartFunction, Statement, StatementVisitor, StatementVisitorContext, + StatementVisitorResult, StopFunction, StreamingTableStatement, }; use std::fmt; @@ -108,6 +111,22 @@ impl StatementVisitor for Analyzer<'_> { StatementVisitorResult::Analyze(Box::new(stmt.clone())) } + fn visit_show_catalog_tables( + &self, + stmt: &ShowCatalogTables, + _context: &StatementVisitorContext, + ) -> StatementVisitorResult { + StatementVisitorResult::Analyze(Box::new(stmt.clone())) + } + + fn visit_show_create_table( + &self, + stmt: &ShowCreateTable, + _context: &StatementVisitorContext, + ) -> StatementVisitorResult { + StatementVisitorResult::Analyze(Box::new(stmt.clone())) + } + fn visit_create_python_function( &self, stmt: &CreatePythonFunction, @@ -115,4 +134,54 @@ impl StatementVisitor for Analyzer<'_> { ) -> StatementVisitorResult { StatementVisitorResult::Analyze(Box::new(stmt.clone())) } + + fn visit_create_table( + &self, + stmt: &CreateTable, + _context: &StatementVisitorContext, + ) -> 
StatementVisitorResult { + StatementVisitorResult::Analyze(Box::new(CreateTable::new(stmt.statement.clone()))) + } + + fn visit_streaming_table_statement( + &self, + stmt: &StreamingTableStatement, + _context: &StatementVisitorContext, + ) -> StatementVisitorResult { + StatementVisitorResult::Analyze(Box::new(StreamingTableStatement::new( + stmt.statement.clone(), + ))) + } + + fn visit_drop_table_statement( + &self, + stmt: &DropTableStatement, + _context: &StatementVisitorContext, + ) -> StatementVisitorResult { + StatementVisitorResult::Analyze(Box::new(DropTableStatement::new(stmt.statement.clone()))) + } + + fn visit_show_streaming_tables( + &self, + stmt: &ShowStreamingTables, + _context: &StatementVisitorContext, + ) -> StatementVisitorResult { + StatementVisitorResult::Analyze(Box::new(stmt.clone())) + } + + fn visit_show_create_streaming_table( + &self, + stmt: &ShowCreateStreamingTable, + _context: &StatementVisitorContext, + ) -> StatementVisitorResult { + StatementVisitorResult::Analyze(Box::new(stmt.clone())) + } + + fn visit_drop_streaming_table( + &self, + stmt: &DropStreamingTableStatement, + _context: &StatementVisitorContext, + ) -> StatementVisitorResult { + StatementVisitorResult::Analyze(Box::new(stmt.clone())) + } } diff --git a/src/coordinator/coordinator.rs b/src/coordinator/coordinator.rs index 4ad766d5..b86b1070 100644 --- a/src/coordinator/coordinator.rs +++ b/src/coordinator/coordinator.rs @@ -10,128 +10,139 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+use std::sync::Arc; use std::time::Instant; use anyhow::{Context, Result}; -use crate::coordinator::analyze::{Analysis, Analyzer}; +use crate::coordinator::analyze::Analyzer; use crate::coordinator::dataset::ExecuteResult; use crate::coordinator::execution::Executor; use crate::coordinator::plan::{LogicalPlanVisitor, LogicalPlanner, PlanNode}; use crate::coordinator::statement::Statement; -use crate::runtime::taskexecutor::TaskManager; +use crate::sql::schema::StreamSchemaProvider; use super::execution_context::ExecutionContext; +use super::runtime_context::CoordinatorRuntimeContext; +#[derive(Default)] pub struct Coordinator {} -impl Default for Coordinator { - fn default() -> Self { - Self::new() - } -} - impl Coordinator { pub fn new() -> Self { Self {} } - pub fn execute(&self, stmt: &dyn Statement) -> ExecuteResult { - let start_time = Instant::now(); - let context = ExecutionContext::new(); - let execution_id = context.execution_id; + // ======================================================================== + // Plan compilation + // ======================================================================== - match self.execute_pipeline(&context, stmt) { - Ok(result) => { - log::debug!( - "[{}] Execution completed in {}ms", - execution_id, - start_time.elapsed().as_millis() - ); - result - } - Err(e) => { - log::error!( - "[{}] Execution failed after {}ms. Error: {:#}", - execution_id, - start_time.elapsed().as_millis(), - e - ); - ExecuteResult::err(format!("Execution failed: {:#}", e)) - } - } + pub fn compile_plan( + &self, + stmt: &dyn Statement, + schema_provider: StreamSchemaProvider, + ) -> Result> { + self.compile_plan_internal(&ExecutionContext::new(), stmt, schema_provider) } - fn execute_pipeline( + /// Internal pipeline: Analyze → build logical plan → optimize. 
+ fn compile_plan_internal( &self, context: &ExecutionContext, stmt: &dyn Statement, - ) -> Result { - let analysis = self.step_analyze(context, stmt)?; - let plan = self.step_build_logical_plan(&analysis)?; - let optimized_plan = self.step_optimize(&analysis, plan)?; - self.step_execute(optimized_plan) - } - - fn step_analyze(&self, context: &ExecutionContext, stmt: &dyn Statement) -> Result { + schema_provider: StreamSchemaProvider, + ) -> Result> { + let exec_id = context.execution_id; let start = Instant::now(); - let analyzer = Analyzer::new(context); - let result = analyzer + + let analysis = Analyzer::new(context) .analyze(stmt) .map_err(|e| anyhow::anyhow!(e)) - .context("Analyzer phase failed"); - + .context("Analyzer phase failed")?; log::debug!( "[{}] Analyze phase finished in {}ms", - context.execution_id, + exec_id, start.elapsed().as_millis() ); - result - } - - fn step_build_logical_plan(&self, analysis: &Analysis) -> Result> { - let visitor = LogicalPlanVisitor::new(); - let plan = visitor.visit(analysis); - Ok(plan) - } - fn step_optimize( - &self, - analysis: &Analysis, - plan: Box, - ) -> Result> { - let start = Instant::now(); - let planner = LogicalPlanner::new(); - let optimized = planner.optimize(plan, analysis); + let plan = LogicalPlanVisitor::new(schema_provider).visit(&analysis); + let opt_start = Instant::now(); + let optimized = LogicalPlanner::new().optimize(plan, &analysis); log::debug!( - "Optimizer phase finished in {}ms", - start.elapsed().as_millis() + "[{}] Optimizer phase finished in {}ms", + exec_id, + opt_start.elapsed().as_millis() ); + Ok(optimized) } - fn step_execute(&self, plan: Box) -> Result { + // ======================================================================== + // Execution + // ======================================================================== + + pub fn execute(&self, stmt: &dyn Statement) -> ExecuteResult { + match CoordinatorRuntimeContext::try_from_globals() { + Ok(ctx) => 
self.execute_with_runtime_context(stmt, &ctx), + Err(e) => ExecuteResult::err(e.to_string()), + } + } + + pub async fn execute_with_stream_catalog(&self, stmt: &dyn Statement) -> ExecuteResult { + self.execute(stmt) + } + + /// Same as [`Self::execute`], but uses an explicit [`CoordinatorRuntimeContext`] (e.g. tests or custom wiring). + pub fn execute_with_runtime_context( + &self, + stmt: &dyn Statement, + runtime: &CoordinatorRuntimeContext, + ) -> ExecuteResult { let start = Instant::now(); - let task_manager = match TaskManager::get() { - Ok(tm) => tm, + let context = ExecutionContext::new(); + let exec_id = context.execution_id; + let schema_provider = runtime.planning_schema_provider(); + + let result = (|| -> Result { + let plan = self.compile_plan_internal(&context, stmt, schema_provider)?; + + let exec_start = Instant::now(); + let res = Executor::new( + Arc::clone(&runtime.task_manager), + runtime.catalog_manager.clone(), + Arc::clone(&runtime.job_manager), + ) + .execute(plan.as_ref()) + .map_err(|e| anyhow::anyhow!(e)) + .context("Executor phase failed")?; + + log::debug!( + "[{}] Executor phase finished in {}ms", + exec_id, + exec_start.elapsed().as_millis() + ); + Ok(res) + })(); + + match result { + Ok(res) => { + log::debug!( + "[{}] Execution completed in {}ms", + exec_id, + start.elapsed().as_millis() + ); + res + } Err(e) => { - return Ok(ExecuteResult::err(format!( - "Failed to get TaskManager: {}", + log::error!( + "[{}] Execution failed after {}ms. 
Error: {:#}", + exec_id, + start.elapsed().as_millis(), e - ))); + ); + ExecuteResult::err(format!("Execution failed: {:#}", e)) } - }; - let executor = Executor::new(task_manager.clone()); - let result = executor - .execute(plan.as_ref()) - .map_err(|e| anyhow::anyhow!(e)) - .context("Executor phase failed"); - - log::debug!( - "Executor phase finished in {}ms", - start.elapsed().as_millis() - ); - result + } } -} +} \ No newline at end of file diff --git a/src/coordinator/dataset/mod.rs b/src/coordinator/dataset/mod.rs index b72613da..bbcac6f0 100644 --- a/src/coordinator/dataset/mod.rs +++ b/src/coordinator/dataset/mod.rs @@ -12,8 +12,16 @@ mod data_set; mod execute_result; +mod show_catalog_tables_result; +mod show_create_streaming_table_result; +mod show_create_table_result; mod show_functions_result; +mod show_streaming_tables_result; pub use data_set::{DataSet, empty_record_batch}; pub use execute_result::ExecuteResult; +pub use show_catalog_tables_result::ShowCatalogTablesResult; +pub use show_create_streaming_table_result::ShowCreateStreamingTableResult; +pub use show_create_table_result::ShowCreateTableResult; pub use show_functions_result::ShowFunctionsResult; +pub use show_streaming_tables_result::ShowStreamingTablesResult; diff --git a/src/coordinator/dataset/show_catalog_tables_result.rs b/src/coordinator/dataset/show_catalog_tables_result.rs new file mode 100644 index 00000000..74a8cd2d --- /dev/null +++ b/src/coordinator/dataset/show_catalog_tables_result.rs @@ -0,0 +1,100 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::Arc; + +use arrow_array::{Int32Array, StringArray}; +use arrow_schema::{DataType, Field, Schema}; +use datafusion::arrow::datatypes::Schema as DfSchema; + +use super::DataSet; +use crate::sql::schema::table::Table as CatalogTable; +use crate::sql::schema::{catalog_table_row_detail, schema_columns_one_line}; + +#[derive(Clone, Debug)] +pub struct ShowCatalogTablesResult { + names: Vec, + kinds: Vec, + column_counts: Vec, + schema_lines: Vec, + details: Vec, +} + +impl ShowCatalogTablesResult { + pub fn from_tables(tables: &[Arc]) -> Self { + let mut names = Vec::with_capacity(tables.len()); + let mut kinds = Vec::with_capacity(tables.len()); + let mut column_counts = Vec::with_capacity(tables.len()); + let mut schema_lines = Vec::with_capacity(tables.len()); + let mut details = Vec::with_capacity(tables.len()); + + for t in tables { + let schema = match t.as_ref() { + CatalogTable::ConnectorTable(source) | CatalogTable::LookupTable(source) => { + source.produce_physical_schema() + } + CatalogTable::TableFromQuery { .. } => DfSchema::new(t.get_fields()), + }; + let ncols = schema.fields().len() as i32; + names.push(t.name().to_string()); + kinds.push(match t.as_ref() { + CatalogTable::ConnectorTable(_) => "SOURCE", + CatalogTable::LookupTable(_) => "LOOKUP", + CatalogTable::TableFromQuery { .. 
} => "QUERY", + } + .to_string()); + column_counts.push(ncols); + schema_lines.push(schema_columns_one_line(&schema)); + details.push(catalog_table_row_detail(t.as_ref())); + } + + Self { + names, + kinds, + column_counts, + schema_lines, + details, + } + } +} + +impl DataSet for ShowCatalogTablesResult { + fn to_record_batch(&self) -> arrow_array::RecordBatch { + let schema = Arc::new(Schema::new(vec![ + Field::new("table_name", DataType::Utf8, false), + Field::new("kind", DataType::Utf8, false), + Field::new("column_count", DataType::Int32, false), + Field::new("schema_columns", DataType::Utf8, false), + Field::new("details", DataType::Utf8, false), + ])); + + arrow_array::RecordBatch::try_new( + schema, + vec![ + Arc::new(StringArray::from( + self.names.iter().map(|s| s.as_str()).collect::>(), + )), + Arc::new(StringArray::from( + self.kinds.iter().map(|s| s.as_str()).collect::>(), + )), + Arc::new(Int32Array::from(self.column_counts.clone())), + Arc::new(StringArray::from( + self.schema_lines.iter().map(|s| s.as_str()).collect::>(), + )), + Arc::new(StringArray::from( + self.details.iter().map(|s| s.as_str()).collect::>(), + )), + ], + ) + .unwrap_or_else(|_| arrow_array::RecordBatch::new_empty(Arc::new(Schema::empty()))) + } +} diff --git a/src/coordinator/dataset/show_create_streaming_table_result.rs b/src/coordinator/dataset/show_create_streaming_table_result.rs new file mode 100644 index 00000000..ed3ec600 --- /dev/null +++ b/src/coordinator/dataset/show_create_streaming_table_result.rs @@ -0,0 +1,69 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::Arc; + +use arrow_array::StringArray; +use arrow_schema::{DataType, Field, Schema}; +use protocol::grpc::api::FsProgram; + +use crate::sql::common::render_program_topology; + +use super::DataSet; + +#[derive(Clone, Debug)] +pub struct ShowCreateStreamingTableResult { + table_name: String, + status: String, + pipeline_detail: String, + program: FsProgram, +} + +impl ShowCreateStreamingTableResult { + pub fn new( + table_name: String, + status: String, + pipeline_detail: String, + program: FsProgram, + ) -> Self { + Self { + table_name, + status, + pipeline_detail, + program, + } + } +} + +impl DataSet for ShowCreateStreamingTableResult { + fn to_record_batch(&self) -> arrow_array::RecordBatch { + let topology = render_program_topology(&self.program); + + let schema = Arc::new(Schema::new(vec![ + Field::new("Streaming Table", DataType::Utf8, false), + Field::new("Status", DataType::Utf8, false), + Field::new("Pipelines", DataType::Utf8, false), + Field::new("Topology", DataType::Utf8, false), + ])); + + arrow_array::RecordBatch::try_new( + schema, + vec![ + Arc::new(StringArray::from(vec![self.table_name.as_str()])), + Arc::new(StringArray::from(vec![self.status.as_str()])), + Arc::new(StringArray::from(vec![self.pipeline_detail.as_str()])), + Arc::new(StringArray::from(vec![topology.as_str()])), + ], + ) + .unwrap_or_else(|_| arrow_array::RecordBatch::new_empty(Arc::new(Schema::empty()))) + } +} diff --git a/src/coordinator/dataset/show_create_table_result.rs b/src/coordinator/dataset/show_create_table_result.rs new file mode 100644 index 00000000..47f49d59 --- /dev/null +++ b/src/coordinator/dataset/show_create_table_result.rs @@ -0,0 +1,51 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::Arc; + +use arrow_array::StringArray; +use arrow_schema::{DataType, Field, Schema}; + +use super::DataSet; + +#[derive(Clone, Debug)] +pub struct ShowCreateTableResult { + table_name: String, + create_sql: String, +} + +impl ShowCreateTableResult { + pub fn new(table_name: String, create_sql: String) -> Self { + Self { + table_name, + create_sql, + } + } +} + +impl DataSet for ShowCreateTableResult { + fn to_record_batch(&self) -> arrow_array::RecordBatch { + let schema = Arc::new(Schema::new(vec![ + Field::new("Table", DataType::Utf8, false), + Field::new("Create Table", DataType::Utf8, false), + ])); + + arrow_array::RecordBatch::try_new( + schema, + vec![ + Arc::new(StringArray::from(vec![self.table_name.as_str()])), + Arc::new(StringArray::from(vec![self.create_sql.as_str()])), + ], + ) + .unwrap_or_else(|_| arrow_array::RecordBatch::new_empty(Arc::new(Schema::empty()))) + } +} diff --git a/src/coordinator/dataset/show_streaming_tables_result.rs b/src/coordinator/dataset/show_streaming_tables_result.rs new file mode 100644 index 00000000..a992d1b9 --- /dev/null +++ b/src/coordinator/dataset/show_streaming_tables_result.rs @@ -0,0 +1,75 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::Arc; + +use arrow_array::{Int32Array, StringArray}; +use arrow_schema::{DataType, Field, Schema}; + +use super::DataSet; +use crate::runtime::streaming::job::StreamingJobSummary; + +#[derive(Clone, Debug)] +pub struct ShowStreamingTablesResult { + jobs: Vec, +} + +impl ShowStreamingTablesResult { + pub fn new(jobs: Vec) -> Self { + Self { jobs } + } +} + +impl DataSet for ShowStreamingTablesResult { + fn to_record_batch(&self) -> arrow_array::RecordBatch { + let schema = Arc::new(Schema::new(vec![ + Field::new("job_id", DataType::Utf8, false), + Field::new("status", DataType::Utf8, false), + Field::new("pipeline_count", DataType::Int32, false), + Field::new("uptime", DataType::Utf8, false), + ])); + + let job_ids: Vec<&str> = self.jobs.iter().map(|j| j.job_id.as_str()).collect(); + let statuses: Vec<&str> = self.jobs.iter().map(|j| j.status.as_str()).collect(); + let pipeline_counts: Vec = self.jobs.iter().map(|j| j.pipeline_count).collect(); + let uptimes: Vec = self.jobs.iter().map(|j| format_duration(j.uptime_secs)).collect(); + let uptime_refs: Vec<&str> = uptimes.iter().map(|s| s.as_str()).collect(); + + arrow_array::RecordBatch::try_new( + schema, + vec![ + Arc::new(StringArray::from(job_ids)), + Arc::new(StringArray::from(statuses)), + Arc::new(Int32Array::from(pipeline_counts)), + Arc::new(StringArray::from(uptime_refs)), + ], + ) + .unwrap_or_else(|_| arrow_array::RecordBatch::new_empty(Arc::new(Schema::empty()))) + } +} + +fn format_duration(total_secs: u64) -> String { + let days = total_secs / 86400; + let hours = 
(total_secs % 86400) / 3600; + let mins = (total_secs % 3600) / 60; + let secs = total_secs % 60; + + if days > 0 { + format!("{days}d {hours}h {mins}m {secs}s") + } else if hours > 0 { + format!("{hours}h {mins}m {secs}s") + } else if mins > 0 { + format!("{mins}m {secs}s") + } else { + format!("{secs}s") + } +} diff --git a/src/coordinator/execution/executor.rs b/src/coordinator/execution/executor.rs index 7e44217e..c24a4cda 100644 --- a/src/coordinator/execution/executor.rs +++ b/src/coordinator/execution/executor.rs @@ -10,16 +10,32 @@ // See the License for the specific language governing permissions and // limitations under the License. -use crate::coordinator::dataset::{ExecuteResult, ShowFunctionsResult, empty_record_batch}; +use std::sync::Arc; + +use protocol::grpc::api::FsProgram; +use thiserror::Error; +use tracing::{debug, info, warn}; + +use crate::coordinator::dataset::{ + empty_record_batch, ExecuteResult, ShowCatalogTablesResult, + ShowCreateStreamingTableResult, ShowCreateTableResult, ShowFunctionsResult, + ShowStreamingTablesResult, +}; use crate::coordinator::plan::{ - CreateFunctionPlan, CreatePythonFunctionPlan, DropFunctionPlan, PlanNode, PlanVisitor, - PlanVisitorContext, PlanVisitorResult, ShowFunctionsPlan, StartFunctionPlan, StopFunctionPlan, + CreateFunctionPlan, CreatePythonFunctionPlan, CreateTablePlan, CreateTablePlanBody, + DropFunctionPlan, DropStreamingTablePlan, DropTablePlan, LookupTablePlan, PlanNode, + PlanVisitor, PlanVisitorContext, PlanVisitorResult, ShowCatalogTablesPlan, + ShowCreateStreamingTablePlan, ShowCreateTablePlan, ShowFunctionsPlan, + ShowStreamingTablesPlan, StartFunctionPlan, StopFunctionPlan, StreamingTable, + StreamingTableConnectorPlan, }; use crate::coordinator::statement::{ConfigSource, FunctionSource}; +use crate::runtime::streaming::job::JobManager; +use crate::runtime::streaming::protocol::control::StopMode; use crate::runtime::taskexecutor::TaskManager; -use std::sync::Arc; -use thiserror::Error; -use 
tracing::{debug, info}; +use crate::sql::schema::table::Table as CatalogTable; +use crate::sql::schema::show_create_catalog_table; +use crate::storage::stream_catalog::CatalogManager; #[derive(Error, Debug)] pub enum ExecuteError { @@ -35,11 +51,21 @@ pub enum ExecuteError { pub struct Executor { task_manager: Arc, + catalog_manager: Arc, + job_manager: Arc, } impl Executor { - pub fn new(task_manager: Arc) -> Self { - Self { task_manager } + pub fn new( + task_manager: Arc, + catalog_manager: Arc, + job_manager: Arc, + ) -> Self { + Self { + task_manager, + catalog_manager, + job_manager, + } } pub fn execute(&self, plan: &dyn PlanNode) -> Result { @@ -50,32 +76,35 @@ impl Executor { match visitor_result { PlanVisitorResult::Execute(result) => { - let elapsed = timer.elapsed(); - debug!(target: "executor", elapsed_ms = elapsed.as_millis(), "Execution completed"); + debug!( + target: "executor", + elapsed_ms = timer.elapsed().as_millis(), + "Execution completed" + ); result } } } } + impl PlanVisitor for Executor { - #[allow(clippy::redundant_closure_call)] fn visit_create_function( &self, plan: &CreateFunctionPlan, _context: &PlanVisitorContext, ) -> PlanVisitorResult { - let result = (|| -> Result { + let execute = || -> Result { let function_bytes = match &plan.function_source { FunctionSource::Path(path) => std::fs::read(path).map_err(|e| { - ExecuteError::Validation(format!("Failed to read function at {}: {}", path, e)) + ExecuteError::Validation(format!("Failed to read function at {path}: {e}")) })?, FunctionSource::Bytes(bytes) => bytes.clone(), }; let config_bytes = match &plan.config_source { Some(ConfigSource::Path(path)) => std::fs::read(path).map_err(|e| { - ExecuteError::Validation(format!("Failed to read config at {}: {}", path, e)) + ExecuteError::Validation(format!("Failed to read config at {path}: {e}")) })?, Some(ConfigSource::Bytes(bytes)) => bytes.clone(), None => { @@ -88,35 +117,34 @@ impl PlanVisitor for Executor { info!(config_size = 
config_bytes.len(), "Registering Wasm task"); self.task_manager .register_task(&config_bytes, &function_bytes) - .map_err(|e| ExecuteError::Task(format!("Registration failed: {:?}", e)))?; + .map_err(|e| ExecuteError::Task(format!("Registration failed: {e:?}")))?; Ok(ExecuteResult::ok_with_data( "Function registered successfully", empty_record_batch(), )) - })(); + }; - PlanVisitorResult::Execute(result) + PlanVisitorResult::Execute(execute()) } - #[allow(clippy::redundant_closure_call)] fn visit_drop_function( &self, plan: &DropFunctionPlan, _context: &PlanVisitorContext, ) -> PlanVisitorResult { - let result = (|| -> Result { + let execute = || -> Result { self.task_manager .remove_task(&plan.name) - .map_err(|e| ExecuteError::Task(format!("Removal failed: {}", e)))?; + .map_err(|e| ExecuteError::Task(format!("Removal failed: {e}")))?; Ok(ExecuteResult::ok_with_data( format!("Function '{}' dropped", plan.name), empty_record_batch(), )) - })(); + }; - PlanVisitorResult::Execute(result) + PlanVisitorResult::Execute(execute()) } fn visit_start_function( @@ -138,48 +166,85 @@ impl PlanVisitor for Executor { PlanVisitorResult::Execute(result) } - #[allow(clippy::redundant_closure_call)] fn visit_show_functions( &self, _plan: &ShowFunctionsPlan, _context: &PlanVisitorContext, ) -> PlanVisitorResult { - let result = { - let functions = self.task_manager.list_all_functions(); + let functions = self.task_manager.list_all_functions(); + let result = ExecuteResult::ok_with_data( + format!("Found {} task(s)", functions.len()), + ShowFunctionsResult::new(functions), + ); + + PlanVisitorResult::Execute(Ok(result)) + } + fn visit_show_catalog_tables( + &self, + _plan: &ShowCatalogTablesPlan, + _context: &PlanVisitorContext, + ) -> PlanVisitorResult { + let tables = match self.catalog_manager.list_catalog_tables() { + Ok(tables) => tables, + Err(e) => return PlanVisitorResult::Execute(Err(ExecuteError::Internal(e.to_string()))), + }; + let n = tables.len(); + let result = 
ExecuteResult::ok_with_data( + format!("{n} stream catalog table(s)"), + ShowCatalogTablesResult::from_tables(&tables), + ); + PlanVisitorResult::Execute(Ok(result)) + } + + fn visit_show_create_table( + &self, + plan: &ShowCreateTablePlan, + _context: &PlanVisitorContext, + ) -> PlanVisitorResult { + let execute = || -> Result { + let t = self + .catalog_manager + .get_catalog_table(&plan.table_name) + .map_err(|e| ExecuteError::Internal(e.to_string()))? + .ok_or_else(|| { + ExecuteError::Validation(format!( + "Table '{}' not found in stream catalog", + plan.table_name + )) + })?; + let ddl = show_create_catalog_table(t.as_ref()); Ok(ExecuteResult::ok_with_data( - format!("Found {} task(s)", functions.len()), - ShowFunctionsResult::new(functions), + format!("SHOW CREATE TABLE {}", plan.table_name), + ShowCreateTableResult::new(plan.table_name.clone(), ddl), )) }; - - PlanVisitorResult::Execute(result) + PlanVisitorResult::Execute(execute()) } - #[allow(clippy::redundant_closure_call)] fn visit_create_python_function( &self, plan: &CreatePythonFunctionPlan, _context: &PlanVisitorContext, ) -> PlanVisitorResult { - let result = (|| -> Result { - let modules: Vec<(String, Vec)> = plan + let execute = || -> Result { + let modules = plan .modules .iter() .map(|m| (m.name.clone(), m.bytes.clone())) - .collect(); + .collect::>(); self.task_manager .register_python_task(plan.config_content.as_bytes(), &modules) - .map_err(|e| ExecuteError::Task(format!("Python registration failed: {}", e)))?; + .map_err(|e| ExecuteError::Task(format!("Python registration failed: {e}")))?; Ok(ExecuteResult::ok_with_data( format!("Python function '{}' deployed", plan.class_name), empty_record_batch(), )) - })(); + }; - PlanVisitorResult::Execute(result) + PlanVisitorResult::Execute(execute()) } fn visit_stop_function( @@ -200,4 +265,252 @@ impl PlanVisitor for Executor { PlanVisitorResult::Execute(result) } + + fn visit_create_table_plan( + &self, + plan: &CreateTablePlan, + _context: 
&PlanVisitorContext, + ) -> PlanVisitorResult { + let execute = || -> Result { + let (table_name, if_not_exists, catalog_table) = match &plan.body { + CreateTablePlanBody::ConnectorSource { + source_table, + if_not_exists, + } => { + let table_name = source_table.name().to_string(); + let table_instance = CatalogTable::ConnectorTable(source_table.clone()); + (table_name, *if_not_exists, table_instance) + } + CreateTablePlanBody::DataFusion(_) => { + return Err(ExecuteError::Internal( + "Operation not supported: Currently, the system strictly supports creating tables backed by an external Connector Source (e.g., Kafka, Postgres). In-memory tables, Views, or CTAS (Create Table As Select) are not supported." + .into(), + )); + } + }; + + if if_not_exists && self.catalog_manager.has_catalog_table(&table_name) { + return Ok(ExecuteResult::ok(format!( + "Table '{table_name}' already exists (skipped)" + ))); + } + + self.catalog_manager + .add_catalog_table(catalog_table) + .map_err(|e| { + ExecuteError::Internal(format!( + "Failed to register connector source table '{}': {}", + table_name, e + )) + })?; + + Ok(ExecuteResult::ok(format!( + "Created connector source table '{table_name}'" + ))) + }; + + PlanVisitorResult::Execute(execute()) + } + + fn visit_streaming_table( + &self, + plan: &StreamingTable, + _context: &PlanVisitorContext, + ) -> PlanVisitorResult { + let execute = || -> Result { + let fs_program: FsProgram = plan.program.clone().into(); + let job_manager: Arc = Arc::clone(&self.job_manager); + + let job_id = plan.name.clone(); + let job_id = tokio::task::block_in_place(|| { + tokio::runtime::Handle::current() + .block_on(job_manager.submit_job(job_id, fs_program.clone())) + }) + .map_err(|e| ExecuteError::Internal(format!("Failed to submit streaming job: {e}")))?; + + self.catalog_manager + .persist_streaming_job( + &plan.name, + &fs_program, + plan.comment.as_deref().unwrap_or(""), + ) + .map_err(|e| { + ExecuteError::Internal(format!( + "Streaming job 
'{}' submitted but persistence failed: {e}", + plan.name + )) + })?; + + info!( + job_id = %job_id, + table = %plan.name, + "Streaming job submitted and persisted" + ); + + Ok(ExecuteResult::ok_with_data( + format!("Streaming table '{}' created, job_id = {}", plan.name, job_id), + empty_record_batch(), + )) + }; + + PlanVisitorResult::Execute(execute()) + } + + fn visit_lookup_table( + &self, + _plan: &LookupTablePlan, + _context: &PlanVisitorContext, + ) -> PlanVisitorResult { + PlanVisitorResult::Execute(Err(ExecuteError::Internal( + "LookupTable execution not yet implemented".to_string(), + ))) + } + + fn visit_streaming_connector_table( + &self, + _plan: &StreamingTableConnectorPlan, + _context: &PlanVisitorContext, + ) -> PlanVisitorResult { + PlanVisitorResult::Execute(Err(ExecuteError::Internal( + "StreamingTableConnector execution not yet implemented".to_string(), + ))) + } + + fn visit_drop_table_plan( + &self, + plan: &DropTablePlan, + _context: &PlanVisitorContext, + ) -> PlanVisitorResult { + let execute = || -> Result { + self.catalog_manager + .drop_catalog_table(&plan.table_name, plan.if_exists) + .map_err(|e| ExecuteError::Internal(e.to_string()))?; + + Ok(ExecuteResult::ok(format!( + "Dropped table '{}'", + plan.table_name + ))) + }; + + PlanVisitorResult::Execute(execute()) + } + + fn visit_show_streaming_tables( + &self, + _plan: &ShowStreamingTablesPlan, + _context: &PlanVisitorContext, + ) -> PlanVisitorResult { + let execute = || -> Result { + let jobs = self.job_manager.list_jobs(); + let n = jobs.len(); + Ok(ExecuteResult::ok_with_data( + format!("{n} streaming table(s)"), + ShowStreamingTablesResult::new(jobs), + )) + }; + PlanVisitorResult::Execute(execute()) + } + + fn visit_show_create_streaming_table( + &self, + plan: &ShowCreateStreamingTablePlan, + _context: &PlanVisitorContext, + ) -> PlanVisitorResult { + let execute = || -> Result { + let detail = self + .job_manager + .get_job_detail(&plan.table_name) + .ok_or_else(|| { + 
ExecuteError::Validation(format!( + "Streaming table '{}' not found in active jobs", + plan.table_name + )) + })?; + + let pipeline_lines: Vec = detail + .pipelines + .iter() + .map(|p| format!(" pipeline[{}]: {}", p.pipeline_id, p.status)) + .collect(); + let pipeline_detail = if pipeline_lines.is_empty() { + "(no pipelines)".to_string() + } else { + pipeline_lines.join("\n") + }; + + Ok(ExecuteResult::ok_with_data( + format!("SHOW CREATE STREAMING TABLE {}", plan.table_name), + ShowCreateStreamingTableResult::new( + plan.table_name.clone(), + detail.status, + pipeline_detail, + detail.program, + ), + )) + }; + PlanVisitorResult::Execute(execute()) + } + + fn visit_drop_streaming_table( + &self, + plan: &DropStreamingTablePlan, + _context: &PlanVisitorContext, + ) -> PlanVisitorResult { + let execute = || -> Result { + let job_exists = self.job_manager.has_job(&plan.table_name); + + if !job_exists && !plan.if_exists { + return Err(ExecuteError::Validation(format!( + "Streaming table '{}' not found in active jobs", + plan.table_name + ))); + } + + if job_exists { + let job_manager = Arc::clone(&self.job_manager); + let table_name = plan.table_name.clone(); + tokio::task::block_in_place(|| { + tokio::runtime::Handle::current() + .block_on(job_manager.remove_job(&table_name, StopMode::Graceful)) + }) + .map_err(|e| { + ExecuteError::Internal(format!( + "Failed to stop streaming job '{}': {}", + plan.table_name, e + )) + })?; + + info!( + table = %plan.table_name, + "Streaming job stopped and removed" + ); + } + + if let Err(e) = self.catalog_manager.remove_streaming_job(&plan.table_name) { + warn!( + table = %plan.table_name, + error = %e, + "Failed to remove streaming job persisted definition (non-fatal)" + ); + } + + let _ = self + .catalog_manager + .drop_catalog_table(&plan.table_name, true); + + if job_exists { + Ok(ExecuteResult::ok(format!( + "Dropped streaming table '{}'", + plan.table_name + ))) + } else { + Ok(ExecuteResult::ok(format!( + "Streaming table 
'{}' does not exist (skipped)", + plan.table_name + ))) + } + }; + + PlanVisitorResult::Execute(execute()) + } } diff --git a/src/coordinator/mod.rs b/src/coordinator/mod.rs index 0b94d4bf..922b793f 100644 --- a/src/coordinator/mod.rs +++ b/src/coordinator/mod.rs @@ -17,11 +17,15 @@ mod dataset; mod execution; mod execution_context; mod plan; +mod runtime_context; mod statement; +mod tool; pub use coordinator::Coordinator; pub use dataset::{DataSet, ShowFunctionsResult}; pub use statement::{ - CreateFunction, CreatePythonFunction, DropFunction, PythonModule, ShowFunctions, StartFunction, - Statement, StopFunction, + CreateFunction, CreatePythonFunction, CreateTable, DropFunction, + DropStreamingTableStatement, DropTableStatement, PythonModule, ShowCatalogTables, + ShowCreateStreamingTable, ShowCreateTable, ShowFunctions, ShowStreamingTables, + StartFunction, Statement, StopFunction, StreamingTableStatement, }; diff --git a/src/coordinator/plan/create_table_plan.rs b/src/coordinator/plan/create_table_plan.rs new file mode 100644 index 00000000..7ad82bb3 --- /dev/null +++ b/src/coordinator/plan/create_table_plan.rs @@ -0,0 +1,55 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use datafusion::logical_expr::LogicalPlan; + +use crate::sql::schema::SourceTable; + +use super::{PlanNode, PlanVisitor, PlanVisitorContext, PlanVisitorResult}; + +/// Payload for [`CreateTablePlan`]: either a DataFusion DDL plan or a connector `CREATE TABLE` (no `AS SELECT`). 
+#[derive(Debug, Clone)] +pub enum CreateTablePlanBody { + DataFusion(LogicalPlan), + ConnectorSource { + source_table: SourceTable, + if_not_exists: bool, + }, +} + +#[derive(Debug, Clone)] +pub struct CreateTablePlan { + pub body: CreateTablePlanBody, +} + +impl CreateTablePlan { + pub fn new(logical_plan: LogicalPlan) -> Self { + Self { + body: CreateTablePlanBody::DataFusion(logical_plan), + } + } + + pub fn connector_source(source_table: SourceTable, if_not_exists: bool) -> Self { + Self { + body: CreateTablePlanBody::ConnectorSource { + source_table, + if_not_exists, + }, + } + } +} + +impl PlanNode for CreateTablePlan { + fn accept(&self, visitor: &dyn PlanVisitor, context: &PlanVisitorContext) -> PlanVisitorResult { + visitor.visit_create_table_plan(self, context) + } +} diff --git a/src/sql/parser/mod.rs b/src/coordinator/plan/drop_streaming_table_plan.rs similarity index 52% rename from src/sql/parser/mod.rs rename to src/coordinator/plan/drop_streaming_table_plan.rs index 11f4b18e..d06dc836 100644 --- a/src/sql/parser/mod.rs +++ b/src/coordinator/plan/drop_streaming_table_plan.rs @@ -10,33 +10,25 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-mod sql_parser; +use super::{PlanNode, PlanVisitor, PlanVisitorContext, PlanVisitorResult}; -pub use sql_parser::SqlParser; - -#[derive(Debug)] -pub struct ParseError { - pub message: String, +#[derive(Debug, Clone)] +pub struct DropStreamingTablePlan { + pub table_name: String, + pub if_exists: bool, } -impl std::fmt::Display for ParseError { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "Parse error: {}", self.message) - } -} - -impl std::error::Error for ParseError {} - -impl From for ParseError { - fn from(message: String) -> Self { - ParseError { message } +impl DropStreamingTablePlan { + pub fn new(table_name: String, if_exists: bool) -> Self { + Self { + table_name, + if_exists, + } } } -impl ParseError { - pub fn new(message: impl Into) -> Self { - Self { - message: message.into(), - } +impl PlanNode for DropStreamingTablePlan { + fn accept(&self, visitor: &dyn PlanVisitor, context: &PlanVisitorContext) -> PlanVisitorResult { + visitor.visit_drop_streaming_table(self, context) } } diff --git a/src/coordinator/plan/drop_table_plan.rs b/src/coordinator/plan/drop_table_plan.rs new file mode 100644 index 00000000..7d80a7b7 --- /dev/null +++ b/src/coordinator/plan/drop_table_plan.rs @@ -0,0 +1,34 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use super::{PlanNode, PlanVisitor, PlanVisitorContext, PlanVisitorResult}; + +#[derive(Debug, Clone)] +pub struct DropTablePlan { + pub table_name: String, + pub if_exists: bool, +} + +impl DropTablePlan { + pub fn new(table_name: String, if_exists: bool) -> Self { + Self { + table_name, + if_exists, + } + } +} + +impl PlanNode for DropTablePlan { + fn accept(&self, visitor: &dyn PlanVisitor, context: &PlanVisitorContext) -> PlanVisitorResult { + visitor.visit_drop_table_plan(self, context) + } +} diff --git a/src/coordinator/plan/logical_plan_visitor.rs b/src/coordinator/plan/logical_plan_visitor.rs index 536fec37..77fa9eb4 100644 --- a/src/coordinator/plan/logical_plan_visitor.rs +++ b/src/coordinator/plan/logical_plan_visitor.rs @@ -10,34 +10,311 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::sync::Arc; + +use datafusion::common::{plan_datafusion_err, plan_err, Result}; +use datafusion::execution::SessionStateBuilder; +use datafusion::sql::sqlparser::ast::{ + CreateTable as SqlCreateTable, Expr as SqlExpr, ObjectType, SqlOption, Statement as DFStatement, + TableConstraint, +}; +use datafusion_common::TableReference; +use datafusion_execution::config::SessionConfig; +use datafusion_expr::{col, Extension, Expr, LogicalPlan}; +use sqlparser::ast::Statement; +use tracing::debug; + use crate::coordinator::analyze::analysis::Analysis; use crate::coordinator::plan::{ - CreateFunctionPlan, CreatePythonFunctionPlan, DropFunctionPlan, PlanNode, ShowFunctionsPlan, - StartFunctionPlan, StopFunctionPlan, + CreateFunctionPlan, CreatePythonFunctionPlan, CreateTablePlan, DropFunctionPlan, + DropStreamingTablePlan, DropTablePlan, PlanNode, ShowCatalogTablesPlan, + ShowCreateStreamingTablePlan, ShowCreateTablePlan, ShowFunctionsPlan, + ShowStreamingTablesPlan, StartFunctionPlan, StopFunctionPlan, StreamingTable, }; use crate::coordinator::statement::{ - CreateFunction, CreatePythonFunction, DropFunction, 
ShowFunctions, StartFunction, - StatementVisitor, StatementVisitorContext, StatementVisitorResult, StopFunction, + CreateFunction, CreatePythonFunction, CreateTable, DropFunction, + DropStreamingTableStatement, DropTableStatement, ShowCatalogTables, + ShowCreateStreamingTable, ShowCreateTable, ShowFunctions, ShowStreamingTables, + StartFunction, StatementVisitor, StatementVisitorContext, StatementVisitorResult, + StopFunction, StreamingTableStatement, +}; +use crate::coordinator::tool::ConnectorOptions; +use crate::sql::analysis::{ + maybe_add_key_extension_to_sink, rewrite_sinks, StreamSchemaProvider, }; +use crate::sql::common::with_option_keys as opt; +use crate::sql::extensions::sink::StreamEgressNode; +use crate::sql::functions::{is_json_union, serialize_outgoing_json}; +use crate::sql::logical_node::logical::{LogicalProgram, ProgramConfig}; +use crate::sql::logical_planner::optimizers::{produce_optimized_plan, ChainingOptimizer}; +use crate::sql::logical_planner::planner::PlanToGraphVisitor; +use crate::sql::rewrite_plan; +use crate::sql::schema::source_table::SourceTable; +use crate::sql::schema::{ColumnDescriptor, ConnectionType, Table}; -#[derive(Debug, Default)] -pub struct LogicalPlanVisitor; +#[derive(Clone)] +pub struct LogicalPlanVisitor { + schema_provider: StreamSchemaProvider, +} impl LogicalPlanVisitor { - pub fn new() -> Self { - Self + pub fn new(schema_provider: StreamSchemaProvider) -> Self { + Self { schema_provider } } pub fn visit(&self, analysis: &Analysis) -> Box { - let context = StatementVisitorContext::Empty; let stmt = analysis.statement(); + let context = StatementVisitorContext::Empty; - let result = stmt.accept(self, &context); - - match result { + match stmt.accept(self, &context) { StatementVisitorResult::Plan(plan) => plan, - _ => panic!("LogicalPlanVisitor should return Plan"), + _ => panic!("Fatal: LogicalPlanVisitor must yield a PlanNode variant"), + } + } + + pub fn build_streaming_table( + schema_provider: 
&StreamSchemaProvider, + stmt: &StreamingTableStatement, + ) -> Result { + Self::new(schema_provider.clone()).compile_streaming_sink(stmt) + } + + fn compile_streaming_sink( + &self, + stmt: &StreamingTableStatement, + ) -> Result { + let DFStatement::CreateStreamingTable { + name, + with_options, + comment, + query, + } = &stmt.statement + else { + return plan_err!("Statement mismatch: Expected CREATE STREAMING TABLE AST node"); + }; + + let sink_table_name = name.to_string(); + debug!("Initiating streaming sink compilation for identifier: {}", sink_table_name); + + let mut sink_properties = ConnectorOptions::new(with_options, &None)?; + let connector_type = sink_properties.pull_opt_str(opt::CONNECTOR)?.ok_or_else(|| { + plan_datafusion_err!( + "Validation Error: Streaming table '{}' requires the '{}' property", + sink_table_name, + opt::CONNECTOR + ) + })?; + + let partition_keys = Self::extract_partitioning_keys(&mut sink_properties)?; + + let sink_description = comment + .as_deref() + .map(str::trim) + .filter(|s| !s.is_empty()) + .map(str::to_string) + .unwrap_or_else(|| format!("sink `{}` ({connector_type})", sink_table_name)); + + let mut query_logical_plan = rewrite_plan( + produce_optimized_plan(&Statement::Query(query.clone()), &self.schema_provider)?, + &self.schema_provider, + )?; + + if query_logical_plan.schema().fields().iter().any(|f| is_json_union(f.data_type())) { + query_logical_plan = serialize_outgoing_json(&self.schema_provider, Arc::new(query_logical_plan)); + } + + let output_schema_fields = query_logical_plan + .schema() + .fields() + .iter() + .map(|f| ColumnDescriptor::from((**f).clone())) + .collect::>(); + + let mut sink_definition = SourceTable::from_options( + &sink_table_name, + &connector_type, + false, + output_schema_fields, + vec![], + None, + &mut sink_properties, + None, + &self.schema_provider, + Some(ConnectionType::Sink), + sink_description, + )?; + sink_definition.partition_exprs = Arc::new(partition_keys); + + let 
output_schema = query_logical_plan.schema().clone(); + let sink_plan_node = StreamEgressNode::try_new( + TableReference::bare(sink_table_name.clone()), + Table::ConnectorTable(sink_definition.clone()), + output_schema, + query_logical_plan, + )?; + + let mut rewritten_plans = rewrite_sinks(vec![maybe_add_key_extension_to_sink( + LogicalPlan::Extension(Extension { + node: Arc::new(sink_plan_node), + }), + )?])?; + + let final_logical_plan = rewritten_plans.remove(0); + + let validated_program = self.validate_graph_topology(&final_logical_plan)?; + + Ok(StreamingTable { + name: sink_table_name, + comment: comment.clone(), + program: validated_program, + }) + } + + fn validate_graph_topology(&self, logical_plan: &LogicalPlan) -> Result { + let mut session_config = SessionConfig::new(); + let opts = session_config.options_mut(); + opts.optimizer.enable_round_robin_repartition = false; + opts.optimizer.repartition_aggregations = false; + opts.optimizer.repartition_windows = false; + opts.optimizer.repartition_sorts = false; + opts.optimizer.repartition_joins = false; + opts.execution.target_partitions = 1; + + let session_state = SessionStateBuilder::new() + .with_config(session_config) + .with_default_features() + .with_physical_optimizer_rules(vec![]) + .build(); + + let mut graph_compiler = PlanToGraphVisitor::new(&self.schema_provider, &session_state); + graph_compiler.add_plan(logical_plan.clone())?; + + let mut executable_program = + LogicalProgram::new(graph_compiler.into_graph(), ProgramConfig::default()); + executable_program.optimize(&ChainingOptimizer {}); + + Ok(executable_program) + } + + fn extract_partitioning_keys( + options: &mut ConnectorOptions, + ) -> Result>> { + options + .pull_opt_str(opt::PARTITION_BY)? + .map(|raw_cols| raw_cols.split(',').map(|c| col(c.trim())).collect()) + .map(Ok) + .transpose() + } + + fn contains_connector_property(options: &[SqlOption]) -> bool { + options.iter().any(|opt| match opt { + SqlOption::KeyValue { key, .. 
} => key.value.eq_ignore_ascii_case(opt::CONNECTOR), + _ => false, + }) + } + + fn parse_primary_keys(constraints: &[TableConstraint]) -> Result> { + let mut keys = None; + for constraint in constraints { + if let TableConstraint::PrimaryKey { columns, .. } = constraint { + if keys.is_some() { + return plan_err!( + "Constraint Violation: Multiple PRIMARY KEY constraints are forbidden" + ); + } + keys = Some(columns.iter().map(|ident| ident.value.clone()).collect()); + } + } + Ok(keys.unwrap_or_default()) + } + + fn parse_watermark_strategy( + constraints: &[TableConstraint], + ) -> Result)>> { + let mut strategy = None; + for constraint in constraints { + if let TableConstraint::Watermark { + column_name, + watermark_expr, + } = constraint + { + if strategy.is_some() { + return plan_err!( + "Constraint Violation: Only a single WATERMARK FOR clause is permitted" + ); + } + strategy = Some((column_name.value.clone(), watermark_expr.clone())); + } } + Ok(strategy) + } + + fn compile_connector_source_plan( + &self, + stmt: &SqlCreateTable, + ) -> Result { + if stmt.query.is_some() { + return plan_err!("Syntax Error: CREATE TABLE ... AS SELECT combined with WITH ('connector'=...) is invalid. Use CREATE STREAMING TABLE instead."); + } + if stmt.or_replace { + return plan_err!( + "Syntax Error: OR REPLACE is not supported for external connector tables." + ); + } + if stmt.temporary { + return plan_err!( + "Syntax Error: TEMPORARY is not supported for external connector tables." 
+ ); + } + if stmt.external { + return plan_err!("Syntax Error: EXTERNAL keyword is redundant and unsupported for connector configurations."); + } + + let target_name = stmt.name.to_string(); + let table_description = stmt + .comment + .clone() + .map(|c| c.to_string()) + .unwrap_or_default(); + + let schema_compiler = datafusion::sql::planner::SqlToRel::new(&self.schema_provider); + let arrow_schema = schema_compiler.build_schema(stmt.columns.clone())?; + + let schema_descriptors = arrow_schema + .fields() + .iter() + .map(|f| ColumnDescriptor::from((**f).clone())) + .collect::>(); + + let mut connector_options = ConnectorOptions::new(&stmt.with_options, &None)?; + let adapter_type = connector_options.pull_opt_str(opt::CONNECTOR)?.ok_or_else(|| { + plan_datafusion_err!( + "Configuration Error: Missing required property '{}' in WITH clause", + opt::CONNECTOR + ) + })?; + + let pk_constraints = Self::parse_primary_keys(&stmt.constraints)?; + let watermark_strategy = Self::parse_watermark_strategy(&stmt.constraints)?; + + let source_definition = SourceTable::from_options( + &target_name, + &adapter_type, + false, + schema_descriptors, + pk_constraints, + watermark_strategy, + &mut connector_options, + None, + &self.schema_provider, + Some(ConnectionType::Source), + table_description, + )?; + + Ok(CreateTablePlan::connector_source( + source_definition, + stmt.if_not_exists, + )) } } @@ -45,24 +322,19 @@ impl StatementVisitor for LogicalPlanVisitor { fn visit_create_function( &self, stmt: &CreateFunction, - _context: &StatementVisitorContext, + _ctx: &StatementVisitorContext, ) -> StatementVisitorResult { - let function_source = stmt.get_function_source().clone(); - let config_source = stmt.get_config_source().cloned(); - let extra_props = stmt.get_extra_properties().clone(); - - // Name will be read from config file during execution StatementVisitorResult::Plan(Box::new(CreateFunctionPlan::new( - function_source, - config_source, - extra_props, + 
stmt.get_function_source().clone(), + stmt.get_config_source().cloned(), + stmt.get_extra_properties().clone(), ))) } fn visit_drop_function( &self, stmt: &DropFunction, - _context: &StatementVisitorContext, + _ctx: &StatementVisitorContext, ) -> StatementVisitorResult { StatementVisitorResult::Plan(Box::new(DropFunctionPlan::new(stmt.name.clone()))) } @@ -70,7 +342,7 @@ impl StatementVisitor for LogicalPlanVisitor { fn visit_start_function( &self, stmt: &StartFunction, - _context: &StatementVisitorContext, + _ctx: &StatementVisitorContext, ) -> StatementVisitorResult { StatementVisitorResult::Plan(Box::new(StartFunctionPlan::new(stmt.name.clone()))) } @@ -78,7 +350,7 @@ impl StatementVisitor for LogicalPlanVisitor { fn visit_stop_function( &self, stmt: &StopFunction, - _context: &StatementVisitorContext, + _ctx: &StatementVisitorContext, ) -> StatementVisitorResult { StatementVisitorResult::Plan(Box::new(StopFunctionPlan::new(stmt.name.clone()))) } @@ -86,24 +358,137 @@ impl StatementVisitor for LogicalPlanVisitor { fn visit_show_functions( &self, _stmt: &ShowFunctions, - _context: &StatementVisitorContext, + _ctx: &StatementVisitorContext, ) -> StatementVisitorResult { StatementVisitorResult::Plan(Box::new(ShowFunctionsPlan::new())) } + fn visit_show_catalog_tables( + &self, + _stmt: &ShowCatalogTables, + _ctx: &StatementVisitorContext, + ) -> StatementVisitorResult { + StatementVisitorResult::Plan(Box::new(ShowCatalogTablesPlan::new())) + } + + fn visit_show_create_table( + &self, + stmt: &ShowCreateTable, + _ctx: &StatementVisitorContext, + ) -> StatementVisitorResult { + StatementVisitorResult::Plan(Box::new(ShowCreateTablePlan::new( + stmt.table_name.clone(), + ))) + } + fn visit_create_python_function( &self, stmt: &CreatePythonFunction, - _context: &StatementVisitorContext, + _ctx: &StatementVisitorContext, ) -> StatementVisitorResult { - let class_name = stmt.get_class_name().to_string(); - let modules = stmt.get_modules().to_vec(); - let config_content = 
stmt.get_config_content().to_string(); - StatementVisitorResult::Plan(Box::new(CreatePythonFunctionPlan::new( - class_name, - modules, - config_content, + stmt.get_class_name().to_string(), + stmt.get_modules().to_vec(), + stmt.get_config_content().to_string(), ))) } -} + + fn visit_create_table( + &self, + stmt: &CreateTable, + _ctx: &StatementVisitorContext, + ) -> StatementVisitorResult { + if let Statement::CreateTable(ast_node) = &stmt.statement { + if ast_node.query.is_none() + && Self::contains_connector_property(&ast_node.with_options) + { + let execution_plan = self.compile_connector_source_plan(ast_node).unwrap_or_else( + |err| { + panic!("Fatal Compiler Error: Connector source resolution failed - {err:#}"); + }, + ); + return StatementVisitorResult::Plan(Box::new(execution_plan)); + } + } + + let schema_compiler = datafusion::sql::planner::SqlToRel::new(&self.schema_provider); + match schema_compiler.sql_statement_to_plan(stmt.statement.clone()) { + Ok(logical_plan) => { + debug!( + "Successfully compiled logical DDL topology:\n{}", + logical_plan.display_graphviz() + ); + StatementVisitorResult::Plan(Box::new(CreateTablePlan::new(logical_plan))) + } + Err(err) => panic!("Fatal Compiler Error: Logical plan translation failed - {err}"), + } + } + + fn visit_streaming_table_statement( + &self, + stmt: &StreamingTableStatement, + _ctx: &StatementVisitorContext, + ) -> StatementVisitorResult { + let execution_plan = self.compile_streaming_sink(stmt).unwrap_or_else(|err| { + panic!("Fatal Compiler Error: Streaming sink compilation aborted - {err}"); + }); + StatementVisitorResult::Plan(Box::new(execution_plan)) + } + + fn visit_drop_table_statement( + &self, + stmt: &DropTableStatement, + _ctx: &StatementVisitorContext, + ) -> StatementVisitorResult { + let DFStatement::Drop { + object_type, + if_exists, + names, + .. 
+ } = &stmt.statement + else { + panic!("Fatal Compiler Error: AST mismatch on DropTableStatement"); + }; + + if *object_type != ObjectType::Table { + panic!("Fatal Compiler Error: Drop target must be of type TABLE"); + } + if names.len() != 1 { + panic!("Fatal Compiler Error: Bulk drop operations are not supported. Specify exactly one table."); + } + + StatementVisitorResult::Plan(Box::new(DropTablePlan::new( + names[0].to_string(), + *if_exists, + ))) + } + + fn visit_show_streaming_tables( + &self, + _stmt: &ShowStreamingTables, + _ctx: &StatementVisitorContext, + ) -> StatementVisitorResult { + StatementVisitorResult::Plan(Box::new(ShowStreamingTablesPlan::new())) + } + + fn visit_show_create_streaming_table( + &self, + stmt: &ShowCreateStreamingTable, + _ctx: &StatementVisitorContext, + ) -> StatementVisitorResult { + StatementVisitorResult::Plan(Box::new(ShowCreateStreamingTablePlan::new( + stmt.table_name.clone(), + ))) + } + + fn visit_drop_streaming_table( + &self, + stmt: &DropStreamingTableStatement, + _ctx: &StatementVisitorContext, + ) -> StatementVisitorResult { + StatementVisitorResult::Plan(Box::new(DropStreamingTablePlan::new( + stmt.table_name.clone(), + stmt.if_exists, + ))) + } +} \ No newline at end of file diff --git a/src/coordinator/plan/lookup_table_plan.rs b/src/coordinator/plan/lookup_table_plan.rs new file mode 100644 index 00000000..65103b61 --- /dev/null +++ b/src/coordinator/plan/lookup_table_plan.rs @@ -0,0 +1,27 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +use crate::sql::schema::source_table::SourceTable; + +use super::{PlanNode, PlanVisitor, PlanVisitorContext, PlanVisitorResult}; + +/// Plan node that exposes a lookup table config as a logical plan input. +#[derive(Debug)] +pub struct LookupTablePlan { + pub table: SourceTable, +} + +impl PlanNode for LookupTablePlan { + fn accept(&self, visitor: &dyn PlanVisitor, context: &PlanVisitorContext) -> PlanVisitorResult { + visitor.visit_lookup_table(self, context) + } +} diff --git a/src/coordinator/plan/mod.rs b/src/coordinator/plan/mod.rs index 9aa403b5..8166d444 100644 --- a/src/coordinator/plan/mod.rs +++ b/src/coordinator/plan/mod.rs @@ -12,22 +12,42 @@ mod create_function_plan; mod create_python_function_plan; +mod create_table_plan; mod drop_function_plan; +mod drop_streaming_table_plan; +mod drop_table_plan; mod logical_plan_visitor; +mod lookup_table_plan; mod optimizer; +mod show_catalog_tables_plan; +mod show_create_streaming_table_plan; +mod show_create_table_plan; mod show_functions_plan; +mod show_streaming_tables_plan; mod start_function_plan; mod stop_function_plan; +mod streaming_table_connector_plan; +mod streaming_table_plan; mod visitor; pub use create_function_plan::CreateFunctionPlan; pub use create_python_function_plan::CreatePythonFunctionPlan; +pub use create_table_plan::{CreateTablePlan, CreateTablePlanBody}; pub use drop_function_plan::DropFunctionPlan; +pub use drop_streaming_table_plan::DropStreamingTablePlan; +pub use drop_table_plan::DropTablePlan; pub use logical_plan_visitor::LogicalPlanVisitor; +pub use lookup_table_plan::LookupTablePlan; pub use optimizer::LogicalPlanner; +pub use show_catalog_tables_plan::ShowCatalogTablesPlan; +pub use show_create_streaming_table_plan::ShowCreateStreamingTablePlan; +pub use show_create_table_plan::ShowCreateTablePlan; pub use show_functions_plan::ShowFunctionsPlan; +pub use 
show_streaming_tables_plan::ShowStreamingTablesPlan; pub use start_function_plan::StartFunctionPlan; pub use stop_function_plan::StopFunctionPlan; +pub use streaming_table_connector_plan::StreamingTableConnectorPlan; +pub use streaming_table_plan::StreamingTable; pub use visitor::{PlanVisitor, PlanVisitorContext, PlanVisitorResult}; use std::fmt; diff --git a/src/coordinator/plan/show_catalog_tables_plan.rs b/src/coordinator/plan/show_catalog_tables_plan.rs new file mode 100644 index 00000000..420fdb40 --- /dev/null +++ b/src/coordinator/plan/show_catalog_tables_plan.rs @@ -0,0 +1,28 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use super::{PlanNode, PlanVisitor, PlanVisitorContext, PlanVisitorResult}; + +#[derive(Debug, Default)] +pub struct ShowCatalogTablesPlan; + +impl ShowCatalogTablesPlan { + pub fn new() -> Self { + Self + } +} + +impl PlanNode for ShowCatalogTablesPlan { + fn accept(&self, visitor: &dyn PlanVisitor, context: &PlanVisitorContext) -> PlanVisitorResult { + visitor.visit_show_catalog_tables(self, context) + } +} diff --git a/src/coordinator/plan/show_create_streaming_table_plan.rs b/src/coordinator/plan/show_create_streaming_table_plan.rs new file mode 100644 index 00000000..8d63c0d5 --- /dev/null +++ b/src/coordinator/plan/show_create_streaming_table_plan.rs @@ -0,0 +1,30 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use super::{PlanNode, PlanVisitor, PlanVisitorContext, PlanVisitorResult}; + +#[derive(Debug, Clone)] +pub struct ShowCreateStreamingTablePlan { + pub table_name: String, +} + +impl ShowCreateStreamingTablePlan { + pub fn new(table_name: String) -> Self { + Self { table_name } + } +} + +impl PlanNode for ShowCreateStreamingTablePlan { + fn accept(&self, visitor: &dyn PlanVisitor, context: &PlanVisitorContext) -> PlanVisitorResult { + visitor.visit_show_create_streaming_table(self, context) + } +} diff --git a/src/coordinator/plan/show_create_table_plan.rs b/src/coordinator/plan/show_create_table_plan.rs new file mode 100644 index 00000000..c5fe6376 --- /dev/null +++ b/src/coordinator/plan/show_create_table_plan.rs @@ -0,0 +1,30 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use super::{PlanNode, PlanVisitor, PlanVisitorContext, PlanVisitorResult}; + +#[derive(Debug, Clone)] +pub struct ShowCreateTablePlan { + pub table_name: String, +} + +impl ShowCreateTablePlan { + pub fn new(table_name: String) -> Self { + Self { table_name } + } +} + +impl PlanNode for ShowCreateTablePlan { + fn accept(&self, visitor: &dyn PlanVisitor, context: &PlanVisitorContext) -> PlanVisitorResult { + visitor.visit_show_create_table(self, context) + } +} diff --git a/src/coordinator/plan/show_streaming_tables_plan.rs b/src/coordinator/plan/show_streaming_tables_plan.rs new file mode 100644 index 00000000..08410115 --- /dev/null +++ b/src/coordinator/plan/show_streaming_tables_plan.rs @@ -0,0 +1,28 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use super::{PlanNode, PlanVisitor, PlanVisitorContext, PlanVisitorResult}; + +#[derive(Debug, Default)] +pub struct ShowStreamingTablesPlan; + +impl ShowStreamingTablesPlan { + pub fn new() -> Self { + Self + } +} + +impl PlanNode for ShowStreamingTablesPlan { + fn accept(&self, visitor: &dyn PlanVisitor, context: &PlanVisitorContext) -> PlanVisitorResult { + visitor.visit_show_streaming_tables(self, context) + } +} diff --git a/src/coordinator/plan/streaming_table_connector_plan.rs b/src/coordinator/plan/streaming_table_connector_plan.rs new file mode 100644 index 00000000..214e2e15 --- /dev/null +++ b/src/coordinator/plan/streaming_table_connector_plan.rs @@ -0,0 +1,27 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use crate::sql::schema::source_table::SourceTable; + +use super::{PlanNode, PlanVisitor, PlanVisitorContext, PlanVisitorResult}; + +/// Plan node that exposes a connector table config as a logical plan input. 
+#[derive(Debug)] +pub struct StreamingTableConnectorPlan { + pub table: SourceTable, +} + +impl PlanNode for StreamingTableConnectorPlan { + fn accept(&self, visitor: &dyn PlanVisitor, context: &PlanVisitorContext) -> PlanVisitorResult { + visitor.visit_streaming_connector_table(self, context) + } +} diff --git a/src/coordinator/plan/streaming_table_plan.rs b/src/coordinator/plan/streaming_table_plan.rs new file mode 100644 index 00000000..512ec266 --- /dev/null +++ b/src/coordinator/plan/streaming_table_plan.rs @@ -0,0 +1,28 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use super::{PlanNode, PlanVisitor, PlanVisitorContext, PlanVisitorResult}; +use crate::sql::logical_node::logical::LogicalProgram; + +/// Plan node representing a fully resolved streaming table (DDL). +#[derive(Debug)] +pub struct StreamingTable { + pub name: String, + pub comment: Option, + pub program: LogicalProgram, +} + +impl PlanNode for StreamingTable { + fn accept(&self, visitor: &dyn PlanVisitor, context: &PlanVisitorContext) -> PlanVisitorResult { + visitor.visit_streaming_table(self, context) + } +} diff --git a/src/coordinator/plan/visitor.rs b/src/coordinator/plan/visitor.rs index 44059c67..bba44a1f 100644 --- a/src/coordinator/plan/visitor.rs +++ b/src/coordinator/plan/visitor.rs @@ -11,8 +11,11 @@ // limitations under the License. 
use super::{ - CreateFunctionPlan, CreatePythonFunctionPlan, DropFunctionPlan, ShowFunctionsPlan, - StartFunctionPlan, StopFunctionPlan, + CreateFunctionPlan, CreatePythonFunctionPlan, CreateTablePlan, DropFunctionPlan, + DropStreamingTablePlan, DropTablePlan, LookupTablePlan, ShowCatalogTablesPlan, + ShowCreateStreamingTablePlan, ShowCreateTablePlan, ShowFunctionsPlan, + ShowStreamingTablesPlan, StartFunctionPlan, StopFunctionPlan, StreamingTable, + StreamingTableConnectorPlan, }; /// Context passed to PlanVisitor methods @@ -79,9 +82,69 @@ pub trait PlanVisitor { context: &PlanVisitorContext, ) -> PlanVisitorResult; + fn visit_show_catalog_tables( + &self, + plan: &ShowCatalogTablesPlan, + context: &PlanVisitorContext, + ) -> PlanVisitorResult; + + fn visit_show_create_table( + &self, + plan: &ShowCreateTablePlan, + context: &PlanVisitorContext, + ) -> PlanVisitorResult; + fn visit_create_python_function( &self, plan: &CreatePythonFunctionPlan, context: &PlanVisitorContext, ) -> PlanVisitorResult; + + fn visit_create_table_plan( + &self, + plan: &CreateTablePlan, + context: &PlanVisitorContext, + ) -> PlanVisitorResult; + + fn visit_streaming_table( + &self, + plan: &StreamingTable, + context: &PlanVisitorContext, + ) -> PlanVisitorResult; + + fn visit_lookup_table( + &self, + plan: &LookupTablePlan, + context: &PlanVisitorContext, + ) -> PlanVisitorResult; + + fn visit_streaming_connector_table( + &self, + plan: &StreamingTableConnectorPlan, + context: &PlanVisitorContext, + ) -> PlanVisitorResult; + + fn visit_drop_table_plan( + &self, + plan: &DropTablePlan, + context: &PlanVisitorContext, + ) -> PlanVisitorResult; + + fn visit_show_streaming_tables( + &self, + plan: &ShowStreamingTablesPlan, + context: &PlanVisitorContext, + ) -> PlanVisitorResult; + + fn visit_show_create_streaming_table( + &self, + plan: &ShowCreateStreamingTablePlan, + context: &PlanVisitorContext, + ) -> PlanVisitorResult; + + fn visit_drop_streaming_table( + &self, + plan: 
&DropStreamingTablePlan, + context: &PlanVisitorContext, + ) -> PlanVisitorResult; } diff --git a/src/coordinator/runtime_context.rs b/src/coordinator/runtime_context.rs new file mode 100644 index 00000000..5d671b98 --- /dev/null +++ b/src/coordinator/runtime_context.rs @@ -0,0 +1,61 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Runtime resources for a single coordinator run: [`TaskManager`], [`CatalogManager`], and [`JobManager`]. + +use std::sync::Arc; + +use anyhow::Result; + +use crate::runtime::streaming::job::JobManager; +use crate::runtime::taskexecutor::TaskManager; +use crate::sql::schema::StreamSchemaProvider; +use crate::storage::stream_catalog::CatalogManager; + +/// Dependencies shared by analyze / plan / execute, analogous to installing globals in +/// [`TaskManager`], [`CatalogManager`], and [`JobManager`]. 
+#[derive(Clone)] +pub struct CoordinatorRuntimeContext { + pub task_manager: Arc, + pub catalog_manager: Arc, + pub job_manager: Arc, +} + +impl CoordinatorRuntimeContext { + pub fn try_from_globals() -> Result { + Ok(Self { + task_manager: TaskManager::get() + .map_err(|e| anyhow::anyhow!("Failed to get TaskManager: {}", e))?, + catalog_manager: CatalogManager::global() + .map_err(|e| anyhow::anyhow!("Failed to get CatalogManager: {}", e))?, + job_manager: JobManager::global() + .map_err(|e| anyhow::anyhow!("Failed to get JobManager: {}", e))?, + }) + } + + pub fn new( + task_manager: Arc, + catalog_manager: Arc, + job_manager: Arc, + ) -> Self { + Self { + task_manager, + catalog_manager, + job_manager, + } + } + + /// Schema provider for [`LogicalPlanVisitor`] / [`SqlToRel`]. + pub fn planning_schema_provider(&self) -> StreamSchemaProvider { + self.catalog_manager.acquire_planning_context() + } +} diff --git a/src/coordinator/statement/create_table.rs b/src/coordinator/statement/create_table.rs new file mode 100644 index 00000000..67a500d1 --- /dev/null +++ b/src/coordinator/statement/create_table.rs @@ -0,0 +1,44 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use datafusion::sql::sqlparser::ast::Statement as DFStatement; + +use super::{Statement, StatementVisitor, StatementVisitorContext, StatementVisitorResult}; + +/// Represents a CREATE TABLE or CREATE VIEW statement. 
+/// +/// This wraps the raw SQL AST node so the coordinator pipeline can +/// distinguish table/view creation from other streaming SQL operations. +#[derive(Debug)] +pub struct CreateTable { + pub statement: DFStatement, +} + +impl CreateTable { + pub fn new(statement: DFStatement) -> Self { + Self { statement } + } +} + +impl Statement for CreateTable { + fn accept( + &self, + visitor: &dyn StatementVisitor, + context: &StatementVisitorContext, + ) -> StatementVisitorResult { + visitor.visit_create_table(self, context) + } + + fn as_create_table(&self) -> Option<&CreateTable> { + Some(self) + } +} diff --git a/src/coordinator/statement/drop_streaming_table.rs b/src/coordinator/statement/drop_streaming_table.rs new file mode 100644 index 00000000..309abd97 --- /dev/null +++ b/src/coordinator/statement/drop_streaming_table.rs @@ -0,0 +1,40 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use super::{Statement, StatementVisitor, StatementVisitorContext, StatementVisitorResult}; + +/// `DROP STREAMING TABLE [IF EXISTS] ` — stops and removes the streaming +/// job from `JobManager`, then drops the corresponding catalog entry if present. 
+#[derive(Debug, Clone)] +pub struct DropStreamingTableStatement { + pub table_name: String, + pub if_exists: bool, +} + +impl DropStreamingTableStatement { + pub fn new(table_name: String, if_exists: bool) -> Self { + Self { + table_name, + if_exists, + } + } +} + +impl Statement for DropStreamingTableStatement { + fn accept( + &self, + visitor: &dyn StatementVisitor, + context: &StatementVisitorContext, + ) -> StatementVisitorResult { + visitor.visit_drop_streaming_table(self, context) + } +} diff --git a/src/coordinator/statement/drop_table.rs b/src/coordinator/statement/drop_table.rs new file mode 100644 index 00000000..fa547dca --- /dev/null +++ b/src/coordinator/statement/drop_table.rs @@ -0,0 +1,41 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use datafusion::sql::sqlparser::ast::Statement as DFStatement; + +use super::{Statement, StatementVisitor, StatementVisitorContext, StatementVisitorResult}; + +/// `DROP TABLE` / `DROP TABLE IF EXISTS` (and `DROP STREAMING TABLE`, normalized at parse time). 
+#[derive(Debug, Clone)] +pub struct DropTableStatement { + pub statement: DFStatement, +} + +impl DropTableStatement { + pub fn new(statement: DFStatement) -> Self { + Self { statement } + } +} + +impl Statement for DropTableStatement { + fn accept( + &self, + visitor: &dyn StatementVisitor, + context: &StatementVisitorContext, + ) -> StatementVisitorResult { + visitor.visit_drop_table_statement(self, context) + } + + fn as_drop_table_statement(&self) -> Option<&DropTableStatement> { + Some(self) + } +} diff --git a/src/coordinator/statement/mod.rs b/src/coordinator/statement/mod.rs index f887209c..80d9c320 100644 --- a/src/coordinator/statement/mod.rs +++ b/src/coordinator/statement/mod.rs @@ -12,18 +12,34 @@ mod create_function; mod create_python_function; +mod create_table; mod drop_function; +mod drop_streaming_table; +mod drop_table; +mod show_catalog_tables; +mod show_create_streaming_table; +mod show_create_table; mod show_functions; +mod show_streaming_tables; mod start_function; mod stop_function; +mod streaming_table; mod visitor; pub use create_function::{ConfigSource, CreateFunction, FunctionSource}; pub use create_python_function::{CreatePythonFunction, PythonModule}; +pub use create_table::CreateTable; pub use drop_function::DropFunction; +pub use drop_streaming_table::DropStreamingTableStatement; +pub use drop_table::DropTableStatement; +pub use show_catalog_tables::ShowCatalogTables; +pub use show_create_streaming_table::ShowCreateStreamingTable; +pub use show_create_table::ShowCreateTable; pub use show_functions::ShowFunctions; +pub use show_streaming_tables::ShowStreamingTables; pub use start_function::StartFunction; pub use stop_function::StopFunction; +pub use streaming_table::StreamingTableStatement; pub use visitor::{StatementVisitor, StatementVisitorContext, StatementVisitorResult}; use std::fmt; @@ -34,4 +50,20 @@ pub trait Statement: fmt::Debug + Send + Sync { visitor: &dyn StatementVisitor, context: &StatementVisitorContext, ) -> 
StatementVisitorResult; + + fn as_create_table(&self) -> Option<&CreateTable> { + None + } + + fn as_drop_table_statement(&self) -> Option<&DropTableStatement> { + None + } + + fn as_streaming_table_statement(&self) -> Option<&StreamingTableStatement> { + None + } + + fn as_drop_streaming_table_statement(&self) -> Option<&DropStreamingTableStatement> { + None + } } diff --git a/src/coordinator/statement/show_catalog_tables.rs b/src/coordinator/statement/show_catalog_tables.rs new file mode 100644 index 00000000..1f034562 --- /dev/null +++ b/src/coordinator/statement/show_catalog_tables.rs @@ -0,0 +1,33 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use super::{Statement, StatementVisitor, StatementVisitorContext, StatementVisitorResult}; + +/// `SHOW TABLES` over the stream catalog (connector sources + streaming sinks). 
+#[derive(Debug, Clone, Default)] +pub struct ShowCatalogTables; + +impl ShowCatalogTables { + pub fn new() -> Self { + Self + } +} + +impl Statement for ShowCatalogTables { + fn accept( + &self, + visitor: &dyn StatementVisitor, + context: &StatementVisitorContext, + ) -> StatementVisitorResult { + visitor.visit_show_catalog_tables(self, context) + } +} diff --git a/src/coordinator/statement/show_create_streaming_table.rs b/src/coordinator/statement/show_create_streaming_table.rs new file mode 100644 index 00000000..73f16870 --- /dev/null +++ b/src/coordinator/statement/show_create_streaming_table.rs @@ -0,0 +1,36 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use super::{Statement, StatementVisitor, StatementVisitorContext, StatementVisitorResult}; + +/// `SHOW CREATE STREAMING TABLE ` — displays the pipeline topology and +/// runtime metadata for the named streaming job. 
+#[derive(Debug, Clone)]
+pub struct ShowCreateStreamingTable {
+    // Name of the streaming job whose definition is to be displayed.
+    pub table_name: String,
+}
+
+impl ShowCreateStreamingTable {
+    pub fn new(table_name: String) -> Self {
+        Self { table_name }
+    }
+}
+
+impl Statement for ShowCreateStreamingTable {
+    fn accept(
+        &self,
+        visitor: &dyn StatementVisitor,
+        context: &StatementVisitorContext,
+    ) -> StatementVisitorResult {
+        visitor.visit_show_create_streaming_table(self, context)
+    }
+}
diff --git a/src/coordinator/statement/show_create_table.rs b/src/coordinator/statement/show_create_table.rs
new file mode 100644
index 00000000..5b54a726
--- /dev/null
+++ b/src/coordinator/statement/show_create_table.rs
@@ -0,0 +1,35 @@
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use super::{Statement, StatementVisitor, StatementVisitorContext, StatementVisitorResult};
+
+/// `SHOW CREATE TABLE <table_name>` for a stream-catalog table.
+#[derive(Debug, Clone)] +pub struct ShowCreateTable { + pub table_name: String, +} + +impl ShowCreateTable { + pub fn new(table_name: String) -> Self { + Self { table_name } + } +} + +impl Statement for ShowCreateTable { + fn accept( + &self, + visitor: &dyn StatementVisitor, + context: &StatementVisitorContext, + ) -> StatementVisitorResult { + visitor.visit_show_create_table(self, context) + } +} diff --git a/src/coordinator/statement/show_streaming_tables.rs b/src/coordinator/statement/show_streaming_tables.rs new file mode 100644 index 00000000..cedf3610 --- /dev/null +++ b/src/coordinator/statement/show_streaming_tables.rs @@ -0,0 +1,33 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use super::{Statement, StatementVisitor, StatementVisitorContext, StatementVisitorResult}; + +/// `SHOW STREAMING TABLES` — lists all active streaming jobs managed by `JobManager`. 
+#[derive(Debug, Clone, Default)] +pub struct ShowStreamingTables; + +impl ShowStreamingTables { + pub fn new() -> Self { + Self + } +} + +impl Statement for ShowStreamingTables { + fn accept( + &self, + visitor: &dyn StatementVisitor, + context: &StatementVisitorContext, + ) -> StatementVisitorResult { + visitor.visit_show_streaming_tables(self, context) + } +} diff --git a/src/coordinator/statement/streaming_table.rs b/src/coordinator/statement/streaming_table.rs new file mode 100644 index 00000000..bfef3503 --- /dev/null +++ b/src/coordinator/statement/streaming_table.rs @@ -0,0 +1,44 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use datafusion::sql::sqlparser::ast::Statement as DFStatement; + +use super::{Statement, StatementVisitor, StatementVisitorContext, StatementVisitorResult}; + +/// Wrapper for **`CREATE STREAMING TABLE ... WITH (...) AS SELECT ...`** (parsed AST). +/// +/// The coordinator `parse_sql` frontend does **not** support `INSERT`; streaming sinks are +/// defined only via **`CREATE STREAMING TABLE`** (and regular tables via **`CREATE TABLE`**). 
+#[derive(Debug)] +pub struct StreamingTableStatement { + pub statement: DFStatement, +} + +impl StreamingTableStatement { + pub fn new(statement: DFStatement) -> Self { + Self { statement } + } +} + +impl Statement for StreamingTableStatement { + fn accept( + &self, + visitor: &dyn StatementVisitor, + context: &StatementVisitorContext, + ) -> StatementVisitorResult { + visitor.visit_streaming_table_statement(self, context) + } + + fn as_streaming_table_statement(&self) -> Option<&StreamingTableStatement> { + Some(self) + } +} diff --git a/src/coordinator/statement/visitor.rs b/src/coordinator/statement/visitor.rs index 13ce2cfc..c3cf153a 100644 --- a/src/coordinator/statement/visitor.rs +++ b/src/coordinator/statement/visitor.rs @@ -11,7 +11,10 @@ // limitations under the License. use super::{ - CreateFunction, CreatePythonFunction, DropFunction, ShowFunctions, StartFunction, StopFunction, + CreateFunction, CreatePythonFunction, CreateTable, DropFunction, + DropStreamingTableStatement, DropTableStatement, ShowCatalogTables, + ShowCreateStreamingTable, ShowCreateTable, ShowFunctions, ShowStreamingTables, + StartFunction, StopFunction, StreamingTableStatement, }; use crate::coordinator::plan::PlanNode; use crate::coordinator::statement::Statement; @@ -82,9 +85,57 @@ pub trait StatementVisitor { context: &StatementVisitorContext, ) -> StatementVisitorResult; + fn visit_show_catalog_tables( + &self, + stmt: &ShowCatalogTables, + context: &StatementVisitorContext, + ) -> StatementVisitorResult; + + fn visit_show_create_table( + &self, + stmt: &ShowCreateTable, + context: &StatementVisitorContext, + ) -> StatementVisitorResult; + fn visit_create_python_function( &self, stmt: &CreatePythonFunction, context: &StatementVisitorContext, ) -> StatementVisitorResult; + + fn visit_create_table( + &self, + stmt: &CreateTable, + context: &StatementVisitorContext, + ) -> StatementVisitorResult; + + fn visit_streaming_table_statement( + &self, + stmt: &StreamingTableStatement, + 
context: &StatementVisitorContext, + ) -> StatementVisitorResult; + + fn visit_drop_table_statement( + &self, + stmt: &DropTableStatement, + context: &StatementVisitorContext, + ) -> StatementVisitorResult; + + fn visit_show_streaming_tables( + &self, + stmt: &ShowStreamingTables, + context: &StatementVisitorContext, + ) -> StatementVisitorResult; + + fn visit_show_create_streaming_table( + &self, + stmt: &ShowCreateStreamingTable, + context: &StatementVisitorContext, + ) -> StatementVisitorResult; + + fn visit_drop_streaming_table( + &self, + stmt: &DropStreamingTableStatement, + context: &StatementVisitorContext, + ) -> StatementVisitorResult; } diff --git a/src/coordinator/tool/mod.rs b/src/coordinator/tool/mod.rs new file mode 100644 index 00000000..6b48aa0e --- /dev/null +++ b/src/coordinator/tool/mod.rs @@ -0,0 +1,13 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +pub use crate::sql::common::ConnectorOptions; diff --git a/src/main.rs b/src/main.rs index 562b1526..1faf45f1 100644 --- a/src/main.rs +++ b/src/main.rs @@ -179,9 +179,7 @@ fn main() -> Result<()> { ); // 2. Component Initialization - let registry = server::register_components(); - registry - .initialize_all(&config) + server::bootstrap_system(&config) .context("Component initialization failed")?; // 3. 
Server Startup diff --git a/src/runtime/mod.rs b/src/runtime/mod.rs index f69ad017..61b67e1f 100644 --- a/src/runtime/mod.rs +++ b/src/runtime/mod.rs @@ -14,10 +14,12 @@ pub mod buffer_and_event; pub mod common; -pub mod input; -pub mod output; -pub mod processor; -pub mod sink; -pub mod source; +pub mod streaming; +pub mod util; pub mod task; pub mod taskexecutor; +pub mod wasm; + +pub use wasm::input; +pub use wasm::output; +pub use wasm::processor; diff --git a/src/runtime/streaming/api/context.rs b/src/runtime/streaming/api/context.rs new file mode 100644 index 00000000..f0c3dfcb --- /dev/null +++ b/src/runtime/streaming/api/context.rs @@ -0,0 +1,124 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+use crate::runtime::streaming::memory::MemoryPool;
+use crate::runtime::streaming::protocol::event::StreamEvent;
+use crate::runtime::streaming::protocol::tracked::TrackedEvent;
+use crate::runtime::streaming::network::endpoint::PhysicalSender;
+
+use arrow_array::RecordBatch;
+use std::sync::Arc;
+
+/// Per-subtask execution context: identity, downstream output channels,
+/// memory accounting, and the subtask's current watermark.
+pub struct TaskContext {
+    pub job_id: String,
+    pub vertex_id: u32,
+    pub subtask_idx: u32,
+    pub parallelism: u32,
+
+    // One sender per downstream physical channel.
+    pub outboxes: Vec<PhysicalSender>,
+
+    // Shared pool used to account for in-flight batch memory before sending.
+    memory_pool: Arc<MemoryPool>,
+
+    // Highest watermark observed so far; never regresses (see advance_watermark).
+    current_watermark: Option<std::time::SystemTime>,
+}
+
+impl TaskContext {
+    pub fn new(
+        job_id: String,
+        vertex_id: u32,
+        subtask_idx: u32,
+        parallelism: u32,
+        outboxes: Vec<PhysicalSender>,
+        memory_pool: Arc<MemoryPool>,
+    ) -> Self {
+        Self {
+            job_id,
+            vertex_id,
+            subtask_idx,
+            parallelism,
+            outboxes,
+            memory_pool,
+            current_watermark: None,
+        }
+    }
+
+    // ========================================================================
+    // Watermark bookkeeping
+    // ========================================================================
+
+    /// Last watermark seen by this subtask, if any.
+    pub fn last_present_watermark(&self) -> Option<std::time::SystemTime> {
+        self.current_watermark
+    }
+
+    /// Advances the watermark; older or equal values are ignored so it is
+    /// monotonically non-decreasing.
+    pub fn advance_watermark(&mut self, watermark: std::time::SystemTime) {
+        if let Some(current) = self.current_watermark {
+            if watermark > current {
+                self.current_watermark = Some(watermark);
+            }
+        } else {
+            self.current_watermark = Some(watermark);
+        }
+    }
+
+    // ========================================================================
+    // Identity
+    // ========================================================================
+
+    /// Human-readable identity string used in logs.
+    pub fn task_identity(&self) -> String {
+        format!(
+            "Job[{}], Vertex[{}], Subtask[{}/{}]",
+            self.job_id, self.vertex_id, self.subtask_idx, self.parallelism
+        )
+    }
+
+    // ========================================================================
+    // Emission
+    // ========================================================================
+
+    /// Sends `batch` to every outbox, first reserving memory from the pool.
+    pub async fn collect(&self, batch: RecordBatch) -> anyhow::Result<()> {
+        if self.outboxes.is_empty() {
+            return Ok(());
+        }
+
+        let bytes_required = batch.get_array_memory_size();
+        let ticket = self.memory_pool.request_memory(bytes_required).await;
+        let tracked_event = TrackedEvent::new(StreamEvent::Data(batch), Some(ticket));
+
+        for outbox in &self.outboxes {
+            outbox.send(tracked_event.clone()).await?;
+        }
+        Ok(())
+    }
+
+    /// Sends `batch` to the single outbox selected by `key_hash` (hash partitioning).
+    pub async fn collect_keyed(
+        &self,
+        key_hash: u64,
+        batch: RecordBatch,
+    ) -> anyhow::Result<()> {
+        if self.outboxes.is_empty() {
+            return Ok(());
+        }
+
+        let bytes_required = batch.get_array_memory_size();
+        let ticket = self.memory_pool.request_memory(bytes_required).await;
+        let tracked_event = TrackedEvent::new(StreamEvent::Data(batch), Some(ticket));
+
+        let target_idx = (key_hash as usize) % self.outboxes.len();
+        self.outboxes[target_idx].send(tracked_event).await?;
+        Ok(())
+    }
+
+    /// Sends a control event (watermark/barrier/EOS) to every outbox;
+    /// control events carry no memory ticket.
+    pub async fn broadcast(&self, event: StreamEvent) -> anyhow::Result<()> {
+        let tracked_event = TrackedEvent::control(event);
+        for outbox in &self.outboxes {
+            outbox.send(tracked_event.clone()).await?;
+        }
+        Ok(())
+    }
+}
diff --git a/src/runtime/sink/mod.rs b/src/runtime/streaming/api/mod.rs
similarity index 91%
rename from src/runtime/sink/mod.rs
rename to src/runtime/streaming/api/mod.rs
index a0a2a6fc..e78ba371 100644
--- a/src/runtime/sink/mod.rs
+++ b/src/runtime/streaming/api/mod.rs
@@ -10,6 +10,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// Sink module
-// TODO: Add sink implementation here
+pub mod context;
+pub mod operator;
+pub mod source;
+
diff --git a/src/runtime/streaming/api/operator.rs b/src/runtime/streaming/api/operator.rs
new file mode 100644
index 00000000..9acc6e06
--- /dev/null
+++ b/src/runtime/streaming/api/operator.rs
@@ -0,0 +1,80 @@
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use crate::runtime::streaming::api::context::TaskContext;
+use crate::runtime::streaming::api::source::SourceOperator;
+use crate::runtime::streaming::protocol::stream_out::StreamOutput;
+use arrow_array::RecordBatch;
+use async_trait::async_trait;
+use std::time::Duration;
+use crate::sql::common::{CheckpointBarrier, Watermark};
+
+// ---------------------------------------------------------------------------
+// ConstructedOperator
+// ---------------------------------------------------------------------------
+
+/// A fully-built vertex operator: either a source or a regular operator.
+pub enum ConstructedOperator {
+    Source(Box<dyn SourceOperator>),
+    Operator(Box<dyn Operator>),
+}
+
+/// A non-source streaming operator. Every hook receives the subtask's
+/// [`TaskContext`]; data/watermark hooks return outputs to be dispatched.
+#[async_trait]
+pub trait Operator: Send + 'static {
+    fn name(&self) -> &str;
+
+    /// Called once before the first event; default is a no-op.
+    async fn on_start(&mut self, _ctx: &mut TaskContext) -> anyhow::Result<()> {
+        Ok(())
+    }
+
+    /// Processes one batch arriving on input `input_idx`.
+    async fn process_data(
+        &mut self,
+        input_idx: usize,
+        batch: RecordBatch,
+        ctx: &mut TaskContext,
+    ) -> anyhow::Result<Vec<StreamOutput>>;
+
+    /// Reacts to an (already aligned) watermark.
+    async fn process_watermark(
+        &mut self,
+        watermark: Watermark,
+        ctx: &mut TaskContext,
+    ) -> anyhow::Result<Vec<StreamOutput>>;
+
+    /// Persists operator state for the checkpoint identified by `barrier`.
+    async fn snapshot_state(
+        &mut self,
+        barrier: CheckpointBarrier,
+        ctx: &mut TaskContext,
+    ) -> anyhow::Result<()>;
+
+    /// Called once the checkpoint for `_epoch` is globally committed; no-op by default.
+    async fn commit_checkpoint(
+        &mut self,
+        _epoch: u32,
+        _ctx: &mut TaskContext,
+    ) -> anyhow::Result<()> {
+        Ok(())
+    }
+
+    /// If `Some`, the runner invokes `process_tick` at this interval.
+    fn tick_interval(&self) -> Option<Duration> {
+        None
+    }
+
+    async fn process_tick(
+        &mut self,
+        _tick_index: u64,
+        _ctx: &mut TaskContext,
+    ) -> anyhow::Result<Vec<StreamOutput>> {
+        Ok(vec![])
+    }
+
+    /// Final flush on shutdown; returned outputs are still dispatched downstream.
+    async fn on_close(&mut self, _ctx: &mut TaskContext) -> anyhow::Result<Vec<StreamOutput>> {
+        Ok(vec![])
+    }
+}
diff --git
a/src/runtime/streaming/api/source.rs b/src/runtime/streaming/api/source.rs
new file mode 100644
index 00000000..f46f3de7
--- /dev/null
+++ b/src/runtime/streaming/api/source.rs
@@ -0,0 +1,58 @@
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+use crate::runtime::streaming::api::context::TaskContext;
+use arrow_array::RecordBatch;
+use async_trait::async_trait;
+use crate::sql::common::{CheckpointBarrier, Watermark};
+
+/// Starting-offset policy for a source (defaults to the consumer group's offset).
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
+pub enum SourceOffset {
+    Earliest,
+    Latest,
+    #[default]
+    Group,
+}
+
+/// One unit of progress reported by [`SourceOperator::fetch_next`].
+#[derive(Debug)]
+pub enum SourceEvent {
+    Data(RecordBatch),
+    Watermark(Watermark),
+    Idle,
+    EndOfStream,
+}
+
+/// A streaming source. Driven by the source runner's fetch loop.
+#[async_trait]
+pub trait SourceOperator: Send + 'static {
+    fn name(&self) -> &str;
+
+    /// Called once before the first fetch; default is a no-op.
+    async fn on_start(&mut self, _ctx: &mut TaskContext) -> anyhow::Result<()> {
+        Ok(())
+    }
+
+    /// Pulls the next event; `Idle` tells the runner to back off briefly.
+    async fn fetch_next(&mut self, ctx: &mut TaskContext) -> anyhow::Result<SourceEvent>;
+
+    /// Optional periodic watermark, polled by the runner on a timer.
+    fn poll_watermark(&mut self) -> Option<Watermark> {
+        None
+    }
+
+    /// Persists source state (e.g. offsets) for the checkpoint in `barrier`.
+    async fn snapshot_state(
+        &mut self,
+        barrier: CheckpointBarrier,
+        ctx: &mut TaskContext,
+    ) -> anyhow::Result<()>;
+
+    async fn on_close(&mut self, _ctx: &mut TaskContext) -> anyhow::Result<()> {
+        Ok(())
+    }
+}
diff --git a/src/runtime/streaming/error.rs b/src/runtime/streaming/error.rs
new file mode 100644
index 00000000..178f5bbb
--- /dev/null
+++ b/src/runtime/streaming/error.rs
@@ -0,0 +1,46 @@
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use
this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::fmt::Display;
+use thiserror::Error;
+
+/// Errors surfaced by the streaming runtime's execution layer.
+#[derive(Debug, Error)]
+pub enum RunError {
+    #[error("Operator execution failed: {0:#}")]
+    Operator(#[from] anyhow::Error),
+
+    #[error("Downstream send failed: {0}")]
+    DownstreamSend(String),
+
+    #[error("Internal engine error: {0}")]
+    Internal(String),
+
+    #[error("State backend error: {0}")]
+    State(String),
+
+    #[error("I/O error: {0}")]
+    Io(#[from] std::io::Error),
+}
+
+impl RunError {
+    /// Convenience constructor for [`RunError::Internal`].
+    pub fn internal<T: Display>(msg: T) -> Self {
+        Self::Internal(msg.to_string())
+    }
+
+    /// Convenience constructor for [`RunError::DownstreamSend`].
+    pub fn downstream<T: Display>(msg: T) -> Self {
+        Self::DownstreamSend(msg.to_string())
+    }
+
+    /// Convenience constructor for [`RunError::State`].
+    pub fn state<T: Display>(msg: T) -> Self {
+        Self::State(msg.to_string())
+    }
+}
\ No newline at end of file
diff --git a/src/runtime/source/mod.rs b/src/runtime/streaming/execution/mod.rs
similarity index 90%
rename from src/runtime/source/mod.rs
rename to src/runtime/streaming/execution/mod.rs
index 8a05bf30..1a8401ef 100644
--- a/src/runtime/source/mod.rs
+++ b/src/runtime/streaming/execution/mod.rs
@@ -10,6 +10,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-// Source module
-// TODO: Add source implementation here
+pub mod runner;
+pub mod source;
+pub mod tracker;
+
diff --git a/src/runtime/streaming/execution/runner.rs b/src/runtime/streaming/execution/runner.rs
new file mode 100644
index 00000000..c4981d93
--- /dev/null
+++ b/src/runtime/streaming/execution/runner.rs
@@ -0,0 +1,375 @@
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use async_trait::async_trait;
+use tokio::sync::mpsc::Receiver;
+use tokio_stream::{StreamExt, StreamMap};
+use tracing::{info, info_span, Instrument};
+
+use crate::runtime::streaming::api::context::TaskContext;
+use crate::runtime::streaming::api::operator::Operator;
+use crate::runtime::streaming::error::RunError;
+use crate::runtime::streaming::network::endpoint::BoxedEventStream;
+use crate::runtime::streaming::protocol::{
+    control::{ControlCommand, StopMode},
+    event::StreamEvent,
+    stream_out::StreamOutput,
+    tracked::TrackedEvent,
+};
+use crate::runtime::streaming::execution::tracker::{
+    barrier_aligner::{AlignmentStatus, BarrierAligner},
+    watermark_tracker::WatermarkTracker,
+};
+use crate::sql::common::{CheckpointBarrier, Watermark};
+
+// ==========================================
+// OperatorDrive
+// ==========================================
+
+/// Driver abstraction over one operator chain link.
+#[async_trait]
+pub trait OperatorDrive: Send {
+    async fn on_start(&mut self, ctx: &mut TaskContext) -> Result<(), RunError>;
+    /// Returns `Ok(true)` when the pipeline should stop.
+    async fn process_event(
+        &mut self,
+        input_idx: usize,
+        event: TrackedEvent,
+        ctx: &mut TaskContext,
+    ) -> Result<bool, RunError>;
+    async fn handle_control(
+        &mut self,
+        cmd: ControlCommand,
+        ctx: &mut TaskContext,
+    ) -> Result<bool, RunError>;
+    async fn on_close(&mut self, ctx: &mut TaskContext) -> Result<(), RunError>;
+}
+
+/// One link of an operator chain: runs `operator` and feeds results to `next`,
+/// or to the TaskContext outboxes when it is the chain tail.
+pub struct ChainedDriver {
+    operator: Box<dyn Operator>,
+    next: Option<Box<dyn OperatorDrive>>,
+}
+
+impl ChainedDriver {
+    pub fn new(operator: Box<dyn Operator>, next: Option<Box<dyn OperatorDrive>>) -> Self {
+        Self { operator, next }
+    }
+
+    /// Folds `operators` (head..tail) into a linked chain; `None` when empty.
+    pub fn build_chain(mut operators: Vec<Box<dyn Operator>>) -> Option<Box<dyn OperatorDrive>> {
+        if operators.is_empty() {
+            return None;
+        }
+        let mut next_driver: Option<Box<dyn OperatorDrive>> = None;
+        while let Some(op) = operators.pop() {
+            let current = ChainedDriver::new(op, next_driver);
+            next_driver = Some(Box::new(current));
+        }
+        next_driver
+    }
+
+    /// Routes operator outputs: forwarded data goes to the next link (or the
+    /// outboxes at the tail); keyed/broadcast outputs are only legal at the tail.
+    async fn dispatch_outputs(
+        &mut self,
+        outputs: Vec<StreamOutput>,
+        ctx: &mut TaskContext,
+    ) -> Result<(), RunError> {
+        for out in outputs {
+            match out {
+                StreamOutput::Forward(b) => {
+                    if let Some(next) = &mut self.next {
+                        next.process_event(0, TrackedEvent::control(StreamEvent::Data(b)), ctx)
+                            .await?;
+                    } else {
+                        ctx.collect(b).await?;
+                    }
+                }
+                StreamOutput::Keyed(hash, b) => {
+                    if self.next.is_some() {
+                        return Err(RunError::internal(format!(
+                            "Topology Error: Keyed output emitted in the middle of chain by '{}'",
+                            self.operator.name()
+                        )));
+                    }
+                    ctx.collect_keyed(hash, b).await?;
+                }
+                StreamOutput::Broadcast(b) => {
+                    if self.next.is_some() {
+                        return Err(RunError::internal(format!(
+                            "Topology Error: Broadcast output emitted in the middle of chain by '{}'",
+                            self.operator.name()
+                        )));
+                    }
+                    ctx.collect(b).await?;
+                }
+                StreamOutput::Watermark(wm) => {
+                    if let Some(next) = &mut self.next {
+                        next.process_event(
+                            0,
+                            TrackedEvent::control(StreamEvent::Watermark(wm)),
+                            ctx,
+                        )
+                        .await?;
+                    } else {
+                        ctx.broadcast(StreamEvent::Watermark(wm)).await?;
+                    }
+                }
+            }
+        }
+        Ok(())
+    }
+
+    /// Propagates a control-plane event down the chain, broadcasting at the tail.
+    async fn forward_signal(
+        &mut self,
+        event: StreamEvent,
+        ctx: &mut TaskContext,
+    ) -> Result<(), RunError> {
+        if let Some(next) = &mut self.next {
+            next.process_event(0, TrackedEvent::control(event), ctx).await?;
+        } else {
+            match event {
+                StreamEvent::Watermark(wm) => ctx.broadcast(StreamEvent::Watermark(wm)).await?,
+                StreamEvent::Barrier(b) => ctx.broadcast(StreamEvent::Barrier(b)).await?,
+                StreamEvent::EndOfStream => ctx.broadcast(StreamEvent::EndOfStream).await?,
+                // Data is always routed via dispatch_outputs, never as a signal.
+                StreamEvent::Data(_) => unreachable!(),
+            }
+        }
+        Ok(())
+    }
+}
+
+#[async_trait]
+impl OperatorDrive for ChainedDriver {
+    async fn on_start(&mut self, ctx: &mut TaskContext) -> Result<(), RunError> {
+        self.operator.on_start(ctx).await?;
+        if let Some(next) = &mut self.next {
+            next.on_start(ctx).await?;
+        }
+        Ok(())
+    }
+
+    async fn process_event(
+        &mut self,
+        input_idx: usize,
+        tracked: TrackedEvent,
+        ctx: &mut TaskContext,
+    ) -> Result<bool, RunError> {
+        let mut should_stop = false;
+        match tracked.event {
+            StreamEvent::Data(batch) => {
+                let outputs = self.operator.process_data(input_idx, batch, ctx).await?;
+                self.dispatch_outputs(outputs, ctx).await?;
+            }
+            StreamEvent::Watermark(wm) => {
+                let outputs = self.operator.process_watermark(wm.clone(), ctx).await?;
+                self.dispatch_outputs(outputs, ctx).await?;
+                self.forward_signal(StreamEvent::Watermark(wm), ctx).await?;
+            }
+            StreamEvent::Barrier(barrier) => {
+                self.operator.snapshot_state(barrier.clone(), ctx).await?;
+                self.forward_signal(StreamEvent::Barrier(barrier), ctx).await?;
+            }
+            StreamEvent::EndOfStream => {
+                should_stop = true;
+                self.forward_signal(StreamEvent::EndOfStream, ctx).await?;
+            }
+        }
+        Ok(should_stop)
+    }
+
+    async fn handle_control(
+        &mut self,
+        cmd: ControlCommand,
+        ctx: &mut TaskContext,
+    ) -> Result<bool, RunError> {
+        let mut stop = false;
+        match &cmd {
+            ControlCommand::TriggerCheckpoint { barrier } => {
+                let b: CheckpointBarrier = barrier.clone().into();
+                self.operator.snapshot_state(b, ctx).await?;
+            }
+            ControlCommand::Commit { epoch } => {
+                self.operator.commit_checkpoint(*epoch, ctx).await?;
+            }
+            ControlCommand::Stop { mode } => {
+                if *mode == StopMode::Immediate {
+                    stop = true;
+                }
+            }
+            ControlCommand::DropState | ControlCommand::Start | ControlCommand::UpdateConfig { .. } => {}
+        }
+
+        if let Some(next) = &mut self.next {
+            if next.handle_control(cmd, ctx).await? {
+                stop = true;
+            }
+        } else if let ControlCommand::TriggerCheckpoint { barrier } = cmd {
+            // Chain tail: the barrier enters the data plane for downstream tasks.
+            ctx.broadcast(StreamEvent::Barrier(barrier.into())).await?;
+        }
+
+        Ok(stop)
+    }
+
+    async fn on_close(&mut self, ctx: &mut TaskContext) -> Result<(), RunError> {
+        let close_outs = self.operator.on_close(ctx).await?;
+        self.dispatch_outputs(close_outs, ctx).await?;
+        if let Some(next) = &mut self.next {
+            next.on_close(ctx).await?;
+        }
+        Ok(())
+    }
+}
+
+// ==========================================
+// Pipeline
+// ==========================================
+
+/// Event loop for one non-source subtask: consumes inbox streams, aligns
+/// barriers/watermarks across inputs, and drives the operator chain.
+pub struct Pipeline {
+    chain_head: Box<dyn OperatorDrive>,
+    ctx: TaskContext,
+    inboxes: Vec<BoxedEventStream>,
+    control_rx: Receiver<ControlCommand>,
+
+    wm_tracker: WatermarkTracker,
+    barrier_aligner: BarrierAligner,
+    // Inputs parked during barrier alignment, indexed by input position.
+    paused_streams: Vec<Option<BoxedEventStream>>,
+}
+
+impl Pipeline {
+    pub fn new(
+        operators: Vec<Box<dyn Operator>>,
+        ctx: TaskContext,
+        inboxes: Vec<BoxedEventStream>,
+        control_rx: Receiver<ControlCommand>,
+    ) -> Result<Self, RunError> {
+        let input_count = inboxes.len();
+        let chain_head = ChainedDriver::build_chain(operators)
+            .ok_or_else(|| RunError::internal("Cannot build pipeline with empty operators"))?;
+
+        let paused_streams = (0..input_count).map(|_| None).collect();
+
+        Ok(Self {
+            chain_head,
+            ctx,
+            inboxes,
+            control_rx,
+            wm_tracker: WatermarkTracker::new(input_count),
+            barrier_aligner: BarrierAligner::new(input_count),
+            paused_streams,
+        })
+    }
+
+    pub async fn run(mut self) -> Result<(), RunError> {
+        let span = info_span!(
+            "pipeline_run",
+            job_id = %self.ctx.job_id,
+            vertex = self.ctx.vertex_id
+        );
+
+        async move {
+            info!("Pipeline initializing...");
+            self.chain_head.on_start(&mut self.ctx).await?;
+
+            let mut active_streams = StreamMap::new();
+            for (i, stream) in std::mem::take(&mut self.inboxes).into_iter().enumerate() {
+                active_streams.insert(i, stream);
+            }
+
+            loop {
+                tokio::select! {
+                    // Control commands take priority over data.
+                    biased;
+
+                    Some(cmd) = self.control_rx.recv() => {
+                        if self.chain_head.handle_control(cmd, &mut self.ctx).await? {
+                            break;
+                        }
+                    }
+
+                    Some((idx, tracked_event)) = active_streams.next() => {
+                        match tracked_event.event {
+                            StreamEvent::Data(batch) => {
+                                self.chain_head
+                                    .process_event(
+                                        idx,
+                                        TrackedEvent::control(StreamEvent::Data(batch)),
+                                        &mut self.ctx,
+                                    )
+                                    .await?;
+                            }
+
+                            StreamEvent::Barrier(barrier) => {
+                                match self.barrier_aligner.mark(idx, &barrier) {
+                                    AlignmentStatus::Pending => {
+                                        // Park this input until all inputs reach the barrier.
+                                        if let Some(stream) = active_streams.remove(&idx) {
+                                            self.paused_streams[idx] = Some(stream);
+                                        }
+                                    }
+                                    AlignmentStatus::Complete => {
+                                        self.chain_head
+                                            .process_event(
+                                                idx,
+                                                TrackedEvent::control(StreamEvent::Barrier(barrier)),
+                                                &mut self.ctx,
+                                            )
+                                            .await?;
+
+                                        for i in 0..self.paused_streams.len() {
+                                            if let Some(stream) = self.paused_streams[i].take() {
+                                                active_streams.insert(i, stream);
+                                            }
+                                        }
+                                    }
+                                }
+                            }
+
+                            StreamEvent::Watermark(wm) => {
+                                if let Some(aligned_wm) = self.wm_tracker.update(idx, wm) {
+                                    if let Watermark::EventTime(t) = aligned_wm {
+                                        self.ctx.advance_watermark(t);
+                                    }
+                                    self.chain_head
+                                        .process_event(
+                                            idx,
+                                            TrackedEvent::control(StreamEvent::Watermark(aligned_wm)),
+                                            &mut self.ctx,
+                                        )
+                                        .await?;
+                                }
+                            }
+
+                            StreamEvent::EndOfStream => {
+                                if self.wm_tracker.increment_eof() == self.wm_tracker.input_count() {
+                                    self.chain_head
+                                        .process_event(
+                                            idx,
+                                            TrackedEvent::control(StreamEvent::EndOfStream),
+                                            &mut self.ctx,
+                                        )
+                                        .await?;
+                                    break;
+                                }
+                            }
+                        }
+                    }
+
+                    else => break,
+                }
+            }
+
+            self.teardown().await
+        }
+        .instrument(span)
+        .await
+    }
+
+    async fn teardown(mut self) -> Result<(), RunError> {
+        info!("Pipeline tearing down...");
+        self.chain_head.on_close(&mut self.ctx).await?;
+        Ok(())
+    }
+}
+
+pub type SubtaskRunner = Pipeline;
diff --git a/src/runtime/streaming/execution/source.rs b/src/runtime/streaming/execution/source.rs
new file mode 100644
index 00000000..a85b0839
--- /dev/null
+++
b/src/runtime/streaming/execution/source.rs
@@ -0,0 +1,180 @@
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+use crate::runtime::streaming::api::context::TaskContext;
+use crate::runtime::streaming::api::source::{SourceEvent, SourceOperator};
+use crate::runtime::streaming::error::RunError;
+use crate::runtime::streaming::execution::runner::OperatorDrive;
+use crate::runtime::streaming::protocol::control::ControlCommand;
+use crate::runtime::streaming::protocol::event::StreamEvent;
+use crate::runtime::streaming::protocol::tracked::TrackedEvent;
+use crate::sql::common::CheckpointBarrier;
+use std::time::Duration;
+use tokio::sync::mpsc::Receiver;
+use tokio::time::{interval, MissedTickBehavior};
+use tracing::{info, info_span, warn, Instrument};
+
+// Back-off period after a source reports Idle.
+pub const SOURCE_IDLE_SLEEP: Duration = Duration::from_millis(50);
+// How often poll_watermark is consulted.
+pub const WATERMARK_EMIT_INTERVAL: Duration = Duration::from_millis(200);
+
+/// Event loop for one source subtask: fetches from the source operator and
+/// feeds an optional chained operator head (or the outboxes directly).
+pub struct SourceRunner {
+    operator: Box<dyn SourceOperator>,
+    chain_head: Option<Box<dyn OperatorDrive>>,
+    ctx: TaskContext,
+    control_rx: Receiver<ControlCommand>,
+}
+
+impl SourceRunner {
+    pub fn new(
+        operator: Box<dyn SourceOperator>,
+        chain_head: Option<Box<dyn OperatorDrive>>,
+        ctx: TaskContext,
+        control_rx: Receiver<ControlCommand>,
+    ) -> Self {
+        Self {
+            operator,
+            chain_head,
+            ctx,
+            control_rx,
+        }
+    }
+
+    pub async fn run(mut self) -> Result<(), RunError> {
+        let span = info_span!(
+            "source_run",
+            vertex = self.ctx.vertex_id,
+            op = self.operator.name()
+        );
+
+        async move {
+            info!("Source subtask starting");
+            self.operator.on_start(&mut self.ctx).await?;
+            if let Some(chain) = &mut self.chain_head {
+                chain.on_start(&mut self.ctx).await?;
+            }
+
+            let mut idle_timer = interval(SOURCE_IDLE_SLEEP);
+            idle_timer.set_missed_tick_behavior(MissedTickBehavior::Skip);
+
+            let mut wm_timer = interval(WATERMARK_EMIT_INTERVAL);
+            wm_timer.set_missed_tick_behavior(MissedTickBehavior::Skip);
+
+            let mut is_idle = false;
+            let mut is_running = true;
+
+            while is_running {
+                tokio::select! {
+                    // Control commands take priority over fetching.
+                    biased;
+
+                    cmd_opt = self.control_rx.recv() => {
+                        match cmd_opt {
+                            // Channel closed: coordinator is gone, shut down.
+                            None => is_running = false,
+                            Some(cmd) => {
+                                if self.handle_control(cmd).await? {
+                                    is_running = false;
+                                }
+                            }
+                        }
+                    }
+
+                    _ = wm_timer.tick() => {
+                        if let Some(wm) = self.operator.poll_watermark() {
+                            self.dispatch_event(StreamEvent::Watermark(wm)).await?;
+                        }
+                    }
+
+                    // Idle back-off: resume fetching after one idle period.
+                    _ = idle_timer.tick(), if is_idle => {
+                        is_idle = false;
+                    }
+
+                    fetch_res = self.operator.fetch_next(&mut self.ctx), if !is_idle => {
+                        match fetch_res {
+                            Ok(SourceEvent::Data(batch)) => {
+                                self.dispatch_event(StreamEvent::Data(batch)).await?;
+                            }
+                            Ok(SourceEvent::Watermark(wm)) => {
+                                self.dispatch_event(StreamEvent::Watermark(wm)).await?;
+                            }
+                            Ok(SourceEvent::Idle) => {
+                                is_idle = true;
+                                idle_timer.reset();
+                            }
+                            Ok(SourceEvent::EndOfStream) => {
+                                self.dispatch_event(StreamEvent::EndOfStream).await?;
+                                is_running = false;
+                            }
+                            Err(e) => {
+                                warn!("fetch_next error: {}", e);
+                                return Err(RunError::Operator(e));
+                            }
+                        }
+                    }
+                }
+            }
+
+            self.teardown().await
+        }
+        .instrument(span)
+        .await
+    }
+
+    /// Pushes an event into the chain if present, otherwise straight to the outboxes.
+    async fn dispatch_event(&mut self, event: StreamEvent) -> Result<(), RunError> {
+        if let Some(chain) = &mut self.chain_head {
+            let _stop = chain
+                .process_event(0, TrackedEvent::control(event), &mut self.ctx)
+                .await?;
+        } else {
+            match event {
+                StreamEvent::Data(b) => self.ctx.collect(b).await?,
+                StreamEvent::Watermark(w) => {
+                    self.ctx.broadcast(StreamEvent::Watermark(w)).await?;
+                }
+                StreamEvent::Barrier(b) => {
+                    self.ctx.broadcast(StreamEvent::Barrier(b)).await?;
+                }
+                StreamEvent::EndOfStream => {
+                    self.ctx.broadcast(StreamEvent::EndOfStream).await?;
+                }
+            }
+        }
+        Ok(())
+    }
+
+    /// Returns `Ok(true)` when the runner should stop.
+    async fn handle_control(&mut self, cmd: ControlCommand) -> Result<bool, RunError> {
+        match cmd {
+            ControlCommand::TriggerCheckpoint { barrier } => {
+                let b: CheckpointBarrier = barrier.into();
+                self.operator.snapshot_state(b.clone(), &mut self.ctx).await?;
+                self.dispatch_event(StreamEvent::Barrier(b)).await?;
+            }
+            ControlCommand::Stop { .. } => return Ok(true),
+            other => {
+                if let Some(chain) = &mut self.chain_head {
+                    if chain.handle_control(other, &mut self.ctx).await? {
+                        return Ok(true);
+                    }
+                }
+            }
+        }
+        Ok(false)
+    }
+
+    async fn teardown(mut self) -> Result<(), RunError> {
+        self.operator.on_close(&mut self.ctx).await?;
+        if let Some(chain) = &mut self.chain_head {
+            chain.on_close(&mut self.ctx).await?;
+        }
+        info!("Source subtask shutdown");
+        Ok(())
+    }
+}
diff --git a/src/runtime/streaming/execution/tracker/barrier_aligner.rs b/src/runtime/streaming/execution/tracker/barrier_aligner.rs
new file mode 100644
index 00000000..b227e439
--- /dev/null
+++ b/src/runtime/streaming/execution/tracker/barrier_aligner.rs
@@ -0,0 +1,56 @@
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ + +use std::collections::HashSet; + +use crate::sql::common::CheckpointBarrier; + +#[derive(Debug)] +pub enum AlignmentStatus { + Pending, + Complete, +} + +#[derive(Debug)] +pub struct BarrierAligner { + input_count: usize, + current_epoch: Option, + reached_inputs: HashSet, +} + +impl BarrierAligner { + pub fn new(input_count: usize) -> Self { + Self { + input_count, + current_epoch: None, + reached_inputs: HashSet::new(), + } + } + + pub fn mark(&mut self, input_idx: usize, barrier: &CheckpointBarrier) -> AlignmentStatus { + if self.current_epoch != Some(barrier.epoch) { + self.current_epoch = Some(barrier.epoch); + self.reached_inputs.clear(); + } + + self.reached_inputs.insert(input_idx); + + if self.reached_inputs.len() == self.input_count { + self.current_epoch = None; + self.reached_inputs.clear(); + AlignmentStatus::Complete + } else { + AlignmentStatus::Pending + } + } +} diff --git a/src/runtime/streaming/execution/tracker/mod.rs b/src/runtime/streaming/execution/tracker/mod.rs new file mode 100644 index 00000000..3206f352 --- /dev/null +++ b/src/runtime/streaming/execution/tracker/mod.rs @@ -0,0 +1,16 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ + +pub mod barrier_aligner; +pub mod watermark_tracker; + diff --git a/src/runtime/streaming/execution/tracker/watermark_tracker.rs b/src/runtime/streaming/execution/tracker/watermark_tracker.rs new file mode 100644 index 00000000..6304b4c3 --- /dev/null +++ b/src/runtime/streaming/execution/tracker/watermark_tracker.rs @@ -0,0 +1,109 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use crate::runtime::streaming::protocol::watermark::{merge_watermarks, watermark_strictly_advances}; +use crate::sql::common::Watermark; + +#[derive(Debug)] +pub struct WatermarkTracker { + watermarks: Vec>, + current_min_watermark: Option, + eof_count: usize, +} + +impl WatermarkTracker { + pub fn new(input_count: usize) -> Self { + Self { + watermarks: vec![None; input_count], + current_min_watermark: None, + eof_count: 0, + } + } + + pub fn update(&mut self, input_idx: usize, wm: Watermark) -> Option { + self.watermarks[input_idx] = Some(wm); + + if self.watermarks.iter().any(|w| w.is_none()) { + return None; + } + + let new_min = merge_watermarks(&self.watermarks)?; + + if !watermark_strictly_advances(new_min, self.current_min_watermark) { + return None; + } + + self.current_min_watermark = Some(new_min); + Some(new_min) + } + + pub fn increment_eof(&mut self) -> usize { + self.eof_count += 1; + self.eof_count + } + + pub fn input_count(&self) -> usize { + self.watermarks.len() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::time::{Duration, SystemTime}; + 
+ #[test] + fn no_emit_until_all_inputs_seen() { + let mut t = WatermarkTracker::new(2); + let w = Watermark::EventTime(SystemTime::UNIX_EPOCH + Duration::from_secs(3)); + assert!(t.update(0, w).is_none()); + let w2 = Watermark::EventTime(SystemTime::UNIX_EPOCH + Duration::from_secs(1)); + assert_eq!(t.update(1, w2), Some(w2)); + } + + #[test] + fn dedup_same_aligned() { + let mut t = WatermarkTracker::new(1); + let w = Watermark::EventTime(SystemTime::UNIX_EPOCH + Duration::from_secs(1)); + assert_eq!(t.update(0, w), Some(w)); + assert!(t.update(0, w).is_none()); + } + + #[test] + fn advances_only_when_min_strictly_increases() { + let mut t = WatermarkTracker::new(2); + let t1 = SystemTime::UNIX_EPOCH + Duration::from_secs(1); + let t5 = SystemTime::UNIX_EPOCH + Duration::from_secs(5); + assert!(t.update(0, Watermark::EventTime(t5)).is_none()); + assert_eq!(t.update(1, Watermark::EventTime(t1)), Some(Watermark::EventTime(t1))); + let t3 = SystemTime::UNIX_EPOCH + Duration::from_secs(3); + assert_eq!( + t.update(1, Watermark::EventTime(t3)), + Some(Watermark::EventTime(t3)) + ); + assert!(t.update(1, Watermark::EventTime(t3)).is_none()); + } + + #[test] + fn backward_aligned_min_is_ignored() { + let mut t = WatermarkTracker::new(2); + let t5 = SystemTime::UNIX_EPOCH + Duration::from_secs(5); + let t10 = SystemTime::UNIX_EPOCH + Duration::from_secs(10); + assert!(t.update(0, Watermark::EventTime(t10)).is_none()); + assert_eq!( + t.update(1, Watermark::EventTime(t5)), + Some(Watermark::EventTime(t5)) + ); + let t2 = SystemTime::UNIX_EPOCH + Duration::from_secs(2); + assert!(t.update(0, Watermark::EventTime(t2)).is_none()); + } +} diff --git a/src/runtime/streaming/factory/connector/dispatchers.rs b/src/runtime/streaming/factory/connector/dispatchers.rs new file mode 100644 index 00000000..40e7242c --- /dev/null +++ b/src/runtime/streaming/factory/connector/dispatchers.rs @@ -0,0 +1,37 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may 
not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::Arc; + +use anyhow::Result; + +use crate::runtime::streaming::api::operator::ConstructedOperator; +use crate::runtime::streaming::factory::global::Registry; +use crate::runtime::streaming::factory::operator_constructor::OperatorConstructor; + +use super::kafka::ConnectorDispatcher; + +pub struct ConnectorSourceDispatcher; + +impl OperatorConstructor for ConnectorSourceDispatcher { + fn with_config(&self, config: &[u8], registry: Arc) -> Result { + ConnectorDispatcher.with_config(config, registry) + } +} + +pub struct ConnectorSinkDispatcher; + +impl OperatorConstructor for ConnectorSinkDispatcher { + fn with_config(&self, config: &[u8], registry: Arc) -> Result { + ConnectorDispatcher.with_config(config, registry) + } +} diff --git a/src/runtime/streaming/factory/connector/kafka.rs b/src/runtime/streaming/factory/connector/kafka.rs new file mode 100644 index 00000000..a55ef477 --- /dev/null +++ b/src/runtime/streaming/factory/connector/kafka.rs @@ -0,0 +1,262 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +use anyhow::{bail, Context, Result}; +use prost::Message; +use std::collections::HashMap; +use std::num::NonZeroU32; +use std::sync::Arc; + +use protocol::grpc::api::connector_op::Config; +use protocol::grpc::api::{ + BadDataPolicy, ConnectorOp, DecimalEncodingProto, FormatConfig, + KafkaAuthConfig, KafkaOffsetMode, KafkaReadMode, KafkaSinkCommitMode, KafkaSinkConfig, + KafkaSourceConfig, TimestampFormatProto, +}; +use tracing::info; + +use crate::runtime::streaming::api::operator::ConstructedOperator; +use crate::runtime::streaming::api::source::SourceOffset; +use crate::runtime::streaming::factory::global::Registry; +use crate::runtime::streaming::factory::operator_constructor::OperatorConstructor; +use crate::runtime::streaming::format::{ + BadDataPolicy as RtBadDataPolicy, DataSerializer, DecimalEncoding as RtDecimalEncoding, + Format as RuntimeFormat, JsonFormat as RuntimeJsonFormat, + TimestampFormat as RtTimestampFormat, +}; +use crate::runtime::streaming::operators::sink::kafka::{ConsistencyMode, KafkaSinkOperator}; +use crate::runtime::streaming::operators::source::kafka::{ + BufferedDeserializer, KafkaSourceOperator, +}; +use crate::sql::common::FsSchema; + +const DEFAULT_SOURCE_BATCH_SIZE: usize = 1024; + +// ─────────────── Proto → Runtime type conversions ─────────────── + +fn proto_format_to_runtime(fmt: &Option) -> Result { + let cfg = fmt.as_ref().context("FormatConfig is required")?; + match &cfg.format { + Some(protocol::grpc::api::format_config::Format::Json(j)) => { + Ok(RuntimeFormat::Json(RuntimeJsonFormat { + timestamp_format: match j.timestamp_format() { + TimestampFormatProto::TimestampRfc3339 => RtTimestampFormat::RFC3339, + TimestampFormatProto::TimestampUnixMillis => RtTimestampFormat::UnixMillis, + }, + decimal_encoding: match j.decimal_encoding() { + DecimalEncodingProto::DecimalNumber => RtDecimalEncoding::Number, + 
DecimalEncodingProto::DecimalString => RtDecimalEncoding::String, + DecimalEncodingProto::DecimalBytes => RtDecimalEncoding::Bytes, + }, + include_schema: j.include_schema, + })) + } + Some(protocol::grpc::api::format_config::Format::RawString(_)) => { + Ok(RuntimeFormat::RawString) + } + Some(protocol::grpc::api::format_config::Format::RawBytes(_)) => { + Ok(RuntimeFormat::RawBytes) + } + None => bail!("FormatConfig has no format variant set"), + } +} + +fn proto_bad_data_to_runtime(policy: i32) -> RtBadDataPolicy { + match BadDataPolicy::try_from(policy) { + Ok(BadDataPolicy::BadDataDrop) => RtBadDataPolicy::Drop, + _ => RtBadDataPolicy::Fail, + } +} + +fn proto_offset_to_runtime(mode: i32) -> SourceOffset { + match KafkaOffsetMode::try_from(mode) { + Ok(KafkaOffsetMode::KafkaOffsetLatest) => SourceOffset::Latest, + Ok(KafkaOffsetMode::KafkaOffsetEarliest) => SourceOffset::Earliest, + _ => SourceOffset::Group, + } +} + +fn build_auth_client_configs(auth: &Option) -> HashMap { + let mut out = HashMap::new(); + let Some(auth) = auth else { return out }; + match &auth.auth { + Some(protocol::grpc::api::kafka_auth_config::Auth::Sasl(sasl)) => { + out.insert("security.protocol".to_string(), sasl.protocol.clone()); + out.insert("sasl.mechanism".to_string(), sasl.mechanism.clone()); + out.insert("sasl.username".to_string(), sasl.username.clone()); + out.insert("sasl.password".to_string(), sasl.password.clone()); + } + Some(protocol::grpc::api::kafka_auth_config::Auth::AwsMskIam(iam)) => { + out.insert("security.protocol".to_string(), "SASL_SSL".to_string()); + out.insert("sasl.mechanism".to_string(), "OAUTHBEARER".to_string()); + out.insert( + "sasl.oauthbearer.extensions".to_string(), + format!("logicalCluster=aws_msk;aws_region={}", iam.region), + ); + } + _ => {} + } + out +} + +fn merge_client_configs( + auth: &Option, + extra: &HashMap, +) -> HashMap { + let mut configs = build_auth_client_configs(auth); + for (k, v) in extra { + configs.insert(k.clone(), 
v.clone()); + } + configs +} + +// ─────────────── Unified Connector Dispatcher ─────────────── + +pub struct ConnectorDispatcher; + +impl OperatorConstructor for ConnectorDispatcher { + fn with_config(&self, payload: &[u8], _registry: Arc) -> Result { + let op = ConnectorOp::decode(payload) + .context("Failed to decode ConnectorOp protobuf")?; + + let fs_schema = op + .fs_schema + .as_ref() + .map(|fs| FsSchema::try_from(fs.clone())) + .transpose() + .map_err(|e| anyhow::anyhow!("{e}"))?; + + match op.config { + Some(Config::KafkaSource(ref cfg)) => { + Self::build_kafka_source(&op.name, cfg, fs_schema) + } + Some(Config::KafkaSink(ref cfg)) => { + Self::build_kafka_sink(&op.name, cfg, fs_schema) + } + Some(Config::Generic(_)) => bail!( + "ConnectorOp '{}': GenericConnectorConfig dispatch not yet implemented", + op.name + ), + None => bail!("ConnectorOp '{}' has no configuration payload", op.name), + } + } +} + +impl ConnectorDispatcher { + fn build_kafka_source( + _name: &str, + cfg: &KafkaSourceConfig, + fs_schema: Option, + ) -> Result { + info!(topic = %cfg.topic, "Constructing Kafka Source"); + + let fs = fs_schema.context("fs_schema is required for Kafka Source")?; + let client_configs = merge_client_configs(&cfg.auth, &cfg.client_configs); + + let mut final_configs = client_configs; + if cfg.read_mode() == KafkaReadMode::KafkaReadCommitted { + final_configs.insert("isolation.level".to_string(), "read_committed".to_string()); + } + + let runtime_format = proto_format_to_runtime(&cfg.format)?; + let bad_data = proto_bad_data_to_runtime(cfg.bad_data_policy); + + let deserializer = Box::new(BufferedDeserializer::new( + runtime_format, + fs.schema.clone(), + bad_data, + DEFAULT_SOURCE_BATCH_SIZE, + )); + + let rate = NonZeroU32::new(cfg.rate_limit_msgs_per_sec.max(1)) + .unwrap_or_else(|| NonZeroU32::new(1_000_000).expect("nonzero")); + + let source_op = KafkaSourceOperator::new( + cfg.topic.clone(), + cfg.bootstrap_servers.clone(), + cfg.group_id.clone(), + 
cfg.group_id_prefix.clone(), + proto_offset_to_runtime(cfg.offset_mode), + final_configs, + rate, + vec![], + deserializer, + ); + + Ok(ConstructedOperator::Source(Box::new(source_op))) + } + + fn build_kafka_sink( + _name: &str, + cfg: &KafkaSinkConfig, + fs_schema: Option, + ) -> Result { + info!(topic = %cfg.topic, "Constructing Kafka Sink"); + + let fs_in = fs_schema.context("fs_schema is required for Kafka Sink")?; + let client_configs = merge_client_configs(&cfg.auth, &cfg.client_configs); + + let consistency = match cfg.commit_mode() { + KafkaSinkCommitMode::KafkaSinkExactlyOnce => ConsistencyMode::ExactlyOnce, + KafkaSinkCommitMode::KafkaSinkAtLeastOnce => ConsistencyMode::AtLeastOnce, + }; + + let runtime_format = proto_format_to_runtime(&cfg.format)?; + let fs = sink_fs_schema_adjusted(fs_in, &cfg.key_field, &cfg.timestamp_field)?; + let serializer = DataSerializer::new(runtime_format, fs.schema.clone()); + + let sink_op = KafkaSinkOperator::new( + cfg.topic.clone(), + cfg.bootstrap_servers.clone(), + consistency, + client_configs, + fs, + serializer, + ); + + Ok(ConstructedOperator::Operator(Box::new(sink_op))) + } +} + +fn sink_fs_schema_adjusted( + fs: FsSchema, + key_field: &Option, + timestamp_field: &Option, +) -> Result { + if key_field.is_none() && timestamp_field.is_none() { + return Ok(fs); + } + let schema = fs.schema.clone(); + let ts = if let Some(name) = timestamp_field { + schema + .column_with_name(name) + .ok_or_else(|| anyhow::anyhow!("timestamp column '{name}' not found in schema"))? + .0 + } else { + fs.timestamp_index + }; + let keys = fs.clone_storage_key_indices(); + let routing = if let Some(name) = key_field { + let k = schema + .column_with_name(name) + .ok_or_else(|| anyhow::anyhow!("key column '{name}' not found in schema"))? 
+ .0; + Some(vec![k]) + } else { + fs.clone_routing_key_indices() + }; + Ok(FsSchema::new(schema, ts, keys, routing)) +} + +// Legacy dispatcher aliases kept for backward compatibility with factory registration. +pub type KafkaSourceDispatcher = ConnectorDispatcher; +pub type KafkaSinkDispatcher = ConnectorDispatcher; diff --git a/src/runtime/streaming/factory/connector/mod.rs b/src/runtime/streaming/factory/connector/mod.rs new file mode 100644 index 00000000..be63478d --- /dev/null +++ b/src/runtime/streaming/factory/connector/mod.rs @@ -0,0 +1,18 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + +mod dispatchers; +pub mod kafka; + +pub use dispatchers::{ConnectorSinkDispatcher, ConnectorSourceDispatcher}; +pub use kafka::{KafkaSinkDispatcher, KafkaSourceDispatcher}; diff --git a/src/runtime/streaming/factory/global/mod.rs b/src/runtime/streaming/factory/global/mod.rs new file mode 100644 index 00000000..0dc2130e --- /dev/null +++ b/src/runtime/streaming/factory/global/mod.rs @@ -0,0 +1,16 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + + +mod session_registry; + +pub use session_registry::Registry; diff --git a/src/runtime/streaming/factory/global/session_registry.rs b/src/runtime/streaming/factory/global/session_registry.rs new file mode 100644 index 00000000..4b7895a2 --- /dev/null +++ b/src/runtime/streaming/factory/global/session_registry.rs @@ -0,0 +1,60 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ + +use std::collections::HashSet; +use std::sync::Arc; + +use datafusion::common::Result as DfResult; +use datafusion::execution::context::SessionContext; +use datafusion::execution::FunctionRegistry; +use datafusion::logical_expr::planner::ExprPlanner; +use datafusion::logical_expr::{AggregateUDF, ScalarUDF, WindowUDF}; + +/// +pub struct Registry { + ctx: SessionContext, +} + +impl Registry { + pub fn new() -> Self { + Self { + ctx: SessionContext::new(), + } + } + + pub fn session_context(&self) -> &SessionContext { + &self.ctx + } +} + +impl FunctionRegistry for Registry { + fn udfs(&self) -> HashSet { + self.ctx.udfs() + } + + fn udf(&self, name: &str) -> DfResult> { + self.ctx.udf(name) + } + + fn udaf(&self, name: &str) -> DfResult> { + self.ctx.udaf(name) + } + + fn udwf(&self, name: &str) -> DfResult> { + self.ctx.udwf(name) + } + + fn expr_planners(&self) -> Vec> { + self.ctx.expr_planners() + } +} diff --git a/src/runtime/streaming/factory/mod.rs b/src/runtime/streaming/factory/mod.rs new file mode 100644 index 00000000..f02ec955 --- /dev/null +++ b/src/runtime/streaming/factory/mod.rs @@ -0,0 +1,56 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ + +pub mod connector; +pub mod global; + +mod operator_constructor; +mod operator_factory; + +use tracing::info; + +use crate::sql::common::constants::factory_operator_name; + +#[allow(unused_imports)] +pub use connector::{ + ConnectorSinkDispatcher, ConnectorSourceDispatcher, KafkaSinkDispatcher, KafkaSourceDispatcher, +}; +pub use global::Registry; +pub use operator_factory::OperatorFactory; + +fn register_builtin_connectors(factory: &mut OperatorFactory) { + factory.register( + factory_operator_name::CONNECTOR_SOURCE, + Box::new(connector::ConnectorSourceDispatcher), + ); + factory.register( + factory_operator_name::CONNECTOR_SINK, + Box::new(connector::ConnectorSinkDispatcher), + ); +} + +fn register_kafka_connector_plugins(factory: &mut OperatorFactory) { + factory.register( + factory_operator_name::KAFKA_SOURCE, + Box::new(connector::kafka::ConnectorDispatcher), + ); + factory.register( + factory_operator_name::KAFKA_SINK, + Box::new(connector::kafka::ConnectorDispatcher), + ); + info!( + "Registered Kafka connector plugins ({}, {})", + factory_operator_name::KAFKA_SOURCE, + factory_operator_name::KAFKA_SINK + ); +} \ No newline at end of file diff --git a/src/runtime/streaming/factory/operator_constructor.rs b/src/runtime/streaming/factory/operator_constructor.rs new file mode 100644 index 00000000..832fe734 --- /dev/null +++ b/src/runtime/streaming/factory/operator_constructor.rs @@ -0,0 +1,23 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ + +use anyhow::Result; +use std::sync::Arc; + +use crate::runtime::streaming::api::operator::ConstructedOperator; +use crate::runtime::streaming::factory::global::Registry; + +/// +pub trait OperatorConstructor: Send + Sync { + fn with_config(&self, config: &[u8], registry: Arc) -> Result; +} diff --git a/src/runtime/streaming/factory/operator_factory.rs b/src/runtime/streaming/factory/operator_factory.rs new file mode 100644 index 00000000..5a2dc26f --- /dev/null +++ b/src/runtime/streaming/factory/operator_factory.rs @@ -0,0 +1,261 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ + +use anyhow::{anyhow, Result}; +use prost::Message; +use std::collections::HashMap; +use std::sync::Arc; +use protocol::grpc::api::ProjectionOperator as ProjectionOperatorProto; +use super::operator_constructor::OperatorConstructor; +use crate::runtime::streaming::api::operator::ConstructedOperator; +use crate::runtime::streaming::factory::connector::{ + ConnectorSinkDispatcher, ConnectorSourceDispatcher, +}; +use crate::runtime::streaming::factory::global::Registry; +use crate::runtime::streaming::operators::grouping::IncrementalAggregatingConstructor; +use crate::runtime::streaming::operators::joins::{ + InstantJoinConstructor, JoinWithExpirationConstructor, +}; +use crate::runtime::streaming::operators::key_by::KeyByConstructor; +use crate::runtime::streaming::operators::watermark::WatermarkGeneratorConstructor; +use crate::runtime::streaming::operators::windows::{ + SessionAggregatingWindowConstructor, SlidingAggregatingWindowConstructor, + TumblingAggregateWindowConstructor, WindowFunctionConstructor, +}; +use crate::runtime::streaming::operators::{ProjectionOperator, StatelessPhysicalExecutor, ValueExecutionOperator}; +use protocol::grpc::api::{ + ExpressionWatermarkConfig, JoinOperator as JoinOperatorProto, + KeyPlanOperator as KeyByProto, + SessionWindowAggregateOperator, SlidingWindowAggregateOperator, TumblingWindowAggregateOperator, + UpdatingAggregateOperator, ValuePlanOperator, WindowFunctionOperator as WindowFunctionProto, +}; + +use crate::sql::logical_node::logical::OperatorName; + +/// +pub struct OperatorFactory { + constructors: HashMap>, + registry: Arc, +} + +impl OperatorFactory { + pub fn new(registry: Arc) -> Self { + let mut factory = Self { + constructors: HashMap::new(), + registry, + }; + factory.register_builtins(); + factory + } + + pub fn register(&mut self, name: &str, constructor: Box) { + self.constructors.insert(name.to_string(), constructor); + } + + pub fn register_named(&mut self, name: OperatorName, constructor: Box) { + 
self.register(name.as_registry_key(), constructor); + } + + pub fn create_operator(&self, name: &str, payload: &[u8]) -> Result { + let ctor = self + .constructors + .get(name) + .ok_or_else(|| { + anyhow!( + "FATAL: Operator '{}' not found in Factory Registry. \ + Ensure the worker is compiled with the correct plugins.", + name + ) + })?; + + ctor.with_config(payload, self.registry.clone()) + } + + pub fn registered_operators(&self) -> Vec<&str> { + self.constructors.keys().map(|s| s.as_str()).collect() + } + + fn register_builtins(&mut self) { + self.register_named(OperatorName::TumblingWindowAggregate, Box::new(TumblingWindowBridge)); + self.register_named(OperatorName::SlidingWindowAggregate, Box::new(SlidingWindowBridge)); + self.register_named(OperatorName::SessionWindowAggregate, Box::new(SessionWindowBridge)); + + self.register_named(OperatorName::ExpressionWatermark, Box::new(WatermarkBridge)); + + // ─── SQL Window Function ─── + self.register_named(OperatorName::WindowFunction, Box::new(WindowFunctionBridge)); + + // ─── Join ─── + self.register_named(OperatorName::Join, Box::new(JoinWithExpirationBridge)); + self.register_named(OperatorName::InstantJoin, Box::new(InstantJoinBridge)); + self.register_named(OperatorName::LookupJoin, Box::new(LookupJoinBridge)); + + self.register_named(OperatorName::UpdatingAggregate, Box::new(IncrementalAggregateBridge)); + + self.register_named(OperatorName::KeyBy, Box::new(KeyByBridge)); + + self.register_named(OperatorName::Projection, Box::new(ProjectionConstructor)); + self.register_named(OperatorName::Value, Box::new(ValueBridge)); + self.register_named(OperatorName::ConnectorSource, Box::new(ConnectorSourceBridge)); + self.register_named(OperatorName::ConnectorSink, Box::new(ConnectorSinkBridge)); + + crate::runtime::streaming::factory::register_kafka_connector_plugins(self); + } +} + +struct TumblingWindowBridge; +impl OperatorConstructor for TumblingWindowBridge { + fn with_config(&self, config: &[u8], registry: 
Arc) -> Result { + let proto = TumblingWindowAggregateOperator::decode(config) + .map_err(|e| anyhow!("Decode TumblingWindowAggregateOperator failed: {e}"))?; + let op = TumblingAggregateWindowConstructor.with_config(proto, registry)?; + Ok(ConstructedOperator::Operator(Box::new(op))) + } +} + +struct SlidingWindowBridge; +impl OperatorConstructor for SlidingWindowBridge { + fn with_config(&self, config: &[u8], registry: Arc) -> Result { + let proto = SlidingWindowAggregateOperator::decode(config) + .map_err(|e| anyhow!("Decode SlidingWindowAggregateOperator failed: {e}"))?; + let op = SlidingAggregatingWindowConstructor.with_config(proto, registry)?; + Ok(ConstructedOperator::Operator(Box::new(op))) + } +} + +struct SessionWindowBridge; +impl OperatorConstructor for SessionWindowBridge { + fn with_config(&self, config: &[u8], registry: Arc) -> Result { + let proto = SessionWindowAggregateOperator::decode(config) + .map_err(|e| anyhow!("Decode SessionWindowAggregateOperator failed: {e}"))?; + let op = SessionAggregatingWindowConstructor.with_config(proto, registry)?; + Ok(ConstructedOperator::Operator(Box::new(op))) + } +} + +struct WatermarkBridge; +impl OperatorConstructor for WatermarkBridge { + fn with_config(&self, config: &[u8], registry: Arc) -> Result { + let proto = ExpressionWatermarkConfig::decode(config) + .map_err(|e| anyhow!("Decode ExpressionWatermarkConfig failed: {e}"))?; + let op = WatermarkGeneratorConstructor.with_config(proto, registry)?; + Ok(ConstructedOperator::Operator(Box::new(op))) + } +} + +struct WindowFunctionBridge; +impl OperatorConstructor for WindowFunctionBridge { + fn with_config(&self, config: &[u8], registry: Arc) -> Result { + let proto = WindowFunctionProto::decode(config) + .map_err(|e| anyhow!("Decode WindowFunctionOperator failed: {e}"))?; + let op = WindowFunctionConstructor.with_config(proto, registry)?; + Ok(ConstructedOperator::Operator(Box::new(op))) + } +} + +struct JoinWithExpirationBridge; +impl OperatorConstructor 
for JoinWithExpirationBridge { + fn with_config(&self, config: &[u8], registry: Arc) -> Result { + let proto = JoinOperatorProto::decode(config) + .map_err(|e| anyhow!("Decode JoinOperator (expiration) failed: {e}"))?; + let op = JoinWithExpirationConstructor.with_config(proto, registry)?; + Ok(ConstructedOperator::Operator(Box::new(op))) + } +} + +struct InstantJoinBridge; +impl OperatorConstructor for InstantJoinBridge { + fn with_config(&self, config: &[u8], registry: Arc) -> Result { + let proto = JoinOperatorProto::decode(config) + .map_err(|e| anyhow!("Decode JoinOperator (instant) failed: {e}"))?; + let op = InstantJoinConstructor.with_config(proto, registry)?; + Ok(ConstructedOperator::Operator(Box::new(op))) + } +} + +struct LookupJoinBridge; +impl OperatorConstructor for LookupJoinBridge { + fn with_config(&self, _config: &[u8], _registry: Arc) -> Result { + Err(anyhow!("LookupJoin is not supported in the current runtime")) + } +} + +struct IncrementalAggregateBridge; +impl OperatorConstructor for IncrementalAggregateBridge { + fn with_config(&self, config: &[u8], registry: Arc) -> Result { + let proto = UpdatingAggregateOperator::decode(config) + .map_err(|e| anyhow!("Decode UpdatingAggregateOperator failed: {e}"))?; + let op = IncrementalAggregatingConstructor.with_config(proto, registry)?; + Ok(ConstructedOperator::Operator(Box::new(op))) + } +} + +struct KeyByBridge; +impl OperatorConstructor for KeyByBridge { + fn with_config(&self, config: &[u8], _registry: Arc) -> Result { + let proto = KeyByProto::decode(config) + .map_err(|e| anyhow!("Decode KeyPlanOperator failed: {e}"))?; + let op = KeyByConstructor.with_config(proto)?; + Ok(ConstructedOperator::Operator(Box::new(op))) + } +} + +pub struct ProjectionConstructor; + +impl OperatorConstructor for ProjectionConstructor { + fn with_config(&self, payload: &[u8], registry: Arc) -> Result { + let proto = ProjectionOperatorProto::decode(payload)?; + let op = ProjectionOperator::from_proto(proto, 
registry)?; + Ok(ConstructedOperator::Operator(Box::new(op))) + } +} + +struct ValueBridge; +impl OperatorConstructor for ValueBridge { + fn with_config(&self, config: &[u8], registry: Arc) -> Result { + let proto = ValuePlanOperator::decode(config) + .map_err(|e| anyhow!("Decode ValuePlanOperator failed: {e}"))?; + let op = ValueExecutionConstructor.with_config(proto, registry)?; + Ok(ConstructedOperator::Operator(Box::new(op))) + } +} + +/// Generic connector source constructor: decodes `ConnectorOp` and dispatches by connector type. +struct ConnectorSourceBridge; +impl OperatorConstructor for ConnectorSourceBridge { + fn with_config(&self, config: &[u8], registry: Arc) -> Result { + ConnectorSourceDispatcher.with_config(config, registry) + } +} + +/// Generic connector sink constructor: decodes `ConnectorOp` and dispatches by connector type. +struct ConnectorSinkBridge; +impl OperatorConstructor for ConnectorSinkBridge { + fn with_config(&self, config: &[u8], registry: Arc) -> Result { + ConnectorSinkDispatcher.with_config(config, registry) + } +} + + +struct ValueExecutionConstructor; +impl ValueExecutionConstructor { + fn with_config( + &self, + config: ValuePlanOperator, + registry: Arc, + ) -> Result { + let executor = StatelessPhysicalExecutor::new(&config.physical_plan, registry.as_ref()) + .map_err(|e| anyhow!("build value execution plan '{}': {e}", config.name))?; + Ok(ValueExecutionOperator::new(config.name, executor)) + } +} \ No newline at end of file diff --git a/src/runtime/streaming/format/config.rs b/src/runtime/streaming/format/config.rs new file mode 100644 index 00000000..15a58008 --- /dev/null +++ b/src/runtime/streaming/format/config.rs @@ -0,0 +1,47 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use serde::{Deserialize, Serialize}; + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub enum TimestampFormat { + RFC3339, + UnixMillis, + UnixSeconds, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub enum DecimalEncoding { + String, + Number, + Bytes, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub enum BadDataPolicy { + Fail, + Drop, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct JsonFormat { + pub timestamp_format: TimestampFormat, + pub decimal_encoding: DecimalEncoding, + pub include_schema: bool, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub enum Format { + Json(JsonFormat), + RawString, + RawBytes, +} diff --git a/src/runtime/streaming/format/deserializer.rs b/src/runtime/streaming/format/deserializer.rs new file mode 100644 index 00000000..3e9e6d66 --- /dev/null +++ b/src/runtime/streaming/format/deserializer.rs @@ -0,0 +1,95 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ + +use anyhow::{anyhow, Result}; +use arrow_array::builder::StringBuilder; +use arrow_array::RecordBatch; +use arrow_json::reader::ReaderBuilder; +use arrow_schema::SchemaRef; +use std::sync::Arc; + +use super::config::{BadDataPolicy, Format}; + +pub struct DataDeserializer { + format: Format, + schema: SchemaRef, + bad_data_policy: BadDataPolicy, +} + +impl DataDeserializer { + pub fn new(format: Format, schema: SchemaRef, bad_data_policy: BadDataPolicy) -> Self { + Self { + format, + schema, + bad_data_policy, + } + } + + pub fn deserialize_batch(&self, messages: &[&[u8]]) -> Result { + match &self.format { + Format::Json(_) => self.deserialize_json(messages), + Format::RawString => self.deserialize_raw_string(messages), + Format::RawBytes => self.deserialize_raw_bytes(messages), + } + } + + fn deserialize_json(&self, messages: &[&[u8]]) -> Result { + let mut buffer = Vec::with_capacity(messages.len() * 256); + for msg in messages { + buffer.extend_from_slice(msg); + buffer.push(b'\n'); + } + + let allow_bad_data = self.bad_data_policy == BadDataPolicy::Drop; + let mut decoder = ReaderBuilder::new(self.schema.clone()) + .with_strict_mode(!allow_bad_data) + .build_decoder()?; + + decoder.decode(&buffer)?; + + let batch = if allow_bad_data { + let (batch, _mask, _, _errors) = decoder.flush_with_bad_data()?.unwrap(); + batch + } else { + decoder + .flush()? + .ok_or_else(|| anyhow!("JSON decoder returned no batch"))? 
+ }; + + Ok(batch) + } + + fn deserialize_raw_string(&self, messages: &[&[u8]]) -> Result { + let mut builder = StringBuilder::with_capacity(messages.len(), messages.len() * 64); + for msg in messages { + builder.append_value(String::from_utf8_lossy(msg)); + } + + let array = Arc::new(builder.finish()); + RecordBatch::try_new(self.schema.clone(), vec![array]) + .map_err(|e| anyhow!("build RawString batch: {e}")) + } + + fn deserialize_raw_bytes(&self, messages: &[&[u8]]) -> Result { + use arrow_array::builder::BinaryBuilder; + + let mut builder = BinaryBuilder::with_capacity(messages.len(), messages.len() * 64); + for msg in messages { + builder.append_value(msg); + } + + let array = Arc::new(builder.finish()); + RecordBatch::try_new(self.schema.clone(), vec![array]) + .map_err(|e| anyhow!("build RawBytes batch: {e}")) + } +} diff --git a/src/runtime/streaming/format/json_encoder.rs b/src/runtime/streaming/format/json_encoder.rs new file mode 100644 index 00000000..f834a192 --- /dev/null +++ b/src/runtime/streaming/format/json_encoder.rs @@ -0,0 +1,175 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! 
+ +use arrow_array::{ + Array, Decimal128Array, TimestampMicrosecondArray, + TimestampMillisecondArray, TimestampNanosecondArray, TimestampSecondArray, +}; +use arrow_json::writer::NullableEncoder; +use arrow_json::{Encoder, EncoderFactory, EncoderOptions}; +use arrow_schema::{ArrowError, DataType, FieldRef, TimeUnit}; +use base64::prelude::BASE64_STANDARD; +use base64::Engine; + +use super::config::{DecimalEncoding, TimestampFormat}; + +#[derive(Debug)] +pub struct CustomEncoderFactory { + pub timestamp_format: TimestampFormat, + pub decimal_encoding: DecimalEncoding, +} + +impl EncoderFactory for CustomEncoderFactory { + fn make_default_encoder<'a>( + &self, + _field: &'a FieldRef, + array: &'a dyn Array, + _options: &'a EncoderOptions, + ) -> Result>, ArrowError> { + let encoder: Box = match ( + &self.decimal_encoding, + &self.timestamp_format, + array.data_type(), + ) { + (_, TimestampFormat::UnixMillis, DataType::Timestamp(TimeUnit::Nanosecond, _)) => { + let arr = array + .as_any() + .downcast_ref::() + .unwrap() + .clone(); + Box::new(UnixMillisEncoder::Nanos(arr)) + } + (_, TimestampFormat::UnixMillis, DataType::Timestamp(TimeUnit::Microsecond, _)) => { + let arr = array + .as_any() + .downcast_ref::() + .unwrap() + .clone(); + Box::new(UnixMillisEncoder::Micros(arr)) + } + (_, TimestampFormat::UnixMillis, DataType::Timestamp(TimeUnit::Millisecond, _)) => { + let arr = array + .as_any() + .downcast_ref::() + .unwrap() + .clone(); + Box::new(UnixMillisEncoder::Millis(arr)) + } + (_, TimestampFormat::UnixMillis, DataType::Timestamp(TimeUnit::Second, _)) => { + let arr = array + .as_any() + .downcast_ref::() + .unwrap() + .clone(); + Box::new(UnixMillisEncoder::Seconds(arr)) + } + + // ── Decimal128 → String / Bytes ── + (DecimalEncoding::String, _, DataType::Decimal128(_, _)) => { + let arr = array + .as_any() + .downcast_ref::() + .unwrap() + .clone(); + Box::new(DecimalEncoder::StringEncoder(arr)) + } + (DecimalEncoding::Bytes, _, DataType::Decimal128(_, 
_)) => { + let arr = array + .as_any() + .downcast_ref::() + .unwrap() + .clone(); + Box::new(DecimalEncoder::BytesEncoder(arr)) + } + + // ── Binary → Base64 ── + (_, _, DataType::Binary) => { + let arr = array + .as_any() + .downcast_ref::() + .unwrap() + .clone(); + Box::new(BinaryEncoder(arr)) + } + + _ => return Ok(None), + }; + + Ok(Some(NullableEncoder::new(encoder, array.nulls().cloned()))) + } +} + +// --------------------------------------------------------------------------- +// --------------------------------------------------------------------------- + +enum UnixMillisEncoder { + Nanos(TimestampNanosecondArray), + Micros(TimestampMicrosecondArray), + Millis(TimestampMillisecondArray), + Seconds(TimestampSecondArray), +} + +impl Encoder for UnixMillisEncoder { + fn encode(&mut self, idx: usize, out: &mut Vec) { + let millis = match self { + Self::Nanos(arr) => arr.value(idx) / 1_000_000, + Self::Micros(arr) => arr.value(idx) / 1_000, + Self::Millis(arr) => arr.value(idx), + Self::Seconds(arr) => arr.value(idx) * 1_000, + }; + out.extend_from_slice(millis.to_string().as_bytes()); + } +} + +// --------------------------------------------------------------------------- +// --------------------------------------------------------------------------- + +enum DecimalEncoder { + StringEncoder(Decimal128Array), + BytesEncoder(Decimal128Array), +} + +impl Encoder for DecimalEncoder { + fn encode(&mut self, idx: usize, out: &mut Vec) { + match self { + Self::StringEncoder(arr) => { + out.push(b'"'); + out.extend_from_slice(arr.value_as_string(idx).as_bytes()); + out.push(b'"'); + } + Self::BytesEncoder(arr) => { + out.push(b'"'); + out.extend_from_slice( + BASE64_STANDARD + .encode(arr.value(idx).to_be_bytes()) + .as_bytes(), + ); + out.push(b'"'); + } + } + } +} + +// --------------------------------------------------------------------------- +// --------------------------------------------------------------------------- + +struct 
BinaryEncoder(arrow_array::BinaryArray); + +impl Encoder for BinaryEncoder { + fn encode(&mut self, idx: usize, out: &mut Vec) { + out.push(b'"'); + out.extend_from_slice(BASE64_STANDARD.encode(self.0.value(idx)).as_bytes()); + out.push(b'"'); + } +} diff --git a/src/runtime/streaming/format/mod.rs b/src/runtime/streaming/format/mod.rs new file mode 100644 index 00000000..d5e63a9d --- /dev/null +++ b/src/runtime/streaming/format/mod.rs @@ -0,0 +1,20 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +pub mod config; +pub mod deserializer; +pub mod json_encoder; +pub mod serializer; + +pub use config::{BadDataPolicy, DecimalEncoding, Format, JsonFormat, TimestampFormat}; +pub use deserializer::DataDeserializer; +pub use serializer::DataSerializer; diff --git a/src/runtime/streaming/format/serializer.rs b/src/runtime/streaming/format/serializer.rs new file mode 100644 index 00000000..bb123499 --- /dev/null +++ b/src/runtime/streaming/format/serializer.rs @@ -0,0 +1,140 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + + +use anyhow::{anyhow, Result}; +use arrow_array::{Array, RecordBatch, StructArray}; +use arrow_json::writer::make_encoder; +use arrow_json::EncoderOptions; +use arrow_schema::{DataType, Field, SchemaRef}; +use std::sync::Arc; + +use super::config::{Format, JsonFormat}; +use super::json_encoder::CustomEncoderFactory; + +pub struct DataSerializer { + format: Format, + projection_indices: Vec, +} + +impl DataSerializer { + pub fn new(format: Format, schema: SchemaRef) -> Self { + let projection_indices: Vec = schema + .fields() + .iter() + .enumerate() + .filter(|(_, f)| !f.name().starts_with('_')) + .map(|(i, _)| i) + .collect(); + + Self { + format, + projection_indices, + } + } + + pub fn serialize(&self, batch: &RecordBatch) -> Result>> { + let projected_batch = batch.project(&self.projection_indices)?; + + match &self.format { + Format::Json(config) => self.serialize_json(config, &projected_batch), + Format::RawString => self.serialize_raw_string(&projected_batch), + Format::RawBytes => self.serialize_raw_bytes(&projected_batch), + } + } + + fn serialize_json(&self, config: &JsonFormat, batch: &RecordBatch) -> Result>> { + let array = StructArray::from(batch.clone()); + let field = Arc::new(Field::new_struct( + "", + batch.schema().fields().clone(), + false, + )); + + let options = EncoderOptions::default() + .with_explicit_nulls(true) + .with_encoder_factory(Arc::new(CustomEncoderFactory { + timestamp_format: config.timestamp_format.clone(), + decimal_encoding: config.decimal_encoding.clone(), + })); + + let mut encoder = make_encoder(&field, &array, &options)?; + let mut results = Vec::with_capacity(batch.num_rows()); + + for idx in 0..array.len() { + let mut buffer = Vec::with_capacity(128); + encoder.encode(idx, &mut buffer); + if !buffer.is_empty() { + results.push(buffer); + } + } + Ok(results) + } + + fn serialize_raw_string(&self, batch: 
&RecordBatch) -> Result>> { + let value_idx = batch + .schema() + .index_of("value") + .map_err(|_| anyhow!("RawString format requires a 'value' column"))?; + + if *batch.schema().field(value_idx).data_type() != DataType::Utf8 { + return Err(anyhow!("RawString 'value' column must be Utf8")); + } + + let string_array = batch + .column(value_idx) + .as_any() + .downcast_ref::() + .unwrap(); + + let values: Vec> = (0..string_array.len()) + .map(|i| { + if string_array.is_null(i) { + vec![] + } else { + string_array.value(i).as_bytes().to_vec() + } + }) + .collect(); + + Ok(values) + } + + fn serialize_raw_bytes(&self, batch: &RecordBatch) -> Result>> { + let value_idx = batch + .schema() + .index_of("value") + .map_err(|_| anyhow!("RawBytes format requires a 'value' column"))?; + + if *batch.schema().field(value_idx).data_type() != DataType::Binary { + return Err(anyhow!("RawBytes 'value' column must be Binary")); + } + + let binary_array = batch + .column(value_idx) + .as_any() + .downcast_ref::() + .unwrap(); + + let values: Vec> = (0..binary_array.len()) + .map(|i| { + if binary_array.is_null(i) { + vec![] + } else { + binary_array.value(i).to_vec() + } + }) + .collect(); + + Ok(values) + } +} diff --git a/src/runtime/streaming/job/edge_manager.rs b/src/runtime/streaming/job/edge_manager.rs new file mode 100644 index 00000000..b57b761f --- /dev/null +++ b/src/runtime/streaming/job/edge_manager.rs @@ -0,0 +1,52 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::collections::HashMap; + +use protocol::grpc::api::{FsEdge, FsNode}; +use tokio::sync::mpsc; + +use crate::runtime::streaming::protocol::tracked::TrackedEvent; + +pub struct EdgeManager { + endpoints: HashMap>, Vec>)>, +} + +impl EdgeManager { + pub fn build(nodes: &[FsNode], edges: &[FsEdge]) -> Self { + let mut tx_map: HashMap>> = HashMap::new(); + let mut rx_map: HashMap> = HashMap::new(); + + for edge in edges { + let (tx, rx) = mpsc::channel(2048); + tx_map.entry(edge.source as u32).or_default().push(tx); + rx_map.insert(edge.target as u32, rx); + } + + let mut endpoints = HashMap::new(); + for node in nodes { + let id = node.node_index as u32; + endpoints.insert(id, (rx_map.remove(&id), tx_map.remove(&id).unwrap_or_default())); + } + + Self { endpoints } + } + + pub fn take_endpoints( + &mut self, + id: u32, + ) -> (Option>, Vec>) { + self.endpoints + .remove(&id) + .expect("Critical: Execution Graph Inconsistent") + } +} diff --git a/src/runtime/streaming/job/job_manager.rs b/src/runtime/streaming/job/job_manager.rs new file mode 100644 index 00000000..19a8a26e --- /dev/null +++ b/src/runtime/streaming/job/job_manager.rs @@ -0,0 +1,481 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::collections::HashMap; +use std::sync::{Arc, OnceLock, RwLock}; + +use anyhow::anyhow; +use tokio::sync::mpsc; +use tokio_stream::wrappers::ReceiverStream; +use tracing::{error, info, warn}; + +use protocol::grpc::api::{ChainedOperator, FsProgram}; + +use crate::runtime::streaming::api::context::TaskContext; +use crate::runtime::streaming::api::operator::{ConstructedOperator, Operator}; +use crate::runtime::streaming::api::source::SourceOperator; +use crate::runtime::streaming::execution::runner::{ChainedDriver, Pipeline}; +use crate::runtime::streaming::execution::source::SourceRunner; +use crate::runtime::streaming::factory::OperatorFactory; +use crate::runtime::streaming::job::edge_manager::EdgeManager; +use crate::runtime::streaming::job::models::{PhysicalExecutionGraph, PhysicalPipeline, PipelineStatus}; +use crate::runtime::streaming::memory::MemoryPool; +use crate::runtime::streaming::network::endpoint::{BoxedEventStream, PhysicalSender}; +use crate::runtime::streaming::protocol::control::{ControlCommand, StopMode}; + +#[derive(Debug, Clone)] +pub struct StreamingJobSummary { + pub job_id: String, + pub status: String, + pub pipeline_count: i32, + pub uptime_secs: u64, +} + +#[derive(Debug, Clone)] +pub struct PipelineDetail { + pub pipeline_id: u32, + pub status: String, +} + +#[derive(Debug, Clone)] +pub struct StreamingJobDetail { + pub job_id: String, + pub status: String, + pub pipeline_count: i32, + pub uptime_secs: u64, + pub pipelines: Vec, + pub program: FsProgram, +} + +static GLOBAL_JOB_MANAGER: OnceLock> = OnceLock::new(); + +pub struct JobManager { + active_jobs: Arc>>, + operator_factory: Arc, + memory_pool: Arc, +} + +struct PreparedChain { + source: Option>, + operators: Vec>, +} + +impl JobManager { + pub fn new(operator_factory: Arc, max_memory_bytes: usize) -> Self { + Self { + active_jobs: Arc::new(RwLock::new(HashMap::new())), + operator_factory, + memory_pool: MemoryPool::new(max_memory_bytes), + } + } + + pub fn 
init(operator_factory: Arc, max_memory_bytes: usize) -> anyhow::Result<()> { + let manager = Arc::new(Self::new(operator_factory, max_memory_bytes)); + GLOBAL_JOB_MANAGER + .set(manager) + .map_err(|_| anyhow!("JobManager singleton already initialized")) + } + + pub fn global() -> anyhow::Result> { + GLOBAL_JOB_MANAGER + .get() + .cloned() + .ok_or_else(|| anyhow!("JobManager not initialized. Call init() first.")) + } + + /// + pub async fn submit_job(&self, job_id: String, program: FsProgram) -> anyhow::Result { + let mut edge_manager = EdgeManager::build(&program.nodes, &program.edges); + let mut pipelines = HashMap::new(); + + for node in &program.nodes { + let pipeline_id = node.node_index as u32; + + let (raw_inboxes, raw_outboxes) = edge_manager.take_endpoints(pipeline_id); + let physical_outboxes = raw_outboxes.into_iter().map(PhysicalSender::Local).collect(); + let physical_inboxes: Vec = raw_inboxes + .into_iter() + .map(|rx| Box::pin(ReceiverStream::new(rx)) as _) + .collect(); + + let chain = self.build_operator_chain(&node.operators)?; + if chain.source.is_none() && physical_inboxes.is_empty() { + anyhow::bail!( + "Topology Error: pipeline '{}' contains no source operator and has no upstream inputs.", + pipeline_id + ); + } + if chain.source.is_some() && !physical_inboxes.is_empty() { + anyhow::bail!( + "Topology Error: source pipeline '{}' should not have upstream inputs.", + pipeline_id + ); + } + + let (control_tx, control_rx) = mpsc::channel(64); + let status = Arc::new(RwLock::new(PipelineStatus::Initializing)); + + let handle = if let Some(source) = chain.source { + self.spawn_source_pipeline_thread( + job_id.clone(), + pipeline_id, + source, + chain.operators, + physical_outboxes, + control_rx, + Arc::clone(&status), + )? + } else { + self.spawn_pipeline_thread( + job_id.clone(), + pipeline_id, + chain.operators, + physical_inboxes, + physical_outboxes, + control_rx, + Arc::clone(&status), + )? 
+ }; + + pipelines.insert( + pipeline_id, + PhysicalPipeline { + pipeline_id, + handle: Some(handle), + status, + control_tx, + }, + ); + } + + let graph = PhysicalExecutionGraph { + job_id: job_id.clone(), + program, + pipelines, + start_time: std::time::Instant::now(), + }; + + self.active_jobs.write().unwrap().insert(job_id.clone(), graph); + info!(job_id = %job_id, "Job submitted successfully."); + + Ok(job_id) + } + + pub async fn stop_job(&self, job_id: &str, mode: StopMode) -> anyhow::Result<()> { + let control_senders: Vec<_> = { + let jobs_guard = self.active_jobs.read().unwrap(); + let graph = jobs_guard + .get(job_id) + .ok_or_else(|| anyhow::anyhow!("Job not found: {job_id}"))?; + + graph.pipelines.values().map(|p| p.control_tx.clone()).collect() + }; + + for tx in control_senders { + let _ = tx.send(ControlCommand::Stop { mode: mode.clone() }).await; + } + + info!(job_id = %job_id, mode = ?mode, "Job stop signal dispatched."); + Ok(()) + } + + pub fn get_pipeline_statuses(&self, job_id: &str) -> Option> { + let jobs_guard = self.active_jobs.read().unwrap(); + let graph = jobs_guard.get(job_id)?; + + Some( + graph.pipelines + .iter() + .map(|(id, pipeline)| { + (*id, pipeline.status.read().unwrap().clone()) + }) + .collect(), + ) + } + + pub fn list_jobs(&self) -> Vec { + let jobs_guard = self.active_jobs.read().unwrap(); + jobs_guard + .values() + .map(|graph| { + let pipeline_count = graph.pipelines.len() as i32; + let uptime_secs = graph.start_time.elapsed().as_secs(); + let status = Self::aggregate_pipeline_status(&graph.pipelines); + StreamingJobSummary { + job_id: graph.job_id.clone(), + status, + pipeline_count, + uptime_secs, + } + }) + .collect() + } + + pub fn get_job_detail(&self, job_id: &str) -> Option { + let jobs_guard = self.active_jobs.read().unwrap(); + let graph = jobs_guard.get(job_id)?; + + let uptime_secs = graph.start_time.elapsed().as_secs(); + let overall_status = Self::aggregate_pipeline_status(&graph.pipelines); + + let 
pipeline_details: Vec = graph + .pipelines + .iter() + .map(|(id, pipeline)| { + let status = pipeline.status.read().unwrap().clone(); + PipelineDetail { + pipeline_id: *id, + status: format!("{status:?}"), + } + }) + .collect(); + + Some(StreamingJobDetail { + job_id: graph.job_id.clone(), + status: overall_status, + pipeline_count: graph.pipelines.len() as i32, + uptime_secs, + pipelines: pipeline_details, + program: graph.program.clone(), + }) + } + + pub fn has_job(&self, job_id: &str) -> bool { + self.active_jobs.read().unwrap().contains_key(job_id) + } + + pub async fn remove_job(&self, job_id: &str, mode: StopMode) -> anyhow::Result<()> { + { + let jobs_guard = self.active_jobs.read().unwrap(); + if !jobs_guard.contains_key(job_id) { + anyhow::bail!("Job not found: {job_id}"); + } + let graph = &jobs_guard[job_id]; + let control_senders: Vec<_> = + graph.pipelines.values().map(|p| p.control_tx.clone()).collect(); + + drop(jobs_guard); + + for tx in control_senders { + let _ = tx.send(ControlCommand::Stop { mode: mode.clone() }).await; + } + } + + self.active_jobs.write().unwrap().remove(job_id); + info!(job_id = %job_id, "Job stopped and removed."); + Ok(()) + } + + fn aggregate_pipeline_status( + pipelines: &HashMap, + ) -> String { + let mut running = 0u32; + let mut failed = 0u32; + let mut finished = 0u32; + let mut initializing = 0u32; + + for pipeline in pipelines.values() { + match &*pipeline.status.read().unwrap() { + PipelineStatus::Running => running += 1, + PipelineStatus::Failed { .. 
} => failed += 1, + PipelineStatus::Finished => finished += 1, + PipelineStatus::Initializing => initializing += 1, + PipelineStatus::Stopping => {} + } + } + + if failed > 0 { + "DEGRADED".to_string() + } else if running > 0 && running == pipelines.len() as u32 { + "RUNNING".to_string() + } else if finished == pipelines.len() as u32 { + "FINISHED".to_string() + } else if initializing > 0 { + "INITIALIZING".to_string() + } else { + "PARTIAL".to_string() + } + } + + // ======================================================================== + + fn build_operator_chain( + &self, + operator_configs: &[ChainedOperator], + ) -> anyhow::Result { + let mut source: Option> = None; + let mut chain = Vec::with_capacity(operator_configs.len()); + + for op_config in operator_configs { + let constructed = self.operator_factory + .create_operator(&op_config.operator_name, &op_config.operator_config)?; + + match constructed { + ConstructedOperator::Operator(msg_op) => chain.push(msg_op), + ConstructedOperator::Source(src_op) => { + if source.is_some() { + anyhow::bail!( + "Topology Error: Multiple source operators detected in one physical chain." 
+ ); + } + if !chain.is_empty() { + anyhow::bail!( + "Topology Error: Source operator '{}' cannot be scheduled inside a MessageOperator physical chain.", + op_config.operator_name + ); + } + source = Some(src_op); + } + } + } + Ok(PreparedChain { + source, + operators: chain, + }) + } + + fn spawn_pipeline_thread( + &self, + job_id: String, + pipeline_id: u32, + operators: Vec>, + inboxes: Vec, + outboxes: Vec, + control_rx: mpsc::Receiver, + status: Arc>, + ) -> anyhow::Result> { + let memory_pool = Arc::clone(&self.memory_pool); + let thread_name = format!("Task-{job_id}-{pipeline_id}"); + + let handle = std::thread::Builder::new() + .name(thread_name) + .spawn(move || { + *status.write().unwrap() = PipelineStatus::Running; + + let rt = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .expect("Failed to build current-thread Tokio runtime for pipeline"); + + let job_id_inner = job_id.clone(); + let execution_result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| { + rt.block_on(async move { + let ctx = TaskContext::new( + job_id_inner, + pipeline_id, + 0, + 1, + outboxes, + memory_pool, + ); + + let pipeline = Pipeline::new(operators, ctx, inboxes, control_rx) + .map_err(|e| anyhow::anyhow!("Pipeline init failed: {e}"))?; + + pipeline.run().await.map_err(|e| anyhow::anyhow!("Pipeline execution failed: {e}")) + }) + })); + + Self::handle_pipeline_exit(&job_id, pipeline_id, execution_result, &status); + })?; + + Ok(handle) + } + + fn spawn_source_pipeline_thread( + &self, + job_id: String, + pipeline_id: u32, + source: Box, + operators: Vec>, + outboxes: Vec, + control_rx: mpsc::Receiver, + status: Arc>, + ) -> anyhow::Result> { + let memory_pool = Arc::clone(&self.memory_pool); + let thread_name = format!("Task-{job_id}-{pipeline_id}"); + + let handle = std::thread::Builder::new() + .name(thread_name) + .spawn(move || { + *status.write().unwrap() = PipelineStatus::Running; + + let rt = 
tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .expect("Failed to build current-thread Tokio runtime for source pipeline"); + + let job_id_inner = job_id.clone(); + let execution_result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| { + rt.block_on(async move { + let ctx = TaskContext::new( + job_id_inner, + pipeline_id, + 0, + 1, + outboxes, + memory_pool, + ); + + let chain_head = ChainedDriver::build_chain(operators); + let runner = SourceRunner::new(source, chain_head, ctx, control_rx); + + runner + .run() + .await + .map_err(|e| anyhow::anyhow!("Source pipeline execution failed: {e}")) + }) + })); + + Self::handle_pipeline_exit(&job_id, pipeline_id, execution_result, &status); + })?; + + Ok(handle) + } + + fn handle_pipeline_exit( + job_id: &str, + pipeline_id: u32, + thread_result: std::thread::Result>, + status: &RwLock, + ) { + let mut is_fatal = false; + let final_status = match thread_result { + Ok(Ok(_)) => { + info!(job_id = %job_id, pipeline_id = pipeline_id, "Pipeline finished gracefully."); + PipelineStatus::Finished + } + Ok(Err(e)) => { + error!(job_id = %job_id, pipeline_id = pipeline_id, error = %e, "Pipeline failed."); + is_fatal = true; + PipelineStatus::Failed { + error: e.to_string(), + is_panic: false, + } + } + Err(_) => { + error!(job_id = %job_id, pipeline_id = pipeline_id, "Pipeline thread panicked!"); + is_fatal = true; + PipelineStatus::Failed { + error: "Task thread encountered an unexpected panic".into(), + is_panic: true, + } + } + }; + + *status.write().unwrap() = final_status; + + if is_fatal { + warn!(job_id = %job_id, pipeline_id = pipeline_id, "Pipeline failure detected, Job should be aborted or recovered."); + } + } +} diff --git a/src/runtime/streaming/job/mod.rs b/src/runtime/streaming/job/mod.rs new file mode 100644 index 00000000..02e0343c --- /dev/null +++ b/src/runtime/streaming/job/mod.rs @@ -0,0 +1,17 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you 
may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +pub mod edge_manager; +pub mod job_manager; +pub mod models; + +pub use job_manager::{JobManager, StreamingJobSummary}; diff --git a/src/runtime/streaming/job/models.rs b/src/runtime/streaming/job/models.rs new file mode 100644 index 00000000..45ea3bb7 --- /dev/null +++ b/src/runtime/streaming/job/models.rs @@ -0,0 +1,44 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::collections::HashMap; +use std::sync::{Arc, RwLock}; +use std::thread::JoinHandle; +use std::time::Instant; + +use protocol::grpc::api::FsProgram; +use tokio::sync::mpsc; + +use crate::runtime::streaming::protocol::control::ControlCommand; + +#[derive(Debug, Clone, PartialEq)] +pub enum PipelineStatus { + Initializing, + Running, + Failed { error: String, is_panic: bool }, + Finished, + Stopping, +} + +pub struct PhysicalPipeline { + pub pipeline_id: u32, + pub handle: Option>, + pub status: Arc>, + pub control_tx: mpsc::Sender, +} + +pub struct PhysicalExecutionGraph { + pub job_id: String, + pub program: FsProgram, + pub pipelines: HashMap, + pub start_time: Instant, +} diff --git a/src/runtime/streaming/memory/mod.rs b/src/runtime/streaming/memory/mod.rs new file mode 100644 index 00000000..45fc3194 --- /dev/null +++ b/src/runtime/streaming/memory/mod.rs @@ -0,0 +1,17 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +pub mod pool; +pub mod ticket; + +pub use pool::MemoryPool; +pub use ticket::MemoryTicket; diff --git a/src/runtime/streaming/memory/pool.rs b/src/runtime/streaming/memory/pool.rs new file mode 100644 index 00000000..4813a63e --- /dev/null +++ b/src/runtime/streaming/memory/pool.rs @@ -0,0 +1,86 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use parking_lot::Mutex; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::Arc; +use tokio::sync::Notify; +use tracing::{debug, warn}; + +use super::ticket::MemoryTicket; + +#[derive(Debug)] +pub struct MemoryPool { + max_bytes: usize, + used_bytes: AtomicUsize, + available_bytes: Mutex, + notify: Notify, +} + +impl MemoryPool { + pub fn new(max_bytes: usize) -> Arc { + Arc::new(Self { + max_bytes, + used_bytes: AtomicUsize::new(0), + available_bytes: Mutex::new(max_bytes), + notify: Notify::new(), + }) + } + + pub fn usage_metrics(&self) -> (usize, usize) { + (self.used_bytes.load(Ordering::Relaxed), self.max_bytes) + } + + pub async fn request_memory(self: &Arc, bytes: usize) -> MemoryTicket { + if bytes == 0 { + return MemoryTicket::new(0, self.clone()); + } + + if bytes > self.max_bytes { + warn!( + "Requested memory ({} B) exceeds total pool size ({} B)! 
\ + Permitting to avoid pipeline deadlock, but OOM risk is critical.", + bytes, self.max_bytes + ); + self.used_bytes.fetch_add(bytes, Ordering::Relaxed); + return MemoryTicket::new(bytes, self.clone()); + } + + loop { + { + let mut available = self.available_bytes.lock(); + if *available >= bytes { + *available -= bytes; + self.used_bytes.fetch_add(bytes, Ordering::Relaxed); + return MemoryTicket::new(bytes, self.clone()); + } + } + + debug!("Backpressure engaged: waiting for {} bytes to be freed...", bytes); + self.notify.notified().await; + } + } + + pub(crate) fn release(&self, bytes: usize) { + if bytes == 0 { + return; + } + + { + let mut available = self.available_bytes.lock(); + *available += bytes; + } + + self.used_bytes.fetch_sub(bytes, Ordering::Relaxed); + self.notify.notify_waiters(); + } +} diff --git a/src/runtime/streaming/memory/ticket.rs b/src/runtime/streaming/memory/ticket.rs new file mode 100644 index 00000000..cb105be0 --- /dev/null +++ b/src/runtime/streaming/memory/ticket.rs @@ -0,0 +1,33 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::sync::Arc; + +use super::pool::MemoryPool; + +#[derive(Debug)] +pub struct MemoryTicket { + bytes: usize, + pool: Arc, +} + +impl MemoryTicket { + pub(crate) fn new(bytes: usize, pool: Arc) -> Self { + Self { bytes, pool } + } +} + +impl Drop for MemoryTicket { + fn drop(&mut self) { + self.pool.release(self.bytes); + } +} diff --git a/src/runtime/streaming/mod.rs b/src/runtime/streaming/mod.rs new file mode 100644 index 00000000..7e0ba57a --- /dev/null +++ b/src/runtime/streaming/mod.rs @@ -0,0 +1,27 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Streaming actor runtime (vendored from Arroyo `arroyo-actor-runtime`). + +pub mod api; +pub mod error; +pub mod execution; +pub mod factory; +pub mod format; +pub mod job; +pub mod memory; +pub mod network; +pub mod operators; +pub mod protocol; + +pub use protocol::StreamOutput; diff --git a/src/runtime/streaming/network/endpoint.rs b/src/runtime/streaming/network/endpoint.rs new file mode 100644 index 00000000..7448e9cd --- /dev/null +++ b/src/runtime/streaming/network/endpoint.rs @@ -0,0 +1,64 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use crate::runtime::streaming::protocol::event::StreamEvent; +use crate::runtime::streaming::protocol::tracked::TrackedEvent; +use anyhow::{anyhow, Result}; +use std::pin::Pin; +use tokio::sync::mpsc; +use tokio_stream::Stream; +use tracing::debug; + +// ======================================================================== +// ======================================================================== + +#[derive(Clone)] +pub struct RemoteSenderStub { + pub target_addr: String, +} + +impl RemoteSenderStub { + pub async fn send_over_network(&self, _event: &StreamEvent) -> Result<()> { + unimplemented!("Remote network transport is not yet implemented") + } +} + +// ======================================================================== +// ======================================================================== + +#[derive(Clone)] +pub enum PhysicalSender { + Local(mpsc::Sender), + Remote(RemoteSenderStub), +} + +impl PhysicalSender { + pub async fn send(&self, tracked_event: TrackedEvent) -> Result<()> { + match self { + PhysicalSender::Local(tx) => { + tx.send(tracked_event) + .await + .map_err(|_| anyhow!("Local channel closed! 
Downstream task may have crashed."))?; + } + PhysicalSender::Remote(stub) => { + stub.send_over_network(&tracked_event.event).await?; + debug!("Sent event over network, local memory ticket will be released."); + } + } + Ok(()) + } +} + +// ======================================================================== +// ======================================================================== + +pub type BoxedEventStream = Pin + Send>>; diff --git a/src/runtime/streaming/network/environment.rs b/src/runtime/streaming/network/environment.rs new file mode 100644 index 00000000..fe8544c5 --- /dev/null +++ b/src/runtime/streaming/network/environment.rs @@ -0,0 +1,51 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use super::endpoint::{BoxedEventStream, PhysicalSender}; +use std::collections::HashMap; + +pub type VertexId = u32; +pub type SubtaskIndex = u32; + +pub struct NetworkEnvironment { + pub outboxes: HashMap<(VertexId, SubtaskIndex), Vec>, + pub inboxes: HashMap<(VertexId, SubtaskIndex), Vec>, +} + +impl NetworkEnvironment { + pub fn new() -> Self { + Self { + outboxes: HashMap::new(), + inboxes: HashMap::new(), + } + } + + pub fn take_outboxes( + &mut self, + vertex_id: VertexId, + subtask_idx: SubtaskIndex, + ) -> Vec { + self.outboxes + .remove(&(vertex_id, subtask_idx)) + .unwrap_or_default() + } + + pub fn take_inboxes( + &mut self, + vertex_id: VertexId, + subtask_idx: SubtaskIndex, + ) -> Vec { + self.inboxes + .remove(&(vertex_id, subtask_idx)) + .unwrap_or_default() + } +} diff --git a/src/runtime/streaming/network/mod.rs b/src/runtime/streaming/network/mod.rs new file mode 100644 index 00000000..16100133 --- /dev/null +++ b/src/runtime/streaming/network/mod.rs @@ -0,0 +1,15 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +pub mod endpoint; +pub mod environment; + diff --git a/src/runtime/streaming/operators/grouping/incremental_aggregate.rs b/src/runtime/streaming/operators/grouping/incremental_aggregate.rs new file mode 100644 index 00000000..f895c173 --- /dev/null +++ b/src/runtime/streaming/operators/grouping/incremental_aggregate.rs @@ -0,0 +1,726 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use anyhow::{bail, Result}; +use arrow::compute::max_array; +use arrow::row::{RowConverter, SortField}; +use arrow_array::builder::{ + BinaryBuilder, TimestampNanosecondBuilder, UInt32Builder, UInt64Builder, +}; +use arrow_array::cast::AsArray; +use arrow_array::types::UInt64Type; +use arrow_array::{ + Array, ArrayRef, BooleanArray, RecordBatch, StructArray, +}; +use arrow_schema::{DataType, Field, FieldRef, Schema, SchemaBuilder, TimeUnit}; +use datafusion::common::{Result as DFResult, ScalarValue}; +use datafusion::physical_expr::aggregate::AggregateFunctionExpr; +use datafusion::physical_plan::{Accumulator, PhysicalExpr}; +use crate::sql::common::constants::updating_state_field; +use datafusion_proto::physical_plan::DefaultPhysicalExtensionCodec; +use datafusion_proto::physical_plan::from_proto::parse_physical_expr; +use datafusion_proto::protobuf::PhysicalExprNode; +use datafusion_proto::protobuf::PhysicalPlanNode; +use datafusion_proto::protobuf::physical_plan_node::PhysicalPlanType; +use itertools::Itertools; +use prost::Message; +use std::collections::HashSet; +use 
std::sync::LazyLock; +use std::time::{Duration, Instant, SystemTime}; +use std::{collections::HashMap, mem, sync::Arc}; +use tracing::{debug, warn}; +use protocol::grpc::api::UpdatingAggregateOperator; +// ========================================================================= +// ========================================================================= +use crate::runtime::streaming::api::context::TaskContext; +use crate::runtime::streaming::api::operator::Operator; +use crate::runtime::streaming::factory::Registry; +use crate::runtime::util::decode_aggregate; +use crate::runtime::streaming::operators::{Key, UpdatingCache}; +use crate::runtime::streaming::StreamOutput; +use crate::sql::common::{to_nanos, CheckpointBarrier, FsSchema, Watermark, TIMESTAMP_FIELD, UPDATING_META_FIELD}; +use crate::sql::physical::updating_meta_fields; + +#[derive(Debug, Copy, Clone)] +struct BatchData { + count: u64, + generation: u64, +} + +impl BatchData { + fn new(generation: u64) -> Self { + Self { count: 1, generation } + } + + fn inc(&mut self) { + self.count += 1; + self.generation += 1; + } + + fn dec(&mut self) { + self.count = self.count.checked_sub(1).unwrap_or_default(); + self.generation += 1; + } +} + +#[derive(Debug)] +enum IncrementalState { + Sliding { + expr: Arc, + accumulator: Box, + }, + Batch { + expr: Arc, + data: HashMap, + row_converter: Arc, + changed_values: HashSet, + }, +} + +impl IncrementalState { + fn update_batch(&mut self, new_generation: u64, batch: &[ArrayRef]) -> DFResult<()> { + match self { + IncrementalState::Sliding { accumulator, .. } => { + accumulator.update_batch(batch)?; + } + IncrementalState::Batch { data, row_converter, changed_values, .. 
} => { + for r in row_converter.convert_columns(batch)?.iter() { + if data.contains_key(r.as_ref()) { + data.get_mut(r.as_ref()).unwrap().inc(); + changed_values.insert(data.get_key_value(r.as_ref()).unwrap().0.clone()); + } else { + let key = Key(Arc::new(r.as_ref().to_vec())); + data.insert(key.clone(), BatchData::new(new_generation)); + changed_values.insert(key); + } + } + } + } + Ok(()) + } + + fn retract_batch(&mut self, batch: &[ArrayRef]) -> DFResult<()> { + match self { + IncrementalState::Sliding { accumulator, .. } => accumulator.retract_batch(batch), + IncrementalState::Batch { data, row_converter, changed_values, .. } => { + for r in row_converter.convert_columns(batch)?.iter() { + match data.get(r.as_ref()).map(|d| d.count) { + Some(0) => { + debug!("tried to retract value for key with count 0; implies append lost"); + } + Some(_) => { + data.get_mut(r.as_ref()).unwrap().dec(); + changed_values.insert(data.get_key_value(r.as_ref()).unwrap().0.clone()); + } + None => { + debug!("tried to retract value for missing key: implies append lost"); + } + } + } + Ok(()) + } + } + } + + fn evaluate(&mut self) -> DFResult { + match self { + IncrementalState::Sliding { accumulator, .. } => accumulator.evaluate(), + IncrementalState::Batch { expr, data, row_converter, .. 
} => { + let parser = row_converter.parser(); + let input = row_converter.convert_rows( + data.iter() + .filter(|(_, c)| c.count > 0) + .map(|(v, _)| parser.parse(&v.0)), + )?; + let mut acc = expr.create_accumulator()?; + acc.update_batch(&input)?; + acc.evaluate_mut() + } + } + } +} + +#[derive(Debug, Copy, Clone, Eq, PartialEq)] +enum AccumulatorType { + Sliding, + Batch, +} + +impl AccumulatorType { + fn state_fields(&self, agg: &AggregateFunctionExpr) -> DFResult> { + Ok(match self { + AccumulatorType::Sliding => agg.sliding_state_fields()?, + AccumulatorType::Batch => vec![], + }) + } +} + +#[derive(Debug)] +struct Aggregator { + func: Arc, + accumulator_type: AccumulatorType, + row_converter: Arc, + state_cols: Vec, +} + +// ========================================================================= +// ========================================================================= + +pub struct IncrementalAggregatingFunc { + flush_interval: Duration, + metadata_expr: Arc, + aggregates: Vec, + accumulators: UpdatingCache>, + updated_keys: HashMap>>, + + input_schema: Arc, + has_routing_keys: bool, + + sliding_state_schema: Arc, + batch_state_schema: Arc, + schema_without_metadata: Arc, + final_output_schema: Arc, + ttl: Duration, + key_converter: RowConverter, + new_generation: u64, +} + +static GLOBAL_KEY: LazyLock>> = LazyLock::new(|| Arc::new(Vec::new())); + +impl IncrementalAggregatingFunc { + fn update_batch(&mut self, key: &[u8], batch: &[Vec], idx: Option) -> DFResult<()> { + self.accumulators + .modify_and_update(key, Instant::now(), |values| { + for (inputs, accs) in batch.iter().zip(values.iter_mut()) { + let values = if let Some(idx) = idx { + &inputs.iter().map(|c| c.slice(idx, 1)).collect() + } else { + inputs + }; + accs.update_batch(self.new_generation, values)?; + } + Ok(()) + }) + .expect("tried to update for non-existent key") + } + + fn retract_batch(&mut self, key: &[u8], batch: &[Vec], idx: Option) -> DFResult<()> { + self.accumulators + 
.modify(key, |values| { + for (inputs, accs) in batch.iter().zip(values.iter_mut()) { + let values = if let Some(idx) = idx { + &inputs.iter().map(|c| c.slice(idx, 1)).collect() + } else { + inputs + }; + accs.retract_batch(values)?; + } + Ok::<(), datafusion::common::DataFusionError>(()) + }) + .expect("tried to retract state for non-existent key")?; + Ok(()) + } + + fn evaluate(&mut self, key: &[u8]) -> DFResult> { + self.accumulators + .get_mut(key) + .expect("tried to evaluate non-existent key") + .iter_mut() + .map(|s| s.evaluate()) + .collect::>() + } + + fn get_retracts(batch: &RecordBatch) -> Option<&BooleanArray> { + if let Some(meta_col) = batch.column_by_name(UPDATING_META_FIELD) { + let meta_struct = meta_col + .as_any() + .downcast_ref::() + .expect("_updating_meta must be StructArray"); + + let is_retract_array = meta_struct + .column_by_name(updating_state_field::IS_RETRACT) + .expect("meta struct must have is_retract"); + + Some(is_retract_array.as_any().downcast_ref::().expect("is_retract must be BooleanArray")) + } else { + None + } + } + + fn make_accumulators(&self) -> Vec { + self.aggregates + .iter() + .map(|agg| match agg.accumulator_type { + AccumulatorType::Sliding => IncrementalState::Sliding { + expr: agg.func.clone(), + accumulator: agg.func.create_sliding_accumulator().unwrap(), + }, + AccumulatorType::Batch => IncrementalState::Batch { + expr: agg.func.clone(), + data: Default::default(), + row_converter: agg.row_converter.clone(), + changed_values: Default::default(), + }, + }) + .collect() + } + + fn compute_inputs(&self, batch: &RecordBatch) -> Vec> { + self.aggregates + .iter() + .map(|agg| { + agg.func + .expressions() + .iter() + .map(|ex| ex.evaluate(batch).unwrap().into_array(batch.num_rows()).unwrap()) + .collect::>() + }) + .collect::>() + } + + fn global_aggregate(&mut self, batch: &RecordBatch) -> Result<()> { + let retracts = Self::get_retracts(batch); + let aggregate_input_cols = self.compute_inputs(&batch); + + let mut 
first = false; + if !self + .accumulators + .contains_key(GLOBAL_KEY.as_ref().as_slice()) + { + first = true; + self.accumulators.insert( + GLOBAL_KEY.clone(), + Instant::now(), + self.new_generation, + self.make_accumulators(), + ); + } + + if !self + .updated_keys + .contains_key(GLOBAL_KEY.as_ref().as_slice()) + { + if first { + self.updated_keys.insert(Key(GLOBAL_KEY.clone()), None); + } else { + let v = Some(self.evaluate(GLOBAL_KEY.as_ref().as_slice())?); + self.updated_keys.insert(Key(GLOBAL_KEY.clone()), v); + } + } + + if let Some(retracts) = retracts { + for (i, r) in retracts.iter().enumerate() { + if r.unwrap_or_default() { + self.retract_batch( + GLOBAL_KEY.as_ref().as_slice(), + &aggregate_input_cols, + Some(i), + )?; + } else { + self.update_batch( + GLOBAL_KEY.as_ref().as_slice(), + &aggregate_input_cols, + Some(i), + )?; + } + } + } else { + self.update_batch( + GLOBAL_KEY.as_ref().as_slice(), + &aggregate_input_cols, + None, + ) + .unwrap(); + } + Ok(()) + } + + fn keyed_aggregate(&mut self, batch: &RecordBatch) -> Result<()> { + let retracts = Self::get_retracts(batch); + + let sort_columns = &self.input_schema + .sort_columns(batch, false) + .into_iter() + .map(|e| e.values) + .collect::>(); + + let keys = self.key_converter.convert_columns(sort_columns).unwrap(); + + for k in &keys { + if !self.updated_keys.contains_key(k.as_ref()) { + if let Some((key, accs)) = self.accumulators.get_mut_key_value(k.as_ref()) { + self.updated_keys.insert(key, Some(accs.iter_mut().map(|s| s.evaluate()).collect::>()?)); + } else { + self.updated_keys.insert(Key(Arc::new(k.as_ref().to_vec())), None); + } + } + } + + let aggregate_input_cols = self.compute_inputs(&batch); + + for (i, key) in keys.iter().enumerate() { + if !self.accumulators.contains_key(key.as_ref()) { + self.accumulators.insert(Arc::new(key.as_ref().to_vec()), Instant::now(), 0, self.make_accumulators()); + }; + + let retract = retracts.map(|r| r.value(i)).unwrap_or_default(); + if retract { + 
self.retract_batch(key.as_ref(), &aggregate_input_cols, Some(i))?; + } else { + self.update_batch(key.as_ref(), &aggregate_input_cols, Some(i))?; + } + } + Ok(()) + } + + // ========================================================================= + // ========================================================================= + + fn checkpoint_sliding(&mut self) -> DFResult>> { + if self.updated_keys.is_empty() { return Ok(None); } + + let mut states = vec![vec![]; self.sliding_state_schema.schema.fields.len()]; + let parser = self.key_converter.parser(); + let mut generation_builder = UInt64Builder::with_capacity(self.updated_keys.len()); + + let mut cols = self.key_converter.convert_rows(self.updated_keys.keys().map(|k| { + let (accumulators, generation) = self.accumulators.get_mut_generation(k.0.as_ref()).unwrap(); + generation_builder.append_value(generation); + + for (state, agg) in accumulators.iter_mut().zip(self.aggregates.iter()) { + let IncrementalState::Sliding { expr, accumulator } = state else { continue; }; + let state = accumulator.state().unwrap_or_else(|_| { + let state = accumulator.state().unwrap(); + *accumulator = expr.create_sliding_accumulator().unwrap(); + let states: Vec<_> = state.iter().map(|s| s.to_array()).try_collect().unwrap(); + accumulator.merge_batch(&states).unwrap(); + state + }); + + for (idx, v) in agg.state_cols.iter().zip(state.into_iter()) { + states[*idx].push(v); + } + } + parser.parse(k.0.as_ref()) + }))?; + + cols.extend(states.into_iter().skip(cols.len()).map(|c| ScalarValue::iter_to_array(c).unwrap())); + + let generations = generation_builder.finish(); + self.new_generation = self.new_generation.max(max_array::(&generations).unwrap()); + cols.push(Arc::new(generations)); + + Ok(Some(cols)) + } + + fn checkpoint_batch(&mut self) -> DFResult>> { + if self.aggregates.iter().all(|agg| agg.accumulator_type == AccumulatorType::Sliding) { return Ok(None); } + if self.updated_keys.is_empty() { return Ok(None); } + + let size = 
self.updated_keys.len(); + let mut rows = Vec::with_capacity(size); + let mut accumulator_builder = UInt32Builder::with_capacity(size); + let mut args_row_builder = BinaryBuilder::with_capacity(size, size * 4); + let mut count_builder = UInt64Builder::with_capacity(size); + let mut timestamp_builder = TimestampNanosecondBuilder::with_capacity(size); + let mut generation_builder = UInt64Builder::with_capacity(size); + + let now = to_nanos(SystemTime::now()) as i64; + let parser = self.key_converter.parser(); + + for k in self.updated_keys.keys() { + let row = parser.parse(&k.0); + for (i, state) in self.accumulators.get_mut(k.0.as_ref()).unwrap().iter_mut().enumerate() { + let IncrementalState::Batch { data, changed_values, .. } = state else { continue; }; + + for vk in changed_values.iter() { + if let Some(count) = data.get(vk) { + accumulator_builder.append_value(i as u32); + args_row_builder.append_value(&*vk.0); + count_builder.append_value(count.count); + generation_builder.append_value(count.generation); + timestamp_builder.append_value(now); + rows.push(row.to_owned()) + } + } + data.retain(|_, v| v.count > 0); + } + } + + let mut cols = self.key_converter.convert_rows(rows.into_iter())?; + cols.push(Arc::new(accumulator_builder.finish())); + cols.push(Arc::new(args_row_builder.finish())); + cols.push(Arc::new(count_builder.finish())); + cols.push(Arc::new(timestamp_builder.finish())); + + let generations = generation_builder.finish(); + self.new_generation = self.new_generation.max(max_array::(&generations).unwrap()); + cols.push(Arc::new(generations)); + + Ok(Some(cols)) + } + + fn restore_sliding(&mut self, key: &[u8], now: Instant, i: usize, aggregate_states: &Vec>, generation: u64) -> Result<()> { + let mut accumulators = self.make_accumulators(); + for ((_, state_cols), acc) in self.aggregates.iter().zip(aggregate_states.iter()).zip(accumulators.iter_mut()) { + if let IncrementalState::Sliding { accumulator, .. 
} = acc { + accumulator.merge_batch(&state_cols.iter().map(|c| c.slice(i, 1)).collect_vec())? + } + } + self.accumulators.insert(Arc::new(key.to_vec()), now, generation, accumulators); + Ok(()) + } + + async fn initialize(&mut self, _ctx: &mut TaskContext) -> Result<()> { + + let mut deleted_keys = vec![]; + for (k, v) in self.accumulators.iter_mut() { + let is_deleted = v.last_mut().unwrap().evaluate()?.is_null(); + if is_deleted { deleted_keys.push(k.clone()); } + else { + for is in v { + if let IncrementalState::Batch { data, .. } = is { data.retain(|_, v| v.count > 0); } + } + } + } + for k in deleted_keys { self.accumulators.remove(&k.0); } + Ok(()) + } + + fn generate_changelog(&mut self) -> Result> { + let mut output_keys = Vec::with_capacity(self.updated_keys.len() * 2); + let mut output_values = vec![Vec::with_capacity(self.updated_keys.len() * 2); self.aggregates.len()]; + let mut is_retracts = Vec::with_capacity(self.updated_keys.len() * 2); + + let (updated_keys, updated_values): (Vec<_>, Vec<_>) = mem::take(&mut self.updated_keys).into_iter().unzip(); + let mut deleted_keys = vec![]; + + for (k, retract) in updated_keys.iter().zip(updated_values.into_iter()) { + let append = self.evaluate(&k.0)?; + + if let Some(v) = retract { + if v.iter().zip(append.iter()).take(v.len() - 1).all(|(a, b)| a == b) { continue; } + is_retracts.push(true); + output_keys.push(k.clone()); + for (out, val) in output_values.iter_mut().zip(v) { out.push(val); } + } + + if !append.last().unwrap().is_null() { + is_retracts.push(false); + output_keys.push(k.clone()); + for (out, val) in output_values.iter_mut().zip(append) { out.push(val); } + } else { + deleted_keys.push(k); + } + } + + for k in deleted_keys { self.accumulators.remove(&k.0); } + + let mut ttld_keys = vec![]; + for (k, mut v) in self.accumulators.time_out(Instant::now()) { + is_retracts.push(true); + ttld_keys.push(k); + for (out, val) in output_values.iter_mut().zip(v.iter_mut().map(|s| s.evaluate())) { 
out.push(val?); } + } + + if output_keys.is_empty() && ttld_keys.is_empty() { return Ok(None); } + + let row_parser = self.key_converter.parser(); + let mut result_cols = self.key_converter.convert_rows( + output_keys.iter().map(|k| row_parser.parse(k.0.as_slice())) + .chain(ttld_keys.iter().map(|k| row_parser.parse(k.as_slice()))) + )?; + + for acc in output_values.into_iter() { result_cols.push(ScalarValue::iter_to_array(acc).unwrap()); } + + let record_batch = RecordBatch::try_new(self.schema_without_metadata.clone(), result_cols).unwrap(); + + let metadata = self.metadata_expr.evaluate(&record_batch).unwrap().into_array(record_batch.num_rows()).unwrap(); + let metadata = set_retract_metadata(metadata, Arc::new(BooleanArray::from(is_retracts))); + + let mut final_batch = record_batch.columns().to_vec(); + final_batch.push(metadata); + + Ok(Some(RecordBatch::try_new( + self.final_output_schema.clone(), + final_batch, + )?)) + } +} + +fn set_retract_metadata(metadata: ArrayRef, is_retract: Arc) -> ArrayRef { + let metadata = metadata.as_struct(); + let arrays: Vec> = vec![is_retract, metadata.column(1).clone()]; + Arc::new(StructArray::new(updating_meta_fields(), arrays, None)) +} + +// ========================================================================= +// ========================================================================= + +#[async_trait::async_trait] +impl Operator for IncrementalAggregatingFunc { + fn name(&self) -> &str { + "UpdatingAggregatingFunc" + } + + async fn on_start(&mut self, ctx: &mut TaskContext) -> Result<()> { + self.initialize(ctx).await?; + Ok(()) + } + + async fn process_data( + &mut self, + _input_idx: usize, + batch: RecordBatch, + _ctx: &mut TaskContext, + ) -> Result> { + if self.has_routing_keys { + self.keyed_aggregate(&batch)?; + } else { + self.global_aggregate(&batch)?; + } + + Ok(vec![]) + } + + async fn process_watermark( + &mut self, + _watermark: Watermark, + _ctx: &mut TaskContext, + ) -> Result> { + if let 
Some(changelog_batch) = self.generate_changelog()? { + Ok(vec![StreamOutput::Forward(changelog_batch)]) + } else { + Ok(vec![]) + } + } + + async fn snapshot_state( + &mut self, + _barrier: CheckpointBarrier, + _ctx: &mut TaskContext, + ) -> Result<()> { + Ok(()) + } + + async fn on_close(&mut self, _ctx: &mut TaskContext) -> Result> { + Ok(vec![]) + } +} + +// ========================================================================= +// ========================================================================= + +pub struct IncrementalAggregatingConstructor; + +impl IncrementalAggregatingConstructor { + pub fn with_config( + &self, + config: UpdatingAggregateOperator, + registry: Arc, + ) -> anyhow::Result { + let ttl = Duration::from_micros(if config.ttl_micros == 0 { + warn!("ttl was not set for updating aggregate"); + 24 * 60 * 60 * 1000 * 1000 + } else { + config.ttl_micros + }); + + let input_schema: FsSchema = config.input_schema.unwrap().try_into()?; + let final_schema: FsSchema = config.final_schema.unwrap().try_into()?; + let mut schema_without_metadata = SchemaBuilder::from((*final_schema.schema).clone()); + schema_without_metadata.remove(final_schema.schema.index_of(UPDATING_META_FIELD).unwrap()); + + let metadata_expr = parse_physical_expr( + &PhysicalExprNode::decode(&mut config.metadata_expr.as_slice())?, + registry.as_ref(), + &input_schema.schema, + &DefaultPhysicalExtensionCodec {}, + )?; + + let aggregate_exec = PhysicalPlanNode::decode(&mut config.aggregate_exec.as_ref())?; + let PhysicalPlanType::Aggregate(aggregate_exec) = aggregate_exec.physical_plan_type.unwrap() else { bail!("invalid proto"); }; + + let mut sliding_state_fields = input_schema.routing_keys() + .map(|v| v.iter().map(|idx| input_schema.schema.field(*idx).clone()).collect_vec()) + .unwrap_or_default(); + + let has_routing_keys = input_schema.routing_keys().is_some(); + let mut batch_state_fields = sliding_state_fields.clone(); + let key_fields = 
(0..sliding_state_fields.len()).collect_vec(); + + let aggregates: Vec<_> = aggregate_exec.aggr_expr.iter().zip(aggregate_exec.aggr_expr_name.iter()) + .map(|(expr, name)| Ok(decode_aggregate(&input_schema.schema, name, expr, registry.as_ref())?)) + .map_ok(|agg| { + let retract = match agg.create_sliding_accumulator() { Ok(s) => s.supports_retract_batch(), _ => false }; + (agg, if retract { AccumulatorType::Sliding } else { AccumulatorType::Batch }) + }) + .map_ok(|(agg, t)| { + let row_converter = Arc::new(RowConverter::new( + agg.expressions().iter().map(|ex| Ok(SortField::new(ex.data_type(&input_schema.schema)?))).collect::>()? + )?); + let fields = t.state_fields(&agg)?; + let field_names = fields.iter().map(|f| f.name().to_string()).collect_vec(); + sliding_state_fields.extend(fields.into_iter().map(|f| (*f).clone())); + Ok::<_, anyhow::Error>((agg, t, row_converter, field_names)) + }) + .flatten_ok() + .collect::>()?; + + let state_schema = Schema::new(sliding_state_fields); + + let aggregates = aggregates.into_iter().map(|(agg, t, row_converter, field_names)| Aggregator { + func: agg, accumulator_type: t, row_converter, + state_cols: field_names.iter().map(|f| state_schema.index_of(f).unwrap()).collect(), + }).collect(); + + let mut state_fields = state_schema.fields().to_vec(); + let timestamp_field = state_fields.pop().unwrap(); + state_fields.push(Arc::new((*timestamp_field).clone().with_name(TIMESTAMP_FIELD))); + + let sliding_state_schema = Arc::new(FsSchema::from_schema_keys(Arc::new(Schema::new(state_fields)), key_fields.clone())?); + + batch_state_fields.push(Field::new("accumulator", DataType::UInt32, false)); + batch_state_fields.push(Field::new("args_row", DataType::Binary, false)); + batch_state_fields.push(Field::new("count", DataType::UInt64, false)); + batch_state_fields.push(Field::new(TIMESTAMP_FIELD, DataType::Timestamp(TimeUnit::Nanosecond, None), false)); + let timestamp_index = batch_state_fields.len() - 1; + + let mut 
storage_key_fields = key_fields.clone(); + storage_key_fields.push(storage_key_fields.len()); + storage_key_fields.push(storage_key_fields.len()); + + let batch_state_schema = Arc::new(FsSchema::new( + Arc::new(Schema::new(batch_state_fields)), + timestamp_index, + Some(storage_key_fields), + Some(key_fields), + )); + + Ok(IncrementalAggregatingFunc { + flush_interval: Duration::from_micros(config.flush_interval_micros), + metadata_expr, + ttl, + aggregates, + accumulators: UpdatingCache::with_time_to_idle(ttl), + schema_without_metadata: Arc::new(schema_without_metadata.finish()), + final_output_schema: final_schema.schema.clone(), + updated_keys: Default::default(), + input_schema: Arc::new(input_schema.clone()), + has_routing_keys, + key_converter: RowConverter::new(input_schema.sort_fields(false))?, + sliding_state_schema, + batch_state_schema, + new_generation: 0, + }) + } +} \ No newline at end of file diff --git a/src/runtime/streaming/operators/grouping/mod.rs b/src/runtime/streaming/operators/grouping/mod.rs new file mode 100644 index 00000000..2a17a49d --- /dev/null +++ b/src/runtime/streaming/operators/grouping/mod.rs @@ -0,0 +1,17 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +pub mod incremental_aggregate; +pub mod updating_cache; + +pub use incremental_aggregate::IncrementalAggregatingConstructor; +pub use updating_cache::{Key, UpdatingCache}; diff --git a/src/runtime/streaming/operators/grouping/updating_cache.rs b/src/runtime/streaming/operators/grouping/updating_cache.rs new file mode 100644 index 00000000..2172535b --- /dev/null +++ b/src/runtime/streaming/operators/grouping/updating_cache.rs @@ -0,0 +1,508 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
use std::borrow::Borrow;
use std::collections::HashMap;
use std::sync::Arc;
use std::time::{Duration, Instant};

/// Shared, immutable cache key (raw encoded key bytes).
///
/// The `Arc` lets the same buffer back both the `HashMap` index and the
/// arena node without copying.
#[derive(Hash, Eq, PartialEq, Clone, Debug)]
pub struct Key(pub Arc<Vec<u8>>);

impl Borrow<[u8]> for Key {
    /// Enables `HashMap<Key, _>` lookups with a plain `&[u8]`.
    fn borrow(&self) -> &[u8] {
        &self.0
    }
}

/// Arena slot: the cached entry plus the prev/next links of an intrusive
/// doubly-linked list ordered least- to most-recently updated.
struct Node<T> {
    key: Key,
    /// `None` only while the slot sits on the free list.
    data: Option<T>,
    /// Monotonic version used to reject stale `insert`s.
    generation: u64,
    updated: Instant,
    prev: Option<usize>,
    next: Option<usize>,
}

/// LRU-style cache with time-to-idle expiration and generation-based
/// conflict resolution.
///
/// Entries live in a `Vec` arena (`nodes`) indexed by a `HashMap`;
/// recency order is an intrusive linked list where `head` is the least
/// recently updated entry and `tail` the most recent. Freed slots are
/// recycled through `free_list`.
pub struct UpdatingCache<T: Send + Sync> {
    map: HashMap<Key, usize>,
    nodes: Vec<Node<T>>,
    free_list: Vec<usize>,
    head: Option<usize>,
    tail: Option<usize>,
    ttl: Duration,
}

/// Draining iterator over entries that have been idle for at least `ttl`
/// as of `now`. Stops at the first still-fresh entry (list is in update
/// order, so everything behind it is fresh too).
struct TTLIter<'a, T: Send + Sync> {
    now: Instant,
    cache: &'a mut UpdatingCache<T>,
}

impl<T: Send + Sync> Iterator for TTLIter<'_, T> {
    type Item = (Arc<Vec<u8>>, T);

    fn next(&mut self) -> Option<Self::Item> {
        let head_idx = self.cache.head?;
        let node = &self.cache.nodes[head_idx];

        // `saturating_duration_since` guards against clocks handed in
        // out of order (e.g. `now` earlier than the last update).
        if self.now.saturating_duration_since(node.updated) < self.cache.ttl {
            return None;
        }

        let (k, v) = self.cache.pop_front()?;
        Some((k.0, v))
    }
}

impl<T: Send + Sync> UpdatingCache<T> {
    /// Creates an empty cache whose entries expire after being idle
    /// (not inserted/updated) for `ttl`.
    pub fn with_time_to_idle(ttl: Duration) -> Self {
        Self {
            map: HashMap::new(),
            nodes: Vec::new(),
            free_list: Vec::new(),
            head: None,
            tail: None,
            ttl,
        }
    }

    /// Inserts or replaces `key` with `value` at version `generation`.
    ///
    /// If the key already exists with a generation `>= generation`, the
    /// insert is a stale write and is ignored. A successful insert marks
    /// the entry most-recently-updated.
    pub fn insert(&mut self, key: Arc<Vec<u8>>, now: Instant, generation: u64, value: T) {
        let key_obj = Key(key);

        if let Some(&idx) = self.map.get(&key_obj) {
            if self.nodes[idx].generation >= generation {
                // Stale write: an equal-or-newer version is already cached.
                return;
            }
            self.nodes[idx].data = Some(value);
            self.nodes[idx].generation = generation;
            self.nodes[idx].updated = now;
            self.move_to_tail(idx);
            return;
        }

        let idx = self.allocate_node(key_obj.clone(), value, generation, now);
        self.map.insert(key_obj, idx);
        self.push_back(idx);
    }

    /// Drains entries idle for at least `ttl` as of `now`, oldest first.
    pub fn time_out(&mut self, now: Instant) -> impl Iterator<Item = (Arc<Vec<u8>>, T)> + '_ {
        TTLIter { now, cache: self }
    }

    /// Iterates over all live entries in arena (not recency) order.
    pub fn iter_mut(&mut self) -> impl Iterator<Item = (&Key, &mut T)> {
        self.nodes
            .iter_mut()
            .filter_map(|n| n.data.as_mut().map(|data| (&n.key, data)))
    }

    /// Runs `f` on the entry for `key`; on success bumps the generation,
    /// refreshes the idle timer to `now`, and marks the entry
    /// most-recently-updated.
    ///
    /// Returns `None` if the key is absent, otherwise `f`'s result.
    pub fn modify_and_update<E, F: FnOnce(&mut T) -> Result<(), E>>(
        &mut self,
        key: &[u8],
        now: Instant,
        f: F,
    ) -> Option<Result<(), E>> {
        let &idx = self.map.get(key)?;
        let node = &mut self.nodes[idx];

        if let Err(e) = f(node.data.as_mut().unwrap()) {
            return Some(Err(e));
        }

        node.generation += 1;
        node.updated = now;
        self.move_to_tail(idx);

        Some(Ok(()))
    }

    /// Runs `f` on the entry for `key` and bumps the generation on
    /// success, without touching the idle timer or recency order.
    ///
    /// Returns `None` if the key is absent, otherwise `f`'s result.
    pub fn modify<E, F: FnOnce(&mut T) -> Result<(), E>>(
        &mut self,
        key: &[u8],
        f: F,
    ) -> Option<Result<(), E>> {
        let &idx = self.map.get(key)?;
        let node = &mut self.nodes[idx];

        if let Err(e) = f(node.data.as_mut().unwrap()) {
            return Some(Err(e));
        }

        // Bug fix: bump the generation only after `f` succeeds, matching
        // `modify_and_update`. Previously the generation was advanced even
        // when `f` failed, which could make a subsequent `insert` carrying
        // the next legitimate generation be dropped as stale.
        node.generation += 1;

        Some(Ok(()))
    }

    /// Returns true if `k` is currently cached.
    pub fn contains_key(&self, k: &[u8]) -> bool {
        self.map.contains_key(k)
    }

    /// Mutable access to the value for `key`, if present. Does not touch
    /// generation, idle timer, or recency order.
    pub fn get_mut(&mut self, key: &[u8]) -> Option<&mut T> {
        let &idx = self.map.get(key)?;
        self.nodes[idx].data.as_mut()
    }

    /// Like [`get_mut`], also returning the entry's current generation.
    pub fn get_mut_generation(&mut self, key: &[u8]) -> Option<(&mut T, u64)> {
        let &idx = self.map.get(key)?;
        let node = &mut self.nodes[idx];
        Some((node.data.as_mut().unwrap(), node.generation))
    }

    /// Like [`get_mut`], also returning a clone of the stored key.
    pub fn get_mut_key_value(&mut self, key: &[u8]) -> Option<(Key, &mut T)> {
        let &idx = self.map.get(key)?;
        let node = &mut self.nodes[idx];
        Some((node.key.clone(), node.data.as_mut().unwrap()))
    }

    /// Removes `key` and returns its value; the arena slot is recycled.
    pub fn remove(&mut self, key: &[u8]) -> Option<T> {
        let &idx = self.map.get(key)?;
        self.map.remove(key);
        self.remove_node(idx);

        let data = self.nodes[idx].data.take().unwrap();
        self.free_list.push(idx);

        Some(data)
    }

    /// Removes and returns the least-recently-updated entry.
    fn pop_front(&mut self) -> Option<(Key, T)> {
        let head_idx = self.head?;
        self.remove_node(head_idx);

        let node = &mut self.nodes[head_idx];
        let key = node.key.clone();
        let data = node.data.take().unwrap();
        self.map.remove(&key);
        self.free_list.push(head_idx);

        Some((key, data))
    }

    /// Places a node in a recycled slot if one is free, else grows the arena.
    /// Returns the slot index; the node is not yet linked into the list.
    fn allocate_node(&mut self, key: Key, data: T, generation: u64, updated: Instant) -> usize {
        let new_node = Node {
            key,
            data: Some(data),
            generation,
            updated,
            prev: None,
            next: None,
        };

        if let Some(idx) = self.free_list.pop() {
            self.nodes[idx] = new_node;
            idx
        } else {
            let idx = self.nodes.len();
            self.nodes.push(new_node);
            idx
        }
    }

    /// Links `index` at the tail (most-recently-updated end) of the list.
    fn push_back(&mut self, index: usize) {
        self.nodes[index].prev = self.tail;
        self.nodes[index].next = None;

        if let Some(tail_idx) = self.tail {
            self.nodes[tail_idx].next = Some(index);
        } else {
            self.head = Some(index);
        }
        self.tail = Some(index);
    }

    /// Unlinks `index` from the list, patching head/tail as needed.
    fn remove_node(&mut self, index: usize) {
        let prev = self.nodes[index].prev;
        let next = self.nodes[index].next;

        if let Some(p) = prev {
            self.nodes[p].next = next;
        } else {
            self.head = next;
        }

        if let Some(n) = next {
            self.nodes[n].prev = prev;
        } else {
            self.tail = prev;
        }

        self.nodes[index].prev = None;
        self.nodes[index].next = None;
    }

    /// Marks `index` most-recently-updated by relinking it at the tail.
    fn move_to_tail(&mut self, index: usize) {
        if self.tail == Some(index) {
            return;
        }
        self.remove_node(index);
        self.push_back(index);
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_insert_and_modify() {
        let mut cache = UpdatingCache::with_time_to_idle(Duration::from_secs(60));

        let key = Arc::new(vec![1, 2, 3]);
        let now = Instant::now();
        cache.insert(key.clone(), now, 1, 42);

        assert!(
            cache
                .modify(key.as_ref(), |x| {
                    *x = 43;
                    Ok::<(), ()>(())
                })
                .unwrap()
                .is_ok()
        );

        assert_eq!(*cache.get_mut(key.as_ref()).unwrap(), 43);
    }

    #[test]
    fn test_timeout() {
        let mut cache = UpdatingCache::with_time_to_idle(Duration::from_millis(10));

        let key1 = Arc::new(vec![1]);
        let key2 = Arc::new(vec![2]);

        let start = Instant::now();
        cache.insert(key1.clone(), start, 1, "value1");
        cache.insert(key2.clone(), start + Duration::from_millis(5), 2, "value2");

        let check_time = start + Duration::from_millis(11);
        let timed_out: Vec<_> = cache.time_out(check_time).collect();
        assert_eq!(timed_out.len(), 1);
        assert_eq!(&*timed_out[0].0, &*key1);

        assert!(cache.contains_key(key2.as_ref()));
        assert!(!cache.contains_key(key1.as_ref()));
    }

    #[test]
    fn test_update_keeps_alive() {
        let mut cache = UpdatingCache::with_time_to_idle(Duration::from_millis(10));

        let key = Arc::new(vec![1]);
        let start = Instant::now();
        cache.insert(key.clone(), start, 1, "value");

        let update_time = start + Duration::from_millis(5);
        cache
            .modify_and_update(key.as_ref(), update_time, |_| Ok::<(), ()>(()))
            .unwrap()
            .unwrap();

        let check_time = start + Duration::from_millis(11);
        let timed_out: Vec<_> = cache.time_out(check_time).collect();
        assert!(timed_out.is_empty());
        assert!(cache.contains_key(key.as_ref()));
    }

    #[test]
    fn test_lru_eviction_order_matches_insertion() {
        let mut cache = UpdatingCache::with_time_to_idle(Duration::from_secs(60));
        let key1 = Arc::new(vec![1]);
        let key2 = Arc::new(vec![2]);
        let key3 = Arc::new(vec![3]);
        let now = Instant::now();
        cache.insert(key1.clone(), now, 1, 1);
        cache.insert(key2.clone(), now, 2, 2);
        cache.insert(key3.clone(), now, 3, 3);

        let evicted: Vec<_> = cache.time_out(now + Duration::from_secs(61)).collect();
        assert_eq!(evicted.len(), 3);
        assert_eq!(evicted[0].0.as_ref(), &*key1);
        assert_eq!(evicted[1].0.as_ref(), &*key2);
        assert_eq!(evicted[2].0.as_ref(), &*key3);
    }

    #[test]
    fn test_remove_middle_key() {
        let mut cache = UpdatingCache::with_time_to_idle(Duration::from_secs(60));
        let key1 = Arc::new(vec![1]);
        let key2 = Arc::new(vec![2]);
        let key3 = Arc::new(vec![3]);
        let now = Instant::now();
        cache.insert(key1.clone(), now, 1, 1);
        cache.insert(key2.clone(), now, 2, 2);
        cache.insert(key3.clone(), now, 3, 3);

        assert_eq!(cache.remove(&[2]).unwrap(), 2);
        assert!(cache.contains_key(&[1]));
        assert!(!cache.contains_key(&[2]));
        assert!(cache.contains_key(&[3]));

        let evicted: Vec<_> = cache.time_out(now + Duration::from_secs(61)).collect();
        assert_eq!(evicted.len(), 2);
        assert_eq!(evicted[0].0.as_ref(), &*key1);
        assert_eq!(evicted[1].0.as_ref(), &*key3);
    }

    #[test]
    fn reorder_with_update() {
        let mut cache = UpdatingCache::<u64>::with_time_to_idle(Duration::from_secs(10));
        let key1 = Arc::new(vec![1]);
        let key2 = Arc::new(vec![2]);
        let now = Instant::now();

        cache.insert(key1.clone(), now, 1, 100);
        cache.insert(key2.clone(), now, 2, 200);

        cache
            .modify_and_update(&[1], now + Duration::from_secs(1), |v| {
                *v += 1;
                Ok::<(), ()>(())
            })
            .unwrap()
            .unwrap();

        let _ = cache.modify_and_update(&[1], now + Duration::from_secs(2), |v| {
            *v += 1;
            Ok::<(), ()>(())
        });
    }

    #[test]
    fn test_ttl_eviction() {
        let ttl = Duration::from_millis(100);
        let mut cache = UpdatingCache::with_time_to_idle(ttl);
        let now = Instant::now();
        let key1 = Arc::new(vec![1]);
        let key2 = Arc::new(vec![2]);
        cache.insert(key1.clone(), now, 1, 10);
        cache.insert(key2.clone(), now, 2, 20);

        cache
            .modify_and_update(&[2], now + Duration::from_millis(50), |v| {
                *v += 1;
                Ok::<(), ()>(())
            })
            .unwrap()
            .unwrap();

        let now2 = now + Duration::from_millis(150);
        let evicted: Vec<_> = cache.time_out(now2).collect();
        assert_eq!(evicted.len(), 2);
        assert_eq!(evicted[0].0.as_ref(), &[1]);
        assert_eq!(evicted[1].0.as_ref(), &[2]);
    }

    #[test]
    fn test_remove_key() {
        let ttl = Duration::from_millis(100);
        let mut cache = UpdatingCache::with_time_to_idle(ttl);
        let now = Instant::now();
        let key = Arc::new(vec![1]);
        cache.insert(key.clone(), now, 1, 42);
        let value = cache.remove(&[1]).unwrap();
        assert_eq!(value, 42);
        assert!(!cache.contains_key(&[1]));
        let evicted: Vec<_> = cache.time_out(now + Duration::from_millis(200)).collect();
        assert!(evicted.is_empty());
    }

    #[test]
    fn test_update_order() {
        let ttl = Duration::from_secs(1);
        let mut cache = UpdatingCache::with_time_to_idle(ttl);
        let base = Instant::now();
        let key_a = Arc::new(vec![b'A']);
        let key_b = Arc::new(vec![b'B']);
        let key_c = Arc::new(vec![b'C']);
        cache.insert(key_a.clone(), base, 1, 1);
        cache.insert(key_b.clone(), base, 2, 2);
        cache.insert(key_c.clone(), base, 3, 3);

        let t_update = base + Duration::from_millis(500);
        cache
            .modify_and_update(b"B", t_update, |v| {
                *v += 10;
                Ok::<(), ()>(())
            })
            .unwrap()
            .unwrap();

        let t_eviction = base + Duration::from_secs(2);
        let evicted: Vec<_> = cache.time_out(t_eviction).collect();
        assert_eq!(evicted.len(), 3);
        assert_eq!(evicted[0].0.as_ref(), b"A");
        assert_eq!(evicted[1].0.as_ref(), b"C");
        assert_eq!(evicted[2].0.as_ref(), b"B");
    }

    #[test]
    fn test_get_mut_key_value() {
        let ttl = Duration::from_secs(1);
        let mut cache = UpdatingCache::with_time_to_idle(ttl);
        let base = Instant::now();
        let key = Arc::new(vec![1, 2, 3]);
        cache.insert(key.clone(), base, 1, 42);
        if let Some((k, v)) = cache.get_mut_key_value(&[1, 2, 3]) {
            *v += 1;
            assert_eq!(*v, 43);
            assert_eq!(k.0.as_ref(), &[1, 2, 3]);
        } else {
            panic!("Key not found");
        }
    }

    #[test]
    fn test_modify_error() {
        let ttl = Duration::from_secs(1);
        let mut cache = UpdatingCache::with_time_to_idle(ttl);
        let base = Instant::now();
        let key = Arc::new(vec![1]);
        cache.insert(key.clone(), base, 1, 42);
        let res = cache.modify(&[1], |_v| Err("error"));
        assert!(res.unwrap().is_err());
    }

    #[test]
    fn test_drop_cleanup() {
        let ttl = Duration::from_secs(1);
        {
            let mut cache = UpdatingCache::with_time_to_idle(ttl);
            let base = Instant::now();
            for i in 0..10 {
                cache.insert(Arc::new(vec![i as u8]), base, i as u64, i);
            }
        }
    }

    #[test]
    fn test_generational_replacement() {
        let ttl = Duration::from_secs(1);
        let mut cache = UpdatingCache::with_time_to_idle(ttl);
        let base = Instant::now();
        let key = Arc::new(vec![1]);

        cache.insert(key.clone(), base, 1, "first");
        assert_eq!(cache.get_mut(&[1]), Some(&mut "first"));

        cache.insert(key.clone(), base, 2, "second");
        assert_eq!(cache.get_mut(&[1]), Some(&mut "second"));

        cache.insert(key.clone(), base, 1, "third");
        assert_eq!(cache.get_mut(&[1]), Some(&mut "second"));
    }

    #[test]
    fn test_failed_modify_does_not_consume_generation() {
        let mut cache = UpdatingCache::with_time_to_idle(Duration::from_secs(60));
        let base = Instant::now();
        let key = Arc::new(vec![7]);

        cache.insert(key.clone(), base, 1, 10);
        // A failing closure must leave the generation untouched …
        assert!(cache.modify(&[7], |_| Err::<(), &str>("boom")).unwrap().is_err());
        // … so the next legitimate generation is still accepted.
        cache.insert(key.clone(), base, 2, 11);
        assert_eq!(cache.get_mut(&[7]), Some(&mut 11));
    }
}
crate::sql::common::constants::mem_exec_join_side; +use crate::sql::common::{from_nanos, CheckpointBarrier, FsSchema, FsSchemaRef, Watermark}; +use crate::sql::physical::{DecodingContext, FsPhysicalExtensionCodec}; + +#[derive(Debug, Copy, Clone, Eq, PartialEq)] +enum JoinSide { + Left, + Right, +} + +impl JoinSide { + #[allow(dead_code)] + fn name(&self) -> &'static str { + match self { + JoinSide::Left => mem_exec_join_side::LEFT, + JoinSide::Right => mem_exec_join_side::RIGHT, + } + } +} + +struct JoinInstance { + left_tx: UnboundedSender, + right_tx: UnboundedSender, + result_stream: SendableRecordBatchStream, +} + +impl JoinInstance { + fn feed_data(&self, batch: RecordBatch, side: JoinSide) -> Result<()> { + match side { + JoinSide::Left => self + .left_tx + .send(batch) + .map_err(|e| anyhow!("Left send err: {}", e)), + JoinSide::Right => self + .right_tx + .send(batch) + .map_err(|e| anyhow!("Right send err: {}", e)), + } + } + + async fn close_and_drain(self) -> Result> { + drop(self.left_tx); + drop(self.right_tx); + + let mut outputs = Vec::new(); + let mut stream = self.result_stream; + + while let Some(result_batch) = stream.next().await { + outputs.push(result_batch?); + } + + Ok(outputs) + } +} + +pub struct InstantJoinOperator { + left_input_schema: FsSchemaRef, + right_input_schema: FsSchemaRef, + active_joins: BTreeMap, + left_receiver_hook: Arc>>>, + right_receiver_hook: Arc>>>, + join_exec_plan: Arc, +} + +impl InstantJoinOperator { + fn input_schema(&self, side: JoinSide) -> FsSchemaRef { + match side { + JoinSide::Left => self.left_input_schema.clone(), + JoinSide::Right => self.right_input_schema.clone(), + } + } + + fn get_or_create_join_instance(&mut self, time: SystemTime) -> Result<&mut JoinInstance> { + use std::collections::btree_map::Entry; + + if let Entry::Vacant(e) = self.active_joins.entry(time) { + let (left_tx, left_rx) = unbounded_channel(); + let (right_tx, right_rx) = unbounded_channel(); + + 
*self.left_receiver_hook.write().unwrap() = Some(left_rx); + *self.right_receiver_hook.write().unwrap() = Some(right_rx); + + self.join_exec_plan.reset().map_err(|e| anyhow!("{e}"))?; + let result_stream = self + .join_exec_plan + .execute(0, SessionContext::new().task_ctx()) + .map_err(|e| anyhow!("{e}"))?; + + e.insert(JoinInstance { + left_tx, + right_tx, + result_stream, + }); + } + + self.active_joins + .get_mut(&time) + .ok_or_else(|| anyhow!("join instance missing after insert")) + } + + async fn process_side_internal( + &mut self, + side: JoinSide, + batch: RecordBatch, + ctx: &mut TaskContext, + ) -> Result<()> { + if batch.num_rows() == 0 { + return Ok(()); + } + + let time_column = batch + .column(self.input_schema(side).timestamp_index) + .as_any() + .downcast_ref::() + .ok_or_else(|| anyhow!("Missing timestamp column"))?; + + let min_timestamp = min(time_column).ok_or_else(|| anyhow!("empty timestamp column"))?; + let max_timestamp = max(time_column).ok_or_else(|| anyhow!("empty timestamp column"))?; + + if let Some(watermark) = ctx.last_present_watermark() { + if watermark > from_nanos(min_timestamp as u128) { + warn!("Dropped late batch from {:?} before watermark", side); + return Ok(()); + } + } + + let unkeyed_batch = self.input_schema(side).unkeyed_batch(&batch)?; + + if max_timestamp == min_timestamp { + let time_key = from_nanos(max_timestamp as u128); + let join_instance = self.get_or_create_join_instance(time_key)?; + join_instance.feed_data(unkeyed_batch, side)?; + return Ok(()); + } + + let indices = sort_to_indices(time_column, None, None)?; + let columns: Vec<_> = unkeyed_batch + .columns() + .iter() + .map(|c| take(c, &indices, None).unwrap()) + .collect(); + let sorted_batch = RecordBatch::try_new(unkeyed_batch.schema(), columns)?; + let sorted_timestamps = take(time_column, &indices, None).unwrap(); + let typed_timestamps = sorted_timestamps + .as_any() + .downcast_ref::() + .ok_or_else(|| anyhow!("sorted timestamps downcast failed"))?; 
+ let ranges = partition(std::slice::from_ref(&sorted_timestamps)) + .unwrap() + .ranges(); + + for range in ranges { + let sub_batch = sorted_batch.slice(range.start, range.end - range.start); + let time_key = from_nanos(typed_timestamps.value(range.start) as u128); + let join_instance = self.get_or_create_join_instance(time_key)?; + join_instance.feed_data(sub_batch, side)?; + } + + Ok(()) + } +} + +#[async_trait] +impl Operator for InstantJoinOperator { + fn name(&self) -> &str { + "InstantJoin" + } + + async fn on_start(&mut self, _ctx: &mut TaskContext) -> Result<()> { + Ok(()) + } + + async fn process_data( + &mut self, + input_idx: usize, + batch: RecordBatch, + ctx: &mut TaskContext, + ) -> Result> { + let side = if input_idx == 0 { + JoinSide::Left + } else { + JoinSide::Right + }; + self.process_side_internal(side, batch, ctx).await?; + Ok(vec![]) + } + + async fn process_watermark( + &mut self, + watermark: Watermark, + _ctx: &mut TaskContext, + ) -> Result> { + let Watermark::EventTime(current_time) = watermark else { + return Ok(vec![]); + }; + let mut emit_outputs = Vec::new(); + + let mut expired_times = Vec::new(); + for key in self.active_joins.keys() { + if *key < current_time { + expired_times.push(*key); + } else { + break; + } + } + + for time_key in expired_times { + if let Some(join_instance) = self.active_joins.remove(&time_key) { + let joined_batches = join_instance.close_and_drain().await?; + for batch in joined_batches { + emit_outputs.push(StreamOutput::Forward(batch)); + } + } + } + + Ok(emit_outputs) + } + + async fn snapshot_state( + &mut self, + _barrier: CheckpointBarrier, + _ctx: &mut TaskContext, + ) -> Result<()> { + Ok(()) + } +} + +pub struct InstantJoinConstructor; + +impl InstantJoinConstructor { + pub fn with_config( + &self, + config: JoinOperator, + registry: Arc, + ) -> anyhow::Result { + let join_physical_plan_node = PhysicalPlanNode::decode(&mut config.join_plan.as_slice())?; + + let left_input_schema: Arc = + 
Arc::new(config.left_schema.unwrap().try_into()?); + let right_input_schema: Arc = + Arc::new(config.right_schema.unwrap().try_into()?); + + let left_receiver_hook = Arc::new(RwLock::new(None)); + let right_receiver_hook = Arc::new(RwLock::new(None)); + + let codec = FsPhysicalExtensionCodec { + context: DecodingContext::LockedJoinStream { + left: left_receiver_hook.clone(), + right: right_receiver_hook.clone(), + }, + }; + + let join_exec_plan = join_physical_plan_node.try_into_physical_plan( + registry.as_ref(), + &RuntimeEnvBuilder::new().build()?, + &codec, + )?; + + Ok(InstantJoinOperator { + left_input_schema, + right_input_schema, + active_joins: BTreeMap::new(), + left_receiver_hook, + right_receiver_hook, + join_exec_plan, + }) + } +} diff --git a/src/runtime/streaming/operators/joins/join_with_expiration.rs b/src/runtime/streaming/operators/joins/join_with_expiration.rs new file mode 100644 index 00000000..212cfaad --- /dev/null +++ b/src/runtime/streaming/operators/joins/join_with_expiration.rs @@ -0,0 +1,283 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ + +use anyhow::{anyhow, Result}; +use arrow::compute::concat_batches; +use arrow_array::RecordBatch; +use datafusion::execution::context::SessionContext; +use datafusion::execution::runtime_env::RuntimeEnvBuilder; +use datafusion::physical_plan::ExecutionPlan; +use datafusion_proto::{physical_plan::AsExecutionPlan, protobuf::PhysicalPlanNode}; +use futures::StreamExt; +use prost::Message; +use std::collections::VecDeque; +use std::sync::{Arc, RwLock}; +use std::time::{Duration, SystemTime}; +use tracing::warn; + +use crate::runtime::streaming::api::context::TaskContext; +use crate::runtime::streaming::api::operator::Operator; +use crate::runtime::streaming::factory::Registry; +use async_trait::async_trait; +use protocol::grpc::api::JoinOperator; +use crate::runtime::streaming::StreamOutput; +use crate::sql::common::{CheckpointBarrier, FsSchema, Watermark}; +use crate::sql::physical::{DecodingContext, FsPhysicalExtensionCodec}; + +#[derive(Debug, Copy, Clone, Eq, PartialEq)] +enum JoinSide { + Left, + Right, +} + +// ============================================================================ +// ============================================================================ + +struct StateBuffer { + batches: VecDeque<(SystemTime, RecordBatch)>, + ttl: Duration, +} + +impl StateBuffer { + fn new(ttl: Duration) -> Self { + Self { + batches: VecDeque::new(), + ttl, + } + } + + fn insert(&mut self, batch: RecordBatch, time: SystemTime) { + self.batches.push_back((time, batch)); + } + + fn expire(&mut self, current_time: SystemTime) { + let cutoff = current_time + .checked_sub(self.ttl) + .unwrap_or(SystemTime::UNIX_EPOCH); + while let Some((time, _)) = self.batches.front() { + if *time < cutoff { + self.batches.pop_front(); + } else { + break; + } + } + } + + fn get_all_batches(&self) -> Vec { + self.batches.iter().map(|(_, b)| b.clone()).collect() + } +} + +// ============================================================================ +// 
============================================================================ + +pub struct JoinWithExpirationOperator { + left_input_schema: FsSchema, + right_input_schema: FsSchema, + left_schema: FsSchema, + right_schema: FsSchema, + + left_passer: Arc>>, + right_passer: Arc>>, + join_exec_plan: Arc, + + left_state: StateBuffer, + right_state: StateBuffer, +} + +impl JoinWithExpirationOperator { + async fn compute_pair( + &mut self, + left: RecordBatch, + right: RecordBatch, + ) -> Result> { + if left.num_rows() == 0 || right.num_rows() == 0 { + return Ok(vec![]); + } + + { + self.left_passer.write().unwrap().replace(left); + self.right_passer.write().unwrap().replace(right); + } + + self.join_exec_plan + .reset() + .map_err(|e| anyhow!("join plan reset: {e}"))?; + + let mut result_stream = self + .join_exec_plan + .execute(0, SessionContext::new().task_ctx()) + .map_err(|e| anyhow!("join execute: {e}"))?; + + let mut outputs = Vec::new(); + while let Some(batch) = result_stream.next().await { + outputs.push(batch.map_err(|e| anyhow!("{e}"))?); + } + + Ok(outputs) + } + + async fn process_side( + &mut self, + side: JoinSide, + batch: RecordBatch, + ctx: &mut TaskContext, + ) -> Result> { + let current_time = ctx + .last_present_watermark() + .unwrap_or_else(SystemTime::now); + + self.left_state.expire(current_time); + self.right_state.expire(current_time); + + match side { + JoinSide::Left => self.left_state.insert(batch.clone(), current_time), + JoinSide::Right => self.right_state.insert(batch.clone(), current_time), + } + + let opposite_batches = match side { + JoinSide::Left => self.right_state.get_all_batches(), + JoinSide::Right => self.left_state.get_all_batches(), + }; + + if opposite_batches.is_empty() { + return Ok(vec![]); + } + + let opposite_schema = match side { + JoinSide::Left => &self.right_schema.schema, + JoinSide::Right => &self.left_schema.schema, + }; + let combined_opposite_batch = concat_batches(opposite_schema, opposite_batches.iter())?; + 
+ let unkeyed_target_batch = match side { + JoinSide::Left => self.left_input_schema.unkeyed_batch(&batch)?, + JoinSide::Right => self.right_input_schema.unkeyed_batch(&batch)?, + }; + + let (left_input, right_input) = match side { + JoinSide::Left => (unkeyed_target_batch, combined_opposite_batch), + JoinSide::Right => (combined_opposite_batch, unkeyed_target_batch), + }; + + let result_batches = self.compute_pair(left_input, right_input).await?; + + Ok(result_batches + .into_iter() + .map(StreamOutput::Forward) + .collect()) + } +} + +#[async_trait] +impl Operator for JoinWithExpirationOperator { + fn name(&self) -> &str { + "JoinWithExpiration" + } + + async fn on_start(&mut self, _ctx: &mut TaskContext) -> Result<()> { + Ok(()) + } + + async fn process_data( + &mut self, + input_idx: usize, + batch: RecordBatch, + ctx: &mut TaskContext, + ) -> Result> { + let side = if input_idx == 0 { + JoinSide::Left + } else { + JoinSide::Right + }; + self.process_side(side, batch, ctx).await + } + + async fn process_watermark( + &mut self, + _watermark: Watermark, + _ctx: &mut TaskContext, + ) -> Result> { + Ok(vec![]) + } + + async fn snapshot_state( + &mut self, + _barrier: CheckpointBarrier, + _ctx: &mut TaskContext, + ) -> Result<()> { + Ok(()) + } + + async fn on_close(&mut self, _ctx: &mut TaskContext) -> Result> { + Ok(vec![]) + } +} + +// ============================================================================ +// ============================================================================ + +pub struct JoinWithExpirationConstructor; + +impl JoinWithExpirationConstructor { + pub fn with_config( + &self, + config: JoinOperator, + registry: Arc, + ) -> anyhow::Result { + let left_passer = Arc::new(RwLock::new(None)); + let right_passer = Arc::new(RwLock::new(None)); + + let codec = FsPhysicalExtensionCodec { + context: DecodingContext::LockedJoinPair { + left: left_passer.clone(), + right: right_passer.clone(), + }, + }; + + let join_physical_plan_node = 
PhysicalPlanNode::decode(&mut config.join_plan.as_slice())?; + let join_exec_plan = join_physical_plan_node.try_into_physical_plan( + registry.as_ref(), + &RuntimeEnvBuilder::new().build()?, + &codec, + )?; + + let left_input_schema: FsSchema = config.left_schema.unwrap().try_into()?; + let right_input_schema: FsSchema = config.right_schema.unwrap().try_into()?; + let left_schema = left_input_schema.schema_without_keys()?; + let right_schema = right_input_schema.schema_without_keys()?; + + let mut ttl = Duration::from_micros( + config + .ttl_micros + .expect("ttl must be set for non-instant join"), + ); + + if ttl == Duration::ZERO { + warn!("TTL was not set for join with expiration, defaulting to 24 hours."); + ttl = Duration::from_secs(24 * 60 * 60); + } + + Ok(JoinWithExpirationOperator { + left_input_schema, + right_input_schema, + left_schema, + right_schema, + left_passer, + right_passer, + join_exec_plan, + left_state: StateBuffer::new(ttl), + right_state: StateBuffer::new(ttl), + }) + } +} diff --git a/src/runtime/streaming/operators/joins/mod.rs b/src/runtime/streaming/operators/joins/mod.rs new file mode 100644 index 00000000..1cc83d36 --- /dev/null +++ b/src/runtime/streaming/operators/joins/mod.rs @@ -0,0 +1,17 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +pub mod join_instance; +pub mod join_with_expiration; + +pub use join_instance::InstantJoinConstructor; +pub use join_with_expiration::JoinWithExpirationConstructor; diff --git a/src/runtime/streaming/operators/key_by.rs b/src/runtime/streaming/operators/key_by.rs new file mode 100644 index 00000000..edafc063 --- /dev/null +++ b/src/runtime/streaming/operators/key_by.rs @@ -0,0 +1,165 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + +use anyhow::{anyhow, Result}; +use arrow_array::{Array, RecordBatch, UInt64Array}; +use arrow::compute::{sort_to_indices, take}; +use async_trait::async_trait; +use datafusion::physical_expr::PhysicalExpr; +use datafusion_physical_expr::expressions::Column; +use datafusion_common::hash_utils::create_hashes; +use std::sync::Arc; + +use crate::runtime::streaming::api::context::TaskContext; +use crate::runtime::streaming::api::operator::Operator; +use crate::runtime::streaming::StreamOutput; +use crate::sql::common::{CheckpointBarrier, Watermark}; + +use protocol::grpc::api::KeyPlanOperator; + +pub struct KeyByOperator { + name: String, + key_extractors: Vec>, + random_state: ahash::RandomState, +} + +impl KeyByOperator { + pub fn new(name: String, key_extractors: Vec>) -> Self { + Self { + name, + key_extractors, + random_state: ahash::RandomState::new(), + } + } +} + +#[async_trait] +impl Operator for KeyByOperator { + fn name(&self) -> &str { + &self.name + } + + async fn on_start(&mut self, _ctx: &mut TaskContext) -> 
Result<()> { + Ok(()) + } + + async fn process_data( + &mut self, + _input_idx: usize, + batch: RecordBatch, + _ctx: &mut TaskContext, + ) -> Result> { + let num_rows = batch.num_rows(); + if num_rows == 0 { + return Ok(vec![]); + } + + let mut key_columns = Vec::with_capacity(self.key_extractors.len()); + for expr in &self.key_extractors { + let column_array = expr + .evaluate(&batch) + .map_err(|e| anyhow!("Failed to evaluate key expr: {}", e))? + .into_array(num_rows) + .map_err(|e| anyhow!("Failed to convert into array: {}", e))?; + key_columns.push(column_array); + } + + let mut hash_buffer = vec![0u64; num_rows]; + create_hashes(&key_columns, &self.random_state, &mut hash_buffer) + .map_err(|e| anyhow!("Failed to compute hashes: {}", e))?; + + let hash_array = UInt64Array::from(hash_buffer); + + let sorted_indices = sort_to_indices(&hash_array, None, None) + .map_err(|e| anyhow!("Failed to sort hashes: {}", e))?; + + let sorted_hashes_ref = take(&hash_array, &sorted_indices, None)?; + let sorted_hashes = sorted_hashes_ref + .as_any() + .downcast_ref::() + .unwrap(); + + let sorted_columns: std::result::Result, _> = batch + .columns() + .iter() + .map(|col| take(col, &sorted_indices, None)) + .collect(); + let sorted_batch = RecordBatch::try_new(batch.schema(), sorted_columns?)?; + + let mut outputs = Vec::new(); + let mut start_idx = 0; + + while start_idx < num_rows { + let current_hash = sorted_hashes.value(start_idx); + let mut end_idx = start_idx + 1; + while end_idx < num_rows && sorted_hashes.value(end_idx) == current_hash { + end_idx += 1; + } + + let sub_batch = sorted_batch.slice(start_idx, end_idx - start_idx); + outputs.push(StreamOutput::Keyed(current_hash, sub_batch)); + start_idx = end_idx; + } + + Ok(outputs) + } + + async fn process_watermark( + &mut self, + watermark: Watermark, + _ctx: &mut TaskContext, + ) -> Result> { + Ok(vec![StreamOutput::Watermark(watermark)]) + } + + async fn snapshot_state( + &mut self, + _barrier: CheckpointBarrier, 
+ _ctx: &mut TaskContext, + ) -> Result<()> { + Ok(()) + } + + async fn on_close(&mut self, _ctx: &mut TaskContext) -> Result> { + Ok(vec![]) + } +} + +// --------------------------------------------------------------------------- +// Constructor +// --------------------------------------------------------------------------- + +pub struct KeyByConstructor; + +impl KeyByConstructor { + pub fn with_config(&self, config: KeyPlanOperator) -> Result { + let mut key_extractors: Vec> = + Vec::with_capacity(config.key_fields.len()); + + for field_idx in &config.key_fields { + let idx = *field_idx as usize; + let expr = Arc::new(Column::new(&format!("col_{}", idx), idx)) + as Arc; + key_extractors.push(expr); + } + + let name = if config.name.is_empty() { + "KeyBy".to_string() + } else { + config.name.clone() + }; + + Ok(KeyByOperator::new(name, key_extractors)) + } +} + diff --git a/src/runtime/streaming/operators/key_operator.rs b/src/runtime/streaming/operators/key_operator.rs new file mode 100644 index 00000000..4a3942e0 --- /dev/null +++ b/src/runtime/streaming/operators/key_operator.rs @@ -0,0 +1,283 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! 
+ +use anyhow::{anyhow, Result}; +use arrow_array::{Array, ArrayRef, RecordBatch, UInt64Array}; +use arrow::compute::{sort_to_indices, take}; +use async_trait::async_trait; +use datafusion::physical_expr::PhysicalExpr; +use datafusion_physical_expr::expressions::Column; +use datafusion_common::hash_utils::create_hashes; +use futures::StreamExt; +use std::sync::Arc; + +use crate::runtime::streaming::api::context::TaskContext; +use crate::runtime::streaming::api::operator::Operator; +use crate::runtime::streaming::operators::StatelessPhysicalExecutor; +use crate::runtime::streaming::StreamOutput; +use crate::sql::common::{CheckpointBarrier, Watermark}; + +use protocol::grpc::api::KeyPlanOperator; + +pub struct KeyByOperator { + name: String, + key_extractors: Vec>, + random_state: ahash::RandomState, +} + +impl KeyByOperator { + pub fn new(name: String, key_extractors: Vec>) -> Self { + Self { + name, + key_extractors, + random_state: ahash::RandomState::new(), + } + } +} + +#[async_trait] +impl Operator for KeyByOperator { + fn name(&self) -> &str { + &self.name + } + + async fn on_start(&mut self, _ctx: &mut TaskContext) -> Result<()> { + Ok(()) + } + + async fn process_data( + &mut self, + _input_idx: usize, + batch: RecordBatch, + _ctx: &mut TaskContext, + ) -> Result> { + let num_rows = batch.num_rows(); + if num_rows == 0 { + return Ok(vec![]); + } + + let mut key_columns = Vec::with_capacity(self.key_extractors.len()); + for expr in &self.key_extractors { + let column_array = expr + .evaluate(&batch) + .map_err(|e| anyhow!("Failed to evaluate key expr: {}", e))? 
+ .into_array(num_rows) + .map_err(|e| anyhow!("Failed to convert into array: {}", e))?; + key_columns.push(column_array); + } + + let mut hash_buffer = vec![0u64; num_rows]; + create_hashes(&key_columns, &self.random_state, &mut hash_buffer) + .map_err(|e| anyhow!("Failed to compute hashes: {}", e))?; + + let hash_array = UInt64Array::from(hash_buffer); + + let sorted_indices = sort_to_indices(&hash_array, None, None) + .map_err(|e| anyhow!("Failed to sort hashes: {}", e))?; + + let sorted_hashes_ref = take(&hash_array, &sorted_indices, None)?; + let sorted_hashes = sorted_hashes_ref + .as_any() + .downcast_ref::() + .unwrap(); + + let sorted_columns: std::result::Result, _> = batch + .columns() + .iter() + .map(|col| take(col, &sorted_indices, None)) + .collect(); + let sorted_batch = RecordBatch::try_new(batch.schema(), sorted_columns?)?; + + let mut outputs = Vec::new(); + let mut start_idx = 0; + + while start_idx < num_rows { + let current_hash = sorted_hashes.value(start_idx); + let mut end_idx = start_idx + 1; + while end_idx < num_rows && sorted_hashes.value(end_idx) == current_hash { + end_idx += 1; + } + + let sub_batch = sorted_batch.slice(start_idx, end_idx - start_idx); + outputs.push(StreamOutput::Keyed(current_hash, sub_batch)); + start_idx = end_idx; + } + + Ok(outputs) + } + + async fn process_watermark( + &mut self, + watermark: Watermark, + _ctx: &mut TaskContext, + ) -> Result> { + Ok(vec![StreamOutput::Watermark(watermark)]) + } + + async fn snapshot_state( + &mut self, + _barrier: CheckpointBarrier, + _ctx: &mut TaskContext, + ) -> Result<()> { + Ok(()) + } + + async fn on_close(&mut self, _ctx: &mut TaskContext) -> Result> { + Ok(vec![]) + } +} + +// --------------------------------------------------------------------------- +// Constructor +// --------------------------------------------------------------------------- + +pub struct KeyByConstructor; + +impl KeyByConstructor { + pub fn with_config(&self, config: KeyPlanOperator) -> Result { 
+ let mut key_extractors: Vec> = + Vec::with_capacity(config.key_fields.len()); + + for field_idx in &config.key_fields { + let idx = *field_idx as usize; + let expr = Arc::new(Column::new(&format!("col_{}", idx), idx)) + as Arc; + key_extractors.push(expr); + } + + let name = if config.name.is_empty() { + "KeyBy".to_string() + } else { + config.name.clone() + }; + + Ok(KeyByOperator::new(name, key_extractors)) + } +} + +// =========================================================================== +// =========================================================================== + +pub struct KeyExecutionOperator { + name: String, + executor: StatelessPhysicalExecutor, + key_fields: Vec, + random_state: ahash::RandomState, +} + +impl KeyExecutionOperator { + pub fn new( + name: String, + executor: StatelessPhysicalExecutor, + key_fields: Vec, + ) -> Self { + Self { + name, + executor, + key_fields, + random_state: ahash::RandomState::new(), + } + } +} + +#[async_trait] +impl Operator for KeyExecutionOperator { + fn name(&self) -> &str { + &self.name + } + + async fn process_data( + &mut self, + _input_idx: usize, + batch: RecordBatch, + _ctx: &mut TaskContext, + ) -> Result> { + let mut outputs = Vec::new(); + + let mut stream = self.executor.process_batch(batch).await?; + + while let Some(batch_result) = stream.next().await { + let out_batch = batch_result?; + let num_rows = out_batch.num_rows(); + if num_rows == 0 { + continue; + } + + let key_columns: Vec = self + .key_fields + .iter() + .map(|&idx| out_batch.column(idx).clone()) + .collect(); + + let mut hash_buffer = vec![0u64; num_rows]; + create_hashes(&key_columns, &self.random_state, &mut hash_buffer) + .map_err(|e| anyhow!("hash compute: {e}"))?; + let hash_array = UInt64Array::from(hash_buffer); + + let sorted_indices = sort_to_indices(&hash_array, None, None) + .map_err(|e| anyhow!("sort hashes: {e}"))?; + + let sorted_hashes_ref = take(&hash_array, &sorted_indices, None)?; + let sorted_hashes = 
sorted_hashes_ref + .as_any() + .downcast_ref::() + .unwrap(); + + let sorted_columns: std::result::Result, _> = out_batch + .columns() + .iter() + .map(|col| take(col, &sorted_indices, None)) + .collect(); + let sorted_batch = + RecordBatch::try_new(out_batch.schema(), sorted_columns?)?; + + let mut start_idx = 0; + while start_idx < num_rows { + let current_hash = sorted_hashes.value(start_idx); + let mut end_idx = start_idx + 1; + while end_idx < num_rows + && sorted_hashes.value(end_idx) == current_hash + { + end_idx += 1; + } + + let sub_batch = sorted_batch.slice(start_idx, end_idx - start_idx); + outputs.push(StreamOutput::Keyed(current_hash, sub_batch)); + start_idx = end_idx; + } + } + Ok(outputs) + } + + async fn process_watermark( + &mut self, + watermark: Watermark, + _ctx: &mut TaskContext, + ) -> Result> { + Ok(vec![StreamOutput::Watermark(watermark)]) + } + + async fn snapshot_state( + &mut self, + _barrier: CheckpointBarrier, + _ctx: &mut TaskContext, + ) -> Result<()> { + Ok(()) + } + + async fn on_close(&mut self, _ctx: &mut TaskContext) -> Result> { + Ok(vec![]) + } +} + diff --git a/src/runtime/streaming/operators/mod.rs b/src/runtime/streaming/operators/mod.rs new file mode 100644 index 00000000..ffe1c101 --- /dev/null +++ b/src/runtime/streaming/operators/mod.rs @@ -0,0 +1,30 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ + +pub mod grouping; +pub mod joins; +pub mod key_by; +pub mod sink; +pub mod source; +pub mod watermark; +pub mod windows; +mod key_operator; +pub mod projection; +mod stateless_physical_executor; +mod value_execution; + +pub use stateless_physical_executor::StatelessPhysicalExecutor; +pub use projection::ProjectionOperator; +pub use value_execution::ValueExecutionOperator; + +pub use grouping::{Key, UpdatingCache}; diff --git a/src/runtime/streaming/operators/projection.rs b/src/runtime/streaming/operators/projection.rs new file mode 100644 index 00000000..0136e18e --- /dev/null +++ b/src/runtime/streaming/operators/projection.rs @@ -0,0 +1,140 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use anyhow::{anyhow, Result}; +use arrow_array::RecordBatch; +use async_trait::async_trait; +use datafusion::physical_expr::PhysicalExpr; +use datafusion_proto::physical_plan::DefaultPhysicalExtensionCodec; +use datafusion_proto::physical_plan::from_proto::parse_physical_expr; +use datafusion_proto::protobuf::PhysicalExprNode; +use prost::Message; +use std::sync::Arc; + +use protocol::grpc::api::ProjectionOperator as ProjectionOperatorProto; + +use crate::runtime::streaming::api::context::TaskContext; +use crate::runtime::streaming::api::operator::Operator; +use crate::runtime::streaming::factory::global::Registry; +use crate::runtime::streaming::StreamOutput; +use crate::sql::common::{CheckpointBarrier, FsSchema, FsSchemaRef, Watermark}; +use crate::sql::logical_node::logical::OperatorName; + +pub struct ProjectionOperator { + name: String, + output_schema: FsSchemaRef, + exprs: Vec>, +} + +impl ProjectionOperator { + pub fn new( + name: String, + output_schema: FsSchemaRef, + exprs: Vec>, + ) -> Self { + Self { + name, + output_schema, + exprs, + } + } + + pub fn from_proto( + config: ProjectionOperatorProto, + registry: Arc, + ) -> Result { + let input_schema: FsSchema = config + .input_schema + .ok_or_else(|| anyhow!("missing projection input_schema"))? + .try_into() + .map_err(|e| anyhow!("projection input_schema: {e}"))?; + + let output_schema: FsSchema = config + .output_schema + .ok_or_else(|| anyhow!("missing projection output_schema"))? 
+ .try_into() + .map_err(|e| anyhow!("projection output_schema: {e}"))?; + + let exprs = config + .exprs + .iter() + .map(|raw| { + let expr_node = PhysicalExprNode::decode(&mut raw.as_slice()) + .map_err(|e| anyhow!("decode projection expr: {e}"))?; + parse_physical_expr( + &expr_node, + registry.as_ref(), + &input_schema.schema, + &DefaultPhysicalExtensionCodec {}, + ) + .map_err(|e| anyhow!("parse projection expr: {e}")) + }) + .collect::>>()?; + + let name = if config.name.is_empty() { + OperatorName::Projection.as_registry_key().to_string() + } else { + config.name + }; + + Ok(Self::new(name, Arc::new(output_schema), exprs)) + + } +} + +#[async_trait] +impl Operator for ProjectionOperator { + fn name(&self) -> &str { + &self.name + } + + async fn process_data( + &mut self, + _input_idx: usize, + batch: RecordBatch, + _ctx: &mut TaskContext, + ) -> Result> { + if batch.num_rows() == 0 { + return Ok(vec![]); + } + + let projected_columns = self + .exprs + .iter() + .map(|expr| { + expr.evaluate(&batch) + .and_then(|val| val.into_array(batch.num_rows())) + }) + .collect::>>()?; + + let out_batch = + RecordBatch::try_new(self.output_schema.schema.clone(), projected_columns)?; + + Ok(vec![StreamOutput::Forward(out_batch)]) + } + + async fn process_watermark( + &mut self, + watermark: Watermark, + _ctx: &mut TaskContext, + ) -> Result> { + Ok(vec![StreamOutput::Watermark(watermark)]) + } + + async fn snapshot_state( + &mut self, + _barrier: CheckpointBarrier, + _ctx: &mut TaskContext, + ) -> Result<()> { + Ok(()) + } +} diff --git a/src/runtime/streaming/operators/sink/kafka/mod.rs b/src/runtime/streaming/operators/sink/kafka/mod.rs new file mode 100644 index 00000000..4b6d48cb --- /dev/null +++ b/src/runtime/streaming/operators/sink/kafka/mod.rs @@ -0,0 +1,358 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + +use anyhow::{anyhow, bail, Result}; +use arrow_array::cast::AsArray; +use arrow_array::Array; +use arrow_array::RecordBatch; +use arrow_schema::{DataType, TimeUnit}; +use async_trait::async_trait; +use rdkafka::error::{KafkaError, RDKafkaErrorCode}; +use rdkafka::producer::{DeliveryFuture, FutureProducer, FutureRecord, Producer}; +use rdkafka::util::Timeout; +use rdkafka::ClientConfig; +use std::collections::HashMap; +use std::time::Duration; +use tokio::time::sleep; +use tracing::{info, warn}; + +use crate::runtime::streaming::api::context::TaskContext; +use crate::runtime::streaming::api::operator::Operator; +use crate::runtime::streaming::format::DataSerializer; +use crate::runtime::streaming::StreamOutput; +use crate::sql::common::constants::factory_operator_name; +use crate::sql::common::{CheckpointBarrier, FsSchema, Watermark}; +// ============================================================================ +// ============================================================================ + +#[derive(Debug, Clone)] +pub enum ConsistencyMode { + AtLeastOnce, + ExactlyOnce, +} + +struct TransactionalState { + next_transaction_index: usize, + active_producer: FutureProducer, + producer_awaiting_commit: Option, +} + +// ============================================================================ +// ============================================================================ + +pub struct KafkaSinkOperator { + pub topic: String, + pub bootstrap_servers: String, + pub consistency_mode: ConsistencyMode, + pub client_config: HashMap, + + pub 
input_schema: FsSchema, + pub timestamp_col_idx: Option, + pub key_col_idx: Option, + + pub serializer: DataSerializer, + + at_least_once_producer: Option, + transactional_state: Option, + + write_futures: Vec, +} + +impl KafkaSinkOperator { + pub fn new( + topic: String, + bootstrap_servers: String, + consistency_mode: ConsistencyMode, + client_config: HashMap, + input_schema: FsSchema, + serializer: DataSerializer, + ) -> Self { + Self { + topic, + bootstrap_servers, + consistency_mode, + client_config, + input_schema, + timestamp_col_idx: None, + key_col_idx: None, + serializer, + at_least_once_producer: None, + transactional_state: None, + write_futures: Vec::new(), + } + } + + fn resolve_schema_indices(&mut self) { + self.timestamp_col_idx = Some(self.input_schema.timestamp_index); + + if let Some(routing_keys) = self.input_schema.routing_keys() { + if !routing_keys.is_empty() { + self.key_col_idx = Some(routing_keys[0]); + } + } + } + + fn create_producer(&self, ctx: &TaskContext, tx_index: Option) -> Result { + let mut config = ClientConfig::new(); + config.set("bootstrap.servers", &self.bootstrap_servers); + + for (k, v) in &self.client_config { + config.set(k, v); + } + + if let Some(idx) = tx_index { + config.set("enable.idempotence", "true"); + let transactional_id = format!( + "fs-tx-{}-{}-{}-{}", + ctx.job_id, self.topic, ctx.subtask_idx, idx + ); + config.set("transactional.id", &transactional_id); + + let producer: FutureProducer = config.create()?; + producer + .init_transactions(Timeout::After(Duration::from_secs(30))) + .map_err(|e| anyhow!("Failed to init Kafka transactions: {}", e))?; + producer + .begin_transaction() + .map_err(|e| anyhow!("Failed to begin Kafka transaction: {}", e))?; + + Ok(producer) + } else { + Ok(config.create()?) + } + } + + async fn flush_to_broker(&mut self) -> Result<()> { + let producer = self.current_producer(); + + producer.poll(Timeout::After(Duration::ZERO)); + + for future in self.write_futures.drain(..) 
{ + match future.await { + Ok(Ok(_)) => continue, + Ok(Err((e, _))) => bail!("Kafka producer delivery failed: {}", e), + Err(_) => bail!("Kafka delivery future canceled"), + } + } + Ok(()) + } + + fn current_producer(&self) -> &FutureProducer { + match &self.consistency_mode { + ConsistencyMode::AtLeastOnce => self.at_least_once_producer.as_ref().unwrap(), + ConsistencyMode::ExactlyOnce => &self.transactional_state.as_ref().unwrap().active_producer, + } + } +} + +fn event_timestamp_ms(batch: &RecordBatch, row: usize, col: usize) -> Option { + let arr = batch.column(col); + match arr.data_type() { + DataType::Timestamp(TimeUnit::Second, _) => { + let a = arr.as_primitive::(); + (!a.is_null(row)).then(|| a.value(row) * 1000) + } + DataType::Timestamp(TimeUnit::Millisecond, _) => { + let a = arr.as_primitive::(); + (!a.is_null(row)).then(|| a.value(row)) + } + DataType::Timestamp(TimeUnit::Microsecond, _) => { + let a = arr.as_primitive::(); + (!a.is_null(row)).then(|| a.value(row) / 1000) + } + DataType::Timestamp(TimeUnit::Nanosecond, _) => { + let a = arr.as_primitive::(); + (!a.is_null(row)).then(|| a.value(row) / 1_000_000) + } + _ => None, + } +} + +fn row_key_bytes(batch: &RecordBatch, row: usize, col: usize) -> Option> { + let arr = batch.column(col); + match arr.data_type() { + DataType::Utf8 => { + let s = arr.as_string::(); + if s.is_null(row) { + None + } else { + Some(s.value(row).as_bytes().to_vec()) + } + } + DataType::LargeUtf8 => { + let s = arr.as_string::(); + if s.is_null(row) { + None + } else { + Some(s.value(row).as_bytes().to_vec()) + } + } + _ => None, + } +} + +// ============================================================================ +// ============================================================================ + +#[async_trait] +impl Operator for KafkaSinkOperator { + fn name(&self) -> &str { + factory_operator_name::KAFKA_SINK + } + + async fn on_start(&mut self, ctx: &mut TaskContext) -> Result<()> { + 
self.resolve_schema_indices(); + + match self.consistency_mode { + ConsistencyMode::AtLeastOnce => { + self.at_least_once_producer = Some(self.create_producer(ctx, None)?); + } + ConsistencyMode::ExactlyOnce => { + let mut next_idx = 0usize; + + let active_producer = self.create_producer(ctx, Some(next_idx))?; + next_idx += 1; + + self.transactional_state = Some(TransactionalState { + next_transaction_index: next_idx, + active_producer, + producer_awaiting_commit: None, + }); + } + } + Ok(()) + } + + async fn process_data( + &mut self, + _input_idx: usize, + batch: RecordBatch, + _ctx: &mut TaskContext, + ) -> Result> { + let payloads = self.serializer.serialize(&batch)?; + let producer = self.current_producer().clone(); + + for (i, payload) in payloads.iter().enumerate() { + let ts_millis = self + .timestamp_col_idx + .and_then(|idx| event_timestamp_ms(&batch, i, idx)); + let key_bytes = self + .key_col_idx + .and_then(|idx| row_key_bytes(&batch, i, idx)); + + let mut record = FutureRecord::, Vec>::to(&self.topic).payload(&payload); + if let Some(ts) = ts_millis { + record = record.timestamp(ts); + } + if let Some(ref k) = key_bytes { + record = record.key(k); + } + + loop { + match producer.send_result(record) { + Ok(delivery_future) => { + self.write_futures.push(delivery_future); + break; + } + Err((KafkaError::MessageProduction(RDKafkaErrorCode::QueueFull), returned_record)) => { + record = returned_record; + sleep(Duration::from_millis(10)).await; + } + Err((e, _)) => bail!("Fatal Kafka send error: {}", e), + } + } + } + + Ok(vec![]) + } + + async fn process_watermark( + &mut self, + _watermark: Watermark, + _ctx: &mut TaskContext, + ) -> Result> { + Ok(vec![]) + } + + async fn snapshot_state( + &mut self, + _barrier: CheckpointBarrier, + ctx: &mut TaskContext, + ) -> Result<()> { + self.flush_to_broker().await?; + + if matches!(self.consistency_mode, ConsistencyMode::ExactlyOnce) { + let next_tx = self + .transactional_state + .as_ref() + .map(|s| 
s.next_transaction_index) + .unwrap(); + let new_producer = self.create_producer(ctx, Some(next_tx))?; + + let state = self.transactional_state.as_mut().unwrap(); + let old_producer = std::mem::replace(&mut state.active_producer, new_producer); + state.producer_awaiting_commit = Some(old_producer); + + state.next_transaction_index += 1; + } + + Ok(()) + } + + async fn commit_checkpoint(&mut self, epoch: u32, _ctx: &mut TaskContext) -> Result<()> { + if matches!(self.consistency_mode, ConsistencyMode::AtLeastOnce) { + return Ok(()); + } + + let state = self.transactional_state.as_mut().unwrap(); + let Some(committing_producer) = state.producer_awaiting_commit.take() else { + warn!( + "Received Commit for epoch {}, but no stashed producer exists. Possibly a recovery duplicate.", + epoch + ); + return Ok(()); + }; + + let mut retries = 0; + loop { + match committing_producer.commit_transaction(Timeout::After(Duration::from_secs(10))) { + Ok(_) => { + info!("Successfully committed Kafka transaction for epoch {}", epoch); + break; + } + Err(e) => { + retries += 1; + if retries >= 5 { + bail!( + "Failed to commit Kafka transaction after 5 retries. Fatal error: {}", + e + ); + } + warn!( + "Failed to commit Kafka transaction (Attempt {}/5): {}. Retrying...", + retries, e + ); + sleep(Duration::from_secs(2)).await; + } + } + } + + Ok(()) + } + + async fn on_close(&mut self, _ctx: &mut TaskContext) -> Result> { + self.flush_to_broker().await?; + info!("Kafka sink shut down gracefully."); + Ok(vec![]) + } +} diff --git a/src/runtime/streaming/operators/sink/mod.rs b/src/runtime/streaming/operators/sink/mod.rs new file mode 100644 index 00000000..aa340614 --- /dev/null +++ b/src/runtime/streaming/operators/sink/mod.rs @@ -0,0 +1,15 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + +pub mod kafka; + diff --git a/src/runtime/streaming/operators/source/kafka/mod.rs b/src/runtime/streaming/operators/source/kafka/mod.rs new file mode 100644 index 00000000..d0de692a --- /dev/null +++ b/src/runtime/streaming/operators/source/kafka/mod.rs @@ -0,0 +1,377 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ + +use anyhow::{anyhow, Context as _, Result}; +use arrow_array::RecordBatch; +use arrow_schema::SchemaRef; +use async_trait::async_trait; +use bincode::{Decode, Encode}; +use governor::{DefaultDirectRateLimiter, Quota, RateLimiter as GovernorRateLimiter}; +use rdkafka::consumer::{CommitMode, Consumer, StreamConsumer}; +use rdkafka::{ClientConfig, Message as KMessage, Offset, TopicPartitionList}; +use std::collections::HashMap; +use std::num::NonZeroU32; +use std::time::{Duration, Instant}; +use tracing::{debug, error, info, warn}; + +use crate::runtime::streaming::api::context::TaskContext; +use crate::runtime::streaming::api::source::{SourceEvent, SourceOffset, SourceOperator}; +use crate::runtime::streaming::format::{BadDataPolicy, DataDeserializer, Format}; +use crate::sql::common::{CheckpointBarrier, MetadataField}; +use crate::sql::common::fs_schema::FieldValueType; +// ============================================================================ +// ============================================================================ + +#[derive(Copy, Clone, Debug, Encode, Decode, PartialEq, PartialOrd)] +pub struct KafkaState { + partition: i32, + offset: i64, +} + +pub trait BatchDeserializer: Send + 'static { + fn deserialize_slice( + &mut self, + payload: &[u8], + timestamp: u64, + metadata: Option>>, + ) -> Result<()>; + + fn should_flush(&self) -> bool; + + fn flush_buffer(&mut self) -> Result>; + + fn is_empty(&self) -> bool; +} + +// --------------------------------------------------------------------------- +// --------------------------------------------------------------------------- + +pub struct BufferedDeserializer { + inner: DataDeserializer, + buffer: Vec>, + batch_size: usize, +} + +impl BufferedDeserializer { + pub fn new(format: Format, schema: SchemaRef, bad_data_policy: BadDataPolicy, batch_size: usize) -> Self { + Self { + inner: DataDeserializer::new(format, schema, bad_data_policy), + buffer: Vec::with_capacity(batch_size), + batch_size, + } 
+ } +} + +impl BatchDeserializer for BufferedDeserializer { + fn deserialize_slice( + &mut self, + payload: &[u8], + _timestamp: u64, + _metadata: Option>>, + ) -> Result<()> { + self.buffer.push(payload.to_vec()); + Ok(()) + } + + fn should_flush(&self) -> bool { + self.buffer.len() >= self.batch_size + } + + fn flush_buffer(&mut self) -> Result> { + if self.buffer.is_empty() { + return Ok(None); + } + + let refs: Vec<&[u8]> = self.buffer.iter().map(|v| v.as_slice()).collect(); + let batch = self.inner.deserialize_batch(&refs)?; + self.buffer.clear(); + Ok(Some(batch)) + } + + fn is_empty(&self) -> bool { + self.buffer.is_empty() + } +} + +impl SourceOffset { + fn rdkafka_offset(self) -> Offset { + match self { + SourceOffset::Earliest => Offset::Beginning, + SourceOffset::Latest => Offset::End, + SourceOffset::Group => Offset::Stored, + } + } +} + +// ============================================================================ +// ============================================================================ + +const KAFKA_POLL_TIMEOUT: Duration = Duration::from_millis(100); +const MAX_BATCH_LINGER_TIME: Duration = Duration::from_millis(500); + +pub struct KafkaSourceOperator { + pub topic: String, + pub bootstrap_servers: String, + pub group_id: Option, + pub group_id_prefix: Option, + pub offset_mode: SourceOffset, + + pub client_configs: HashMap, + pub messages_per_second: NonZeroU32, + pub metadata_fields: Vec, + + consumer: Option, + rate_limiter: Option, + deserializer: Box, + + current_offsets: HashMap, + is_empty_assignment: bool, + + last_flush_time: Instant, +} + +impl KafkaSourceOperator { + pub fn new( + topic: String, + bootstrap_servers: String, + group_id: Option, + group_id_prefix: Option, + offset_mode: SourceOffset, + client_configs: HashMap, + messages_per_second: NonZeroU32, + metadata_fields: Vec, + deserializer: Box, + ) -> Self { + Self { + topic, + bootstrap_servers, + group_id, + group_id_prefix, + offset_mode, + client_configs, + 
messages_per_second, + metadata_fields, + consumer: None, + rate_limiter: None, + deserializer, + current_offsets: HashMap::new(), + is_empty_assignment: false, + last_flush_time: Instant::now(), + } + } + + async fn init_and_assign_consumer(&mut self, ctx: &mut TaskContext) -> Result<()> { + info!("Creating kafka consumer for {}", self.bootstrap_servers); + let mut client_config = ClientConfig::new(); + + let group_id = match (&self.group_id, &self.group_id_prefix) { + (Some(gid), _) => gid.clone(), + (None, Some(prefix)) => { + format!("{}-fs-{}-{}", prefix, ctx.job_id, ctx.subtask_idx) + } + (None, None) => format!("fs-{}-{}-consumer", ctx.job_id, ctx.subtask_idx), + }; + + for (key, value) in &self.client_configs { + client_config.set(key, value); + } + + let consumer: StreamConsumer = client_config + .set("bootstrap.servers", &self.bootstrap_servers) + .set("enable.partition.eof", "false") + .set("enable.auto.commit", "false") + .set("group.id", &group_id) + .create()?; + + let has_state = false; + let state_map: HashMap = HashMap::new(); + + let metadata = consumer + .fetch_metadata(Some(&self.topic), Duration::from_secs(30)) + .context("Failed to fetch Kafka metadata")?; + + let topic_meta = metadata + .topics() + .iter() + .find(|t| t.name() == self.topic) + .ok_or_else(|| anyhow!("topic {} not in metadata", self.topic))?; + + let partitions = topic_meta.partitions(); + let mut our_partitions = HashMap::new(); + let pmax = ctx.parallelism.max(1) as i32; + + for p in partitions { + if p.id().rem_euclid(pmax) == ctx.subtask_idx as i32 { + let offset = state_map + .get(&p.id()) + .map(|s| Offset::Offset(s.offset)) + .unwrap_or_else(|| { + if has_state { + Offset::Beginning + } else { + self.offset_mode.rdkafka_offset() + } + }); + our_partitions.insert((self.topic.clone(), p.id()), offset); + } + } + + if our_partitions.is_empty() { + warn!( + "[Task {}] Subscribed to no partitions. 
Entering idle mode.", + ctx.subtask_idx + ); + self.is_empty_assignment = true; + } else { + let topic_partitions = TopicPartitionList::from_topic_map(&our_partitions)?; + consumer.assign(&topic_partitions)?; + } + + self.consumer = Some(consumer); + Ok(()) + } +} + +// ============================================================================ +// ============================================================================ + +#[async_trait] +impl SourceOperator for KafkaSourceOperator { + fn name(&self) -> &str { + &self.topic + } + + async fn on_start(&mut self, ctx: &mut TaskContext) -> Result<()> { + self.init_and_assign_consumer(ctx).await?; + self.rate_limiter = Some(GovernorRateLimiter::direct(Quota::per_second( + self.messages_per_second, + ))); + Ok(()) + } + + async fn fetch_next(&mut self, _ctx: &mut TaskContext) -> Result { + if self.is_empty_assignment { + return Ok(SourceEvent::Idle); + } + + let consumer = self + .consumer + .as_ref() + .ok_or_else(|| anyhow!("Kafka consumer not initialized"))?; + let rate_limiter = self + .rate_limiter + .as_ref() + .ok_or_else(|| anyhow!("rate limiter not initialized"))?; + + match tokio::time::timeout(KAFKA_POLL_TIMEOUT, consumer.recv()).await { + Ok(Ok(msg)) => { + let partition = msg.partition(); + let offset = msg.offset(); + let timestamp = msg.timestamp().to_millis().unwrap_or(0); + + self.current_offsets.insert(partition, offset); + + if let Some(payload) = msg.payload() { + let topic = msg.topic(); + + let connector_metadata = if !self.metadata_fields.is_empty() { + let mut meta = HashMap::new(); + for f in &self.metadata_fields { + meta.insert( + f.field_name.as_str(), + match f.key.as_str() { + "key" => FieldValueType::Bytes(msg.key()), + "offset_id" => FieldValueType::Int64(Some(msg.offset())), + "partition" => FieldValueType::Int32(Some(msg.partition())), + "topic" => FieldValueType::String(Some(topic)), + "timestamp" => FieldValueType::Int64(Some(timestamp)), + _ => continue, + }, + ); + } + 
Some(meta) + } else { + None + }; + + self.deserializer.deserialize_slice( + payload, + timestamp.max(0) as u64, + connector_metadata, + )?; + } else { + debug!( + "Received tombstone message at partition {} offset {}", + partition, offset + ); + } + + rate_limiter.until_ready().await; + + let should_flush_by_size = self.deserializer.should_flush(); + let should_flush_by_time = self.last_flush_time.elapsed() > MAX_BATCH_LINGER_TIME; + + if !self.deserializer.is_empty() && (should_flush_by_size || should_flush_by_time) { + if let Some(batch) = self.deserializer.flush_buffer()? { + self.last_flush_time = Instant::now(); + return Ok(SourceEvent::Data(batch)); + } + } + + Ok(SourceEvent::Idle) + } + Ok(Err(e)) => { + error!("Kafka recv error: {}", e); + Err(anyhow!("Kafka error: {}", e)) + } + Err(_) => { + if !self.deserializer.is_empty() { + if let Some(batch) = self.deserializer.flush_buffer()? { + self.last_flush_time = Instant::now(); + return Ok(SourceEvent::Data(batch)); + } + } + Ok(SourceEvent::Idle) + } + } + } + + async fn snapshot_state( + &mut self, + _barrier: CheckpointBarrier, + ctx: &mut TaskContext, + ) -> Result<()> { + debug!("Source [{}] executing checkpoint", ctx.subtask_idx); + + let mut topic_partitions = TopicPartitionList::new(); + for (&partition, &offset) in &self.current_offsets { + topic_partitions + .add_partition_offset(&self.topic, partition, Offset::Offset(offset)) + .map_err(|e| anyhow!("add_partition_offset: {e}"))?; + } + + if let Some(consumer) = &self.consumer { + if let Err(e) = consumer.commit(&topic_partitions, CommitMode::Async) { + warn!("Failed to commit async offset to Kafka Broker: {:?}", e); + } + } + + Ok(()) + } + + async fn on_close(&mut self, _ctx: &mut TaskContext) -> Result<()> { + info!("Kafka source shutting down gracefully"); + self.consumer.take(); + Ok(()) + } +} diff --git a/src/runtime/streaming/operators/source/mod.rs b/src/runtime/streaming/operators/source/mod.rs new file mode 100644 index 
00000000..aa340614 --- /dev/null +++ b/src/runtime/streaming/operators/source/mod.rs @@ -0,0 +1,15 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + +pub mod kafka; + diff --git a/src/runtime/streaming/operators/stateless_physical_executor.rs b/src/runtime/streaming/operators/stateless_physical_executor.rs new file mode 100644 index 00000000..6c1e5c90 --- /dev/null +++ b/src/runtime/streaming/operators/stateless_physical_executor.rs @@ -0,0 +1,88 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ + +use std::sync::{Arc, RwLock}; + +use anyhow::{anyhow, Result}; +use arrow_array::RecordBatch; +use datafusion::execution::context::SessionContext; +use datafusion::execution::runtime_env::RuntimeEnvBuilder; +use datafusion::execution::SendableRecordBatchStream; +use datafusion::execution::TaskContext; +use datafusion::physical_plan::ExecutionPlan; +use datafusion_proto::physical_plan::AsExecutionPlan; +use datafusion_proto::protobuf::PhysicalPlanNode; +use futures::StreamExt; +use prost::Message; + +use crate::runtime::streaming::factory::Registry; +use crate::sql::physical::{DecodingContext, FsPhysicalExtensionCodec}; + +pub struct StatelessPhysicalExecutor { + batch: Arc>>, + plan: Arc, + task_context: Arc, +} + +impl StatelessPhysicalExecutor { + pub fn new(mut proto: &[u8], registry: &Registry) -> Result { + let batch = Arc::new(RwLock::default()); + + let plan_node = PhysicalPlanNode::decode(&mut proto) + .map_err(|e| anyhow!("decode PhysicalPlanNode: {e}"))?; + let codec = FsPhysicalExtensionCodec { + context: DecodingContext::SingleLockedBatch(batch.clone()), + }; + + let plan = plan_node.try_into_physical_plan( + registry, + &RuntimeEnvBuilder::new().build()?, + &codec, + )?; + + Ok(Self { + batch, + plan, + task_context: SessionContext::new().task_ctx(), + }) + } + + pub async fn process_batch(&mut self, batch: RecordBatch) -> Result { + { + let mut writer = self + .batch + .write() + .map_err(|e| anyhow!("SingleLockedBatch lock: {e}"))?; + *writer = Some(batch); + } + self.plan + .reset() + .map_err(|e| anyhow!("reset execution plan: {e}"))?; + self.plan + .execute(0, self.task_context.clone()) + .map_err(|e| anyhow!("failed to compute plan: {e}")) + } + + pub async fn process_single(&mut self, batch: RecordBatch) -> Result { + let mut stream = self.process_batch(batch).await?; + let result = stream + .next() + .await + .ok_or_else(|| anyhow!("empty output stream"))??; + anyhow::ensure!( + stream.next().await.is_none(), + "expected exactly one output 
batch" + ); + Ok(result) + } +} diff --git a/src/runtime/streaming/operators/value_execution.rs b/src/runtime/streaming/operators/value_execution.rs new file mode 100644 index 00000000..effdf5f6 --- /dev/null +++ b/src/runtime/streaming/operators/value_execution.rs @@ -0,0 +1,76 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + +use anyhow::Result; +use arrow_array::RecordBatch; +use async_trait::async_trait; +use futures::StreamExt; + +use crate::runtime::streaming::api::context::TaskContext; +use crate::runtime::streaming::api::operator::Operator; +use crate::runtime::streaming::operators::StatelessPhysicalExecutor; +use crate::runtime::streaming::StreamOutput; +use crate::sql::common::{CheckpointBarrier, Watermark}; + +pub struct ValueExecutionOperator { + name: String, + executor: StatelessPhysicalExecutor, +} + +impl ValueExecutionOperator { + pub fn new(name: String, executor: StatelessPhysicalExecutor) -> Self { + Self { name, executor } + } +} + +#[async_trait] +impl Operator for ValueExecutionOperator { + fn name(&self) -> &str { + &self.name + } + + async fn process_data( + &mut self, + _input_idx: usize, + batch: RecordBatch, + _ctx: &mut TaskContext, + ) -> Result> { + let mut outputs = Vec::new(); + + let mut stream = self.executor.process_batch(batch).await?; + + while let Some(batch_result) = stream.next().await { + let out_batch = batch_result?; + if out_batch.num_rows() > 0 { + outputs.push(StreamOutput::Forward(out_batch)); + } + } + 
Ok(outputs) + } + + async fn process_watermark( + &mut self, + watermark: Watermark, + _ctx: &mut TaskContext, + ) -> Result> { + Ok(vec![StreamOutput::Watermark(watermark)]) + } + + async fn snapshot_state( + &mut self, + _barrier: CheckpointBarrier, + _ctx: &mut TaskContext, + ) -> Result<()> { + Ok(()) + } +} diff --git a/src/runtime/streaming/operators/watermark/mod.rs b/src/runtime/streaming/operators/watermark/mod.rs new file mode 100644 index 00000000..3a0a1099 --- /dev/null +++ b/src/runtime/streaming/operators/watermark/mod.rs @@ -0,0 +1,15 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +pub mod watermark_generator; + +pub use watermark_generator::WatermarkGeneratorConstructor; diff --git a/src/runtime/streaming/operators/watermark/watermark_generator.rs b/src/runtime/streaming/operators/watermark/watermark_generator.rs new file mode 100644 index 00000000..0fee4a38 --- /dev/null +++ b/src/runtime/streaming/operators/watermark/watermark_generator.rs @@ -0,0 +1,239 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + + +use anyhow::{anyhow, Result}; +use arrow::compute::kernels::aggregate; +use arrow_array::cast::AsArray; +use arrow_array::types::TimestampNanosecondType; +use arrow_array::{RecordBatch, TimestampNanosecondArray}; +use bincode::{Decode, Encode}; +use datafusion::physical_expr::PhysicalExpr; +use datafusion_proto::physical_plan::DefaultPhysicalExtensionCodec; +use datafusion_proto::physical_plan::from_proto::parse_physical_expr; +use datafusion_proto::protobuf::PhysicalExprNode; +use prost::Message; +use std::sync::Arc; +use std::time::{Duration, SystemTime}; +use tracing::{debug, info}; + +use crate::runtime::streaming::api::context::TaskContext; +use crate::runtime::streaming::api::operator::Operator; +use crate::runtime::streaming::factory::Registry; +use async_trait::async_trait; +use protocol::grpc::api::ExpressionWatermarkConfig; +use crate::runtime::streaming::StreamOutput; +use crate::sql::common::{from_nanos, to_millis, CheckpointBarrier, FsSchema, Watermark}; + +#[derive(Debug, Copy, Clone, Encode, Decode, PartialEq, Eq)] +pub struct WatermarkGeneratorState { + pub last_watermark_emitted_at: SystemTime, + pub max_watermark: SystemTime, +} + +impl Default for WatermarkGeneratorState { + fn default() -> Self { + Self { + last_watermark_emitted_at: SystemTime::UNIX_EPOCH, + max_watermark: SystemTime::UNIX_EPOCH, + } + } +} + +pub struct WatermarkGeneratorOperator { + interval: Duration, + idle_time: Option, + expression: Arc, + timestamp_index: usize, + state: WatermarkGeneratorState, + last_event_wall: SystemTime, + is_idle: bool, +} + +impl WatermarkGeneratorOperator { + pub fn new( + interval: Duration, + idle_time: Option, + expression: Arc, + timestamp_index: usize, + ) -> Self { + Self { + interval, + idle_time, + expression, + timestamp_index, + state: WatermarkGeneratorState::default(), + last_event_wall: SystemTime::now(), + is_idle: false, 
+ } + } + + fn extract_max_timestamp(&self, batch: &RecordBatch) -> Option { + let ts_column = batch.column(self.timestamp_index); + let arr = ts_column.as_primitive::(); + let max_ts = aggregate::max(arr)?; + Some(from_nanos(max_ts as u128)) + } + + fn evaluate_watermark(&self, batch: &RecordBatch) -> Result { + let watermark_array = self + .expression + .evaluate(batch)? + .into_array(batch.num_rows())?; + + let typed_array = watermark_array + .as_any() + .downcast_ref::() + .ok_or_else(|| anyhow!("watermark expression must return TimestampNanosecondArray"))?; + + let max_watermark_nanos = aggregate::max(typed_array) + .ok_or_else(|| anyhow!("failed to extract max watermark from batch"))?; + + Ok(from_nanos(max_watermark_nanos as u128)) + } +} + +#[async_trait] +impl Operator for WatermarkGeneratorOperator { + fn name(&self) -> &str { + "ExpressionWatermarkGenerator" + } + + fn tick_interval(&self) -> Option { + Some(Duration::from_secs(1)) + } + + async fn on_start(&mut self, _ctx: &mut TaskContext) -> Result<()> { + self.last_event_wall = SystemTime::now(); + Ok(()) + } + + async fn process_data( + &mut self, + _input_idx: usize, + batch: RecordBatch, + ctx: &mut TaskContext, + ) -> Result> { + self.last_event_wall = SystemTime::now(); + + let mut outputs = vec![StreamOutput::Forward(batch.clone())]; + + let Some(max_batch_ts) = self.extract_max_timestamp(&batch) else { + return Ok(outputs); + }; + + let new_watermark = self.evaluate_watermark(&batch)?; + + self.state.max_watermark = self.state.max_watermark.max(new_watermark); + + let time_since_last_emit = max_batch_ts + .duration_since(self.state.last_watermark_emitted_at) + .unwrap_or(Duration::ZERO); + + if self.is_idle || time_since_last_emit > self.interval { + debug!( + "[{}] emitting expression watermark {}", + ctx.subtask_idx, + to_millis(self.state.max_watermark) + ); + + outputs.push(StreamOutput::Watermark(Watermark::EventTime( + self.state.max_watermark, + ))); + + 
self.state.last_watermark_emitted_at = max_batch_ts; + self.is_idle = false; + } + + Ok(outputs) + } + + async fn process_watermark( + &mut self, + _watermark: Watermark, + _ctx: &mut TaskContext, + ) -> Result> { + Ok(vec![]) + } + + async fn process_tick( + &mut self, + _tick_index: u64, + ctx: &mut TaskContext, + ) -> Result> { + if let Some(idle_timeout) = self.idle_time { + let elapsed = self + .last_event_wall + .elapsed() + .unwrap_or(Duration::ZERO); + if !self.is_idle && elapsed > idle_timeout { + info!( + "task [{}] entering Idle after {:?}", + ctx.subtask_idx, idle_timeout + ); + self.is_idle = true; + return Ok(vec![StreamOutput::Watermark(Watermark::Idle)]); + } + } + Ok(vec![]) + } + + async fn snapshot_state(&mut self, _barrier: CheckpointBarrier, _ctx: &mut TaskContext) -> Result<()> { + Ok(()) + } + + async fn on_close(&mut self, _ctx: &mut TaskContext) -> Result> { + Ok(vec![StreamOutput::Watermark(Watermark::EventTime(from_nanos( + u64::MAX as u128, + )))]) + } +} + +pub struct WatermarkGeneratorConstructor; + +impl WatermarkGeneratorConstructor { + pub fn with_config( + &self, + config: ExpressionWatermarkConfig, + registry: Arc, + ) -> anyhow::Result { + let input_schema: FsSchema = config + .input_schema + .ok_or_else(|| anyhow!("missing input schema"))? 
+ .try_into() + .map_err(|e| anyhow!("input schema: {e}"))?; + let timestamp_index = input_schema.timestamp_index; + + let expression_node = + PhysicalExprNode::decode(&mut config.expression.as_slice()).map_err(|e| { + anyhow!("decode expression: {e}") + })?; + let expression = parse_physical_expr( + &expression_node, + registry.as_ref(), + &input_schema.schema, + &DefaultPhysicalExtensionCodec {}, + ) + .map_err(|e| anyhow!("parse physical expr: {e}"))?; + + let interval = Duration::from_micros(config.period_micros); + let idle_time = config.idle_time_micros.map(Duration::from_micros); + + Ok(WatermarkGeneratorOperator::new( + interval, + idle_time, + expression, + timestamp_index, + )) + } +} + diff --git a/src/runtime/streaming/operators/windows/mod.rs b/src/runtime/streaming/operators/windows/mod.rs new file mode 100644 index 00000000..f1915f0d --- /dev/null +++ b/src/runtime/streaming/operators/windows/mod.rs @@ -0,0 +1,21 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +pub mod session_aggregating_window; +pub mod sliding_aggregating_window; +pub mod tumbling_aggregating_window; +pub mod window_function; + +pub use session_aggregating_window::SessionAggregatingWindowConstructor; +pub use sliding_aggregating_window::SlidingAggregatingWindowConstructor; +pub use tumbling_aggregating_window::TumblingAggregateWindowConstructor; +pub use window_function::WindowFunctionConstructor; diff --git a/src/runtime/streaming/operators/windows/session_aggregating_window.rs b/src/runtime/streaming/operators/windows/session_aggregating_window.rs new file mode 100644 index 00000000..93376c4c --- /dev/null +++ b/src/runtime/streaming/operators/windows/session_aggregating_window.rs @@ -0,0 +1,740 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ + +use anyhow::{anyhow, bail, Context, Result}; +use arrow::compute::{ + concat_batches, filter_record_batch, kernels::cmp::gt_eq, lexsort_to_indices, partition, take, +}; +use arrow::row::{RowConverter, SortField}; +use arrow_array::types::TimestampNanosecondType; +use arrow_array::{ + Array, BooleanArray, PrimitiveArray, RecordBatch, StructArray, TimestampNanosecondArray, +}; +use arrow_schema::{DataType, Field, FieldRef, Schema}; +use datafusion::execution::context::SessionContext; +use datafusion::execution::runtime_env::RuntimeEnvBuilder; +use datafusion::execution::SendableRecordBatchStream; +use datafusion::physical_plan::ExecutionPlan; +use datafusion_proto::physical_plan::AsExecutionPlan; +use datafusion_proto::protobuf::PhysicalPlanNode; +use futures::StreamExt; +use prost::Message; +use std::collections::{BTreeMap, HashMap, HashSet}; +use std::sync::{Arc, RwLock}; +use std::time::{Duration, SystemTime}; +use tokio::sync::mpsc::{unbounded_channel, UnboundedReceiver, UnboundedSender}; + +use crate::runtime::streaming::api::context::TaskContext; +use crate::runtime::streaming::api::operator::Operator; +use async_trait::async_trait; +use crate::runtime::streaming::factory::Registry; +use protocol::grpc::api::SessionWindowAggregateOperator; +use crate::runtime::streaming::StreamOutput; +use crate::sql::common::{from_nanos, to_nanos, CheckpointBarrier, FsSchema, FsSchemaRef, Watermark}; +use crate::sql::common::converter::Converter; +use crate::sql::physical::{DecodingContext, FsPhysicalExtensionCodec}; +use crate::sql::schema::utils::window_arrow_struct; +// ============================================================================ +// ============================================================================ + +struct SessionWindowConfig { + gap: Duration, + input_schema_ref: FsSchemaRef, + window_field: FieldRef, + window_index: usize, + final_physical_exec: Arc, + receiver_hook: Arc>>>, + output_schema: Arc, +} + +struct ActiveSession { + data_start: 
SystemTime, + data_end: SystemTime, + sender: Option>, + result_stream: SendableRecordBatchStream, +} + +impl ActiveSession { + async fn new( + aggregation_plan: Arc, + initial_timestamp: SystemTime, + sender: UnboundedSender, + ) -> Result { + aggregation_plan.reset()?; + let result_exec = aggregation_plan.execute(0, SessionContext::new().task_ctx())?; + Ok(Self { + data_start: initial_timestamp, + data_end: initial_timestamp, + sender: Some(sender), + result_stream: result_exec, + }) + } + + fn ingest_batch( + &mut self, + batch: RecordBatch, + gap: Duration, + ts_idx: usize, + ) -> Result> { + let ts_col = batch + .column(ts_idx) + .as_any() + .downcast_ref::() + .ok_or_else(|| anyhow!("expected timestamp column"))?; + let start_ts = ts_col.value(0); + let end_ts = ts_col.value(batch.num_rows() - 1); + + let current_end_with_gap = to_nanos(self.data_end + gap) as i64; + + if end_ts < current_end_with_gap { + self.data_end = self.data_end.max(from_nanos(end_ts as u128)); + self.data_start = self.data_start.min(from_nanos(start_ts as u128)); + self.sender + .as_ref() + .ok_or_else(|| anyhow!("session sender already closed"))? + .send(batch) + .map_err(|e| anyhow!("session channel send: {e}"))?; + return Ok(None); + } + + if current_end_with_gap < start_ts { + return Ok(Some((from_nanos(start_ts as u128), batch))); + } + + self.data_start = self.data_start.min(from_nanos(start_ts as u128)); + + let mut split_idx = 1; + while split_idx < batch.num_rows() { + let val = ts_col.value(split_idx); + if val < to_nanos(self.data_end) as i64 { + split_idx += 1; + continue; + } + if val < to_nanos(self.data_end + gap) as i64 { + self.data_end = from_nanos(val as u128); + split_idx += 1; + continue; + } + break; + } + + if split_idx == batch.num_rows() { + self.sender + .as_ref() + .ok_or_else(|| anyhow!("session sender already closed"))? 
+ .send(batch) + .map_err(|e| anyhow!("session channel send: {e}"))?; + return Ok(None); + } + + self.sender + .as_ref() + .ok_or_else(|| anyhow!("session sender already closed"))? + .send(batch.slice(0, split_idx)) + .map_err(|e| anyhow!("session channel send: {e}"))?; + let remaining_batch = batch.slice(split_idx, batch.num_rows() - split_idx); + let new_start_time = from_nanos(ts_col.value(split_idx) as u128); + Ok(Some((new_start_time, remaining_batch))) + } + + async fn close_and_drain(mut self, gap: Duration) -> Result { + self.sender.take(); + + let mut result_batches = Vec::new(); + while let Some(batch) = self.result_stream.next().await { + result_batches.push(batch?); + } + + if result_batches.len() != 1 || result_batches[0].num_rows() != 1 { + bail!("active session must yield exactly one aggregate row"); + } + + Ok(SessionWindowResult { + window_start: self.data_start, + window_end: self.data_end + gap, + batch: result_batches.into_iter().next().unwrap(), + }) + } +} + +struct SessionWindowResult { + window_start: SystemTime, + window_end: SystemTime, + batch: RecordBatch, +} + +struct KeySessionState { + config: Arc, + active_session: Option, + buffered_batches: BTreeMap>, +} + +impl KeySessionState { + fn new(config: Arc) -> Self { + Self { + config, + active_session: None, + buffered_batches: BTreeMap::new(), + } + } + + fn is_empty(&self) -> bool { + self.active_session.is_none() && self.buffered_batches.is_empty() + } + + fn earliest_data_time(&self) -> Option { + self.active_session + .as_ref() + .map(|s| s.data_start) + .or_else(|| self.buffered_batches.keys().next().copied()) + } + + fn next_watermark_action_time(&self) -> Option { + self.active_session + .as_ref() + .map(|s| s.data_end + self.config.gap) + .or_else(|| { + self.buffered_batches + .keys() + .next() + .map(|t| *t - self.config.gap) + }) + } + + async fn advance_by_watermark(&mut self, watermark: SystemTime) -> Result> { + let mut results = vec![]; + + loop { + if let Some(session) 
= &mut self.active_session { + if session.data_end + self.config.gap < watermark { + let closed_session = self + .active_session + .take() + .unwrap() + .close_and_drain(self.config.gap) + .await?; + results.push(closed_session); + } else { + break; + } + } else { + let Some((initial_ts, _)) = self.buffered_batches.first_key_value() else { + break; + }; + if watermark + self.config.gap < *initial_ts { + break; + } + + let (tx, rx) = unbounded_channel(); + *self.config.receiver_hook.write().unwrap() = Some(rx); + + self.active_session = Some( + ActiveSession::new( + self.config.final_physical_exec.clone(), + *initial_ts, + tx, + ) + .await?, + ); + + self.drain_buffer_to_active_session()?; + } + } + Ok(results) + } + + fn drain_buffer_to_active_session(&mut self) -> Result<()> { + let session = self + .active_session + .as_mut() + .ok_or_else(|| anyhow!("drain_buffer_to_active_session without active session"))?; + + while let Some((first_key, _)) = self.buffered_batches.first_key_value() { + if session.data_end + self.config.gap < *first_key { + break; + } + + let (_, batches) = self.buffered_batches.pop_first().unwrap(); + for batch in batches { + if let Some((rem_start, rem_batch)) = session.ingest_batch( + batch, + self.config.gap, + self.config.input_schema_ref.timestamp_index, + )? 
{ + self.buffered_batches + .entry(rem_start) + .or_default() + .push(rem_batch); + } + } + } + Ok(()) + } + + async fn add_data( + &mut self, + start_time: SystemTime, + batch: RecordBatch, + watermark: Option, + ) -> Result<()> { + self.buffered_batches + .entry(start_time) + .or_default() + .push(batch); + + if self.active_session.is_some() { + self.drain_buffer_to_active_session()?; + } + + if let Some(wm) = watermark { + let flushed = self.advance_by_watermark(wm).await?; + if !flushed.is_empty() { + bail!("unexpected flush during data ingestion; session watermark invariant violated"); + } + } + Ok(()) + } +} + +fn start_time_for_sorted_batch(batch: &RecordBatch, schema: &FsSchema) -> SystemTime { + let timestamp_array = batch.column(schema.timestamp_index); + let timestamp_array = timestamp_array + .as_any() + .downcast_ref::>() + .expect("timestamp column"); + from_nanos(timestamp_array.value(0) as u128) +} + +fn build_session_output_schema( + input: &FsSchema, + window_field: FieldRef, + window_index: usize, + agg_schema: &Schema, +) -> Result> { + let key_count = input.routing_keys().map(|k| k.len()).unwrap_or(0); + let mut fields: Vec = (0..key_count) + .map(|i| input.schema.fields()[i].clone()) + .collect(); + fields.insert(window_index, window_field); + fields.extend(agg_schema.fields().iter().cloned()); + fields.push(input.schema.fields()[input.timestamp_index].clone()); + Ok(Arc::new(Schema::new(fields))) +} + +// ============================================================================ +// ============================================================================ + +pub struct SessionWindowOperator { + config: Arc, + row_converter: Converter, + + session_states: HashMap, KeySessionState>, + pq_watermark_actions: BTreeMap>>, + pq_start_times: BTreeMap>>, +} + +impl SessionWindowOperator { + fn filter_batch_by_time(&self, batch: RecordBatch, watermark: Option) -> Result { + let Some(watermark) = watermark else { + return Ok(batch); + }; + + let 
timestamp_column = batch + .column(self.config.input_schema_ref.timestamp_index) + .as_any() + .downcast_ref::() + .ok_or_else(|| anyhow!("expected timestamp column"))?; + + let watermark_scalar = TimestampNanosecondArray::new_scalar(to_nanos(watermark) as i64); + let on_time = gt_eq(timestamp_column, &watermark_scalar)?; + + Ok(filter_record_batch(&batch, &on_time)?) + } + + fn sort_batch(&self, batch: &RecordBatch) -> Result { + let sort_columns = self.config.input_schema_ref.sort_columns(batch, true); + let sort_indices = lexsort_to_indices(&sort_columns, None)?; + + let columns = batch + .columns() + .iter() + .map(|c| take(c, &sort_indices, None).unwrap()) + .collect(); + + Ok(RecordBatch::try_new(batch.schema(), columns)?) + } + + async fn ingest_sorted_batch( + &mut self, + sorted_batch: RecordBatch, + watermark: Option, + ) -> Result<()> { + let partition_ranges = if !self.config.input_schema_ref.has_routing_keys() { + vec![0..sorted_batch.num_rows()] + } else { + let key_len = self + .config + .input_schema_ref + .routing_keys() + .as_ref() + .unwrap() + .len(); + let key_cols = sorted_batch + .columns() + .iter() + .take(key_len) + .cloned() + .collect::>(); + partition(key_cols.as_slice())?.ranges() + }; + + let key_count = self + .config + .input_schema_ref + .routing_keys() + .map(|k| k.len()) + .unwrap_or(0); + + for range in partition_ranges { + let key_batch = sorted_batch.slice(range.start, range.end - range.start); + + let row_key = if key_count == 0 { + Vec::new() + } else { + self.row_converter + .convert_columns(&key_batch.slice(0, 1).columns()[0..key_count]) + .context("row key convert")? 
+ .as_ref() + .to_vec() + }; + + let state = self + .session_states + .entry(row_key.clone()) + .or_insert_with(|| KeySessionState::new(self.config.clone())); + + let initial_action = state.next_watermark_action_time(); + let initial_start = state.earliest_data_time(); + + let batch_start = start_time_for_sorted_batch(&key_batch, &self.config.input_schema_ref); + + state + .add_data(batch_start, key_batch, watermark) + .await?; + + let new_action = state + .next_watermark_action_time() + .ok_or_else(|| anyhow!("missing next watermark action after add_data"))?; + let new_start = state + .earliest_data_time() + .ok_or_else(|| anyhow!("missing earliest data after add_data"))?; + + match initial_action { + Some(ia) => { + if ia != new_action { + self.pq_watermark_actions + .get_mut(&ia) + .expect("pq watermark entry") + .remove(&row_key); + self.pq_watermark_actions + .entry(new_action) + .or_default() + .insert(row_key.clone()); + } + let is = initial_start.expect("initial start"); + if is != new_start { + self.pq_start_times + .get_mut(&is) + .expect("pq start entry") + .remove(&row_key); + self.pq_start_times + .entry(new_start) + .or_default() + .insert(row_key.clone()); + } + } + None => { + self.pq_watermark_actions + .entry(new_action) + .or_default() + .insert(row_key.clone()); + self.pq_start_times + .entry(new_start) + .or_default() + .insert(row_key); + } + } + } + Ok(()) + } + + async fn evaluate_watermark(&mut self, watermark: SystemTime) -> Result> { + let mut emit_results: Vec<(Vec, Vec)> = Vec::new(); + + loop { + let popped_action_time = match self.pq_watermark_actions.first_key_value() { + Some((t, _)) if *t < watermark => *t, + _ => break, + }; + let keys = self + .pq_watermark_actions + .remove(&popped_action_time) + .expect("pop watermark pq"); + + for key in keys { + let state = self + .session_states + .get_mut(&key) + .ok_or_else(|| anyhow!("missing session state for key"))?; + let initial_start = state + .earliest_data_time() + .ok_or_else(|| 
anyhow!("missing earliest data in evaluate_watermark"))?; + + let completed_sessions = state.advance_by_watermark(watermark).await?; + if !completed_sessions.is_empty() { + emit_results.push((key.clone(), completed_sessions)); + } + + self.pq_start_times + .get_mut(&initial_start) + .expect("pq start") + .remove(&key); + + if state.is_empty() { + self.session_states.remove(&key); + } else { + let new_start = state + .earliest_data_time() + .expect("earliest after advance"); + self.pq_start_times + .entry(new_start) + .or_default() + .insert(key.clone()); + + let new_next_action = state + .next_watermark_action_time() + .expect("next action after advance"); + if new_next_action == popped_action_time { + bail!( + "processed watermark at {:?} but next watermark action stayed at {:?}", + watermark, popped_action_time + ); + } + self.pq_watermark_actions + .entry(new_next_action) + .or_default() + .insert(key); + } + } + } + + if emit_results.is_empty() { + return Ok(vec![]); + } + + Ok(vec![self.format_to_arrow(emit_results)?]) + } + + fn format_to_arrow(&self, results: Vec<(Vec, Vec)>) -> Result { + let (rows, session_results): (Vec<_>, Vec<_>) = results + .into_iter() + .flat_map(|(row, s_results)| s_results.into_iter().map(move |res| (row.clone(), res))) + .unzip(); + + let key_columns = if let Some(parser) = self.row_converter.parser() { + self.row_converter.convert_rows( + rows.iter() + .map(|row| parser.parse(row.as_ref())) + .collect(), + )? 
+ } else { + vec![] + }; + + let start_times: Vec = session_results + .iter() + .map(|r| to_nanos(r.window_start) as i64) + .collect(); + let end_times: Vec = session_results + .iter() + .map(|r| to_nanos(r.window_end) as i64) + .collect(); + + let window_start_array = PrimitiveArray::::from(start_times); + let window_end_array = PrimitiveArray::::from(end_times.clone()); + + let result_batches: Vec<&RecordBatch> = session_results.iter().map(|res| &res.batch).collect(); + let merged_batch = concat_batches(&session_results[0].batch.schema(), result_batches)?; + + let DataType::Struct(window_fields) = self.config.window_field.data_type() else { + bail!("expected window field to be a struct"); + }; + + let window_struct_array = StructArray::try_new( + window_fields.clone(), + vec![Arc::new(window_start_array), Arc::new(window_end_array)], + None, + )?; + + let mut columns = key_columns; + columns.insert(self.config.window_index, Arc::new(window_struct_array)); + columns.extend_from_slice(merged_batch.columns()); + + RecordBatch::try_new(self.config.output_schema.clone(), columns) + .context("failed to create session window output batch") + } + + #[allow(dead_code)] + fn earliest_batch_time(&self) -> Option { + self.pq_start_times + .first_key_value() + .map(|(start_time, _keys)| *start_time) + } +} + +#[async_trait] +impl Operator for SessionWindowOperator { + fn name(&self) -> &str { + "SessionWindow" + } + + async fn on_start(&mut self, _ctx: &mut TaskContext) -> Result<()> { + Ok(()) + } + + async fn process_data( + &mut self, + _input_idx: usize, + batch: RecordBatch, + ctx: &mut TaskContext, + ) -> Result> { + let watermark_time = ctx.last_present_watermark(); + + let filtered_batch = self.filter_batch_by_time(batch, watermark_time)?; + if filtered_batch.num_rows() == 0 { + return Ok(vec![]); + } + + let sorted_batch = self.sort_batch(&filtered_batch)?; + + self.ingest_sorted_batch(sorted_batch, watermark_time).await?; + + Ok(vec![]) + } + + async fn 
process_watermark( + &mut self, + watermark: Watermark, + _ctx: &mut TaskContext, + ) -> Result> { + let Watermark::EventTime(current_time) = watermark else { + return Ok(vec![]); + }; + + let output_batches = self.evaluate_watermark(current_time).await?; + Ok(output_batches + .into_iter() + .map(StreamOutput::Forward) + .collect()) + } + + async fn snapshot_state(&mut self, _barrier: CheckpointBarrier, _ctx: &mut TaskContext) -> Result<()> { + Ok(()) + } + + async fn on_close(&mut self, _ctx: &mut TaskContext) -> Result> { + Ok(vec![]) + } +} + +// ============================================================================ +// ============================================================================ + +pub struct SessionAggregatingWindowConstructor; + +impl SessionAggregatingWindowConstructor { + pub fn with_config( + &self, + config: SessionWindowAggregateOperator, + registry: Arc, + ) -> anyhow::Result { + let window_field = Arc::new(Field::new( + config.window_field_name, + window_arrow_struct(), + true, + )); + + let receiver_hook = Arc::new(RwLock::new(None)); + + let codec = FsPhysicalExtensionCodec { + context: DecodingContext::UnboundedBatchStream(receiver_hook.clone()), + }; + + let final_plan = PhysicalPlanNode::decode(&mut config.final_aggregation_plan.as_slice())?; + let final_execution_plan = final_plan.try_into_physical_plan( + registry.as_ref(), + &RuntimeEnvBuilder::new().build()?, + &codec, + )?; + + let input_schema: FsSchema = config + .input_schema + .ok_or_else(|| anyhow!("missing input schema"))? 
+ .try_into()?; + + let row_converter = if input_schema.routing_keys().is_none() { + let array = Arc::new(BooleanArray::from(vec![false])); + Converter::Empty( + RowConverter::new(vec![SortField::new(DataType::Boolean)])?, + array, + ) + } else { + let key_count = input_schema.routing_keys().as_ref().unwrap().len(); + Converter::RowConverter(RowConverter::new( + input_schema + .schema + .fields() + .into_iter() + .take(key_count) + .map(|field| SortField::new(field.data_type().clone())) + .collect(), + )?) + }; + + let output_schema = build_session_output_schema( + &input_schema, + window_field.clone(), + config.window_index as usize, + final_execution_plan.schema().as_ref(), + )?; + + let session_config = Arc::new(SessionWindowConfig { + gap: Duration::from_micros(config.gap_micros), + window_field, + window_index: config.window_index as usize, + input_schema_ref: Arc::new(input_schema), + final_physical_exec: final_execution_plan, + receiver_hook, + output_schema, + }); + + Ok(SessionWindowOperator { + config: session_config, + session_states: HashMap::new(), + pq_start_times: BTreeMap::new(), + pq_watermark_actions: BTreeMap::new(), + row_converter, + }) + } +} + diff --git a/src/runtime/streaming/operators/windows/sliding_aggregating_window.rs b/src/runtime/streaming/operators/windows/sliding_aggregating_window.rs new file mode 100644 index 00000000..19a539f3 --- /dev/null +++ b/src/runtime/streaming/operators/windows/sliding_aggregating_window.rs @@ -0,0 +1,545 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + + +use anyhow::{anyhow, bail, Result}; +use arrow::compute::{partition, sort_to_indices, take}; +use arrow_array::{Array, PrimitiveArray, RecordBatch, types::TimestampNanosecondType}; +use arrow_schema::SchemaRef; +use datafusion::common::ScalarValue; +use datafusion::execution::context::SessionContext; +use datafusion::execution::runtime_env::RuntimeEnvBuilder; +use datafusion::execution::SendableRecordBatchStream; +use datafusion::physical_expr::PhysicalExpr; +use datafusion::physical_plan::ExecutionPlan; +use datafusion_proto::physical_plan::DefaultPhysicalExtensionCodec; +use datafusion_proto::{ + physical_plan::{from_proto::parse_physical_expr, AsExecutionPlan}, + protobuf::{PhysicalExprNode, PhysicalPlanNode}, +}; +use futures::StreamExt; +use prost::Message; +use std::collections::{BTreeMap, VecDeque}; +use std::sync::{Arc, RwLock}; +use std::time::{Duration, SystemTime}; +use tokio::sync::mpsc::{unbounded_channel, UnboundedReceiver, UnboundedSender}; + +use crate::runtime::streaming::api::context::TaskContext; +use crate::runtime::streaming::api::operator::Operator; +use async_trait::async_trait; +use crate::runtime::streaming::factory::Registry; +use protocol::grpc::api::SlidingWindowAggregateOperator; +use crate::runtime::streaming::StreamOutput; +use crate::sql::common::{from_nanos, to_nanos, CheckpointBarrier, FsSchema, Watermark}; +use crate::sql::physical::{DecodingContext, FsPhysicalExtensionCodec}; +// ============================================================================ +// ============================================================================ + +#[derive(Default, Debug)] +struct RecordBatchPane { + batches: Vec, +} + +#[derive(Debug)] +struct RecordBatchTier { + width: Duration, + start_time: Option, + panes: VecDeque, +} + +impl RecordBatchTier { + fn new(width: Duration) -> Self { + Self { + width, + start_time: None, + 
panes: VecDeque::new(), + } + } + + fn bin_start(&self, timestamp: SystemTime) -> SystemTime { + if self.width == Duration::ZERO { + return timestamp; + } + let nanos = to_nanos(timestamp) - (to_nanos(timestamp) % self.width.as_nanos()); + from_nanos(nanos) + } + + fn insert(&mut self, batch: RecordBatch, timestamp: SystemTime) -> Result<()> { + let bin_start = self.bin_start(timestamp); + if self.start_time.is_none() { + self.start_time = Some(bin_start); + self.panes.push_back(RecordBatchPane { + batches: vec![batch], + }); + return Ok(()); + } + + let start_time = self.start_time.unwrap(); + let bin_index = + (bin_start.duration_since(start_time)?.as_nanos() / self.width.as_nanos()) as usize; + while self.panes.len() <= bin_index { + self.panes.push_back(RecordBatchPane::default()); + } + self.panes[bin_index].batches.push(batch); + Ok(()) + } + + fn batches_for_timestamp(&self, bin_start: SystemTime) -> Result> { + if self + .start_time + .map(|st| st > bin_start) + .unwrap_or(true) + { + return Ok(vec![]); + } + let bin_index = (bin_start + .duration_since(self.start_time.unwrap())? 
+ .as_nanos() + / self.width.as_nanos()) as usize; + if self.panes.len() <= bin_index { + return Ok(vec![]); + } + Ok(self.panes[bin_index].batches.clone()) + } + + fn delete_before(&mut self, cutoff: SystemTime) -> Result<()> { + let bin_start = self.bin_start(cutoff); + if self + .start_time + .map(|st| st >= bin_start) + .unwrap_or(true) + { + return Ok(()); + } + let bin_index = (bin_start + .duration_since(self.start_time.unwrap()) + .unwrap() + .as_nanos() + / self.width.as_nanos()) as usize; + + if bin_index >= self.panes.len() { + self.panes.clear(); + } else { + self.panes.drain(0..bin_index); + } + self.start_time = Some(bin_start); + Ok(()) + } +} + +#[derive(Debug)] +struct TieredRecordBatchHolder { + tier_widths: Vec, + tiers: Vec, +} + +impl TieredRecordBatchHolder { + fn new(tier_widths: Vec) -> Result { + for i in 0..tier_widths.len().saturating_sub(1) { + if !tier_widths[i + 1].as_nanos().is_multiple_of(tier_widths[i].as_nanos()) { + bail!( + "tier width {} does not evenly divide next {}", + tier_widths[i].as_nanos(), + tier_widths[i + 1].as_nanos() + ); + } + } + let tiers = tier_widths + .iter() + .map(|w| RecordBatchTier::new(*w)) + .collect(); + Ok(Self { tier_widths, tiers }) + } + + fn insert(&mut self, batch: RecordBatch, timestamp: SystemTime) -> Result<()> { + for tier in self.tiers.iter_mut() { + tier.insert(batch.clone(), timestamp)?; + } + Ok(()) + } + + fn batches_for_interval( + &self, + interval_start: SystemTime, + interval_end: SystemTime, + ) -> Result> { + let mut batches = Vec::new(); + let mut current_tier = 0usize; + let mut current_start = interval_start; + + while current_start < interval_end { + let tier_end = current_start + self.tier_widths[current_tier]; + if tier_end > interval_end { + current_tier = current_tier.saturating_sub(1); + continue; + } + if current_tier < self.tier_widths.len() - 1 { + let next_tier = &self.tiers[current_tier + 1]; + if next_tier.bin_start(current_start) == current_start + && current_start + 
next_tier.width <= interval_end + { + current_tier += 1; + continue; + } + } + batches.extend(self.tiers[current_tier].batches_for_timestamp(current_start)?); + current_start += self.tier_widths[current_tier]; + } + if current_start != interval_end { + bail!( + "interval end {:?} does not match current start {:?}", + interval_end, current_start + ); + } + Ok(batches) + } + + fn delete_before(&mut self, cutoff: SystemTime) -> Result<()> { + for tier in self.tiers.iter_mut() { + tier.delete_before(cutoff)?; + } + Ok(()) + } +} + +// ============================================================================ +// ============================================================================ + +struct ActiveBin { + sender: Option>, + result_stream: Option, + finished_batches: Vec, +} + +impl Default for ActiveBin { + fn default() -> Self { + Self { + sender: None, + result_stream: None, + finished_batches: Vec::new(), + } + } +} + +impl ActiveBin { + fn start_partial( + plan: Arc, + hook: &Arc>>>, + ) -> Result { + let (tx, rx) = unbounded_channel(); + *hook.write().unwrap() = Some(rx); + plan.reset()?; + let result_stream = plan.execute(0, SessionContext::new().task_ctx())?; + Ok(Self { + sender: Some(tx), + result_stream: Some(result_stream), + finished_batches: Vec::new(), + }) + } + + async fn close_and_drain(&mut self) -> Result<()> { + self.sender.take(); + if let Some(mut stream) = self.result_stream.take() { + while let Some(batch) = stream.next().await { + self.finished_batches.push(batch?); + } + } + Ok(()) + } +} + +// ============================================================================ +// ============================================================================ + +pub struct SlidingWindowOperator { + slide: Duration, + width: Duration, + binning_function: Arc, + + partial_aggregation_plan: Arc, + partial_schema: FsSchema, + + finish_execution_plan: Arc, + final_projection: Arc, + projection_input_schema: SchemaRef, + + receiver_hook: Arc>>>, + 
final_batches_passer: Arc>>, + + active_bins: BTreeMap, + tiered_record_batches: TieredRecordBatchHolder, +} + +impl SlidingWindowOperator { + fn bin_start(&self, timestamp: SystemTime) -> SystemTime { + if self.slide == Duration::ZERO { + return timestamp; + } + let nanos = to_nanos(timestamp) - (to_nanos(timestamp) % self.slide.as_nanos()); + from_nanos(nanos) + } + + fn add_bin_start_as_timestamp( + batch: &RecordBatch, + bin_start: SystemTime, + schema: SchemaRef, + ) -> Result { + let bin_start_scalar = ScalarValue::TimestampNanosecond(Some(to_nanos(bin_start) as i64), None); + let timestamp_array = bin_start_scalar.to_array_of_size(batch.num_rows())?; + let mut columns = batch.columns().to_vec(); + columns.push(timestamp_array); + Ok(RecordBatch::try_new(schema, columns)?) + } + + fn ensure_bin_running( + slot: &mut ActiveBin, + plan: Arc, + hook: &Arc>>>, + ) -> Result<()> { + if slot.sender.is_some() { + return Ok(()); + } + let preserved = std::mem::take(&mut slot.finished_batches); + let mut started = ActiveBin::start_partial(plan, hook)?; + started.finished_batches = preserved; + *slot = started; + Ok(()) + } +} + +#[async_trait] +impl Operator for SlidingWindowOperator { + fn name(&self) -> &str { + "SlidingWindow" + } + + async fn on_start(&mut self, _ctx: &mut TaskContext) -> Result<()> { + Ok(()) + } + + async fn process_data( + &mut self, + _input_idx: usize, + batch: RecordBatch, + ctx: &mut TaskContext, + ) -> Result> { + let bin_array = self + .binning_function + .evaluate(&batch)? 
+ .into_array(batch.num_rows())?; + let indices = sort_to_indices(bin_array.as_ref(), None, None)?; + + let columns = batch + .columns() + .iter() + .map(|c| take(c, &indices, None).unwrap()) + .collect(); + let sorted = RecordBatch::try_new(batch.schema(), columns)?; + let sorted_bins = take(bin_array.as_ref(), &indices, None)?; + + let typed_bin = sorted_bins + .as_any() + .downcast_ref::>() + .ok_or_else(|| anyhow!("binning function must produce TimestampNanosecond"))?; + let partition_ranges = partition(std::slice::from_ref(&sorted_bins))?.ranges(); + + let watermark = ctx.last_present_watermark(); + + for range in partition_ranges { + let bin_start = from_nanos(typed_bin.value(range.start) as u128); + + if let Some(wm) = watermark { + if bin_start < self.bin_start(wm) { + continue; + } + } + + let bin_batch = sorted.slice(range.start, range.end - range.start); + let slot = self.active_bins.entry(bin_start).or_default(); + + Self::ensure_bin_running( + slot, + self.partial_aggregation_plan.clone(), + &self.receiver_hook, + )?; + + let sender = slot + .sender + .as_ref() + .ok_or_else(|| anyhow!("partial bin sender missing after ensure"))?; + sender + .send(bin_batch) + .map_err(|e| anyhow!("partial channel send: {e}"))?; + } + + Ok(vec![]) + } + + async fn process_watermark( + &mut self, + watermark: Watermark, + _ctx: &mut TaskContext, + ) -> Result> { + let Watermark::EventTime(current_time) = watermark else { + return Ok(vec![]); + }; + let watermark_bin = self.bin_start(current_time); + + let mut final_outputs = Vec::new(); + + let mut expired_bins = Vec::new(); + for &k in self.active_bins.keys() { + if k + self.slide <= watermark_bin { + expired_bins.push(k); + } else { + break; + } + } + + for bin_start in expired_bins { + let mut bin = self + .active_bins + .remove(&bin_start) + .ok_or_else(|| anyhow!("missing active bin"))?; + let bin_end = bin_start + self.slide; + + bin.close_and_drain().await?; + for b in bin.finished_batches { + 
self.tiered_record_batches.insert(b, bin_start)?; + } + + let interval_start = bin_end - self.width; + let interval_end = bin_end; + + let partials = self + .tiered_record_batches + .batches_for_interval(interval_start, interval_end)?; + *self.final_batches_passer.write().unwrap() = partials; + + self.finish_execution_plan.reset()?; + let mut final_exec = self + .finish_execution_plan + .execute(0, SessionContext::new().task_ctx())?; + + let mut aggregate_results = Vec::new(); + while let Some(batch) = final_exec.next().await { + aggregate_results.push(Self::add_bin_start_as_timestamp( + &batch?, + interval_start, + self.projection_input_schema.clone(), + )?); + } + + *self.final_batches_passer.write().unwrap() = aggregate_results; + self.final_projection.reset()?; + let mut proj_exec = self + .final_projection + .execute(0, SessionContext::new().task_ctx())?; + + while let Some(batch) = proj_exec.next().await { + final_outputs.push(StreamOutput::Forward(batch?)); + } + + self.tiered_record_batches + .delete_before(bin_end + self.slide - self.width)?; + } + + Ok(final_outputs) + } + + async fn snapshot_state(&mut self, _barrier: CheckpointBarrier, _ctx: &mut TaskContext) -> Result<()> { + Ok(()) + } + + async fn on_close(&mut self, _ctx: &mut TaskContext) -> Result> { + Ok(vec![]) + } +} + +// ============================================================================ +// ============================================================================ + +pub struct SlidingAggregatingWindowConstructor; + +impl SlidingAggregatingWindowConstructor { + pub fn with_config( + &self, + config: SlidingWindowAggregateOperator, + registry: Arc, + ) -> anyhow::Result { + let width = Duration::from_micros(config.width_micros); + let slide = Duration::from_micros(config.slide_micros); + let input_schema: FsSchema = config + .input_schema + .ok_or_else(|| anyhow!("missing input schema"))? 
+ .try_into()?; + + let binning_function = parse_physical_expr( + &PhysicalExprNode::decode(&mut config.binning_function.as_slice())?, + registry.as_ref(), + &input_schema.schema, + &DefaultPhysicalExtensionCodec {}, + )?; + + let receiver_hook = Arc::new(RwLock::new(None)); + let final_batches_passer = Arc::new(RwLock::new(Vec::new())); + + let codec = FsPhysicalExtensionCodec { + context: DecodingContext::UnboundedBatchStream(receiver_hook.clone()), + }; + let final_codec = FsPhysicalExtensionCodec { + context: DecodingContext::LockedBatchVec(final_batches_passer.clone()), + }; + + let partial_plan = PhysicalPlanNode::decode(&mut config.partial_aggregation_plan.as_slice())? + .try_into_physical_plan( + registry.as_ref(), + &RuntimeEnvBuilder::new().build()?, + &codec, + )?; + + let finish_plan = PhysicalPlanNode::decode(&mut config.final_aggregation_plan.as_slice())? + .try_into_physical_plan( + registry.as_ref(), + &RuntimeEnvBuilder::new().build()?, + &final_codec, + )?; + + let final_proj = PhysicalPlanNode::decode(&mut config.final_projection.as_slice())? + .try_into_physical_plan( + registry.as_ref(), + &RuntimeEnvBuilder::new().build()?, + &final_codec, + )?; + + let partial_schema: FsSchema = config + .partial_schema + .ok_or_else(|| anyhow!("missing partial schema"))? 
+ .try_into()?; + + Ok(SlidingWindowOperator { + slide, + width, + binning_function, + partial_aggregation_plan: partial_plan, + partial_schema, + finish_execution_plan: finish_plan, + final_projection: final_proj.clone(), + projection_input_schema: final_proj.children()[0].schema().clone(), + receiver_hook, + final_batches_passer, + active_bins: BTreeMap::new(), + tiered_record_batches: TieredRecordBatchHolder::new(vec![slide])?, + }) + } +} + diff --git a/src/runtime/streaming/operators/windows/tumbling_aggregating_window.rs b/src/runtime/streaming/operators/windows/tumbling_aggregating_window.rs new file mode 100644 index 00000000..c0342d66 --- /dev/null +++ b/src/runtime/streaming/operators/windows/tumbling_aggregating_window.rs @@ -0,0 +1,376 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ + +use anyhow::{anyhow, Result}; +use arrow::compute::{partition, sort_to_indices, take}; +use arrow_array::{Array, PrimitiveArray, RecordBatch, types::TimestampNanosecondType}; +use arrow_schema::SchemaRef; +use datafusion::common::ScalarValue; +use datafusion::execution::context::SessionContext; +use datafusion::execution::runtime_env::RuntimeEnvBuilder; +use datafusion::execution::SendableRecordBatchStream; +use datafusion::physical_expr::PhysicalExpr; +use datafusion::physical_plan::ExecutionPlan; +use datafusion_proto::physical_plan::DefaultPhysicalExtensionCodec; +use datafusion_proto::{ + physical_plan::{from_proto::parse_physical_expr, AsExecutionPlan}, + protobuf::{PhysicalExprNode, PhysicalPlanNode}, +}; +use futures::StreamExt; +use prost::Message; +use std::collections::BTreeMap; +use std::mem; +use std::sync::{Arc, RwLock}; +use std::time::{Duration, SystemTime}; +use tokio::sync::mpsc::{unbounded_channel, UnboundedReceiver, UnboundedSender}; +use tracing::warn; + +use crate::runtime::streaming::api::context::TaskContext; +use crate::runtime::streaming::api::operator::Operator; +use async_trait::async_trait; +use crate::runtime::streaming::factory::Registry; +use protocol::grpc::api::TumblingWindowAggregateOperator; +use crate::runtime::streaming::StreamOutput; +use crate::sql::common::{from_nanos, to_nanos, CheckpointBarrier, FsSchema, Watermark}; +use crate::sql::common::time_utils::print_time; +use crate::sql::physical::{DecodingContext, FsPhysicalExtensionCodec}; +use crate::sql::schema::utils::add_timestamp_field_arrow; + +struct ActiveBin { + sender: Option>, + result_stream: Option, + finished_batches: Vec, +} + +impl Default for ActiveBin { + fn default() -> Self { + Self { + sender: None, + result_stream: None, + finished_batches: Vec::new(), + } + } +} + +impl ActiveBin { + fn start_partial( + plan: Arc, + hook: &Arc>>>, + ) -> Result { + let (tx, rx) = unbounded_channel(); + *hook.write().unwrap() = Some(rx); + plan.reset()?; + let 
result_stream = plan.execute(0, SessionContext::new().task_ctx())?; + Ok(Self { + sender: Some(tx), + result_stream: Some(result_stream), + finished_batches: Vec::new(), + }) + } + + async fn close_and_drain(&mut self) -> Result<()> { + self.sender.take(); + if let Some(mut stream) = self.result_stream.take() { + while let Some(batch) = stream.next().await { + self.finished_batches.push(batch?); + } + } + Ok(()) + } +} + +pub struct TumblingWindowOperator { + width: Duration, + binning_function: Arc, + + partial_aggregation_plan: Arc, + partial_schema: FsSchema, + + finish_execution_plan: Arc, + aggregate_with_timestamp_schema: SchemaRef, + final_projection: Option>, + + receiver_hook: Arc>>>, + final_batches_passer: Arc>>, + + active_bins: BTreeMap, +} + +impl TumblingWindowOperator { + fn bin_start(&self, timestamp: SystemTime) -> SystemTime { + if self.width == Duration::ZERO { + return timestamp; + } + let nanos = to_nanos(timestamp) - (to_nanos(timestamp) % self.width.as_nanos()); + from_nanos(nanos) + } + + fn add_bin_start_as_timestamp( + batch: &RecordBatch, + bin_start: SystemTime, + schema: SchemaRef, + ) -> Result { + let bin_start_scalar = ScalarValue::TimestampNanosecond(Some(to_nanos(bin_start) as i64), None); + let timestamp_array = bin_start_scalar.to_array_of_size(batch.num_rows())?; + let mut columns = batch.columns().to_vec(); + columns.push(timestamp_array); + RecordBatch::try_new(schema.clone(), columns) + .map_err(|e| anyhow!("add _timestamp column: {e}")) + } + + fn ensure_bin_running( + slot: &mut ActiveBin, + plan: Arc, + hook: &Arc>>>, + ) -> Result<()> { + if slot.sender.is_some() { + return Ok(()); + } + let preserved = mem::take(&mut slot.finished_batches); + let mut started = ActiveBin::start_partial(plan, hook)?; + started.finished_batches = preserved; + *slot = started; + Ok(()) + } +} + +#[async_trait] +impl Operator for TumblingWindowOperator { + fn name(&self) -> &str { + "TumblingWindow" + } + + async fn on_start(&mut self, _ctx: 
&mut TaskContext) -> Result<()> { + Ok(()) + } + + async fn process_data( + &mut self, + _input_idx: usize, + batch: RecordBatch, + ctx: &mut TaskContext, + ) -> Result> { + let bin_array = self + .binning_function + .evaluate(&batch)? + .into_array(batch.num_rows())?; + let indices = sort_to_indices(bin_array.as_ref(), None, None)?; + + let columns = batch + .columns() + .iter() + .map(|c| take(c, &indices, None).unwrap()) + .collect(); + let sorted = RecordBatch::try_new(batch.schema(), columns)?; + let sorted_bins = take(bin_array.as_ref(), &indices, None)?; + + let typed_bin = sorted_bins + .as_any() + .downcast_ref::>() + .ok_or_else(|| anyhow!("binning function must produce TimestampNanosecond"))?; + let partition_ranges = partition(std::slice::from_ref(&sorted_bins))?.ranges(); + + for range in partition_ranges { + let bin_start = from_nanos(typed_bin.value(range.start) as u128); + + if let Some(watermark) = ctx.last_present_watermark() { + if bin_start < self.bin_start(watermark) { + warn!( + "late data dropped: bin {} < watermark {}", + print_time(bin_start), + print_time(watermark) + ); + continue; + } + } + + let bin_batch = sorted.slice(range.start, range.end - range.start); + let slot = self.active_bins.entry(bin_start).or_default(); + + Self::ensure_bin_running( + slot, + self.partial_aggregation_plan.clone(), + &self.receiver_hook, + )?; + + let sender = slot + .sender + .as_ref() + .ok_or_else(|| anyhow!("tumbling bin sender missing after ensure"))?; + sender + .send(bin_batch) + .map_err(|e| anyhow!("partial channel send: {e}"))?; + } + + Ok(vec![]) + } + + async fn process_watermark( + &mut self, + watermark: Watermark, + _ctx: &mut TaskContext, + ) -> Result> { + let Watermark::EventTime(current_time) = watermark else { + return Ok(vec![]); + }; + + let mut final_outputs = Vec::new(); + + let mut expired_bins = Vec::new(); + for &k in self.active_bins.keys() { + if k + self.width <= current_time { + expired_bins.push(k); + } else { + break; + } + 
} + + for bin_start in expired_bins { + let mut bin = self + .active_bins + .remove(&bin_start) + .ok_or_else(|| anyhow!("missing tumbling bin"))?; + + bin.close_and_drain().await?; + let partial_batches = mem::take(&mut bin.finished_batches); + + if partial_batches.is_empty() { + continue; + } + + *self.final_batches_passer.write().unwrap() = partial_batches; + self.finish_execution_plan.reset()?; + let mut final_exec = self + .finish_execution_plan + .execute(0, SessionContext::new().task_ctx())?; + + let mut aggregate_results = Vec::new(); + while let Some(batch) = final_exec.next().await { + let batch = batch?; + let with_timestamp = Self::add_bin_start_as_timestamp( + &batch, + bin_start, + self.aggregate_with_timestamp_schema.clone(), + )?; + + if self.final_projection.is_none() { + final_outputs.push(StreamOutput::Forward(with_timestamp)); + } else { + aggregate_results.push(with_timestamp); + } + } + + if let Some(final_projection) = &self.final_projection { + *self.final_batches_passer.write().unwrap() = aggregate_results; + final_projection.reset()?; + let mut proj_exec = final_projection.execute(0, SessionContext::new().task_ctx())?; + + while let Some(batch) = proj_exec.next().await { + final_outputs.push(StreamOutput::Forward(batch?)); + } + } + } + + Ok(final_outputs) + } + + async fn snapshot_state(&mut self, _barrier: CheckpointBarrier, _ctx: &mut TaskContext) -> Result<()> { + Ok(()) + } + + async fn on_close(&mut self, _ctx: &mut TaskContext) -> Result> { + Ok(vec![]) + } +} + +pub struct TumblingAggregateWindowConstructor; + +impl TumblingAggregateWindowConstructor { + pub fn with_config( + &self, + config: TumblingWindowAggregateOperator, + registry: Arc, + ) -> anyhow::Result { + let width = Duration::from_micros(config.width_micros); + let input_schema: FsSchema = config + .input_schema + .ok_or_else(|| anyhow!("missing input schema"))? 
+ .try_into()?; + + let binning_function = parse_physical_expr( + &PhysicalExprNode::decode(&mut config.binning_function.as_slice())?, + registry.as_ref(), + &input_schema.schema, + &DefaultPhysicalExtensionCodec {}, + )?; + + let receiver_hook = Arc::new(RwLock::new(None)); + let final_batches_passer = Arc::new(RwLock::new(Vec::new())); + + let codec = FsPhysicalExtensionCodec { + context: DecodingContext::UnboundedBatchStream(receiver_hook.clone()), + }; + let final_codec = FsPhysicalExtensionCodec { + context: DecodingContext::LockedBatchVec(final_batches_passer.clone()), + }; + + let partial_plan = PhysicalPlanNode::decode(&mut config.partial_aggregation_plan.as_slice())? + .try_into_physical_plan( + registry.as_ref(), + &RuntimeEnvBuilder::new().build()?, + &codec, + )?; + + let partial_schema: FsSchema = config + .partial_schema + .ok_or_else(|| anyhow!("missing partial schema"))? + .try_into()?; + + let finish_plan = PhysicalPlanNode::decode(&mut config.final_aggregation_plan.as_slice())?; + let finish_execution_plan = finish_plan.try_into_physical_plan( + registry.as_ref(), + &RuntimeEnvBuilder::new().build()?, + &final_codec, + )?; + + let final_projection_plan = match &config.final_projection { + Some(proto) if !proto.is_empty() => { + let node = PhysicalPlanNode::decode(&mut proto.as_slice()) + .map_err(|e| anyhow!("decode final_projection: {e}"))?; + Some(node.try_into_physical_plan( + registry.as_ref(), + &RuntimeEnvBuilder::new().build()?, + &final_codec, + )?) 
+ } + _ => None, + }; + + let aggregate_with_timestamp_schema = + add_timestamp_field_arrow((*finish_execution_plan.schema()).clone()); + + Ok(TumblingWindowOperator { + width, + binning_function, + partial_aggregation_plan: partial_plan, + partial_schema, + finish_execution_plan, + aggregate_with_timestamp_schema, + final_projection: final_projection_plan, + receiver_hook, + final_batches_passer, + active_bins: BTreeMap::new(), + }) + } +} + diff --git a/src/runtime/streaming/operators/windows/window_function.rs b/src/runtime/streaming/operators/windows/window_function.rs new file mode 100644 index 00000000..4ab68cfd --- /dev/null +++ b/src/runtime/streaming/operators/windows/window_function.rs @@ -0,0 +1,279 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ + +use anyhow::{anyhow, Result}; +use arrow::compute::{max, min}; +use arrow_array::RecordBatch; +use datafusion::execution::context::SessionContext; +use datafusion::execution::runtime_env::RuntimeEnvBuilder; +use datafusion::execution::SendableRecordBatchStream; +use datafusion::physical_plan::ExecutionPlan; +use datafusion_proto::physical_plan::AsExecutionPlan; +use datafusion_proto::protobuf::PhysicalPlanNode; +use futures::StreamExt; +use prost::Message; +use std::collections::BTreeMap; +use std::sync::{Arc, RwLock}; +use std::time::SystemTime; +use tokio::sync::mpsc::{unbounded_channel, UnboundedReceiver, UnboundedSender}; +use tracing::warn; + +use crate::runtime::streaming::api::context::TaskContext; +use crate::runtime::streaming::api::operator::Operator; +use crate::runtime::streaming::factory::Registry; +use async_trait::async_trait; +use crate::runtime::streaming::StreamOutput; +use crate::sql::common::{from_nanos, CheckpointBarrier, FsSchema, FsSchemaRef, Watermark}; +use crate::sql::common::time_utils::print_time; +use crate::sql::physical::{DecodingContext, FsPhysicalExtensionCodec}; + +// ============================================================================ +// ============================================================================ + +struct ActiveWindowExec { + sender: Option>, + result_stream: Option, +} + +impl ActiveWindowExec { + fn new( + plan: Arc, + hook: &Arc>>>, + ) -> Result { + let (tx, rx) = unbounded_channel(); + *hook.write().unwrap() = Some(rx); + plan.reset()?; + let result_stream = plan.execute(0, SessionContext::new().task_ctx())?; + Ok(Self { + sender: Some(tx), + result_stream: Some(result_stream), + }) + } + + async fn close_and_drain(&mut self) -> Result> { + self.sender.take(); + let mut results = Vec::new(); + if let Some(mut stream) = self.result_stream.take() { + while let Some(batch) = stream.next().await { + results.push(batch?); + } + } + Ok(results) + } +} + +// 
============================================================================ +// ============================================================================ + +pub struct WindowFunctionOperator { + input_schema: FsSchemaRef, + input_schema_unkeyed: FsSchemaRef, + window_exec_plan: Arc, + receiver_hook: Arc>>>, + active_execs: BTreeMap, +} + +impl WindowFunctionOperator { + fn filter_and_split_batches( + &self, + batch: RecordBatch, + watermark: Option, + ) -> Result> { + if batch.num_rows() == 0 { + return Ok(vec![]); + } + + let timestamp_column = self.input_schema.timestamp_column(&batch); + let min_timestamp = from_nanos(min(timestamp_column).unwrap() as u128); + let max_timestamp = from_nanos(max(timestamp_column).unwrap() as u128); + + if let Some(wm) = watermark { + if max_timestamp < wm { + warn!( + "dropped late batch: max_ts {} < watermark {}", + print_time(max_timestamp), + print_time(wm) + ); + return Ok(vec![]); + } + } + + if min_timestamp == max_timestamp { + return Ok(vec![(batch, max_timestamp)]); + } + + let sorted_batch = self + .input_schema_unkeyed + .sort(batch, true) + .map_err(|e| anyhow!("sort for window fn: {e}"))?; + let filtered_batch = self + .input_schema_unkeyed + .filter_by_time(sorted_batch, watermark) + .map_err(|e| anyhow!("filter_by_time: {e}"))?; + if filtered_batch.num_rows() == 0 { + return Ok(vec![]); + } + + let filtered_timestamps = self.input_schema.timestamp_column(&filtered_batch); + let ranges = self + .input_schema_unkeyed + .partition(&filtered_batch, true) + .map_err(|e| anyhow!("partition by time: {e}"))?; + + let mut batches = Vec::with_capacity(ranges.len()); + for range in ranges { + let slice = filtered_batch.slice(range.start, range.end - range.start); + let ts = from_nanos(filtered_timestamps.value(range.start) as u128); + batches.push((slice, ts)); + } + Ok(batches) + } + + fn get_or_create_exec(&mut self, timestamp: SystemTime) -> Result<&mut ActiveWindowExec> { + use std::collections::btree_map::Entry; + 
match self.active_execs.entry(timestamp) { + Entry::Vacant(v) => { + let new_exec = + ActiveWindowExec::new(self.window_exec_plan.clone(), &self.receiver_hook)?; + Ok(v.insert(new_exec)) + } + Entry::Occupied(o) => Ok(o.into_mut()), + } + } +} + +#[async_trait] +impl Operator for WindowFunctionOperator { + fn name(&self) -> &str { + "WindowFunction" + } + + async fn on_start(&mut self, _ctx: &mut TaskContext) -> Result<()> { + Ok(()) + } + + async fn process_data( + &mut self, + _input_idx: usize, + batch: RecordBatch, + ctx: &mut TaskContext, + ) -> Result> { + let current_watermark = ctx.last_present_watermark(); + let split_batches = self.filter_and_split_batches(batch, current_watermark)?; + + for (sub_batch, timestamp) in split_batches { + let exec = self.get_or_create_exec(timestamp)?; + exec.sender + .as_ref() + .ok_or_else(|| anyhow!("window exec sender missing"))? + .send(sub_batch) + .map_err(|e| anyhow!("route batch to plan: {e}"))?; + } + + Ok(vec![]) + } + + async fn process_watermark( + &mut self, + watermark: Watermark, + _ctx: &mut TaskContext, + ) -> Result> { + let Watermark::EventTime(current_time) = watermark else { + return Ok(vec![]); + }; + + let mut final_outputs = Vec::new(); + + let mut expired_timestamps = Vec::new(); + for &k in self.active_execs.keys() { + if k < current_time { + expired_timestamps.push(k); + } else { + break; + } + } + + for ts in expired_timestamps { + let mut exec = self + .active_execs + .remove(&ts) + .ok_or_else(|| anyhow!("missing window exec"))?; + let result_batches = exec.close_and_drain().await?; + for batch in result_batches { + final_outputs.push(StreamOutput::Forward(batch)); + } + } + + Ok(final_outputs) + } + + async fn snapshot_state(&mut self, _barrier: CheckpointBarrier, _ctx: &mut TaskContext) -> Result<()> { + Ok(()) + } + + async fn on_close(&mut self, _ctx: &mut TaskContext) -> Result> { + Ok(vec![]) + } +} + +// ============================================================================ +// 
============================================================================ + +pub struct WindowFunctionConstructor; + +impl WindowFunctionConstructor { + pub fn with_config( + &self, + config: protocol::grpc::api::WindowFunctionOperator, + registry: Arc, + ) -> anyhow::Result { + let input_schema = Arc::new( + FsSchema::try_from( + config + .input_schema + .ok_or_else(|| anyhow!("missing input schema"))?, + ) + .map_err(|e| anyhow!("input schema: {e}"))?, + ); + + let input_schema_unkeyed = Arc::new( + FsSchema::from_schema_unkeyed(input_schema.schema.clone()) + .map_err(|e| anyhow!("unkeyed schema: {e}"))?, + ); + + let receiver_hook = Arc::new(RwLock::new(None)); + let codec = FsPhysicalExtensionCodec { + context: DecodingContext::UnboundedBatchStream(receiver_hook.clone()), + }; + + let window_exec_node = + PhysicalPlanNode::decode(&mut config.window_function_plan.as_slice()) + .map_err(|e| anyhow!("decode window_function_plan: {e}"))?; + let window_exec_plan = window_exec_node + .try_into_physical_plan( + registry.as_ref(), + &RuntimeEnvBuilder::new().build()?, + &codec, + ) + .map_err(|e| anyhow!("window physical plan: {e}"))?; + + Ok(WindowFunctionOperator { + input_schema, + input_schema_unkeyed, + window_exec_plan, + receiver_hook, + active_execs: BTreeMap::new(), + }) + } +} + diff --git a/src/runtime/streaming/protocol/control.rs b/src/runtime/streaming/protocol/control.rs new file mode 100644 index 00000000..d337046e --- /dev/null +++ b/src/runtime/streaming/protocol/control.rs @@ -0,0 +1,82 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + + +use serde::{Deserialize, Serialize}; +use std::time::Duration; +use tokio::sync::mpsc::{self, Receiver, Sender}; +use crate::sql::common::CheckpointBarrier; + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct CheckpointBarrierWire { + pub epoch: u32, + pub min_epoch: u32, + pub timestamp_secs: u64, + pub timestamp_subsec_nanos: u32, + pub then_stop: bool, +} + +impl From for CheckpointBarrierWire { + fn from(b: CheckpointBarrier) -> Self { + let d = b + .timestamp + .duration_since(std::time::UNIX_EPOCH) + .unwrap_or_default(); + Self { + epoch: b.epoch, + min_epoch: b.min_epoch, + timestamp_secs: d.as_secs(), + timestamp_subsec_nanos: d.subsec_nanos(), + then_stop: b.then_stop, + } + } +} + +impl From for CheckpointBarrier { + fn from(w: CheckpointBarrierWire) -> Self { + Self { + epoch: w.epoch, + min_epoch: w.min_epoch, + timestamp: std::time::UNIX_EPOCH + + Duration::new(w.timestamp_secs, w.timestamp_subsec_nanos), + then_stop: w.then_stop, + } + } +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub enum ControlCommand { + Start, + Stop { mode: StopMode }, + DropState, + Commit { epoch: u32 }, + UpdateConfig { config_json: String }, + TriggerCheckpoint { barrier: CheckpointBarrierWire }, +} + +impl ControlCommand { + pub fn trigger_checkpoint(barrier: CheckpointBarrier) -> Self { + Self::TriggerCheckpoint { + barrier: barrier.into(), + } + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub enum StopMode { + Graceful, + Immediate, +} + +pub fn control_channel(capacity: usize) -> (Sender, Receiver) { + mpsc::channel(capacity) +} diff --git a/src/runtime/streaming/protocol/event.rs b/src/runtime/streaming/protocol/event.rs new file mode 100644 index 00000000..b78b7fbc --- /dev/null +++ b/src/runtime/streaming/protocol/event.rs @@ -0,0 +1,22 @@ +// Licensed under the Apache License, Version 2.0 
(the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use arrow_array::RecordBatch; +use crate::sql::common::{CheckpointBarrier, Watermark}; + +#[derive(Debug, Clone)] +pub enum StreamEvent { + Data(RecordBatch), + Watermark(Watermark), + Barrier(CheckpointBarrier), + EndOfStream, +} diff --git a/src/runtime/streaming/protocol/mod.rs b/src/runtime/streaming/protocol/mod.rs new file mode 100644 index 00000000..fb20c59e --- /dev/null +++ b/src/runtime/streaming/protocol/mod.rs @@ -0,0 +1,20 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ + +pub mod control; +pub mod event; +pub mod stream_out; +pub mod tracked; +pub mod watermark; + +pub use stream_out::StreamOutput; diff --git a/src/runtime/streaming/protocol/stream_out.rs b/src/runtime/streaming/protocol/stream_out.rs new file mode 100644 index 00000000..fc7b9bba --- /dev/null +++ b/src/runtime/streaming/protocol/stream_out.rs @@ -0,0 +1,22 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use arrow_array::RecordBatch; +use crate::sql::common::Watermark; + +#[derive(Debug, Clone)] +pub enum StreamOutput { + Forward(RecordBatch), + Keyed(u64, RecordBatch), + Broadcast(RecordBatch), + Watermark(Watermark), +} diff --git a/src/runtime/streaming/protocol/tracked.rs b/src/runtime/streaming/protocol/tracked.rs new file mode 100644 index 00000000..d4360627 --- /dev/null +++ b/src/runtime/streaming/protocol/tracked.rs @@ -0,0 +1,39 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::sync::Arc; + +use crate::runtime::streaming::memory::MemoryTicket; +use crate::runtime::streaming::protocol::event::StreamEvent; + +/// +#[derive(Debug, Clone)] +pub struct TrackedEvent { + pub event: StreamEvent, + pub _ticket: Option>, +} + +impl TrackedEvent { + pub fn new(event: StreamEvent, ticket: Option) -> Self { + Self { + event, + _ticket: ticket.map(Arc::new), + } + } + + pub fn control(event: StreamEvent) -> Self { + Self { + event, + _ticket: None, + } + } +} diff --git a/src/runtime/streaming/protocol/watermark.rs b/src/runtime/streaming/protocol/watermark.rs new file mode 100644 index 00000000..f6e8388a --- /dev/null +++ b/src/runtime/streaming/protocol/watermark.rs @@ -0,0 +1,88 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ + +use crate::sql::common::Watermark; + +pub fn merge_watermarks(per_input: &[Option]) -> Option { + if per_input.iter().any(|w| w.is_none()) { + return None; + } + + let mut min_event: Option = None; + let mut all_idle = true; + + for w in per_input.iter().flatten() { + match w { + Watermark::Idle => {} + Watermark::EventTime(t) => { + all_idle = false; + min_event = Some(match min_event { + None => *t, + Some(m) => m.min(*t), + }); + } + } + } + + if all_idle { + Some(Watermark::Idle) + } else { + Some(Watermark::EventTime( + min_event.expect("non-idle alignment must have at least one EventTime"), + )) + } +} + +pub fn watermark_strictly_advances(new: Watermark, previous: Option) -> bool { + match previous { + None => true, + Some(prev) => match (new, prev) { + (Watermark::EventTime(tn), Watermark::EventTime(tp)) => tn > tp, + (Watermark::Idle, Watermark::Idle) => false, + (Watermark::Idle, Watermark::EventTime(_)) => true, + (Watermark::EventTime(_), Watermark::Idle) => true, + }, + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::time::{Duration, SystemTime}; + + #[test] + fn merge_waits_for_all_channels() { + let wms = vec![Some(Watermark::EventTime(SystemTime::UNIX_EPOCH)), None]; + assert!(merge_watermarks(&wms).is_none()); + } + + #[test] + fn merge_min_event_time_ignores_idle() { + let t1 = SystemTime::UNIX_EPOCH + Duration::from_secs(10); + let t2 = SystemTime::UNIX_EPOCH + Duration::from_secs(5); + let wms = vec![Some(Watermark::EventTime(t1)), Some(Watermark::Idle)]; + assert_eq!(merge_watermarks(&wms), Some(Watermark::EventTime(t1))); + + let wms = vec![ + Some(Watermark::EventTime(t1)), + Some(Watermark::EventTime(t2)), + ]; + assert_eq!(merge_watermarks(&wms), Some(Watermark::EventTime(t2))); + } + + #[test] + fn merge_all_idle() { + let wms = vec![Some(Watermark::Idle), Some(Watermark::Idle)]; + assert_eq!(merge_watermarks(&wms), Some(Watermark::Idle)); + } +} diff --git a/src/runtime/util/mod.rs b/src/runtime/util/mod.rs new file mode 
100644 index 00000000..0e3a3f7b --- /dev/null +++ b/src/runtime/util/mod.rs @@ -0,0 +1,16 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + +mod physical_aggregate; + +pub use physical_aggregate::decode_aggregate; diff --git a/src/runtime/util/physical_aggregate.rs b/src/runtime/util/physical_aggregate.rs new file mode 100644 index 00000000..33dd1e9f --- /dev/null +++ b/src/runtime/util/physical_aggregate.rs @@ -0,0 +1,77 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ + +use std::sync::Arc; + +use arrow::datatypes::SchemaRef; +use datafusion::common::internal_err; +use datafusion::common::Result as DFResult; +use datafusion::execution::FunctionRegistry; +use datafusion::physical_expr::aggregate::{AggregateExprBuilder, AggregateFunctionExpr}; +use datafusion::physical_expr::{LexOrdering, PhysicalExpr}; +use datafusion_proto::physical_plan::from_proto::{parse_physical_expr, parse_physical_sort_expr}; +use datafusion_proto::physical_plan::{DefaultPhysicalExtensionCodec, PhysicalExtensionCodec}; +use datafusion_proto::protobuf::physical_aggregate_expr_node::AggregateFunction; +use datafusion_proto::protobuf::physical_expr_node::ExprType; +use datafusion_proto::protobuf::{PhysicalExprNode, proto_error}; + +pub fn decode_aggregate( + schema: &SchemaRef, + name: &str, + expr: &PhysicalExprNode, + registry: &dyn FunctionRegistry, +) -> DFResult> { + let codec = &DefaultPhysicalExtensionCodec {}; + let expr_type = expr + .expr_type + .as_ref() + .ok_or_else(|| proto_error("Unexpected empty aggregate physical expression"))?; + + match expr_type { + ExprType::AggregateExpr(agg_node) => { + let input_phy_expr: Vec> = agg_node + .expr + .iter() + .map(|e| parse_physical_expr(e, registry, schema, codec)) + .collect::>>()?; + let ordering_req: LexOrdering = agg_node + .ordering_req + .iter() + .map(|e| parse_physical_sort_expr(e, registry, schema, codec)) + .collect::>()?; + agg_node + .aggregate_function + .as_ref() + .map(|func| match func { + AggregateFunction::UserDefinedAggrFunction(udaf_name) => { + let agg_udf = match &agg_node.fun_definition { + Some(buf) => codec.try_decode_udaf(udaf_name, buf)?, + None => registry.udaf(udaf_name)?, + }; + + AggregateExprBuilder::new(agg_udf, input_phy_expr) + .schema(Arc::clone(schema)) + .alias(name) + .with_ignore_nulls(agg_node.ignore_nulls) + .with_distinct(agg_node.distinct) + .order_by(ordering_req) + .build() + .map(Arc::new) + } + }) + .transpose()? 
+ .ok_or_else(|| proto_error("Invalid AggregateExpr, missing aggregate_function")) + } + _ => internal_err!("Invalid aggregate expression for AggregateExec"), + } +} diff --git a/src/runtime/input/input_protocol.rs b/src/runtime/wasm/input/input_protocol.rs similarity index 100% rename from src/runtime/input/input_protocol.rs rename to src/runtime/wasm/input/input_protocol.rs diff --git a/src/runtime/input/input_provider.rs b/src/runtime/wasm/input/input_provider.rs similarity index 100% rename from src/runtime/input/input_provider.rs rename to src/runtime/wasm/input/input_provider.rs diff --git a/src/runtime/input/input_runner.rs b/src/runtime/wasm/input/input_runner.rs similarity index 100% rename from src/runtime/input/input_runner.rs rename to src/runtime/wasm/input/input_runner.rs diff --git a/src/runtime/input/interface.rs b/src/runtime/wasm/input/interface.rs similarity index 100% rename from src/runtime/input/interface.rs rename to src/runtime/wasm/input/interface.rs diff --git a/src/runtime/input/mod.rs b/src/runtime/wasm/input/mod.rs similarity index 100% rename from src/runtime/input/mod.rs rename to src/runtime/wasm/input/mod.rs diff --git a/src/runtime/input/protocol/kafka/config.rs b/src/runtime/wasm/input/protocol/kafka/config.rs similarity index 100% rename from src/runtime/input/protocol/kafka/config.rs rename to src/runtime/wasm/input/protocol/kafka/config.rs diff --git a/src/runtime/input/protocol/kafka/kafka_protocol.rs b/src/runtime/wasm/input/protocol/kafka/kafka_protocol.rs similarity index 100% rename from src/runtime/input/protocol/kafka/kafka_protocol.rs rename to src/runtime/wasm/input/protocol/kafka/kafka_protocol.rs diff --git a/src/runtime/input/protocol/kafka/mod.rs b/src/runtime/wasm/input/protocol/kafka/mod.rs similarity index 100% rename from src/runtime/input/protocol/kafka/mod.rs rename to src/runtime/wasm/input/protocol/kafka/mod.rs diff --git a/src/runtime/input/protocol/mod.rs b/src/runtime/wasm/input/protocol/mod.rs 
similarity index 100% rename from src/runtime/input/protocol/mod.rs rename to src/runtime/wasm/input/protocol/mod.rs diff --git a/src/runtime/wasm/mod.rs b/src/runtime/wasm/mod.rs new file mode 100644 index 00000000..b1c82f4c --- /dev/null +++ b/src/runtime/wasm/mod.rs @@ -0,0 +1,18 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! WebAssembly runtime integration. + +pub mod input; +pub mod output; +pub mod processor; diff --git a/src/runtime/output/interface.rs b/src/runtime/wasm/output/interface.rs similarity index 100% rename from src/runtime/output/interface.rs rename to src/runtime/wasm/output/interface.rs diff --git a/src/runtime/output/mod.rs b/src/runtime/wasm/output/mod.rs similarity index 100% rename from src/runtime/output/mod.rs rename to src/runtime/wasm/output/mod.rs diff --git a/src/runtime/output/output_protocol.rs b/src/runtime/wasm/output/output_protocol.rs similarity index 100% rename from src/runtime/output/output_protocol.rs rename to src/runtime/wasm/output/output_protocol.rs diff --git a/src/runtime/output/output_provider.rs b/src/runtime/wasm/output/output_provider.rs similarity index 100% rename from src/runtime/output/output_provider.rs rename to src/runtime/wasm/output/output_provider.rs diff --git a/src/runtime/output/output_runner.rs b/src/runtime/wasm/output/output_runner.rs similarity index 100% rename from src/runtime/output/output_runner.rs rename to src/runtime/wasm/output/output_runner.rs diff --git 
a/src/runtime/output/protocol/kafka/kafka_protocol.rs b/src/runtime/wasm/output/protocol/kafka/kafka_protocol.rs similarity index 100% rename from src/runtime/output/protocol/kafka/kafka_protocol.rs rename to src/runtime/wasm/output/protocol/kafka/kafka_protocol.rs diff --git a/src/runtime/output/protocol/kafka/mod.rs b/src/runtime/wasm/output/protocol/kafka/mod.rs similarity index 100% rename from src/runtime/output/protocol/kafka/mod.rs rename to src/runtime/wasm/output/protocol/kafka/mod.rs diff --git a/src/runtime/output/protocol/kafka/producer_config.rs b/src/runtime/wasm/output/protocol/kafka/producer_config.rs similarity index 100% rename from src/runtime/output/protocol/kafka/producer_config.rs rename to src/runtime/wasm/output/protocol/kafka/producer_config.rs diff --git a/src/runtime/output/protocol/mod.rs b/src/runtime/wasm/output/protocol/mod.rs similarity index 100% rename from src/runtime/output/protocol/mod.rs rename to src/runtime/wasm/output/protocol/mod.rs diff --git a/src/runtime/processor/function_error.rs b/src/runtime/wasm/processor/function_error.rs similarity index 71% rename from src/runtime/processor/function_error.rs rename to src/runtime/wasm/processor/function_error.rs index b38f8dd9..f9b8fe8e 100644 --- a/src/runtime/processor/function_error.rs +++ b/src/runtime/wasm/processor/function_error.rs @@ -1,3 +1,15 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ #[derive(Debug, Clone)] pub enum FunctionErrorStage { Input, diff --git a/src/runtime/processor/mod.rs b/src/runtime/wasm/processor/mod.rs similarity index 100% rename from src/runtime/processor/mod.rs rename to src/runtime/wasm/processor/mod.rs diff --git a/src/runtime/processor/python/mod.rs b/src/runtime/wasm/processor/python/mod.rs similarity index 100% rename from src/runtime/processor/python/mod.rs rename to src/runtime/wasm/processor/python/mod.rs diff --git a/src/runtime/processor/python/python_host.rs b/src/runtime/wasm/processor/python/python_host.rs similarity index 100% rename from src/runtime/processor/python/python_host.rs rename to src/runtime/wasm/processor/python/python_host.rs diff --git a/src/runtime/processor/python/python_service.rs b/src/runtime/wasm/processor/python/python_service.rs similarity index 100% rename from src/runtime/processor/python/python_service.rs rename to src/runtime/wasm/processor/python/python_service.rs diff --git a/src/runtime/processor/wasm/input_strategy.rs b/src/runtime/wasm/processor/wasm/input_strategy.rs similarity index 100% rename from src/runtime/processor/wasm/input_strategy.rs rename to src/runtime/wasm/processor/wasm/input_strategy.rs diff --git a/src/runtime/processor/wasm/mod.rs b/src/runtime/wasm/processor/wasm/mod.rs similarity index 100% rename from src/runtime/processor/wasm/mod.rs rename to src/runtime/wasm/processor/wasm/mod.rs diff --git a/src/runtime/processor/wasm/thread_pool.rs b/src/runtime/wasm/processor/wasm/thread_pool.rs similarity index 100% rename from src/runtime/processor/wasm/thread_pool.rs rename to src/runtime/wasm/processor/wasm/thread_pool.rs diff --git a/src/runtime/processor/wasm/wasm_cache.rs b/src/runtime/wasm/processor/wasm/wasm_cache.rs similarity index 100% rename from src/runtime/processor/wasm/wasm_cache.rs rename to src/runtime/wasm/processor/wasm/wasm_cache.rs diff --git a/src/runtime/processor/wasm/wasm_host.rs b/src/runtime/wasm/processor/wasm/wasm_host.rs similarity 
index 100% rename from src/runtime/processor/wasm/wasm_host.rs rename to src/runtime/wasm/processor/wasm/wasm_host.rs diff --git a/src/runtime/processor/wasm/wasm_processor.rs b/src/runtime/wasm/processor/wasm/wasm_processor.rs similarity index 99% rename from src/runtime/processor/wasm/wasm_processor.rs rename to src/runtime/wasm/processor/wasm/wasm_processor.rs index 1afc9dcf..cd61be98 100644 --- a/src/runtime/processor/wasm/wasm_processor.rs +++ b/src/runtime/wasm/processor/wasm/wasm_processor.rs @@ -679,3 +679,4 @@ impl WasmProcessor for WasmProcessorImpl { Ok(()) } } + diff --git a/src/runtime/processor/wasm/wasm_processor_trait.rs b/src/runtime/wasm/processor/wasm/wasm_processor_trait.rs similarity index 100% rename from src/runtime/processor/wasm/wasm_processor_trait.rs rename to src/runtime/wasm/processor/wasm/wasm_processor_trait.rs diff --git a/src/runtime/processor/wasm/wasm_task.rs b/src/runtime/wasm/processor/wasm/wasm_task.rs similarity index 100% rename from src/runtime/processor/wasm/wasm_task.rs rename to src/runtime/wasm/processor/wasm/wasm_task.rs diff --git a/src/server/handler.rs b/src/server/handler.rs index 4721a5a1..2ef6b529 100644 --- a/src/server/handler.rs +++ b/src/server/handler.rs @@ -14,22 +14,21 @@ use std::sync::Arc; use std::time::Instant; use arrow_ipc::writer::StreamWriter; -use log::{error, info}; use tonic::{Request, Response as TonicResponse, Status}; +use tracing::{debug, error, info, warn}; use protocol::service::FunctionInfo as ProtoFunctionInfo; use protocol::service::{ - CreateFunctionRequest, CreatePythonFunctionRequest, DropFunctionRequest, Response, - ShowFunctionsRequest, ShowFunctionsResponse, SqlRequest, StartFunctionRequest, StatusCode, - StopFunctionRequest, function_stream_service_server::FunctionStreamService, + function_stream_service_server::FunctionStreamService, CreateFunctionRequest, + CreatePythonFunctionRequest, DropFunctionRequest, Response, ShowFunctionsRequest, + ShowFunctionsResponse, SqlRequest, 
StartFunctionRequest, StatusCode, StopFunctionRequest, }; -use crate::coordinator::Coordinator; use crate::coordinator::{ - CreateFunction, CreatePythonFunction, DataSet, DropFunction, ShowFunctions, - ShowFunctionsResult, StartFunction, Statement, StopFunction, + Coordinator, CreateFunction, CreatePythonFunction, DataSet, DropFunction, PythonModule, + ShowFunctions, ShowFunctionsResult, StartFunction, Statement, StopFunction, }; -use crate::sql::SqlParser; +use crate::sql::parse::parse_sql; pub struct FunctionStreamServiceImpl { coordinator: Arc, @@ -40,23 +39,66 @@ impl FunctionStreamServiceImpl { Self { coordinator } } - fn build_response(status_code: StatusCode, message: String, data: Option>) -> Response { + fn serialize_dataset(ds: &dyn DataSet) -> Result, String> { + let batch = ds.to_record_batch(); + let mut buf = Vec::new(); + + let mut writer = StreamWriter::try_new(&mut buf, &batch.schema()) + .map_err(|e| format!("IPC writer initialization failed: {e}"))?; + + writer + .write(&batch) + .map_err(|e| format!("IPC write failed: {e}"))?; + + writer + .finish() + .map_err(|e| format!("IPC finish failed: {e}"))?; + + Ok(buf) + } + + fn build_success_response( + status: StatusCode, + message: String, + data: Option>, + ) -> Response { + let payload = match data { + Some(ds) => match Self::serialize_dataset(ds.as_ref()) { + Ok(bytes) => Some(bytes), + Err(e) => { + error!("Data serialization error: {}", e); + return Self::build_error_response( + StatusCode::InternalServerError, + "Internal data serialization error".to_string(), + ); + } + }, + None => None, + }; + Response { - status_code: status_code as i32, + status_code: status as i32, message, - data, + data: payload, } } - fn data_set_to_ipc_bytes(ds: &dyn DataSet) -> Option> { - let batch = ds.to_record_batch(); - let mut buf = Vec::new(); - { - let mut writer = StreamWriter::try_new(&mut buf, &batch.schema()).ok()?; - writer.write(&batch).ok()?; - writer.finish().ok()?; + fn build_error_response(status: 
StatusCode, message: String) -> Response { + Response { + status_code: status as i32, + message, + data: None, + } + } + + async fn execute_statement(&self, stmt: &dyn Statement, success_status: StatusCode) -> Response { + let result = self.coordinator.execute_with_stream_catalog(stmt).await; + + if result.success { + Self::build_success_response(success_status, result.message, result.data) + } else { + Self::build_error_response(StatusCode::InternalServerError, result.message) } - Some(buf) } } @@ -66,225 +108,134 @@ impl FunctionStreamService for FunctionStreamServiceImpl { &self, request: Request, ) -> Result, Status> { - let start_time = Instant::now(); + let timer = Instant::now(); let req = request.into_inner(); - let parse_start = Instant::now(); - let stmt = match SqlParser::parse(&req.sql) { - Ok(stmt) => { - log::debug!("SQL parsed in {}ms", parse_start.elapsed().as_millis()); - stmt - } - Err(e) => { - return Ok(TonicResponse::new(Self::build_response( - StatusCode::BadRequest, - format!("Parse error: {}", e), - None, + let statements = parse_sql(&req.sql).map_err(|e| { + let detail = e.to_string(); + warn!("SQL parse rejection: {}", detail); + Status::invalid_argument(detail) + })?; + + if statements.is_empty() { + return Ok(TonicResponse::new(Self::build_success_response( + StatusCode::Ok, + "No statements executed".to_string(), + None, + ))); + } + + let mut final_response = None; + + for stmt in statements { + let result = self + .coordinator + .execute_with_stream_catalog(stmt.as_ref()) + .await; + + if !result.success { + error!("SQL execution aborted: {}", result.message); + return Ok(TonicResponse::new(Self::build_error_response( + StatusCode::InternalServerError, + result.message, ))); } - }; - - let exec_start = Instant::now(); - let result = self.coordinator.execute(stmt.as_ref()); - log::debug!( - "Coordinator execution finished in {}ms", - exec_start.elapsed().as_millis() - ); - let status_code = if result.success { - StatusCode::Ok - } else 
{ - error!("Execution failed: {}", result.message); - StatusCode::InternalServerError - }; + final_response = Some(result); + } - log::debug!( - "Total SQL request cost: {}ms", - start_time.elapsed().as_millis() - ); + let result = final_response.unwrap(); + let response = Self::build_success_response(StatusCode::Ok, result.message, result.data); - Ok(TonicResponse::new(Self::build_response( - status_code, - result.message, - result - .data - .as_ref() - .and_then(|ds| Self::data_set_to_ipc_bytes(ds.as_ref())), - ))) + debug!("execute_sql completed in {}ms", timer.elapsed().as_millis()); + Ok(TonicResponse::new(response)) } async fn create_function( &self, request: Request, ) -> Result, Status> { - let start_time = Instant::now(); + let timer = Instant::now(); let req = request.into_inner(); - info!( - "Received CreateFunction request. Config size: {}, Function size: {}", - req.config_bytes.len(), - req.function_bytes.len() - ); - - let config_bytes = if !req.config_bytes.is_empty() { - Some(req.config_bytes) - } else { - None - }; + let config_bytes = (!req.config_bytes.is_empty()).then_some(req.config_bytes); let stmt = CreateFunction::from_bytes(req.function_bytes, config_bytes); - let exec_start = Instant::now(); - let result = self.coordinator.execute(&stmt as &dyn Statement); - info!( - "Coordinator execution finished in {}ms", - exec_start.elapsed().as_millis() - ); - - let status_code = if result.success { - StatusCode::Created - } else { - error!("CreateFunction failed: {}", result.message); - StatusCode::InternalServerError - }; - - info!( - "Total CreateFunction request cost: {}ms", - start_time.elapsed().as_millis() - ); + let response = self.execute_statement(&stmt, StatusCode::Created).await; - Ok(TonicResponse::new(Self::build_response( - status_code, - result.message, - result - .data - .as_ref() - .and_then(|ds| Self::data_set_to_ipc_bytes(ds.as_ref())), - ))) + info!("create_function completed in {}ms", timer.elapsed().as_millis()); + 
Ok(TonicResponse::new(response)) } async fn create_python_function( &self, request: Request, ) -> Result, Status> { - let start_time = Instant::now(); + let timer = Instant::now(); let req = request.into_inner(); - info!( - "Received CreatePythonFunction request. Class name: {}, Modules: {}", - req.class_name, - req.modules.len() - ); - // Convert proto modules to PythonModule - let modules: Vec = req + if req.modules.is_empty() { + return Ok(TonicResponse::new(Self::build_error_response( + StatusCode::BadRequest, + "Python function creation requires at least one module".to_string(), + ))); + } + + let modules: Vec = req .modules .into_iter() - .map(|m| crate::coordinator::PythonModule { + .map(|m| PythonModule { name: m.module_name, bytes: m.module_bytes, }) .collect(); - if modules.is_empty() { - return Ok(TonicResponse::new(Self::build_response( - StatusCode::BadRequest, - "At least one module is required".to_string(), - None, - ))); - } - let stmt = CreatePythonFunction::new(req.class_name, modules, req.config_content); + let response = self.execute_statement(&stmt, StatusCode::Created).await; - let exec_start = Instant::now(); - let result = self.coordinator.execute(&stmt as &dyn Statement); info!( - "Coordinator execution finished in {}ms", - exec_start.elapsed().as_millis() + "create_python_function completed in {}ms", + timer.elapsed().as_millis() ); - - let status_code = if result.success { - StatusCode::Created - } else { - error!("CreatePythonFunction failed: {}", result.message); - StatusCode::InternalServerError - }; - - info!( - "Total CreatePythonFunction request cost: {}ms", - start_time.elapsed().as_millis() - ); - - Ok(TonicResponse::new(Self::build_response( - status_code, - result.message, - result - .data - .as_ref() - .and_then(|ds| Self::data_set_to_ipc_bytes(ds.as_ref())), - ))) + Ok(TonicResponse::new(response)) } async fn drop_function( &self, request: Request, ) -> Result, Status> { - let start_time = Instant::now(); + let timer = 
Instant::now(); let req = request.into_inner(); - info!( - "Received DropFunction request: function_name={}", - req.function_name - ); let stmt = DropFunction::new(req.function_name); - let exec_start = Instant::now(); - let result = self.coordinator.execute(&stmt as &dyn Statement); - info!( - "Coordinator execution finished in {}ms", - exec_start.elapsed().as_millis() - ); + let response = self.execute_statement(&stmt, StatusCode::Ok).await; - let status_code = if result.success { - StatusCode::Ok - } else { - error!("DropFunction failed: {}", result.message); - StatusCode::InternalServerError - }; - - info!( - "Total DropFunction request cost: {}ms", - start_time.elapsed().as_millis() - ); - - Ok(TonicResponse::new(Self::build_response( - status_code, - result.message, - None, - ))) + info!("drop_function completed in {}ms", timer.elapsed().as_millis()); + Ok(TonicResponse::new(response)) } async fn show_functions( &self, - request: Request, + _request: Request, ) -> Result, Status> { - let start_time = Instant::now(); - let _req = request.into_inner(); - info!("Received ShowFunctions request"); - + let timer = Instant::now(); let stmt = ShowFunctions::new(); - let exec_start = Instant::now(); - let result = self.coordinator.execute(&stmt as &dyn Statement); - info!( - "Coordinator execution finished in {}ms", - exec_start.elapsed().as_millis() - ); - let (status_code, message) = if result.success { - (StatusCode::Ok as i32, result.message) - } else { - error!("ShowFunctions failed: {}", result.message); - (StatusCode::InternalServerError as i32, result.message) - }; + let result = self + .coordinator + .execute_with_stream_catalog(&stmt) + .await; + + if !result.success { + error!("show_functions execution failed: {}", result.message); + return Ok(TonicResponse::new(ShowFunctionsResponse { + status_code: StatusCode::InternalServerError as i32, + message: result.message, + functions: vec![], + })); + } - let functions: Vec = result + let functions = result .data 
.as_ref() .and_then(|arc_ds| { @@ -302,15 +253,10 @@ impl FunctionStreamService for FunctionStreamServiceImpl { }) .unwrap_or_default(); - info!( - "Total ShowFunctions request cost: {}ms, count={}", - start_time.elapsed().as_millis(), - functions.len() - ); - + info!("show_functions completed in {}ms", timer.elapsed().as_millis()); Ok(TonicResponse::new(ShowFunctionsResponse { - status_code, - message, + status_code: StatusCode::Ok as i32, + message: result.message, functions, })) } @@ -319,76 +265,28 @@ impl FunctionStreamService for FunctionStreamServiceImpl { &self, request: Request, ) -> Result, Status> { - let start_time = Instant::now(); + let timer = Instant::now(); let req = request.into_inner(); - info!( - "Received StartFunction request: function_name={}", - req.function_name - ); let stmt = StartFunction::new(req.function_name); - let exec_start = Instant::now(); - let result = self.coordinator.execute(&stmt as &dyn Statement); - info!( - "Coordinator execution finished in {}ms", - exec_start.elapsed().as_millis() - ); - - let status_code = if result.success { - StatusCode::Ok - } else { - error!("StartFunction failed: {}", result.message); - StatusCode::InternalServerError - }; + let response = self.execute_statement(&stmt, StatusCode::Ok).await; - info!( - "Total StartFunction request cost: {}ms", - start_time.elapsed().as_millis() - ); - - Ok(TonicResponse::new(Self::build_response( - status_code, - result.message, - None, - ))) + info!("start_function completed in {}ms", timer.elapsed().as_millis()); + Ok(TonicResponse::new(response)) } async fn stop_function( &self, request: Request, ) -> Result, Status> { - let start_time = Instant::now(); + let timer = Instant::now(); let req = request.into_inner(); - info!( - "Received StopFunction request: function_name={}", - req.function_name - ); let stmt = StopFunction::new(req.function_name); - let exec_start = Instant::now(); - let result = self.coordinator.execute(&stmt as &dyn Statement); - info!( - 
"Coordinator execution finished in {}ms", - exec_start.elapsed().as_millis() - ); - - let status_code = if result.success { - StatusCode::Ok - } else { - error!("StopFunction failed: {}", result.message); - StatusCode::InternalServerError - }; - - info!( - "Total StopFunction request cost: {}ms", - start_time.elapsed().as_millis() - ); + let response = self.execute_statement(&stmt, StatusCode::Ok).await; - Ok(TonicResponse::new(Self::build_response( - status_code, - result.message, - None, - ))) + info!("stop_function completed in {}ms", timer.elapsed().as_millis()); + Ok(TonicResponse::new(response)) } } diff --git a/src/server/initializer.rs b/src/server/initializer.rs index ccb02788..70c19685 100644 --- a/src/server/initializer.rs +++ b/src/server/initializer.rs @@ -10,15 +10,23 @@ // See the License for the specific language governing permissions and // limitations under the License. -use crate::config::GlobalConfig; +use std::time::Instant; + use anyhow::{Context, Result}; +use tracing::{debug, info, warn}; + +use crate::config::GlobalConfig; -type InitializerFn = fn(&GlobalConfig) -> Result<()>; +pub type InitializerFn = fn(&GlobalConfig) -> Result<()>; #[derive(Clone)] -struct Component { - name: &'static str, - initializer: InitializerFn, +pub struct Component { + pub name: &'static str, + pub initializer: InitializerFn, +} + +pub struct ComponentRegistry { + components: Vec, } #[derive(Default)] @@ -27,25 +35,17 @@ pub struct ComponentRegistryBuilder { } impl ComponentRegistryBuilder { - #[inline] pub fn new() -> Self { - Self::with_capacity(8) - } - - #[inline] - pub fn with_capacity(capacity: usize) -> Self { Self { - components: Vec::with_capacity(capacity), + components: Vec::with_capacity(8), } } - #[inline] pub fn register(mut self, name: &'static str, initializer: InitializerFn) -> Self { self.components.push(Component { name, initializer }); self } - #[inline] pub fn build(self) -> ComponentRegistry { ComponentRegistry { components: 
self.components, @@ -53,57 +53,71 @@ impl ComponentRegistryBuilder { } } -pub struct ComponentRegistry { - components: Vec, -} - impl ComponentRegistry { pub fn initialize_all(&self, config: &GlobalConfig) -> Result<()> { if self.components.is_empty() { - log::warn!("No components registered for initialization"); + warn!("Component registry is empty; no components to initialize"); return Ok(()); } - log::info!("Initializing {} components...", self.components.len()); + let total = self.components.len(); + info!(total_components = total, "Commencing system initialization sequence"); - for (idx, component) in self.components.iter().enumerate() { - let start = std::time::Instant::now(); - log::debug!( - "[{}/{}] Initializing component: {}", - idx + 1, - self.components.len(), - component.name + for (index, component) in self.components.iter().enumerate() { + let start_time = Instant::now(); + + debug!( + component = component.name, + step = format!("{}/{}", index + 1, total), + "Initializing component" ); - (component.initializer)(config) - .with_context(|| format!("Component '{}' initialization failed", component.name))?; + (component.initializer)(config).with_context(|| { + format!("Fatal error initializing component: {}", component.name) + })?; - let elapsed = start.elapsed(); - log::debug!( - "[{}/{}] Component '{}' initialized successfully in {:?}", - idx + 1, - self.components.len(), - component.name, - elapsed + debug!( + component = component.name, + elapsed_ms = start_time.elapsed().as_millis(), + "Component initialized successfully" ); } - log::info!( - "All {} components initialized successfully", - self.components.len() - ); + info!("System initialization sequence completed successfully"); Ok(()) } +} - #[inline] - pub fn len(&self) -> usize { - self.components.len() - } +pub fn build_core_registry() -> ComponentRegistry { + let builder = { + let b = ComponentRegistryBuilder::new() + .register("WasmCache", initialize_wasm_cache) + .register("TaskManager", 
initialize_task_manager) + .register("JobManager", initialize_job_manager); + #[cfg(feature = "python")] + let b = b.register("PythonService", initialize_python_service); + b + }; - #[inline] - pub fn is_empty(&self) -> bool { - self.components.is_empty() - } + builder + .register( + "StreamCatalog", + crate::storage::stream_catalog::initialize_stream_catalog, + ) + .register("Coordinator", initialize_coordinator) + .build() +} + +pub fn bootstrap_system(config: &GlobalConfig) -> Result<()> { + let registry = build_core_registry(); + + registry.initialize_all(config)?; + + crate::storage::stream_catalog::restore_global_catalog_from_store(); + crate::storage::stream_catalog::restore_streaming_jobs_from_store(); + + info!("System bootstrap finished. Node is ready to accept traffic."); + Ok(()) } fn initialize_wasm_cache(config: &GlobalConfig) -> Result<()> { @@ -114,18 +128,20 @@ fn initialize_wasm_cache(config: &GlobalConfig) -> Result<()> { max_size: config.wasm.max_cache_size, }, ); - log::info!( - "WASM cache configuration: enabled={}, dir={}, max_size={} bytes", - config.wasm.enable_cache, - config.wasm.cache_dir, - config.wasm.max_cache_size + + debug!( + enabled = config.wasm.enable_cache, + dir = %config.wasm.cache_dir, + max_size = config.wasm.max_cache_size, + "WASM cache configured" ); + Ok(()) } fn initialize_task_manager(config: &GlobalConfig) -> Result<()> { crate::runtime::taskexecutor::TaskManager::init(config) - .context("TaskManager initialization failed")?; + .context("TaskManager service failed to start")?; Ok(()) } @@ -136,24 +152,31 @@ fn initialize_python_service(config: &GlobalConfig) -> Result<()> { Ok(()) } -fn initialize_coordinator(_config: &GlobalConfig) -> Result<()> { - crate::runtime::taskexecutor::TaskManager::get() - .context("Coordinator requires TaskManager to be initialized first")?; - log::info!("Coordinator verified and ready"); +fn initialize_job_manager(config: &GlobalConfig) -> Result<()> { + use 
crate::runtime::streaming::factory::Registry; + use crate::runtime::streaming::factory::OperatorFactory; + use crate::runtime::streaming::job::JobManager; + use std::sync::Arc; + + let registry = Arc::new(Registry::new()); + let factory = Arc::new(OperatorFactory::new(registry)); + let max_memory_bytes = config.streaming.max_memory_bytes.unwrap_or(256 * 1024 * 1024); + + JobManager::init(factory, max_memory_bytes) + .context("JobManager service failed to start")?; + Ok(()) } -pub fn register_components() -> ComponentRegistry { - let builder = { - let b = ComponentRegistryBuilder::new() - .register("WasmCache", initialize_wasm_cache) - .register("TaskManager", initialize_task_manager); - #[cfg(feature = "python")] - let b = b.register("PythonService", initialize_python_service); - b - }; +fn initialize_coordinator(_config: &GlobalConfig) -> Result<()> { + crate::runtime::taskexecutor::TaskManager::get() + .context("Dependency violation: Coordinator requires TaskManager")?; - builder - .register("Coordinator", initialize_coordinator) - .build() + crate::storage::stream_catalog::CatalogManager::global() + .context("Dependency violation: Coordinator requires StreamCatalog")?; + + crate::runtime::streaming::job::JobManager::global() + .context("Dependency violation: Coordinator requires JobManager")?; + + Ok(()) } diff --git a/src/server/mod.rs b/src/server/mod.rs index 03254af3..cb7a4a85 100644 --- a/src/server/mod.rs +++ b/src/server/mod.rs @@ -17,5 +17,5 @@ mod initializer; mod service; pub use handler::FunctionStreamServiceImpl; -pub use initializer::register_components; +pub use initializer::bootstrap_system; pub use service::start_server_with_shutdown; diff --git a/src/sql/analysis/aggregate_rewriter.rs b/src/sql/analysis/aggregate_rewriter.rs new file mode 100644 index 00000000..36024ab0 --- /dev/null +++ b/src/sql/analysis/aggregate_rewriter.rs @@ -0,0 +1,274 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file 
except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use datafusion::common::tree_node::{Transformed, TreeNodeRewriter}; +use datafusion::common::{DFSchema, DataFusionError, Result, not_impl_err, plan_err}; +use datafusion::functions_aggregate::expr_fn::max; +use datafusion::logical_expr::{Aggregate, Expr, Extension, LogicalPlan, Projection}; +use datafusion::prelude::col; +use std::sync::Arc; + +use crate::sql::schema::StreamSchemaProvider; +use crate::sql::extensions::aggregate::StreamWindowAggregateNode; +use crate::sql::extensions::key_calculation::{KeyExtractionNode, KeyExtractionStrategy}; +use crate::sql::analysis::streaming_window_analzer::StreamingWindowAnalzer; +use crate::sql::types::{ + DFField, TIMESTAMP_FIELD, WindowBehavior, WindowType, fields_with_qualifiers, find_window, + schema_from_df_fields_with_metadata, +}; + +/// AggregateRewriter transforms batch DataFusion aggregates into streaming stateful operators. +/// It handles windowing (Tumble/Hop/Session), watermarks, and continuous updating aggregates. +pub(crate) struct AggregateRewriter<'a> { + pub schema_provider: &'a StreamSchemaProvider, +} + +impl TreeNodeRewriter for AggregateRewriter<'_> { + type Node = LogicalPlan; + + fn f_up(&mut self, node: Self::Node) -> Result> { + let LogicalPlan::Aggregate(mut agg) = node else { + return Ok(Transformed::no(node)); + }; + + // 1. Identify windowing functions (e.g., tumble, hop) in GROUP BY. 
+ let mut window_exprs: Vec<_> = agg + .group_expr + .iter() + .enumerate() + .filter_map(|(i, e)| find_window(e).map(|opt| opt.map(|w| (i, w))).transpose()) + .collect::>>()?; + + if window_exprs.len() > 1 { + return not_impl_err!("Streaming aggregates support at most one window expression"); + } + + // 2. Prepare internal metadata for Key-based distribution. + let mut key_fields: Vec = fields_with_qualifiers(&agg.schema) + .iter() + .take(agg.group_expr.len()) + .map(|f| { + DFField::new( + f.qualifier().cloned(), + format!("_key_{}", f.name()), + f.data_type().clone(), + f.is_nullable(), + ) + }) + .collect(); + + // 3. Dispatch to Updating Aggregate if no windowing is detected. + let input_window = StreamingWindowAnalzer::get_window(&agg.input)?; + if window_exprs.is_empty() && input_window.is_none() { + return self.rewrite_as_updating_aggregate( + agg.input, + key_fields, + agg.group_expr, + agg.aggr_expr, + agg.schema, + ); + } + + // 4. Resolve Windowing Strategy (InData vs FromOperator). + let behavior = self.resolve_window_context( + &agg.input, + &mut agg.group_expr, + &agg.schema, + &mut window_exprs, + )?; + + // Adjust keys if windowing is handled by the operator. + if let WindowBehavior::FromOperator { window_index, .. } = &behavior { + key_fields.remove(*window_index); + } + + let key_count = key_fields.len(); + let keyed_input = + self.build_keyed_input(agg.input.clone(), &agg.group_expr, &key_fields)?; + + // 5. Build the final StreamWindowAggregateNode for the physical planner. + let mut internal_fields = fields_with_qualifiers(&agg.schema); + if let WindowBehavior::FromOperator { window_index, .. 
} = &behavior { + internal_fields.remove(*window_index); + } + let internal_schema = Arc::new(schema_from_df_fields_with_metadata( + &internal_fields, + agg.schema.metadata().clone(), + )?); + + let rewritten_agg = Aggregate::try_new_with_schema( + Arc::new(keyed_input), + agg.group_expr, + agg.aggr_expr, + internal_schema, + )?; + + let extension = StreamWindowAggregateNode::try_new( + behavior, + LogicalPlan::Aggregate(rewritten_agg), + (0..key_count).collect(), + )?; + + Ok(Transformed::yes(LogicalPlan::Extension(Extension { + node: Arc::new(extension), + }))) + } +} + +impl<'a> AggregateRewriter<'a> { + pub fn new(schema_provider: &'a StreamSchemaProvider) -> Self { + Self { schema_provider } + } + + /// [Internal] Builds the physical Key Calculation layer required for distributed Shuffling. + /// This wraps the input in a Projection and a KeyExtractionNode. + fn build_keyed_input( + &self, + input: Arc, + group_expr: &[Expr], + key_fields: &[DFField], + ) -> Result { + let key_count = group_expr.len(); + let mut projection_fields = key_fields.to_vec(); + projection_fields.extend(fields_with_qualifiers(input.schema())); + + let key_schema = Arc::new(schema_from_df_fields_with_metadata( + &projection_fields, + input.schema().metadata().clone(), + )?); + + // Map group expressions to '_key_' aliases while passing through all original columns. 
+ let mut exprs: Vec<_> = group_expr + .iter() + .zip(key_fields.iter()) + .map(|(expr, f)| expr.clone().alias(f.name().to_string())) + .collect(); + + exprs.extend( + fields_with_qualifiers(input.schema()) + .iter() + .map(|f| Expr::Column(f.qualified_column())), + ); + + let projection = + LogicalPlan::Projection(Projection::try_new_with_schema(exprs, input, key_schema)?); + + Ok(LogicalPlan::Extension(Extension { + node: Arc::new(KeyExtractionNode::new( + projection, + KeyExtractionStrategy::ColumnIndices((0..key_count).collect()), + )), + })) + } + + /// [Strategy] Rewrites standard GROUP BY into a non-windowed updating aggregate. + /// Injected max(_timestamp) ensures the streaming pulse (Watermark) continues to propagate. + fn rewrite_as_updating_aggregate( + &self, + input: Arc, + key_fields: Vec, + group_expr: Vec, + mut aggr_expr: Vec, + schema: Arc, + ) -> Result> { + let keyed_input = self.build_keyed_input(input, &group_expr, &key_fields)?; + + // Ensure the updating stream maintains time awareness. + let timestamp_col = keyed_input + .schema() + .qualified_field_with_unqualified_name(TIMESTAMP_FIELD) + .map_err(|_| { + DataFusionError::Plan( + "Required _timestamp field missing for updating aggregate".to_string(), + ) + })?; + + let timestamp_field: DFField = timestamp_col.into(); + aggr_expr.push(max(col(timestamp_field.qualified_column())).alias(TIMESTAMP_FIELD)); + + let mut output_fields = fields_with_qualifiers(&schema); + output_fields.push(timestamp_field); + + let output_schema = Arc::new(schema_from_df_fields_with_metadata( + &output_fields, + schema.metadata().clone(), + )?); + + let aggregate = Aggregate::try_new_with_schema( + Arc::new(keyed_input), + group_expr, + aggr_expr, + output_schema, + )?; + + Ok(Transformed::yes(LogicalPlan::Aggregate(aggregate))) + } + + /// [Strategy] Reconciles window definitions between the input stream and the current GROUP BY. 
+ fn resolve_window_context( + &self, + input: &LogicalPlan, + group_expr: &mut Vec, + schema: &DFSchema, + window_expr_info: &mut Vec<(usize, WindowType)>, + ) -> Result { + let mut visitor = StreamingWindowAnalzer::default(); + input.visit_with_subqueries(&mut visitor)?; + + let input_window = visitor.window; + let has_group_window = !window_expr_info.is_empty(); + + match (input_window, has_group_window) { + // Re-aggregation or subquery with an existing window. + (Some(i_win), true) => { + let (idx, g_win) = window_expr_info.pop().unwrap(); + if i_win != g_win { + return plan_err!( + "Inconsistent windowing: input is {:?}, but group by is {:?}", + i_win, + g_win + ); + } + + if let Some(field) = visitor.fields.iter().next() { + group_expr[idx] = Expr::Column(field.qualified_column()); + Ok(WindowBehavior::InData) + } else { + if matches!(i_win, WindowType::Session { .. }) { + return plan_err!("Nested session windows are not supported"); + } + group_expr.remove(idx); + Ok(WindowBehavior::FromOperator { + window: i_win, + window_field: schema.qualified_field(idx).into(), + window_index: idx, + is_nested: true, + }) + } + } + // First-time windowing defined in this aggregate. + (None, true) => { + let (idx, g_win) = window_expr_info.pop().unwrap(); + group_expr.remove(idx); + Ok(WindowBehavior::FromOperator { + window: g_win, + window_field: schema.qualified_field(idx).into(), + window_index: idx, + is_nested: false, + }) + } + // Passthrough: input is already windowed, no new window in group by. 
+ (Some(_), false) => Ok(WindowBehavior::InData), + _ => unreachable!("Dispatched to non-windowed path previously"), + } + } +} diff --git a/src/sql/analysis/async_udf_rewriter.rs b/src/sql/analysis/async_udf_rewriter.rs new file mode 100644 index 00000000..073a1f42 --- /dev/null +++ b/src/sql/analysis/async_udf_rewriter.rs @@ -0,0 +1,133 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use crate::sql::extensions::remote_table::RemoteTableBoundaryNode; +use crate::sql::common::constants::sql_field; +use crate::sql::extensions::AsyncFunctionExecutionNode; +use crate::sql::schema::StreamSchemaProvider; +use datafusion::common::tree_node::{Transformed, TreeNode, TreeNodeRewriter}; +use datafusion::common::{Column, Result as DFResult, TableReference, plan_err}; +use datafusion::logical_expr::expr::ScalarFunction; +use datafusion::logical_expr::{Expr, Extension, LogicalPlan}; +use std::sync::Arc; +use std::time::Duration; + +type AsyncSplitResult = (String, AsyncOptions, Vec); + +#[derive(Debug, Clone, Copy)] +pub struct AsyncOptions { + pub ordered: bool, + pub max_concurrency: usize, + pub timeout: Duration, +} + +pub struct AsyncUdfRewriter<'a> { + provider: &'a StreamSchemaProvider, +} + +impl<'a> AsyncUdfRewriter<'a> { + pub fn new(provider: &'a StreamSchemaProvider) -> Self { + Self { provider } + } + + fn split_async( + expr: Expr, + provider: &StreamSchemaProvider, + ) -> DFResult<(Expr, Option)> { + let mut found: Option<(String, AsyncOptions, Vec)> = 
None; + let expr = expr.transform_up(|e| { + if let Expr::ScalarFunction(ScalarFunction { func: udf, args }) = &e { + if let Some(opts) = provider.get_async_udf_options(udf.name()) { + if found + .replace((udf.name().to_string(), opts, args.clone())) + .is_some() + { + return plan_err!( + "multiple async calls in the same expression, which is not allowed" + ); + } + return Ok(Transformed::yes(Expr::Column(Column::new_unqualified( + sql_field::ASYNC_RESULT, + )))); + } + } + Ok(Transformed::no(e)) + })?; + + Ok((expr.data, found)) + } +} + +impl TreeNodeRewriter for AsyncUdfRewriter<'_> { + type Node = LogicalPlan; + + fn f_up(&mut self, node: Self::Node) -> DFResult> { + let LogicalPlan::Projection(mut projection) = node else { + for e in node.expressions() { + if let (_, Some((udf, _, _))) = Self::split_async(e.clone(), self.provider)? { + return plan_err!( + "async UDFs are only supported in projections, but {udf} was called in another context" + ); + } + } + return Ok(Transformed::no(node)); + }; + + let mut args = None; + for e in projection.expr.iter_mut() { + let (new_e, Some(udf)) = Self::split_async(e.clone(), self.provider)? 
else { + continue; + }; + if let Some((prev, _, _)) = args.replace(udf) { + return plan_err!( + "Projection contains multiple async UDFs, which is not supported \ + \n(hint: two async UDF calls, {} and {}, appear in the same SELECT statement)", + prev, + args.unwrap().0 + ); + } + *e = new_e; + } + + let Some((name, opts, arg_exprs)) = args else { + return Ok(Transformed::no(LogicalPlan::Projection(projection))); + }; + let udf = self.provider.dylib_udfs.get(&name).unwrap().clone(); + + let input = if matches!(*projection.input, LogicalPlan::Projection(..)) { + Arc::new(LogicalPlan::Extension(Extension { + node: Arc::new(RemoteTableBoundaryNode { + upstream_plan: (*projection.input).clone(), + table_identifier: TableReference::bare("subquery_projection"), + resolved_schema: projection.input.schema().clone(), + requires_materialization: false, + }), + })) + } else { + projection.input + }; + + Ok(Transformed::yes(LogicalPlan::Extension(Extension { + node: Arc::new(AsyncFunctionExecutionNode { + upstream_plan: input, + operator_name: name, + function_config: udf, + invocation_args: arg_exprs, + result_projections: projection.expr, + preserve_ordering: opts.ordered, + concurrency_limit: opts.max_concurrency, + execution_timeout: opts.timeout, + resolved_schema: projection.schema, + }), + }))) + } +} diff --git a/src/sql/analysis/join_rewriter.rs b/src/sql/analysis/join_rewriter.rs new file mode 100644 index 00000000..058a5bd8 --- /dev/null +++ b/src/sql/analysis/join_rewriter.rs @@ -0,0 +1,237 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +use crate::sql::schema::StreamSchemaProvider; +use crate::sql::extensions::join::StreamingJoinNode; +use crate::sql::extensions::key_calculation::KeyExtractionNode; +use crate::sql::analysis::streaming_window_analzer::StreamingWindowAnalzer; +use crate::sql::types::{WindowType, fields_with_qualifiers, schema_from_df_fields_with_metadata}; +use crate::sql::common::constants::mem_exec_join_side; +use crate::sql::common::TIMESTAMP_FIELD; +use datafusion::common::tree_node::{Transformed, TreeNodeRewriter}; +use datafusion::common::{ + JoinConstraint, JoinType, Result, ScalarValue, TableReference, + not_impl_err, plan_err, +}; +use datafusion::logical_expr::{ + self, BinaryExpr, Case, Expr, Extension, Join, LogicalPlan, Projection, build_join_schema, +}; +use datafusion::prelude::coalesce; +use std::sync::Arc; + +/// JoinRewriter handles the transformation of standard SQL joins into streaming-capable joins. +/// It manages stateful "Updating Joins" and time-aligned "Instant Joins". +pub(crate) struct JoinRewriter<'a> { + pub schema_provider: &'a StreamSchemaProvider, +} + +impl<'a> JoinRewriter<'a> { + pub fn new(schema_provider: &'a StreamSchemaProvider) -> Self { + Self { schema_provider } + } + + /// [Validation] Ensures left and right streams have compatible windowing strategies. + fn validate_join_windows(&self, join: &Join) -> Result { + let left_win = StreamingWindowAnalzer::get_window(&join.left)?; + let right_win = StreamingWindowAnalzer::get_window(&join.right)?; + + match (left_win, right_win) { + (None, None) => { + if join.join_type == JoinType::Inner { + Ok(false) // Standard Updating Join (Inner) + } else { + plan_err!( + "Non-inner joins (e.g., LEFT/RIGHT) require windowing to bound state." + ) + } + } + (Some(l), Some(r)) => { + if l != r { + return plan_err!( + "Join window mismatch: left={:?}, right={:?}. 
Windows must match exactly.", + l, + r + ); + } + if let WindowType::Session { .. } = l { + return plan_err!( + "Session windows are currently not supported in streaming joins." + ); + } + Ok(true) // Instant Windowed Join + } + _ => plan_err!( + "Mixed windowing detected. Both sides of a join must be either windowed or non-windowed." + ), + } + } + + /// [Internal] Wraps a join input in a key-extraction layer to facilitate shuffle / key-by distribution. + fn build_keyed_side( + &self, + input: Arc, + keys: Vec, + side: &str, + ) -> Result { + let key_count = keys.len(); + + let projection_exprs = keys + .into_iter() + .enumerate() + .map(|(i, e)| { + e.alias_qualified(Some(TableReference::bare("_stream")), format!("_key_{i}")) + }) + .chain( + fields_with_qualifiers(input.schema()) + .iter() + .map(|f| Expr::Column(f.qualified_column())), + ) + .collect(); + + let projection = Projection::try_new(projection_exprs, input)?; + let key_ext = KeyExtractionNode::try_new_with_projection( + LogicalPlan::Projection(projection), + (0..key_count).collect(), + side.to_string(), + )?; + + Ok(LogicalPlan::Extension(Extension { + node: Arc::new(key_ext), + })) + } + + /// [Strategy] Resolves the output timestamp of the join. + /// Streaming joins must output the 'max' of the two input timestamps to ensure Watermark progression. + fn apply_timestamp_resolution(&self, join_plan: LogicalPlan) -> Result { + let schema = join_plan.schema(); + let all_fields = fields_with_qualifiers(schema); + + let timestamp_fields: Vec<_> = all_fields + .iter() + .filter(|f| f.name() == "_timestamp") + .cloned() + .collect(); + + if timestamp_fields.len() != 2 { + return plan_err!( + "Streaming join requires exactly two input timestamp fields to resolve output time." 
+ ); + } + + // Project all fields except the two raw timestamps + let mut exprs: Vec<_> = all_fields + .iter() + .filter(|f| f.name() != "_timestamp") + .map(|f| Expr::Column(f.qualified_column())) + .collect(); + + // Calculate: GREATEST(left._timestamp, right._timestamp) + let left_ts = Expr::Column(timestamp_fields[0].qualified_column()); + let right_ts = Expr::Column(timestamp_fields[1].qualified_column()); + + let max_ts_expr = Expr::Case(Case { + expr: Some(Box::new(Expr::BinaryExpr(BinaryExpr { + left: Box::new(left_ts.clone()), + op: logical_expr::Operator::GtEq, + right: Box::new(right_ts.clone()), + }))), + when_then_expr: vec![ + ( + Box::new(Expr::Literal(ScalarValue::Boolean(Some(true)), None)), + Box::new(left_ts.clone()), + ), + ( + Box::new(Expr::Literal(ScalarValue::Boolean(Some(false)), None)), + Box::new(right_ts.clone()), + ), + ], + else_expr: Some(Box::new(coalesce(vec![left_ts, right_ts]))), + }) + .alias(TIMESTAMP_FIELD); + + exprs.push(max_ts_expr); + + let out_fields: Vec<_> = all_fields + .iter() + .filter(|f| f.name() != "_timestamp") + .cloned() + .chain(std::iter::once(timestamp_fields[0].clone())) + .collect(); + + let out_schema = Arc::new(schema_from_df_fields_with_metadata( + &out_fields, + schema.metadata().clone(), + )?); + + Ok(LogicalPlan::Projection(Projection::try_new_with_schema( + exprs, + Arc::new(join_plan), + out_schema, + )?)) + } +} + +impl TreeNodeRewriter for JoinRewriter<'_> { + type Node = LogicalPlan; + + fn f_up(&mut self, node: Self::Node) -> Result> { + let LogicalPlan::Join(join) = node else { + return Ok(Transformed::no(node)); + }; + + // 1. Validate Streaming Context + let is_instant = self.validate_join_windows(&join)?; + if join.join_constraint != JoinConstraint::On { + return not_impl_err!("Only 'ON' join constraints are supported in streaming SQL."); + } + if join.on.is_empty() && !is_instant { + return plan_err!("Updating joins require at least one equality condition (Equijoin)."); + } + + // 2. 
Prepare Keyed Inputs for Shuffle + let (left_on, right_on): (Vec<_>, Vec<_>) = join.on.clone().into_iter().unzip(); + let keyed_left = self.build_keyed_side(join.left, left_on, mem_exec_join_side::LEFT)?; + let keyed_right = self.build_keyed_side(join.right, right_on, mem_exec_join_side::RIGHT)?; + + // 3. Assemble Rewritten Join Node + let join_schema = Arc::new(build_join_schema( + keyed_left.schema(), + keyed_right.schema(), + &join.join_type, + )?); + let rewritten_join = LogicalPlan::Join(Join { + left: Arc::new(keyed_left), + right: Arc::new(keyed_right), + on: join.on, + filter: join.filter, + join_type: join.join_type, + join_constraint: JoinConstraint::On, + schema: join_schema, + null_equals_null: false, + }); + + // 4. Resolve Output Watermark (Timestamp Projection) + let plan_with_timestamp = self.apply_timestamp_resolution(rewritten_join)?; + + // 5. Wrap in StreamingJoinNode for physical planning + let state_retention_ttl = (!is_instant).then_some(self.schema_provider.planning_options.ttl); + let extension = StreamingJoinNode::new( + plan_with_timestamp, + is_instant, + state_retention_ttl, + ); + + Ok(Transformed::yes(LogicalPlan::Extension(Extension { + node: Arc::new(extension), + }))) + } +} diff --git a/src/sql/analysis/mod.rs b/src/sql/analysis/mod.rs new file mode 100644 index 00000000..cd26a4e6 --- /dev/null +++ b/src/sql/analysis/mod.rs @@ -0,0 +1,217 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#![allow(clippy::new_without_default)] + +pub(crate) mod aggregate_rewriter; +pub(crate) mod join_rewriter; +pub(crate) mod row_time_rewriter; +pub(crate) mod stream_rewriter; +pub(crate) mod streaming_window_analzer; +pub(crate) mod window_function_rewriter; + +pub mod async_udf_rewriter; +pub mod sink_input_rewriter; +pub mod source_metadata_visitor; +pub mod source_rewriter; +pub mod time_window; +pub mod unnest_rewriter; + +pub use async_udf_rewriter::AsyncOptions; +pub use sink_input_rewriter::SinkInputRewriter; +pub use time_window::{TimeWindowNullCheckRemover, TimeWindowUdfChecker}; +pub use unnest_rewriter::UNNESTED_COL; + +pub use crate::sql::schema::schema_provider::StreamSchemaProvider; + +use std::collections::HashMap; +use std::sync::Arc; + +use datafusion::common::tree_node::{Transformed, TreeNode}; +use datafusion::common::{Result, plan_err}; +use datafusion::error::DataFusionError; +use datafusion::logical_expr::{Extension, LogicalPlan, UserDefinedLogicalNodeCore}; +use tracing::{debug, info, instrument}; + +use crate::sql::extensions::key_calculation::{KeyExtractionNode, KeyExtractionStrategy}; +use crate::sql::extensions::projection::StreamProjectionNode; +use crate::sql::extensions::sink::StreamEgressNode; +use crate::sql::extensions::StreamingOperatorBlueprint; +use crate::sql::logical_planner::planner::NamedNode; + +fn duration_from_sql_expr( + expr: &datafusion::sql::sqlparser::ast::Expr, +) -> Result { + use datafusion::sql::sqlparser::ast::Expr as SqlExpr; + use datafusion::sql::sqlparser::ast::Value as SqlValue; + use datafusion::sql::sqlparser::ast::ValueWithSpan; + + match expr { + SqlExpr::Interval(interval) => { + let value_str = match interval.value.as_ref() { + SqlExpr::Value(ValueWithSpan { + value: SqlValue::SingleQuotedString(s), + .. 
+ }) => s.clone(), + other => return plan_err!("expected interval string literal, found {other}"), + }; + + parse_interval_to_duration(&value_str) + } + SqlExpr::Value(ValueWithSpan { + value: SqlValue::SingleQuotedString(s), + .. + }) => parse_interval_to_duration(s), + other => plan_err!("expected an interval expression, found {other}"), + } +} + +fn parse_interval_to_duration(s: &str) -> Result { + let parts: Vec<&str> = s.trim().split_whitespace().collect(); + if parts.len() != 2 { + return plan_err!("invalid interval string '{s}'; expected ' '"); + } + let value: u64 = parts[0] + .parse() + .map_err(|_| DataFusionError::Plan(format!("invalid interval number: {}", parts[0])))?; + match parts[1].to_lowercase().as_str() { + "second" | "seconds" | "s" => Ok(std::time::Duration::from_secs(value)), + "minute" | "minutes" | "min" => Ok(std::time::Duration::from_secs(value * 60)), + "hour" | "hours" | "h" => Ok(std::time::Duration::from_secs(value * 3600)), + "day" | "days" | "d" => Ok(std::time::Duration::from_secs(value * 86400)), + unit => plan_err!("unsupported interval unit '{unit}'"), + } +} + +fn build_sink_inputs(extensions: &[LogicalPlan]) -> HashMap> { + let mut sink_inputs = HashMap::>::new(); + for extension in extensions.iter() { + if let LogicalPlan::Extension(ext) = extension { + if let Some(sink_node) = ext.node.as_any().downcast_ref::() { + if let Some(named_node) = sink_node.operator_identity() { + let inputs = sink_node + .inputs() + .into_iter() + .cloned() + .collect::>(); + sink_inputs.entry(named_node).or_default().extend(inputs); + } + } + } + } + sink_inputs +} + +pub(crate) fn maybe_add_key_extension_to_sink(plan: LogicalPlan) -> Result { + let LogicalPlan::Extension(ref ext) = plan else { + return Ok(plan); + }; + + let Some(sink) = ext.node.as_any().downcast_ref::() else { + return Ok(plan); + }; + + let Some(partition_exprs) = sink.destination_table.partition_exprs() else { + return Ok(plan); + }; + + if partition_exprs.is_empty() { + 
return Ok(plan); + } + + let inputs = plan + .inputs() + .into_iter() + .map(|input| { + Ok(LogicalPlan::Extension(Extension { + node: Arc::new(KeyExtractionNode { + operator_label: Some("key-calc-partition".to_string()), + resolved_schema: input.schema().clone(), + upstream_plan: input.clone(), + extraction_strategy: KeyExtractionStrategy::CalculatedExpressions( + partition_exprs.clone(), + ), + }), + })) + }) + .collect::>()?; + + use datafusion::prelude::col; + let unkey = LogicalPlan::Extension(Extension { + node: Arc::new( + StreamProjectionNode::try_new( + inputs, + Some("unkey".to_string()), + sink.schema().iter().map(|(_, f)| col(f.name())).collect(), + )? + .with_shuffle_routing(), + ), + }); + + let node = sink.with_exprs_and_inputs(vec![], vec![unkey])?; + Ok(LogicalPlan::Extension(Extension { + node: Arc::new(node), + })) +} + +pub fn rewrite_sinks(extensions: Vec) -> Result> { + let mut sink_inputs = build_sink_inputs(&extensions); + let mut new_extensions = vec![]; + for extension in extensions { + let mut rewriter = SinkInputRewriter::new(&mut sink_inputs); + let result = extension.rewrite(&mut rewriter)?; + if !rewriter.was_removed { + new_extensions.push(result.data); + } + } + + new_extensions + .into_iter() + .map(maybe_add_key_extension_to_sink) + .collect() + +} + +/// Entry point for transforming a standard DataFusion LogicalPlan into a +/// Streaming-aware LogicalPlan. +/// +/// This function coordinates multiple rewriting passes and ensures the +/// resulting plan satisfies streaming constraints. +#[instrument(skip_all, level = "debug")] +pub fn rewrite_plan( + plan: LogicalPlan, + schema_provider: &StreamSchemaProvider, +) -> Result { + info!("Starting streaming plan rewrite pipeline"); + + let Transformed { + data: plan, .. 
+ } = plan.rewrite_with_subqueries(&mut source_rewriter::SourceRewriter::new(schema_provider))?; + + let mut rewriter = stream_rewriter::StreamRewriter::new(schema_provider); + let Transformed { + data: rewritten_plan, + .. + } = plan.rewrite_with_subqueries(&mut rewriter)?; + + rewritten_plan.visit_with_subqueries(&mut TimeWindowUdfChecker {})?; + + if cfg!(debug_assertions) { + debug!( + "Streaming logical plan graphviz:\n{}", + rewritten_plan.display_graphviz() + ); + } + + info!("Streaming plan rewrite completed successfully"); + Ok(rewritten_plan) +} diff --git a/src/sql/analysis/row_time_rewriter.rs b/src/sql/analysis/row_time_rewriter.rs new file mode 100644 index 00000000..13e2a048 --- /dev/null +++ b/src/sql/analysis/row_time_rewriter.rs @@ -0,0 +1,49 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use datafusion::common::tree_node::{Transformed, TreeNodeRewriter}; +use datafusion::common::{Column, Result as DFResult}; +use datafusion::logical_expr::Expr; + +use crate::sql::common::constants::planning_placeholder_udf; +use crate::sql::types::TIMESTAMP_FIELD; + +/// Replaces the virtual `row_time()` scalar function with a physical reference to `_timestamp`. +/// +/// This is a critical mapping step that allows users to use a friendly SQL function +/// while the engine operates on the mandatory internal streaming timestamp. 
+pub struct RowTimeRewriter; + +impl TreeNodeRewriter for RowTimeRewriter { + type Node = Expr; + + fn f_down(&mut self, node: Self::Node) -> DFResult> { + // Use pattern matching to identify the `row_time` scalar function. + if let Expr::ScalarFunction(func) = &node + && func.name() == planning_placeholder_udf::ROW_TIME + { + // Map the virtual function to the physical internal timestamp column. + // We use .alias() to preserve the original name "row_time()" in the output schema, + // ensuring that user-facing column names do not change unexpectedly. + let physical_col = Expr::Column(Column { + relation: None, + name: TIMESTAMP_FIELD.to_string(), + spans: Default::default(), + }) + .alias("row_time()"); + + return Ok(Transformed::yes(physical_col)); + } + + Ok(Transformed::no(node)) + } +} diff --git a/src/sql/analysis/sink_input_rewriter.rs b/src/sql/analysis/sink_input_rewriter.rs new file mode 100644 index 00000000..ad36046f --- /dev/null +++ b/src/sql/analysis/sink_input_rewriter.rs @@ -0,0 +1,59 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use crate::sql::extensions::sink::StreamEgressNode; +use crate::sql::extensions::StreamingOperatorBlueprint; +use datafusion::common::Result as DFResult; +use datafusion::common::tree_node::{Transformed, TreeNodeRecursion, TreeNodeRewriter}; +use datafusion::logical_expr::{Extension, LogicalPlan, UserDefinedLogicalNodeCore}; +use std::collections::HashMap; +use std::sync::Arc; +use crate::sql::logical_planner::planner::NamedNode; + +type SinkInputs = HashMap>; + +/// Merges inputs for sinks with the same name to avoid duplicate sinks in the plan. +pub struct SinkInputRewriter<'a> { + sink_inputs: &'a mut SinkInputs, + pub was_removed: bool, +} + +impl<'a> SinkInputRewriter<'a> { + pub(crate) fn new(sink_inputs: &'a mut SinkInputs) -> Self { + Self { + sink_inputs, + was_removed: false, + } + } +} + +impl TreeNodeRewriter for SinkInputRewriter<'_> { + type Node = LogicalPlan; + + fn f_down(&mut self, node: Self::Node) -> DFResult> { + if let LogicalPlan::Extension(extension) = &node { + if let Some(sink_node) = extension.node.as_any().downcast_ref::() { + if let Some(named_node) = sink_node.operator_identity() { + if let Some(inputs) = self.sink_inputs.remove(&named_node) { + let new_node = LogicalPlan::Extension(Extension { + node: Arc::new(sink_node.with_exprs_and_inputs(vec![], inputs)?), + }); + return Ok(Transformed::new(new_node, true, TreeNodeRecursion::Jump)); + } else { + self.was_removed = true; + } + } + } + } + Ok(Transformed::no(node)) + } +} diff --git a/src/sql/analysis/source_metadata_visitor.rs b/src/sql/analysis/source_metadata_visitor.rs new file mode 100644 index 00000000..81b9b179 --- /dev/null +++ b/src/sql/analysis/source_metadata_visitor.rs @@ -0,0 +1,69 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use crate::sql::extensions::sink::{StreamEgressNode, STREAM_EGRESS_NODE_NAME}; +use crate::sql::extensions::table_source::{StreamIngestionNode, STREAM_INGESTION_NODE_NAME}; +use crate::sql::schema::StreamSchemaProvider; +use datafusion::common::Result as DFResult; +use datafusion::common::tree_node::{TreeNodeRecursion, TreeNodeVisitor}; +use datafusion::logical_expr::{Extension, LogicalPlan}; +use std::collections::HashSet; + +/// Collects connection IDs from source and sink nodes in the logical plan. +pub struct SourceMetadataVisitor<'a> { + schema_provider: &'a StreamSchemaProvider, + pub connection_ids: HashSet, +} + +impl<'a> SourceMetadataVisitor<'a> { + pub fn new(schema_provider: &'a StreamSchemaProvider) -> Self { + Self { + schema_provider, + connection_ids: HashSet::new(), + } + } + + fn get_connection_id(&self, node: &LogicalPlan) -> Option { + let LogicalPlan::Extension(Extension { node }) = node else { + return None; + }; + + let table_name = match node.name() { + name if name == STREAM_INGESTION_NODE_NAME => { + let ext = node.as_any().downcast_ref::()?; + ext.source_identifier.to_string() + } + name if name == STREAM_EGRESS_NODE_NAME => { + let ext = node.as_any().downcast_ref::()?; + ext.target_identifier.to_string() + } + _ => return None, + }; + + let table = self.schema_provider.get_catalog_table(&table_name)?; + match table { + crate::sql::schema::table::Table::ConnectorTable(t) => t.registry_id, + _ => None, + } + } +} + +impl TreeNodeVisitor<'_> for SourceMetadataVisitor<'_> { + type Node = LogicalPlan; + + fn f_down(&mut 
self, node: &Self::Node) -> DFResult { + if let Some(id) = self.get_connection_id(node) { + self.connection_ids.insert(id); + } + Ok(TreeNodeRecursion::Continue) + } +} diff --git a/src/sql/analysis/source_rewriter.rs b/src/sql/analysis/source_rewriter.rs new file mode 100644 index 00000000..0ade3ea1 --- /dev/null +++ b/src/sql/analysis/source_rewriter.rs @@ -0,0 +1,299 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::Arc; +use std::time::Duration; + +use datafusion::common::ScalarValue; +use datafusion::common::tree_node::{Transformed, TreeNodeRewriter}; +use datafusion::common::{Column, DataFusionError, Result as DFResult, TableReference, plan_err}; +use datafusion::logical_expr::{ + self, BinaryExpr, Expr, Extension, LogicalPlan, Projection, TableScan, +}; + +use crate::sql::schema::source_table::SourceTable; +use crate::sql::schema::ColumnDescriptor; +use crate::sql::schema::table::Table; +use crate::sql::schema::StreamSchemaProvider; +use crate::sql::common::UPDATING_META_FIELD; +use crate::sql::extensions::debezium::UnrollDebeziumPayloadNode; +use crate::sql::extensions::remote_table::RemoteTableBoundaryNode; +use crate::sql::extensions::table_source::StreamIngestionNode; +use crate::sql::extensions::watermark_node::EventTimeWatermarkNode; +use crate::sql::types::TIMESTAMP_FIELD; + +/// Rewrites table scans: projections are lifted out of scans into a dedicated projection node +/// (including virtual fields), using a connector 
table-source extension instead of a bare +/// `TableScan`, optionally with Debezium unrolling for updating sources, then remote boundary and +/// watermark. +pub struct SourceRewriter<'a> { + pub(crate) schema_provider: &'a StreamSchemaProvider, +} + +impl<'a> SourceRewriter<'a> { + pub fn new(schema_provider: &'a StreamSchemaProvider) -> Self { + Self { schema_provider } + } +} + +impl SourceRewriter<'_> { + fn projection_expr_for_column(col: &ColumnDescriptor, qualifier: &TableReference) -> Expr { + if let Some(logic) = col.computation_logic() { + logic + .clone() + .alias_qualified(Some(qualifier.clone()), col.arrow_field().name().to_string()) + } else { + Expr::Column(Column { + relation: Some(qualifier.clone()), + name: col.arrow_field().name().to_string(), + spans: Default::default(), + }) + } + } + + fn watermark_expression(table: &SourceTable) -> DFResult { + match table.temporal_config.watermark_strategy_column.clone() { + Some(watermark_field) => table + .schema_specs + .iter() + .find_map(|c| { + if c.arrow_field().name() == watermark_field.as_str() { + return if let Some(expr) = c.computation_logic() { + Some(expr.clone()) + } else { + Some(Expr::Column(Column { + relation: None, + name: c.arrow_field().name().to_string(), + spans: Default::default(), + })) + }; + } + None + }) + .ok_or_else(|| { + DataFusionError::Plan(format!("Watermark field {watermark_field} not found")) + }), + None => Ok(Expr::BinaryExpr(BinaryExpr { + left: Box::new(Expr::Column(Column { + relation: None, + name: TIMESTAMP_FIELD.to_string(), + spans: Default::default(), + })), + op: logical_expr::Operator::Minus, + right: Box::new(Expr::Literal( + ScalarValue::DurationNanosecond(Some(Duration::from_secs(1).as_nanos() as i64)), + None, + )), + })), + } + } + + fn projection_expressions( + table: &SourceTable, + qualifier: &TableReference, + projection: &Option>, + ) -> DFResult> { + let mut expressions: Vec = table + .schema_specs + .iter() + .map(|col| 
Self::projection_expr_for_column(col, qualifier)) + .collect(); + + if let Some(proj) = projection { + expressions = proj.iter().map(|i| expressions[*i].clone()).collect(); + } + + if let Some(event_time_field) = table.temporal_config.event_column.clone() { + let expr = table + .schema_specs + .iter() + .find_map(|c| { + if c.arrow_field().name() == event_time_field.as_str() { + return Some(Self::projection_expr_for_column(c, qualifier)); + } + None + }) + .ok_or_else(|| { + DataFusionError::Plan(format!("Event time field {event_time_field} not found")) + })?; + + expressions + .push(expr.alias_qualified(Some(qualifier.clone()), TIMESTAMP_FIELD.to_string())); + } else { + let has_ts = table + .schema_specs + .iter() + .any(|c| c.arrow_field().name() == TIMESTAMP_FIELD); + if !has_ts { + return plan_err!( + "Connector table '{}' has no `{}` column; declare WATERMARK FOR AS ... in CREATE TABLE", + table.table_identifier, + TIMESTAMP_FIELD + ); + } + expressions.push(Expr::Column(Column::new( + Some(qualifier.clone()), + TIMESTAMP_FIELD, + ))); + } + + if table.is_updating() { + expressions.push(Expr::Column(Column::new( + Some(qualifier.clone()), + UPDATING_META_FIELD, + ))); + } + + Ok(expressions) + } + + + /// Connector path: `StreamIngestionNode` (table source) → optional `UnrollDebeziumPayloadNode` + /// → `Projection`, mirroring Arroyo `TableSourceExtension` + Debezium unroll + projection. 
+ fn projection(&self, table_scan: &TableScan, table: &SourceTable) -> DFResult { + let qualifier = table_scan.table_name.clone(); + + let table_source = LogicalPlan::Extension(Extension { + node: Arc::new(StreamIngestionNode::try_new( + qualifier.clone(), + table.clone(), + )?), + }); + + let (projection_input, scan_projection) = if table.is_updating() { + if table.key_constraints.is_empty() { + return plan_err!( + "Updating connector table `{}` requires at least one PRIMARY KEY for CDC unrolling", + table.table_identifier + ); + } + let unrolled = LogicalPlan::Extension(Extension { + node: Arc::new(UnrollDebeziumPayloadNode::try_new( + table_source, + Arc::new(table.key_constraints.clone()), + )?), + }); + (unrolled, None) + } else { + (table_source, table_scan.projection.clone()) + }; + + Ok(LogicalPlan::Projection(Projection::try_new( + Self::projection_expressions(table, &qualifier, &scan_projection)?, + Arc::new(projection_input), + )?)) + } + + fn mutate_connector_table( + &self, + table_scan: &TableScan, + table: &SourceTable, + ) -> DFResult> { + let input = self.projection(table_scan, table)?; + + let schema = input.schema().clone(); + let remote = LogicalPlan::Extension(Extension { + node: Arc::new(RemoteTableBoundaryNode { + upstream_plan: input, + table_identifier: table_scan.table_name.to_owned(), + resolved_schema: schema, + requires_materialization: true, + }), + }); + + let watermark_node = EventTimeWatermarkNode::try_new( + remote, + table_scan.table_name.clone(), + Self::watermark_expression(table)?, + ) + .map_err(|err| { + DataFusionError::Internal(format!("failed to create watermark node: {err}")) + })?; + + Ok(Transformed::yes(LogicalPlan::Extension(Extension { + node: Arc::new(watermark_node), + }))) + } + + fn mutate_table_from_query( + &self, + table_scan: &TableScan, + logical_plan: &LogicalPlan, + ) -> DFResult> { + let column_expressions: Vec<_> = if let Some(projection) = &table_scan.projection { + logical_plan + .schema() + .columns() 
+ .into_iter() + .enumerate() + .filter_map(|(i, col)| { + if projection.contains(&i) { + Some(Expr::Column(col)) + } else { + None + } + }) + .collect() + } else { + logical_plan + .schema() + .columns() + .into_iter() + .map(Expr::Column) + .collect() + }; + + let target_columns: Vec<_> = table_scan.projected_schema.columns().into_iter().collect(); + + let expressions = column_expressions + .into_iter() + .zip(target_columns) + .map(|(expr, col)| expr.alias_qualified(col.relation, col.name)) + .collect(); + + let projection = LogicalPlan::Projection(Projection::try_new_with_schema( + expressions, + Arc::new(logical_plan.clone()), + table_scan.projected_schema.clone(), + )?); + + Ok(Transformed::yes(projection)) + } +} + +impl TreeNodeRewriter for SourceRewriter<'_> { + type Node = LogicalPlan; + + fn f_up(&mut self, node: Self::Node) -> DFResult> { + let LogicalPlan::TableScan(table_scan) = node else { + return Ok(Transformed::no(node)); + }; + + let table_name = table_scan.table_name.table(); + let table = self + .schema_provider + .get_catalog_table(table_name) + .ok_or_else(|| DataFusionError::Plan(format!("Table {table_name} not found")))?; + + match table { + Table::ConnectorTable(table) => self.mutate_connector_table(&table_scan, table), + Table::LookupTable(_table) => { + // TODO: implement LookupSource extension + plan_err!("Lookup tables are not yet supported") + } + Table::TableFromQuery { + name: _, + logical_plan, + } => self.mutate_table_from_query(&table_scan, logical_plan), + } + } +} diff --git a/src/sql/analysis/stream_rewriter.rs b/src/sql/analysis/stream_rewriter.rs new file mode 100644 index 00000000..a62a7bd1 --- /dev/null +++ b/src/sql/analysis/stream_rewriter.rs @@ -0,0 +1,231 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::Arc; + +use super::StreamSchemaProvider; +use crate::sql::extensions::StreamingOperatorBlueprint; +use crate::sql::extensions::remote_table::RemoteTableBoundaryNode; +use crate::sql::analysis::row_time_rewriter::RowTimeRewriter; +use crate::sql::analysis::{ + aggregate_rewriter::AggregateRewriter, join_rewriter::JoinRewriter, + window_function_rewriter::WindowFunctionRewriter, +}; +use crate::sql::analysis::TimeWindowNullCheckRemover; +use crate::sql::schema::utils::{add_timestamp_field, has_timestamp_field}; +use crate::sql::types::{DFField, TIMESTAMP_FIELD}; +use datafusion::common::tree_node::{Transformed, TreeNodeRewriter}; +use datafusion::common::{Column, DataFusionError, Result, Spans, TableReference, plan_err}; +use datafusion::logical_expr::{ + Expr, Extension, Filter, LogicalPlan, Projection, SubqueryAlias, Union, +}; +use datafusion_common::tree_node::TreeNode; +use datafusion_expr::{Aggregate, Join}; + +pub struct StreamRewriter<'a> { + pub(crate) schema_provider: &'a StreamSchemaProvider, +} + +impl TreeNodeRewriter for StreamRewriter<'_> { + type Node = LogicalPlan; + + fn f_up(&mut self, node: Self::Node) -> Result> { + match node { + // Logic Delegation + LogicalPlan::Projection(p) => self.rewrite_projection(p), + LogicalPlan::Filter(f) => self.rewrite_filter(f), + LogicalPlan::Union(u) => self.rewrite_union(u), + + // Delegation to specialized sub-rewriters + LogicalPlan::Aggregate(agg) => self.rewrite_aggregate(agg), + LogicalPlan::Join(join) => self.rewrite_join(join), + LogicalPlan::Window(_) => 
self.rewrite_window(node), + LogicalPlan::SubqueryAlias(sa) => self.rewrite_subquery_alias(sa), + + // Explicitly Unsupported Operations + LogicalPlan::Sort(_) => self.unsupported_error("ORDER BY", &node), + LogicalPlan::Limit(_) => self.unsupported_error("LIMIT", &node), + LogicalPlan::Repartition(_) => self.unsupported_error("Repartitions", &node), + LogicalPlan::Explain(_) => self.unsupported_error("EXPLAIN", &node), + LogicalPlan::Analyze(_) => self.unsupported_error("ANALYZE", &node), + + _ => Ok(Transformed::no(node)), + } + } +} + +impl<'a> StreamRewriter<'a> { + pub fn new(schema_provider: &'a StreamSchemaProvider) -> Self { + Self { schema_provider } + } + + /// Delegates to AggregateRewriter to transform batch aggregates into streaming stateful operators. + fn rewrite_aggregate(&self, agg: Aggregate) -> Result> { + AggregateRewriter { + schema_provider: self.schema_provider, + } + .f_up(LogicalPlan::Aggregate(agg)) + } + + /// Delegates to JoinRewriter to handle streaming join semantics (e.g., TTL, state management). + fn rewrite_join(&self, join: Join) -> Result> { + JoinRewriter { + schema_provider: self.schema_provider, + } + .f_up(LogicalPlan::Join(join)) + } + + /// Delegates to WindowFunctionRewriter for stream-aware windowing logic. + fn rewrite_window(&self, node: LogicalPlan) -> Result> { + WindowFunctionRewriter {}.f_up(node) + } + + /// Refreshes SubqueryAlias metadata to align with potentially rewritten internal schemas. + fn rewrite_subquery_alias(&self, sa: SubqueryAlias) -> Result> { + // Since the inner 'sa.input' has been rewritten (bottom-up), we must re-create + // the alias node to ensure the outer schema correctly reflects internal changes. 
+ let new_sa = SubqueryAlias::try_new(sa.input, sa.alias).map_err(|e| { + DataFusionError::Internal(format!("Failed to re-alias subquery: {}", e)) + })?; + + Ok(Transformed::yes(LogicalPlan::SubqueryAlias(new_sa))) + } + + /// Handles timestamp propagation and row_time() mapping for Projections + fn rewrite_projection(&self, mut projection: Projection) -> Result> { + // Check if the current projection already has a timestamp field; + // if not, we must inject it to maintain streaming heartbeats. + if !has_timestamp_field(&projection.schema) { + let input_schema = projection.input.schema(); + + // Resolve the timestamp field from the input schema using the global constant. + let timestamp_field: DFField = input_schema + .qualified_field_with_unqualified_name(TIMESTAMP_FIELD) + .map_err(|_| { + DataFusionError::Plan(format!( + "No timestamp field found in projection input ({})", + projection.input.display() + )) + })? + .into(); + + // Update the logical schema to include the newly injected timestamp. + projection.schema = add_timestamp_field( + projection.schema.clone(), + timestamp_field.qualifier().cloned(), + ) + .expect("Failed to add timestamp to projection schema"); + + // Physically push the timestamp column into the expression list. + projection.expr.push(Expr::Column(Column { + relation: timestamp_field.qualifier().cloned(), + name: TIMESTAMP_FIELD.to_string(), + spans: Spans::default(), + })); + } + + // Map user-friendly row_time() function calls to internal _timestamp column references. + let rewritten = projection + .expr + .iter() + .map(|expr| expr.clone().rewrite(&mut RowTimeRewriter {})) + .collect::>>()?; + + // If any expressions were modified (e.g., row_time() was replaced), update the projection. + if rewritten.iter().any(|r| r.transformed) { + projection.expr = rewritten.into_iter().map(|r| r.data).collect(); + } + + // Return the updated plan node wrapped in a Transformed container. 
+ Ok(Transformed::yes(LogicalPlan::Projection(projection))) + } + + /// Harmonizes schemas across Union branches and wraps them in RemoteTableBoundaryNodes. + /// + /// This ensures that all inputs to a UNION operation share the exact same schema metadata, + /// preventing "Schema Drift" where different branches have different field qualifiers. + fn rewrite_union(&self, mut union: Union) -> Result> { + // Industrial engines use the first branch as the "Master Schema" for the Union. + // We clone it once to ensure all subsequent branches are forced to comply. + let master_schema = union.inputs[0].schema().clone(); + union.schema = master_schema.clone(); + + for input in union.inputs.iter_mut() { + // Optimization: If the node is already a non-transparent Extension, + // we skip wrapping to avoid unnecessary nesting of logical nodes. + if let LogicalPlan::Extension(Extension { node }) = input.as_ref() { + let stream_ext: &dyn StreamingOperatorBlueprint = node.try_into().map_err(|e| { + DataFusionError::Internal(format!("Failed to resolve StreamingOperatorBlueprint: {}", e)) + })?; + + if !stream_ext.is_passthrough_boundary() { + continue; + } + } + + // Wrap each branch in a RemoteTableBoundaryNode. + // This acts as a logical "bridge" that forces the input to adopt the master_schema, + // effectively stripping away branch-specific qualifiers (e.g., table aliases). + let remote_ext = Arc::new(RemoteTableBoundaryNode { + upstream_plan: input.as_ref().clone(), + table_identifier: TableReference::bare("union_input"), + resolved_schema: master_schema.clone(), + requires_materialization: false, // Internal logical boundary only; does not require physical sink. + }); + + // Atomically replace the input with the wrapped version. + *input = Arc::new(LogicalPlan::Extension(Extension { node: remote_ext })); + } + + Ok(Transformed::yes(LogicalPlan::Union(union))) + } + + /// Optimizes Filter nodes by stripping redundant NULL checks on time window expressions. 
+ /// + /// In streaming SQL, DataFusion often injects 'IS NOT NULL' guards for window functions + /// that are redundant or can interfere with watermark propagation. This rewriter + /// cleans those predicates to simplify the physical execution plan. + fn rewrite_filter(&self, filter: Filter) -> Result> { + // We attempt to rewrite the predicate using a specialized sub-rewriter. + // The TimeWindowNullCheckRemover specifically targets expressions like + // `tumble(...) IS NOT NULL` and simplifies them to `TRUE`. + let rewritten_expr = filter + .predicate + .clone() + .rewrite(&mut TimeWindowNullCheckRemover {})?; + + if !rewritten_expr.transformed { + return Ok(Transformed::no(LogicalPlan::Filter(filter))); + } + + // Industrial Guard: Re-validate the predicate against the input schema. + // 'Filter::try_new' ensures that the transformed expression is still semantically + // valid for the underlying data stream. + let new_filter = Filter::try_new(rewritten_expr.data, filter.input).map_err(|e| { + DataFusionError::Internal(format!( + "Failed to re-validate filtered predicate after NULL-check removal: {}", + e + )) + })?; + + Ok(Transformed::yes(LogicalPlan::Filter(new_filter))) + } + + /// Centralized error handler for unsupported streaming operations + fn unsupported_error(&self, op: &str, node: &LogicalPlan) -> Result> { + plan_err!( + "{} is not currently supported in streaming SQL ({})", + op, + node.display() + ) + } +} diff --git a/src/sql/analysis/streaming_window_analzer.rs b/src/sql/analysis/streaming_window_analzer.rs new file mode 100644 index 00000000..609bd2ee --- /dev/null +++ b/src/sql/analysis/streaming_window_analzer.rs @@ -0,0 +1,215 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::HashSet; +use std::sync::Arc; + +use datafusion::common::tree_node::{TreeNodeRecursion, TreeNodeVisitor}; +use datafusion::common::{Column, DFSchema, DataFusionError, Result}; +use datafusion::logical_expr::{Expr, Extension, LogicalPlan, expr::Alias}; + +use crate::sql::extensions::aggregate::{STREAM_AGG_EXTENSION_NAME, StreamWindowAggregateNode}; +use crate::sql::extensions::join::STREAM_JOIN_NODE_TYPE; +use crate::sql::types::{DFField, WindowBehavior, WindowType, fields_with_qualifiers, find_window}; + +/// WindowDetectingVisitor identifies windowing strategies and tracks window-carrying fields +/// as they propagate upward through the logical plan tree. +#[derive(Debug, Default)] +pub(crate) struct StreamingWindowAnalzer { + /// The specific window type discovered (Tumble, Hop, etc.) + pub(crate) window: Option, + /// Set of fields in the current plan node that carry window semantics. + pub(crate) fields: HashSet, +} + +impl StreamingWindowAnalzer { + /// Entry point to resolve the WindowType of a given plan branch. + pub(crate) fn get_window(logical_plan: &LogicalPlan) -> Result> { + let mut visitor = Self::default(); + logical_plan.visit_with_subqueries(&mut visitor)?; + Ok(visitor.window) + } + + /// Resolves whether an expression is a reference to an existing window field + /// or a definition of a new window function. + fn resolve_window_from_expr( + &self, + expr: &Expr, + input_schema: &DFSchema, + ) -> Result> { + // 1. Check if the expression directly references a known window field. 
+ if let Some(col) = extract_column(expr) { + let field = input_schema.field_with_name(col.relation.as_ref(), &col.name)?; + let df_field: DFField = (col.relation.clone(), Arc::new(field.clone())).into(); + + if self.fields.contains(&df_field) { + return Ok(self.window.clone()); + } + } + + // 2. Otherwise, check if it's a new window function call (e.g., tumble(), hop()). + find_window(expr) + } + + /// Updates the internal state with new window findings and maps them to the output schema. + fn update_state( + &mut self, + matched_windows: Vec<(usize, WindowType)>, + schema: &DFSchema, + ) -> Result<()> { + // Clear fields from the previous level to maintain schema strictly for the current node. + self.fields.clear(); + + for (index, window) in matched_windows { + if let Some(existing) = &self.window { + if existing != &window { + return Err(DataFusionError::Plan(format!( + "Conflicting windows in the same operator: expected {:?}, found {:?}", + existing, window + ))); + } + } else { + self.window = Some(window); + } + // Record this specific index in the schema as a window carrier. + self.fields.insert(schema.qualified_field(index).into()); + } + Ok(()) + } +} + +pub(crate) fn extract_column(expr: &Expr) -> Option<&Column> { + match expr { + Expr::Column(column) => Some(column), + Expr::Alias(Alias { expr, .. }) => extract_column(expr), + _ => None, + } +} + +impl TreeNodeVisitor<'_> for StreamingWindowAnalzer { + type Node = LogicalPlan; + + fn f_down(&mut self, node: &Self::Node) -> Result { + // Joins require cross-branch validation to ensure left and right sides align on time. + if let LogicalPlan::Extension(Extension { node }) = node + && node.name() == STREAM_JOIN_NODE_TYPE + { + let mut branch_windows = HashSet::new(); + for input in node.inputs() { + if let Some(w) = Self::get_window(input)? 
{ + branch_windows.insert(w); + } + } + + if branch_windows.len() > 1 { + return Err(DataFusionError::Plan( + "Join inputs have mismatched windowing strategies.".into(), + )); + } + self.window = branch_windows.into_iter().next(); + + // Optimization: No need to recurse manually if we've resolved the join boundary. + return Ok(TreeNodeRecursion::Jump); + } + Ok(TreeNodeRecursion::Continue) + } + + fn f_up(&mut self, node: &Self::Node) -> Result { + match node { + LogicalPlan::Projection(p) => { + let windows = p + .expr + .iter() + .enumerate() + .filter_map(|(i, e)| { + self.resolve_window_from_expr(e, p.input.schema()) + .transpose() + .map(|res| res.map(|w| (i, w))) + }) + .collect::>>()?; + + self.update_state(windows, &p.schema)?; + } + + LogicalPlan::Aggregate(agg) => { + let windows = agg + .group_expr + .iter() + .enumerate() + .filter_map(|(i, e)| { + self.resolve_window_from_expr(e, agg.input.schema()) + .transpose() + .map(|res| res.map(|w| (i, w))) + }) + .collect::>>()?; + + self.update_state(windows, &agg.schema)?; + } + + LogicalPlan::SubqueryAlias(sa) => { + // Map fields through the alias layer by resolving column indices. + let input_schema = sa.input.schema(); + let mapped = self + .fields + .drain() + .map(|f| { + let idx = input_schema.index_of_column(&f.qualified_column())?; + Ok(sa.schema.qualified_field(idx).into()) + }) + .collect::>>()?; + + self.fields = mapped; + } + + LogicalPlan::Extension(Extension { node }) + if node.name() == STREAM_AGG_EXTENSION_NAME => + { + let ext = node + .as_any() + .downcast_ref::() + .ok_or_else(|| { + DataFusionError::Internal("StreamWindowAggregateNode is malformed".into()) + })?; + + match &ext.window_spec { + WindowBehavior::FromOperator { + window, + window_field, + is_nested, + .. 
+ } => { + if self.window.is_some() && !*is_nested { + return Err(DataFusionError::Plan( + "Redundant window definition on an already windowed stream.".into(), + )); + } + self.window = Some(window.clone()); + self.fields.insert(window_field.clone()); + } + WindowBehavior::InData => { + let current_schema_fields: HashSet<_> = + fields_with_qualifiers(node.schema()).into_iter().collect(); + self.fields.retain(|f| current_schema_fields.contains(f)); + + if self.fields.is_empty() { + return Err(DataFusionError::Plan( + "Windowed aggregate missing window metadata from its input.".into(), + )); + } + } + } + } + _ => {} + } + Ok(TreeNodeRecursion::Continue) + } +} diff --git a/src/sql/analysis/time_window.rs b/src/sql/analysis/time_window.rs new file mode 100644 index 00000000..104c0cca --- /dev/null +++ b/src/sql/analysis/time_window.rs @@ -0,0 +1,83 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use datafusion::common::tree_node::{ + Transformed, TreeNodeRecursion, TreeNodeRewriter, TreeNodeVisitor, +}; +use datafusion::common::{DataFusionError, Result as DFResult, ScalarValue, plan_err}; +use datafusion::logical_expr::expr::ScalarFunction; +use datafusion::logical_expr::{Expr, LogicalPlan}; + +/// Returns the time window function name if the expression is one (tumble/hop/session). 
+pub fn is_time_window(expr: &Expr) -> Option<&str> { + if let Expr::ScalarFunction(ScalarFunction { func, args: _ }) = expr { + match func.name() { + "tumble" | "hop" | "session" => return Some(func.name()), + _ => {} + } + } + None +} + +struct TimeWindowExprChecker {} + +impl TreeNodeVisitor<'_> for TimeWindowExprChecker { + type Node = Expr; + + fn f_down(&mut self, node: &Self::Node) -> DFResult { + if let Some(w) = is_time_window(node) { + return plan_err!( + "time window function {} is not allowed in this context. \ + Are you missing a GROUP BY clause?", + w + ); + } + Ok(TreeNodeRecursion::Continue) + } +} + +/// Visitor that checks an entire LogicalPlan for misplaced time window UDFs. +pub struct TimeWindowUdfChecker {} + +impl TreeNodeVisitor<'_> for TimeWindowUdfChecker { + type Node = LogicalPlan; + + fn f_down(&mut self, node: &Self::Node) -> DFResult { + use datafusion::common::tree_node::TreeNode; + node.expressions().iter().try_for_each(|expr| { + let mut checker = TimeWindowExprChecker {}; + expr.visit(&mut checker)?; + Ok::<(), DataFusionError>(()) + })?; + Ok(TreeNodeRecursion::Continue) + } +} + +/// Removes `IS NOT NULL` checks wrapping time window functions, +/// replacing them with `true` since time windows are never null. +pub struct TimeWindowNullCheckRemover {} + +impl TreeNodeRewriter for TimeWindowNullCheckRemover { + type Node = Expr; + + fn f_down(&mut self, node: Self::Node) -> DFResult> { + if let Expr::IsNotNull(expr) = &node + && is_time_window(expr).is_some() + { + return Ok(Transformed::yes(Expr::Literal( + ScalarValue::Boolean(Some(true)), + None, + ))); + } + Ok(Transformed::no(node)) + } +} diff --git a/src/sql/analysis/udafs.rs b/src/sql/analysis/udafs.rs new file mode 100644 index 00000000..73fc062c --- /dev/null +++ b/src/sql/analysis/udafs.rs @@ -0,0 +1,43 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use datafusion::arrow::array::ArrayRef; +use datafusion::error::Result; +use datafusion::physical_plan::Accumulator; +use datafusion::scalar::ScalarValue; +use std::fmt::Debug; + +/// Fake UDAF used just for plan-time placeholder. +#[derive(Debug)] +pub struct EmptyUdaf {} + +impl Accumulator for EmptyUdaf { + fn update_batch(&mut self, _: &[ArrayRef]) -> Result<()> { + unreachable!() + } + + fn evaluate(&self) -> Result { + unreachable!() + } + + fn size(&self) -> usize { + unreachable!() + } + + fn state(&self) -> Result> { + unreachable!() + } + + fn merge_batch(&mut self, _: &[ArrayRef]) -> Result<()> { + unreachable!() + } +} diff --git a/src/sql/analysis/unnest_rewriter.rs b/src/sql/analysis/unnest_rewriter.rs new file mode 100644 index 00000000..535590c8 --- /dev/null +++ b/src/sql/analysis/unnest_rewriter.rs @@ -0,0 +1,179 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::sync::Arc; + +use datafusion::arrow::datatypes::DataType; +use datafusion::common::tree_node::{Transformed, TreeNode, TreeNodeRewriter}; +use datafusion::common::{Column, Result as DFResult, plan_err}; +use datafusion::logical_expr::expr::ScalarFunction; +use datafusion::logical_expr::{ColumnUnnestList, Expr, LogicalPlan, Projection, Unnest}; + +use crate::sql::common::constants::planning_placeholder_udf; +use crate::sql::types::{DFField, fields_with_qualifiers, schema_from_df_fields}; + +pub const UNNESTED_COL: &str = "__unnested"; + +/// Rewrites projections containing `unnest()` calls into proper Unnest logical plans. +pub struct UnnestRewriter {} + +impl UnnestRewriter { + fn split_unnest(expr: Expr) -> DFResult<(Expr, Option)> { + let mut captured: Option = None; + + let expr = expr.transform_up(|e| { + if let Expr::ScalarFunction(ScalarFunction { func: udf, args }) = &e + && udf.name() == planning_placeholder_udf::UNNEST + { + match args.len() { + 1 => { + if captured.replace(args[0].clone()).is_some() { + return plan_err!( + "Multiple unnests in expression, which is not allowed" + ); + } + return Ok(Transformed::yes(Expr::Column(Column::new_unqualified( + UNNESTED_COL, + )))); + } + n => { + panic!("Unnest has wrong number of arguments (expected 1, found {n})"); + } + } + } + Ok(Transformed::no(e)) + })?; + + Ok((expr.data, captured)) + } +} + +impl TreeNodeRewriter for UnnestRewriter { + type Node = LogicalPlan; + + fn f_up(&mut self, node: Self::Node) -> DFResult> { + let LogicalPlan::Projection(projection) = &node else { + if node.expressions().iter().any(|e| { + let e = Self::split_unnest(e.clone()); + e.is_err() || e.unwrap().1.is_some() + }) { + return plan_err!("unnest is only supported in SELECT statements"); + } + return Ok(Transformed::no(node)); + }; + + let mut unnest = None; + let exprs = projection + .expr + .clone() + .into_iter() + .enumerate() + .map(|(i, expr)| { + let (expr, opt) = Self::split_unnest(expr)?; + let is_unnest = 
if let Some(e) = opt { + if let Some(prev) = unnest.replace((e, i)) + && &prev != unnest.as_ref().unwrap() + { + return plan_err!( + "Projection contains multiple unnests, which is not currently supported" + ); + } + true + } else { + false + }; + + Ok((expr, is_unnest)) + }) + .collect::>>()?; + + if let Some((unnest_inner, unnest_idx)) = unnest { + let produce_list = Arc::new(LogicalPlan::Projection(Projection::try_new( + exprs + .iter() + .cloned() + .map(|(e, is_unnest)| { + if is_unnest { + unnest_inner.clone().alias(UNNESTED_COL) + } else { + e + } + }) + .collect(), + projection.input.clone(), + )?)); + + let unnest_fields = fields_with_qualifiers(produce_list.schema()) + .iter() + .enumerate() + .map(|(i, f)| { + if i == unnest_idx { + let DataType::List(inner) = f.data_type() else { + return plan_err!( + "Argument '{}' to unnest is not a List", + f.qualified_name() + ); + }; + Ok(DFField::new_unqualified( + UNNESTED_COL, + inner.data_type().clone(), + inner.is_nullable(), + )) + } else { + Ok((*f).clone()) + } + }) + .collect::>>()?; + + let unnest_node = LogicalPlan::Unnest(Unnest { + exec_columns: vec![ + DFField::from(produce_list.schema().qualified_field(unnest_idx)) + .qualified_column(), + ], + input: produce_list, + list_type_columns: vec![( + unnest_idx, + ColumnUnnestList { + output_column: Column::new_unqualified(UNNESTED_COL), + depth: 1, + }, + )], + struct_type_columns: vec![], + dependency_indices: vec![], + schema: Arc::new(schema_from_df_fields(&unnest_fields)?), + options: Default::default(), + }); + + let output_node = LogicalPlan::Projection(Projection::try_new( + exprs + .iter() + .enumerate() + .map(|(i, (expr, has_unnest))| { + if *has_unnest { + expr.clone() + } else { + Expr::Column( + DFField::from(unnest_node.schema().qualified_field(i)) + .qualified_column(), + ) + } + }) + .collect(), + Arc::new(unnest_node), + )?); + + Ok(Transformed::yes(output_node)) + } else { + 
Ok(Transformed::no(LogicalPlan::Projection(projection.clone()))) + } + } +} diff --git a/src/sql/analysis/window_function_rewriter.rs b/src/sql/analysis/window_function_rewriter.rs new file mode 100644 index 00000000..63c502bf --- /dev/null +++ b/src/sql/analysis/window_function_rewriter.rs @@ -0,0 +1,203 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use datafusion::common::tree_node::Transformed; +use datafusion::common::{Result as DFResult, plan_err, tree_node::TreeNodeRewriter}; +use datafusion::logical_expr::{ + self, Expr, Extension, LogicalPlan, Projection, Sort, Window, expr::WindowFunction, + expr::WindowFunctionParams, +}; +use datafusion_common::DataFusionError; +use std::sync::Arc; +use tracing::debug; + +use crate::sql::extensions::key_calculation::{KeyExtractionNode, KeyExtractionStrategy}; +use crate::sql::extensions::windows_function::StreamingWindowFunctionNode; +use crate::sql::analysis::streaming_window_analzer::{StreamingWindowAnalzer, extract_column}; +use crate::sql::types::{WindowType, fields_with_qualifiers, schema_from_df_fields}; + +/// WindowFunctionRewriter transforms standard SQL Window functions into streaming-compatible +/// stateful operators, ensuring proper data partitioning and sorting for distributed execution. +pub(crate) struct WindowFunctionRewriter; + +impl WindowFunctionRewriter { + /// Recursively unwraps Aliases to find the underlying WindowFunction. 
+ fn resolve_window_function(&self, expr: &Expr) -> DFResult<(WindowFunction, String)> { + match expr { + Expr::Alias(alias) => { + let (func, _) = self.resolve_window_function(&alias.expr)?; + Ok((func, alias.name.clone())) + } + Expr::WindowFunction(wf) => Ok((wf.as_ref().clone(), expr.name_for_alias()?)), + _ => plan_err!("Expected WindowFunction or Alias, found: {:?}", expr), + } + } + + /// Identifies which field in the PARTITION BY clause corresponds to the streaming window. + fn identify_window_partition( + &self, + params: &WindowFunctionParams, + input: &LogicalPlan, + input_window_fields: &std::collections::HashSet, + ) -> DFResult { + let matched: Vec<_> = params + .partition_by + .iter() + .enumerate() + .filter_map(|(i, e)| { + let col = extract_column(e)?; + let field = input + .schema() + .field_with_name(col.relation.as_ref(), &col.name) + .ok()?; + let df_field = (col.relation.clone(), Arc::new(field.clone())).into(); + + if input_window_fields.contains(&df_field) { + Some(i) + } else { + None + } + }) + .collect(); + + if matched.len() != 1 { + return plan_err!( + "Streaming window functions require exactly one window column in PARTITION BY. Found: {}", + matched.len() + ); + } + Ok(matched[0]) + } + + /// Wraps the input in a Projection and KeyExtractionNode to handle data distribution. + fn build_keyed_input( + &self, + input: Arc, + partition_keys: &[Expr], + ) -> DFResult { + let key_count = partition_keys.len(); + + // 1. Build projection: [_key_0, _key_1, ..., original_columns] + let mut exprs: Vec<_> = partition_keys + .iter() + .enumerate() + .map(|(i, e)| e.clone().alias(format!("_key_{i}"))) + .collect(); + + exprs.extend( + fields_with_qualifiers(input.schema()) + .iter() + .map(|f| Expr::Column(f.qualified_column())), + ); + + // 2. 
Derive the keyed schema + let mut keyed_fields = + fields_with_qualifiers(&Projection::try_new(exprs.clone(), input.clone())?.schema) + .iter() + .take(key_count) + .cloned() + .collect::>(); + keyed_fields.extend(fields_with_qualifiers(input.schema())); + + let keyed_schema = Arc::new(schema_from_df_fields(&keyed_fields)?); + + let projection = + LogicalPlan::Projection(Projection::try_new_with_schema(exprs, input, keyed_schema)?); + + // 3. Wrap in KeyExtractionNode for the physical planner + Ok(LogicalPlan::Extension(Extension { + node: Arc::new(KeyExtractionNode::new( + projection, + KeyExtractionStrategy::ColumnIndices((0..key_count).collect()), + )), + })) + } +} + +impl TreeNodeRewriter for WindowFunctionRewriter { + type Node = LogicalPlan; + + fn f_up(&mut self, node: Self::Node) -> DFResult> { + let LogicalPlan::Window(window) = node else { + return Ok(Transformed::no(node)); + }; + + debug!("Rewriting window function for streaming: {:?}", window); + + // 1. Analyze input windowing context + let mut analyzer = StreamingWindowAnalzer::default(); + window.input.visit_with_subqueries(&mut analyzer)?; + + let input_window = analyzer.window.ok_or_else(|| { + DataFusionError::Plan( + "Window functions require a windowed input stream (e.g., TUMBLE/HOP)".into(), + ) + })?; + + if matches!(input_window, WindowType::Session { .. }) { + return plan_err!( + "Streaming window functions (OVER) are not supported on Session windows." + ); + } + + // 2. Validate window expression constraints + if window.window_expr.len() != 1 { + return plan_err!( + "Arroyo currently supports exactly one window expression per OVER clause." + ); + } + + let (mut wf, original_name) = self.resolve_window_function(&window.window_expr[0])?; + + // 3. 
Identify and extract the window column from PARTITION BY + let window_part_idx = + self.identify_window_partition(&wf.params, &window.input, &analyzer.fields)?; + let mut partition_keys = wf.params.partition_by.clone(); + partition_keys.remove(window_part_idx); + + // Update function params to exclude the window column from internal partitioning + // as the streaming engine handles window boundaries natively. + wf.params.partition_by = partition_keys.clone(); + let key_count = partition_keys.len(); + + // 4. Build the data-shuffling pipeline (Projection -> KeyCalc -> Sort) + let keyed_plan = self.build_keyed_input(window.input.clone(), &partition_keys)?; + + let mut sort_exprs: Vec<_> = partition_keys + .iter() + .map(|e| logical_expr::expr::Sort { + expr: e.clone(), + asc: true, + nulls_first: false, + }) + .collect(); + sort_exprs.extend(wf.params.order_by.clone()); + + let sorted_plan = LogicalPlan::Sort(Sort { + expr: sort_exprs, + input: Arc::new(keyed_plan), + fetch: None, + }); + + // 5. Final Assembly + let final_wf_expr = Expr::WindowFunction(Box::new(wf)).alias_if_changed(original_name)?; + let rewritten_window = + LogicalPlan::Window(Window::try_new(vec![final_wf_expr], Arc::new(sorted_plan))?); + + Ok(Transformed::yes(LogicalPlan::Extension(Extension { + node: Arc::new(StreamingWindowFunctionNode::new( + rewritten_window, + (0..key_count).collect(), + )), + }))) + } +} diff --git a/src/sql/api/checkpoints.rs b/src/sql/api/checkpoints.rs new file mode 100644 index 00000000..d9bdc139 --- /dev/null +++ b/src/sql/api/checkpoints.rs @@ -0,0 +1,108 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use crate::sql::common::to_micros; +use serde::{Deserialize, Serialize}; +use std::time::SystemTime; + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct Checkpoint { + pub epoch: u32, + pub backend: String, + pub start_time: u64, + pub finish_time: Option, + pub events: Vec, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct CheckpointEventSpan { + pub start_time: u64, + pub finish_time: u64, + pub event: String, + pub description: String, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct SubtaskCheckpointGroup { + pub index: u32, + pub bytes: u64, + pub event_spans: Vec, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct OperatorCheckpointGroup { + pub operator_id: String, + pub bytes: u64, + pub started_metadata_write: Option, + pub finish_time: Option, + pub subtasks: Vec, +} + +#[derive(Debug, Copy, Clone, Eq, PartialEq, Serialize, Deserialize)] +pub enum JobCheckpointEventType { + Checkpointing, + CheckpointingOperators, + WritingMetadata, + Compacting, + Committing, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct JobCheckpointSpan { + pub event: JobCheckpointEventType, + pub start_time: u64, + pub finish_time: Option, +} + +impl JobCheckpointSpan { + pub fn now(event: JobCheckpointEventType) -> Self { + Self { + event, + start_time: to_micros(SystemTime::now()), + finish_time: None, + } + } + + pub fn finish(&mut self) { 
+ if self.finish_time.is_none() { + self.finish_time = Some(to_micros(SystemTime::now())); + } + } +} + +impl From for CheckpointEventSpan { + fn from(value: JobCheckpointSpan) -> Self { + let description = match value.event { + JobCheckpointEventType::Checkpointing => "The entire checkpointing process", + JobCheckpointEventType::CheckpointingOperators => { + "The time spent checkpointing operator states" + } + JobCheckpointEventType::WritingMetadata => "Writing the final checkpoint metadata", + JobCheckpointEventType::Compacting => "Compacting old checkpoints", + JobCheckpointEventType::Committing => { + "Running two-phase commit for transactional connectors" + } + } + .to_string(); + + Self { + start_time: value.start_time, + finish_time: value.finish_time.unwrap_or_default(), + event: format!("{:?}", value.event), + description, + } + } +} diff --git a/src/sql/api/connections.rs b/src/sql/api/connections.rs new file mode 100644 index 00000000..148df69d --- /dev/null +++ b/src/sql/api/connections.rs @@ -0,0 +1,616 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use crate::sql::common::formats::{BadData, Format, Framing}; +use crate::sql::common::{FsExtensionType, FsSchema}; +use datafusion::arrow::datatypes::{DataType, Field, Fields, TimeUnit}; +use serde::ser::SerializeMap; +use serde::{Deserialize, Serialize, Serializer}; +use std::collections::{BTreeMap, HashMap, HashSet}; +use std::fmt::{Display, Formatter}; +use std::sync::Arc; + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct Connector { + pub id: String, + pub name: String, + pub icon: String, + pub description: String, + pub table_config: String, + pub enabled: bool, + pub source: bool, + pub sink: bool, + pub custom_schemas: bool, + pub testing: bool, + pub hidden: bool, + pub connection_config: Option, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct ConnectionProfile { + pub id: String, + pub name: String, + pub connector: String, + pub config: serde_json::Value, + pub description: String, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct ConnectionProfilePost { + pub name: String, + pub connector: String, + pub config: serde_json::Value, +} + +#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)] +#[serde(rename_all = "snake_case")] +pub enum ConnectionType { + Source, + Sink, + Lookup, +} + +impl Display for ConnectionType { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + ConnectionType::Source => write!(f, "SOURCE"), + ConnectionType::Sink => write!(f, "SINK"), + ConnectionType::Lookup => write!(f, "LOOKUP"), + } + } +} + +impl TryFrom for ConnectionType { + type Error = String; + + fn try_from(value: String) -> Result { + match value.to_lowercase().as_str() { + "source" => Ok(ConnectionType::Source), + "sink" => Ok(ConnectionType::Sink), + "lookup" => Ok(ConnectionType::Lookup), + _ => Err(format!("Invalid connection type: {value}")), + } 
+ } +} + +// ─────────────────── Field Types ─────────────────── + +#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq)] +#[serde(tag = "type", rename_all = "snake_case")] +pub enum FieldType { + Int32, + Int64, + Uint32, + Uint64, + #[serde(alias = "f32")] + Float32, + #[serde(alias = "f64")] + Float64, + Decimal128(DecimalField), + Bool, + #[serde(alias = "utf8")] + String, + #[serde(alias = "binary")] + Bytes, + Timestamp(TimestampField), + Json, + Struct(StructField), + List(ListField), +} + +impl FieldType { + pub fn sql_type(&self) -> String { + match self { + FieldType::Int32 => "INTEGER".into(), + FieldType::Int64 => "BIGINT".into(), + FieldType::Uint32 => "INTEGER UNSIGNED".into(), + FieldType::Uint64 => "BIGINT UNSIGNED".into(), + FieldType::Float32 => "FLOAT".into(), + FieldType::Float64 => "DOUBLE".into(), + FieldType::Decimal128(f) => format!("DECIMAL({}, {})", f.precision, f.scale), + FieldType::Bool => "BOOLEAN".into(), + FieldType::String => "TEXT".into(), + FieldType::Bytes => "BINARY".into(), + FieldType::Timestamp(t) => format!("TIMESTAMP({})", t.unit.precision()), + FieldType::Json => "JSON".into(), + FieldType::List(item) => format!("{}[]", item.items.field_type.sql_type()), + FieldType::Struct(StructField { fields, .. 
}) => { + format!( + "STRUCT <{}>", + fields + .iter() + .map(|f| format!("{} {}", f.name, f.field_type.sql_type())) + .collect::>() + .join(", ") + ) + } + } + } +} + +#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq, Default)] +#[serde(rename_all = "snake_case")] +pub enum TimestampUnit { + #[serde(alias = "s")] + Second, + #[default] + #[serde(alias = "ms")] + Millisecond, + #[serde(alias = "µs", alias = "us")] + Microsecond, + #[serde(alias = "ns")] + Nanosecond, +} + +impl TimestampUnit { + pub fn precision(&self) -> u8 { + match self { + TimestampUnit::Second => 0, + TimestampUnit::Millisecond => 3, + TimestampUnit::Microsecond => 6, + TimestampUnit::Nanosecond => 9, + } + } +} + +#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq)] +#[serde(rename_all = "snake_case")] +pub struct TimestampField { + #[serde(default)] + pub unit: TimestampUnit, +} + +#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq)] +#[serde(rename_all = "snake_case")] +pub struct DecimalField { + pub precision: u8, + pub scale: i8, +} + +#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq)] +#[serde(rename_all = "snake_case")] +pub struct StructField { + pub fields: Vec, +} + +#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq)] +#[serde(rename_all = "snake_case")] +pub struct ListField { + pub items: Box, +} + +fn default_item_name() -> String { + "item".to_string() +} + +#[derive(Deserialize, Clone, Debug, PartialEq, Eq)] +#[serde(rename_all = "snake_case")] +pub struct ListFieldItem { + #[serde(default = "default_item_name")] + pub name: String, + #[serde(flatten)] + pub field_type: FieldType, + #[serde(default)] + pub required: bool, + #[serde(default)] + pub sql_name: Option, +} + +impl From for Field { + fn from(value: ListFieldItem) -> Self { + SourceField { + name: value.name, + field_type: value.field_type, + required: value.required, + sql_name: None, + metadata_key: None, + } + .into() + } +} + +impl Serialize for ListFieldItem 
{ + fn serialize(&self, s: S) -> Result + where + S: Serializer, + { + let mut f = Serializer::serialize_map(s, None)?; + f.serialize_entry("name", &self.name)?; + serialize_field_type_flat(&self.field_type, &mut f)?; + f.serialize_entry("required", &self.required)?; + f.serialize_entry("sql_name", &self.field_type.sql_type())?; + f.end() + } +} + +impl TryFrom for ListFieldItem { + type Error = String; + + fn try_from(value: Field) -> Result { + let source_field: SourceField = value.try_into()?; + Ok(Self { + name: source_field.name, + field_type: source_field.field_type, + required: source_field.required, + sql_name: None, + }) + } +} + +fn serialize_field_type_flat(ft: &FieldType, map: &mut M) -> Result<(), M::Error> { + let type_tag = match ft { + FieldType::Int32 => "int32", + FieldType::Int64 => "int64", + FieldType::Uint32 => "uint32", + FieldType::Uint64 => "uint64", + FieldType::Float32 => "float32", + FieldType::Float64 => "float64", + FieldType::Decimal128(_) => "decimal128", + FieldType::Bool => "bool", + FieldType::String => "string", + FieldType::Bytes => "bytes", + FieldType::Timestamp(_) => "timestamp", + FieldType::Json => "json", + FieldType::Struct(_) => "struct", + FieldType::List(_) => "list", + }; + map.serialize_entry("type", type_tag)?; + + match ft { + FieldType::Decimal128(d) => { + map.serialize_entry("precision", &d.precision)?; + map.serialize_entry("scale", &d.scale)?; + } + FieldType::Timestamp(t) => { + map.serialize_entry("unit", &t.unit)?; + } + FieldType::Struct(s) => { + map.serialize_entry("fields", &s.fields)?; + } + FieldType::List(l) => { + map.serialize_entry("items", &l.items)?; + } + _ => {} + } + Ok(()) +} + +// ─────────────────── Source Field ─────────────────── + +#[derive(Deserialize, Clone, Debug, PartialEq, Eq)] +#[serde(rename_all = "snake_case")] +pub struct SourceField { + pub name: String, + #[serde(flatten)] + pub field_type: FieldType, + #[serde(default)] + pub required: bool, + #[serde(default)] + pub 
sql_name: Option, + #[serde(default)] + pub metadata_key: Option, +} + +impl Serialize for SourceField { + fn serialize(&self, s: S) -> Result + where + S: Serializer, + { + let mut f = Serializer::serialize_map(s, None)?; + f.serialize_entry("name", &self.name)?; + serialize_field_type_flat(&self.field_type, &mut f)?; + f.serialize_entry("required", &self.required)?; + if let Some(metadata_key) = &self.metadata_key { + f.serialize_entry("metadata_key", metadata_key)?; + } + f.serialize_entry("sql_name", &self.field_type.sql_type())?; + f.end() + } +} + +impl From for Field { + fn from(f: SourceField) -> Self { + let (t, ext) = match f.field_type { + FieldType::Int32 => (DataType::Int32, None), + FieldType::Int64 => (DataType::Int64, None), + FieldType::Uint32 => (DataType::UInt32, None), + FieldType::Uint64 => (DataType::UInt64, None), + FieldType::Float32 => (DataType::Float32, None), + FieldType::Float64 => (DataType::Float64, None), + FieldType::Bool => (DataType::Boolean, None), + FieldType::String => (DataType::Utf8, None), + FieldType::Bytes => (DataType::Binary, None), + FieldType::Decimal128(d) => (DataType::Decimal128(d.precision, d.scale), None), + FieldType::Timestamp(TimestampField { + unit: TimestampUnit::Second, + }) => (DataType::Timestamp(TimeUnit::Second, None), None), + FieldType::Timestamp(TimestampField { + unit: TimestampUnit::Millisecond, + }) => (DataType::Timestamp(TimeUnit::Millisecond, None), None), + FieldType::Timestamp(TimestampField { + unit: TimestampUnit::Microsecond, + }) => (DataType::Timestamp(TimeUnit::Microsecond, None), None), + FieldType::Timestamp(TimestampField { + unit: TimestampUnit::Nanosecond, + }) => (DataType::Timestamp(TimeUnit::Nanosecond, None), None), + FieldType::Json => (DataType::Utf8, Some(FsExtensionType::JSON)), + FieldType::Struct(s) => ( + DataType::Struct(Fields::from( + s.fields + .into_iter() + .map(|t| t.into()) + .collect::>(), + )), + None, + ), + FieldType::List(t) => 
(DataType::List(Arc::new((*t.items).into())), None), + }; + + FsExtensionType::add_metadata(ext, Field::new(f.name, t, !f.required)) + } +} + +impl TryFrom for SourceField { + type Error = String; + + fn try_from(f: Field) -> Result { + let field_type = match (f.data_type(), FsExtensionType::from_map(f.metadata())) { + (DataType::Boolean, None) => FieldType::Bool, + (DataType::Int32, None) => FieldType::Int32, + (DataType::Int64, None) => FieldType::Int64, + (DataType::UInt32, None) => FieldType::Uint32, + (DataType::UInt64, None) => FieldType::Uint64, + (DataType::Float32, None) => FieldType::Float32, + (DataType::Float64, None) => FieldType::Float64, + (DataType::Decimal128(p, s), None) => FieldType::Decimal128(DecimalField { + precision: *p, + scale: *s, + }), + (DataType::Binary | DataType::LargeBinary | DataType::BinaryView, None) => FieldType::Bytes, + (DataType::Timestamp(TimeUnit::Second, _), None) => { + FieldType::Timestamp(TimestampField { + unit: TimestampUnit::Second, + }) + } + (DataType::Timestamp(TimeUnit::Millisecond, _), None) => { + FieldType::Timestamp(TimestampField { + unit: TimestampUnit::Millisecond, + }) + } + (DataType::Timestamp(TimeUnit::Microsecond, _), None) => { + FieldType::Timestamp(TimestampField { + unit: TimestampUnit::Microsecond, + }) + } + (DataType::Timestamp(TimeUnit::Nanosecond, _), None) => { + FieldType::Timestamp(TimestampField { + unit: TimestampUnit::Nanosecond, + }) + } + (DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View, None) => FieldType::String, + (DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View, Some(FsExtensionType::JSON)) => FieldType::Json, + (DataType::Struct(fields), None) => { + let fields: Result<_, String> = fields + .into_iter() + .map(|f| (**f).clone().try_into()) + .collect(); + FieldType::Struct(StructField { fields: fields? 
}) + } + (DataType::List(item), None) => FieldType::List(ListField { + items: Box::new((**item).clone().try_into()?), + }), + dt => return Err(format!("Unsupported data type {dt:?}")), + }; + + Ok(SourceField { + name: f.name().clone(), + field_type, + required: !f.is_nullable(), + sql_name: None, + metadata_key: None, + }) + } +} + +// ─────────────────── Schema Definitions ─────────────────── + +#[derive(Serialize, Deserialize, Clone, Debug, PartialEq)] +#[serde(rename_all = "snake_case", tag = "type")] +pub enum SchemaDefinition { + JsonSchema { + schema: String, + }, + ProtobufSchema { + schema: String, + #[serde(default)] + dependencies: HashMap, + }, + AvroSchema { + schema: String, + }, +} + +#[derive(Serialize, Deserialize, Clone, Debug, PartialEq)] +#[serde(rename_all = "snake_case")] +pub struct ConnectionSchema { + pub format: Option, + #[serde(default)] + pub bad_data: Option, + #[serde(default)] + pub framing: Option, + #[serde(default)] + pub fields: Vec, + #[serde(default)] + pub definition: Option, + #[serde(default)] + pub inferred: Option, + #[serde(default)] + pub primary_keys: HashSet, +} + +impl ConnectionSchema { + pub fn try_new( + format: Option, + bad_data: Option, + framing: Option, + fields: Vec, + definition: Option, + inferred: Option, + primary_keys: HashSet, + ) -> anyhow::Result { + let s = ConnectionSchema { + format, + bad_data, + framing, + fields, + definition, + inferred, + primary_keys, + }; + s.validate() + } + + pub fn validate(self) -> anyhow::Result { + let non_metadata_fields: Vec<_> = self + .fields + .iter() + .filter(|f| f.metadata_key.is_none()) + .collect(); + + if let Some(Format::RawString(_)) = &self.format { + if non_metadata_fields.len() != 1 + || non_metadata_fields.first().unwrap().field_type != FieldType::String + || non_metadata_fields.first().unwrap().name != "value" + { + anyhow::bail!( + "raw_string format requires a schema with a single field called `value` of type TEXT" + ); + } + } + + if let 
Some(Format::Json(json_format)) = &self.format { + if json_format.unstructured + && (non_metadata_fields.len() != 1 + || non_metadata_fields.first().unwrap().field_type != FieldType::Json + || non_metadata_fields.first().unwrap().name != "value") + { + anyhow::bail!( + "json format with unstructured flag enabled requires a schema with a single field called `value` of type JSON" + ); + } + } + + Ok(self) + } + + pub fn fs_schema(&self) -> Arc { + let fields: Vec = self.fields.iter().map(|f| f.clone().into()).collect(); + Arc::new(FsSchema::from_fields(fields)) + } +} + +impl From for FsSchema { + fn from(val: ConnectionSchema) -> Self { + let fields: Vec = val.fields.into_iter().map(|f| f.into()).collect(); + FsSchema::from_fields(fields) + } +} + +// ─────────────────── Connection Table ─────────────────── + +#[derive(Serialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct ConnectionTable { + #[serde(skip_serializing)] + pub id: i64, + #[serde(rename = "id")] + pub pub_id: String, + pub name: String, + pub created_at: u64, + pub connector: String, + pub connection_profile: Option, + pub table_type: ConnectionType, + pub config: serde_json::Value, + pub schema: ConnectionSchema, + pub consumers: u32, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct ConnectionTablePost { + pub name: String, + pub connector: String, + pub connection_profile_id: Option, + pub config: serde_json::Value, + pub schema: Option, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct ConnectionAutocompleteResp { + pub values: BTreeMap>, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct TestSourceMessage { + pub error: bool, + pub done: bool, + pub message: String, +} + +impl TestSourceMessage { + pub fn info(message: impl Into) -> Self { + Self { + error: false, + done: false, + message: message.into(), + } + } + pub fn 
error(message: impl Into) -> Self { + Self { + error: true, + done: false, + message: message.into(), + } + } + pub fn done(message: impl Into) -> Self { + Self { + error: false, + done: true, + message: message.into(), + } + } + pub fn fail(message: impl Into) -> Self { + Self { + error: true, + done: true, + message: message.into(), + } + } +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct ConfluentSchema { + pub schema: String, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct ConfluentSchemaQueryParams { + pub endpoint: String, + pub topic: String, +} diff --git a/src/sql/api/metrics.rs b/src/sql/api/metrics.rs new file mode 100644 index 00000000..671b52f6 --- /dev/null +++ b/src/sql/api/metrics.rs @@ -0,0 +1,53 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use serde::{Deserialize, Serialize}; + +#[derive(Serialize, Deserialize, Copy, Clone, Debug, Hash, PartialEq, Eq)] +#[serde(rename_all = "snake_case")] +pub enum MetricName { + BytesRecv, + BytesSent, + MessagesRecv, + MessagesSent, + Backpressure, + TxQueueSize, + TxQueueRem, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct Metric { + pub time: u64, + pub value: f64, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct SubtaskMetrics { + pub index: u32, + pub metrics: Vec, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct MetricGroup { + pub name: MetricName, + pub subtasks: Vec, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct OperatorMetricGroup { + pub node_id: u32, + pub metric_groups: Vec, +} diff --git a/src/sql/api/mod.rs b/src/sql/api/mod.rs new file mode 100644 index 00000000..cdc119b7 --- /dev/null +++ b/src/sql/api/mod.rs @@ -0,0 +1,48 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! REST/RPC API types for the FunctionStream system. +//! +//! Adapted from Arroyo's `arroyo-rpc/src/api_types` and utility modules. 
+ +pub mod checkpoints; +pub mod connections; +pub mod metrics; +pub mod pipelines; +pub mod public_ids; +pub mod schema_resolver; +pub mod udfs; +pub mod var_str; + +use serde::{Deserialize, Serialize}; + +pub use connections::ConnectionProfile; + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "camelCase")] +pub struct PaginatedCollection { + pub data: Vec, + pub has_more: bool, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "camelCase")] +pub struct NonPaginatedCollection { + pub data: Vec, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct PaginationQueryParams { + pub starting_after: Option, + pub limit: Option, +} diff --git a/src/sql/api/pipelines.rs b/src/sql/api/pipelines.rs new file mode 100644 index 00000000..d6cc5253 --- /dev/null +++ b/src/sql/api/pipelines.rs @@ -0,0 +1,168 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use super::udfs::Udf; +use crate::sql::common::control::ErrorDomain; +use serde::{Deserialize, Serialize}; + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct ValidateQueryPost { + pub query: String, + pub udfs: Option>, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct QueryValidationResult { + pub graph: Option, + pub errors: Vec, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct PipelinePost { + pub name: String, + pub query: String, + pub udfs: Option>, + pub parallelism: u64, + pub checkpoint_interval_micros: Option, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct PreviewPost { + pub query: String, + pub udfs: Option>, + #[serde(default)] + pub enable_sinks: bool, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct PipelinePatch { + pub parallelism: Option, + pub checkpoint_interval_micros: Option, + pub stop: Option, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct PipelineRestart { + pub force: Option, + pub ignore_state: Option, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct Pipeline { + pub id: String, + pub name: String, + pub query: String, + pub udfs: Vec, + pub checkpoint_interval_micros: u64, + pub stop: StopType, + pub created_at: u64, + pub action: Option, + pub action_text: String, + pub action_in_progress: bool, + pub graph: PipelineGraph, + pub preview: bool, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct PipelineGraph { + pub nodes: Vec, + pub edges: Vec, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct PipelineNode { + pub node_id: u32, + pub operator: String, + pub 
description: String, + pub parallelism: u32, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct PipelineEdge { + pub src_id: u32, + pub dest_id: u32, + pub key_type: String, + pub value_type: String, + pub edge_type: String, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub enum StopType { + None, + Checkpoint, + Graceful, + Immediate, + Force, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct FailureReason { + pub error: String, + pub domain: ErrorDomain, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct Job { + pub id: String, + pub running_desired: bool, + pub state: String, + pub run_id: u64, + pub start_time: Option, + pub finish_time: Option, + pub tasks: Option, + pub failure_reason: Option, + pub created_at: u64, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub enum JobLogLevel { + Info, + Warn, + Error, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct JobLogMessage { + pub id: String, + pub created_at: u64, + pub operator_id: Option, + pub task_index: Option, + pub level: JobLogLevel, + pub message: String, + pub details: String, + pub error_domain: Option, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct OutputData { + pub operator_id: String, + pub subtask_idx: u32, + pub timestamps: Vec, + pub start_id: u64, + pub batch: String, +} diff --git a/src/sql/api/public_ids.rs b/src/sql/api/public_ids.rs new file mode 100644 index 00000000..33aa6427 --- /dev/null +++ b/src/sql/api/public_ids.rs @@ -0,0 +1,69 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::time::{SystemTime, UNIX_EPOCH}; + +const ID_LENGTH: usize = 10; + +const ALPHABET: &[u8; 62] = b"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; + +pub enum IdTypes { + ApiKey, + ConnectionProfile, + Schema, + Pipeline, + JobConfig, + Checkpoint, + JobStatus, + ClusterInfo, + JobLogMessage, + ConnectionTable, + ConnectionTablePipeline, + Udf, +} + +/// Generates a unique identifier with a type-specific prefix. +/// +/// Uses a simple time + random approach instead of nanoid to avoid an extra dependency. +pub fn generate_id(id_type: IdTypes) -> String { + let prefix = match id_type { + IdTypes::ApiKey => "ak", + IdTypes::ConnectionProfile => "cp", + IdTypes::Schema => "sch", + IdTypes::Pipeline => "pl", + IdTypes::JobConfig => "job", + IdTypes::Checkpoint => "chk", + IdTypes::JobStatus => "js", + IdTypes::ClusterInfo => "ci", + IdTypes::JobLogMessage => "jlm", + IdTypes::ConnectionTable => "ct", + IdTypes::ConnectionTablePipeline => "ctp", + IdTypes::Udf => "udf", + }; + + let nanos = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_nanos(); + + let mut id = String::with_capacity(ID_LENGTH); + let mut seed = nanos; + for _ in 0..ID_LENGTH { + seed ^= seed + .wrapping_mul(6364136223846793005) + .wrapping_add(1442695040888963407); + let idx = (seed % ALPHABET.len() as u128) as usize; + id.push(ALPHABET[idx] as char); + } + + format!("{prefix}_{id}") +} diff --git a/src/sql/api/schema_resolver.rs b/src/sql/api/schema_resolver.rs new file mode 100644 index 00000000..57d3d702 --- /dev/null +++ 
b/src/sql/api/schema_resolver.rs @@ -0,0 +1,94 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use async_trait::async_trait; + +/// Trait for resolving schemas by ID (e.g., from a schema registry). +#[async_trait] +pub trait SchemaResolver: Send { + async fn resolve_schema(&self, id: u32) -> Result, String>; +} + +/// A resolver that always fails — used when no schema registry is configured. +pub struct FailingSchemaResolver; + +impl Default for FailingSchemaResolver { + fn default() -> Self { + Self + } +} + +#[async_trait] +impl SchemaResolver for FailingSchemaResolver { + async fn resolve_schema(&self, id: u32) -> Result, String> { + Err(format!( + "Schema with id {id} not available, and no schema registry configured" + )) + } +} + +/// A resolver that returns a fixed schema for a known ID. +pub struct FixedSchemaResolver { + id: u32, + schema: String, +} + +impl FixedSchemaResolver { + pub fn new(id: u32, schema: String) -> Self { + FixedSchemaResolver { id, schema } + } +} + +#[async_trait] +impl SchemaResolver for FixedSchemaResolver { + async fn resolve_schema(&self, id: u32) -> Result, String> { + if id == self.id { + Ok(Some(self.schema.clone())) + } else { + Err(format!("Unexpected schema id {}, expected {}", id, self.id)) + } + } +} + +/// A caching wrapper around any `SchemaResolver`. 
+pub struct CachingSchemaResolver { + inner: R, + cache: tokio::sync::RwLock>, +} + +impl CachingSchemaResolver { + pub fn new(inner: R) -> Self { + Self { + inner, + cache: tokio::sync::RwLock::new(std::collections::HashMap::new()), + } + } +} + +#[async_trait] +impl SchemaResolver for CachingSchemaResolver { + async fn resolve_schema(&self, id: u32) -> Result, String> { + { + let cache = self.cache.read().await; + if let Some(schema) = cache.get(&id) { + return Ok(Some(schema.clone())); + } + } + + let result = self.inner.resolve_schema(id).await?; + if let Some(ref schema) = result { + let mut cache = self.cache.write().await; + cache.insert(id, schema.clone()); + } + Ok(result) + } +} diff --git a/src/sql/api/udfs.rs b/src/sql/api/udfs.rs new file mode 100644 index 00000000..781d5b07 --- /dev/null +++ b/src/sql/api/udfs.rs @@ -0,0 +1,68 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use serde::{Deserialize, Serialize}; + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct Udf { + pub definition: String, + #[serde(default)] + pub language: UdfLanguage, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct ValidateUdfPost { + pub definition: String, + #[serde(default)] + pub language: UdfLanguage, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct UdfValidationResult { + pub udf_name: Option, + pub errors: Vec, +} + +#[derive(Serialize, Deserialize, Copy, Clone, Debug, Default, Eq, PartialEq)] +#[serde(rename_all = "snake_case")] +pub enum UdfLanguage { + Python, + #[default] + Rust, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct UdfPost { + pub prefix: String, + #[serde(default)] + pub language: UdfLanguage, + pub definition: String, + pub description: Option, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct GlobalUdf { + pub id: String, + pub prefix: String, + pub name: String, + pub language: UdfLanguage, + pub created_at: u64, + pub updated_at: u64, + pub definition: String, + pub description: Option, + pub dylib_url: Option, +} diff --git a/src/sql/api/var_str.rs b/src/sql/api/var_str.rs new file mode 100644 index 00000000..2638cd06 --- /dev/null +++ b/src/sql/api/var_str.rs @@ -0,0 +1,91 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +use serde::{Deserialize, Serialize}; +use std::env; + +/// A string that may contain `{{ VAR }}` placeholders for environment variable substitution. +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +#[serde(transparent)] +pub struct VarStr { + raw_val: String, +} + +impl VarStr { + pub fn new(raw_val: String) -> Self { + VarStr { raw_val } + } + + pub fn raw(&self) -> &str { + &self.raw_val + } + + /// Substitute `{{ VAR_NAME }}` patterns with the corresponding environment variable values. + pub fn sub_env_vars(&self) -> anyhow::Result { + let mut result = self.raw_val.clone(); + let mut start = 0; + + while let Some(open) = result[start..].find("{{") { + let open_abs = start + open; + let Some(close) = result[open_abs..].find("}}") else { + break; + }; + let close_abs = open_abs + close; + + let var_name = result[open_abs + 2..close_abs].trim(); + if var_name.is_empty() { + start = close_abs + 2; + continue; + } + + match env::var(var_name) { + Ok(value) => { + let full_match = &result[open_abs..close_abs + 2]; + let full_match_owned = full_match.to_string(); + result = result.replacen(&full_match_owned, &value, 1); + start = open_abs + value.len(); + } + Err(_) => { + anyhow::bail!("Environment variable {} not found", var_name); + } + } + } + + Ok(result) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_no_placeholders() { + let input = "This is a test string with no placeholders"; + assert_eq!( + VarStr::new(input.to_string()).sub_env_vars().unwrap(), + input + ); + } + + #[test] + fn test_with_placeholders() { + unsafe { env::set_var("FS_TEST_VAR", "environment variable") }; + let input = "This is a {{ FS_TEST_VAR }}"; + let expected = "This is a environment variable"; + assert_eq!( + VarStr::new(input.to_string()).sub_env_vars().unwrap(), + expected + ); + unsafe { env::remove_var("FS_TEST_VAR") }; + } +} diff --git 
a/src/sql/common/arrow_ext.rs b/src/sql/common/arrow_ext.rs new file mode 100644 index 00000000..782f4358 --- /dev/null +++ b/src/sql/common/arrow_ext.rs @@ -0,0 +1,181 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::HashMap; +use std::fmt::{Display, Formatter}; +use std::time::SystemTime; + +use datafusion::arrow::datatypes::{DataType, Field, TimeUnit}; + +pub struct DisplayAsSql<'a>(pub &'a DataType); + +impl Display for DisplayAsSql<'_> { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self.0 { + DataType::Boolean => write!(f, "BOOLEAN"), + DataType::Int8 | DataType::Int16 | DataType::Int32 => write!(f, "INT"), + DataType::Int64 => write!(f, "BIGINT"), + DataType::UInt8 | DataType::UInt16 | DataType::UInt32 => write!(f, "INT UNSIGNED"), + DataType::UInt64 => write!(f, "BIGINT UNSIGNED"), + DataType::Float16 | DataType::Float32 => write!(f, "FLOAT"), + DataType::Float64 => write!(f, "DOUBLE"), + DataType::Timestamp(_, _) => write!(f, "TIMESTAMP"), + DataType::Date32 => write!(f, "DATE"), + DataType::Date64 => write!(f, "DATETIME"), + DataType::Time32(_) => write!(f, "TIME"), + DataType::Time64(_) => write!(f, "TIME"), + DataType::Duration(_) => write!(f, "INTERVAL"), + DataType::Interval(_) => write!(f, "INTERVAL"), + DataType::Binary | DataType::FixedSizeBinary(_) | DataType::LargeBinary => { + write!(f, "BYTEA") + } + DataType::Utf8 | DataType::LargeUtf8 => write!(f, "TEXT"), + DataType::List(inner) => { + 
write!(f, "{}[]", DisplayAsSql(inner.data_type())) + } + dt => write!(f, "{dt}"), + } + } +} + +/// Arrow extension type markers for FunctionStream-specific semantics. +#[derive(Copy, Clone, Eq, PartialEq, Debug)] +pub enum FsExtensionType { + JSON, +} + +impl FsExtensionType { + pub fn from_map(map: &HashMap) -> Option { + match map.get("ARROW:extension:name")?.as_str() { + "functionstream.json" => Some(Self::JSON), + _ => None, + } + } + + pub fn add_metadata(v: Option, field: Field) -> Field { + if let Some(v) = v { + let mut m = HashMap::new(); + match v { + FsExtensionType::JSON => { + m.insert( + "ARROW:extension:name".to_string(), + "functionstream.json".to_string(), + ); + } + } + field.with_metadata(m) + } else { + field + } + } +} + +pub trait GetArrowType { + fn arrow_type() -> DataType; +} + +pub trait GetArrowSchema { + fn arrow_schema() -> datafusion::arrow::datatypes::Schema; +} + +impl GetArrowType for T +where + T: GetArrowSchema, +{ + fn arrow_type() -> DataType { + DataType::Struct(Self::arrow_schema().fields.clone()) + } +} + +impl GetArrowType for bool { + fn arrow_type() -> DataType { + DataType::Boolean + } +} + +impl GetArrowType for i8 { + fn arrow_type() -> DataType { + DataType::Int8 + } +} + +impl GetArrowType for i16 { + fn arrow_type() -> DataType { + DataType::Int16 + } +} + +impl GetArrowType for i32 { + fn arrow_type() -> DataType { + DataType::Int32 + } +} + +impl GetArrowType for i64 { + fn arrow_type() -> DataType { + DataType::Int64 + } +} + +impl GetArrowType for u8 { + fn arrow_type() -> DataType { + DataType::UInt8 + } +} + +impl GetArrowType for u16 { + fn arrow_type() -> DataType { + DataType::UInt16 + } +} + +impl GetArrowType for u32 { + fn arrow_type() -> DataType { + DataType::UInt32 + } +} + +impl GetArrowType for u64 { + fn arrow_type() -> DataType { + DataType::UInt64 + } +} + +impl GetArrowType for f32 { + fn arrow_type() -> DataType { + DataType::Float32 + } +} + +impl GetArrowType for f64 { + fn arrow_type() -> 
DataType { + DataType::Float64 + } +} + +impl GetArrowType for String { + fn arrow_type() -> DataType { + DataType::Utf8 + } +} + +impl GetArrowType for Vec { + fn arrow_type() -> DataType { + DataType::Binary + } +} + +impl GetArrowType for SystemTime { + fn arrow_type() -> DataType { + DataType::Timestamp(TimeUnit::Nanosecond, None) + } +} diff --git a/src/sql/common/connector_options.rs b/src/sql/common/connector_options.rs new file mode 100644 index 00000000..6f82782e --- /dev/null +++ b/src/sql/common/connector_options.rs @@ -0,0 +1,434 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::{BTreeMap, HashMap}; +use std::num::{NonZero, NonZeroU64}; +use std::str::FromStr; +use std::time::Duration; + +use datafusion::common::{Result as DFResult, plan_datafusion_err}; +use datafusion::error::DataFusionError; +use datafusion::sql::sqlparser::ast::{Expr, Ident, SqlOption, Value as SqlValue, ValueWithSpan}; +use tracing::warn; + +use super::constants::{interval_duration_unit, with_opt_bool_str}; + +pub trait FromOpts: Sized { + fn from_opts(opts: &mut ConnectorOptions) -> DFResult; +} + +pub struct ConnectorOptions { + options: HashMap, + partitions: Vec, +} + +fn sql_expr_to_catalog_string(e: &Expr) -> String { + match e { + Expr::Value(ValueWithSpan { value, .. 
}) => match value { + SqlValue::SingleQuotedString(s) | SqlValue::DoubleQuotedString(s) => s.clone(), + SqlValue::NationalStringLiteral(s) => s.clone(), + SqlValue::HexStringLiteral(s) => s.clone(), + SqlValue::Number(n, _) => n.clone(), + SqlValue::Boolean(b) => b.to_string(), + SqlValue::Null => "NULL".to_string(), + other => other.to_string(), + }, + Expr::Identifier(ident) => ident.value.clone(), + other => other.to_string(), + } +} + +impl ConnectorOptions { + /// Build options from persisted catalog string maps (same semantics as SQL `WITH` literals). + pub fn from_flat_string_map(map: HashMap) -> DFResult { + let mut options = HashMap::with_capacity(map.len()); + for (k, v) in map { + options.insert( + k, + Expr::Value(SqlValue::SingleQuotedString(v).with_empty_span()), + ); + } + Ok(Self { + options, + partitions: Vec::new(), + }) + } + + pub fn new(sql_opts: &[SqlOption], partition_by: &Option>) -> DFResult { + let mut options = HashMap::new(); + + for option in sql_opts { + let SqlOption::KeyValue { key, value } = option else { + return Err(plan_datafusion_err!( + "invalid with option: '{}'; expected an `=` delimited key-value pair", + option + )); + }; + + options.insert(key.value.clone(), value.clone()); + } + + Ok(Self { + options, + partitions: partition_by.clone().unwrap_or_default(), + }) + } + + pub fn partitions(&self) -> &[Expr] { + &self.partitions + } + + pub fn pull_struct(&mut self) -> DFResult { + T::from_opts(self) + } + + pub fn pull_opt_str(&mut self, name: &str) -> DFResult> { + match self.options.remove(name) { + Some(Expr::Value(ValueWithSpan { + value: SqlValue::SingleQuotedString(s), + span: _, + })) => Ok(Some(s)), + Some(e) => Err(plan_datafusion_err!( + "expected with option '{}' to be a single-quoted string, but it was `{:?}`", + name, + e + )), + None => Ok(None), + } + } + + pub fn pull_str(&mut self, name: &str) -> DFResult { + self.pull_opt_str(name)? 
+ .ok_or_else(|| plan_datafusion_err!("required option '{}' not set", name)) + } + + pub fn pull_opt_bool(&mut self, name: &str) -> DFResult> { + match self.options.remove(name) { + Some(Expr::Value(ValueWithSpan { + value: SqlValue::Boolean(b), + span: _, + })) => Ok(Some(b)), + Some(Expr::Value(ValueWithSpan { + value: SqlValue::SingleQuotedString(s), + span: _, + })) => match s.as_str() { + with_opt_bool_str::TRUE | with_opt_bool_str::YES => Ok(Some(true)), + with_opt_bool_str::FALSE | with_opt_bool_str::NO => Ok(Some(false)), + _ => Err(plan_datafusion_err!( + "expected with option '{}' to be a boolean, but it was `'{}'`", + name, + s + )), + }, + Some(e) => Err(plan_datafusion_err!( + "expected with option '{}' to be a boolean, but it was `{:?}`", + name, + e + )), + None => Ok(None), + } + } + + pub fn pull_opt_u64(&mut self, name: &str) -> DFResult> { + match self.options.remove(name) { + Some(Expr::Value(ValueWithSpan { + value: SqlValue::Number(s, _), + span: _, + })) + | Some(Expr::Value(ValueWithSpan { + value: SqlValue::SingleQuotedString(s), + span: _, + })) => s.parse::().map(Some).map_err(|_| { + plan_datafusion_err!( + "expected with option '{}' to be an unsigned integer, but it was `{}`", + name, + s + ) + }), + Some(e) => Err(plan_datafusion_err!( + "expected with option '{}' to be an unsigned integer, but it was `{:?}`", + name, + e + )), + None => Ok(None), + } + } + + pub fn pull_opt_nonzero_u64(&mut self, name: &str) -> DFResult>> { + match self.pull_opt_u64(name)? { + Some(0) => Err(plan_datafusion_err!( + "expected with option '{name}' to be greater than 0, but it was 0" + )), + Some(i) => Ok(Some(NonZeroU64::new(i).unwrap())), + None => Ok(None), + } + } + + pub fn pull_opt_data_size_bytes(&mut self, name: &str) -> DFResult> { + self.pull_opt_str(name)? 
+ .map(|s| { + s.parse::().map_err(|_| { + plan_datafusion_err!( + "expected with option '{}' to be a size in bytes (unsigned integer), but it was `{}`", + name, + s + ) + }) + }) + .transpose() + } + + pub fn pull_opt_i64(&mut self, name: &str) -> DFResult> { + match self.options.remove(name) { + Some(Expr::Value(ValueWithSpan { + value: SqlValue::Number(s, _), + span: _, + })) + | Some(Expr::Value(ValueWithSpan { + value: SqlValue::SingleQuotedString(s), + span: _, + })) => s.parse::().map(Some).map_err(|_| { + plan_datafusion_err!( + "expected with option '{}' to be an integer, but it was `{}`", + name, + s + ) + }), + Some(e) => Err(plan_datafusion_err!( + "expected with option '{}' to be an integer, but it was `{:?}`", + name, + e + )), + None => Ok(None), + } + } + + pub fn pull_i64(&mut self, name: &str) -> DFResult { + self.pull_opt_i64(name)? + .ok_or_else(|| plan_datafusion_err!("required option '{}' not set", name)) + } + + pub fn pull_u64(&mut self, name: &str) -> DFResult { + self.pull_opt_u64(name)? + .ok_or_else(|| plan_datafusion_err!("required option '{}' not set", name)) + } + + pub fn pull_opt_f64(&mut self, name: &str) -> DFResult> { + match self.options.remove(name) { + Some(Expr::Value(ValueWithSpan { + value: SqlValue::Number(s, _), + span: _, + })) + | Some(Expr::Value(ValueWithSpan { + value: SqlValue::SingleQuotedString(s), + span: _, + })) => s.parse::().map(Some).map_err(|_| { + plan_datafusion_err!( + "expected with option '{}' to be a double, but it was `{}`", + name, + s + ) + }), + Some(e) => Err(plan_datafusion_err!( + "expected with option '{}' to be a double, but it was `{:?}`", + name, + e + )), + None => Ok(None), + } + } + + pub fn pull_f64(&mut self, name: &str) -> DFResult { + self.pull_opt_f64(name)? + .ok_or_else(|| plan_datafusion_err!("required option '{}' not set", name)) + } + + pub fn pull_bool(&mut self, name: &str) -> DFResult { + self.pull_opt_bool(name)? 
+ .ok_or_else(|| plan_datafusion_err!("required option '{}' not set", name)) + } + + pub fn pull_opt_duration(&mut self, name: &str) -> DFResult> { + match self.options.remove(name) { + Some(e) => Ok(Some(duration_from_sql_expr(&e).map_err(|e| { + plan_datafusion_err!("in with clause '{name}': {}", e) + })?)), + None => Ok(None), + } + } + + pub fn pull_opt_field(&mut self, name: &str) -> DFResult> { + match self.options.remove(name) { + Some(Expr::Value(ValueWithSpan { + value: SqlValue::SingleQuotedString(s), + span: _, + })) => { + warn!( + "Referred to a field in `{name}` with a string—this is deprecated and will be unsupported after Arroyo 0.14" + ); + Ok(Some(s)) + } + Some(Expr::Identifier(Ident { value, .. })) => Ok(Some(value)), + Some(e) => Err(plan_datafusion_err!( + "expected with option '{}' to be a field, but it was `{:?}`", + name, + e + )), + None => Ok(None), + } + } + + pub fn pull_opt_array(&mut self, name: &str) -> Option> { + Some(match self.options.remove(name)? { + Expr::Value(ValueWithSpan { + value: SqlValue::SingleQuotedString(s), + span, + }) => s + .split(',') + .map(|p| { + Expr::Value(ValueWithSpan { + value: SqlValue::SingleQuotedString(p.to_string()), + span: span.clone(), + }) + }) + .collect(), + Expr::Array(a) => a.elem, + e => vec![e], + }) + } + + pub fn pull_opt_parsed(&mut self, name: &str) -> DFResult> { + Ok(match self.pull_opt_str(name)? 
{ + Some(s) => Some( + s.parse() + .map_err(|_| plan_datafusion_err!("invalid value '{s}' for {name}"))?, + ), + None => None, + }) + } + + pub fn keys(&self) -> impl Iterator { + self.options.keys() + } + + pub fn keys_with_prefix<'a, 'b>( + &'a self, + prefix: &'b str, + ) -> impl Iterator + 'b + where + 'a: 'b, + { + self.options.keys().filter(move |k| k.starts_with(prefix)) + } + + pub fn insert_str( + &mut self, + name: impl Into, + value: impl Into, + ) -> DFResult> { + let name = name.into(); + let value = value.into(); + let existing = self.pull_opt_str(&name)?; + self.options.insert( + name, + Expr::Value(SqlValue::SingleQuotedString(value).with_empty_span()), + ); + Ok(existing) + } + + pub fn is_empty(&self) -> bool { + self.options.is_empty() + } + + pub fn contains_key(&self, key: &str) -> bool { + self.options.contains_key(key) + } + + /// Drain all remaining options into string values (for connector runtime config). + pub fn drain_remaining_string_values(&mut self) -> DFResult> { + let taken = std::mem::take(&mut self.options); + let mut out = HashMap::with_capacity(taken.len()); + for (k, v) in taken { + out.insert(k, format!("{v}")); + } + Ok(out) + } + + /// Snapshot of all current `WITH` key/value pairs for catalog persistence (`SHOW CREATE TABLE`). + /// Call before any `pull_*` consumes options. + pub fn snapshot_for_catalog(&self) -> BTreeMap { + self.options + .iter() + .map(|(k, v)| (k.clone(), sql_expr_to_catalog_string(v))) + .collect() + } +} + +fn duration_from_sql_expr(expr: &Expr) -> Result { + match expr { + Expr::Interval(interval) => { + let s = match interval.value.as_ref() { + Expr::Value(ValueWithSpan { + value: SqlValue::SingleQuotedString(s), + .. + }) => s.clone(), + other => { + return Err(DataFusionError::Plan(format!( + "expected interval string literal, found {other}" + ))); + } + }; + parse_interval_to_duration(&s) + } + Expr::Value(ValueWithSpan { + value: SqlValue::SingleQuotedString(s), + .. 
+ }) => parse_interval_to_duration(s), + other => Err(DataFusionError::Plan(format!( + "expected an interval expression, found {other}" + ))), + } +} + +fn parse_interval_to_duration(s: &str) -> Result { + let parts: Vec<&str> = s.trim().split_whitespace().collect(); + if parts.len() != 2 { + return Err(DataFusionError::Plan(format!( + "invalid interval string '{s}'; expected ' '" + ))); + } + let value: u64 = parts[0] + .parse() + .map_err(|_| DataFusionError::Plan(format!("invalid interval number: {}", parts[0])))?; + let unit_lc = parts[1].to_lowercase(); + let unit = unit_lc.as_str(); + let duration = match unit { + interval_duration_unit::SECOND + | interval_duration_unit::SECONDS + | interval_duration_unit::S => Duration::from_secs(value), + interval_duration_unit::MINUTE + | interval_duration_unit::MINUTES + | interval_duration_unit::MIN => Duration::from_secs(value * 60), + interval_duration_unit::HOUR + | interval_duration_unit::HOURS + | interval_duration_unit::H => Duration::from_secs(value * 3600), + interval_duration_unit::DAY + | interval_duration_unit::DAYS + | interval_duration_unit::D => Duration::from_secs(value * 86400), + unit => { + return Err(DataFusionError::Plan(format!( + "unsupported interval unit '{unit}'" + ))); + } + }; + Ok(duration) +} diff --git a/src/sql/common/constants.rs b/src/sql/common/constants.rs new file mode 100644 index 00000000..8eb697e2 --- /dev/null +++ b/src/sql/common/constants.rs @@ -0,0 +1,299 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + + + +pub mod scalar_fn { + pub const GET_FIRST_JSON_OBJECT: &str = "get_first_json_object"; + pub const EXTRACT_JSON: &str = "extract_json"; + pub const EXTRACT_JSON_STRING: &str = "extract_json_string"; + pub const SERIALIZE_JSON_UNION: &str = "serialize_json_union"; + pub const MULTI_HASH: &str = "multi_hash"; +} + + +pub mod window_fn { + pub const HOP: &str = "hop"; + pub const TUMBLE: &str = "tumble"; + pub const SESSION: &str = "session"; +} + + +pub mod planning_placeholder_udf { + pub const UNNEST: &str = "unnest"; + pub const ROW_TIME: &str = "row_time"; + pub const LIST_ELEMENT_FIELD: &str = "field"; +} + + +pub mod operator_feature { + pub const ASYNC_UDF: &str = "async-udf"; + pub const JOIN_WITH_EXPIRATION: &str = "join-with-expiration"; + pub const WINDOWED_JOIN: &str = "windowed-join"; + pub const SQL_WINDOW_FUNCTION: &str = "sql-window-function"; + pub const LOOKUP_JOIN: &str = "lookup-join"; + pub const SQL_TUMBLING_WINDOW_AGGREGATE: &str = "sql-tumbling-window-aggregate"; + pub const SQL_SLIDING_WINDOW_AGGREGATE: &str = "sql-sliding-window-aggregate"; + pub const SQL_SESSION_WINDOW_AGGREGATE: &str = "sql-session-window-aggregate"; + pub const SQL_UPDATING_AGGREGATE: &str = "sql-updating-aggregate"; + pub const KEY_BY_ROUTING: &str = "key-by-routing"; + pub const CONNECTOR_SOURCE: &str = "connector-source"; + pub const CONNECTOR_SINK: &str = "connector-sink"; +} + + +pub mod extension_node { + pub const STREAM_WINDOW_AGGREGATE: &str = "StreamWindowAggregateNode"; + pub const STREAMING_WINDOW_FUNCTION: &str = "StreamingWindowFunctionNode"; + pub const EVENT_TIME_WATERMARK: &str = "EventTimeWatermarkNode"; + pub const CONTINUOUS_AGGREGATE: &str = "ContinuousAggregateNode"; + pub const SYSTEM_TIMESTAMP_INJECTOR: &str = "SystemTimestampInjectorNode"; + pub const STREAM_INGESTION: &str = "StreamIngestionNode"; + pub const STREAM_EGRESS: &str = 
"StreamEgressNode"; + pub const STREAM_PROJECTION: &str = "StreamProjectionNode"; + pub const REMOTE_TABLE_BOUNDARY: &str = "RemoteTableBoundaryNode"; + pub const REFERENCE_TABLE_SOURCE: &str = "ReferenceTableSource"; + pub const STREAM_REFERENCE_JOIN: &str = "StreamReferenceJoin"; + pub const KEY_EXTRACTION: &str = "KeyExtractionNode"; + pub const STREAMING_JOIN: &str = "StreamingJoinNode"; + pub const ASYNC_FUNCTION_EXECUTION: &str = "AsyncFunctionExecutionNode"; + pub const UNROLL_DEBEZIUM_PAYLOAD: &str = "UnrollDebeziumPayloadNode"; + pub const PACK_DEBEZIUM_ENVELOPE: &str = "PackDebeziumEnvelopeNode"; +} + + +pub mod proto_operator_name { + pub const TUMBLING_WINDOW: &str = "TumblingWindow"; + pub const UPDATING_AGGREGATE: &str = "UpdatingAggregate"; + pub const WINDOW_FUNCTION: &str = "WindowFunction"; + pub const SLIDING_WINDOW_LABEL: &str = "sliding window"; + pub const INSTANT_WINDOW: &str = "InstantWindow"; + pub const INSTANT_WINDOW_LABEL: &str = "instant window"; +} + + +pub mod runtime_operator_kind { + pub const STREAMING_JOIN: &str = "streaming_join"; + pub const WATERMARK_GENERATOR: &str = "watermark_generator"; + pub const STREAMING_WINDOW_EVALUATOR: &str = "streaming_window_evaluator"; +} + + +pub mod factory_operator_name { + pub const CONNECTOR_SOURCE: &str = "ConnectorSource"; + pub const CONNECTOR_SINK: &str = "ConnectorSink"; + pub const KAFKA_SOURCE: &str = "KafkaSource"; + pub const KAFKA_SINK: &str = "KafkaSink"; +} + + +pub mod cdc { + pub const BEFORE: &str = "before"; + pub const AFTER: &str = "after"; + pub const OP: &str = "op"; +} + + +pub mod updating_state_field { + pub const IS_RETRACT: &str = "is_retract"; + pub const ID: &str = "id"; +} + + +pub mod sql_field { + pub const ASYNC_RESULT: &str = "__async_result"; + pub const DEFAULT_KEY_LABEL: &str = "key"; + pub const DEFAULT_PROJECTION_LABEL: &str = "projection"; + pub const COMPUTED_WATERMARK: &str = "__watermark"; + pub const TIMESTAMP_FIELD: &str = "_timestamp"; + pub const 
UPDATING_META_FIELD: &str = "_updating_meta"; +} + + +pub mod sql_planning_default { + pub const DEFAULT_PARALLELISM: usize = 4; + pub const PLANNING_TTL_SECS: u64 = 24 * 60 * 60; +} + + +pub mod with_opt_bool_str { + pub const TRUE: &str = "true"; + pub const YES: &str = "yes"; + pub const FALSE: &str = "false"; + pub const NO: &str = "no"; +} + +pub mod interval_duration_unit { + pub const SECOND: &str = "second"; + pub const SECONDS: &str = "seconds"; + pub const S: &str = "s"; + pub const MINUTE: &str = "minute"; + pub const MINUTES: &str = "minutes"; + pub const MIN: &str = "min"; + pub const HOUR: &str = "hour"; + pub const HOURS: &str = "hours"; + pub const H: &str = "h"; + pub const DAY: &str = "day"; + pub const DAYS: &str = "days"; + pub const D: &str = "d"; +} + + +pub mod connection_format_value { + pub const JSON: &str = "json"; + pub const DEBEZIUM_JSON: &str = "debezium_json"; + pub const AVRO: &str = "avro"; + pub const PARQUET: &str = "parquet"; + pub const PROTOBUF: &str = "protobuf"; + pub const RAW_STRING: &str = "raw_string"; + pub const RAW_BYTES: &str = "raw_bytes"; +} + +pub mod framing_method_value { + pub const NEWLINE: &str = "newline"; + pub const NEWLINE_DELIMITED: &str = "newline_delimited"; +} + +pub mod bad_data_value { + pub const FAIL: &str = "fail"; + pub const DROP: &str = "drop"; +} + + +pub mod timestamp_format_value { + pub const RFC3339_SNAKE: &str = "rfc3339"; + pub const RFC3339_UPPER: &str = "RFC3339"; + pub const UNIX_MILLIS_SNAKE: &str = "unix_millis"; + pub const UNIX_MILLIS_PASCAL: &str = "UnixMillis"; +} + +pub mod decimal_encoding_value { + pub const NUMBER: &str = "number"; + pub const STRING: &str = "string"; + pub const BYTES: &str = "bytes"; +} + +pub mod json_compression_value { + pub const UNCOMPRESSED: &str = "uncompressed"; + pub const GZIP: &str = "gzip"; +} + +pub mod parquet_compression_value { + pub const UNCOMPRESSED: &str = "uncompressed"; + pub const SNAPPY: &str = "snappy"; + pub const GZIP: &str = 
"gzip"; + pub const ZSTD: &str = "zstd"; + pub const LZ4: &str = "lz4"; + pub const LZ4_RAW: &str = "lz4_raw"; +} + + +pub mod date_part_keyword { + pub const YEAR: &str = "year"; + pub const MONTH: &str = "month"; + pub const WEEK: &str = "week"; + pub const DAY: &str = "day"; + pub const HOUR: &str = "hour"; + pub const MINUTE: &str = "minute"; + pub const SECOND: &str = "second"; + pub const MILLISECOND: &str = "millisecond"; + pub const MICROSECOND: &str = "microsecond"; + pub const NANOSECOND: &str = "nanosecond"; + pub const DOW: &str = "dow"; + pub const DOY: &str = "doy"; +} + +pub mod date_trunc_keyword { + pub const YEAR: &str = "year"; + pub const QUARTER: &str = "quarter"; + pub const MONTH: &str = "month"; + pub const WEEK: &str = "week"; + pub const DAY: &str = "day"; + pub const HOUR: &str = "hour"; + pub const MINUTE: &str = "minute"; + pub const SECOND: &str = "second"; +} + + +pub mod mem_exec_join_side { + pub const LEFT: &str = "left"; + pub const RIGHT: &str = "right"; +} + +pub mod physical_plan_node_name { + pub const RW_LOCK_READER: &str = "rw_lock_reader"; + pub const UNBOUNDED_READER: &str = "unbounded_reader"; + pub const VEC_READER: &str = "vec_reader"; + pub const MEM_EXEC: &str = "mem_exec"; + pub const DEBEZIUM_UNROLLING_EXEC: &str = "debezium_unrolling_exec"; + pub const TO_DEBEZIUM_EXEC: &str = "to_debezium_exec"; +} + +pub mod window_function_udf { + pub const NAME: &str = "window"; +} + +pub mod window_interval_field { + pub const START: &str = "start"; + pub const END: &str = "end"; +} + +pub mod debezium_op_short { + pub const CREATE: &str = "c"; + pub const READ: &str = "r"; + pub const UPDATE: &str = "u"; + pub const DELETE: &str = "d"; +} + + +pub mod connector_type { + pub const KAFKA: &str = "kafka"; + pub const KINESIS: &str = "kinesis"; + pub const FILESYSTEM: &str = "filesystem"; + pub const DELTA: &str = "delta"; + pub const ICEBERG: &str = "iceberg"; + pub const PULSAR: &str = "pulsar"; + pub const NATS: &str = "nats"; 
+ pub const REDIS: &str = "redis"; + pub const MQTT: &str = "mqtt"; + pub const WEBSOCKET: &str = "websocket"; + pub const SSE: &str = "sse"; + pub const NEXMARK: &str = "nexmark"; + pub const BLACKHOLE: &str = "blackhole"; + pub const MEMORY: &str = "memory"; + pub const POSTGRES: &str = "postgres"; +} + + +pub mod connection_table_role { + pub const SOURCE: &str = "source"; + pub const SINK: &str = "sink"; + pub const LOOKUP: &str = "lookup"; +} + +pub const SUPPORTED_CONNECTOR_ADAPTERS: &[&str] = &[ + connector_type::KAFKA, +]; + + +pub mod kafka_with_value { + pub const SCAN_LATEST: &str = "latest"; + pub const SCAN_EARLIEST: &str = "earliest"; + pub const SCAN_GROUP_OFFSETS: &str = "group-offsets"; + pub const SCAN_GROUP: &str = "group"; + pub const ISOLATION_READ_COMMITTED: &str = "read_committed"; + pub const ISOLATION_READ_UNCOMMITTED: &str = "read_uncommitted"; + pub const SINK_COMMIT_EXACTLY_ONCE_HYPHEN: &str = "exactly-once"; + pub const SINK_COMMIT_EXACTLY_ONCE_UNDERSCORE: &str = "exactly_once"; + pub const SINK_COMMIT_AT_LEAST_ONCE_HYPHEN: &str = "at-least-once"; + pub const SINK_COMMIT_AT_LEAST_ONCE_UNDERSCORE: &str = "at_least_once"; +} diff --git a/src/sql/common/control.rs b/src/sql/common/control.rs new file mode 100644 index 00000000..4ea9a12f --- /dev/null +++ b/src/sql/common/control.rs @@ -0,0 +1,164 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::collections::HashMap; +use std::time::SystemTime; + +use super::message::CheckpointBarrier; + +/// Control messages sent from the controller to worker tasks. +#[derive(Debug, Clone)] +pub enum ControlMessage { + Checkpoint(CheckpointBarrier), + Stop { + mode: StopMode, + }, + Commit { + epoch: u32, + commit_data: HashMap>>, + }, + LoadCompacted { + compacted: CompactionResult, + }, + NoOp, +} + +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub enum StopMode { + Graceful, + Immediate, +} + +#[derive(Debug, Clone)] +pub struct CompactionResult { + pub operator_id: String, + pub compacted_tables: HashMap, +} + +#[derive(Debug, Clone)] +pub struct TableCheckpointMetadata { + pub table_type: TableType, + pub data: Vec, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum TableType { + GlobalKeyValue, + ExpiringKeyedTimeTable, +} + +/// Responses sent from worker tasks back to the controller. +#[derive(Debug, Clone)] +pub enum ControlResp { + CheckpointEvent(CheckpointEvent), + CheckpointCompleted(CheckpointCompleted), + TaskStarted { + node_id: u32, + task_index: usize, + start_time: SystemTime, + }, + TaskFinished { + node_id: u32, + task_index: usize, + }, + TaskFailed { + node_id: u32, + task_index: usize, + error: TaskError, + }, + Error { + node_id: u32, + operator_id: String, + task_index: usize, + message: String, + details: String, + }, +} + +#[derive(Debug, Clone)] +pub struct CheckpointCompleted { + pub checkpoint_epoch: u32, + pub node_id: u32, + pub operator_id: String, + pub subtask_metadata: SubtaskCheckpointMetadata, +} + +#[derive(Debug, Clone)] +pub struct SubtaskCheckpointMetadata { + pub subtask_index: u32, + pub start_time: u64, + pub finish_time: u64, + pub watermark: Option, + pub bytes: u64, + pub table_metadata: HashMap, + pub table_configs: HashMap, +} + +#[derive(Debug, Clone)] +pub struct TableSubtaskCheckpointMetadata { + pub subtask_index: u32, + pub table_type: TableType, + pub data: Vec, +} + +#[derive(Debug, Clone)] 
+pub struct TableConfig { + pub table_type: TableType, + pub config: Vec, + pub state_version: u32, +} + +#[derive(Debug, Clone)] +pub struct CheckpointEvent { + pub checkpoint_epoch: u32, + pub node_id: u32, + pub operator_id: String, + pub subtask_index: u32, + pub time: SystemTime, + pub event_type: TaskCheckpointEventType, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum TaskCheckpointEventType { + StartedAlignment, + StartedCheckpointing, + FinishedOperatorSetup, + FinishedSync, + FinishedCommit, +} + +#[derive(Debug, Clone)] +pub struct TaskError { + pub job_id: String, + pub node_id: u32, + pub operator_id: String, + pub operator_subtask: u64, + pub error: String, + pub error_domain: ErrorDomain, + pub retry_hint: RetryHint, + pub details: String, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum ErrorDomain { + User, + Internal, + External, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum RetryHint { + NoRetry, + WithBackoff, +} diff --git a/src/sql/common/converter.rs b/src/sql/common/converter.rs new file mode 100644 index 00000000..ec4687f8 --- /dev/null +++ b/src/sql/common/converter.rs @@ -0,0 +1,95 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::sync::Arc; +use arrow::row::{OwnedRow, RowConverter, RowParser, Rows, SortField}; +use arrow_array::{Array, ArrayRef, BooleanArray}; +use arrow_schema::{ArrowError, DataType}; + +// need to handle the empty case as a row converter without sort fields emits empty Rows. +#[derive(Debug)] +pub enum Converter { + RowConverter(RowConverter), + Empty(RowConverter, Arc), +} + +impl Converter { + pub fn new(sort_fields: Vec) -> Result { + if sort_fields.is_empty() { + let array = Arc::new(BooleanArray::from(vec![false])); + Ok(Self::Empty( + RowConverter::new(vec![SortField::new(DataType::Boolean)])?, + array, + )) + } else { + Ok(Self::RowConverter(RowConverter::new(sort_fields)?)) + } + } + + pub fn convert_columns(&self, columns: &[Arc]) -> Result { + match self { + Converter::RowConverter(row_converter) => { + Ok(row_converter.convert_columns(columns)?.row(0).owned()) + } + Converter::Empty(row_converter, array) => Ok(row_converter + .convert_columns(std::slice::from_ref(array))? + .row(0) + .owned()), + } + } + + pub fn convert_all_columns( + &self, + columns: &[Arc], + num_rows: usize, + ) -> Result { + match self { + Converter::RowConverter(row_converter) => Ok(row_converter.convert_columns(columns)?), + Converter::Empty(row_converter, _array) => { + let array = Arc::new(BooleanArray::from(vec![false; num_rows])); + Ok(row_converter.convert_columns(&[array])?) + } + } + } + + pub fn convert_rows( + &self, + rows: Vec>, + ) -> Result, ArrowError> { + match self { + Converter::RowConverter(row_converter) => Ok(row_converter.convert_rows(rows)?), + Converter::Empty(_row_converter, _array) => Ok(vec![]), + } + } + + pub fn convert_raw_rows(&self, row_bytes: Vec<&[u8]>) -> Result, ArrowError> { + match self { + Converter::RowConverter(row_converter) => { + let parser = row_converter.parser(); + let mut row_list = vec![]; + for bytes in row_bytes { + let row = parser.parse(bytes); + row_list.push(row); + } + Ok(row_converter.convert_rows(row_list)?) 
+ } + Converter::Empty(_row_converter, _array) => Ok(vec![]), + } + } + + pub fn parser(&self) -> Option { + match self { + Converter::RowConverter(r) => Some(r.parser()), + Converter::Empty(_, _) => None, + } + } +} \ No newline at end of file diff --git a/src/sql/common/date.rs b/src/sql/common/date.rs new file mode 100644 index 00000000..ec310326 --- /dev/null +++ b/src/sql/common/date.rs @@ -0,0 +1,86 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use serde::Serialize; +use std::convert::TryFrom; + +use super::constants::{date_part_keyword, date_trunc_keyword}; + +#[derive(Debug, Clone, Copy, Eq, PartialEq, PartialOrd, Hash, Serialize)] +pub enum DatePart { + Year, + Month, + Week, + Day, + Hour, + Minute, + Second, + Millisecond, + Microsecond, + Nanosecond, + DayOfWeek, + DayOfYear, +} + +impl TryFrom<&str> for DatePart { + type Error = String; + + fn try_from(value: &str) -> Result { + let v = value.to_lowercase(); + match v.as_str() { + date_part_keyword::YEAR => Ok(DatePart::Year), + date_part_keyword::MONTH => Ok(DatePart::Month), + date_part_keyword::WEEK => Ok(DatePart::Week), + date_part_keyword::DAY => Ok(DatePart::Day), + date_part_keyword::HOUR => Ok(DatePart::Hour), + date_part_keyword::MINUTE => Ok(DatePart::Minute), + date_part_keyword::SECOND => Ok(DatePart::Second), + date_part_keyword::MILLISECOND => Ok(DatePart::Millisecond), + date_part_keyword::MICROSECOND => Ok(DatePart::Microsecond), + date_part_keyword::NANOSECOND => 
Ok(DatePart::Nanosecond), + date_part_keyword::DOW => Ok(DatePart::DayOfWeek), + date_part_keyword::DOY => Ok(DatePart::DayOfYear), + _ => Err(format!("'{value}' is not a valid DatePart")), + } + } +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash, PartialOrd, Serialize)] +pub enum DateTruncPrecision { + Year, + Quarter, + Month, + Week, + Day, + Hour, + Minute, + Second, +} + +impl TryFrom<&str> for DateTruncPrecision { + type Error = String; + + fn try_from(value: &str) -> Result { + let v = value.to_lowercase(); + match v.as_str() { + date_trunc_keyword::YEAR => Ok(DateTruncPrecision::Year), + date_trunc_keyword::QUARTER => Ok(DateTruncPrecision::Quarter), + date_trunc_keyword::MONTH => Ok(DateTruncPrecision::Month), + date_trunc_keyword::WEEK => Ok(DateTruncPrecision::Week), + date_trunc_keyword::DAY => Ok(DateTruncPrecision::Day), + date_trunc_keyword::HOUR => Ok(DateTruncPrecision::Hour), + date_trunc_keyword::MINUTE => Ok(DateTruncPrecision::Minute), + date_trunc_keyword::SECOND => Ok(DateTruncPrecision::Second), + _ => Err(format!("'{value}' is not a valid DateTruncPrecision")), + } + } +} diff --git a/src/sql/common/debezium.rs b/src/sql/common/debezium.rs new file mode 100644 index 00000000..9dbc401f --- /dev/null +++ b/src/sql/common/debezium.rs @@ -0,0 +1,148 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use bincode::{Decode, Encode}; +use serde::{Deserialize, Serialize}; +use std::convert::TryFrom; +use std::fmt::Debug; + +pub trait Key: + Debug + Clone + Encode + Decode<()> + std::hash::Hash + PartialEq + Eq + Send + 'static +{ +} +impl + std::hash::Hash + PartialEq + Eq + Send + 'static> Key + for T +{ +} + +pub trait Data: Debug + Clone + Encode + Decode<()> + Send + 'static {} +impl + Send + 'static> Data for T {} + +#[derive(Debug, Clone, PartialEq, Encode, Decode, Serialize, Deserialize)] +pub enum UpdatingData { + Retract(T), + Update { old: T, new: T }, + Append(T), +} + +impl UpdatingData { + pub fn lower(&self) -> T { + match self { + UpdatingData::Retract(_) => panic!("cannot lower retractions"), + UpdatingData::Update { new, .. } => new.clone(), + UpdatingData::Append(t) => t.clone(), + } + } + + pub fn unwrap_append(&self) -> &T { + match self { + UpdatingData::Append(t) => t, + _ => panic!("UpdatingData is not an append"), + } + } +} + +#[derive(Clone, Encode, Decode, Debug, Serialize, Deserialize, PartialEq)] +#[serde(try_from = "DebeziumShadow")] +pub struct Debezium { + pub before: Option, + pub after: Option, + pub op: DebeziumOp, +} + +#[derive(Clone, Encode, Decode, Debug, Serialize, Deserialize, PartialEq)] +struct DebeziumShadow { + before: Option, + after: Option, + op: DebeziumOp, +} + +impl TryFrom> for Debezium { + type Error = &'static str; + + fn try_from(value: DebeziumShadow) -> Result { + match (value.op, &value.before, &value.after) { + (DebeziumOp::Create, _, None) => { + Err("`after` must be set for Debezium create messages") + } + (DebeziumOp::Update, None, _) => { + Err("`before` must be set for Debezium update messages") + } + (DebeziumOp::Update, _, None) => { + Err("`after` must be set for Debezium update messages") + } + (DebeziumOp::Delete, None, _) => { + Err("`before` must be set for Debezium delete messages") + } + _ => Ok(Debezium { + before: value.before, + after: value.after, + op: value.op, + }), + } + } +} + 
+#[derive(Copy, Clone, Encode, Decode, Debug, PartialEq)] +pub enum DebeziumOp { + Create, + Update, + Delete, +} + +#[allow(clippy::to_string_trait_impl)] +impl ToString for DebeziumOp { + fn to_string(&self) -> String { + match self { + DebeziumOp::Create => "c", + DebeziumOp::Update => "u", + DebeziumOp::Delete => "d", + } + .to_string() + } +} + +impl<'de> Deserialize<'de> for DebeziumOp { + fn deserialize(deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + let s = String::deserialize(deserializer)?; + match s.as_str() { + "c" | "r" => Ok(DebeziumOp::Create), + "u" => Ok(DebeziumOp::Update), + "d" => Ok(DebeziumOp::Delete), + _ => Err(serde::de::Error::custom(format!("Invalid DebeziumOp {s}"))), + } + } +} + +impl Serialize for DebeziumOp { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + match self { + DebeziumOp::Create => serializer.serialize_str("c"), + DebeziumOp::Update => serializer.serialize_str("u"), + DebeziumOp::Delete => serializer.serialize_str("d"), + } + } +} + +#[derive(Copy, Clone, Encode, Decode, Debug, PartialEq, Serialize, Deserialize)] +pub enum JoinType { + Inner, + Left, + Right, + Full, +} diff --git a/src/sql/common/errors.rs b/src/sql/common/errors.rs new file mode 100644 index 00000000..fa4a722e --- /dev/null +++ b/src/sql/common/errors.rs @@ -0,0 +1,92 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::fmt; + +/// Result type for streaming operators and collectors. +pub type DataflowResult = std::result::Result; + +/// Unified error type for streaming dataflow operations. +#[derive(Debug)] +pub enum DataflowError { + Arrow(arrow_schema::ArrowError), + DataFusion(datafusion::error::DataFusionError), + Operator(String), + State(String), + Connector(String), + Internal(String), +} + +impl fmt::Display for DataflowError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + DataflowError::Arrow(e) => write!(f, "Arrow error: {e}"), + DataflowError::DataFusion(e) => write!(f, "DataFusion error: {e}"), + DataflowError::Operator(msg) => write!(f, "Operator error: {msg}"), + DataflowError::State(msg) => write!(f, "State error: {msg}"), + DataflowError::Connector(msg) => write!(f, "Connector error: {msg}"), + DataflowError::Internal(msg) => write!(f, "Internal error: {msg}"), + } + } +} + +impl std::error::Error for DataflowError {} + +impl DataflowError { + pub fn with_operator(self, operator_id: impl Into) -> Self { + let id = operator_id.into(); + match self { + DataflowError::Operator(m) => DataflowError::Operator(format!("{id}: {m}")), + other => DataflowError::Operator(format!("{id}: {other}")), + } + } +} + +impl From for DataflowError { + fn from(e: arrow_schema::ArrowError) -> Self { + DataflowError::Arrow(e) + } +} + +impl From for DataflowError { + fn from(e: datafusion::error::DataFusionError) -> Self { + DataflowError::DataFusion(e) + } +} + +/// Macro for creating connector errors. +#[macro_export] +macro_rules! connector_err { + ($($arg:tt)*) => { + $crate::sql::common::errors::DataflowError::Connector(format!($($arg)*)) + }; +} + +/// State-related errors. 
+#[derive(Debug)] +pub enum StateError { + KeyNotFound(String), + SerializationError(String), + BackendError(String), +} + +impl fmt::Display for StateError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + StateError::KeyNotFound(key) => write!(f, "Key not found: {key}"), + StateError::SerializationError(msg) => write!(f, "Serialization error: {msg}"), + StateError::BackendError(msg) => write!(f, "State backend error: {msg}"), + } + } +} + +impl std::error::Error for StateError {} diff --git a/src/sql/common/format_from_opts.rs b/src/sql/common/format_from_opts.rs new file mode 100644 index 00000000..34b6a586 --- /dev/null +++ b/src/sql/common/format_from_opts.rs @@ -0,0 +1,184 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Parse `WITH` clause format / framing / bad-data options (Arroyo-compatible keys). 
+ +use std::str::FromStr; + +use datafusion::common::{Result as DFResult, plan_datafusion_err, plan_err}; + +use super::connector_options::ConnectorOptions; +use super::constants::{bad_data_value, connection_format_value, framing_method_value}; +use super::with_option_keys as opt; +use super::formats::{ + AvroFormat, BadData, DecimalEncoding, Format, Framing, JsonCompression, JsonFormat, + NewlineDelimitedFraming, ParquetCompression, ParquetFormat, ProtobufFormat, RawBytesFormat, + RawStringFormat, TimestampFormat, +}; + +impl JsonFormat { + pub fn from_opts(opts: &mut ConnectorOptions) -> DFResult { + let mut j = JsonFormat::default(); + if let Some(v) = opts.pull_opt_bool(opt::JSON_CONFLUENT_SCHEMA_REGISTRY)? { + j.confluent_schema_registry = v; + } + if let Some(v) = opts.pull_opt_u64(opt::JSON_CONFLUENT_SCHEMA_VERSION)? { + j.schema_id = Some(v as u32); + } + if let Some(v) = opts.pull_opt_bool(opt::JSON_INCLUDE_SCHEMA)? { + j.include_schema = v; + } + if let Some(v) = opts.pull_opt_bool(opt::JSON_DEBEZIUM)? { + j.debezium = v; + } + if let Some(v) = opts.pull_opt_bool(opt::JSON_UNSTRUCTURED)? { + j.unstructured = v; + } + if let Some(s) = opts.pull_opt_str(opt::JSON_TIMESTAMP_FORMAT)? { + j.timestamp_format = TimestampFormat::try_from(s.as_str()).map_err(|_| { + plan_datafusion_err!("invalid json.timestamp_format '{}'", s) + })?; + } + if let Some(s) = opts.pull_opt_str(opt::JSON_DECIMAL_ENCODING)? { + j.decimal_encoding = DecimalEncoding::try_from(s.as_str()).map_err(|_| { + plan_datafusion_err!("invalid json.decimal_encoding '{s}'") + })?; + } + if let Some(s) = opts.pull_opt_str(opt::JSON_COMPRESSION)? { + j.compression = JsonCompression::from_str(&s) + .map_err(|e| plan_datafusion_err!("invalid json.compression: {e}"))?; + } + Ok(j) + } +} + +impl Format { + pub fn from_opts(opts: &mut ConnectorOptions) -> DFResult> { + let Some(name) = opts.pull_opt_str(opt::FORMAT)? 
else { + return Ok(None); + }; + let n = name.to_lowercase(); + match n.as_str() { + connection_format_value::JSON => Ok(Some(Format::Json(JsonFormat::from_opts(opts)?))), + connection_format_value::DEBEZIUM_JSON => { + let mut j = JsonFormat::from_opts(opts)?; + j.debezium = true; + Ok(Some(Format::Json(j))) + } + connection_format_value::AVRO => Ok(Some(Format::Avro(AvroFormat::from_opts(opts)?))), + connection_format_value::PARQUET => { + Ok(Some(Format::Parquet(ParquetFormat::from_opts(opts)?))) + } + connection_format_value::PROTOBUF => { + Ok(Some(Format::Protobuf(ProtobufFormat::from_opts(opts)?))) + } + connection_format_value::RAW_STRING => { + Ok(Some(Format::RawString(RawStringFormat {}))) + } + connection_format_value::RAW_BYTES => Ok(Some(Format::RawBytes(RawBytesFormat {}))), + _ => plan_err!("unknown format '{name}'"), + } + } +} + +impl AvroFormat { + fn from_opts(opts: &mut ConnectorOptions) -> DFResult { + let mut a = AvroFormat { + confluent_schema_registry: false, + raw_datums: false, + into_unstructured_json: false, + schema_id: None, + }; + if let Some(v) = opts.pull_opt_bool(opt::AVRO_CONFLUENT_SCHEMA_REGISTRY)? { + a.confluent_schema_registry = v; + } + if let Some(v) = opts.pull_opt_bool(opt::AVRO_RAW_DATUMS)? { + a.raw_datums = v; + } + if let Some(v) = opts.pull_opt_bool(opt::AVRO_INTO_UNSTRUCTURED_JSON)? { + a.into_unstructured_json = v; + } + if let Some(v) = opts.pull_opt_u64(opt::AVRO_SCHEMA_ID)? { + a.schema_id = Some(v as u32); + } + Ok(a) + } +} + +impl ParquetFormat { + fn from_opts(opts: &mut ConnectorOptions) -> DFResult { + let mut p = ParquetFormat::default(); + if let Some(s) = opts.pull_opt_str(opt::PARQUET_COMPRESSION)? { + p.compression = ParquetCompression::from_str(&s) + .map_err(|e| plan_datafusion_err!("invalid parquet.compression: {e}"))?; + } + if let Some(v) = opts.pull_opt_u64(opt::PARQUET_ROW_GROUP_BYTES)? 
{ + p.row_group_bytes = Some(v); + } + Ok(p) + } +} + +impl ProtobufFormat { + fn from_opts(opts: &mut ConnectorOptions) -> DFResult { + let mut p = ProtobufFormat { + into_unstructured_json: false, + message_name: None, + compiled_schema: None, + confluent_schema_registry: false, + length_delimited: false, + }; + if let Some(v) = opts.pull_opt_bool(opt::PROTOBUF_INTO_UNSTRUCTURED_JSON)? { + p.into_unstructured_json = v; + } + if let Some(s) = opts.pull_opt_str(opt::PROTOBUF_MESSAGE_NAME)? { + p.message_name = Some(s); + } + if let Some(v) = opts.pull_opt_bool(opt::PROTOBUF_CONFLUENT_SCHEMA_REGISTRY)? { + p.confluent_schema_registry = v; + } + if let Some(v) = opts.pull_opt_bool(opt::PROTOBUF_LENGTH_DELIMITED)? { + p.length_delimited = v; + } + Ok(p) + } +} + +impl Framing { + pub fn from_opts(opts: &mut ConnectorOptions) -> DFResult> { + let method = opts.pull_opt_str(opt::FRAMING_METHOD)?; + match method.as_deref() { + None => Ok(None), + Some(framing_method_value::NEWLINE) | Some(framing_method_value::NEWLINE_DELIMITED) => { + let max = opts.pull_opt_u64(opt::FRAMING_MAX_LINE_LENGTH)?; + Ok(Some(Framing::Newline(NewlineDelimitedFraming { + max_line_length: max, + }))) + } + Some(other) => plan_err!("unknown framing.method '{other}'"), + } + } +} + +impl BadData { + pub fn from_opts(opts: &mut ConnectorOptions) -> DFResult { + let Some(s) = opts.pull_opt_str(opt::BAD_DATA)? else { + return Ok(BadData::Fail {}); + }; + let v = s.to_lowercase(); + match v.as_str() { + bad_data_value::FAIL => Ok(BadData::Fail {}), + bad_data_value::DROP => Ok(BadData::Drop {}), + _ => plan_err!("invalid bad_data '{s}'"), + } + } +} diff --git a/src/sql/common/formats.rs b/src/sql/common/formats.rs new file mode 100644 index 00000000..b2885797 --- /dev/null +++ b/src/sql/common/formats.rs @@ -0,0 +1,256 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use serde::{Deserialize, Serialize}; +use std::convert::TryFrom; +use std::fmt::{Display, Formatter}; +use std::str::FromStr; + +use super::constants::{ + connection_format_value, decimal_encoding_value, json_compression_value, + parquet_compression_value, timestamp_format_value, +}; + +#[derive(Copy, Clone, Debug, Serialize, Deserialize, PartialEq, Eq, Default, Hash, PartialOrd)] +#[serde(rename_all = "snake_case")] +pub enum TimestampFormat { + #[default] + #[serde(rename = "rfc3339")] + RFC3339, + UnixMillis, +} + +impl TryFrom<&str> for TimestampFormat { + type Error = (); + + fn try_from(value: &str) -> Result { + match value { + timestamp_format_value::RFC3339_UPPER | timestamp_format_value::RFC3339_SNAKE => { + Ok(TimestampFormat::RFC3339) + } + timestamp_format_value::UNIX_MILLIS_PASCAL | timestamp_format_value::UNIX_MILLIS_SNAKE => { + Ok(TimestampFormat::UnixMillis) + } + _ => Err(()), + } + } +} + +#[derive(Copy, Clone, Debug, Serialize, Deserialize, PartialEq, Eq, Default, Hash, PartialOrd)] +#[serde(rename_all = "snake_case")] +pub enum DecimalEncoding { + #[default] + Number, + String, + Bytes, +} + +impl TryFrom<&str> for DecimalEncoding { + type Error = (); + + fn try_from(s: &str) -> Result { + match s { + decimal_encoding_value::NUMBER => Ok(Self::Number), + decimal_encoding_value::STRING => Ok(Self::String), + decimal_encoding_value::BYTES => Ok(Self::Bytes), + _ => Err(()), + } + } +} + +#[derive(Serialize, Deserialize, Default, Copy, Clone, Debug, PartialEq, Eq, Hash, PartialOrd)] +#[serde(rename_all = "snake_case")] +pub 
enum JsonCompression { + #[default] + Uncompressed, + Gzip, +} + +impl FromStr for JsonCompression { + type Err = String; + + fn from_str(s: &str) -> Result { + match s { + json_compression_value::UNCOMPRESSED => Ok(JsonCompression::Uncompressed), + json_compression_value::GZIP => Ok(JsonCompression::Gzip), + _ => Err(format!("invalid json compression '{s}'")), + } + } +} + +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq, Default, Hash, PartialOrd)] +#[serde(rename_all = "snake_case")] +pub struct JsonFormat { + #[serde(default)] + pub confluent_schema_registry: bool, + #[serde(default, alias = "confluent_schema_version")] + pub schema_id: Option, + #[serde(default)] + pub include_schema: bool, + #[serde(default)] + pub debezium: bool, + #[serde(default)] + pub unstructured: bool, + #[serde(default)] + pub timestamp_format: TimestampFormat, + #[serde(default)] + pub decimal_encoding: DecimalEncoding, + #[serde(default)] + pub compression: JsonCompression, +} + +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq, Hash, PartialOrd)] +#[serde(rename_all = "snake_case")] +pub struct RawStringFormat {} + +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq, Hash, PartialOrd)] +#[serde(rename_all = "snake_case")] +pub struct RawBytesFormat {} + +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq, Hash, PartialOrd)] +#[serde(rename_all = "snake_case")] +pub struct AvroFormat { + #[serde(default)] + pub confluent_schema_registry: bool, + #[serde(default)] + pub raw_datums: bool, + #[serde(default)] + pub into_unstructured_json: bool, + #[serde(default)] + pub schema_id: Option, +} + +impl AvroFormat { + pub fn new( + confluent_schema_registry: bool, + raw_datums: bool, + into_unstructured_json: bool, + ) -> Self { + Self { + confluent_schema_registry, + raw_datums, + into_unstructured_json, + schema_id: None, + } + } +} + +#[derive(Serialize, Deserialize, Copy, Clone, Debug, PartialEq, Eq, Hash, PartialOrd, Default)] 
+#[serde(rename_all = "snake_case")] +pub enum ParquetCompression { + Uncompressed, + Snappy, + Gzip, + #[default] + Zstd, + Lz4, + Lz4Raw, +} + +impl FromStr for ParquetCompression { + type Err = String; + + fn from_str(s: &str) -> Result { + match s { + parquet_compression_value::UNCOMPRESSED => Ok(ParquetCompression::Uncompressed), + parquet_compression_value::SNAPPY => Ok(ParquetCompression::Snappy), + parquet_compression_value::GZIP => Ok(ParquetCompression::Gzip), + parquet_compression_value::ZSTD => Ok(ParquetCompression::Zstd), + parquet_compression_value::LZ4 => Ok(ParquetCompression::Lz4), + parquet_compression_value::LZ4_RAW => Ok(ParquetCompression::Lz4Raw), + _ => Err(format!("invalid parquet compression '{s}'")), + } + } +} + +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq, Hash, PartialOrd, Default)] +#[serde(rename_all = "snake_case")] +pub struct ParquetFormat { + #[serde(default)] + pub compression: ParquetCompression, + #[serde(default)] + pub row_group_bytes: Option, +} + +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq, Hash, PartialOrd)] +#[serde(rename_all = "snake_case")] +pub struct ProtobufFormat { + #[serde(default)] + pub into_unstructured_json: bool, + #[serde(default)] + pub message_name: Option, + #[serde(default)] + pub compiled_schema: Option>, + #[serde(default)] + pub confluent_schema_registry: bool, + #[serde(default)] + pub length_delimited: bool, +} + +#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq, Hash, PartialOrd)] +#[serde(rename_all = "snake_case", tag = "type")] +pub enum Format { + Json(JsonFormat), + Avro(AvroFormat), + Protobuf(ProtobufFormat), + Parquet(ParquetFormat), + RawString(RawStringFormat), + RawBytes(RawBytesFormat), +} + +impl Display for Format { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.write_str(self.name()) + } +} + +impl Format { + pub fn name(&self) -> &'static str { + match self { + Format::Json(_) => connection_format_value::JSON, + 
Format::Avro(_) => connection_format_value::AVRO, + Format::Protobuf(_) => connection_format_value::PROTOBUF, + Format::Parquet(_) => connection_format_value::PARQUET, + Format::RawString(_) => connection_format_value::RAW_STRING, + Format::RawBytes(_) => connection_format_value::RAW_BYTES, + } + } + + pub fn is_updating(&self) -> bool { + matches!(self, Format::Json(JsonFormat { debezium: true, .. })) + } +} + +#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq, Hash, PartialOrd)] +#[serde(rename_all = "snake_case", tag = "behavior")] +pub enum BadData { + Fail {}, + Drop {}, +} + +impl Default for BadData { + fn default() -> Self { + BadData::Fail {} + } +} + +#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq, Hash, PartialOrd)] +#[serde(rename_all = "snake_case", tag = "method")] +pub enum Framing { + Newline(NewlineDelimitedFraming), +} + +#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq, Hash, PartialOrd)] +#[serde(rename_all = "snake_case")] +pub struct NewlineDelimitedFraming { + pub max_line_length: Option, +} diff --git a/src/sql/common/fs_schema.rs b/src/sql/common/fs_schema.rs new file mode 100644 index 00000000..eb92d4ac --- /dev/null +++ b/src/sql/common/fs_schema.rs @@ -0,0 +1,474 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! FunctionStream table/stream schema: Arrow [`Schema`] plus timestamp index and optional key columns. +//! +//! 
[`Schema`]: datafusion::arrow::datatypes::Schema + +use datafusion::arrow::array::builder::{ArrayBuilder, make_builder}; +use datafusion::arrow::array::{RecordBatch, TimestampNanosecondArray}; +use datafusion::arrow::datatypes::{DataType, Field, FieldRef, Schema, SchemaBuilder, TimeUnit}; +use datafusion::arrow::error::ArrowError; +use datafusion::common::{DataFusionError, Result as DFResult}; +use serde::{Deserialize, Serialize}; +use std::sync::Arc; +use std::time::SystemTime; +use arrow::compute::{filter_record_batch, lexsort_to_indices, partition, take, SortColumn}; +use arrow::compute::kernels::cmp::gt_eq; +use arrow::compute::kernels::numeric::div; +use arrow::row::SortField; +use arrow_array::{PrimitiveArray, UInt64Array}; +use arrow_array::types::UInt64Type; +use protocol::grpc::api; +use super::{to_nanos, TIMESTAMP_FIELD}; +use std::ops::Range; +use crate::sql::common::converter::Converter; + +#[derive(Debug, Copy, Clone)] +pub enum FieldValueType<'a> { + Int64(Option), + UInt64(Option), + Int32(Option), + String(Option<&'a str>), + Bytes(Option<&'a [u8]>), +} + +pub type FsSchemaRef = Arc; + +#[derive(Debug, Clone, Eq, PartialEq, Hash, Serialize, Deserialize)] +pub struct FsSchema { + pub schema: Arc, + pub timestamp_index: usize, + key_indices: Option>, + /// If defined, these indices are used for routing (i.e., which subtask gets which piece of data) + routing_key_indices: Option>, +} + +impl TryFrom for FsSchema { + type Error = DataFusionError; + fn try_from(schema_proto: api::FsSchema) -> Result { + let schema: Schema = serde_json::from_str(&schema_proto.arrow_schema) + .map_err(|e| DataFusionError::Plan(format!("Invalid arrow schema: {e}")))?; + let timestamp_index = schema_proto.timestamp_index as usize; + + let key_indices = schema_proto.has_keys.then(|| { + schema_proto + .key_indices + .into_iter() + .map(|index| index as usize) + .collect() + }); + + let routing_key_indices = schema_proto.has_routing_keys.then(|| { + schema_proto + 
.routing_key_indices + .into_iter() + .map(|index| index as usize) + .collect() + }); + + Ok(Self { + schema: Arc::new(schema), + timestamp_index, + key_indices, + routing_key_indices, + }) + } +} + +impl From for api::FsSchema { + fn from(schema: FsSchema) -> Self { + let arrow_schema = serde_json::to_string(schema.schema.as_ref()).unwrap(); + let timestamp_index = schema.timestamp_index as u32; + + let has_keys = schema.key_indices.is_some(); + let key_indices = schema + .key_indices + .map(|ks| ks.into_iter().map(|index| index as u32).collect()) + .unwrap_or_default(); + + let has_routing_keys = schema.routing_key_indices.is_some(); + let routing_key_indices = schema + .routing_key_indices + .map(|ks| ks.into_iter().map(|index| index as u32).collect()) + .unwrap_or_default(); + + Self { + arrow_schema, + timestamp_index, + key_indices, + has_keys, + routing_key_indices, + has_routing_keys, + } + } +} + +impl FsSchema { + pub fn new( + schema: Arc, + timestamp_index: usize, + key_indices: Option>, + routing_key_indices: Option>, + ) -> Self { + Self { + schema, + timestamp_index, + key_indices, + routing_key_indices, + } + } + pub fn new_unkeyed(schema: Arc, timestamp_index: usize) -> Self { + Self { + schema, + timestamp_index, + key_indices: None, + routing_key_indices: None, + } + } + pub fn new_keyed(schema: Arc, timestamp_index: usize, key_indices: Vec) -> Self { + Self { + schema, + timestamp_index, + key_indices: Some(key_indices), + routing_key_indices: None, + } + } + + pub fn from_fields(mut fields: Vec) -> Self { + if !fields.iter().any(|f| f.name() == TIMESTAMP_FIELD) { + fields.push(Field::new( + TIMESTAMP_FIELD, + DataType::Timestamp(TimeUnit::Nanosecond, None), + false, + )); + } + + Self::from_schema_keys(Arc::new(Schema::new(fields)), vec![]).unwrap() + } + + pub fn from_schema_unkeyed(schema: Arc) -> DFResult { + let timestamp_index = schema + .column_with_name(TIMESTAMP_FIELD) + .ok_or_else(|| { + DataFusionError::Plan(format!( + "no 
{TIMESTAMP_FIELD} field in schema, schema is {schema:?}" + )) + })? + .0; + + Ok(Self { + schema, + timestamp_index, + key_indices: None, + routing_key_indices: None, + }) + } + + pub fn from_schema_keys(schema: Arc, key_indices: Vec) -> DFResult { + let timestamp_index = schema + .column_with_name(TIMESTAMP_FIELD) + .ok_or_else(|| { + DataFusionError::Plan(format!( + "no {TIMESTAMP_FIELD} field in schema, schema is {schema:?}" + )) + })? + .0; + + Ok(Self { + schema, + timestamp_index, + key_indices: Some(key_indices), + routing_key_indices: None, + }) + } + + pub fn schema_without_timestamp(&self) -> Schema { + let mut builder = SchemaBuilder::from(self.schema.fields()); + builder.remove(self.timestamp_index); + builder.finish() + } + + pub fn remove_timestamp_column(&self, batch: &mut RecordBatch) { + batch.remove_column(self.timestamp_index); + } + + pub fn builders(&self) -> Vec> { + self.schema + .fields + .iter() + .map(|f| make_builder(f.data_type(), 8)) + .collect() + } + + pub fn timestamp_column<'a>(&self, batch: &'a RecordBatch) -> &'a TimestampNanosecondArray { + batch + .column(self.timestamp_index) + .as_any() + .downcast_ref::() + .unwrap() + } + + pub fn has_routing_keys(&self) -> bool { + self.routing_keys().map(|k| !k.is_empty()).unwrap_or(false) + } + + pub fn routing_keys(&self) -> Option<&Vec> { + self.routing_key_indices + .as_ref() + .or(self.key_indices.as_ref()) + } + + pub fn storage_keys(&self) -> Option<&Vec> { + self.key_indices.as_ref() + } + + pub fn clone_storage_key_indices(&self) -> Option> { + self.key_indices.clone() + } + + pub fn clone_routing_key_indices(&self) -> Option> { + self.routing_key_indices.clone() + } + + pub fn filter_by_time( + &self, + batch: RecordBatch, + cutoff: Option, + ) -> Result { + let Some(cutoff) = cutoff else { + // no watermark, so we just return the same batch. 
+ return Ok(batch); + }; + // filter out late data + let timestamp_column = batch + .column(self.timestamp_index) + .as_any() + .downcast_ref::() + .ok_or_else(|| ArrowError::CastError( + format!("failed to downcast column {} of {:?} to timestamp. Schema is supposed to be {:?}", + self.timestamp_index, batch, self.schema)))?; + let cutoff_scalar = TimestampNanosecondArray::new_scalar(to_nanos(cutoff) as i64); + let on_time = gt_eq(timestamp_column, &cutoff_scalar)?; + filter_record_batch(&batch, &on_time) + } + + pub fn sort_columns(&self, batch: &RecordBatch, with_timestamp: bool) -> Vec { + let mut columns = vec![]; + if let Some(keys) = &self.key_indices { + columns.extend(keys.iter().map(|index| SortColumn { + values: batch.column(*index).clone(), + options: None, + })); + } + if with_timestamp { + columns.push(SortColumn { + values: batch.column(self.timestamp_index).clone(), + options: None, + }); + } + columns + } + + pub fn sort_fields(&self, with_timestamp: bool) -> Vec { + let mut sort_fields = vec![]; + if let Some(keys) = &self.key_indices { + sort_fields.extend(keys.iter()); + } + if with_timestamp { + sort_fields.push(self.timestamp_index); + } + self.sort_fields_by_indices(&sort_fields) + } + + fn sort_fields_by_indices(&self, indices: &[usize]) -> Vec { + indices + .iter() + .map(|index| SortField::new(self.schema.field(*index).data_type().clone())) + .collect() + } + + pub fn converter(&self, with_timestamp: bool) -> Result { + Converter::new(self.sort_fields(with_timestamp)) + } + + pub fn value_converter( + &self, + with_timestamp: bool, + generation_index: usize, + ) -> Result { + match &self.key_indices { + None => { + let mut indices = (0..self.schema.fields().len()).collect::>(); + indices.remove(generation_index); + if !with_timestamp { + indices.remove(self.timestamp_index); + } + Converter::new(self.sort_fields_by_indices(&indices)) + } + Some(keys) => { + let indices = (0..self.schema.fields().len()) + .filter(|index| { + 
!keys.contains(index) + && (with_timestamp || *index != self.timestamp_index) + && *index != generation_index + }) + .collect::>(); + Converter::new(self.sort_fields_by_indices(&indices)) + } + } + } + + pub fn value_indices(&self, with_timestamp: bool) -> Vec { + let field_count = self.schema.fields().len(); + match &self.key_indices { + None => { + let mut indices = (0..field_count).collect::>(); + + if !with_timestamp { + indices.remove(self.timestamp_index); + } + indices + } + Some(keys) => (0..field_count) + .filter(|index| { + !keys.contains(index) && (with_timestamp || *index != self.timestamp_index) + }) + .collect::>(), + } + } + + pub fn sort( + &self, + batch: RecordBatch, + with_timestamp: bool, + ) -> Result { + if self.key_indices.is_none() && !with_timestamp { + return Ok(batch); + } + let sort_columns = self.sort_columns(&batch, with_timestamp); + let sort_indices = lexsort_to_indices(&sort_columns, None).expect("should be able to sort"); + let columns = batch + .columns() + .iter() + .map(|c| take(c, &sort_indices, None).unwrap()) + .collect(); + + RecordBatch::try_new(batch.schema(), columns) + } + + pub fn partition( + &self, + batch: &RecordBatch, + with_timestamp: bool, + ) -> Result>, ArrowError> { + if self.key_indices.is_none() && !with_timestamp { + #[allow(clippy::single_range_in_vec_init)] + return Ok(vec![0..batch.num_rows()]); + } + + let mut partition_columns = vec![]; + + if let Some(keys) = &self.routing_keys() { + partition_columns.extend(keys.iter().map(|index| batch.column(*index).clone())); + } + if with_timestamp { + partition_columns.push(batch.column(self.timestamp_index).clone()); + } + + Ok(partition(&partition_columns)?.ranges()) + } + + pub fn unkeyed_batch(&self, batch: &RecordBatch) -> Result { + if self.key_indices.is_none() { + return Ok(batch.clone()); + } + let columns: Vec<_> = (0..batch.num_columns()) + .filter(|index| !self.key_indices.as_ref().unwrap().contains(index)) + .collect(); + batch.project(&columns) + } 
+ + pub fn schema_without_keys(&self) -> Result { + if self.key_indices.is_none() { + return Ok(self.clone()); + } + let key_indices = self.key_indices.as_ref().unwrap(); + let unkeyed_schema = Schema::new( + self.schema + .fields() + .iter() + .enumerate() + .filter(|(index, _field)| !key_indices.contains(index)) + .map(|(_, field)| field.as_ref().clone()) + .collect::>(), + ); + let timestamp_index = unkeyed_schema.index_of(TIMESTAMP_FIELD)?; + Ok(Self { + schema: Arc::new(unkeyed_schema), + timestamp_index, + key_indices: None, + routing_key_indices: None, + }) + } + + pub fn with_fields(&self, fields: Vec) -> Result { + let schema = Arc::new(Schema::new_with_metadata( + fields, + self.schema.metadata.clone(), + )); + + let timestamp_index = schema.index_of(TIMESTAMP_FIELD)?; + let max_index = *[&self.key_indices, &self.routing_key_indices] + .iter() + .map(|indices| indices.as_ref().and_then(|k| k.iter().max())) + .max() + .flatten() + .unwrap_or(&0); + + if schema.fields.len() - 1 < max_index { + return Err(ArrowError::InvalidArgumentError(format!( + "expected at least {} fields, but were only {}", + max_index + 1, + schema.fields.len() + ))); + } + + Ok(Self { + schema, + timestamp_index, + key_indices: self.key_indices.clone(), + routing_key_indices: self.routing_key_indices.clone(), + }) + } + + pub fn with_additional_fields( + &self, + new_fields: impl Iterator, + ) -> Result { + let mut fields = self.schema.fields.to_vec(); + fields.extend(new_fields.map(Arc::new)); + + self.with_fields(fields) + } +} + +pub fn server_for_hash_array( + hash: &PrimitiveArray, + n: usize, +) -> Result, ArrowError> { + let range_size = u64::MAX / (n as u64) + 1; + let range_scalar = UInt64Array::new_scalar(range_size); + let division = div(hash, &range_scalar)?; + let result: &PrimitiveArray = division.as_any().downcast_ref().unwrap(); + Ok(result.clone()) +} diff --git a/src/sql/common/kafka_catalog.rs b/src/sql/common/kafka_catalog.rs new file mode 100644 index 
00000000..5d54b1b2 --- /dev/null +++ b/src/sql/common/kafka_catalog.rs @@ -0,0 +1,126 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! +//! + +use serde::{Deserialize, Serialize}; +use std::collections::HashMap; + + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct KafkaTable { + pub topic: String, + #[serde(flatten)] + pub kind: TableType, + #[serde(default)] + pub client_configs: HashMap, + pub value_subject: Option, +} + +impl KafkaTable { + pub fn subject(&self) -> String { + self.value_subject + .clone() + .unwrap_or_else(|| format!("{}-value", self.topic)) + } +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +#[serde(tag = "type", rename_all = "snake_case")] +pub enum TableType { + Source { + offset: KafkaTableSourceOffset, + read_mode: Option, + group_id: Option, + group_id_prefix: Option, + }, + Sink { + commit_mode: SinkCommitMode, + key_field: Option, + timestamp_field: Option, + }, +} + +#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default)] +#[serde(rename_all = "snake_case")] +pub enum KafkaTableSourceOffset { + Latest, + Earliest, + #[default] + Group, +} + +#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)] +#[serde(rename_all = "snake_case")] +pub enum ReadMode { + ReadUncommitted, + ReadCommitted, +} + +#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default)] +#[serde(rename_all = "snake_case")] +pub enum SinkCommitMode { + #[default] + 
AtLeastOnce, + ExactlyOnce, +} + + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +#[serde(rename_all = "camelCase")] +pub struct KafkaConfig { + pub bootstrap_servers: String, + #[serde(default)] + pub authentication: KafkaConfigAuthentication, + #[serde(default)] + pub schema_registry_enum: Option, + #[serde(default)] + pub connection_properties: HashMap, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +#[serde(tag = "type")] +pub enum KafkaConfigAuthentication { + #[serde(rename = "None")] + None, + #[serde(rename = "AWS_MSK_IAM")] + AwsMskIam { region: String }, + #[serde(rename = "SASL")] + Sasl { + protocol: String, + mechanism: String, + username: String, + password: String, + }, +} + +impl Default for KafkaConfigAuthentication { + fn default() -> Self { + Self::None + } +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +#[serde(tag = "type")] +pub enum SchemaRegistryConfig { + #[serde(rename = "None")] + None, + #[serde(rename = "Confluent Schema Registry")] + ConfluentSchemaRegistry { + endpoint: String, + #[serde(rename = "apiKey")] + api_key: Option, + #[serde(rename = "apiSecret")] + api_secret: Option, + }, +} diff --git a/src/sql/common/message.rs b/src/sql/common/message.rs new file mode 100644 index 00000000..4dcde95b --- /dev/null +++ b/src/sql/common/message.rs @@ -0,0 +1,54 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use bincode::{Decode, Encode}; +use datafusion::arrow::array::RecordBatch; +use serde::{Deserialize, Serialize}; +use std::time::SystemTime; + +#[derive(Debug, Copy, Clone, PartialEq, Eq, Encode, Decode, Serialize, Deserialize)] +pub enum Watermark { + EventTime(SystemTime), + Idle, +} + +#[derive(Debug, Clone, PartialEq)] +pub enum ArrowMessage { + Data(RecordBatch), + Signal(SignalMessage), +} + +impl ArrowMessage { + pub fn is_end(&self) -> bool { + matches!( + self, + ArrowMessage::Signal(SignalMessage::Stop) + | ArrowMessage::Signal(SignalMessage::EndOfData) + ) + } +} + +#[derive(Debug, Clone, PartialEq, Encode, Decode)] +pub enum SignalMessage { + Barrier(CheckpointBarrier), + Watermark(Watermark), + Stop, + EndOfData, +} + +#[derive(Debug, Copy, Clone, PartialEq, Eq, Encode, Decode, Serialize, Deserialize)] +pub struct CheckpointBarrier { + pub epoch: u32, + pub min_epoch: u32, + pub timestamp: SystemTime, + pub then_stop: bool, +} diff --git a/src/sql/common/mod.rs b/src/sql/common/mod.rs new file mode 100644 index 00000000..e042aea6 --- /dev/null +++ b/src/sql/common/mod.rs @@ -0,0 +1,66 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Shared core types and constants for FunctionStream (`crate::sql::common`). +//! +//! Used by the runtime, SQL planner, coordinator, and other subsystems — +//! analogous to `arroyo-types` + `arroyo-rpc` in Arroyo. 
+ +pub mod arrow_ext; +pub mod connector_options; +pub mod with_option_keys; +pub mod constants; +pub mod control; +pub mod date; +pub mod debezium; +pub mod fs_schema; +pub mod errors; +pub mod format_from_opts; +pub mod formats; +pub mod kafka_catalog; +pub mod message; +pub mod operator_config; +pub mod time_utils; +pub mod converter; +pub mod topology; + +// ── Re-exports from existing modules ── +pub use arrow_ext::FsExtensionType; +pub use message::{CheckpointBarrier, Watermark}; +pub use time_utils::{from_nanos, to_micros, to_millis, to_nanos}; + +// ── Re-exports from new modules ── +pub use fs_schema::{FsSchema, FsSchemaRef}; +pub use connector_options::ConnectorOptions; +pub use formats::{BadData, Format, Framing, JsonCompression, JsonFormat}; +pub use operator_config::MetadataField; + +// ── Well-known column names ── +pub use constants::sql_field::{TIMESTAMP_FIELD, UPDATING_META_FIELD}; +pub use topology::render_program_topology; + +// ── Environment variables ── +pub const JOB_ID_ENV: &str = "JOB_ID"; +pub const RUN_ID_ENV: &str = "RUN_ID"; + +// ── Metric names ── +pub const MESSAGES_RECV: &str = "fs_worker_messages_recv"; +pub const MESSAGES_SENT: &str = "fs_worker_messages_sent"; +pub const BYTES_RECV: &str = "fs_worker_bytes_recv"; +pub const BYTES_SENT: &str = "fs_worker_bytes_sent"; +pub const BATCHES_RECV: &str = "fs_worker_batches_recv"; +pub const BATCHES_SENT: &str = "fs_worker_batches_sent"; +pub const TX_QUEUE_SIZE: &str = "fs_worker_tx_queue_size"; +pub const TX_QUEUE_REM: &str = "fs_worker_tx_queue_rem"; +pub const DESERIALIZATION_ERRORS: &str = "fs_worker_deserialization_errors"; + +pub const LOOKUP_KEY_INDEX_FIELD: &str = "__lookup_key_index"; diff --git a/src/sql/common/operator_config.rs b/src/sql/common/operator_config.rs new file mode 100644 index 00000000..b5360cd7 --- /dev/null +++ b/src/sql/common/operator_config.rs @@ -0,0 +1,12 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file 
except in compliance with the License. + +use serde::{Deserialize, Serialize}; + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct MetadataField { + pub field_name: String, + pub key: String, + #[serde(default)] + pub data_type: Option, +} diff --git a/src/sql/common/time_utils.rs b/src/sql/common/time_utils.rs new file mode 100644 index 00000000..323445cd --- /dev/null +++ b/src/sql/common/time_utils.rs @@ -0,0 +1,74 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::HashMap; +use std::hash::Hash; +use std::time::{Duration, SystemTime, UNIX_EPOCH}; + +pub fn to_millis(time: SystemTime) -> u64 { + time.duration_since(UNIX_EPOCH).unwrap().as_millis() as u64 +} + +pub fn to_micros(time: SystemTime) -> u64 { + time.duration_since(UNIX_EPOCH).unwrap().as_micros() as u64 +} + +pub fn from_millis(ts: u64) -> SystemTime { + UNIX_EPOCH + Duration::from_millis(ts) +} + +pub fn from_micros(ts: u64) -> SystemTime { + UNIX_EPOCH + Duration::from_micros(ts) +} + +pub fn to_nanos(time: SystemTime) -> u128 { + time.duration_since(UNIX_EPOCH).unwrap().as_nanos() +} + +pub fn from_nanos(ts: u128) -> SystemTime { + UNIX_EPOCH + + Duration::from_secs((ts / 1_000_000_000) as u64) + + Duration::from_nanos((ts % 1_000_000_000) as u64) +} + +pub fn print_time(time: SystemTime) -> String { + chrono::DateTime::::from(time) + .format("%Y-%m-%d %H:%M:%S%.3f") + .to_string() +} + +/// Returns the number of days since the UNIX epoch (for Avro 
serialization). +pub fn days_since_epoch(time: SystemTime) -> i32 { + time.duration_since(UNIX_EPOCH) + .unwrap() + .as_secs() + .div_euclid(86400) as i32 +} + +pub fn single_item_hash_map, K: Hash + Eq, V>(key: I, value: V) -> HashMap { + let mut map = HashMap::new(); + map.insert(key.into(), value); + map +} + +pub fn string_to_map(s: &str, pair_delimiter: char) -> Option> { + if s.trim().is_empty() { + return Some(HashMap::new()); + } + + s.split(',') + .map(|s| { + let mut kv = s.trim().split(pair_delimiter); + Some((kv.next()?.trim().to_string(), kv.next()?.trim().to_string())) + }) + .collect() +} diff --git a/src/sql/common/topology.rs b/src/sql/common/topology.rs new file mode 100644 index 00000000..bc71d57f --- /dev/null +++ b/src/sql/common/topology.rs @@ -0,0 +1,280 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! EXPLAIN-like DAG text renderer for [`FsProgram`]. +//! +//! Renders a streaming pipeline topology as a human-readable ASCII graph using +//! Kahn's topological sort. Handles linear chains, fan-out, and fan-in (JOIN). + +use std::collections::{BTreeMap, VecDeque}; +use std::fmt::Write; + +use protocol::grpc::api::FsProgram; + +fn edge_type_label(edge_type: i32) -> &'static str { + match edge_type { + 1 => "Forward", + 2 => "Shuffle", + 3 => "LeftJoin", + 4 => "RightJoin", + _ => "Unknown", + } +} + +/// Render an [`FsProgram`] as an EXPLAIN-style topology string. 
+pub fn render_program_topology(program: &FsProgram) -> String { + if program.nodes.is_empty() { + return "(empty topology)".to_string(); + } + + struct EdgeInfo { target: i32, edge_type: i32 } + struct InputInfo { source: i32, edge_type: i32 } + + let node_map: BTreeMap = + program.nodes.iter().map(|n| (n.node_index, n)).collect(); + + let mut downstream: BTreeMap> = BTreeMap::new(); + let mut upstream: BTreeMap> = BTreeMap::new(); + let mut in_degree: BTreeMap = BTreeMap::new(); + + for idx in node_map.keys() { + in_degree.entry(*idx).or_insert(0); + } + for edge in &program.edges { + downstream.entry(edge.source).or_default().push(EdgeInfo { + target: edge.target, + edge_type: edge.edge_type, + }); + upstream.entry(edge.target).or_default().push(InputInfo { + source: edge.source, + edge_type: edge.edge_type, + }); + *in_degree.entry(edge.target).or_insert(0) += 1; + } + + // Kahn's topological sort + let mut queue: VecDeque = in_degree + .iter() + .filter(|(_, deg)| **deg == 0) + .map(|(idx, _)| *idx) + .collect(); + let mut topo_order: Vec = Vec::with_capacity(node_map.len()); + let mut remaining = in_degree.clone(); + while let Some(idx) = queue.pop_front() { + topo_order.push(idx); + if let Some(edges) = downstream.get(&idx) { + for e in edges { + if let Some(deg) = remaining.get_mut(&e.target) { + *deg -= 1; + if *deg == 0 { + queue.push_back(e.target); + } + } + } + } + } + for idx in node_map.keys() { + if !topo_order.contains(idx) { + topo_order.push(*idx); + } + } + + let is_source = |idx: &i32| upstream.get(idx).map_or(true, |v| v.is_empty()); + let is_sink = |idx: &i32| downstream.get(idx).map_or(true, |v| v.is_empty()); + + let mut out = String::new(); + let _ = writeln!( + out, + "Pipeline Topology ({} nodes, {} edges)", + program.nodes.len(), + program.edges.len(), + ); + let _ = writeln!(out, "{}", "=".repeat(50)); + + for (pos, &node_idx) in topo_order.iter().enumerate() { + let Some(node) = node_map.get(&node_idx) else { + continue; + }; + + let 
op_chain: String = node + .operators + .iter() + .map(|op| op.operator_name.as_str()) + .collect::>() + .join(" -> "); + + let role = if is_source(&node_idx) { + "Source" + } else if is_sink(&node_idx) { + "Sink" + } else { + "Operator" + }; + + let _ = writeln!(out); + let _ = writeln!( + out, + "[{role}] Node {node_idx} parallelism = {}", + node.parallelism, + ); + let _ = writeln!(out, " operators: {op_chain}"); + + if !node.description.is_empty() { + let _ = writeln!(out, " desc: {}", node.description); + } + + if let Some(inputs) = upstream.get(&node_idx) { + if inputs.len() == 1 { + let i = &inputs[0]; + let _ = writeln!( + out, + " input: <-- [{}] Node {}", + edge_type_label(i.edge_type), + i.source, + ); + } else if inputs.len() > 1 { + let _ = writeln!(out, " inputs:"); + for i in inputs { + let _ = writeln!( + out, + " <-- [{}] Node {}", + edge_type_label(i.edge_type), + i.source, + ); + } + } + } + + if let Some(outputs) = downstream.get(&node_idx) { + if outputs.len() == 1 { + let e = &outputs[0]; + let _ = writeln!( + out, + " output: --> [{}] Node {}", + edge_type_label(e.edge_type), + e.target, + ); + } else if outputs.len() > 1 { + let _ = writeln!(out, " outputs:"); + for e in outputs { + let _ = writeln!( + out, + " --> [{}] Node {}", + edge_type_label(e.edge_type), + e.target, + ); + } + } + } + + if pos < topo_order.len() - 1 { + let single_out = downstream.get(&node_idx).map_or(false, |v| v.len() == 1); + let next_idx = topo_order.get(pos + 1).copied(); + let is_direct = single_out + && next_idx.map_or(false, |n| { + downstream.get(&node_idx).map_or(false, |v| v[0].target == n) + }); + let next_single_in = next_idx + .and_then(|n| upstream.get(&n)) + .map_or(false, |v| v.len() == 1); + + if is_direct && next_single_in { + let etype = downstream.get(&node_idx).unwrap()[0].edge_type; + let _ = writeln!(out, " |"); + let _ = writeln!(out, " | {}", edge_type_label(etype)); + let _ = writeln!(out, " v"); + } + } + } + + out.trim_end().to_string() +} 
+ +#[cfg(test)] +mod tests { + use super::*; + use protocol::grpc::api::{ChainedOperator, FsEdge, FsNode, FsProgram}; + + fn make_node(node_index: i32, operators: Vec<(&str, &str)>, desc: &str, parallelism: u32) -> FsNode { + FsNode { + node_index, + node_id: node_index as u32, + parallelism, + description: desc.to_string(), + operators: operators + .into_iter() + .map(|(id, name)| ChainedOperator { + operator_id: id.to_string(), + operator_name: name.to_string(), + operator_config: Vec::new(), + }) + .collect(), + edges: Vec::new(), + } + } + + fn make_edge(source: i32, target: i32, edge_type: i32) -> FsEdge { + FsEdge { source, target, schema: None, edge_type } + } + + #[test] + fn empty_program_renders_placeholder() { + let program = FsProgram { nodes: vec![], edges: vec![], program_config: None }; + assert_eq!(render_program_topology(&program), "(empty topology)"); + } + + #[test] + fn linear_pipeline_renders_correctly() { + let program = FsProgram { + nodes: vec![ + make_node(0, vec![("src_0", "ConnectorSource")], "", 1), + make_node(1, vec![("val_1", "Value"), ("wm_2", "ExpressionWatermark")], "source -> watermark", 1), + make_node(2, vec![("sink_3", "ConnectorSink")], "sink (kafka)", 1), + ], + edges: vec![ + make_edge(0, 1, 1), + make_edge(1, 2, 1), + ], + program_config: None, + }; + let result = render_program_topology(&program); + assert!(result.contains("[Source] Node 0")); + assert!(result.contains("[Operator] Node 1")); + assert!(result.contains("[Sink] Node 2")); + assert!(result.contains("ConnectorSource")); + assert!(result.contains("Value -> ExpressionWatermark")); + assert!(result.contains("Forward")); + } + + #[test] + fn join_topology_shows_multiple_inputs() { + let program = FsProgram { + nodes: vec![ + make_node(0, vec![("src_a", "ConnectorSource")], "source A", 1), + make_node(1, vec![("src_b", "ConnectorSource")], "source B", 1), + make_node(2, vec![("join_0", "WindowJoin")], "join node", 2), + make_node(3, vec![("sink_0", 
"ConnectorSink")], "sink", 1), + ], + edges: vec![ + make_edge(0, 2, 3), // LeftJoin + make_edge(1, 2, 4), // RightJoin + make_edge(2, 3, 1), // Forward + ], + program_config: None, + }; + let result = render_program_topology(&program); + assert!(result.contains("inputs:")); + assert!(result.contains("LeftJoin")); + assert!(result.contains("RightJoin")); + assert!(result.contains("[Operator] Node 2")); + } +} diff --git a/src/sql/common/with_option_keys.rs b/src/sql/common/with_option_keys.rs new file mode 100644 index 00000000..a42f7405 --- /dev/null +++ b/src/sql/common/with_option_keys.rs @@ -0,0 +1,91 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ + + +pub const CONNECTOR: &str = "connector"; +pub const TYPE: &str = "type"; +pub const FORMAT: &str = "format"; +pub const DEFAULT_FORMAT_VALUE: &str = "json"; +pub const BAD_DATA: &str = "bad_data"; +pub const PARTITION_BY: &str = "partition_by"; + +pub const EVENT_TIME_FIELD: &str = "event_time_field"; +pub const WATERMARK_FIELD: &str = "watermark_field"; + +pub const IDLE_MICROS: &str = "idle_micros"; +pub const IDLE_TIME: &str = "idle_time"; + +pub const LOOKUP_CACHE_MAX_BYTES: &str = "lookup.cache.max_bytes"; +pub const LOOKUP_CACHE_TTL: &str = "lookup.cache.ttl"; + + +pub const CONNECTION_SCHEMA: &str = "connection_schema"; + + +pub const ADAPTER: &str = "adapter"; + +// ── Kafka ───────────────────────────────────────────────────────────────── + +pub const KAFKA_BOOTSTRAP_SERVERS: &str = "bootstrap.servers"; +pub const KAFKA_BOOTSTRAP_SERVERS_LEGACY: &str = "bootstrap_servers"; +pub const KAFKA_TOPIC: &str = "topic"; +pub const KAFKA_RATE_LIMIT_MESSAGES_PER_SECOND: &str = "rate_limit.messages_per_second"; +pub const KAFKA_VALUE_SUBJECT: &str = "value.subject"; +pub const KAFKA_SCAN_STARTUP_MODE: &str = "scan.startup.mode"; +pub const KAFKA_ISOLATION_LEVEL: &str = "isolation.level"; +pub const KAFKA_GROUP_ID: &str = "group.id"; +pub const KAFKA_GROUP_ID_LEGACY: &str = "group_id"; +pub const KAFKA_GROUP_ID_PREFIX: &str = "group.id.prefix"; +pub const KAFKA_SINK_COMMIT_MODE: &str = "sink.commit.mode"; +pub const KAFKA_SINK_KEY_FIELD: &str = "sink.key.field"; +pub const KAFKA_KEY_FIELD_LEGACY: &str = "key.field"; +pub const KAFKA_SINK_TIMESTAMP_FIELD: &str = "sink.timestamp.field"; +pub const KAFKA_TIMESTAMP_FIELD_LEGACY: &str = "timestamp.field"; + +// ── JSON format ─────────────────────────────────────────────────────────── + +pub const JSON_CONFLUENT_SCHEMA_REGISTRY: &str = "json.confluent_schema_registry"; +pub const JSON_CONFLUENT_SCHEMA_VERSION: &str = "json.confluent_schema_version"; +pub const JSON_INCLUDE_SCHEMA: &str = "json.include_schema"; +pub 
const JSON_DEBEZIUM: &str = "json.debezium"; +pub const JSON_UNSTRUCTURED: &str = "json.unstructured"; +pub const JSON_TIMESTAMP_FORMAT: &str = "json.timestamp_format"; +pub const JSON_DECIMAL_ENCODING: &str = "json.decimal_encoding"; +pub const JSON_COMPRESSION: &str = "json.compression"; + +// ── Avro ────────────────────────────────────────────────────────────────── + +pub const AVRO_CONFLUENT_SCHEMA_REGISTRY: &str = "avro.confluent_schema_registry"; +pub const AVRO_RAW_DATUMS: &str = "avro.raw_datums"; +pub const AVRO_INTO_UNSTRUCTURED_JSON: &str = "avro.into_unstructured_json"; +pub const AVRO_SCHEMA_ID: &str = "avro.schema_id"; + +// ── Parquet ─────────────────────────────────────────────────────────────── + +pub const PARQUET_COMPRESSION: &str = "parquet.compression"; +pub const PARQUET_ROW_GROUP_BYTES: &str = "parquet.row_group_bytes"; + +// ── Protobuf ──────────────────────────────────────────────────────────────── + +pub const PROTOBUF_INTO_UNSTRUCTURED_JSON: &str = "protobuf.into_unstructured_json"; +pub const PROTOBUF_MESSAGE_NAME: &str = "protobuf.message_name"; +pub const PROTOBUF_CONFLUENT_SCHEMA_REGISTRY: &str = "protobuf.confluent_schema_registry"; +pub const PROTOBUF_LENGTH_DELIMITED: &str = "protobuf.length_delimited"; + +// ── Framing ───────────────────────────────────────────────────────────────── + +pub const FRAMING_METHOD: &str = "framing.method"; +pub const FRAMING_MAX_LINE_LENGTH: &str = "framing.max_line_length"; + + +pub const FORMAT_DEBEZIUM_FLAG: &str = "format.debezium"; diff --git a/src/sql/datastream/logical.rs b/src/sql/datastream/logical.rs new file mode 100644 index 00000000..e26be9f3 --- /dev/null +++ b/src/sql/datastream/logical.rs @@ -0,0 +1,371 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use itertools::Itertools; + +use datafusion::arrow::datatypes::DataType; +use petgraph::Direction; +use petgraph::dot::Dot; +use petgraph::graph::DiGraph; +use std::collections::{HashMap, HashSet}; +use std::fmt::{Debug, Display, Formatter}; +use std::sync::Arc; +use datafusion_proto::protobuf::ArrowType; +use prost::Message; +use strum::{Display, EnumString}; +use protocol::grpc::api; +use crate::types::FsSchema; + +#[derive(Clone, Copy, Debug, Eq, PartialEq, EnumString, Display)] +pub enum OperatorName { + ExpressionWatermark, + ArrowValue, + ArrowKey, + Projection, + AsyncUdf, + Join, + InstantJoin, + LookupJoin, + WindowFunction, + TumblingWindowAggregate, + SlidingWindowAggregate, + SessionWindowAggregate, + UpdatingAggregate, + KeyBy, + ConnectorSource, + ConnectorSink, +} + +#[derive(Copy, Clone, Debug, Eq, PartialEq, PartialOrd, Ord)] +pub enum LogicalEdgeType { + Forward, + Shuffle, + LeftJoin, + RightJoin, +} + +impl Display for LogicalEdgeType { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + LogicalEdgeType::Forward => write!(f, "→"), + LogicalEdgeType::Shuffle => write!(f, "⤨"), + LogicalEdgeType::LeftJoin => write!(f, "-[left]⤨"), + LogicalEdgeType::RightJoin => write!(f, "-[right]⤨"), + } + } +} + +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct LogicalEdge { + pub edge_type: LogicalEdgeType, + pub schema: Arc, +} + +impl LogicalEdge { + pub fn new(edge_type: LogicalEdgeType, schema: FsSchema) -> Self { + LogicalEdge { + edge_type, + schema: Arc::new(schema), + } + } + + pub fn 
project_all(edge_type: LogicalEdgeType, schema: FsSchema) -> Self { + LogicalEdge { + edge_type, + schema: Arc::new(schema), + } + } +} + +#[derive(Clone, Debug)] +pub struct ChainedLogicalOperator { + pub operator_id: String, + pub operator_name: OperatorName, + pub operator_config: Vec, +} + +#[derive(Clone, Debug)] +pub struct OperatorChain { + pub(crate) operators: Vec, + pub(crate) edges: Vec>, +} + +impl OperatorChain { + pub fn new(operator: ChainedLogicalOperator) -> Self { + Self { + operators: vec![operator], + edges: vec![], + } + } + + pub fn iter( + &self, + ) -> impl Iterator>)> { + self.operators + .iter() + .zip_longest(self.edges.iter()) + .map(|e| e.left_and_right()) + .map(|(l, r)| (l.unwrap(), r)) + } + + pub fn iter_mut( + &mut self, + ) -> impl Iterator>)> { + self.operators + .iter_mut() + .zip_longest(self.edges.iter()) + .map(|e| e.left_and_right()) + .map(|(l, r)| (l.unwrap(), r)) + } + + pub fn first(&self) -> &ChainedLogicalOperator { + &self.operators[0] + } + + pub fn len(&self) -> usize { + self.operators.len() + } + + pub fn is_empty(&self) -> bool { + self.operators.is_empty() + } + + pub fn is_source(&self) -> bool { + self.operators[0].operator_name == OperatorName::ConnectorSource + } + + pub fn is_sink(&self) -> bool { + self.operators[0].operator_name == OperatorName::ConnectorSink + } +} + +#[derive(Clone)] +pub struct LogicalNode { + pub node_id: u32, + pub description: String, + pub operator_chain: OperatorChain, + pub parallelism: usize, +} + +impl LogicalNode { + pub fn single( + id: u32, + operator_id: String, + name: OperatorName, + config: Vec, + description: String, + parallelism: usize, + ) -> Self { + Self { + node_id: id, + description, + operator_chain: OperatorChain { + operators: vec![ChainedLogicalOperator { + operator_id, + operator_name: name, + operator_config: config, + }], + edges: vec![], + }, + parallelism, + } + } +} + +impl Display for LogicalNode { + fn fmt(&self, f: &mut Formatter<'_>) -> 
std::fmt::Result { + write!(f, "{}", self.description) + } +} + +impl Debug for LogicalNode { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!( + f, + "{}[{}]", + self.operator_chain + .operators + .iter() + .map(|op| op.operator_id.clone()) + .collect::>() + .join(" -> "), + self.parallelism + ) + } +} + +pub type LogicalGraph = DiGraph; + +pub trait Optimizer { + fn optimize_once(&self, plan: &mut LogicalGraph) -> bool; + + fn optimize(&self, plan: &mut LogicalGraph) { + loop { + if !self.optimize_once(plan) { + break; + } + } + } +} + +#[derive(Clone, Debug, Eq, PartialEq, Hash, PartialOrd)] +pub struct DylibUdfConfig { + pub dylib_path: String, + pub arg_types: Vec, + pub return_type: DataType, + pub aggregate: bool, + pub is_async: bool, +} + +#[derive(Clone, Debug, Eq, PartialEq, Hash)] +pub struct PythonUdfConfig { + pub arg_types: Vec, + pub return_type: DataType, + pub name: Arc, + pub definition: Arc, +} + +#[derive(Clone, Debug, Default)] +pub struct ProgramConfig { + pub udf_dylibs: HashMap, + pub python_udfs: HashMap, +} + +#[derive(Clone, Debug, Default)] +pub struct LogicalProgram { + pub graph: LogicalGraph, + pub program_config: ProgramConfig, +} + +impl LogicalProgram { + pub fn new(graph: LogicalGraph, program_config: ProgramConfig) -> Self { + Self { + graph, + program_config, + } + } + + pub fn optimize(&mut self, optimizer: &dyn Optimizer) { + optimizer.optimize(&mut self.graph); + } + + pub fn update_parallelism(&mut self, overrides: &HashMap) { + for node in self.graph.node_weights_mut() { + if let Some(p) = overrides.get(&node.node_id) { + node.parallelism = *p; + } + } + } + + pub fn dot(&self) -> String { + format!("{:?}", Dot::with_config(&self.graph, &[])) + } + + pub fn task_count(&self) -> usize { + self.graph.node_weights().map(|nw| nw.parallelism).sum() + } + + pub fn sources(&self) -> HashSet { + self.graph + .externals(Direction::Incoming) + .map(|t| self.graph.node_weight(t).unwrap().node_id) + .collect() + } 
+ + pub fn tasks_per_operator(&self) -> HashMap { + let mut tasks_per_operator = HashMap::new(); + for node in self.graph.node_weights() { + for op in &node.operator_chain.operators { + tasks_per_operator.insert(op.operator_id.clone(), node.parallelism); + } + } + tasks_per_operator + } + + pub fn operator_names_by_id(&self) -> HashMap { + let mut m = HashMap::new(); + for node in self.graph.node_weights() { + for op in &node.operator_chain.operators { + m.insert(op.operator_id.clone(), op.operator_name.to_string()); + } + } + m + } + + pub fn tasks_per_node(&self) -> HashMap { + let mut tasks_per_node = HashMap::new(); + for node in self.graph.node_weights() { + tasks_per_node.insert(node.node_id, node.parallelism); + } + tasks_per_node + } + + pub fn features(&self) -> HashSet { + let mut s = HashSet::new(); + for n in self.graph.node_weights() { + for t in &n.operator_chain.operators { + let Some(tag) = t.operator_name.feature_tag() else { + continue; + }; + s.insert(tag.to_string()); + } + } + s + } +} + + +impl From for api::DylibUdfConfig { + fn from(from: DylibUdfConfig) -> Self { + api::DylibUdfConfig { + dylib_path: from.dylib_path, + arg_types: from + .arg_types + .iter() + .map(|t| { + ArrowType::try_from(t) + .expect("unsupported data type") + .encode_to_vec() + }) + .collect(), + return_type: ArrowType::try_from(&from.return_type) + .expect("unsupported data type") + .encode_to_vec(), + aggregate: from.aggregate, + is_async: from.is_async, + } + } +} + +impl From for DylibUdfConfig { + fn from(from: api::DylibUdfConfig) -> Self { + DylibUdfConfig { + dylib_path: from.dylib_path, + arg_types: from + .arg_types + .iter() + .map(|t| { + DataType::try_from( + &ArrowType::decode(&mut t.as_slice()).expect("invalid arrow type"), + ) + .expect("invalid arrow type") + }) + .collect(), + return_type: DataType::try_from( + &ArrowType::decode(&mut from.return_type.as_slice()).unwrap(), + ) + .expect("invalid arrow type"), + aggregate: from.aggregate, + is_async: 
from.is_async, + } + } +} \ No newline at end of file diff --git a/src/sql/datastream/mod.rs b/src/sql/datastream/mod.rs new file mode 100644 index 00000000..922801f6 --- /dev/null +++ b/src/sql/datastream/mod.rs @@ -0,0 +1,13 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +pub mod logical; diff --git a/src/sql/extensions/aggregate.rs b/src/sql/extensions/aggregate.rs new file mode 100644 index 00000000..645315af --- /dev/null +++ b/src/sql/extensions/aggregate.rs @@ -0,0 +1,633 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::fmt::Formatter; +use std::sync::Arc; +use std::time::Duration; + +use arrow_array::types::IntervalMonthDayNanoType; +use datafusion::common::{Column, DFSchemaRef, Result, ScalarValue, internal_err}; +use datafusion::logical_expr::{ + self, expr::ScalarFunction, BinaryExpr, Expr, Extension, LogicalPlan, + UserDefinedLogicalNodeCore, +}; +use datafusion_common::{plan_err, DFSchema, DataFusionError}; +use datafusion_expr::Aggregate; +use datafusion_proto::physical_plan::{AsExecutionPlan, DefaultPhysicalExtensionCodec}; +use datafusion_proto::physical_plan::to_proto::serialize_physical_expr; +use datafusion_proto::protobuf::PhysicalPlanNode; +use prost::Message; +use protocol::grpc::api::{ + SessionWindowAggregateOperator, SlidingWindowAggregateOperator, TumblingWindowAggregateOperator, +}; + +use crate::multifield_partial_ord; +use crate::sql::common::constants::{extension_node, proto_operator_name}; +use crate::sql::common::{FsSchema, FsSchemaRef}; +use crate::sql::extensions::{ + CompiledTopologyNode, StreamingOperatorBlueprint, SystemTimestampInjectorNode, +}; +use crate::sql::logical_node::logical::{LogicalEdge, LogicalEdgeType, LogicalNode, OperatorName}; +use crate::sql::logical_planner::planner::{NamedNode, Planner, SplitPlanOutput}; +use crate::sql::physical::{window, FsPhysicalExtensionCodec}; +use crate::sql::types::{ + DFField, TIMESTAMP_FIELD, WindowBehavior, WindowType, fields_with_qualifiers, + schema_from_df_fields, schema_from_df_fields_with_metadata, +}; + +pub(crate) const STREAM_AGG_EXTENSION_NAME: &str = extension_node::STREAM_WINDOW_AGGREGATE; + +/// Represents a streaming windowed aggregation node in the logical plan. 
+#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub(crate) struct StreamWindowAggregateNode { + pub(crate) window_spec: WindowBehavior, + pub(crate) base_agg_plan: LogicalPlan, + pub(crate) output_schema: DFSchemaRef, + pub(crate) partition_keys: Vec, + pub(crate) post_aggregation_plan: LogicalPlan, +} + +multifield_partial_ord!( + StreamWindowAggregateNode, + base_agg_plan, + partition_keys, + post_aggregation_plan +); + +impl StreamWindowAggregateNode { + /// Safely constructs a new node, computing the final projection without panicking. + pub fn try_new( + window_spec: WindowBehavior, + base_agg_plan: LogicalPlan, + partition_keys: Vec, + ) -> Result { + let post_aggregation_plan = + WindowBoundaryMath::build_post_aggregation(&base_agg_plan, window_spec.clone())?; + + Ok(Self { + window_spec, + base_agg_plan, + output_schema: post_aggregation_plan.schema().clone(), + partition_keys, + post_aggregation_plan, + }) + } + + fn build_tumbling_operator( + &self, + planner: &Planner, + node_id: usize, + input_schema: DFSchemaRef, + duration: Duration, + ) -> Result { + let binning_expr = planner.binning_function_proto(duration, input_schema.clone())?; + + let SplitPlanOutput { + partial_aggregation_plan, + partial_schema, + finish_plan, + } = planner.split_physical_plan(self.partition_keys.clone(), &self.base_agg_plan, true)?; + + let final_physical = planner.sync_plan(&self.post_aggregation_plan)?; + let final_physical_proto = PhysicalPlanNode::try_from_physical_plan( + final_physical, + &FsPhysicalExtensionCodec::default(), + )?; + + let operator_config = TumblingWindowAggregateOperator { + name: proto_operator_name::TUMBLING_WINDOW.to_string(), + width_micros: duration.as_micros() as u64, + binning_function: binning_expr.encode_to_vec(), + input_schema: Some( + FsSchema::from_schema_keys( + Arc::new(input_schema.as_ref().into()), + self.partition_keys.clone(), + )? 
+ .into(), + ), + partial_schema: Some(partial_schema.into()), + partial_aggregation_plan: partial_aggregation_plan.encode_to_vec(), + final_aggregation_plan: finish_plan.encode_to_vec(), + final_projection: Some(final_physical_proto.encode_to_vec()), + }; + + Ok(LogicalNode::single( + node_id as u32, + format!("tumbling_{node_id}"), + OperatorName::TumblingWindowAggregate, + operator_config.encode_to_vec(), + format!("TumblingWindow<{}>", operator_config.name), + 1, + )) + } + + fn build_sliding_operator( + &self, + planner: &Planner, + node_id: usize, + input_schema: DFSchemaRef, + duration: Duration, + slide_interval: Duration, + ) -> Result { + let binning_expr = planner.binning_function_proto(slide_interval, input_schema.clone())?; + + let SplitPlanOutput { + partial_aggregation_plan, + partial_schema, + finish_plan, + } = planner.split_physical_plan(self.partition_keys.clone(), &self.base_agg_plan, true)?; + + let final_physical = planner.sync_plan(&self.post_aggregation_plan)?; + let final_physical_proto = PhysicalPlanNode::try_from_physical_plan( + final_physical, + &FsPhysicalExtensionCodec::default(), + )?; + + let operator_config = SlidingWindowAggregateOperator { + name: format!("SlidingWindow<{duration:?}>"), + width_micros: duration.as_micros() as u64, + slide_micros: slide_interval.as_micros() as u64, + binning_function: binning_expr.encode_to_vec(), + input_schema: Some( + FsSchema::from_schema_keys( + Arc::new(input_schema.as_ref().into()), + self.partition_keys.clone(), + )? 
+ .into(), + ), + partial_schema: Some(partial_schema.into()), + partial_aggregation_plan: partial_aggregation_plan.encode_to_vec(), + final_aggregation_plan: finish_plan.encode_to_vec(), + final_projection: final_physical_proto.encode_to_vec(), + }; + + Ok(LogicalNode::single( + node_id as u32, + format!("sliding_window_{node_id}"), + OperatorName::SlidingWindowAggregate, + operator_config.encode_to_vec(), + proto_operator_name::SLIDING_WINDOW_LABEL.to_string(), + 1, + )) + } + + fn build_session_operator( + &self, + planner: &Planner, + node_id: usize, + input_schema: DFSchemaRef, + ) -> Result { + let WindowBehavior::FromOperator { + window: WindowType::Session { gap }, + window_index, + window_field, + is_nested: false, + } = &self.window_spec + else { + return plan_err!("Expected standard session window configuration"); + }; + + let output_fields = fields_with_qualifiers(self.base_agg_plan.schema()); + let LogicalPlan::Aggregate(base_agg) = self.base_agg_plan.clone() else { + return plan_err!("Base plan must be an Aggregate node"); + }; + + let key_count = self.partition_keys.len(); + let unkeyed_schema = Arc::new(schema_from_df_fields_with_metadata( + &output_fields[key_count..], + self.base_agg_plan.schema().metadata().clone(), + )?); + + let unkeyed_agg_node = Aggregate::try_new_with_schema( + base_agg.input.clone(), + vec![], + base_agg.aggr_expr.clone(), + unkeyed_schema, + )?; + + let physical_agg = planner.sync_plan(&LogicalPlan::Aggregate(unkeyed_agg_node))?; + let physical_agg_proto = PhysicalPlanNode::try_from_physical_plan( + physical_agg, + &FsPhysicalExtensionCodec::default(), + )?; + + let operator_config = SessionWindowAggregateOperator { + name: format!("session_window_{node_id}"), + gap_micros: gap.as_micros() as u64, + window_field_name: window_field.name().to_string(), + window_index: *window_index as u64, + input_schema: Some( + FsSchema::from_schema_keys( + Arc::new(input_schema.as_ref().into()), + self.partition_keys.clone(), + )? 
+ .into(), + ), + unkeyed_aggregate_schema: None, + partial_aggregation_plan: vec![], + final_aggregation_plan: physical_agg_proto.encode_to_vec(), + }; + + Ok(LogicalNode::single( + node_id as u32, + format!("SessionWindow<{gap:?}>"), + OperatorName::SessionWindowAggregate, + operator_config.encode_to_vec(), + operator_config.name.clone(), + 1, + )) + } + + fn build_instant_operator( + &self, + planner: &Planner, + node_id: usize, + input_schema: DFSchemaRef, + apply_final_projection: bool, + ) -> Result { + let ts_column_expr = + Expr::Column(Column::new_unqualified(TIMESTAMP_FIELD.to_string())); + let binning_expr = planner.create_physical_expr(&ts_column_expr, &input_schema)?; + let binning_proto = serialize_physical_expr(&binning_expr, &DefaultPhysicalExtensionCodec {})?; + + let final_projection_payload = if apply_final_projection { + let physical_plan = planner.sync_plan(&self.post_aggregation_plan)?; + let proto_node = PhysicalPlanNode::try_from_physical_plan( + physical_plan, + &FsPhysicalExtensionCodec::default(), + )?; + Some(proto_node.encode_to_vec()) + } else { + None + }; + + let SplitPlanOutput { + partial_aggregation_plan, + partial_schema, + finish_plan, + } = planner.split_physical_plan(self.partition_keys.clone(), &self.base_agg_plan, true)?; + + let operator_config = TumblingWindowAggregateOperator { + name: proto_operator_name::INSTANT_WINDOW.to_string(), + width_micros: 0, + binning_function: binning_proto.encode_to_vec(), + input_schema: Some( + FsSchema::from_schema_keys( + Arc::new(input_schema.as_ref().into()), + self.partition_keys.clone(), + )? 
+ .into(), + ), + partial_schema: Some(partial_schema.into()), + partial_aggregation_plan: partial_aggregation_plan.encode_to_vec(), + final_aggregation_plan: finish_plan.encode_to_vec(), + final_projection: final_projection_payload, + }; + + Ok(LogicalNode::single( + node_id as u32, + format!("instant_window_{node_id}"), + OperatorName::TumblingWindowAggregate, + operator_config.encode_to_vec(), + proto_operator_name::INSTANT_WINDOW_LABEL.to_string(), + 1, + )) + } +} + +impl StreamingOperatorBlueprint for StreamWindowAggregateNode { + fn operator_identity(&self) -> Option { + None + } + + fn compile_to_graph_node( + &self, + planner: &Planner, + node_id: usize, + mut input_schemas: Vec, + ) -> Result { + if input_schemas.len() != 1 { + return plan_err!("StreamWindowAggregateNode requires exactly one input schema"); + } + + let raw_schema = input_schemas.remove(0); + let df_schema = Arc::new(DFSchema::try_from(raw_schema.schema.as_ref().clone())?); + + let logical_operator = match &self.window_spec { + WindowBehavior::FromOperator { window, is_nested, .. } => { + if *is_nested { + self.build_instant_operator(planner, node_id, df_schema, true)? + } else { + match window { + WindowType::Tumbling { width } => { + self.build_tumbling_operator(planner, node_id, df_schema, *width)? + } + WindowType::Sliding { width, slide } => { + self.build_sliding_operator(planner, node_id, df_schema, *width, *slide)? + } + WindowType::Session { .. } => { + self.build_session_operator(planner, node_id, df_schema)? 
+ } + WindowType::Instant => { + return plan_err!( + "Instant window is invalid within standard operator context" + ); + } + } + } + } + WindowBehavior::InData => self + .build_instant_operator(planner, node_id, df_schema, false) + .map_err(|e| e.context("Failed compiling instant window"))?, + }; + + let link = LogicalEdge::project_all(LogicalEdgeType::Shuffle, (*raw_schema).clone()); + Ok(CompiledTopologyNode { + execution_unit: logical_operator, + routing_edges: vec![link], + }) + } + + fn yielded_schema(&self) -> FsSchema { + let schema_ref = (*self.output_schema).clone().into(); + FsSchema::from_schema_unkeyed(Arc::new(schema_ref)).expect( + "StreamWindowAggregateNode output schema must contain timestamp column", + ) + } +} + +impl UserDefinedLogicalNodeCore for StreamWindowAggregateNode { + fn name(&self) -> &str { + STREAM_AGG_EXTENSION_NAME + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + vec![&self.base_agg_plan] + } + + fn schema(&self) -> &DFSchemaRef { + &self.output_schema + } + + fn expressions(&self) -> Vec { + vec![] + } + + fn fmt_for_explain(&self, f: &mut Formatter) -> std::fmt::Result { + let spec_desc = match &self.window_spec { + WindowBehavior::InData => "InData".to_string(), + WindowBehavior::FromOperator { window, .. 
} => format!("FromOperator({window:?})"), + }; + write!( + f, + "StreamWindowAggregate: {} | spec: {}", + self.schema(), + spec_desc + ) + } + + fn with_exprs_and_inputs(&self, _exprs: Vec, inputs: Vec) -> Result { + if inputs.len() != 1 { + return internal_err!("StreamWindowAggregateNode expects exactly 1 input"); + } + Self::try_new( + self.window_spec.clone(), + inputs[0].clone(), + self.partition_keys.clone(), + ) + } +} + +// ----------------------------------------------------------------------------- +// Dedicated boundary math for window bin / post-aggregation projection +// ----------------------------------------------------------------------------- + +struct WindowBoundaryMath; + +impl WindowBoundaryMath { + fn interval_nanos(nanos: i64) -> Expr { + Expr::Literal( + ScalarValue::IntervalMonthDayNano(Some( + IntervalMonthDayNanoType::make_value(0, 0, nanos), + )), + None, + ) + } + + fn build_post_aggregation( + agg_plan: &LogicalPlan, + window_spec: WindowBehavior, + ) -> Result { + let ts_field: DFField = agg_plan + .inputs() + .first() + .ok_or_else(|| DataFusionError::Plan("Aggregate has no inputs".into()))? + .schema() + .qualified_field_with_unqualified_name(TIMESTAMP_FIELD)? + .into(); + + let plan_with_ts = LogicalPlan::Extension(Extension { + node: Arc::new(SystemTimestampInjectorNode::try_new( + agg_plan.clone(), + ts_field.qualifier().cloned(), + )?), + }); + + let (win_field, win_index, duration, is_nested) = match window_spec { + WindowBehavior::InData => return Ok(plan_with_ts), + WindowBehavior::FromOperator { + window, + window_field, + window_index, + is_nested, + } => match window { + WindowType::Tumbling { width } | WindowType::Sliding { width, .. } => { + (window_field, window_index, width, is_nested) + } + WindowType::Session { .. 
} => { + return Ok(LogicalPlan::Extension(Extension { + node: Arc::new(InjectWindowFieldNode::try_new( + plan_with_ts, + window_field, + window_index, + )?), + })); + } + WindowType::Instant => return Ok(plan_with_ts), + }, + }; + + if is_nested { + return Self::build_nested_projection(plan_with_ts, win_field, win_index, duration); + } + + let mut output_fields = fields_with_qualifiers(agg_plan.schema()); + let mut projections: Vec<_> = output_fields + .iter() + .map(|f| Expr::Column(f.qualified_column())) + .collect(); + + let ts_col_expr = Expr::Column(Column::new(ts_field.qualifier().cloned(), ts_field.name())); + + output_fields.insert(win_index, win_field.clone()); + + let win_func_expr = Expr::ScalarFunction(ScalarFunction { + func: window(), + args: vec![ + ts_col_expr.clone(), + Expr::BinaryExpr(BinaryExpr { + left: Box::new(ts_col_expr.clone()), + op: logical_expr::Operator::Plus, + right: Box::new(Self::interval_nanos(duration.as_nanos() as i64)), + }), + ], + }); + + projections.insert( + win_index, + win_func_expr.alias_qualified(win_field.qualifier().cloned(), win_field.name()), + ); + + output_fields.push(ts_field); + + let bin_end_expr = Expr::BinaryExpr(BinaryExpr { + left: Box::new(ts_col_expr), + op: logical_expr::Operator::Plus, + right: Box::new(Self::interval_nanos((duration.as_nanos() - 1) as i64)), + }); + projections.push(bin_end_expr); + + Ok(LogicalPlan::Projection(logical_expr::Projection::try_new_with_schema( + projections, + Arc::new(plan_with_ts), + Arc::new(schema_from_df_fields(&output_fields)?), + )?)) + } + + fn build_nested_projection( + plan: LogicalPlan, + win_field: DFField, + win_index: usize, + duration: Duration, + ) -> Result { + let ts_field: DFField = plan + .schema() + .qualified_field_with_unqualified_name(TIMESTAMP_FIELD)? 
+ .into(); + let ts_col_expr = Expr::Column(Column::new(ts_field.qualifier().cloned(), ts_field.name())); + + let mut output_fields = fields_with_qualifiers(plan.schema()); + let mut projections: Vec<_> = output_fields + .iter() + .map(|f| Expr::Column(f.qualified_column())) + .collect(); + + output_fields.insert(win_index, win_field.clone()); + + let win_func_expr = Expr::ScalarFunction(ScalarFunction { + func: window(), + args: vec![ + Expr::BinaryExpr(BinaryExpr { + left: Box::new(ts_col_expr.clone()), + op: logical_expr::Operator::Minus, + right: Box::new(Self::interval_nanos(duration.as_nanos() as i64 - 1)), + }), + Expr::BinaryExpr(BinaryExpr { + left: Box::new(ts_col_expr), + op: logical_expr::Operator::Plus, + right: Box::new(Self::interval_nanos(1)), + }), + ], + }); + + projections.insert( + win_index, + win_func_expr.alias_qualified(win_field.qualifier().cloned(), win_field.name()), + ); + + Ok(LogicalPlan::Projection(logical_expr::Projection::try_new_with_schema( + projections, + Arc::new(plan), + Arc::new(schema_from_df_fields(&output_fields)?), + )?)) + } +} + +// ----------------------------------------------------------------------------- +// Field injection node (session window column placement) +// ----------------------------------------------------------------------------- + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +struct InjectWindowFieldNode { + pub(crate) upstream_plan: LogicalPlan, + pub(crate) target_field: DFField, + pub(crate) insertion_index: usize, + pub(crate) new_schema: DFSchemaRef, +} + +multifield_partial_ord!(InjectWindowFieldNode, upstream_plan, insertion_index); + +impl InjectWindowFieldNode { + fn try_new( + upstream_plan: LogicalPlan, + target_field: DFField, + insertion_index: usize, + ) -> Result { + let mut fields = fields_with_qualifiers(upstream_plan.schema()); + fields.insert(insertion_index, target_field.clone()); + let meta = upstream_plan.schema().metadata().clone(); + + Ok(Self { + upstream_plan, + 
target_field, + insertion_index, + new_schema: Arc::new(schema_from_df_fields_with_metadata(&fields, meta)?), + }) + } +} + +impl UserDefinedLogicalNodeCore for InjectWindowFieldNode { + fn name(&self) -> &str { + "InjectWindowFieldNode" + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + vec![&self.upstream_plan] + } + + fn schema(&self) -> &DFSchemaRef { + &self.new_schema + } + + fn expressions(&self) -> Vec { + vec![] + } + + fn fmt_for_explain(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!( + f, + "InjectWindowField: insert {:?} at offset {}", + self.target_field, self.insertion_index + ) + } + + fn with_exprs_and_inputs(&self, _exprs: Vec, inputs: Vec) -> Result { + if inputs.len() != 1 { + return internal_err!("InjectWindowFieldNode expects exactly 1 input"); + } + Self::try_new( + inputs[0].clone(), + self.target_field.clone(), + self.insertion_index, + ) + } +} diff --git a/src/sql/extensions/async_udf.rs b/src/sql/extensions/async_udf.rs new file mode 100644 index 00000000..ee2ce60a --- /dev/null +++ b/src/sql/extensions/async_udf.rs @@ -0,0 +1,243 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::fmt::Formatter; +use std::sync::Arc; +use std::time::Duration; + +use datafusion::common::{DFSchemaRef, Result}; +use datafusion::logical_expr::{ + Expr, LogicalPlan, UserDefinedLogicalNode, UserDefinedLogicalNodeCore, +}; +use datafusion_common::{internal_err, plan_err}; +use datafusion_proto::physical_plan::DefaultPhysicalExtensionCodec; +use datafusion_proto::physical_plan::to_proto::serialize_physical_expr; +use prost::Message; +use protocol::grpc::api::{AsyncUdfOperator, AsyncUdfOrdering}; + +use crate::multifield_partial_ord; +use crate::sql::common::constants::extension_node; +use crate::sql::common::{FsSchema, FsSchemaRef}; +use crate::sql::extensions::streaming_operator_blueprint::{CompiledTopologyNode, StreamingOperatorBlueprint}; +use crate::sql::logical_node::logical::{ + DylibUdfConfig, LogicalEdge, LogicalEdgeType, LogicalNode, OperatorName, +}; +use crate::sql::common::constants::sql_field; +use crate::sql::logical_planner::planner::{NamedNode, Planner}; +use crate::sql::types::{DFField, fields_with_qualifiers, schema_from_df_fields}; + +pub(crate) const NODE_TYPE_NAME: &str = extension_node::ASYNC_FUNCTION_EXECUTION; + +/// Represents a logical node that executes an external asynchronous function (UDF) +/// and projects the final results into the streaming pipeline. 
+#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub(crate) struct AsyncFunctionExecutionNode { + pub(crate) upstream_plan: Arc, + pub(crate) operator_name: String, + pub(crate) function_config: DylibUdfConfig, + pub(crate) invocation_args: Vec, + pub(crate) result_projections: Vec, + pub(crate) preserve_ordering: bool, + pub(crate) concurrency_limit: usize, + pub(crate) execution_timeout: Duration, + pub(crate) resolved_schema: DFSchemaRef, +} + +multifield_partial_ord!( + AsyncFunctionExecutionNode, + upstream_plan, + operator_name, + function_config, + invocation_args, + result_projections, + preserve_ordering, + concurrency_limit, + execution_timeout +); + +impl AsyncFunctionExecutionNode { + /// Compiles logical expressions into serialized physical protobuf bytes. + fn compile_physical_expressions( + &self, + planner: &Planner, + expressions: &[Expr], + schema_context: &DFSchemaRef, + ) -> Result>> { + expressions + .iter() + .map(|logical_expr| { + let physical_expr = planner.create_physical_expr(logical_expr, schema_context)?; + let serialized = + serialize_physical_expr(&physical_expr, &DefaultPhysicalExtensionCodec {})?; + Ok(serialized.encode_to_vec()) + }) + .collect() + } + + /// Computes the intermediate schema which bridges the upstream output + /// and the raw asynchronous result injected by the UDF execution. 
+ fn compute_intermediate_schema(&self) -> Result { + let mut fields = fields_with_qualifiers(self.upstream_plan.schema()); + + let raw_result_field = DFField::new( + None, + sql_field::ASYNC_RESULT, + self.function_config.return_type.clone(), + true, + ); + fields.push(raw_result_field); + + Ok(Arc::new(schema_from_df_fields(&fields)?)) + } + + fn to_protobuf_config( + &self, + compiled_args: Vec>, + compiled_projections: Vec>, + ) -> AsyncUdfOperator { + let ordering_strategy = if self.preserve_ordering { + AsyncUdfOrdering::Ordered + } else { + AsyncUdfOrdering::Unordered + }; + + AsyncUdfOperator { + name: self.operator_name.clone(), + udf: Some(self.function_config.clone().into()), + arg_exprs: compiled_args, + final_exprs: compiled_projections, + ordering: ordering_strategy as i32, + max_concurrency: self.concurrency_limit as u32, + timeout_micros: self.execution_timeout.as_micros() as u64, + } + } +} + +impl StreamingOperatorBlueprint for AsyncFunctionExecutionNode { + fn operator_identity(&self) -> Option { + None + } + + fn compile_to_graph_node( + &self, + planner: &Planner, + node_index: usize, + mut input_schemas: Vec, + ) -> Result { + if input_schemas.len() != 1 { + return plan_err!("AsyncFunctionExecutionNode requires exactly one input schema"); + } + + let compiled_args = self.compile_physical_expressions( + planner, + &self.invocation_args, + self.upstream_plan.schema(), + )?; + + let intermediate_schema = self.compute_intermediate_schema()?; + let compiled_projections = self.compile_physical_expressions( + planner, + &self.result_projections, + &intermediate_schema, + )?; + + let operator_config = self.to_protobuf_config(compiled_args, compiled_projections); + + let logical_node = LogicalNode::single( + node_index as u32, + format!("async_udf_{node_index}"), + OperatorName::AsyncUdf, + operator_config.encode_to_vec(), + format!("AsyncUdf<{}>", self.operator_name), + 1, + ); + + let upstream_schema = input_schemas.remove(0); + let data_edge = + 
LogicalEdge::project_all(LogicalEdgeType::Forward, (*upstream_schema).clone()); + + Ok(CompiledTopologyNode { + execution_unit: logical_node, + routing_edges: vec![data_edge], + }) + } + + fn yielded_schema(&self) -> FsSchema { + let arrow_fields: Vec<_> = self + .resolved_schema + .fields() + .iter() + .map(|f| (**f).clone()) + .collect(); + + FsSchema::from_fields(arrow_fields) + } +} + +impl UserDefinedLogicalNodeCore for AsyncFunctionExecutionNode { + fn name(&self) -> &str { + NODE_TYPE_NAME + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + vec![&self.upstream_plan] + } + + fn schema(&self) -> &DFSchemaRef { + &self.resolved_schema + } + + fn expressions(&self) -> Vec { + self.invocation_args + .iter() + .chain(self.result_projections.iter()) + .cloned() + .collect() + } + + fn fmt_for_explain(&self, f: &mut Formatter) -> std::fmt::Result { + write!( + f, + "AsyncFunctionExecution<{}>: Concurrency={}, Ordered={}", + self.operator_name, + self.concurrency_limit, + self.preserve_ordering + ) + } + + fn with_exprs_and_inputs(&self, exprs: Vec, mut inputs: Vec) -> Result { + if inputs.len() != 1 { + return internal_err!( + "AsyncFunctionExecutionNode expects exactly 1 input, but received {}", + inputs.len() + ); + } + + if UserDefinedLogicalNode::expressions(self) != exprs { + return internal_err!( + "Attempted to mutate async UDF expressions during logical planning, which is not supported." 
+ ); + } + + Ok(Self { + upstream_plan: Arc::new(inputs.remove(0)), + operator_name: self.operator_name.clone(), + function_config: self.function_config.clone(), + invocation_args: self.invocation_args.clone(), + result_projections: self.result_projections.clone(), + preserve_ordering: self.preserve_ordering, + concurrency_limit: self.concurrency_limit, + execution_timeout: self.execution_timeout, + resolved_schema: self.resolved_schema.clone(), + }) + } +} diff --git a/src/sql/extensions/debezium.rs b/src/sql/extensions/debezium.rs new file mode 100644 index 00000000..2afda2b4 --- /dev/null +++ b/src/sql/extensions/debezium.rs @@ -0,0 +1,384 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::sync::Arc; + +use arrow_schema::{DataType, Field, Schema}; +use datafusion::common::{ + internal_err, plan_err, DFSchema, DFSchemaRef, DataFusionError, Result, TableReference, +}; +use datafusion::logical_expr::{Expr, LogicalPlan, UserDefinedLogicalNodeCore}; +use datafusion::physical_plan::DisplayAs; + +use crate::multifield_partial_ord; +use crate::sql::common::constants::{cdc, extension_node}; +use crate::sql::common::{FsSchema, FsSchemaRef, UPDATING_META_FIELD}; +use crate::sql::logical_planner::planner::{NamedNode, Planner}; +use crate::sql::physical::updating_meta_field; +use crate::sql::types::TIMESTAMP_FIELD; + +use super::{CompiledTopologyNode, StreamingOperatorBlueprint}; + +// ----------------------------------------------------------------------------- +// Constants & Identifiers +// ----------------------------------------------------------------------------- + +pub(crate) const UNROLL_NODE_NAME: &str = extension_node::UNROLL_DEBEZIUM_PAYLOAD; +pub(crate) const PACK_NODE_NAME: &str = extension_node::PACK_DEBEZIUM_ENVELOPE; + +// ----------------------------------------------------------------------------- +// Core Schema Codec +// ----------------------------------------------------------------------------- + +/// Transforms between flat schemas and Debezium CDC envelopes. +pub(crate) struct DebeziumSchemaCodec; + +impl DebeziumSchemaCodec { + /// Wraps a flat physical schema into a Debezium CDC envelope structure. 
+ pub(crate) fn wrap_into_envelope( + flat_schema: &DFSchemaRef, + qualifier_override: Option, + ) -> Result { + let ts_field = if flat_schema.has_column_with_unqualified_name(TIMESTAMP_FIELD) { + Some(flat_schema.field_with_unqualified_name(TIMESTAMP_FIELD)?.clone()) + } else { + None + }; + + let payload_fields: Vec<_> = flat_schema + .fields() + .iter() + .filter(|f| f.name() != TIMESTAMP_FIELD && f.name() != UPDATING_META_FIELD) + .cloned() + .collect(); + + let payload_struct_type = DataType::Struct(payload_fields.into()); + + let mut envelope_fields = vec![ + Arc::new(Field::new( + cdc::BEFORE, + payload_struct_type.clone(), + true, + )), + Arc::new(Field::new(cdc::AFTER, payload_struct_type, true)), + Arc::new(Field::new(cdc::OP, DataType::Utf8, true)), + ]; + + if let Some(ts) = ts_field { + envelope_fields.push(Arc::new(ts)); + } + + let arrow_schema = Schema::new(envelope_fields); + let final_schema = match qualifier_override { + Some(qualifier) => DFSchema::try_from_qualified_schema(qualifier, &arrow_schema)?, + None => DFSchema::try_from(arrow_schema)?, + }; + + Ok(Arc::new(final_schema)) + } +} + +// ----------------------------------------------------------------------------- +// Logical Node: Unroll Debezium Payload +// ----------------------------------------------------------------------------- + +/// Decodes an incoming Debezium envelope into a flat, updating stream representation. 
+#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct UnrollDebeziumPayloadNode { + upstream_plan: LogicalPlan, + resolved_schema: DFSchemaRef, + pub pk_indices: Vec, + pk_names: Arc>, +} + +multifield_partial_ord!( + UnrollDebeziumPayloadNode, + upstream_plan, + pk_indices, + pk_names +); + +impl UnrollDebeziumPayloadNode { + pub fn try_new(upstream_plan: LogicalPlan, pk_names: Arc>) -> Result { + let input_schema = upstream_plan.schema(); + + let (before_idx, after_idx) = Self::validate_envelope_structure(input_schema)?; + + let payload_fields = Self::extract_payload_fields(input_schema, before_idx)?; + + let pk_indices = Self::map_primary_keys(payload_fields, &pk_names)?; + + let qualifier = Self::resolve_schema_qualifier(input_schema, before_idx, after_idx)?; + + let resolved_schema = + Self::compile_unrolled_schema(input_schema, payload_fields, qualifier)?; + + Ok(Self { + upstream_plan, + resolved_schema, + pk_indices, + pk_names, + }) + } + + fn validate_envelope_structure(schema: &DFSchemaRef) -> Result<(usize, usize)> { + let before_idx = schema.index_of_column_by_name(None, cdc::BEFORE).ok_or_else( + || DataFusionError::Plan("Missing 'before' state column in CDC stream".into()), + )?; + + let after_idx = schema.index_of_column_by_name(None, cdc::AFTER).ok_or_else( + || DataFusionError::Plan("Missing 'after' state column in CDC stream".into()), + )?; + + let op_idx = schema.index_of_column_by_name(None, cdc::OP).ok_or_else(|| { + DataFusionError::Plan("Missing 'op' operation column in CDC stream".into()) + })?; + + let before_type = schema.field(before_idx).data_type(); + let after_type = schema.field(after_idx).data_type(); + + if before_type != after_type { + return plan_err!( + "State column type mismatch: 'before' is {before_type}, but 'after' is {after_type}" + ); + } + + if *schema.field(op_idx).data_type() != DataType::Utf8 { + return plan_err!( + "The '{}' column must be of type Utf8", + cdc::OP + ); + } + + Ok((before_idx, after_idx)) + } + + 
fn extract_payload_fields<'a>( + schema: &'a DFSchemaRef, + state_idx: usize, + ) -> Result<&'a arrow_schema::Fields> { + match schema.field(state_idx).data_type() { + DataType::Struct(fields) => Ok(fields), + other => plan_err!("State columns must be of type Struct, found {other}"), + } + } + + fn map_primary_keys( + fields: &arrow_schema::Fields, + pk_names: &[String], + ) -> Result> { + pk_names + .iter() + .map(|pk| fields.find(pk).map(|(idx, _)| idx)) + .collect::>>() + .ok_or_else(|| { + DataFusionError::Plan("Specified primary key not found in payload schema".into()) + }) + } + + fn resolve_schema_qualifier( + schema: &DFSchemaRef, + before_idx: usize, + after_idx: usize, + ) -> Result> { + let before_qualifier = schema.qualified_field(before_idx).0; + let after_qualifier = schema.qualified_field(after_idx).0; + + match (before_qualifier, after_qualifier) { + (Some(bq), Some(aq)) if bq == aq => Ok(Some(bq.clone())), + (None, None) => Ok(None), + _ => plan_err!( + "'before' and 'after' columns must share the same namespace/qualifier" + ), + } + } + + fn compile_unrolled_schema( + original_schema: &DFSchemaRef, + payload_fields: &arrow_schema::Fields, + qualifier: Option, + ) -> Result { + let mut flat_fields = payload_fields.to_vec(); + + flat_fields.push(updating_meta_field()); + + let ts_idx = original_schema + .index_of_column_by_name(None, TIMESTAMP_FIELD) + .ok_or_else(|| { + DataFusionError::Plan(format!( + "Required event time field '{TIMESTAMP_FIELD}' is missing" + )) + })?; + + flat_fields.push(Arc::new(original_schema.field(ts_idx).clone())); + + let arrow_schema = Schema::new(flat_fields); + let compiled_schema = match qualifier { + Some(q) => DFSchema::try_from_qualified_schema(q, &arrow_schema)?, + None => DFSchema::try_from(arrow_schema)?, + }; + + Ok(Arc::new(compiled_schema)) + } +} + +impl UserDefinedLogicalNodeCore for UnrollDebeziumPayloadNode { + fn name(&self) -> &str { + UNROLL_NODE_NAME + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + 
vec![&self.upstream_plan] + } + + fn schema(&self) -> &DFSchemaRef { + &self.resolved_schema + } + + fn expressions(&self) -> Vec { + vec![] + } + + fn fmt_for_explain(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(f, "UnrollDebeziumPayload") + } + + fn with_exprs_and_inputs(&self, _exprs: Vec, mut inputs: Vec) -> Result { + if inputs.len() != 1 { + return internal_err!( + "UnrollDebeziumPayloadNode expects exactly 1 input, got {}", + inputs.len() + ); + } + Self::try_new(inputs.remove(0), self.pk_names.clone()) + } +} + +impl StreamingOperatorBlueprint for UnrollDebeziumPayloadNode { + fn operator_identity(&self) -> Option { + None + } + + fn is_passthrough_boundary(&self) -> bool { + true + } + + fn compile_to_graph_node( + &self, + _: &Planner, + _: usize, + _: Vec, + ) -> Result { + plan_err!("UnrollDebeziumPayloadNode is a logical boundary and should not be physically planned") + } + + fn yielded_schema(&self) -> FsSchema { + FsSchema::from_schema_unkeyed(Arc::new(self.resolved_schema.as_ref().into())).unwrap_or_else( + |_| panic!("Failed to extract physical schema for {}", UNROLL_NODE_NAME), + ) + } +} + +// ----------------------------------------------------------------------------- +// Logical Node: Pack Debezium Envelope +// ----------------------------------------------------------------------------- + +/// Encodes a flat updating stream back into a Debezium CDC envelope representation. 
+#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub(crate) struct PackDebeziumEnvelopeNode { + upstream_plan: Arc, + envelope_schema: DFSchemaRef, +} + +multifield_partial_ord!(PackDebeziumEnvelopeNode, upstream_plan); + +impl PackDebeziumEnvelopeNode { + pub(crate) fn try_new(upstream_plan: LogicalPlan) -> Result { + let envelope_schema = DebeziumSchemaCodec::wrap_into_envelope(upstream_plan.schema(), None) + .map_err(|e| { + DataFusionError::Plan(format!("Failed to compile Debezium envelope schema: {e}")) + })?; + + Ok(Self { + upstream_plan: Arc::new(upstream_plan), + envelope_schema, + }) + } +} + +impl DisplayAs for PackDebeziumEnvelopeNode { + fn fmt_as( + &self, + _t: datafusion::physical_plan::DisplayFormatType, + f: &mut std::fmt::Formatter, + ) -> std::fmt::Result { + write!(f, "PackDebeziumEnvelope") + } +} + +impl UserDefinedLogicalNodeCore for PackDebeziumEnvelopeNode { + fn name(&self) -> &str { + PACK_NODE_NAME + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + vec![&self.upstream_plan] + } + + fn schema(&self) -> &DFSchemaRef { + &self.envelope_schema + } + + fn expressions(&self) -> Vec { + vec![] + } + + fn fmt_for_explain(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(f, "PackDebeziumEnvelope") + } + + fn with_exprs_and_inputs(&self, _exprs: Vec, mut inputs: Vec) -> Result { + if inputs.len() != 1 { + return internal_err!( + "PackDebeziumEnvelopeNode expects exactly 1 input, got {}", + inputs.len() + ); + } + Self::try_new(inputs.remove(0)) + } +} + +impl StreamingOperatorBlueprint for PackDebeziumEnvelopeNode { + fn operator_identity(&self) -> Option { + None + } + + fn is_passthrough_boundary(&self) -> bool { + true + } + + fn compile_to_graph_node( + &self, + _: &Planner, + _: usize, + _: Vec, + ) -> Result { + internal_err!("PackDebeziumEnvelopeNode is a logical boundary and should not be physically planned") + } + + fn yielded_schema(&self) -> FsSchema { + 
FsSchema::from_schema_unkeyed(Arc::new(self.envelope_schema.as_ref().into())) + .unwrap_or_else(|_| { + panic!("Failed to extract physical schema for {}", PACK_NODE_NAME) + }) + } +} diff --git a/src/sql/extensions/extension_try_from.rs b/src/sql/extensions/extension_try_from.rs new file mode 100644 index 00000000..a64ac9cf --- /dev/null +++ b/src/sql/extensions/extension_try_from.rs @@ -0,0 +1,70 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::Arc; + +use datafusion::common::{DataFusionError, Result}; +use datafusion::logical_expr::UserDefinedLogicalNode; + +use crate::sql::extensions::aggregate::StreamWindowAggregateNode; +use crate::sql::extensions::async_udf::AsyncFunctionExecutionNode; +use crate::sql::extensions::debezium::{PackDebeziumEnvelopeNode, UnrollDebeziumPayloadNode}; +use crate::sql::extensions::join::StreamingJoinNode; +use crate::sql::extensions::key_calculation::KeyExtractionNode; +use crate::sql::extensions::lookup::StreamReferenceJoinNode; +use crate::sql::extensions::projection::StreamProjectionNode; +use crate::sql::extensions::remote_table::RemoteTableBoundaryNode; +use crate::sql::extensions::sink::StreamEgressNode; +use crate::sql::extensions::streaming_operator_blueprint::StreamingOperatorBlueprint; +use crate::sql::extensions::table_source::StreamIngestionNode; +use crate::sql::extensions::updating_aggregate::ContinuousAggregateNode; +use crate::sql::extensions::watermark_node::EventTimeWatermarkNode; +use 
crate::sql::extensions::windows_function::StreamingWindowFunctionNode; + +fn try_from_t( + node: &dyn UserDefinedLogicalNode, +) -> std::result::Result<&dyn StreamingOperatorBlueprint, ()> { + node.as_any() + .downcast_ref::() + .map(|t| t as &dyn StreamingOperatorBlueprint) + .ok_or(()) +} + +impl<'a> TryFrom<&'a dyn UserDefinedLogicalNode> for &'a dyn StreamingOperatorBlueprint { + type Error = DataFusionError; + + fn try_from(node: &'a dyn UserDefinedLogicalNode) -> Result { + try_from_t::(node) + .or_else(|_| try_from_t::(node)) + .or_else(|_| try_from_t::(node)) + .or_else(|_| try_from_t::(node)) + .or_else(|_| try_from_t::(node)) + .or_else(|_| try_from_t::(node)) + .or_else(|_| try_from_t::(node)) + .or_else(|_| try_from_t::(node)) + .or_else(|_| try_from_t::(node)) + .or_else(|_| try_from_t::(node)) + .or_else(|_| try_from_t::(node)) + .or_else(|_| try_from_t::(node)) + .or_else(|_| try_from_t::(node)) + .or_else(|_| try_from_t::(node)) + .map_err(|_| DataFusionError::Plan(format!("unexpected node: {}", node.name()))) + } +} + +impl<'a> TryFrom<&'a Arc> for &'a dyn StreamingOperatorBlueprint { + type Error = DataFusionError; + + fn try_from(node: &'a Arc) -> Result { + TryFrom::try_from(node.as_ref()) + } +} diff --git a/src/sql/extensions/is_retract.rs b/src/sql/extensions/is_retract.rs new file mode 100644 index 00000000..96493781 --- /dev/null +++ b/src/sql/extensions/is_retract.rs @@ -0,0 +1,80 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::sync::Arc; + +use datafusion::arrow::datatypes::{DataType, TimeUnit}; +use datafusion::common::{DFSchemaRef, Result, TableReference}; +use datafusion::logical_expr::{Expr, LogicalPlan, UserDefinedLogicalNodeCore}; + +use crate::multifield_partial_ord; +use crate::sql::physical::updating_meta_field; +use crate::sql::types::{DFField, TIMESTAMP_FIELD, fields_with_qualifiers, schema_from_df_fields}; + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub(crate) struct IsRetractExtension { + pub(crate) input: LogicalPlan, + pub(crate) schema: DFSchemaRef, + pub(crate) timestamp_qualifier: Option, +} + +multifield_partial_ord!(IsRetractExtension, input, timestamp_qualifier); + +impl IsRetractExtension { + pub(crate) fn new(input: LogicalPlan, timestamp_qualifier: Option) -> Self { + let mut output_fields = fields_with_qualifiers(input.schema()); + + let timestamp_index = output_fields.len() - 1; + output_fields[timestamp_index] = DFField::new( + timestamp_qualifier.clone(), + TIMESTAMP_FIELD, + DataType::Timestamp(TimeUnit::Nanosecond, None), + false, + ); + output_fields.push((timestamp_qualifier.clone(), updating_meta_field()).into()); + let schema = Arc::new(schema_from_df_fields(&output_fields).unwrap()); + Self { + input, + schema, + timestamp_qualifier, + } + } +} + +impl UserDefinedLogicalNodeCore for IsRetractExtension { + fn name(&self) -> &str { + "IsRetractExtension" + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + vec![&self.input] + } + + fn schema(&self) -> &DFSchemaRef { + &self.schema + } + + fn expressions(&self) -> Vec { + vec![] + } + + fn fmt_for_explain(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(f, "IsRetractExtension") + } + + fn with_exprs_and_inputs(&self, _exprs: Vec, inputs: Vec) -> Result { + Ok(Self::new( + inputs[0].clone(), + self.timestamp_qualifier.clone(), + )) + } +} diff --git a/src/sql/extensions/join.rs b/src/sql/extensions/join.rs new file mode 100644 index 00000000..829247ae --- /dev/null +++ 
b/src/sql/extensions/join.rs @@ -0,0 +1,209 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::fmt::Formatter; +use std::time::Duration; + +use datafusion::common::{DFSchemaRef, Result}; +use datafusion::logical_expr::expr::Expr; +use datafusion::logical_expr::{LogicalPlan, UserDefinedLogicalNodeCore}; +use datafusion_common::plan_err; +use datafusion_proto::physical_plan::AsExecutionPlan; +use datafusion_proto::protobuf::PhysicalPlanNode; +use prost::Message; +use protocol::grpc::api::JoinOperator; + +use crate::sql::common::constants::{extension_node, runtime_operator_kind}; +use crate::sql::common::{FsSchema, FsSchemaRef}; +use crate::sql::extensions::{CompiledTopologyNode, StreamingOperatorBlueprint}; +use crate::sql::logical_node::logical::{ + LogicalEdge, LogicalEdgeType, LogicalNode, OperatorName, +}; +use crate::sql::logical_planner::planner::{NamedNode, Planner}; +use crate::sql::physical::FsPhysicalExtensionCodec; + +// ----------------------------------------------------------------------------- +// Constants +// ----------------------------------------------------------------------------- + +pub(crate) const STREAM_JOIN_NODE_TYPE: &str = extension_node::STREAMING_JOIN; + +// ----------------------------------------------------------------------------- +// Logical Node Definition +// ----------------------------------------------------------------------------- + +/// A logical plan node representing a streaming join operation. 
+/// It bridges the DataFusion logical plan with the physical streaming execution engine. +#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd)] +pub struct StreamingJoinNode { + pub(crate) underlying_plan: LogicalPlan, + pub(crate) instant_execution_mode: bool, + pub(crate) state_retention_ttl: Option, +} + +impl StreamingJoinNode { + /// Creates a new instance of the streaming join node. + pub fn new( + underlying_plan: LogicalPlan, + instant_execution_mode: bool, + state_retention_ttl: Option, + ) -> Self { + Self { + underlying_plan, + instant_execution_mode, + state_retention_ttl, + } + } + + /// Compiles the physical execution plan and serializes it into a Protobuf configuration payload. + fn compile_operator_config( + &self, + planner: &Planner, + node_identifier: &str, + left_schema: FsSchemaRef, + right_schema: FsSchemaRef, + ) -> Result { + let physical_plan = planner.sync_plan(&self.underlying_plan)?; + + let proto_node = PhysicalPlanNode::try_from_physical_plan( + physical_plan, + &FsPhysicalExtensionCodec::default(), + )?; + + Ok(JoinOperator { + name: node_identifier.to_string(), + left_schema: Some(left_schema.as_ref().clone().into()), + right_schema: Some(right_schema.as_ref().clone().into()), + output_schema: Some(self.extract_fs_schema().into()), + join_plan: proto_node.encode_to_vec(), + ttl_micros: self.state_retention_ttl.map(|ttl| ttl.as_micros() as u64), + }) + } + + fn determine_operator_type(&self) -> OperatorName { + if self.instant_execution_mode { + OperatorName::InstantJoin + } else { + OperatorName::Join + } + } + + fn extract_fs_schema(&self) -> FsSchema { + FsSchema::from_schema_unkeyed(self.underlying_plan.schema().inner().clone()) + .expect("Fatal: Failed to convert internal join schema to FsSchema without keys") + } +} + +// ----------------------------------------------------------------------------- +// DataFusion Logical Node Core Implementation +// 
----------------------------------------------------------------------------- + +impl UserDefinedLogicalNodeCore for StreamingJoinNode { + fn name(&self) -> &str { + STREAM_JOIN_NODE_TYPE + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + vec![&self.underlying_plan] + } + + fn schema(&self) -> &DFSchemaRef { + self.underlying_plan.schema() + } + + fn expressions(&self) -> Vec { + vec![] + } + + fn fmt_for_explain(&self, f: &mut Formatter) -> std::fmt::Result { + write!( + f, + "StreamingJoinNode: Schema={}, InstantMode={}, TTL={:?}", + self.schema(), + self.instant_execution_mode, + self.state_retention_ttl + ) + } + + fn with_exprs_and_inputs(&self, _exprs: Vec, mut inputs: Vec) -> Result { + if inputs.len() != 1 { + return plan_err!( + "StreamingJoinNode expects exactly 1 underlying logical plan during recreation" + ); + } + + Ok(Self::new( + inputs.remove(0), + self.instant_execution_mode, + self.state_retention_ttl, + )) + } +} + +// ----------------------------------------------------------------------------- +// Streaming Graph Extension Implementation +// ----------------------------------------------------------------------------- + +impl StreamingOperatorBlueprint for StreamingJoinNode { + fn operator_identity(&self) -> Option { + None + } + + fn compile_to_graph_node( + &self, + planner: &Planner, + node_index: usize, + mut input_schemas: Vec, + ) -> Result { + if input_schemas.len() != 2 { + return plan_err!( + "Invalid topology: StreamingJoinNode requires exactly two upstream inputs, received {}", + input_schemas.len() + ); + } + + let right_schema = input_schemas.pop().unwrap(); + let left_schema = input_schemas.pop().unwrap(); + + let node_identifier = format!("stream_join_{node_index}"); + + let operator_config = self.compile_operator_config( + planner, + &node_identifier, + left_schema.clone(), + right_schema.clone(), + )?; + + let logical_node = LogicalNode::single( + node_index as u32, + node_identifier.clone(), + self.determine_operator_type(), + 
operator_config.encode_to_vec(), + runtime_operator_kind::STREAMING_JOIN.to_string(), + 1, + ); + + let left_edge = + LogicalEdge::project_all(LogicalEdgeType::LeftJoin, left_schema.as_ref().clone()); + let right_edge = + LogicalEdge::project_all(LogicalEdgeType::RightJoin, right_schema.as_ref().clone()); + + Ok(CompiledTopologyNode { + execution_unit: logical_node, + routing_edges: vec![left_edge, right_edge], + }) + } + + fn yielded_schema(&self) -> FsSchema { + self.extract_fs_schema() + } +} diff --git a/src/sql/extensions/key_calculation.rs b/src/sql/extensions/key_calculation.rs new file mode 100644 index 00000000..25206429 --- /dev/null +++ b/src/sql/extensions/key_calculation.rs @@ -0,0 +1,302 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::fmt::Formatter; +use std::sync::Arc; + +use datafusion::arrow::datatypes::{Field, Schema}; +use datafusion::common::{DFSchemaRef, Result, internal_err, plan_err}; +use datafusion::logical_expr::{Expr, ExprSchemable, LogicalPlan, UserDefinedLogicalNodeCore}; +use datafusion_common::DFSchema; +use datafusion_expr::col; +use datafusion_proto::physical_plan::{AsExecutionPlan, DefaultPhysicalExtensionCodec}; +use datafusion_proto::physical_plan::to_proto::serialize_physical_expr; +use datafusion_proto::protobuf::PhysicalPlanNode; +use itertools::Itertools; +use prost::Message; + +use protocol::grpc::api::{KeyPlanOperator, ProjectionOperator}; + +use crate::multifield_partial_ord; +use crate::sql::common::constants::{extension_node, sql_field}; +use crate::sql::common::{FsSchema, FsSchemaRef}; +use crate::sql::extensions::{CompiledTopologyNode, StreamingOperatorBlueprint}; +use crate::sql::logical_node::logical::{LogicalEdge, LogicalEdgeType, LogicalNode, OperatorName}; +use crate::sql::physical::FsPhysicalExtensionCodec; +use crate::sql::logical_planner::planner::{NamedNode, Planner}; +use crate::sql::types::{fields_with_qualifiers, schema_from_df_fields_with_metadata}; + +pub(crate) const EXTENSION_NODE_IDENTIFIER: &str = extension_node::KEY_EXTRACTION; + +/// Routing strategy for shuffling data across the stream topology. +#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd)] +pub enum KeyExtractionStrategy { + ColumnIndices(Vec), + CalculatedExpressions(Vec), +} + +/// Logical node that computes or extracts routing keys. 
+#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub(crate) struct KeyExtractionNode { + pub(crate) operator_label: Option, + pub(crate) upstream_plan: LogicalPlan, + pub(crate) extraction_strategy: KeyExtractionStrategy, + pub(crate) resolved_schema: DFSchemaRef, +} + +multifield_partial_ord!( + KeyExtractionNode, + operator_label, + upstream_plan, + extraction_strategy +); + +impl KeyExtractionNode { + /// Extracts keys and hides them from the downstream projection. + pub fn try_new_with_projection( + upstream_plan: LogicalPlan, + target_indices: Vec, + label: String, + ) -> Result { + let projected_fields: Vec<_> = fields_with_qualifiers(upstream_plan.schema()) + .into_iter() + .enumerate() + .filter(|(idx, _)| !target_indices.contains(idx)) + .map(|(_, field)| field) + .collect(); + + let metadata = upstream_plan.schema().metadata().clone(); + let resolved_schema = schema_from_df_fields_with_metadata(&projected_fields, metadata)?; + + Ok(Self { + operator_label: Some(label), + upstream_plan, + extraction_strategy: KeyExtractionStrategy::ColumnIndices(target_indices), + resolved_schema: Arc::new(resolved_schema), + }) + } + + /// Creates a node using an explicit strategy without changing the visible schema. 
+ pub fn new(upstream_plan: LogicalPlan, strategy: KeyExtractionStrategy) -> Self { + let resolved_schema = upstream_plan.schema().clone(); + Self { + operator_label: None, + upstream_plan, + extraction_strategy: strategy, + resolved_schema, + } + } + + fn compile_index_router( + &self, + physical_plan_proto: PhysicalPlanNode, + indices: &[usize], + ) -> (Vec, OperatorName) { + let operator_config = KeyPlanOperator { + name: sql_field::DEFAULT_KEY_LABEL.into(), + physical_plan: physical_plan_proto.encode_to_vec(), + key_fields: indices.iter().map(|&idx| idx as u64).collect(), + }; + + (operator_config.encode_to_vec(), OperatorName::KeyBy) + } + + fn compile_expression_router( + &self, + planner: &Planner, + expressions: &[Expr], + input_schema_ref: &FsSchemaRef, + input_df_schema: &DFSchemaRef, + ) -> Result<(Vec, OperatorName)> { + let mut target_exprs = expressions.to_vec(); + + for field in input_schema_ref.schema.fields.iter() { + target_exprs.push(col(field.name())); + } + + let output_fs_schema = self.generate_fs_schema()?; + + for (compiled_expr, expected_field) in target_exprs + .iter() + .zip(output_fs_schema.schema.fields()) + { + let (expr_type, expr_nullable) = compiled_expr.data_type_and_nullable(input_df_schema)?; + if expr_type != *expected_field.data_type() || expr_nullable != expected_field.is_nullable() + { + return plan_err!( + "Type mismatch in key calculation: Expected {} (nullable: {}), got {} (nullable: {})", + expected_field.data_type(), + expected_field.is_nullable(), + expr_type, + expr_nullable + ); + } + } + + let mut physical_expr_payloads = Vec::with_capacity(target_exprs.len()); + for logical_expr in target_exprs { + let physical_expr = planner + .create_physical_expr(&logical_expr, input_df_schema) + .map_err(|e| e.context("Failed to physicalize PARTITION BY expression"))?; + + let serialized_expr = + serialize_physical_expr(&physical_expr, &DefaultPhysicalExtensionCodec {})?; + 
physical_expr_payloads.push(serialized_expr.encode_to_vec()); + } + + let operator_config = ProjectionOperator { + name: self + .operator_label + .as_deref() + .unwrap_or(sql_field::DEFAULT_KEY_LABEL) + .to_string(), + input_schema: Some(input_schema_ref.as_ref().clone().into()), + output_schema: Some(output_fs_schema.into()), + exprs: physical_expr_payloads, + }; + + Ok((operator_config.encode_to_vec(), OperatorName::Projection)) + } + + fn generate_fs_schema(&self) -> Result { + let base_arrow_schema = self.upstream_plan.schema().as_ref(); + + match &self.extraction_strategy { + KeyExtractionStrategy::ColumnIndices(indices) => { + FsSchema::from_schema_keys(Arc::new(base_arrow_schema.into()), indices.clone()) + } + KeyExtractionStrategy::CalculatedExpressions(expressions) => { + let mut composite_fields = + Vec::with_capacity(expressions.len() + base_arrow_schema.fields().len()); + + for (idx, expr) in expressions.iter().enumerate() { + let (data_type, nullable) = expr.data_type_and_nullable(base_arrow_schema)?; + composite_fields.push(Field::new(format!("__key_{idx}"), data_type, nullable).into()); + } + + for field in base_arrow_schema.fields().iter() { + composite_fields.push(field.clone()); + } + + let final_schema = Arc::new(Schema::new(composite_fields)); + let key_mapping = (1..=expressions.len()).collect_vec(); + FsSchema::from_schema_keys(final_schema, key_mapping) + } + } + } +} + +impl StreamingOperatorBlueprint for KeyExtractionNode { + fn operator_identity(&self) -> Option { + None + } + + fn compile_to_graph_node( + &self, + planner: &Planner, + node_index: usize, + mut input_schemas: Vec, + ) -> Result { + if input_schemas.len() != 1 { + return plan_err!("KeyExtractionNode requires exactly one upstream input schema"); + } + + let input_schema_ref = input_schemas.remove(0); + let input_df_schema = Arc::new(DFSchema::try_from(input_schema_ref.schema.as_ref().clone())?); + + let physical_plan = planner.sync_plan(&self.upstream_plan)?; + let 
physical_plan_proto = PhysicalPlanNode::try_from_physical_plan( + physical_plan, + &FsPhysicalExtensionCodec::default(), + )?; + + let (protobuf_payload, engine_operator_name) = match &self.extraction_strategy { + KeyExtractionStrategy::ColumnIndices(indices) => { + self.compile_index_router(physical_plan_proto, indices) + } + KeyExtractionStrategy::CalculatedExpressions(exprs) => { + self.compile_expression_router(planner, exprs, &input_schema_ref, &input_df_schema)? + } + }; + + let logical_node = LogicalNode::single( + node_index as u32, + format!("key_{node_index}"), + engine_operator_name, + protobuf_payload, + format!("ArrowKey<{}>", self.operator_label.as_deref().unwrap_or("_")), + 1, + ); + + let data_edge = + LogicalEdge::project_all(LogicalEdgeType::Forward, (*input_schema_ref).clone()); + + Ok(CompiledTopologyNode { + execution_unit: logical_node, + routing_edges: vec![data_edge], + }) + } + + fn yielded_schema(&self) -> FsSchema { + self.generate_fs_schema() + .expect("Fatal: Failed to generate output schema for KeyExtractionNode") + } +} + +impl UserDefinedLogicalNodeCore for KeyExtractionNode { + fn name(&self) -> &str { + EXTENSION_NODE_IDENTIFIER + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + vec![&self.upstream_plan] + } + + fn schema(&self) -> &DFSchemaRef { + &self.resolved_schema + } + + fn expressions(&self) -> Vec { + vec![] + } + + fn fmt_for_explain(&self, f: &mut Formatter) -> std::fmt::Result { + write!( + f, + "KeyExtractionNode: Strategy={:?} | Schema={}", + self.extraction_strategy, + self.resolved_schema + ) + } + + fn with_exprs_and_inputs(&self, exprs: Vec, mut inputs: Vec) -> Result { + if inputs.len() != 1 { + return internal_err!("KeyExtractionNode requires exactly 1 input logical plan"); + } + + let strategy = match &self.extraction_strategy { + KeyExtractionStrategy::ColumnIndices(indices) => { + KeyExtractionStrategy::ColumnIndices(indices.clone()) + } + KeyExtractionStrategy::CalculatedExpressions(_) => { + 
KeyExtractionStrategy::CalculatedExpressions(exprs) + } + }; + + Ok(Self { + operator_label: self.operator_label.clone(), + upstream_plan: inputs.remove(0), + extraction_strategy: strategy, + resolved_schema: self.resolved_schema.clone(), + }) + } +} diff --git a/src/sql/extensions/lookup.rs b/src/sql/extensions/lookup.rs new file mode 100644 index 00000000..8371efce --- /dev/null +++ b/src/sql/extensions/lookup.rs @@ -0,0 +1,287 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::HashMap; +use std::fmt::Formatter; +use std::sync::Arc; + +use datafusion::common::{Column, DFSchemaRef, JoinType, Result, internal_err, plan_err}; +use datafusion::logical_expr::{Expr, LogicalPlan, UserDefinedLogicalNodeCore}; +use datafusion::sql::TableReference; +use datafusion_proto::physical_plan::DefaultPhysicalExtensionCodec; +use datafusion_proto::physical_plan::to_proto::serialize_physical_expr; +use prost::Message; + +use protocol::grpc::api; +use protocol::grpc::api::{ConnectorOp, GenericConnectorConfig, LookupJoinCondition, LookupJoinOperator}; + +use crate::multifield_partial_ord; +use crate::sql::common::constants::extension_node; +use crate::sql::common::{FsSchema, FsSchemaRef}; +use crate::sql::extensions::{CompiledTopologyNode, StreamingOperatorBlueprint}; +use crate::sql::logical_node::logical::{LogicalEdge, LogicalEdgeType, LogicalNode, OperatorName}; +use crate::sql::logical_planner::planner::{NamedNode, Planner}; +use 
crate::sql::schema::SourceTable; +use crate::sql::schema::utils::add_timestamp_field_arrow; + +// ----------------------------------------------------------------------------- +// Constants & Identifiers +// ----------------------------------------------------------------------------- + +pub const DICTIONARY_SOURCE_NODE_NAME: &str = extension_node::REFERENCE_TABLE_SOURCE; +pub const STREAM_DICTIONARY_JOIN_NODE_NAME: &str = extension_node::STREAM_REFERENCE_JOIN; + +// ----------------------------------------------------------------------------- +// Logical Node: Reference Table Source +// ----------------------------------------------------------------------------- + +/// Static or periodically updated reference table used for lookups. +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct ReferenceTableSourceNode { + pub(crate) source_definition: SourceTable, + pub(crate) resolved_schema: DFSchemaRef, +} + +multifield_partial_ord!(ReferenceTableSourceNode, source_definition); + +impl UserDefinedLogicalNodeCore for ReferenceTableSourceNode { + fn name(&self) -> &str { + DICTIONARY_SOURCE_NODE_NAME + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + vec![] + } + + fn schema(&self) -> &DFSchemaRef { + &self.resolved_schema + } + + fn expressions(&self) -> Vec { + vec![] + } + + fn fmt_for_explain(&self, f: &mut Formatter) -> std::fmt::Result { + write!(f, "ReferenceTableSource: Schema={}", self.resolved_schema) + } + + fn with_exprs_and_inputs( + &self, + _exprs: Vec, + inputs: Vec, + ) -> Result { + if !inputs.is_empty() { + return internal_err!( + "ReferenceTableSource is a leaf node and cannot accept upstream inputs" + ); + } + + Ok(Self { + source_definition: self.source_definition.clone(), + resolved_schema: self.resolved_schema.clone(), + }) + } +} + +// ----------------------------------------------------------------------------- +// Logical Node: Stream to Reference Join +// ----------------------------------------------------------------------------- + +/// 
Join between an unbounded stream and a reference (lookup) table. +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct StreamReferenceJoinNode { + pub(crate) upstream_stream_plan: LogicalPlan, + pub(crate) output_schema: DFSchemaRef, + pub(crate) external_dictionary: SourceTable, + pub(crate) equijoin_conditions: Vec<(Expr, Column)>, + pub(crate) post_join_filter: Option, + pub(crate) namespace_alias: Option, + pub(crate) join_semantics: JoinType, +} + +multifield_partial_ord!( + StreamReferenceJoinNode, + upstream_stream_plan, + external_dictionary, + equijoin_conditions, + post_join_filter, + namespace_alias +); + +impl StreamReferenceJoinNode { + fn compile_join_conditions(&self, planner: &Planner) -> Result> { + self.equijoin_conditions + .iter() + .map(|(logical_left_expr, right_column)| { + let physical_expr = + planner.create_physical_expr(logical_left_expr, &self.output_schema)?; + let serialized_expr = + serialize_physical_expr(&physical_expr, &DefaultPhysicalExtensionCodec {})?; + + Ok(LookupJoinCondition { + left_expr: serialized_expr.encode_to_vec(), + right_key: right_column.name.clone(), + }) + }) + .collect() + } + + fn map_api_join_type(&self) -> Result { + match self.join_semantics { + JoinType::Inner => Ok(api::JoinType::Inner as i32), + JoinType::Left => Ok(api::JoinType::Left as i32), + unsupported => plan_err!( + "Unsupported join type '{unsupported}' for dictionary lookups. Only INNER and LEFT joins are permitted." 
+ ), + } + } + + fn build_engine_operator( + &self, + planner: &Planner, + _upstream_schema: &FsSchemaRef, + ) -> Result { + let internal_input_schema = FsSchema::from_schema_unkeyed(Arc::new( + self.output_schema.as_ref().into(), + ))?; + let dictionary_physical_schema = self.external_dictionary.produce_physical_schema(); + let lookup_fs_schema = + FsSchema::from_schema_unkeyed(add_timestamp_field_arrow(dictionary_physical_schema))?; + + let properties: HashMap = self + .external_dictionary + .catalog_with_options + .iter() + .map(|(k, v)| (k.clone(), v.clone())) + .collect(); + + Ok(LookupJoinOperator { + input_schema: Some(internal_input_schema.into()), + lookup_schema: Some(lookup_fs_schema.clone().into()), + connector: Some(ConnectorOp { + connector: self.external_dictionary.adapter_type.clone(), + fs_schema: Some(lookup_fs_schema.into()), + name: self.external_dictionary.table_identifier.clone(), + description: self.external_dictionary.description.clone(), + config: Some(protocol::grpc::api::connector_op::Config::Generic( + GenericConnectorConfig { properties }, + )), + }), + key_exprs: self.compile_join_conditions(planner)?, + join_type: self.map_api_join_type()?, + ttl_micros: self + .external_dictionary + .lookup_cache_ttl + .map(|t| t.as_micros() as u64), + max_capacity_bytes: self.external_dictionary.lookup_cache_max_bytes, + }) + } +} + +impl StreamingOperatorBlueprint for StreamReferenceJoinNode { + fn operator_identity(&self) -> Option { + None + } + + fn compile_to_graph_node( + &self, + planner: &Planner, + node_index: usize, + mut input_schemas: Vec, + ) -> Result { + if input_schemas.len() != 1 { + return plan_err!( + "StreamReferenceJoinNode requires exactly one upstream stream input" + ); + } + let upstream_schema = input_schemas.remove(0); + + let operator_config = self.build_engine_operator(planner, &upstream_schema)?; + + let logical_node = LogicalNode::single( + node_index as u32, + format!("lookup_join_{node_index}"), + 
OperatorName::LookupJoin, + operator_config.encode_to_vec(), + format!("DictionaryJoin<{}>", self.external_dictionary.table_identifier), + 1, + ); + + let incoming_edge = LogicalEdge::project_all( + LogicalEdgeType::Shuffle, + (*upstream_schema).clone(), + ); + + Ok(CompiledTopologyNode { + execution_unit: logical_node, + routing_edges: vec![incoming_edge], + }) + } + + fn yielded_schema(&self) -> FsSchema { + FsSchema::from_schema_unkeyed(self.output_schema.inner().clone()) + .expect("Failed to convert lookup join output schema to FsSchema") + } +} + +impl UserDefinedLogicalNodeCore for StreamReferenceJoinNode { + fn name(&self) -> &str { + STREAM_DICTIONARY_JOIN_NODE_NAME + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + vec![&self.upstream_stream_plan] + } + + fn schema(&self) -> &DFSchemaRef { + &self.output_schema + } + + fn expressions(&self) -> Vec { + let mut exprs: Vec<_> = self + .equijoin_conditions + .iter() + .map(|(l, _)| l.clone()) + .collect(); + if let Some(filter) = &self.post_join_filter { + exprs.push(filter.clone()); + } + exprs + } + + fn fmt_for_explain(&self, f: &mut Formatter) -> std::fmt::Result { + write!( + f, + "StreamReferenceJoin: join_type={:?} | {}", + self.join_semantics, + self.output_schema + ) + } + + fn with_exprs_and_inputs(&self, _: Vec, inputs: Vec) -> Result { + if inputs.len() != 1 { + return internal_err!( + "StreamReferenceJoinNode expects exactly 1 upstream plan, got {}", + inputs.len() + ); + } + Ok(Self { + upstream_stream_plan: inputs[0].clone(), + output_schema: self.output_schema.clone(), + external_dictionary: self.external_dictionary.clone(), + equijoin_conditions: self.equijoin_conditions.clone(), + post_join_filter: self.post_join_filter.clone(), + namespace_alias: self.namespace_alias.clone(), + join_semantics: self.join_semantics, + }) + } +} diff --git a/src/sql/extensions/macros.rs b/src/sql/extensions/macros.rs new file mode 100644 index 00000000..4ce649c2 --- /dev/null +++ b/src/sql/extensions/macros.rs 
@@ -0,0 +1,28 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#[macro_export] +macro_rules! multifield_partial_ord { + ($ty:ty, $($field:tt), *) => { + impl PartialOrd for $ty { + fn partial_cmp(&self, other: &Self) -> Option { + $( + let cmp = self.$field.partial_cmp(&other.$field)?; + if cmp != std::cmp::Ordering::Equal { + return Some(cmp); + } + )* + Some(std::cmp::Ordering::Equal) + } + } + }; +} diff --git a/src/sql/extensions/mod.rs b/src/sql/extensions/mod.rs new file mode 100644 index 00000000..6c0ca08a --- /dev/null +++ b/src/sql/extensions/mod.rs @@ -0,0 +1,40 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +mod macros; + +pub(crate) mod streaming_operator_blueprint; +pub(crate) use streaming_operator_blueprint::{CompiledTopologyNode, StreamingOperatorBlueprint}; + +pub(crate) mod aggregate; +pub(crate) mod debezium; +pub(crate) mod join; +pub(crate) mod key_calculation; +pub(crate) mod lookup; +pub(crate) mod projection; +pub(crate) mod remote_table; +pub(crate) mod sink; +pub(crate) mod table_source; +pub(crate) mod updating_aggregate; +pub(crate) mod watermark_node; +pub(crate) mod windows_function; + +pub(crate) mod timestamp_append; +pub(crate) use timestamp_append::SystemTimestampInjectorNode; + +pub(crate) mod async_udf; +pub(crate) use async_udf::AsyncFunctionExecutionNode; + +pub(crate) mod is_retract; +pub(crate) use is_retract::IsRetractExtension; + +mod extension_try_from; diff --git a/src/sql/extensions/projection.rs b/src/sql/extensions/projection.rs new file mode 100644 index 00000000..d1b9e755 --- /dev/null +++ b/src/sql/extensions/projection.rs @@ -0,0 +1,240 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::fmt::Formatter; +use std::sync::Arc; + +use datafusion::common::{DFSchema, DFSchemaRef, Result, internal_err}; +use datafusion::logical_expr::{Expr, ExprSchemable, LogicalPlan, UserDefinedLogicalNodeCore}; +use datafusion_proto::physical_plan::DefaultPhysicalExtensionCodec; +use datafusion_proto::physical_plan::to_proto::serialize_physical_expr; +use prost::Message; + +use protocol::grpc::api::ProjectionOperator; + +use crate::multifield_partial_ord; +use crate::sql::common::constants::{extension_node, sql_field}; +use crate::sql::common::{FsSchema, FsSchemaRef}; +use crate::sql::extensions::{CompiledTopologyNode, StreamingOperatorBlueprint}; +use crate::sql::logical_node::logical::{LogicalEdge, LogicalEdgeType, LogicalNode, OperatorName}; +use crate::sql::logical_planner::planner::{NamedNode, Planner}; +use crate::sql::types::{DFField, schema_from_df_fields}; + +// ----------------------------------------------------------------------------- +// Constants & Identifiers +// ----------------------------------------------------------------------------- + +pub(crate) const STREAM_PROJECTION_NODE_NAME: &str = extension_node::STREAM_PROJECTION; +const DEFAULT_PROJECTION_LABEL: &str = sql_field::DEFAULT_PROJECTION_LABEL; + +// ----------------------------------------------------------------------------- +// Logical Node Definition +// ----------------------------------------------------------------------------- + +/// Projection within a streaming execution topology. 
+#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub(crate) struct StreamProjectionNode { + pub(crate) upstream_plans: Vec, + pub(crate) operator_label: Option, + pub(crate) projection_exprs: Vec, + pub(crate) resolved_schema: DFSchemaRef, + pub(crate) requires_shuffle: bool, +} + +multifield_partial_ord!(StreamProjectionNode, operator_label, projection_exprs); + +impl StreamProjectionNode { + pub(crate) fn try_new( + upstream_plans: Vec, + operator_label: Option, + projection_exprs: Vec, + ) -> Result { + if upstream_plans.is_empty() { + return internal_err!("StreamProjectionNode requires at least one upstream plan"); + } + let primary_input = &upstream_plans[0]; + let upstream_schema = primary_input.schema(); + + let mut projected_fields = Vec::with_capacity(projection_exprs.len()); + for logical_expr in &projection_exprs { + let arrow_field = logical_expr.to_field(upstream_schema)?; + projected_fields.push(DFField::from(arrow_field)); + } + + let resolved_schema = Arc::new(schema_from_df_fields(&projected_fields)?); + + Ok(Self { + upstream_plans, + operator_label, + projection_exprs, + resolved_schema, + requires_shuffle: false, + }) + } + + pub(crate) fn with_shuffle_routing(mut self) -> Self { + self.requires_shuffle = true; + self + } + + fn validate_uniform_schemas(input_schemas: &[FsSchemaRef]) -> Result { + if input_schemas.is_empty() { + return internal_err!("No input schemas provided to projection planner"); + } + let primary_schema = input_schemas[0].clone(); + + for schema in input_schemas.iter().skip(1) { + if **schema != *primary_schema { + return internal_err!( + "Schema mismatch: All upstream inputs to a projection node must share the identical schema topology." 
+ ); + } + } + + Ok(primary_schema) + } + + fn compile_physical_expressions( + &self, + planner: &Planner, + input_df_schema: &DFSchemaRef, + ) -> Result>> { + self.projection_exprs + .iter() + .map(|logical_expr| { + let physical_expr = planner + .create_physical_expr(logical_expr, input_df_schema) + .map_err(|e| e.context("Failed to compile physical projection expression"))?; + + let serialized_expr = serialize_physical_expr( + &physical_expr, + &DefaultPhysicalExtensionCodec {}, + )?; + + Ok(serialized_expr.encode_to_vec()) + }) + .collect() + } +} + +// ----------------------------------------------------------------------------- +// Stream Extension Trait Implementation +// ----------------------------------------------------------------------------- + +impl StreamingOperatorBlueprint for StreamProjectionNode { + fn operator_identity(&self) -> Option { + None + } + + fn compile_to_graph_node( + &self, + planner: &Planner, + node_index: usize, + input_schemas: Vec, + ) -> Result { + let unified_input_schema = Self::validate_uniform_schemas(&input_schemas)?; + let input_df_schema = + Arc::new(DFSchema::try_from(unified_input_schema.schema.as_ref().clone())?); + + let compiled_expr_payloads = self.compile_physical_expressions(planner, &input_df_schema)?; + + let operator_config = ProjectionOperator { + name: self + .operator_label + .as_deref() + .unwrap_or(DEFAULT_PROJECTION_LABEL) + .to_string(), + input_schema: Some(unified_input_schema.as_ref().clone().into()), + output_schema: Some(self.yielded_schema().into()), + exprs: compiled_expr_payloads, + }; + + let node_identifier = format!("projection_{node_index}"); + let label = format!( + "ArrowProjection<{}>", + self.operator_label.as_deref().unwrap_or("_") + ); + + let logical_node = LogicalNode::single( + node_index as u32, + node_identifier, + OperatorName::Projection, + operator_config.encode_to_vec(), + label, + 1, + ); + + let routing_strategy = if self.requires_shuffle { + LogicalEdgeType::Shuffle + } 
else { + LogicalEdgeType::Forward + }; + + let outgoing_edge = + LogicalEdge::project_all(routing_strategy, (*unified_input_schema).clone()); + + Ok(CompiledTopologyNode { + execution_unit: logical_node, + routing_edges: vec![outgoing_edge], + }) + } + + fn yielded_schema(&self) -> FsSchema { + FsSchema::from_schema_unkeyed(Arc::new(self.resolved_schema.as_arrow().clone())) + .expect("Fatal: Failed to generate unkeyed output schema for projection") + } +} + +// ----------------------------------------------------------------------------- +// DataFusion Logical Node Hooks +// ----------------------------------------------------------------------------- + +impl UserDefinedLogicalNodeCore for StreamProjectionNode { + fn name(&self) -> &str { + STREAM_PROJECTION_NODE_NAME + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + self.upstream_plans.iter().collect() + } + + fn schema(&self) -> &DFSchemaRef { + &self.resolved_schema + } + + fn expressions(&self) -> Vec { + vec![] + } + + fn fmt_for_explain(&self, f: &mut Formatter) -> std::fmt::Result { + write!( + f, + "StreamProjectionNode: RequiresShuffle={}, Schema={}", + self.requires_shuffle, + self.resolved_schema + ) + } + + fn with_exprs_and_inputs(&self, _exprs: Vec, inputs: Vec) -> Result { + let mut new_node = Self::try_new( + inputs, + self.operator_label.clone(), + self.projection_exprs.clone(), + )?; + + if self.requires_shuffle { + new_node = new_node.with_shuffle_routing(); + } + + Ok(new_node) + } +} diff --git a/src/sql/extensions/remote_table.rs b/src/sql/extensions/remote_table.rs new file mode 100644 index 00000000..72b6150c --- /dev/null +++ b/src/sql/extensions/remote_table.rs @@ -0,0 +1,188 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::fmt::Formatter; +use std::sync::Arc; + +use datafusion::common::{DFSchemaRef, Result, TableReference, internal_err, plan_err}; +use datafusion::logical_expr::{Expr, LogicalPlan, UserDefinedLogicalNodeCore}; +use datafusion_proto::physical_plan::AsExecutionPlan; +use datafusion_proto::protobuf::PhysicalPlanNode; +use prost::Message; + +use protocol::grpc::api::ValuePlanOperator; + +use crate::multifield_partial_ord; +use crate::sql::common::constants::extension_node; +use crate::sql::common::{FsSchema, FsSchemaRef}; +use crate::sql::extensions::{CompiledTopologyNode, StreamingOperatorBlueprint}; +use crate::sql::logical_node::logical::{LogicalEdge, LogicalEdgeType, LogicalNode, OperatorName}; +use crate::sql::physical::FsPhysicalExtensionCodec; +use crate::sql::logical_planner::planner::{NamedNode, Planner}; + +// ----------------------------------------------------------------------------- +// Constants & Identifiers +// ----------------------------------------------------------------------------- + +pub(crate) const REMOTE_TABLE_NODE_NAME: &str = extension_node::REMOTE_TABLE_BOUNDARY; + +// ----------------------------------------------------------------------------- +// Logical Node Definition +// ----------------------------------------------------------------------------- + +/// Segments the execution graph and merges nodes sharing the same identifier; acts as a boundary. 
+#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub(crate) struct RemoteTableBoundaryNode { + pub(crate) upstream_plan: LogicalPlan, + pub(crate) table_identifier: TableReference, + pub(crate) resolved_schema: DFSchemaRef, + pub(crate) requires_materialization: bool, +} + +multifield_partial_ord!( + RemoteTableBoundaryNode, + upstream_plan, + table_identifier, + requires_materialization +); + +impl RemoteTableBoundaryNode { + fn compile_engine_operator(&self, planner: &Planner) -> Result> { + let physical_plan = planner.sync_plan(&self.upstream_plan)?; + + let physical_plan_proto = PhysicalPlanNode::try_from_physical_plan( + physical_plan, + &FsPhysicalExtensionCodec::default(), + )?; + + let operator_config = ValuePlanOperator { + name: format!("value_calculation({})", self.table_identifier), + physical_plan: physical_plan_proto.encode_to_vec(), + }; + + Ok(operator_config.encode_to_vec()) + } + + fn validate_uniform_schemas(input_schemas: &[FsSchemaRef]) -> Result<()> { + if input_schemas.len() <= 1 { + return Ok(()); + } + + let primary_schema = &input_schemas[0]; + for schema in input_schemas.iter().skip(1) { + if *schema != *primary_schema { + return plan_err!( + "Topology error: Multiple input streams routed to the same remote table must share an identical schema structure." 
+ ); + } + } + + Ok(()) + } +} + +// ----------------------------------------------------------------------------- +// Stream Extension Trait Implementation +// ----------------------------------------------------------------------------- + +impl StreamingOperatorBlueprint for RemoteTableBoundaryNode { + fn operator_identity(&self) -> Option { + if self.requires_materialization { + Some(NamedNode::RemoteTable(self.table_identifier.clone())) + } else { + None + } + } + + fn compile_to_graph_node( + &self, + planner: &Planner, + node_index: usize, + input_schemas: Vec, + ) -> Result { + Self::validate_uniform_schemas(&input_schemas)?; + + let operator_payload = self.compile_engine_operator(planner)?; + + let logical_node = LogicalNode::single( + node_index as u32, + format!("value_{node_index}"), + OperatorName::Value, + operator_payload, + self.table_identifier.to_string(), + 1, + ); + + let routing_edges: Vec = input_schemas + .into_iter() + .map(|schema| LogicalEdge::project_all(LogicalEdgeType::Forward, (*schema).clone())) + .collect(); + + Ok(CompiledTopologyNode { + execution_unit: logical_node, + routing_edges: routing_edges, + }) + } + + fn yielded_schema(&self) -> FsSchema { + FsSchema::from_schema_keys(Arc::new(self.resolved_schema.as_ref().into()), vec![]) + .expect("Fatal: Failed to generate output schema for remote table boundary") + } +} + +// ----------------------------------------------------------------------------- +// DataFusion Logical Node Hooks +// ----------------------------------------------------------------------------- + +impl UserDefinedLogicalNodeCore for RemoteTableBoundaryNode { + fn name(&self) -> &str { + REMOTE_TABLE_NODE_NAME + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + vec![&self.upstream_plan] + } + + fn schema(&self) -> &DFSchemaRef { + &self.resolved_schema + } + + fn expressions(&self) -> Vec { + vec![] + } + + fn fmt_for_explain(&self, f: &mut Formatter) -> std::fmt::Result { + write!( + f, + "RemoteTableBoundaryNode: 
Identifier={}, Materialized={}, Schema={}", + self.table_identifier, + self.requires_materialization, + self.resolved_schema + ) + } + + fn with_exprs_and_inputs(&self, _exprs: Vec, mut inputs: Vec) -> Result { + if inputs.len() != 1 { + return internal_err!( + "RemoteTableBoundaryNode expects exactly 1 upstream logical plan, but received {}", + inputs.len() + ); + } + + Ok(Self { + upstream_plan: inputs.remove(0), + table_identifier: self.table_identifier.clone(), + resolved_schema: self.resolved_schema.clone(), + requires_materialization: self.requires_materialization, + }) + } +} diff --git a/src/sql/extensions/sink.rs b/src/sql/extensions/sink.rs new file mode 100644 index 00000000..d2916486 --- /dev/null +++ b/src/sql/extensions/sink.rs @@ -0,0 +1,229 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::fmt::Formatter; +use std::sync::Arc; + +use datafusion::common::{DFSchemaRef, Result, TableReference, plan_err}; +use datafusion::logical_expr::{Expr, Extension, LogicalPlan, UserDefinedLogicalNodeCore}; +use prost::Message; + +use crate::multifield_partial_ord; +use crate::sql::common::constants::extension_node; +use crate::sql::common::{FsSchema, FsSchemaRef, UPDATING_META_FIELD}; +use crate::sql::extensions::{CompiledTopologyNode, StreamingOperatorBlueprint}; +use crate::sql::logical_node::logical::{LogicalEdge, LogicalEdgeType, LogicalNode, OperatorName}; +use crate::sql::logical_planner::planner::{NamedNode, Planner}; +use crate::sql::schema::Table; + +use super::debezium::PackDebeziumEnvelopeNode; +use super::remote_table::RemoteTableBoundaryNode; + +// ----------------------------------------------------------------------------- +// Constants & Identifiers +// ----------------------------------------------------------------------------- + +pub(crate) const STREAM_EGRESS_NODE_NAME: &str = extension_node::STREAM_EGRESS; + +// ----------------------------------------------------------------------------- +// Logical Node Definition +// ----------------------------------------------------------------------------- + +/// Terminal node routing processed data into an external sink (e.g. Kafka, PostgreSQL). 
+#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub(crate) struct StreamEgressNode { + pub(crate) target_identifier: TableReference, + pub(crate) destination_table: Table, + pub(crate) egress_schema: DFSchemaRef, + upstream_plans: Arc>, +} + +multifield_partial_ord!(StreamEgressNode, target_identifier, upstream_plans); + +impl StreamEgressNode { + pub fn try_new( + target_identifier: TableReference, + destination_table: Table, + initial_schema: DFSchemaRef, + upstream_plan: LogicalPlan, + ) -> Result { + let (mut processed_plan, mut resolved_schema) = Self::apply_cdc_transformations( + upstream_plan, + initial_schema, + &destination_table, + )?; + + Self::enforce_computational_boundary(&mut resolved_schema, &mut processed_plan); + + Ok(Self { + target_identifier, + destination_table, + egress_schema: resolved_schema, + upstream_plans: Arc::new(vec![processed_plan]), + }) + } + + fn apply_cdc_transformations( + plan: LogicalPlan, + schema: DFSchemaRef, + destination: &Table, + ) -> Result<(LogicalPlan, DFSchemaRef)> { + let is_upstream_updating = plan + .schema() + .has_column_with_unqualified_name(UPDATING_META_FIELD); + + match destination { + Table::ConnectorTable(connector) => { + let is_sink_updating = connector.is_updating(); + + match (is_upstream_updating, is_sink_updating) { + (_, true) => { + let debezium_encoder = PackDebeziumEnvelopeNode::try_new(plan)?; + let wrapped_plan = LogicalPlan::Extension(Extension { + node: Arc::new(debezium_encoder), + }); + let new_schema = wrapped_plan.schema().clone(); + + Ok((wrapped_plan, new_schema)) + } + (true, false) => { + plan_err!( + "Topology Mismatch: The upstream is producing an updating stream (CDC), \ + but the target sink '{}' is not configured to accept updates. \ + Hint: set `format = 'debezium_json'` in the WITH clause.", + connector.name() + ) + } + (false, false) => Ok((plan, schema)), + } + } + Table::LookupTable(..) 
=> { + plan_err!("Topology Violation: A Lookup Table cannot be used as a streaming data sink.") + } + Table::TableFromQuery { .. } => Ok((plan, schema)), + } + } + + fn enforce_computational_boundary(schema: &mut DFSchemaRef, plan: &mut LogicalPlan) { + let requires_boundary = if let LogicalPlan::Extension(extension) = plan { + let stream_ext: &dyn StreamingOperatorBlueprint = (&extension.node) + .try_into() + .expect("Fatal: Egress node encountered an extension that does not implement StreamingOperatorBlueprint"); + + stream_ext.is_passthrough_boundary() + } else { + true + }; + + if requires_boundary { + let boundary_node = RemoteTableBoundaryNode { + upstream_plan: plan.clone(), + table_identifier: TableReference::bare("sink projection"), + resolved_schema: schema.clone(), + requires_materialization: false, + }; + + *plan = LogicalPlan::Extension(Extension { + node: Arc::new(boundary_node), + }); + } + } +} + +// ----------------------------------------------------------------------------- +// Stream Extension Trait Implementation +// ----------------------------------------------------------------------------- + +impl StreamingOperatorBlueprint for StreamEgressNode { + fn operator_identity(&self) -> Option { + Some(NamedNode::Sink(self.target_identifier.clone())) + } + + fn compile_to_graph_node( + &self, + _planner: &Planner, + node_index: usize, + input_schemas: Vec, + ) -> Result { + let connector_operator = self + .destination_table + .connector_op() + .map_err(|e| e.context("Failed to generate connector operation payload"))?; + + let operator_description = connector_operator.description.clone(); + let operator_payload = connector_operator.encode_to_vec(); + + let logical_node = LogicalNode::single( + node_index as u32, + format!("sink_{}_{node_index}", self.target_identifier), + OperatorName::ConnectorSink, + operator_payload, + operator_description, + 1, + ); + + let routing_edges: Vec = input_schemas + .into_iter() + .map(|input_schema| { + 
LogicalEdge::project_all(LogicalEdgeType::Forward, (*input_schema).clone()) + }) + .collect(); + + Ok(CompiledTopologyNode { + execution_unit: logical_node, + routing_edges: routing_edges, + }) + } + + fn yielded_schema(&self) -> FsSchema { + FsSchema::from_fields(vec![]) + } +} + +// ----------------------------------------------------------------------------- +// DataFusion Logical Node Hooks +// ----------------------------------------------------------------------------- + +impl UserDefinedLogicalNodeCore for StreamEgressNode { + fn name(&self) -> &str { + STREAM_EGRESS_NODE_NAME + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + self.upstream_plans.iter().collect() + } + + fn schema(&self) -> &DFSchemaRef { + &self.egress_schema + } + + fn expressions(&self) -> Vec { + vec![] + } + + fn fmt_for_explain(&self, f: &mut Formatter) -> std::fmt::Result { + write!( + f, + "StreamEgressNode({:?}): Schema={}", + self.target_identifier, self.egress_schema + ) + } + + fn with_exprs_and_inputs(&self, _exprs: Vec, inputs: Vec) -> Result { + Ok(Self { + target_identifier: self.target_identifier.clone(), + destination_table: self.destination_table.clone(), + egress_schema: self.egress_schema.clone(), + upstream_plans: Arc::new(inputs), + }) + } +} diff --git a/src/sql/extensions/streaming_operator_blueprint.rs b/src/sql/extensions/streaming_operator_blueprint.rs new file mode 100644 index 00000000..d3f9d459 --- /dev/null +++ b/src/sql/extensions/streaming_operator_blueprint.rs @@ -0,0 +1,65 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +use std::fmt::Debug; + +use datafusion::common::Result; + +use crate::sql::common::{FsSchema, FsSchemaRef}; +use crate::sql::logical_node::logical::{LogicalEdge, LogicalNode}; +use crate::sql::logical_planner::planner::{NamedNode, Planner}; + +// ----------------------------------------------------------------------------- +// Core Execution Blueprint +// ----------------------------------------------------------------------------- + +/// Atomic unit within a streaming execution topology: translates streaming SQL into graph nodes. +pub(crate) trait StreamingOperatorBlueprint: Debug { + /// Canonical named identity for this operator, if any (sources, sinks, etc.). + fn operator_identity(&self) -> Option; + + /// Compiles this operator into a graph vertex and its incoming routing edges. + fn compile_to_graph_node( + &self, + compiler_context: &Planner, + node_id_sequence: usize, + upstream_schemas: Vec, + ) -> Result; + + /// Schema of records this operator yields downstream. + fn yielded_schema(&self) -> FsSchema; + + /// Logical passthrough boundary (no physical state change); default is stateful / materializing. + fn is_passthrough_boundary(&self) -> bool { + false + } +} + +// ----------------------------------------------------------------------------- +// Graph Topology Structures +// ----------------------------------------------------------------------------- + +/// Compiled vertex: execution unit plus upstream routing edges. 
+#[derive(Debug, Clone)] +pub(crate) struct CompiledTopologyNode { + pub execution_unit: LogicalNode, + pub routing_edges: Vec, +} + +impl CompiledTopologyNode { + pub fn new(execution_unit: LogicalNode, routing_edges: Vec) -> Self { + Self { + execution_unit, + routing_edges, + } + } +} diff --git a/src/sql/extensions/table_source.rs b/src/sql/extensions/table_source.rs new file mode 100644 index 00000000..3f998c5a --- /dev/null +++ b/src/sql/extensions/table_source.rs @@ -0,0 +1,176 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::fmt::Formatter; +use std::sync::Arc; + +use datafusion::common::{DFSchemaRef, Result, TableReference, plan_err}; +use datafusion::logical_expr::{Expr, LogicalPlan, UserDefinedLogicalNodeCore}; +use prost::Message; + +use crate::multifield_partial_ord; +use crate::sql::common::constants::extension_node; +use crate::sql::common::{FsSchema, FsSchemaRef}; +use crate::sql::extensions::debezium::DebeziumSchemaCodec; +use crate::sql::logical_node::logical::{LogicalNode, OperatorName}; +use crate::sql::logical_planner::planner::{NamedNode, Planner}; +use crate::sql::schema::SourceTable; +use crate::sql::schema::utils::add_timestamp_field; +use crate::sql::types::schema_from_df_fields; + +use super::{CompiledTopologyNode, StreamingOperatorBlueprint}; + +// ----------------------------------------------------------------------------- +// Constants & Identifiers +// ----------------------------------------------------------------------------- + +pub(crate) const STREAM_INGESTION_NODE_NAME: &str = extension_node::STREAM_INGESTION; + +// ----------------------------------------------------------------------------- +// Logical Node Definition +// ----------------------------------------------------------------------------- + +/// Foundational ingestion point: connects to external systems and injects raw or CDC data. 
+#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub(crate) struct StreamIngestionNode { + pub(crate) source_identifier: TableReference, + pub(crate) source_definition: SourceTable, + pub(crate) resolved_schema: DFSchemaRef, +} + +multifield_partial_ord!(StreamIngestionNode, source_identifier, source_definition); + +impl StreamIngestionNode { + pub fn try_new( + source_identifier: TableReference, + source_definition: SourceTable, + ) -> Result { + let resolved_schema = + Self::build_ingestion_schema(&source_identifier, &source_definition)?; + + Ok(Self { + source_identifier, + source_definition, + resolved_schema, + }) + } + + fn build_ingestion_schema( + identifier: &TableReference, + definition: &SourceTable, + ) -> Result { + let physical_fields: Vec<_> = definition + .schema_specs + .iter() + .filter(|col| !col.is_computed()) + .map(|col| (Some(identifier.clone()), Arc::new(col.arrow_field().clone())).into()) + .collect(); + + let base_schema = Arc::new(schema_from_df_fields(&physical_fields)?); + + let enveloped_schema = if definition.is_updating() { + DebeziumSchemaCodec::wrap_into_envelope(&base_schema, Some(identifier.clone()))? 
+ } else { + base_schema + }; + + add_timestamp_field(enveloped_schema, Some(identifier.clone())) + } +} + +// ----------------------------------------------------------------------------- +// DataFusion Logical Node Hooks +// ----------------------------------------------------------------------------- + +impl UserDefinedLogicalNodeCore for StreamIngestionNode { + fn name(&self) -> &str { + STREAM_INGESTION_NODE_NAME + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + vec![] + } + + fn schema(&self) -> &DFSchemaRef { + &self.resolved_schema + } + + fn expressions(&self) -> Vec { + vec![] + } + + fn fmt_for_explain(&self, f: &mut Formatter) -> std::fmt::Result { + write!( + f, + "StreamIngestionNode({}): Schema={}", + self.source_identifier, self.resolved_schema + ) + } + + fn with_exprs_and_inputs(&self, _exprs: Vec, inputs: Vec) -> Result { + if !inputs.is_empty() { + return plan_err!( + "StreamIngestionNode acts as a leaf boundary and cannot accept upstream inputs." + ); + } + + Ok(Self { + source_identifier: self.source_identifier.clone(), + source_definition: self.source_definition.clone(), + resolved_schema: self.resolved_schema.clone(), + }) + } +} + +// ----------------------------------------------------------------------------- +// Core Execution Blueprint Implementation +// ----------------------------------------------------------------------------- + +impl StreamingOperatorBlueprint for StreamIngestionNode { + fn operator_identity(&self) -> Option { + Some(NamedNode::Source(self.source_identifier.clone())) + } + + fn compile_to_graph_node( + &self, + _compiler_context: &Planner, + node_id_sequence: usize, + upstream_schemas: Vec, + ) -> Result { + if !upstream_schemas.is_empty() { + return plan_err!( + "Topology Violation: StreamIngestionNode is a source origin and cannot process upstream routing edges." 
+ ); + } + + let sql_source = self.source_definition.as_sql_source()?; + let connector_payload = sql_source.source.config.encode_to_vec(); + let operator_description = sql_source.source.config.description.clone(); + + let execution_unit = LogicalNode::single( + node_id_sequence as u32, + format!("source_{}_{node_id_sequence}", self.source_identifier), + OperatorName::ConnectorSource, + connector_payload, + operator_description, + 1, + ); + + Ok(CompiledTopologyNode::new(execution_unit, vec![])) + } + + fn yielded_schema(&self) -> FsSchema { + FsSchema::from_schema_keys(Arc::new(self.resolved_schema.as_ref().into()), vec![]).expect( + "Fatal: Failed to generate output schema for stream ingestion", + ) + } +} diff --git a/src/sql/extensions/timestamp_append.rs b/src/sql/extensions/timestamp_append.rs new file mode 100644 index 00000000..2d8b985b --- /dev/null +++ b/src/sql/extensions/timestamp_append.rs @@ -0,0 +1,117 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::fmt::Formatter; + +use datafusion::common::{DFSchemaRef, Result, TableReference, internal_err}; +use datafusion::logical_expr::{Expr, LogicalPlan, UserDefinedLogicalNodeCore}; + +use crate::multifield_partial_ord; +use crate::sql::common::constants::extension_node; +use crate::sql::schema::utils::{add_timestamp_field, has_timestamp_field}; + +// ----------------------------------------------------------------------------- +// Constants & Identifiers +// ----------------------------------------------------------------------------- + +pub(crate) const TIMESTAMP_INJECTOR_NODE_NAME: &str = extension_node::SYSTEM_TIMESTAMP_INJECTOR; + +// ----------------------------------------------------------------------------- +// Logical Node Definition +// ----------------------------------------------------------------------------- + +/// Injects the mandatory system `_timestamp` field into the upstream streaming schema. +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub(crate) struct SystemTimestampInjectorNode { + pub(crate) upstream_plan: LogicalPlan, + pub(crate) target_qualifier: Option, + pub(crate) resolved_schema: DFSchemaRef, +} + +multifield_partial_ord!(SystemTimestampInjectorNode, upstream_plan, target_qualifier); + +impl SystemTimestampInjectorNode { + pub(crate) fn try_new( + upstream_plan: LogicalPlan, + target_qualifier: Option, + ) -> Result { + let upstream_schema = upstream_plan.schema(); + + if has_timestamp_field(upstream_schema) { + return internal_err!( + "Topology Violation: Attempted to inject a system timestamp into an upstream plan \ + that already contains one. 
\ + \nPlan:\n {:?} \nSchema:\n {:?}", + upstream_plan, + upstream_schema + ); + } + + let resolved_schema = + add_timestamp_field(upstream_schema.clone(), target_qualifier.clone())?; + + Ok(Self { + upstream_plan, + target_qualifier, + resolved_schema, + }) + } +} + +// ----------------------------------------------------------------------------- +// DataFusion Logical Node Hooks +// ----------------------------------------------------------------------------- + +impl UserDefinedLogicalNodeCore for SystemTimestampInjectorNode { + fn name(&self) -> &str { + TIMESTAMP_INJECTOR_NODE_NAME + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + vec![&self.upstream_plan] + } + + fn schema(&self) -> &DFSchemaRef { + &self.resolved_schema + } + + fn expressions(&self) -> Vec { + vec![] + } + + fn fmt_for_explain(&self, f: &mut Formatter) -> std::fmt::Result { + let field_names = self + .resolved_schema + .fields() + .iter() + .map(|field| field.name().to_string()) + .collect::>() + .join(", "); + + write!( + f, + "SystemTimestampInjector(Qualifier={:?}): [{}]", + self.target_qualifier, field_names + ) + } + + fn with_exprs_and_inputs(&self, _exprs: Vec, mut inputs: Vec) -> Result { + if inputs.len() != 1 { + return internal_err!( + "SystemTimestampInjectorNode requires exactly 1 upstream logical plan, but received {}", + inputs.len() + ); + } + + Self::try_new(inputs.remove(0), self.target_qualifier.clone()) + } +} diff --git a/src/sql/extensions/updating_aggregate.rs b/src/sql/extensions/updating_aggregate.rs new file mode 100644 index 00000000..a76d15d4 --- /dev/null +++ b/src/sql/extensions/updating_aggregate.rs @@ -0,0 +1,242 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::Arc; +use std::time::Duration; + +use datafusion::common::{DFSchemaRef, Result, TableReference, ToDFSchema, internal_err, plan_err}; +use datafusion::logical_expr::expr::ScalarFunction; +use datafusion::logical_expr::{ + col, lit, Expr, Extension, LogicalPlan, UserDefinedLogicalNodeCore, +}; +use datafusion::prelude::named_struct; +use datafusion::scalar::ScalarValue; +use datafusion_proto::physical_plan::AsExecutionPlan; +use datafusion_proto::protobuf::PhysicalPlanNode; +use prost::Message; +use protocol::grpc::api::UpdatingAggregateOperator; + +use crate::sql::common::constants::{extension_node, proto_operator_name, updating_state_field}; +use crate::sql::common::{FsSchema, FsSchemaRef}; +use crate::sql::extensions::{CompiledTopologyNode, IsRetractExtension, StreamingOperatorBlueprint}; +use crate::sql::functions::multi_hash; +use crate::sql::logical_node::logical::{LogicalEdge, LogicalEdgeType, LogicalNode, OperatorName}; +use crate::sql::physical::FsPhysicalExtensionCodec; +use crate::sql::logical_planner::planner::{NamedNode, Planner}; + +// ----------------------------------------------------------------------------- +// Constants & Configuration +// ----------------------------------------------------------------------------- + +pub(crate) const CONTINUOUS_AGGREGATE_NODE_NAME: &str = extension_node::CONTINUOUS_AGGREGATE; + +const DEFAULT_FLUSH_INTERVAL_MICROS: u64 = 10_000_000; + +const STATIC_HASH_SIZE_BYTES: i32 = 16; + +// ----------------------------------------------------------------------------- +// Logical Node 
Definition +// ----------------------------------------------------------------------------- + +/// Stateful continuous aggregation: running aggregates with updating / retraction semantics. +#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd)] +pub(crate) struct ContinuousAggregateNode { + pub(crate) base_aggregate_plan: LogicalPlan, + pub(crate) partition_key_indices: Vec, + pub(crate) retract_injected_plan: LogicalPlan, + pub(crate) namespace_qualifier: Option, + pub(crate) state_retention_ttl: Duration, +} + +impl ContinuousAggregateNode { + pub fn try_new( + base_aggregate_plan: LogicalPlan, + partition_key_indices: Vec, + namespace_qualifier: Option, + state_retention_ttl: Duration, + ) -> Result { + let retract_injected_plan = LogicalPlan::Extension(Extension { + node: Arc::new(IsRetractExtension::new( + base_aggregate_plan.clone(), + namespace_qualifier.clone(), + )), + }); + + Ok(Self { + base_aggregate_plan, + partition_key_indices, + retract_injected_plan, + namespace_qualifier, + state_retention_ttl, + }) + } + + fn construct_state_metadata_expr(&self, upstream_schema: &FsSchemaRef) -> Expr { + let routing_keys: Vec = self + .partition_key_indices + .iter() + .map(|&idx| col(upstream_schema.schema.field(idx).name())) + .collect(); + + let state_id_hash = if routing_keys.is_empty() { + Expr::Literal( + ScalarValue::FixedSizeBinary( + STATIC_HASH_SIZE_BYTES, + Some(vec![0; STATIC_HASH_SIZE_BYTES as usize]), + ), + None, + ) + } else { + Expr::ScalarFunction(ScalarFunction { + func: multi_hash(), + args: routing_keys, + }) + }; + + named_struct(vec![ + lit(updating_state_field::IS_RETRACT), + lit(false), + lit(updating_state_field::ID), + state_id_hash, + ]) + } + + fn compile_operator_config( + &self, + planner: &Planner, + upstream_schema: &FsSchemaRef, + ) -> Result { + let upstream_df_schema = upstream_schema.schema.clone().to_dfschema()?; + + let physical_agg_plan = planner.sync_plan(&self.base_aggregate_plan)?; + let compiled_agg_payload = 
PhysicalPlanNode::try_from_physical_plan( + physical_agg_plan, + &FsPhysicalExtensionCodec::default(), + )? + .encode_to_vec(); + + let meta_expr = self.construct_state_metadata_expr(upstream_schema); + let compiled_meta_expr = + planner.serialize_as_physical_expr(&meta_expr, &upstream_df_schema)?; + + Ok(UpdatingAggregateOperator { + name: proto_operator_name::UPDATING_AGGREGATE.to_string(), + input_schema: Some((**upstream_schema).clone().into()), + final_schema: Some(self.yielded_schema().into()), + aggregate_exec: compiled_agg_payload, + metadata_expr: compiled_meta_expr, + flush_interval_micros: DEFAULT_FLUSH_INTERVAL_MICROS, + ttl_micros: self.state_retention_ttl.as_micros() as u64, + }) + } +} + +// ----------------------------------------------------------------------------- +// DataFusion Logical Node Hooks +// ----------------------------------------------------------------------------- + +impl UserDefinedLogicalNodeCore for ContinuousAggregateNode { + fn name(&self) -> &str { + CONTINUOUS_AGGREGATE_NODE_NAME + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + vec![&self.base_aggregate_plan] + } + + fn schema(&self) -> &DFSchemaRef { + self.retract_injected_plan.schema() + } + + fn expressions(&self) -> Vec { + vec![] + } + + fn fmt_for_explain(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!( + f, + "ContinuousAggregateNode(TTL={:?})", + self.state_retention_ttl + ) + } + + fn with_exprs_and_inputs( + &self, + _exprs: Vec, + mut inputs: Vec, + ) -> Result { + if inputs.len() != 1 { + return internal_err!( + "ContinuousAggregateNode requires exactly 1 upstream input, got {}", + inputs.len() + ); + } + + Self::try_new( + inputs.remove(0), + self.partition_key_indices.clone(), + self.namespace_qualifier.clone(), + self.state_retention_ttl, + ) + } +} + +// ----------------------------------------------------------------------------- +// Core Execution Blueprint Implementation +// 
----------------------------------------------------------------------------- + +impl StreamingOperatorBlueprint for ContinuousAggregateNode { + fn operator_identity(&self) -> Option { + None + } + + fn compile_to_graph_node( + &self, + planner: &Planner, + node_index: usize, + mut upstream_schemas: Vec, + ) -> Result { + if upstream_schemas.len() != 1 { + return plan_err!( + "Topology Violation: ContinuousAggregateNode requires exactly 1 upstream input, received {}", + upstream_schemas.len() + ); + } + + let upstream_schema = upstream_schemas.remove(0); + + let operator_config = self.compile_operator_config(planner, &upstream_schema)?; + + let logical_node = LogicalNode::single( + node_index as u32, + format!("updating_aggregate_{node_index}"), + OperatorName::UpdatingAggregate, + operator_config.encode_to_vec(), + proto_operator_name::UPDATING_AGGREGATE.to_string(), + 1, + ); + + let shuffle_edge = + LogicalEdge::project_all(LogicalEdgeType::Shuffle, (*upstream_schema).clone()); + + Ok(CompiledTopologyNode { + execution_unit: logical_node, + routing_edges: vec![shuffle_edge], + }) + } + + fn yielded_schema(&self) -> FsSchema { + FsSchema::from_schema_unkeyed(Arc::new(self.schema().as_ref().into())).expect( + "Fatal: Failed to generate unkeyed output schema for continuous aggregate", + ) + } +} diff --git a/src/sql/extensions/watermark_node.rs b/src/sql/extensions/watermark_node.rs new file mode 100644 index 00000000..231e1951 --- /dev/null +++ b/src/sql/extensions/watermark_node.rs @@ -0,0 +1,231 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +use std::fmt::Formatter; +use std::sync::Arc; + +use datafusion::common::{DFSchemaRef, Result, TableReference, internal_err, plan_err}; +use datafusion::error::DataFusionError; +use datafusion::logical_expr::{Expr, LogicalPlan, UserDefinedLogicalNodeCore}; +use datafusion_proto::physical_plan::DefaultPhysicalExtensionCodec; +use datafusion_proto::physical_plan::to_proto::serialize_physical_expr; +use prost::Message; +use protocol::grpc::api::ExpressionWatermarkConfig; + +use crate::multifield_partial_ord; +use crate::sql::common::constants::{extension_node, runtime_operator_kind}; +use crate::sql::common::{FsSchema, FsSchemaRef}; +use crate::sql::extensions::{CompiledTopologyNode, StreamingOperatorBlueprint}; +use crate::sql::logical_node::logical::{LogicalEdge, LogicalEdgeType, LogicalNode, OperatorName}; +use crate::sql::logical_planner::planner::{NamedNode, Planner}; +use crate::sql::schema::utils::add_timestamp_field; +use crate::sql::types::TIMESTAMP_FIELD; + +// ----------------------------------------------------------------------------- +// Constants & Identifiers +// ----------------------------------------------------------------------------- + +pub(crate) const EVENT_TIME_WATERMARK_NODE_NAME: &str = extension_node::EVENT_TIME_WATERMARK; + +const DEFAULT_WATERMARK_EMISSION_PERIOD_MICROS: u64 = 1_000_000; + +// ----------------------------------------------------------------------------- +// Logical Node Definition +// ----------------------------------------------------------------------------- + +/// Event-time watermark from a user strategy; drives time progress in stateful operators. 
+#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub(crate) struct EventTimeWatermarkNode { + pub(crate) upstream_plan: LogicalPlan, + pub(crate) namespace_qualifier: TableReference, + pub(crate) watermark_strategy_expr: Expr, + pub(crate) resolved_schema: DFSchemaRef, + pub(crate) internal_timestamp_offset: usize, +} + +multifield_partial_ord!( + EventTimeWatermarkNode, + upstream_plan, + namespace_qualifier, + watermark_strategy_expr, + internal_timestamp_offset +); + +impl EventTimeWatermarkNode { + pub(crate) fn try_new( + upstream_plan: LogicalPlan, + namespace_qualifier: TableReference, + watermark_strategy_expr: Expr, + ) -> Result { + let resolved_schema = add_timestamp_field( + upstream_plan.schema().clone(), + Some(namespace_qualifier.clone()), + )?; + + let internal_timestamp_offset = resolved_schema + .index_of_column_by_name(None, TIMESTAMP_FIELD) + .ok_or_else(|| { + DataFusionError::Plan(format!( + "Fatal: Failed to resolve mandatory temporal column '{}'", + TIMESTAMP_FIELD + )) + })?; + + Ok(Self { + upstream_plan, + namespace_qualifier, + watermark_strategy_expr, + resolved_schema, + internal_timestamp_offset, + }) + } + + pub(crate) fn generate_fs_schema(&self) -> FsSchema { + FsSchema::new_unkeyed( + Arc::new(self.resolved_schema.as_ref().into()), + self.internal_timestamp_offset, + ) + } + + fn compile_operator_config(&self, planner: &Planner) -> Result { + let physical_expr = planner.create_physical_expr( + &self.watermark_strategy_expr, + &self.resolved_schema, + )?; + + let serialized_expr = + serialize_physical_expr(&physical_expr, &DefaultPhysicalExtensionCodec {})?; + + Ok(ExpressionWatermarkConfig { + period_micros: DEFAULT_WATERMARK_EMISSION_PERIOD_MICROS, + idle_time_micros: None, + expression: serialized_expr.encode_to_vec(), + input_schema: Some(self.generate_fs_schema().into()), + }) + } +} + +// ----------------------------------------------------------------------------- +// DataFusion Logical Node Hooks +// 
----------------------------------------------------------------------------- + +impl UserDefinedLogicalNodeCore for EventTimeWatermarkNode { + fn name(&self) -> &str { + EVENT_TIME_WATERMARK_NODE_NAME + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + vec![&self.upstream_plan] + } + + fn schema(&self) -> &DFSchemaRef { + &self.resolved_schema + } + + fn expressions(&self) -> Vec { + vec![self.watermark_strategy_expr.clone()] + } + + fn fmt_for_explain(&self, f: &mut Formatter) -> std::fmt::Result { + write!( + f, + "EventTimeWatermarkNode({}): Schema={}", + self.namespace_qualifier, self.resolved_schema + ) + } + + fn with_exprs_and_inputs( + &self, + mut exprs: Vec, + mut inputs: Vec, + ) -> Result { + if inputs.len() != 1 { + return internal_err!( + "EventTimeWatermarkNode requires exactly 1 upstream logical plan, but received {}", + inputs.len() + ); + } + if exprs.len() != 1 { + return internal_err!( + "EventTimeWatermarkNode requires exactly 1 watermark strategy expression, but received {}", + exprs.len() + ); + } + + let internal_timestamp_offset = self + .resolved_schema + .index_of_column_by_name(Some(&self.namespace_qualifier), TIMESTAMP_FIELD) + .ok_or_else(|| { + DataFusionError::Plan(format!( + "Optimizer Error: Lost tracking of temporal column '{}'", + TIMESTAMP_FIELD + )) + })?; + + Ok(Self { + upstream_plan: inputs.remove(0), + namespace_qualifier: self.namespace_qualifier.clone(), + watermark_strategy_expr: exprs.remove(0), + resolved_schema: self.resolved_schema.clone(), + internal_timestamp_offset, + }) + } +} + +// ----------------------------------------------------------------------------- +// Core Execution Blueprint Implementation +// ----------------------------------------------------------------------------- + +impl StreamingOperatorBlueprint for EventTimeWatermarkNode { + fn operator_identity(&self) -> Option { + Some(NamedNode::Watermark(self.namespace_qualifier.clone())) + } + + fn compile_to_graph_node( + &self, + planner: &Planner, + 
node_index: usize, + mut upstream_schemas: Vec, + ) -> Result { + if upstream_schemas.len() != 1 { + return plan_err!( + "Topology Violation: EventTimeWatermarkNode requires exactly 1 upstream input, received {}", + upstream_schemas.len() + ); + } + + let operator_config = self.compile_operator_config(planner)?; + + let execution_unit = LogicalNode::single( + node_index as u32, + format!("watermark_{node_index}"), + OperatorName::ExpressionWatermark, + operator_config.encode_to_vec(), + runtime_operator_kind::WATERMARK_GENERATOR.to_string(), + 1, + ); + + let incoming_edge = LogicalEdge::project_all( + LogicalEdgeType::Forward, + (*upstream_schemas.remove(0)).clone(), + ); + + Ok(CompiledTopologyNode { + execution_unit, + routing_edges: vec![incoming_edge], + }) + } + + fn yielded_schema(&self) -> FsSchema { + self.generate_fs_schema() + } +} diff --git a/src/sql/extensions/windows_function.rs b/src/sql/extensions/windows_function.rs new file mode 100644 index 00000000..ccb0ff89 --- /dev/null +++ b/src/sql/extensions/windows_function.rs @@ -0,0 +1,198 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::fmt::Formatter; +use std::sync::Arc; + +use datafusion::common::{Column, DFSchema, DFSchemaRef, Result, internal_err, plan_err}; +use datafusion::logical_expr::{Expr, LogicalPlan, UserDefinedLogicalNodeCore}; +use datafusion_proto::physical_plan::DefaultPhysicalExtensionCodec; +use datafusion_proto::physical_plan::to_proto::serialize_physical_expr; +use datafusion_proto::{physical_plan::AsExecutionPlan, protobuf::PhysicalPlanNode}; +use prost::Message; +use protocol::grpc::api::WindowFunctionOperator; + +use crate::sql::common::constants::{extension_node, proto_operator_name, runtime_operator_kind}; +use crate::sql::common::{FsSchema, FsSchemaRef}; +use crate::sql::logical_node::logical::{LogicalEdge, LogicalEdgeType, LogicalNode, OperatorName}; +use crate::sql::physical::FsPhysicalExtensionCodec; +use crate::sql::logical_planner::planner::{NamedNode, Planner}; +use crate::sql::types::TIMESTAMP_FIELD; + +use super::{CompiledTopologyNode, StreamingOperatorBlueprint}; + +// ----------------------------------------------------------------------------- +// Constants & Identifiers +// ----------------------------------------------------------------------------- + +pub(crate) const STREAMING_WINDOW_NODE_NAME: &str = extension_node::STREAMING_WINDOW_FUNCTION; + +// ----------------------------------------------------------------------------- +// Logical Node Definition +// ----------------------------------------------------------------------------- + +/// Stateful streaming window: temporal binning plus underlying window evaluation plan. 
+#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd)] +pub(crate) struct StreamingWindowFunctionNode { + pub(crate) underlying_evaluation_plan: LogicalPlan, + pub(crate) partition_key_indices: Vec, +} + +impl StreamingWindowFunctionNode { + pub fn new( + underlying_evaluation_plan: LogicalPlan, + partition_key_indices: Vec, + ) -> Self { + Self { + underlying_evaluation_plan, + partition_key_indices, + } + } + + fn compile_temporal_binning_function( + &self, + planner: &Planner, + input_df_schema: &DFSchema, + ) -> Result> { + let timestamp_column = Expr::Column(Column::new_unqualified(TIMESTAMP_FIELD.to_string())); + + let physical_binning_expr = + planner.create_physical_expr(×tamp_column, input_df_schema)?; + + let serialized_expr = + serialize_physical_expr(&physical_binning_expr, &DefaultPhysicalExtensionCodec {})?; + + Ok(serialized_expr.encode_to_vec()) + } + + fn compile_physical_evaluation_plan(&self, planner: &Planner) -> Result> { + let physical_window_plan = planner.sync_plan(&self.underlying_evaluation_plan)?; + + let proto_plan_node = PhysicalPlanNode::try_from_physical_plan( + physical_window_plan, + &FsPhysicalExtensionCodec::default(), + )?; + + Ok(proto_plan_node.encode_to_vec()) + } +} + +// ----------------------------------------------------------------------------- +// DataFusion Logical Node Hooks +// ----------------------------------------------------------------------------- + +impl UserDefinedLogicalNodeCore for StreamingWindowFunctionNode { + fn name(&self) -> &str { + STREAMING_WINDOW_NODE_NAME + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + vec![&self.underlying_evaluation_plan] + } + + fn schema(&self) -> &DFSchemaRef { + self.underlying_evaluation_plan.schema() + } + + fn expressions(&self) -> Vec { + vec![] + } + + fn fmt_for_explain(&self, f: &mut Formatter) -> std::fmt::Result { + write!( + f, + "StreamingWindowFunction: Schema={}", + self.schema() + ) + } + + fn with_exprs_and_inputs( + &self, + _exprs: Vec, + mut inputs: 
Vec, + ) -> Result { + if inputs.len() != 1 { + return internal_err!( + "StreamingWindowFunctionNode requires exactly 1 upstream input, got {}", + inputs.len() + ); + } + + Ok(Self::new( + inputs.remove(0), + self.partition_key_indices.clone(), + )) + } +} + +// ----------------------------------------------------------------------------- +// Core Execution Blueprint Implementation +// ----------------------------------------------------------------------------- + +impl StreamingOperatorBlueprint for StreamingWindowFunctionNode { + fn operator_identity(&self) -> Option { + None + } + + fn compile_to_graph_node( + &self, + planner: &Planner, + node_index: usize, + mut input_schemas: Vec, + ) -> Result { + if input_schemas.len() != 1 { + return plan_err!( + "Topology Violation: StreamingWindowFunctionNode requires exactly 1 upstream input schema, received {}", + input_schemas.len() + ); + } + + let input_schema = input_schemas.remove(0); + + let input_df_schema = DFSchema::try_from(input_schema.schema.as_ref().clone())?; + + let binning_payload = self.compile_temporal_binning_function(planner, &input_df_schema)?; + let evaluation_plan_payload = self.compile_physical_evaluation_plan(planner)?; + + let operator_config = WindowFunctionOperator { + name: proto_operator_name::WINDOW_FUNCTION.to_string(), + input_schema: Some(input_schema.as_ref().clone().into()), + binning_function: binning_payload, + window_function_plan: evaluation_plan_payload, + }; + + let logical_node = LogicalNode::single( + node_index as u32, + format!("window_function_{node_index}"), + OperatorName::WindowFunction, + operator_config.encode_to_vec(), + runtime_operator_kind::STREAMING_WINDOW_EVALUATOR.to_string(), + 1, + ); + + let routing_edge = LogicalEdge::project_all( + LogicalEdgeType::Shuffle, + (*input_schema).clone(), + ); + + Ok(CompiledTopologyNode { + execution_unit: logical_node, + routing_edges: vec![routing_edge], + }) + } + + fn yielded_schema(&self) -> FsSchema { + 
FsSchema::from_schema_unkeyed(Arc::new(self.schema().as_ref().clone().into())).expect( + "Fatal: Failed to generate unkeyed output schema for StreamingWindowFunctionNode", + ) + } +} diff --git a/src/sql/functions/mod.rs b/src/sql/functions/mod.rs new file mode 100644 index 00000000..b78f5d2a --- /dev/null +++ b/src/sql/functions/mod.rs @@ -0,0 +1,612 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use crate::sql::schema::StreamSchemaProvider; +use datafusion::arrow::array::{ + Array, ArrayRef, StringArray, UnionArray, + builder::{FixedSizeBinaryBuilder, ListBuilder, StringBuilder}, + cast::{AsArray, as_string_array}, + types::{Float64Type, Int64Type}, +}; +use datafusion::arrow::datatypes::{DataType, Field, UnionFields, UnionMode}; +use datafusion::arrow::row::{RowConverter, SortField}; +use datafusion::common::{DataFusionError, ScalarValue}; +use datafusion::common::{Result, TableReference}; +use datafusion::execution::FunctionRegistry; +use datafusion::logical_expr::expr::{Alias, ScalarFunction}; +use datafusion::logical_expr::{ + ColumnarValue, LogicalPlan, Projection, ScalarFunctionArgs, ScalarUDFImpl, Signature, + TypeSignature, Volatility, create_udf, +}; +use datafusion::prelude::{Expr, col}; +use serde_json_path::JsonPath; +use std::any::Any; +use std::collections::HashMap; +use std::fmt::{Debug, Write}; +use std::sync::{Arc, OnceLock}; + +use crate::sql::common::constants::scalar_fn; + +/// Borrowed from DataFusion +/// +/// Creates a singleton 
`ScalarUDF` of the `$UDF` function named `$GNAME` and a +/// function named `$NAME` which returns that function named $NAME. +/// +/// This is used to ensure creating the list of `ScalarUDF` only happens once. +#[macro_export] +macro_rules! make_udf_function { + ($UDF:ty, $GNAME:ident, $NAME:ident) => { + /// Singleton instance of the function + static $GNAME: std::sync::OnceLock> = + std::sync::OnceLock::new(); + + /// Return a [`ScalarUDF`] for [`$UDF`] + /// + /// [`ScalarUDF`]: datafusion_expr::ScalarUDF + pub fn $NAME() -> std::sync::Arc { + $GNAME + .get_or_init(|| { + std::sync::Arc::new(datafusion::logical_expr::ScalarUDF::new_from_impl( + <$UDF>::default(), + )) + }) + .clone() + } + }; +} + +make_udf_function!(MultiHashFunction, MULTI_HASH, multi_hash); + +pub fn register_all(registry: &mut dyn FunctionRegistry) { + registry + .register_udf(Arc::new(create_udf( + scalar_fn::GET_FIRST_JSON_OBJECT, + vec![DataType::Utf8, DataType::Utf8], + DataType::Utf8, + Volatility::Immutable, + Arc::new(get_first_json_object), + ))) + .unwrap(); + + registry + .register_udf(Arc::new(create_udf( + scalar_fn::EXTRACT_JSON, + vec![DataType::Utf8, DataType::Utf8], + DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))), + Volatility::Immutable, + Arc::new(extract_json), + ))) + .unwrap(); + + registry + .register_udf(Arc::new(create_udf( + scalar_fn::EXTRACT_JSON_STRING, + vec![DataType::Utf8, DataType::Utf8], + DataType::Utf8, + Volatility::Immutable, + Arc::new(extract_json_string), + ))) + .unwrap(); + + registry + .register_udf(Arc::new(create_udf( + scalar_fn::SERIALIZE_JSON_UNION, + vec![DataType::Union(union_fields(), UnionMode::Sparse)], + DataType::Utf8, + Volatility::Immutable, + Arc::new(serialize_json_union), + ))) + .unwrap(); + + registry.register_udf(multi_hash()).unwrap(); +} + +fn parse_path(name: &str, path: &ScalarValue) -> Result> { + let path = match path { + ScalarValue::Utf8(Some(s)) => JsonPath::parse(s) + .map_err(|e| 
DataFusionError::Execution(format!("Invalid json path '{s}': {e:?}")))?, + ScalarValue::Utf8(None) => { + return Err(DataFusionError::Execution(format!( + "The path argument to {name} cannot be null" + ))); + } + _ => { + return Err(DataFusionError::Execution(format!( + "The path argument to {name} must be of type TEXT" + ))); + } + }; + + Ok(Arc::new(path)) +} + +// Hash function that can take any number of arguments and produces a fast (non-cryptographic) +// 128-bit hash from their string representations +#[derive(Debug)] +pub struct MultiHashFunction { + signature: Signature, +} + +impl MultiHashFunction { + pub fn invoke(&self, args: &[ColumnarValue]) -> Result { + let mut hasher = xxhash_rust::xxh3::Xxh3::new(); + + let all_scalar = args.iter().all(|a| matches!(a, ColumnarValue::Scalar(_))); + + let length = args + .iter() + .map(|t| match t { + ColumnarValue::Scalar(_) => 1, + ColumnarValue::Array(a) => a.len(), + }) + .max() + .ok_or_else(|| { + DataFusionError::Plan("multi_hash must have at least one argument".to_string()) + })?; + + let row_builder = RowConverter::new( + args.iter() + .map(|t| SortField::new(t.data_type().clone())) + .collect(), + )?; + + let arrays = args + .iter() + .map(|c| c.clone().into_array(length)) + .collect::>>()?; + let rows = row_builder.convert_columns(&arrays)?; + + if all_scalar { + hasher.update(rows.row(0).as_ref()); + let result = hasher.digest128().to_be_bytes().to_vec(); + hasher.reset(); + Ok(ColumnarValue::Scalar(ScalarValue::FixedSizeBinary( + size_of::() as i32, + Some(result), + ))) + } else { + let mut builder = + FixedSizeBinaryBuilder::with_capacity(length, size_of::() as i32); + + for row in rows.iter() { + hasher.update(row.as_ref()); + builder.append_value(hasher.digest128().to_be_bytes())?; + hasher.reset(); + } + + Ok(ColumnarValue::Array(Arc::new(builder.finish()))) + } + } +} + +impl Default for MultiHashFunction { + fn default() -> Self { + Self { + signature: Signature::new(TypeSignature::VariadicAny, 
Volatility::Immutable), + } + } +} + +impl ScalarUDFImpl for MultiHashFunction { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + scalar_fn::MULTI_HASH + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(DataType::FixedSizeBinary(size_of::() as i32)) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + self.invoke(&args.args) + } +} + +fn json_function( + name: &str, + f: F, + to_scalar: ToS, + args: &[ColumnarValue], +) -> Result +where + ArrayT: Array + FromIterator> + 'static, + F: Fn(serde_json::Value, &JsonPath) -> Option, + ToS: Fn(Option) -> ScalarValue, +{ + assert_eq!(args.len(), 2); + Ok(match (&args[0], &args[1]) { + (ColumnarValue::Array(values), ColumnarValue::Scalar(path)) => { + let path = parse_path(name, path)?; + let vs = as_string_array(values); + ColumnarValue::Array(Arc::new( + vs.iter() + .map(|s| s.and_then(|s| f(serde_json::from_str(s).ok()?, &path))) + .collect::(), + ) as ArrayRef) + } + (ColumnarValue::Scalar(value), ColumnarValue::Scalar(path)) => { + let path = parse_path(name, path)?; + let ScalarValue::Utf8(value) = value else { + return Err(DataFusionError::Execution(format!( + "The value argument to {name} must be of type TEXT" + ))); + }; + + let result = value + .as_ref() + .and_then(|v| f(serde_json::from_str(v).ok()?, &path)); + ColumnarValue::Scalar(to_scalar(result)) + } + _ => { + return Err(DataFusionError::Execution( + "The path argument to {name} must be a literal".to_string(), + )); + } + }) +} + +pub fn extract_json(args: &[ColumnarValue]) -> Result { + assert_eq!(args.len(), 2); + + let inner = |s, path: &JsonPath| { + Some( + path.query(&serde_json::from_str(s).ok()?) 
+ .iter() + .map(|v| Some(v.to_string())) + .collect::>>(), + ) + }; + + Ok(match (&args[0], &args[1]) { + (ColumnarValue::Array(values), ColumnarValue::Scalar(path)) => { + let path = parse_path("extract_json", path)?; + let values = as_string_array(values); + + let mut builder = ListBuilder::with_capacity(StringBuilder::new(), values.len()); + + let queried = values.iter().map(|s| s.and_then(|s| inner(s, &path))); + + for v in queried { + builder.append_option(v); + } + + ColumnarValue::Array(Arc::new(builder.finish())) + } + (ColumnarValue::Scalar(value), ColumnarValue::Scalar(path)) => { + let path = parse_path("extract_json", path)?; + let ScalarValue::Utf8(v) = value else { + return Err(DataFusionError::Execution( + "The value argument to extract_json must be of type TEXT".to_string(), + )); + }; + + let mut builder = ListBuilder::with_capacity(StringBuilder::new(), 1); + let result = v.as_ref().and_then(|s| inner(s, &path)); + builder.append_option(result); + + ColumnarValue::Scalar(ScalarValue::List(Arc::new(builder.finish()))) + } + _ => { + return Err(DataFusionError::Execution( + "The path argument to extract_json must be a literal".to_string(), + )); + } + }) +} + +pub fn get_first_json_object(args: &[ColumnarValue]) -> Result { + json_function::( + "get_first_json_object", + |s, path| path.query(&s).first().map(|v| v.to_string()), + |s| s.as_deref().into(), + args, + ) +} + +pub fn extract_json_string(args: &[ColumnarValue]) -> Result { + json_function::( + "extract_json_string", + |s, path| { + path.query(&s) + .first() + .and_then(|v| v.as_str().map(|s| s.to_string())) + }, + |s| s.as_deref().into(), + args, + ) +} + +// This code is vendored from +// https://github.com/datafusion-contrib/datafusion-functions-json/blob/main/src/common_union.rs +// as the `is_json_union` function is not public. It should be kept in sync with that code so +// that we are able to detect JSON unions and rewrite them to serialized JSON for sinks. 
+pub(crate) fn is_json_union(data_type: &DataType) -> bool { + match data_type { + DataType::Union(fields, UnionMode::Sparse) => fields == &union_fields(), + _ => false, + } +} + +pub(crate) const TYPE_ID_NULL: i8 = 0; +const TYPE_ID_BOOL: i8 = 1; +const TYPE_ID_INT: i8 = 2; +const TYPE_ID_FLOAT: i8 = 3; +const TYPE_ID_STR: i8 = 4; +const TYPE_ID_ARRAY: i8 = 5; +const TYPE_ID_OBJECT: i8 = 6; + +fn union_fields() -> UnionFields { + static FIELDS: OnceLock = OnceLock::new(); + FIELDS + .get_or_init(|| { + let json_metadata: HashMap = + HashMap::from_iter(vec![("is_json".to_string(), "true".to_string())]); + UnionFields::from_iter([ + ( + TYPE_ID_NULL, + Arc::new(Field::new("null", DataType::Null, true)), + ), + ( + TYPE_ID_BOOL, + Arc::new(Field::new("bool", DataType::Boolean, false)), + ), + ( + TYPE_ID_INT, + Arc::new(Field::new("int", DataType::Int64, false)), + ), + ( + TYPE_ID_FLOAT, + Arc::new(Field::new("float", DataType::Float64, false)), + ), + ( + TYPE_ID_STR, + Arc::new(Field::new("str", DataType::Utf8, false)), + ), + ( + TYPE_ID_ARRAY, + Arc::new( + Field::new("array", DataType::Utf8, false) + .with_metadata(json_metadata.clone()), + ), + ), + ( + TYPE_ID_OBJECT, + Arc::new( + Field::new("object", DataType::Utf8, false) + .with_metadata(json_metadata.clone()), + ), + ), + ]) + }) + .clone() +} +// End vendored code + +pub fn serialize_json_union(args: &[ColumnarValue]) -> Result { + assert_eq!(args.len(), 1); + let array = match args.first().unwrap() { + ColumnarValue::Array(a) => a.clone(), + ColumnarValue::Scalar(s) => s.to_array_of_size(1)?, + }; + + let mut b = StringBuilder::with_capacity(array.len(), array.get_array_memory_size()); + + write_union(&mut b, &array)?; + + Ok(ColumnarValue::Array(Arc::new(b.finish()))) +} + +fn write_union(b: &mut StringBuilder, array: &ArrayRef) -> Result<(), std::fmt::Error> { + assert!( + is_json_union(array.data_type()), + "array item is not a valid JSON union" + ); + let json_union = 
array.as_any().downcast_ref::().unwrap(); + + for i in 0..json_union.len() { + if json_union.is_null(i) { + b.append_null(); + } else { + write_value(b, json_union.type_id(i), &json_union.value(i))?; + b.append_value(""); + } + } + + Ok(()) +} + +fn write_value(b: &mut StringBuilder, id: i8, a: &ArrayRef) -> Result<(), std::fmt::Error> { + match id { + TYPE_ID_NULL => write!(b, "null")?, + TYPE_ID_BOOL => write!(b, "{}", a.as_boolean().value(0))?, + TYPE_ID_INT => write!(b, "{}", a.as_primitive::().value(0))?, + TYPE_ID_FLOAT => write!(b, "{}", a.as_primitive::().value(0))?, + TYPE_ID_STR => { + // assumes that this is already a valid (escaped) json string as the only way to + // construct these values are by parsing (valid) JSON + b.write_char('"')?; + b.write_str(a.as_string::().value(0))?; + b.write_char('"')?; + } + TYPE_ID_ARRAY => { + b.write_str(a.as_string::().value(0))?; + } + TYPE_ID_OBJECT => { + b.write_str(a.as_string::().value(0))?; + } + _ => unreachable!("invalid union type in JSON union: {}", id), + } + + Ok(()) +} + +pub(crate) fn serialize_outgoing_json( + registry: &StreamSchemaProvider, + node: Arc, +) -> LogicalPlan { + let exprs = node + .schema() + .fields() + .iter() + .map(|f| { + if is_json_union(f.data_type()) { + Expr::Alias(Alias::new( + Expr::ScalarFunction(ScalarFunction::new_udf( + registry.udf(scalar_fn::SERIALIZE_JSON_UNION).unwrap(), + vec![col(f.name())], + )), + Option::::None, + f.name(), + )) + } else { + col(f.name()) + } + }) + .collect(); + + LogicalPlan::Projection(Projection::try_new(exprs, node).unwrap()) +} + +#[cfg(test)] +mod test { + use datafusion::arrow::array::StringArray; + use datafusion::arrow::array::builder::{ListBuilder, StringBuilder}; + use datafusion::common::ScalarValue; + use std::sync::Arc; + + #[test] + fn test_extract_json() { + let input = Arc::new(StringArray::from(vec![ + r#"{"a": 1, "b": 2, "c": { "d": "hello" }}"#, + r#"{"a": 3, "b": 4}"#, + r#"{"a": 5, "b": 6}"#, + ])); + + let path = "$.c.d"; 
+ + let result = super::extract_json(&[ + super::ColumnarValue::Array(input), + super::ColumnarValue::Scalar(path.into()), + ]) + .unwrap(); + + let mut expected = ListBuilder::new(StringBuilder::new()); + expected.append_value(vec![Some("\"hello\"".to_string())]); + expected.append_value(Vec::>::new()); + expected.append_value(Vec::>::new()); + if let super::ColumnarValue::Array(result) = result { + assert_eq!(*result, expected.finish()); + } else { + panic!("Expected array, got scalar"); + } + + let result = super::extract_json(&[ + super::ColumnarValue::Scalar(r#"{"a": 1, "b": 2, "c": { "d": "hello" }}"#.into()), + super::ColumnarValue::Scalar(path.into()), + ]) + .unwrap(); + + let mut expected = ListBuilder::with_capacity(StringBuilder::new(), 1); + expected.append_value(vec![Some("\"hello\"".to_string())]); + + if let super::ColumnarValue::Scalar(ScalarValue::List(result)) = result { + assert_eq!(*result, expected.finish()); + } else { + panic!("Expected scalar list"); + } + } + + #[test] + fn test_get_first_json_object() { + let input = Arc::new(StringArray::from(vec![ + r#"{"a": 1, "b": 2}"#, + r#"{"a": 3}"#, + r#"{"a": 5, "b": 6}"#, + ])); + + let path = "$.b"; + + let result = super::get_first_json_object(&[ + super::ColumnarValue::Array(input), + super::ColumnarValue::Scalar(path.into()), + ]) + .unwrap(); + + let expected = StringArray::from(vec![Some("2"), None, Some("6")]); + + if let super::ColumnarValue::Array(result) = result { + assert_eq!(*result, expected); + } else { + panic!("Expected array, got scalar"); + } + + let result = super::get_first_json_object(&[ + super::ColumnarValue::Scalar(r#"{"a": 1, "b": 2, "c": { "d": "hello" }}"#.into()), + super::ColumnarValue::Scalar("$.c.d".into()), + ]) + .unwrap(); + + let expected = ScalarValue::Utf8(Some("\"hello\"".to_string())); + + if let super::ColumnarValue::Scalar(result) = result { + assert_eq!(result, expected); + } else { + panic!("Expected scalar"); + } + } + + #[test] + fn 
test_extract_json_string() { + let input = Arc::new(StringArray::from(vec![ + r#"{"a": 1, "b": 2, "c": { "d": "hello" }}"#, + r#"{"a": 3, "b": 4}"#, + r#"{"a": 5, "b": 6}"#, + ])); + + let path = "$.c.d"; + + let result = super::extract_json_string(&[ + super::ColumnarValue::Array(input), + super::ColumnarValue::Scalar(path.into()), + ]) + .unwrap(); + + let expected = StringArray::from(vec![Some("hello"), None, None]); + + if let super::ColumnarValue::Array(result) = result { + assert_eq!(*result, expected); + } else { + panic!("Expected array, got scalar"); + } + + let result = super::extract_json_string(&[ + super::ColumnarValue::Scalar(r#"{"a": 1, "b": 2, "c": { "d": "hello" }}"#.into()), + super::ColumnarValue::Scalar(path.into()), + ]) + .unwrap(); + + let expected = ScalarValue::Utf8(Some("hello".to_string())); + + if let super::ColumnarValue::Scalar(result) = result { + assert_eq!(result, expected); + } else { + panic!("Expected scalar"); + } + } +} diff --git a/src/sql/grammar.pest b/src/sql/grammar.pest deleted file mode 100644 index 15f70dd7..00000000 --- a/src/sql/grammar.pest +++ /dev/null @@ -1,134 +0,0 @@ -// ============================================================================= -// FUNCTION SQL Grammar -// -// Using pest PEG syntax, referencing ANTLR style -// ============================================================================= - -// ============================================================================= -// 1. Whitespace (automatically skipped) -// ============================================================================= - -WHITESPACE = _{ " " | "\t" | "\r" | "\n" } - -// ============================================================================= -// 2. 
Keywords (case-insensitive) -// ============================================================================= - -kw_create = _{ C ~ R ~ E ~ A ~ T ~ E } -kw_drop = _{ D ~ R ~ O ~ P } -kw_start = _{ S ~ T ~ A ~ R ~ T } -kw_stop = _{ S ~ T ~ O ~ P } -kw_show = _{ S ~ H ~ O ~ W } -kw_with = _{ W ~ I ~ T ~ H } -kw_function = _{ F ~ U ~ N ~ C ~ T ~ I ~ O ~ N } -kw_functions = _{ F ~ U ~ N ~ C ~ T ~ I ~ O ~ N ~ S } - -// ============================================================================= -// 3. Operators & Symbols -// ============================================================================= - -LPAREN = _{ "(" } -RPAREN = _{ ")" } -COMMA = _{ "," } -EQ = _{ "=" } -SQUOTE = _{ "'" } -DQUOTE = _{ "\"" } - -// ============================================================================= -// 4. Literals -// ============================================================================= - -// String literal (single or double quotes) -string_literal = @{ - SQUOTE ~ string_inner_single ~ SQUOTE | - DQUOTE ~ string_inner_double ~ DQUOTE -} - -string_inner_single = @{ (!(SQUOTE | "\\") ~ ANY | escape_seq)* } -string_inner_double = @{ (!(DQUOTE | "\\") ~ ANY | escape_seq)* } -escape_seq = @{ "\\" ~ ANY } - -// ============================================================================= -// 5. Identifiers -// ============================================================================= - -// Task name identifier -identifier = @{ (ASCII_ALPHA | "_") ~ (ASCII_ALPHANUMERIC | "_" | "-")* } - -// ============================================================================= -// 6. Statements -// ============================================================================= - -// Entry rule -statement = _{ - SOI ~ ( - create_stmt | - drop_stmt | - start_stmt | - stop_stmt | - show_stmt - ) ~ EOI -} - -// CREATE FUNCTION WITH (...) 
-// Note: name is read from config file, not from SQL statement -create_stmt = { kw_create ~ kw_function ~ kw_with ~ properties } - -// DROP FUNCTION name -drop_stmt = { kw_drop ~ kw_function ~ identifier } - -// START FUNCTION name -start_stmt = { kw_start ~ kw_function ~ identifier } - -// STOP FUNCTION name -stop_stmt = { kw_stop ~ kw_function ~ identifier } - -// SHOW FUNCTIONS -show_stmt = { kw_show ~ kw_functions } - -// ============================================================================= -// 7. Properties -// ============================================================================= - -// Property list ('key'='value', ...) -properties = { LPAREN ~ property ~ (COMMA ~ property)* ~ RPAREN } - -// Single property 'key'='value' -property = { property_key ~ EQ ~ property_value } - -// Property key (string) -property_key = { string_literal } - -// Property value (string) -property_value = { string_literal } - -// ============================================================================= -// 8. 
Character Fragments (for case-insensitive matching) -// ============================================================================= - -A = _{ "A" | "a" } -B = _{ "B" | "b" } -C = _{ "C" | "c" } -D = _{ "D" | "d" } -E = _{ "E" | "e" } -F = _{ "F" | "f" } -G = _{ "G" | "g" } -H = _{ "H" | "h" } -I = _{ "I" | "i" } -J = _{ "J" | "j" } -K = _{ "K" | "k" } -L = _{ "L" | "l" } -M = _{ "M" | "m" } -N = _{ "N" | "n" } -O = _{ "O" | "o" } -P = _{ "P" | "p" } -Q = _{ "Q" | "q" } -R = _{ "R" | "r" } -S = _{ "S" | "s" } -T = _{ "T" | "t" } -U = _{ "U" | "u" } -V = _{ "V" | "v" } -W = _{ "W" | "w" } -X = _{ "X" | "x" } -Y = _{ "Y" | "y" } -Z = _{ "Z" | "z" } diff --git a/src/sql/logical_node/logical/dylib_udf_config.rs b/src/sql/logical_node/logical/dylib_udf_config.rs new file mode 100644 index 00000000..6c88054f --- /dev/null +++ b/src/sql/logical_node/logical/dylib_udf_config.rs @@ -0,0 +1,71 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use datafusion::arrow::datatypes::DataType; +use datafusion_proto::protobuf::ArrowType; +use prost::Message; +use protocol::grpc::api; + +#[derive(Clone, Debug, Eq, PartialEq, Hash, PartialOrd)] +pub struct DylibUdfConfig { + pub dylib_path: String, + pub arg_types: Vec, + pub return_type: DataType, + pub aggregate: bool, + pub is_async: bool, +} + +impl From for api::DylibUdfConfig { + fn from(from: DylibUdfConfig) -> Self { + api::DylibUdfConfig { + dylib_path: from.dylib_path, + arg_types: from + .arg_types + .iter() + .map(|t| { + ArrowType::try_from(t) + .expect("unsupported data type") + .encode_to_vec() + }) + .collect(), + return_type: ArrowType::try_from(&from.return_type) + .expect("unsupported data type") + .encode_to_vec(), + aggregate: from.aggregate, + is_async: from.is_async, + } + } +} + +impl From for DylibUdfConfig { + fn from(from: api::DylibUdfConfig) -> Self { + DylibUdfConfig { + dylib_path: from.dylib_path, + arg_types: from + .arg_types + .iter() + .map(|t| { + DataType::try_from( + &ArrowType::decode(&mut t.as_slice()).expect("invalid arrow type"), + ) + .expect("invalid arrow type") + }) + .collect(), + return_type: DataType::try_from( + &ArrowType::decode(&mut from.return_type.as_slice()).unwrap(), + ) + .expect("invalid arrow type"), + aggregate: from.aggregate, + is_async: from.is_async, + } + } +} diff --git a/src/sql/logical_node/logical/fs_program_convert.rs b/src/sql/logical_node/logical/fs_program_convert.rs new file mode 100644 index 00000000..a8ac20b1 --- /dev/null +++ b/src/sql/logical_node/logical/fs_program_convert.rs @@ -0,0 +1,201 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Conversions between [`LogicalProgram`] and `protocol::grpc::api::FsProgram` / pipeline API types. + +use std::collections::HashMap; +use std::str::FromStr; +use std::sync::Arc; + +use datafusion::common::{DataFusionError, Result as DFResult}; +use petgraph::graph::DiGraph; +use petgraph::prelude::EdgeRef; +use protocol::grpc::api::{ + ChainedOperator, EdgeType as ProtoEdgeType, FsEdge, FsNode, FsProgram, FsSchema as ProtoFsSchema, +}; + +use crate::sql::api::pipelines::{PipelineEdge, PipelineGraph, PipelineNode}; +use crate::sql::common::FsSchema; + +use super::logical_edge::logical_edge_type_from_proto_i32; +use super::operator_chain::{ChainedLogicalOperator, OperatorChain}; +use super::operator_name::OperatorName; +use super::{LogicalEdge, LogicalNode, LogicalProgram, ProgramConfig}; + +impl TryFrom for LogicalProgram { + type Error = DataFusionError; + + fn try_from(value: FsProgram) -> DFResult { + let mut graph = DiGraph::new(); + let mut id_map = HashMap::with_capacity(value.nodes.len()); + + for node in value.nodes { + let operators = node + .operators + .into_iter() + .map(|op| { + let ChainedOperator { + operator_id, + operator_name: name_str, + operator_config, + } = op; + let operator_name = OperatorName::from_str(&name_str).map_err(|_| { + DataFusionError::Plan(format!("Invalid operator name: {name_str}")) + })?; + Ok(ChainedLogicalOperator { + operator_id, + operator_name, + operator_config, + }) + }) + .collect::>>()?; + + let edges = node + .edges + .into_iter() + .map(|e| { + let fs: FsSchema = e.try_into()?; + 
Ok(Arc::new(fs)) + }) + .collect::>>()?; + + let logical_node = LogicalNode { + node_id: node.node_id, + description: node.description, + operator_chain: OperatorChain { operators, edges }, + parallelism: node.parallelism as usize, + }; + + id_map.insert(node.node_index, graph.add_node(logical_node)); + } + + for edge in value.edges { + let source = *id_map.get(&edge.source).ok_or_else(|| { + DataFusionError::Plan("Graph integrity error: Missing source node".into()) + })?; + let target = *id_map.get(&edge.target).ok_or_else(|| { + DataFusionError::Plan("Graph integrity error: Missing target node".into()) + })?; + let schema = edge + .schema + .ok_or_else(|| DataFusionError::Plan("Graph integrity error: Missing edge schema".into()))?; + let edge_type = logical_edge_type_from_proto_i32(edge.edge_type)?; + + graph.add_edge( + source, + target, + LogicalEdge { + edge_type, + schema: Arc::new(FsSchema::try_from(schema)?), + }, + ); + } + + let program_config = value + .program_config + .map(ProgramConfig::from) + .unwrap_or_default(); + + Ok(LogicalProgram::new(graph, program_config)) + } +} + +impl From for FsProgram { + fn from(value: LogicalProgram) -> Self { + let nodes = value + .graph + .node_indices() + .filter_map(|idx| value.graph.node_weight(idx).map(|node| (idx, node))) + .map(|(idx, node)| FsNode { + node_index: idx.index() as i32, + node_id: node.node_id, + parallelism: node.parallelism as u32, + description: node.description.clone(), + operators: node + .operator_chain + .operators + .iter() + .map(|op| ChainedOperator { + operator_id: op.operator_id.clone(), + operator_name: op.operator_name.to_string(), + operator_config: op.operator_config.clone(), + }) + .collect(), + edges: node + .operator_chain + .edges + .iter() + .map(|edge| ProtoFsSchema::from((**edge).clone())) + .collect(), + }) + .collect(); + + let edges = value + .graph + .edge_indices() + .filter_map(|eidx| { + let edge = value.graph.edge_weight(eidx)?; + let (source, target) = 
value.graph.edge_endpoints(eidx)?; + Some(FsEdge { + source: source.index() as i32, + target: target.index() as i32, + schema: Some(ProtoFsSchema::from((*edge.schema).clone())), + edge_type: ProtoEdgeType::from(edge.edge_type) as i32, + }) + }) + .collect(); + + FsProgram { + nodes, + edges, + program_config: Some(value.program_config.into()), + } + } +} + +impl TryFrom for PipelineGraph { + type Error = DataFusionError; + + fn try_from(value: LogicalProgram) -> DFResult { + let nodes = value + .graph + .node_weights() + .map(|node| { + Ok(PipelineNode { + node_id: node.node_id, + operator: node.resolve_pipeline_operator_name()?, + description: node.description.clone(), + parallelism: node.parallelism as u32, + }) + }) + .collect::>>()?; + + let edges = value + .graph + .edge_references() + .filter_map(|edge| { + let src = value.graph.node_weight(edge.source())?; + let target = value.graph.node_weight(edge.target())?; + Some(PipelineEdge { + src_id: src.node_id, + dest_id: target.node_id, + key_type: "()".to_string(), + value_type: "()".to_string(), + edge_type: format!("{:?}", edge.weight().edge_type), + }) + }) + .collect(); + + Ok(PipelineGraph { nodes, edges }) + } +} diff --git a/src/sql/logical_node/logical/logical_edge.rs b/src/sql/logical_node/logical/logical_edge.rs new file mode 100644 index 00000000..1a169c1d --- /dev/null +++ b/src/sql/logical_node/logical/logical_edge.rs @@ -0,0 +1,102 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::fmt::{Display, Formatter}; +use std::sync::Arc; + +use datafusion::common::{DataFusionError, Result}; +use protocol::grpc::api::EdgeType as ProtoEdgeType; +use serde::{Deserialize, Serialize}; + +use crate::sql::common::FsSchema; + +#[derive(Copy, Clone, Debug, Eq, PartialEq, PartialOrd, Ord, Serialize, Deserialize)] +pub enum LogicalEdgeType { + Forward, + Shuffle, + LeftJoin, + RightJoin, +} + +impl Display for LogicalEdgeType { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + let symbol = match self { + LogicalEdgeType::Forward => "→", + LogicalEdgeType::Shuffle => "⤨", + LogicalEdgeType::LeftJoin => "-[left]⤨", + LogicalEdgeType::RightJoin => "-[right]⤨", + }; + write!(f, "{symbol}") + } +} + +impl From for LogicalEdgeType { + fn from(value: ProtoEdgeType) -> Self { + match value { + ProtoEdgeType::Unused => { + panic!("Critical: Invalid EdgeType 'Unused' encountered") + } + ProtoEdgeType::Forward => Self::Forward, + ProtoEdgeType::Shuffle => Self::Shuffle, + ProtoEdgeType::LeftJoin => Self::LeftJoin, + ProtoEdgeType::RightJoin => Self::RightJoin, + } + } +} + +impl From for ProtoEdgeType { + fn from(value: LogicalEdgeType) -> Self { + match value { + LogicalEdgeType::Forward => Self::Forward, + LogicalEdgeType::Shuffle => Self::Shuffle, + LogicalEdgeType::LeftJoin => Self::LeftJoin, + LogicalEdgeType::RightJoin => Self::RightJoin, + } + } +} + +pub(crate) fn logical_edge_type_from_proto_i32(i: i32) -> Result { + let e = ProtoEdgeType::try_from(i).map_err(|_| { + DataFusionError::Plan(format!("invalid protobuf EdgeType discriminant {i}")) + })?; + match e { + ProtoEdgeType::Unused => Err(DataFusionError::Plan( + "Critical: Invalid EdgeType 'Unused' encountered".into(), + )), + ProtoEdgeType::Forward => Ok(LogicalEdgeType::Forward), + ProtoEdgeType::Shuffle => Ok(LogicalEdgeType::Shuffle), + ProtoEdgeType::LeftJoin => Ok(LogicalEdgeType::LeftJoin), + ProtoEdgeType::RightJoin => Ok(LogicalEdgeType::RightJoin), + } +} + +#[derive(Clone, 
Debug, Eq, PartialEq, Serialize, Deserialize)] +pub struct LogicalEdge { + pub edge_type: LogicalEdgeType, + pub schema: Arc, +} + +impl LogicalEdge { + pub fn new(edge_type: LogicalEdgeType, schema: FsSchema) -> Self { + LogicalEdge { + edge_type, + schema: Arc::new(schema), + } + } + + pub fn project_all(edge_type: LogicalEdgeType, schema: FsSchema) -> Self { + LogicalEdge { + edge_type, + schema: Arc::new(schema), + } + } +} diff --git a/src/sql/logical_node/logical/logical_graph.rs b/src/sql/logical_node/logical/logical_graph.rs new file mode 100644 index 00000000..b877e2a0 --- /dev/null +++ b/src/sql/logical_node/logical/logical_graph.rs @@ -0,0 +1,30 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use petgraph::graph::DiGraph; + +use super::logical_edge::LogicalEdge; +use super::logical_node::LogicalNode; + +pub type LogicalGraph = DiGraph; + +pub trait Optimizer { + fn optimize_once(&self, plan: &mut LogicalGraph) -> bool; + + fn optimize(&self, plan: &mut LogicalGraph) { + loop { + if !self.optimize_once(plan) { + break; + } + } + } +} diff --git a/src/sql/logical_node/logical/logical_node.rs b/src/sql/logical_node/logical/logical_node.rs new file mode 100644 index 00000000..26129b26 --- /dev/null +++ b/src/sql/logical_node/logical/logical_node.rs @@ -0,0 +1,89 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::fmt::{Debug, Display, Formatter}; + +use datafusion::common::{DataFusionError, Result}; +use itertools::Itertools; +use serde::{Deserialize, Serialize}; + +use super::operator_chain::{ChainedLogicalOperator, OperatorChain}; +use super::operator_name::OperatorName; + +#[derive(Clone, Serialize, Deserialize)] +pub struct LogicalNode { + pub node_id: u32, + pub description: String, + pub operator_chain: OperatorChain, + pub parallelism: usize, +} + +impl LogicalNode { + pub fn single( + id: u32, + operator_id: String, + name: OperatorName, + config: Vec, + description: String, + parallelism: usize, + ) -> Self { + Self { + node_id: id, + description, + operator_chain: OperatorChain { + operators: vec![ChainedLogicalOperator { + operator_id, + operator_name: name, + operator_config: config, + }], + edges: vec![], + }, + parallelism, + } + } + + pub fn resolve_pipeline_operator_name(&self) -> Result { + let first_op = self + .operator_chain + .operators + .first() + .ok_or_else(|| DataFusionError::Plan("Invalid LogicalNode: Operator chain is empty".into()))?; + + if let Some(connector_name) = first_op.extract_connector_name() { + return Ok(connector_name); + } + + if self.operator_chain.len() == 1 { + return Ok(first_op.operator_id.clone()); + } + + Ok("chained_op".to_string()) + } +} + +impl Display for LogicalNode { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.description) + } +} + +impl Debug for LogicalNode { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + let chain_path = self + 
.operator_chain + .operators + .iter() + .map(|op| op.operator_id.as_str()) + .join(" -> "); + write!(f, "{chain_path}[{}]", self.parallelism) + } +} diff --git a/src/sql/logical_node/logical/logical_program.rs b/src/sql/logical_node/logical/logical_program.rs new file mode 100644 index 00000000..888f4292 --- /dev/null +++ b/src/sql/logical_node/logical/logical_program.rs @@ -0,0 +1,156 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::hash_map::DefaultHasher; +use std::collections::{HashMap, HashSet}; +use std::hash::Hasher; +use std::sync::Arc; + +use datafusion::arrow::datatypes::Schema; +use datafusion::common::{DataFusionError, Result as DFResult}; +use petgraph::Direction; +use petgraph::dot::Dot; +use prost::Message; +use protocol::grpc::api::FsProgram; +use rand::distributions::Alphanumeric; +use rand::rngs::SmallRng; +use rand::{Rng, SeedableRng}; + +use super::logical_graph::{LogicalGraph, Optimizer}; +use super::operator_name::OperatorName; +use super::program_config::ProgramConfig; + +#[derive(Clone, Debug, Default)] +pub struct LogicalProgram { + pub graph: LogicalGraph, + pub program_config: ProgramConfig, +} + +impl LogicalProgram { + pub fn new(graph: LogicalGraph, program_config: ProgramConfig) -> Self { + Self { + graph, + program_config, + } + } + + pub fn optimize(&mut self, optimizer: &dyn Optimizer) { + optimizer.optimize(&mut self.graph); + } + + pub fn update_parallelism(&mut self, overrides: &HashMap) { + 
for node in self.graph.node_weights_mut() { + if let Some(&p) = overrides.get(&node.node_id) { + node.parallelism = p; + } + } + } + + pub fn dot(&self) -> String { + format!("{:?}", Dot::with_config(&self.graph, &[])) + } + + pub fn task_count(&self) -> usize { + self.graph.node_weights().map(|nw| nw.parallelism).sum() + } + + pub fn sources(&self) -> HashSet { + self.graph + .externals(Direction::Incoming) + .filter_map(|idx| self.graph.node_weight(idx)) + .map(|node| node.node_id) + .collect() + } + + pub fn get_hash(&self) -> String { + let mut hasher = DefaultHasher::new(); + let program_bytes = FsProgram::from(self.clone()).encode_to_vec(); + hasher.write(&program_bytes); + let rng = SmallRng::seed_from_u64(hasher.finish()); + rng.sample_iter(&Alphanumeric) + .take(16) + .map(|c| (c as char).to_ascii_lowercase()) + .collect() + } + + pub fn tasks_per_operator(&self) -> HashMap { + self.graph + .node_weights() + .flat_map(|node| { + node.operator_chain + .operators + .iter() + .map(move |op| (op.operator_id.clone(), node.parallelism)) + }) + .collect() + } + + pub fn operator_names_by_id(&self) -> HashMap { + self.graph + .node_weights() + .flat_map(|node| &node.operator_chain.operators) + .map(|op| { + let resolved_name = op + .extract_connector_name() + .unwrap_or_else(|| op.operator_name.to_string()); + (op.operator_id.clone(), resolved_name) + }) + .collect() + } + + pub fn tasks_per_node(&self) -> HashMap { + self.graph + .node_weights() + .map(|node| (node.node_id, node.parallelism)) + .collect() + } + + pub fn features(&self) -> HashSet { + self.graph + .node_weights() + .flat_map(|node| &node.operator_chain.operators) + .filter_map(|op| op.extract_feature()) + .collect() + } + + /// Arrow schema carried on edges into the connector-sink node, if present. 
+ pub fn egress_arrow_schema(&self) -> Option> { + for idx in self.graph.node_indices() { + let node = self.graph.node_weight(idx)?; + if node + .operator_chain + .operators + .iter() + .any(|op| op.operator_name == OperatorName::ConnectorSink) + { + let e = self + .graph + .edges_directed(idx, Direction::Incoming) + .next()?; + return Some(Arc::clone(&e.weight().schema.schema)); + } + } + None + } + + pub fn encode_for_catalog(&self) -> DFResult> { + Ok(FsProgram::from(self.clone()).encode_to_vec()) + } + + pub fn decode_for_catalog(bytes: &[u8]) -> DFResult { + let proto = FsProgram::decode(bytes).map_err(|e| { + DataFusionError::Execution(format!("FsProgram catalog decode failed: {e}")) + })?; + LogicalProgram::try_from(proto) + } +} diff --git a/src/sql/logical_node/logical/mod.rs b/src/sql/logical_node/logical/mod.rs new file mode 100644 index 00000000..d2e9a327 --- /dev/null +++ b/src/sql/logical_node/logical/mod.rs @@ -0,0 +1,30 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +mod dylib_udf_config; +mod fs_program_convert; +mod logical_edge; +mod logical_graph; +mod logical_node; +mod logical_program; +mod operator_chain; +mod operator_name; +mod program_config; +mod python_udf_config; + +pub use dylib_udf_config::DylibUdfConfig; +pub use logical_edge::{LogicalEdge, LogicalEdgeType}; +pub use logical_graph::{LogicalGraph, Optimizer}; +pub use logical_node::LogicalNode; +pub use logical_program::LogicalProgram; +pub use operator_name::OperatorName; +pub use program_config::ProgramConfig; diff --git a/src/sql/logical_node/logical/operator_chain.rs b/src/sql/logical_node/logical/operator_chain.rs new file mode 100644 index 00000000..e74684ba --- /dev/null +++ b/src/sql/logical_node/logical/operator_chain.rs @@ -0,0 +1,128 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::sync::Arc; + +use itertools::{EitherOrBoth, Itertools}; +use prost::Message; +use protocol::grpc::api::ConnectorOp; +use serde::{Deserialize, Serialize}; + +use super::operator_name::OperatorName; +use crate::sql::common::FsSchema; + +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct ChainedLogicalOperator { + pub operator_id: String, + pub operator_name: OperatorName, + pub operator_config: Vec, +} + +impl ChainedLogicalOperator { + pub fn extract_connector_name(&self) -> Option { + if matches!( + self.operator_name, + OperatorName::ConnectorSource | OperatorName::ConnectorSink + ) { + ConnectorOp::decode(self.operator_config.as_slice()) + .ok() + .map(|op| op.connector) + } else { + None + } + } + + pub fn extract_feature(&self) -> Option { + match self.operator_name { + OperatorName::AsyncUdf => Some("async-udf".to_string()), + OperatorName::Join => Some("join-with-expiration".to_string()), + OperatorName::InstantJoin => Some("windowed-join".to_string()), + OperatorName::WindowFunction => Some("sql-window-function".to_string()), + OperatorName::LookupJoin => Some("lookup-join".to_string()), + OperatorName::TumblingWindowAggregate => { + Some("sql-tumbling-window-aggregate".to_string()) + } + OperatorName::SlidingWindowAggregate => { + Some("sql-sliding-window-aggregate".to_string()) + } + OperatorName::SessionWindowAggregate => { + Some("sql-session-window-aggregate".to_string()) + } + OperatorName::UpdatingAggregate => Some("sql-updating-aggregate".to_string()), + OperatorName::ConnectorSource => self + .extract_connector_name() + .map(|c| format!("{c}-source")), + OperatorName::ConnectorSink => self.extract_connector_name().map(|c| format!("{c}-sink")), + _ => None, + } + } +} + +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct OperatorChain { + pub(crate) operators: Vec, + pub(crate) edges: Vec>, +} + +impl OperatorChain { + pub fn new(operator: ChainedLogicalOperator) -> Self { + Self { + operators: vec![operator], + edges: 
vec![], + } + } + + pub fn iter( + &self, + ) -> impl Iterator>)> { + self.operators.iter().zip_longest(&self.edges).filter_map(|e| match e { + EitherOrBoth::Both(op, edge) => Some((op, Some(edge))), + EitherOrBoth::Left(op) => Some((op, None)), + EitherOrBoth::Right(_) => None, + }) + } + + pub fn iter_mut( + &mut self, + ) -> impl Iterator>)> { + self.operators + .iter_mut() + .zip_longest(&self.edges) + .filter_map(|e| match e { + EitherOrBoth::Both(op, edge) => Some((op, Some(edge))), + EitherOrBoth::Left(op) => Some((op, None)), + EitherOrBoth::Right(_) => None, + }) + } + + pub fn first(&self) -> &ChainedLogicalOperator { + self.operators + .first() + .expect("OperatorChain must contain at least one operator") + } + + pub fn len(&self) -> usize { + self.operators.len() + } + + pub fn is_empty(&self) -> bool { + self.operators.is_empty() + } + + pub fn is_source(&self) -> bool { + self.operators[0].operator_name == OperatorName::ConnectorSource + } + + pub fn is_sink(&self) -> bool { + self.operators[0].operator_name == OperatorName::ConnectorSink + } +} diff --git a/src/sql/logical_node/logical/operator_name.rs b/src/sql/logical_node/logical/operator_name.rs new file mode 100644 index 00000000..57f53f90 --- /dev/null +++ b/src/sql/logical_node/logical/operator_name.rs @@ -0,0 +1,82 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::str::FromStr; + +use serde::{Deserialize, Deserializer, Serialize, Serializer}; +use strum::{Display, EnumString, IntoStaticStr}; + +use crate::sql::common::constants::operator_feature; + +#[derive(Clone, Copy, Debug, Eq, PartialEq, EnumString, Display, IntoStaticStr)] +pub enum OperatorName { + ExpressionWatermark, + Value, + KeyBy, + Projection, + AsyncUdf, + Join, + InstantJoin, + LookupJoin, + WindowFunction, + TumblingWindowAggregate, + SlidingWindowAggregate, + SessionWindowAggregate, + UpdatingAggregate, + ConnectorSource, + ConnectorSink, +} + +impl OperatorName { + /// Registry / worker lookup key; matches [`Display`] and protobuf operator names. + #[inline] + pub fn as_registry_key(self) -> &'static str { + self.into() + } + + pub fn feature_tag(self) -> Option<&'static str> { + match self { + Self::ExpressionWatermark | Self::Value | Self::Projection => None, + Self::AsyncUdf => Some(operator_feature::ASYNC_UDF), + Self::Join => Some(operator_feature::JOIN_WITH_EXPIRATION), + Self::InstantJoin => Some(operator_feature::WINDOWED_JOIN), + Self::WindowFunction => Some(operator_feature::SQL_WINDOW_FUNCTION), + Self::LookupJoin => Some(operator_feature::LOOKUP_JOIN), + Self::TumblingWindowAggregate => Some(operator_feature::SQL_TUMBLING_WINDOW_AGGREGATE), + Self::SlidingWindowAggregate => Some(operator_feature::SQL_SLIDING_WINDOW_AGGREGATE), + Self::SessionWindowAggregate => Some(operator_feature::SQL_SESSION_WINDOW_AGGREGATE), + Self::UpdatingAggregate => Some(operator_feature::SQL_UPDATING_AGGREGATE), + Self::KeyBy => Some(operator_feature::KEY_BY_ROUTING), + Self::ConnectorSource => Some(operator_feature::CONNECTOR_SOURCE), + Self::ConnectorSink => Some(operator_feature::CONNECTOR_SINK), + } + } +} + +impl Serialize for OperatorName { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + serializer.serialize_str(&self.to_string()) + } +} + +impl<'de> Deserialize<'de> for OperatorName { + fn deserialize(deserializer: D) 
-> Result + where + D: Deserializer<'de>, + { + let s = String::deserialize(deserializer)?; + Self::from_str(&s).map_err(serde::de::Error::custom) + } +} diff --git a/src/sql/logical_node/logical/program_config.rs b/src/sql/logical_node/logical/program_config.rs new file mode 100644 index 00000000..931a5424 --- /dev/null +++ b/src/sql/logical_node/logical/program_config.rs @@ -0,0 +1,33 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use protocol::grpc::api::FsProgramConfig; + +/// Placeholder program-level config (UDF tables live elsewhere; wire maps stay empty). +#[derive(Clone, Debug, Default)] +pub struct ProgramConfig {} + +impl From for FsProgramConfig { + fn from(_: ProgramConfig) -> Self { + Self { + udf_dylibs: Default::default(), + python_udfs: Default::default(), + } + } +} + +impl From for ProgramConfig { + fn from(_: FsProgramConfig) -> Self { + Self::default() + } +} diff --git a/src/sql/logical_node/logical/python_udf_config.rs b/src/sql/logical_node/logical/python_udf_config.rs new file mode 100644 index 00000000..6e7d5c66 --- /dev/null +++ b/src/sql/logical_node/logical/python_udf_config.rs @@ -0,0 +1,23 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::Arc; + +use datafusion::arrow::datatypes::DataType; + +#[derive(Clone, Debug, Eq, PartialEq, Hash)] +pub struct PythonUdfConfig { + pub arg_types: Vec, + pub return_type: DataType, + pub name: Arc, + pub definition: Arc, +} diff --git a/src/sql/logical_node/mod.rs b/src/sql/logical_node/mod.rs new file mode 100644 index 00000000..922801f6 --- /dev/null +++ b/src/sql/logical_node/mod.rs @@ -0,0 +1,13 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +pub mod logical; diff --git a/src/sql/logical_planner/mod.rs b/src/sql/logical_planner/mod.rs new file mode 100644 index 00000000..f29cba18 --- /dev/null +++ b/src/sql/logical_planner/mod.rs @@ -0,0 +1,14 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +pub(crate) mod planner; +pub mod optimizers; diff --git a/src/sql/logical_planner/optimizers/chaining.rs b/src/sql/logical_planner/optimizers/chaining.rs new file mode 100644 index 00000000..8c1534a6 --- /dev/null +++ b/src/sql/logical_planner/optimizers/chaining.rs @@ -0,0 +1,173 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::mem; + +use petgraph::graph::{EdgeIndex, NodeIndex}; +use petgraph::prelude::*; +use petgraph::visit::NodeRef; + + +use crate::sql::logical_node::logical::{LogicalEdgeType, LogicalGraph, Optimizer}; + +pub type NodeId = NodeIndex; +pub type EdgeId = EdgeIndex; + +pub struct ChainingOptimizer {} + +fn remove_in_place(graph: &mut DiGraph, node: NodeIndex) { + let incoming = graph.edges_directed(node, Incoming).next().unwrap(); + + let parent = incoming.source().id(); + let incoming = incoming.id(); + graph.remove_edge(incoming); + + let outgoing: Vec<_> = graph + .edges_directed(node, Outgoing) + .map(|e| (e.id(), e.target().id())) + .collect(); + + for (edge, target) in outgoing { + let weight = graph.remove_edge(edge).unwrap(); + graph.add_edge(parent, target, weight); + } + + graph.remove_node(node); +} + +impl Optimizer for ChainingOptimizer { + fn optimize_once(&self, plan: &mut LogicalGraph) -> bool { + let node_indices: Vec = plan.node_indices().collect(); + + for &node_idx in &node_indices { + let cur = plan.node_weight(node_idx).unwrap(); + + // sources can't be chained + if cur.operator_chain.is_source() { + continue; + } + + let mut successors = plan.edges_directed(node_idx, Outgoing).collect::>(); + + if successors.len() != 1 { + continue; + } + + let edge = successors.remove(0); + let edge_type = edge.weight().edge_type; + + if edge_type != LogicalEdgeType::Forward { + continue; + } + + let successor_idx = edge.target(); + + let successor_node = plan.node_weight(successor_idx).unwrap(); + + // skip if parallelism doesn't match or successor is a sink + if cur.parallelism != successor_node.parallelism + || successor_node.operator_chain.is_sink() + { + continue; + } + + // skip successors with multiple predecessors + if plan.edges_directed(successor_idx, Incoming).count() > 1 { + continue; + } + + // construct the new node + let mut new_cur = cur.clone(); + + new_cur.description = format!("{} -> {}", cur.description, successor_node.description); 
+ + new_cur + .operator_chain + .operators + .extend(successor_node.operator_chain.operators.clone()); + + new_cur + .operator_chain + .edges + .push(edge.weight().schema.clone()); + + mem::swap(&mut new_cur, plan.node_weight_mut(node_idx).unwrap()); + + // remove the old successor + remove_in_place(plan, successor_idx); + return true; + } + + false + } +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use datafusion::arrow::datatypes::{DataType, Field, Schema, TimeUnit}; + + use crate::sql::common::FsSchema; + use crate::sql::logical_node::logical::{ + LogicalEdge, LogicalEdgeType, LogicalGraph, LogicalNode, Optimizer, OperatorName, + }; + + use super::ChainingOptimizer; + + fn forward_edge() -> LogicalEdge { + let s = Arc::new(Schema::new(vec![Field::new( + "_timestamp", + DataType::Timestamp(TimeUnit::Nanosecond, None), + false, + )])); + LogicalEdge::new(LogicalEdgeType::Forward, FsSchema::new_unkeyed(s, 0)) + } + + fn proj_node(id: u32, label: &str) -> LogicalNode { + LogicalNode::single( + id, + format!("op_{label}"), + OperatorName::Projection, + vec![], + label.to_string(), + 1, + ) + } + + fn source_node() -> LogicalNode { + LogicalNode::single( + 0, + "src".into(), + OperatorName::ConnectorSource, + vec![], + "source".into(), + 1, + ) + } + + /// Regression: upstream at last `NodeIndex` + remove non-last downstream swaps indices. 
+ #[test] + fn fusion_remaps_when_upstream_was_last_node_index() { + let mut g = LogicalGraph::new(); + let n0 = g.add_node(source_node()); + let n1 = g.add_node(proj_node(1, "downstream")); + let n2 = g.add_node(proj_node(2, "upstream_last_index")); + let e = forward_edge(); + g.add_edge(n0, n2, e.clone()); + g.add_edge(n2, n1, e); + + let changed = ChainingOptimizer {}.optimize_once(&mut g); + assert!(changed); + assert_eq!(g.node_count(), 2); + } +} diff --git a/src/sql/logical_planner/optimizers/datafusion_logical.rs b/src/sql/logical_planner/optimizers/datafusion_logical.rs new file mode 100644 index 00000000..fbb64845 --- /dev/null +++ b/src/sql/logical_planner/optimizers/datafusion_logical.rs @@ -0,0 +1,95 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::sync::Arc; + +use datafusion::common::Result; +use datafusion::common::config::ConfigOptions; +use datafusion::logical_expr::LogicalPlan; +use datafusion::optimizer::OptimizerContext; +use datafusion::optimizer::OptimizerRule; +use datafusion::optimizer::common_subexpr_eliminate::CommonSubexprEliminate; +use datafusion::optimizer::decorrelate_lateral_join::DecorrelateLateralJoin; +use datafusion::optimizer::decorrelate_predicate_subquery::DecorrelatePredicateSubquery; +use datafusion::optimizer::eliminate_cross_join::EliminateCrossJoin; +use datafusion::optimizer::eliminate_duplicated_expr::EliminateDuplicatedExpr; +use datafusion::optimizer::eliminate_filter::EliminateFilter; +use datafusion::optimizer::eliminate_group_by_constant::EliminateGroupByConstant; +use datafusion::optimizer::eliminate_join::EliminateJoin; +use datafusion::optimizer::eliminate_limit::EliminateLimit; +use datafusion::optimizer::eliminate_nested_union::EliminateNestedUnion; +use datafusion::optimizer::eliminate_one_union::EliminateOneUnion; +use datafusion::optimizer::eliminate_outer_join::EliminateOuterJoin; +use datafusion::optimizer::extract_equijoin_predicate::ExtractEquijoinPredicate; +use datafusion::optimizer::filter_null_join_keys::FilterNullJoinKeys; +use datafusion::optimizer::optimizer::Optimizer; +use datafusion::optimizer::propagate_empty_relation::PropagateEmptyRelation; +use datafusion::optimizer::push_down_filter::PushDownFilter; +use datafusion::optimizer::push_down_limit::PushDownLimit; +use datafusion::optimizer::replace_distinct_aggregate::ReplaceDistinctWithAggregate; +use datafusion::optimizer::scalar_subquery_to_join::ScalarSubqueryToJoin; +use datafusion::optimizer::simplify_expressions::SimplifyExpressions; +use datafusion::sql::planner::SqlToRel; +use datafusion::sql::sqlparser::ast::Statement; + +use crate::sql::schema::StreamSchemaProvider; + +/// Converts a SQL statement into an optimized DataFusion logical plan. 
+/// +/// Applies the DataFusion analyzer followed by a curated set of optimizer rules +/// suitable for streaming SQL (some rules like OptimizeProjections are excluded +/// because they can drop event-time calculation fields). +pub fn produce_optimized_plan( + statement: &Statement, + schema_provider: &StreamSchemaProvider, +) -> Result { + let sql_to_rel = SqlToRel::new(schema_provider); + let plan = sql_to_rel.sql_statement_to_plan(statement.clone())?; + + let analyzed_plan = schema_provider.analyzer.execute_and_check( + plan, + &ConfigOptions::default(), + |_plan, _rule| {}, + )?; + + let rules: Vec> = vec![ + Arc::new(EliminateNestedUnion::new()), + Arc::new(SimplifyExpressions::new()), + Arc::new(ReplaceDistinctWithAggregate::new()), + Arc::new(EliminateJoin::new()), + Arc::new(DecorrelatePredicateSubquery::new()), + Arc::new(ScalarSubqueryToJoin::new()), + Arc::new(DecorrelateLateralJoin::new()), + Arc::new(ExtractEquijoinPredicate::new()), + Arc::new(EliminateDuplicatedExpr::new()), + Arc::new(EliminateFilter::new()), + Arc::new(EliminateCrossJoin::new()), + Arc::new(EliminateLimit::new()), + Arc::new(PropagateEmptyRelation::new()), + Arc::new(EliminateOneUnion::new()), + Arc::new(FilterNullJoinKeys::default()), + Arc::new(EliminateOuterJoin::new()), + Arc::new(PushDownLimit::new()), + Arc::new(PushDownFilter::new()), + Arc::new(EliminateGroupByConstant::new()), + Arc::new(CommonSubexprEliminate::new()), + ]; + + let optimizer = Optimizer::with_rules(rules); + let optimized = optimizer.optimize( + analyzed_plan, + &OptimizerContext::default(), + |_plan, _rule| {}, + )?; + + Ok(optimized) +} diff --git a/src/sql/logical_planner/optimizers/mod.rs b/src/sql/logical_planner/optimizers/mod.rs new file mode 100644 index 00000000..0e0de6a2 --- /dev/null +++ b/src/sql/logical_planner/optimizers/mod.rs @@ -0,0 +1,20 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Logical planner optimizers: graph-level chaining ([`ChainingOptimizer`]) and +//! DataFusion SQL logical-plan rules ([`produce_optimized_plan`]). + +mod chaining; +mod datafusion_logical; + +pub use chaining::ChainingOptimizer; +pub use datafusion_logical::produce_optimized_plan; diff --git a/src/sql/logical_planner/planner.rs b/src/sql/logical_planner/planner.rs new file mode 100644 index 00000000..b0a712c7 --- /dev/null +++ b/src/sql/logical_planner/planner.rs @@ -0,0 +1,418 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::collections::HashMap; +use std::sync::Arc; +use std::thread; +use std::time::Duration; + +use datafusion::arrow::datatypes::IntervalMonthDayNanoType; +use datafusion::common::tree_node::{TreeNode, TreeNodeRecursion, TreeNodeVisitor}; +use datafusion::common::{ + DFSchema, DFSchemaRef, DataFusionError, Result, ScalarValue, Spans, plan_err, +}; +use datafusion::execution::context::SessionState; +use datafusion::execution::runtime_env::RuntimeEnvBuilder; +use datafusion::functions::datetime::date_bin; +use datafusion::logical_expr::{Expr, Extension, LogicalPlan, UserDefinedLogicalNode}; +use datafusion::physical_expr::PhysicalExpr; +use datafusion::physical_plan::ExecutionPlan; +use datafusion::physical_planner::{DefaultPhysicalPlanner, ExtensionPlanner, PhysicalPlanner}; +use datafusion_proto::protobuf::{PhysicalExprNode, PhysicalPlanNode}; +use datafusion_proto::{ + physical_plan::AsExecutionPlan, + protobuf::{AggregateMode, physical_plan_node::PhysicalPlanType}, +}; +use petgraph::graph::{DiGraph, NodeIndex}; +use prost::Message; +use tokio::runtime::Builder; +use tokio::sync::oneshot; + +use async_trait::async_trait; +use datafusion_common::TableReference; +use datafusion_proto::physical_plan::DefaultPhysicalExtensionCodec; +use datafusion_proto::physical_plan::to_proto::serialize_physical_expr; + +use crate::sql::logical_node::logical::{LogicalEdge, LogicalGraph, LogicalNode}; +use crate::sql::physical::{ + DebeziumUnrollingExec, DecodingContext, FsMemExec, FsPhysicalExtensionCodec, ToDebeziumExec, +}; +use crate::sql::extensions::debezium::{PACK_NODE_NAME, UNROLL_NODE_NAME, UnrollDebeziumPayloadNode}; +use crate::sql::extensions::key_calculation::KeyExtractionNode; +use crate::sql::extensions::{CompiledTopologyNode, StreamingOperatorBlueprint}; +use crate::sql::schema::utils::add_timestamp_field_arrow; +use crate::sql::schema::StreamSchemaProvider; +use crate::sql::common::{FsSchema, FsSchemaRef}; + +#[derive(Eq, Hash, PartialEq)] +#[derive(Debug)] 
+pub(crate) enum NamedNode { + Source(TableReference), + Watermark(TableReference), + RemoteTable(TableReference), + Sink(TableReference), +} + +pub(crate) struct PlanToGraphVisitor<'a> { + graph: DiGraph, + output_schemas: HashMap, + named_nodes: HashMap, + traversal: Vec>, + planner: Planner<'a>, +} + +impl<'a> PlanToGraphVisitor<'a> { + pub fn new(schema_provider: &'a StreamSchemaProvider, session_state: &'a SessionState) -> Self { + Self { + graph: Default::default(), + output_schemas: Default::default(), + named_nodes: Default::default(), + traversal: vec![], + planner: Planner::new(schema_provider, session_state), + } + } +} + +pub(crate) struct Planner<'a> { + schema_provider: &'a StreamSchemaProvider, + planner: DefaultPhysicalPlanner, + session_state: &'a SessionState, +} + +impl<'a> Planner<'a> { + pub(crate) fn new( + schema_provider: &'a StreamSchemaProvider, + session_state: &'a SessionState, + ) -> Self { + let planner = + DefaultPhysicalPlanner::with_extension_planners(vec![Arc::new(FsExtensionPlanner {})]); + Self { + schema_provider, + planner, + session_state, + } + } + + pub(crate) fn sync_plan(&self, plan: &LogicalPlan) -> Result> { + let fut = self.planner.create_physical_plan(plan, self.session_state); + let (tx, mut rx) = oneshot::channel(); + thread::scope(|s| { + let builder = thread::Builder::new(); + let builder = if cfg!(debug_assertions) { + builder.stack_size(10_000_000) + } else { + builder + }; + builder + .spawn_scoped(s, move || { + let rt = Builder::new_current_thread().enable_all().build().unwrap(); + rt.block_on(async { + let plan = fut.await; + tx.send(plan).unwrap(); + }); + }) + .unwrap(); + }); + + rx.try_recv().unwrap() + } + + pub(crate) fn create_physical_expr( + &self, + expr: &Expr, + input_dfschema: &DFSchema, + ) -> Result> { + self.planner + .create_physical_expr(expr, input_dfschema, self.session_state) + } + + pub(crate) fn serialize_as_physical_expr( + &self, + expr: &Expr, + schema: &DFSchema, + ) -> Result> { + 
let physical = self.create_physical_expr(expr, schema)?; + let proto = serialize_physical_expr(&physical, &DefaultPhysicalExtensionCodec {})?; + Ok(proto.encode_to_vec()) + } + + pub(crate) fn split_physical_plan( + &self, + key_indices: Vec, + aggregate: &LogicalPlan, + add_timestamp_field: bool, + ) -> Result { + let physical_plan = self.sync_plan(aggregate)?; + let codec = FsPhysicalExtensionCodec { + context: DecodingContext::Planning, + }; + let mut physical_plan_node = + PhysicalPlanNode::try_from_physical_plan(physical_plan.clone(), &codec)?; + let PhysicalPlanType::Aggregate(mut final_aggregate_proto) = physical_plan_node + .physical_plan_type + .take() + .ok_or_else(|| DataFusionError::Plan("missing physical plan type".to_string()))? + else { + return plan_err!("unexpected physical plan type"); + }; + let AggregateMode::Final = final_aggregate_proto.mode() else { + return plan_err!("unexpected physical plan type"); + }; + + let partial_aggregation_plan = *final_aggregate_proto + .input + .take() + .ok_or_else(|| DataFusionError::Plan("missing input".to_string()))?; + + let partial_aggregation_exec_plan = partial_aggregation_plan.try_into_physical_plan( + self.schema_provider, + &RuntimeEnvBuilder::new().build().unwrap(), + &codec, + )?; + + let partial_schema = partial_aggregation_exec_plan.schema(); + let final_input_table_provider = FsMemExec::new("partial".into(), partial_schema.clone()); + + final_aggregate_proto.input = Some(Box::new(PhysicalPlanNode::try_from_physical_plan( + Arc::new(final_input_table_provider), + &codec, + )?)); + + let finish_plan = PhysicalPlanNode { + physical_plan_type: Some(PhysicalPlanType::Aggregate(final_aggregate_proto)), + }; + + let (partial_schema, timestamp_index) = if add_timestamp_field { + ( + add_timestamp_field_arrow((*partial_schema).clone()), + partial_schema.fields().len(), + ) + } else { + (partial_schema.clone(), partial_schema.fields().len() - 1) + }; + + let partial_schema = 
FsSchema::new_keyed(partial_schema, timestamp_index, key_indices); + + Ok(SplitPlanOutput { + partial_aggregation_plan, + partial_schema, + finish_plan, + }) + } + + pub fn binning_function_proto( + &self, + width: Duration, + input_schema: DFSchemaRef, + ) -> Result { + let date_bin = date_bin().call(vec![ + Expr::Literal( + ScalarValue::IntervalMonthDayNano(Some(IntervalMonthDayNanoType::make_value( + 0, + 0, + width.as_nanos() as i64, + ))), + None, + ), + Expr::Column(datafusion::common::Column { + relation: None, + name: "_timestamp".into(), + spans: Spans::new(), + }), + ]); + + let binning_function = self.create_physical_expr(&date_bin, &input_schema)?; + serialize_physical_expr(&binning_function, &DefaultPhysicalExtensionCodec {}) + } +} + +struct FsExtensionPlanner {} + +#[async_trait] +impl ExtensionPlanner for FsExtensionPlanner { + async fn plan_extension( + &self, + _planner: &dyn PhysicalPlanner, + node: &dyn UserDefinedLogicalNode, + _logical_inputs: &[&LogicalPlan], + physical_inputs: &[Arc], + _session_state: &SessionState, + ) -> Result>> { + let schema = node.schema().as_ref().into(); + if let Ok::<&dyn StreamingOperatorBlueprint, _>(stream_extension) = node.try_into() { + if stream_extension.is_passthrough_boundary() { + match node.name() { + UNROLL_NODE_NAME => { + let node = node + .as_any() + .downcast_ref::() + .unwrap(); + let input = physical_inputs[0].clone(); + return Ok(Some(Arc::new(DebeziumUnrollingExec::try_new( + input, + node.pk_indices.clone(), + )?))); + } + PACK_NODE_NAME => { + let input = physical_inputs[0].clone(); + return Ok(Some(Arc::new(ToDebeziumExec::try_new(input)?))); + } + _ => return Ok(None), + } + } + }; + let name = + if let Some(key_extension) = node.as_any().downcast_ref::() { + key_extension.operator_label.clone() + } else { + None + }; + Ok(Some(Arc::new(FsMemExec::new( + name.unwrap_or("memory".to_string()), + Arc::new(schema), + )))) + } +} + +impl PlanToGraphVisitor<'_> { + fn add_index_to_traversal(&mut 
self, index: NodeIndex) { + if let Some(last) = self.traversal.last_mut() { + last.push(index); + } + } + + pub(crate) fn add_plan(&mut self, plan: LogicalPlan) -> Result<()> { + self.traversal.clear(); + plan.visit(self)?; + Ok(()) + } + + pub fn into_graph(self) -> LogicalGraph { + self.graph + } + + pub fn build_extension( + &mut self, + input_nodes: Vec, + extension: &dyn StreamingOperatorBlueprint, + ) -> Result<()> { + if let Some(node_name) = extension.operator_identity() { + if self.named_nodes.contains_key(&node_name) { + return plan_err!( + "extension {:?} has already been planned, shouldn't try again.", + node_name + ); + } + } + + let input_schemas = input_nodes + .iter() + .map(|index| { + Ok(self + .output_schemas + .get(index) + .ok_or_else(|| DataFusionError::Plan("missing input node".to_string()))? + .clone()) + }) + .collect::>>()?; + + let CompiledTopologyNode { + execution_unit, + routing_edges, + } = extension + .compile_to_graph_node(&self.planner, self.graph.node_count(), input_schemas) + .map_err(|e| e.context(format!("planning operator {extension:?}")))?; + + let node_index = self.graph.add_node(execution_unit); + self.add_index_to_traversal(node_index); + + for (source, edge) in input_nodes.into_iter().zip(routing_edges.into_iter()) { + self.graph.add_edge(source, node_index, edge); + } + + self.output_schemas + .insert(node_index, extension.yielded_schema().into()); + + if let Some(node_name) = extension.operator_identity() { + self.named_nodes.insert(node_name, node_index); + } + Ok(()) + } +} + +impl TreeNodeVisitor<'_> for PlanToGraphVisitor<'_> { + type Node = LogicalPlan; + + fn f_down(&mut self, node: &Self::Node) -> Result { + let LogicalPlan::Extension(Extension { node }) = node else { + return Ok(TreeNodeRecursion::Continue); + }; + + let stream_extension: &dyn StreamingOperatorBlueprint = node + .try_into() + .map_err(|e: DataFusionError| e.context("converting extension"))?; + if stream_extension.is_passthrough_boundary() { + 
return Ok(TreeNodeRecursion::Continue); + } + + if let Some(name) = stream_extension.operator_identity() { + if let Some(node_index) = self.named_nodes.get(&name) { + self.add_index_to_traversal(*node_index); + return Ok(TreeNodeRecursion::Jump); + } + } + + if !node.inputs().is_empty() { + self.traversal.push(vec![]); + } + + Ok(TreeNodeRecursion::Continue) + } + + fn f_up(&mut self, node: &Self::Node) -> Result { + let LogicalPlan::Extension(Extension { node }) = node else { + return Ok(TreeNodeRecursion::Continue); + }; + + let stream_extension: &dyn StreamingOperatorBlueprint = node + .try_into() + .map_err(|e: DataFusionError| e.context("planning extension"))?; + + if stream_extension.is_passthrough_boundary() { + return Ok(TreeNodeRecursion::Continue); + } + + if let Some(name) = stream_extension.operator_identity() { + if self.named_nodes.contains_key(&name) { + return Ok(TreeNodeRecursion::Continue); + } + } + + let input_nodes = if !node.inputs().is_empty() { + self.traversal.pop().unwrap_or_default() + } else { + vec![] + }; + let stream_extension: &dyn StreamingOperatorBlueprint = node + .try_into() + .map_err(|e: DataFusionError| e.context("converting extension"))?; + self.build_extension(input_nodes, stream_extension)?; + + Ok(TreeNodeRecursion::Continue) + } +} + +pub(crate) struct SplitPlanOutput { + pub(crate) partial_aggregation_plan: PhysicalPlanNode, + pub(crate) partial_schema: FsSchema, + pub(crate) finish_plan: PhysicalPlanNode, +} diff --git a/src/sql/mod.rs b/src/sql/mod.rs index ed3c2e30..dc98a4de 100644 --- a/src/sql/mod.rs +++ b/src/sql/mod.rs @@ -10,6 +10,18 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-pub mod parser; +pub mod common; +pub mod api; + +pub mod schema; +pub mod functions; +pub mod parse; +pub mod logical_node; +pub mod logical_planner; +pub mod physical; +pub mod analysis; +pub(crate) mod extensions; +pub mod types; + +pub use analysis::rewrite_plan; -pub use parser::SqlParser; diff --git a/src/sql/parse.rs b/src/sql/parse.rs new file mode 100644 index 00000000..5fd4a59f --- /dev/null +++ b/src/sql/parse.rs @@ -0,0 +1,404 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Coordinator-facing SQL parsing (`parse_sql`). +//! +//! **Data-definition / pipeline shape (this entry point)** +//! Only these table-related forms are supported: +//! - **`CREATE TABLE ... (cols [, WATERMARK FOR ...]) WITH ('connector' = '...', 'format' = '...', ...)`** +//! connector-backed **source** DDL (no `AS SELECT`; `connector` in `WITH` selects this path) +//! - **`CREATE TABLE ...`** other forms (including `CREATE TABLE ... AS SELECT` where DataFusion accepts it) +//! - **`CREATE STREAMING TABLE ... WITH (...) AS SELECT ...`** (streaming sink DDL) +//! - **`DROP TABLE`** / **`DROP TABLE IF EXISTS`** / **`DROP STREAMING TABLE`** (alias for `DROP TABLE` on the stream catalog) +//! - **`SHOW TABLES`** — list stream catalog tables (connector sources and streaming sinks) +//! - **`SHOW CREATE TABLE `** — best-effort DDL text (full `WITH` / `AS SELECT` may not be stored) +//! +//! **`INSERT` is not supported** here — use `CREATE TABLE ... AS SELECT` or +//! 
`CREATE STREAMING TABLE ... AS SELECT` to define the query shape instead. +//! +//! Other supported statements include function lifecycle (`CREATE FUNCTION WITH`, `START FUNCTION`, …). + +use std::collections::HashMap; + +use datafusion::common::{Result, plan_err}; +use datafusion::error::DataFusionError; +use datafusion::sql::sqlparser::ast::{ + ObjectType, ShowCreateObject, SqlOption, Statement as DFStatement, +}; +use datafusion::sql::sqlparser::dialect::FunctionStreamDialect; +use datafusion::sql::sqlparser::parser::Parser; + +use crate::coordinator::{ + CreateFunction, CreateTable, DropFunction, DropStreamingTableStatement, DropTableStatement, + ShowCatalogTables, ShowCreateStreamingTable, ShowCreateTable, ShowFunctions, + ShowStreamingTables, StartFunction, Statement as CoordinatorStatement, StopFunction, + StreamingTableStatement, +}; + +/// Streaming-specific SQL that the sqlparser dialect does not natively handle. +/// +/// Returns `Some(statement)` if the SQL was intercepted, `None` otherwise so +/// the caller falls through to the normal sqlparser pipeline. 
+fn try_parse_streaming_statement(sql: &str) -> Option> { + let tokens: Vec<&str> = sql.split_whitespace().collect(); + if tokens.is_empty() { + return None; + } + + // SHOW STREAMING TABLES + if tokens.len() == 3 + && tokens[0].eq_ignore_ascii_case("show") + && tokens[1].eq_ignore_ascii_case("streaming") + && tokens[2].eq_ignore_ascii_case("tables") + { + return Some(Box::new(ShowStreamingTables::new())); + } + + // SHOW CREATE STREAMING TABLE + if tokens.len() == 5 + && tokens[0].eq_ignore_ascii_case("show") + && tokens[1].eq_ignore_ascii_case("create") + && tokens[2].eq_ignore_ascii_case("streaming") + && tokens[3].eq_ignore_ascii_case("table") + { + let name = tokens[4].trim_end_matches(';').to_string(); + return Some(Box::new(ShowCreateStreamingTable::new(name))); + } + + // DROP STREAMING TABLE [IF EXISTS] + if tokens.len() >= 4 + && tokens[0].eq_ignore_ascii_case("drop") + && tokens[1].eq_ignore_ascii_case("streaming") + && tokens[2].eq_ignore_ascii_case("table") + { + let (if_exists, name_idx) = if tokens.len() >= 6 + && tokens[3].eq_ignore_ascii_case("if") + && tokens[4].eq_ignore_ascii_case("exists") + { + (true, 5) + } else { + (false, 3) + }; + + if name_idx >= tokens.len() { + return None; + } + let name = tokens[name_idx].trim_end_matches(';').to_string(); + return Some(Box::new(DropStreamingTableStatement::new(name, if_exists))); + } + + None +} + +pub fn parse_sql(query: &str) -> Result>> { + let trimmed = query.trim(); + if trimmed.is_empty() { + return plan_err!("Query is empty"); + } + + if let Some(stmt) = try_parse_streaming_statement(trimmed) { + return Ok(vec![stmt]); + } + + let dialect = FunctionStreamDialect {}; + let statements = Parser::parse_sql(&dialect, trimmed) + .map_err(|e| DataFusionError::Plan(format!("SQL parse error: {e}")))?; + + if statements.is_empty() { + return plan_err!("No SQL statements found"); + } + + statements.into_iter().map(classify_statement).collect() +} + +fn classify_statement(stmt: DFStatement) -> Result> { + 
match stmt { + DFStatement::CreateFunctionWith { options } => { + let properties = sql_options_to_map(&options); + let create_fn = CreateFunction::from_properties(properties) + .map_err(|e| DataFusionError::Plan(format!("CREATE FUNCTION: {e}")))?; + Ok(Box::new(create_fn)) + } + DFStatement::StartFunction { name } => Ok(Box::new(StartFunction::new(name.to_string()))), + DFStatement::StopFunction { name } => Ok(Box::new(StopFunction::new(name.to_string()))), + DFStatement::DropFunction { func_desc, .. } => { + let name = func_desc + .first() + .map(|d| d.name.to_string()) + .unwrap_or_default(); + Ok(Box::new(DropFunction::new(name))) + } + DFStatement::ShowFunctions { .. } => Ok(Box::new(ShowFunctions::new())), + DFStatement::ShowTables { .. } => Ok(Box::new(ShowCatalogTables::new())), + DFStatement::ShowCreate { obj_type, obj_name } => { + if obj_type != ShowCreateObject::Table { + return plan_err!( + "SHOW CREATE {obj_type} is not supported; use SHOW CREATE TABLE " + ); + } + Ok(Box::new(ShowCreateTable::new(obj_name.to_string()))) + }, + s @ DFStatement::CreateTable(_) => Ok(Box::new(CreateTable::new(s))), + s @ DFStatement::CreateStreamingTable { .. } => { + Ok(Box::new(StreamingTableStatement::new(s))) + } + stmt @ DFStatement::Drop { .. } => { + { + let DFStatement::Drop { + object_type, + names, + .. + } = &stmt + else { + unreachable!() + }; + if *object_type != ObjectType::Table { + return plan_err!("Only DROP TABLE is supported in this SQL frontend"); + } + if names.len() != 1 { + return plan_err!("DROP TABLE supports exactly one table name per statement"); + } + } + Ok(Box::new(DropTableStatement::new(stmt))) + } + DFStatement::Insert { .. } => plan_err!( + "INSERT is not supported; only CREATE TABLE and CREATE STREAMING TABLE (with AS SELECT) \ + are supported for defining table/query pipelines in this SQL frontend" + ), + other => plan_err!("Unsupported SQL statement: {other}"), + } +} + +/// Convert Vec (KeyValue pairs) into HashMap. 
+fn sql_options_to_map(options: &[SqlOption]) -> HashMap { + options + .iter() + .filter_map(|opt| match opt { + SqlOption::KeyValue { key, value } => Some(( + key.value.clone(), + value.to_string().trim_matches('\'').to_string(), + )), + _ => None, + }) + .collect() +} + +#[cfg(test)] +mod tests { + use super::*; + + fn first_stmt(sql: &str) -> Box { + let mut stmts = parse_sql(sql).unwrap(); + assert!(!stmts.is_empty()); + stmts.remove(0) + } + + fn is_type(stmt: &dyn CoordinatorStatement, prefix: &str) -> bool { + format!("{:?}", stmt).starts_with(prefix) + } + + #[test] + fn test_parse_create_function() { + let sql = + "CREATE FUNCTION WITH ('function_path'='./test.wasm', 'config_path'='./config.yml')"; + let stmt = first_stmt(sql); + assert!(is_type(stmt.as_ref(), "CreateFunction")); + } + + #[test] + fn test_parse_create_function_minimal() { + let sql = "CREATE FUNCTION WITH ('function_path'='./processor.wasm')"; + let stmt = first_stmt(sql); + assert!(is_type(stmt.as_ref(), "CreateFunction")); + } + + #[test] + fn test_parse_drop_function() { + let stmt = first_stmt("DROP FUNCTION my_task"); + assert!(is_type(stmt.as_ref(), "DropFunction")); + } + + #[test] + fn test_parse_start_function() { + let stmt = first_stmt("START FUNCTION my_task"); + assert!(is_type(stmt.as_ref(), "StartFunction")); + } + + #[test] + fn test_parse_stop_function() { + let stmt = first_stmt("STOP FUNCTION my_task"); + assert!(is_type(stmt.as_ref(), "StopFunction")); + } + + #[test] + fn test_parse_show_functions() { + let stmt = first_stmt("SHOW FUNCTIONS"); + assert!(is_type(stmt.as_ref(), "ShowFunctions")); + } + + #[test] + fn test_parse_show_tables() { + let stmt = first_stmt("SHOW TABLES"); + assert!(is_type(stmt.as_ref(), "ShowCatalogTables")); + } + + #[test] + fn test_parse_show_create_table() { + let stmt = first_stmt("SHOW CREATE TABLE my_src"); + assert!(is_type(stmt.as_ref(), "ShowCreateTable")); + } + + #[test] + fn test_parse_create_table() { + let stmt = 
first_stmt("CREATE TABLE foo (id INT, name VARCHAR)"); + assert!(is_type(stmt.as_ref(), "CreateTable")); + } + + #[test] + fn test_parse_create_table_connector_source_ddl() { + let sql = concat!( + "CREATE TABLE kafka_src (id BIGINT, ts TIMESTAMP NOT NULL, WATERMARK FOR ts) ", + "WITH ('connector' = 'kafka', 'format' = 'json', 'topic' = 'events')", + ); + let stmt = first_stmt(sql); + assert!(is_type(stmt.as_ref(), "CreateTable")); + } + + #[test] + fn test_parse_drop_table() { + let stmt = first_stmt("DROP TABLE foo"); + assert!(is_type(stmt.as_ref(), "DropTableStatement")); + } + + #[test] + fn test_parse_drop_table_if_exists() { + let stmt = first_stmt("DROP TABLE IF EXISTS foo"); + assert!(is_type(stmt.as_ref(), "DropTableStatement")); + } + + #[test] + fn test_parse_drop_streaming_table() { + let stmt = first_stmt("DROP STREAMING TABLE my_sink"); + assert!(is_type(stmt.as_ref(), "DropStreamingTableStatement")); + } + + #[test] + fn test_parse_drop_streaming_table_if_exists() { + let stmt = first_stmt("DROP STREAMING TABLE IF EXISTS my_sink"); + assert!(is_type(stmt.as_ref(), "DropStreamingTableStatement")); + } + + #[test] + fn test_parse_show_streaming_tables() { + let stmt = first_stmt("SHOW STREAMING TABLES"); + assert!(is_type(stmt.as_ref(), "ShowStreamingTables")); + } + + #[test] + fn test_parse_show_create_streaming_table() { + let stmt = first_stmt("SHOW CREATE STREAMING TABLE my_sink"); + assert!(is_type(stmt.as_ref(), "ShowCreateStreamingTable")); + } + + /// `CREATE STREAMING TABLE` is the sink DDL supported by FunctionStream (not `CREATE STREAM TABLE`). 
+ #[test] + fn test_parse_create_streaming_table() { + let sql = concat!( + "CREATE STREAMING TABLE my_sink ", + "WITH ('connector' = 'kafka') ", + "AS SELECT id FROM src", + ); + let stmt = first_stmt(sql); + assert!( + is_type(stmt.as_ref(), "StreamingTableStatement"), + "expected StreamingTableStatement, got {:?}", + stmt + ); + } + + #[test] + fn test_parse_create_streaming_table_case_insensitive() { + let sql = concat!( + "create streaming table out_q ", + "with ('connector' = 'memory') ", + "as select 1 as x", + ); + let stmt = first_stmt(sql); + assert!(is_type(stmt.as_ref(), "StreamingTableStatement")); + } + + #[test] + fn test_parse_case_insensitive() { + assert!(is_type( + first_stmt("create function with ('function_path'='./test.wasm')").as_ref(), + "CreateFunction" + )); + assert!(is_type( + first_stmt("show functions").as_ref(), + "ShowFunctions" + )); + assert!(is_type( + first_stmt("start function my_task").as_ref(), + "StartFunction" + )); + } + + #[test] + fn test_parse_multiple_statements() { + let sql = concat!( + "CREATE TABLE t1 (id INT); ", + "CREATE STREAMING TABLE sk WITH ('connector' = 'kafka') AS SELECT id FROM t1", + ); + let stmts = parse_sql(sql).unwrap(); + assert_eq!(stmts.len(), 2); + assert!(is_type(stmts[0].as_ref(), "CreateTable")); + assert!(is_type(stmts[1].as_ref(), "StreamingTableStatement")); + } + + #[test] + fn test_parse_empty() { + assert!(parse_sql("").is_err()); + assert!(parse_sql(" ").is_err()); + } + + #[test] + fn test_parse_unsupported_statement() { + let result = parse_sql("SELECT 1"); + assert!(result.is_err()); + } + + #[test] + fn test_insert_not_supported() { + let err = parse_sql("INSERT INTO sink SELECT * FROM src").unwrap_err(); + let msg = err.to_string(); + assert!( + msg.contains("INSERT") && msg.contains("not supported"), + "expected explicit INSERT rejection, got: {msg}" + ); + assert!( + msg.contains("CREATE TABLE") || msg.contains("CREATE STREAMING TABLE"), + "error should mention supported 
alternatives, got: {msg}" + ); + } + + #[test] + fn test_parse_with_extra_properties() { + let sql = r#"CREATE FUNCTION WITH ( + 'function_path'='./test.wasm', + 'config_path'='./config.yml', + 'parallelism'='4', + 'memory-limit'='256mb' + )"#; + let stmt = first_stmt(sql); + assert!(is_type(stmt.as_ref(), "CreateFunction")); + } +} diff --git a/src/sql/parser/sql_parser.rs b/src/sql/parser/sql_parser.rs deleted file mode 100644 index dc110745..00000000 --- a/src/sql/parser/sql_parser.rs +++ /dev/null @@ -1,249 +0,0 @@ -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -use pest::Parser; -use pest_derive::Parser; - -use super::ParseError; -use crate::coordinator::{ - CreateFunction, DropFunction, ShowFunctions, StartFunction, Statement, StopFunction, -}; -use std::collections::HashMap; - -#[derive(Parser)] -#[grammar = "src/sql/grammar.pest"] -struct Grammar; - -#[derive(Debug, Default)] -pub struct SqlParser; - -impl SqlParser { - pub fn parse(sql: &str) -> Result, ParseError> { - let pairs = Grammar::parse(Rule::statement, sql) - .map_err(|e| ParseError::new(format!("Parse error: {}", e)))?; - - for pair in pairs { - return match pair.as_rule() { - Rule::create_stmt => { - handle_create_stmt(pair).map(|stmt| stmt as Box) - } - Rule::drop_stmt => handle_drop_stmt(pair).map(|stmt| stmt as Box), - Rule::start_stmt => handle_start_stmt(pair).map(|stmt| stmt as Box), - Rule::stop_stmt => handle_stop_stmt(pair).map(|stmt| stmt as Box), - Rule::show_stmt => handle_show_stmt(pair).map(|stmt| stmt as Box), - _ => continue, - }; - } - - Err(ParseError::new("Unknown statement type")) - } -} - -fn handle_create_stmt( - pair: pest::iterators::Pair, -) -> Result, ParseError> { - let mut inner = pair.into_inner(); - // Note: name is read from config file, not from SQL statement - // Pass empty string here, name will be read from config file later - let properties = inner - .next() - .map(parse_properties) - .ok_or_else(|| ParseError::new("Missing WITH clause"))?; - - Ok(Box::new( - CreateFunction::from_properties(properties).map_err(ParseError::from)?, - )) -} - -fn handle_drop_stmt(pair: pest::iterators::Pair) -> Result, ParseError> { - let mut inner = pair.into_inner(); - let name = inner.next().map(extract_string).unwrap_or_default(); - Ok(Box::new(DropFunction::new(name))) -} - -fn handle_start_stmt(pair: pest::iterators::Pair) -> Result, ParseError> { - let mut inner = pair.into_inner(); - let name = inner.next().map(extract_string).unwrap_or_default(); - Ok(Box::new(StartFunction::new(name))) -} - -fn handle_stop_stmt(pair: 
pest::iterators::Pair) -> Result, ParseError> { - let mut inner = pair.into_inner(); - let name = inner.next().map(extract_string).unwrap_or_default(); - Ok(Box::new(StopFunction::new(name))) -} - -fn handle_show_stmt(_pair: pest::iterators::Pair) -> Result, ParseError> { - Ok(Box::new(ShowFunctions::new())) -} - -fn extract_string(pair: pest::iterators::Pair) -> String { - match pair.as_rule() { - Rule::string_literal => { - let s = pair.as_str(); - if (s.starts_with('\'') && s.ends_with('\'')) - || (s.starts_with('"') && s.ends_with('"')) - { - unescape_string(&s[1..s.len() - 1]) - } else { - unescape_string(s) - } - } - Rule::identifier => pair.as_str().to_string(), - _ => pair.as_str().to_string(), - } -} - -fn unescape_string(s: &str) -> String { - let mut result = String::with_capacity(s.len()); - let mut chars = s.chars().peekable(); - - while let Some(ch) = chars.next() { - if ch == '\\' { - if let Some(&next) = chars.peek() { - chars.next(); - match next { - 'n' => result.push('\n'), - 't' => result.push('\t'), - 'r' => result.push('\r'), - '\\' => result.push('\\'), - '\'' => result.push('\''), - '"' => result.push('"'), - _ => { - result.push('\\'); - result.push(next); - } - } - } else { - result.push(ch); - } - } else { - result.push(ch); - } - } - - result -} - -fn parse_properties(pair: pest::iterators::Pair) -> HashMap { - let mut properties = HashMap::new(); - - for prop in pair.into_inner() { - if prop.as_rule() == Rule::property { - let mut inner = prop.into_inner(); - if let (Some(key_pair), Some(val_pair)) = (inner.next(), inner.next()) { - let key = key_pair - .into_inner() - .next() - .map(extract_string) - .unwrap_or_default(); - let value = val_pair - .into_inner() - .next() - .map(extract_string) - .unwrap_or_default(); - properties.insert(key, value); - } - } - } - - properties -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_create_function() { - let sql = - "CREATE FUNCTION WITH ('function_path'='./test.wasm', 
'config_path'='./config.yml')"; - let _stmt = SqlParser::parse(sql).unwrap(); - } - - #[test] - fn test_create_function_minimal() { - let sql = "CREATE FUNCTION WITH ('function_path'='./processor.wasm')"; - let _stmt = SqlParser::parse(sql).unwrap(); - } - - // Note: SQL only supports Path mode, not Bytes mode - // Bytes mode is only for gRPC requests - - #[test] - fn test_drop_function() { - let sql = "DROP FUNCTION my_task"; - let _stmt = SqlParser::parse(sql).unwrap(); - } - - #[test] - fn test_start_function() { - let sql = "START FUNCTION my_task"; - let _stmt = SqlParser::parse(sql).unwrap(); - } - - #[test] - fn test_stop_function() { - let sql = "STOP FUNCTION my_task"; - let _stmt = SqlParser::parse(sql).unwrap(); - } - - #[test] - fn test_show_functions() { - let sql = "SHOW FUNCTIONS"; - let _stmt = SqlParser::parse(sql).unwrap(); - } - - #[test] - fn test_case_insensitive_keywords() { - let sql1 = "create function with ('function_path'='./test.wasm')"; - let _stmt1 = SqlParser::parse(sql1).unwrap(); - - let sql2 = "Create Function With ('Function_Path'='./test.wasm')"; - let _stmt2 = SqlParser::parse(sql2).unwrap(); - - let sql3 = "show functions"; - let _stmt3 = SqlParser::parse(sql3).unwrap(); - - let sql4 = "start function my_task"; - let _stmt4 = SqlParser::parse(sql4).unwrap(); - } - - #[test] - fn test_case_insensitive_property_keys() { - let sql1 = - "CREATE FUNCTION WITH ('function_path'='./test.wasm', 'config_path'='./config.yml')"; - let _stmt1 = SqlParser::parse(sql1).unwrap(); - - let sql2 = - "CREATE FUNCTION WITH ('Function_Path'='./test.wasm', 'Config_Path'='./config.yml')"; - let _stmt2 = SqlParser::parse(sql2).unwrap(); - - let sql3 = - "CREATE FUNCTION WITH ('FUNCTION_PATH'='./test.wasm', 'CONFIG_PATH'='./config.yml')"; - let _stmt3 = SqlParser::parse(sql3).unwrap(); - - // Note: SQL only supports Path mode (function_path, config_path) - // Bytes mode (function, config) is only for gRPC requests - } - - #[test] - fn 
test_with_extra_properties() { - let sql = r#"CREATE FUNCTION WITH ( - 'function_path'='./test.wasm', - 'config_path'='./config.yml', - 'parallelism'='4', - 'memory-limit'='256mb' - )"#; - let _stmt = SqlParser::parse(sql).unwrap(); - } -} diff --git a/src/sql/physical/cdc/encode.rs b/src/sql/physical/cdc/encode.rs new file mode 100644 index 00000000..07495a38 --- /dev/null +++ b/src/sql/physical/cdc/encode.rs @@ -0,0 +1,329 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + +use std::any::Any; +use std::collections::HashMap; +use std::pin::Pin; +use std::sync::Arc; +use std::task::{Context, Poll}; + +use datafusion::arrow::array::AsArray; +use datafusion::arrow::array::{ + Array, BooleanArray, FixedSizeBinaryArray, PrimitiveArray, RecordBatch, StringArray, + StructArray, TimestampNanosecondBuilder, +}; +use datafusion::arrow::buffer::NullBuffer; +use datafusion::arrow::compute::take; +use datafusion::arrow::datatypes::{DataType, Field, Schema, SchemaRef, UInt64Type}; +use datafusion::arrow::datatypes::TimestampNanosecondType; +use datafusion::common::{DataFusionError, Result}; +use datafusion::execution::{RecordBatchStream, SendableRecordBatchStream, TaskContext}; +use datafusion::physical_plan::{DisplayAs, ExecutionPlan, PlanProperties}; +use futures::{ready, stream::Stream, StreamExt}; + +use crate::sql::common::constants::{cdc, debezium_op_short, physical_plan_node_name}; +use crate::sql::common::{TIMESTAMP_FIELD, UPDATING_META_FIELD}; +use 
crate::sql::physical::readers::make_stream_properties; + +#[derive(Debug)] +pub struct ToDebeziumExec { + input: Arc, + schema: SchemaRef, + properties: PlanProperties, +} + +impl ToDebeziumExec { + pub fn try_new(input: Arc) -> Result { + let input_schema = input.schema(); + let timestamp_index = input_schema.index_of(TIMESTAMP_FIELD)?; + let struct_fields: Vec<_> = input_schema + .fields() + .into_iter() + .enumerate() + .filter_map(|(index, field)| { + if field.name() == UPDATING_META_FIELD || index == timestamp_index { + None + } else { + Some(field.clone()) + } + }) + .collect(); + let struct_data_type = DataType::Struct(struct_fields.into()); + let before_field = Arc::new(Field::new(cdc::BEFORE, struct_data_type.clone(), true)); + let after_field = Arc::new(Field::new(cdc::AFTER, struct_data_type, true)); + let op_field = Arc::new(Field::new(cdc::OP, DataType::Utf8, false)); + let timestamp_field = Arc::new(input_schema.field(timestamp_index).clone()); + + let output_schema = Arc::new(Schema::new(vec![ + before_field, + after_field, + op_field, + timestamp_field, + ])); + + Ok(Self { + input, + schema: output_schema.clone(), + properties: make_stream_properties(output_schema), + }) + } + + pub(crate) fn from_decoded_parts(input: Arc, schema: SchemaRef) -> Self { + Self { + properties: make_stream_properties(schema.clone()), + input, + schema, + } + } +} + +impl DisplayAs for ToDebeziumExec { + fn fmt_as( + &self, + _t: datafusion::physical_plan::DisplayFormatType, + f: &mut std::fmt::Formatter, + ) -> std::fmt::Result { + write!(f, "ToDebeziumExec") + } +} + +impl ExecutionPlan for ToDebeziumExec { + fn name(&self) -> &str { + physical_plan_node_name::TO_DEBEZIUM_EXEC + } + + fn as_any(&self) -> &dyn Any { + self as &dyn Any + } + + fn schema(&self) -> SchemaRef { + self.schema.clone() + } + + fn properties(&self) -> &PlanProperties { + &self.properties + } + + fn children(&self) -> Vec<&Arc> { + vec![&self.input] + } + + fn with_new_children( + self: Arc, + 
children: Vec>, + ) -> Result> { + if children.len() != 1 { + return Err(DataFusionError::Internal( + "ToDebeziumExec wrong number of children".to_string(), + )); + } + Ok(Arc::new(ToDebeziumExec::try_new(children[0].clone())?)) + } + + fn execute( + &self, + partition: usize, + context: Arc, + ) -> Result { + let updating_meta_index = self.input.schema().index_of(UPDATING_META_FIELD).ok(); + let timestamp_index = self.input.schema().index_of(TIMESTAMP_FIELD)?; + let struct_projection = (0..self.input.schema().fields().len()) + .filter(|index| { + updating_meta_index + .map(|is_retract_index| *index != is_retract_index) + .unwrap_or(true) + && *index != timestamp_index + }) + .collect(); + + Ok(Box::pin(ToDebeziumStream { + input: self.input.execute(partition, context)?, + schema: self.schema.clone(), + updating_meta_index, + timestamp_index, + struct_projection, + })) + } + + fn reset(&self) -> Result<()> { + self.input.reset() + } +} + +struct ToDebeziumStream { + input: SendableRecordBatchStream, + schema: SchemaRef, + updating_meta_index: Option, + timestamp_index: usize, + struct_projection: Vec, +} + +fn compact_changelog_by_id<'a>( + num_rows: usize, + is_retract: &'a BooleanArray, + id: &'a FixedSizeBinaryArray, + timestamps: &'a PrimitiveArray, +) -> ( + Vec<&'a [u8]>, + HashMap<&'a [u8], (usize, usize, bool, bool, i64)>, +) { + let mut id_map: HashMap<&[u8], (usize, usize, bool, bool, i64)> = HashMap::new(); + let mut order = vec![]; + for i in 0..num_rows { + let row_id = id.value(i); + let is_create = !is_retract.value(i); + let timestamp = timestamps.value(i); + + id_map + .entry(row_id) + .and_modify(|e| { + e.1 = i; + e.3 = is_create; + e.4 = e.4.max(timestamp); + }) + .or_insert_with(|| { + order.push(row_id); + (i, i, is_create, is_create, timestamp) + }); + } + (order, id_map) +} + +impl ToDebeziumStream { + fn as_debezium_batch(&mut self, batch: &RecordBatch) -> Result { + let value_struct = batch.project(&self.struct_projection)?; + let 
timestamps = batch + .column(self.timestamp_index) + .as_primitive::(); + + let columns: Vec> = if let Some(metadata_index) = self.updating_meta_index { + let metadata = batch + .column(metadata_index) + .as_any() + .downcast_ref::() + .ok_or_else(|| { + DataFusionError::Internal("Invalid type for updating_meta column".to_string()) + })?; + + let is_retract = metadata.column(0).as_boolean(); + let id = metadata.column(1).as_fixed_size_binary(); + + let (order, id_map) = + compact_changelog_by_id(batch.num_rows(), is_retract, id, timestamps); + + let mut before = Vec::with_capacity(id_map.len()); + let mut after = Vec::with_capacity(id_map.len()); + let mut op = Vec::with_capacity(id_map.len()); + let mut ts = TimestampNanosecondBuilder::with_capacity(id_map.len()); + + for row_id in order { + let (first_idx, last_idx, first_is_create, last_is_create, timestamp) = + id_map.get(row_id).unwrap(); + + if *first_is_create && *last_is_create { + before.push(None); + after.push(Some(*last_idx)); + op.push(debezium_op_short::CREATE); + } else if !(*first_is_create) && !(*last_is_create) { + before.push(Some(*first_idx)); + after.push(None); + op.push(debezium_op_short::DELETE); + } else if !(*first_is_create) && *last_is_create { + before.push(Some(*first_idx)); + after.push(Some(*last_idx)); + op.push(debezium_op_short::UPDATE); + } else { + continue; + } + + ts.append_value(*timestamp); + } + + let before_array = Self::create_output_array(&value_struct, &before)?; + let after_array = Self::create_output_array(&value_struct, &after)?; + let op_array = StringArray::from(op); + + vec![ + Arc::new(before_array), + Arc::new(after_array), + Arc::new(op_array), + Arc::new(ts.finish()), + ] + } else { + let after_array = StructArray::try_new( + value_struct.schema().fields().clone(), + value_struct.columns().to_vec(), + None, + )?; + + let before_array = StructArray::new_null( + value_struct.schema().fields().clone(), + value_struct.num_rows(), + ); + + vec![ + 
Arc::new(before_array), + Arc::new(after_array), + Arc::new(StringArray::from(vec![ + debezium_op_short::CREATE; + value_struct.num_rows() + ])), + batch.column(self.timestamp_index).clone(), + ] + }; + + Ok(RecordBatch::try_new(self.schema.clone(), columns)?) + } + + fn create_output_array( + value_struct: &RecordBatch, + indices: &[Option], + ) -> Result { + let mut arrays: Vec> = Vec::with_capacity(value_struct.num_columns()); + for col in value_struct.columns() { + let new_array = take( + col.as_ref(), + &indices + .iter() + .map(|&idx| idx.map(|i| i as u64)) + .collect::>(), + None, + )?; + arrays.push(new_array); + } + + Ok(StructArray::try_new( + value_struct.schema().fields().clone(), + arrays, + Some(NullBuffer::from( + indices.iter().map(|&idx| idx.is_some()).collect::>(), + )), + )?) + } +} + +impl Stream for ToDebeziumStream { + type Item = Result; + + fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context) -> Poll> { + let result = + ready!(self.input.poll_next_unpin(cx)).map(|result| self.as_debezium_batch(&result?)); + Poll::Ready(result) + } +} + +impl RecordBatchStream for ToDebeziumStream { + fn schema(&self) -> SchemaRef { + self.schema.clone() + } +} diff --git a/src/sql/physical/cdc/mod.rs b/src/sql/physical/cdc/mod.rs new file mode 100644 index 00000000..9e32e67a --- /dev/null +++ b/src/sql/physical/cdc/mod.rs @@ -0,0 +1,18 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ + +mod encode; +mod unroll; + +pub use encode::ToDebeziumExec; +pub use unroll::DebeziumUnrollingExec; diff --git a/src/sql/physical/cdc/unroll.rs b/src/sql/physical/cdc/unroll.rs new file mode 100644 index 00000000..f40beb06 --- /dev/null +++ b/src/sql/physical/cdc/unroll.rs @@ -0,0 +1,298 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + +use std::any::Any; +use std::pin::Pin; +use std::sync::Arc; +use std::task::{Context, Poll}; + +use datafusion::arrow::array::AsArray; +use datafusion::arrow::array::{ + Array, BooleanBuilder, RecordBatch, StringArray, StructArray, TimestampNanosecondArray, + TimestampNanosecondBuilder, UInt32Builder, +}; +use datafusion::arrow::compute::{concat, take}; +use datafusion::arrow::datatypes::{DataType, Field, Schema, SchemaRef, TimeUnit}; +use datafusion::common::{DataFusionError, Result, plan_err}; +use datafusion::execution::{RecordBatchStream, SendableRecordBatchStream, TaskContext}; +use datafusion::logical_expr::ColumnarValue; +use datafusion::physical_plan::{DisplayAs, ExecutionPlan, PlanProperties}; +use futures::{ready, stream::Stream, StreamExt}; + +use crate::sql::common::constants::{cdc, debezium_op_short, physical_plan_node_name}; +use crate::sql::common::TIMESTAMP_FIELD; +use crate::sql::functions::MultiHashFunction; +use crate::sql::physical::meta::{updating_meta_field, updating_meta_fields}; +use crate::sql::physical::readers::make_stream_properties; + +#[derive(Debug)] +pub struct DebeziumUnrollingExec { + 
input: Arc, + schema: SchemaRef, + properties: PlanProperties, + primary_keys: Vec, +} + +impl DebeziumUnrollingExec { + pub fn try_new(input: Arc, primary_keys: Vec) -> Result { + let input_schema = input.schema(); + let before_index = input_schema.index_of(cdc::BEFORE)?; + let after_index = input_schema.index_of(cdc::AFTER)?; + let op_index = input_schema.index_of(cdc::OP)?; + let _timestamp_index = input_schema.index_of(TIMESTAMP_FIELD)?; + let before_type = input_schema.field(before_index).data_type(); + let after_type = input_schema.field(after_index).data_type(); + if before_type != after_type { + return Err(DataFusionError::Internal( + "before and after columns must have the same type".to_string(), + )); + } + let op_type = input_schema.field(op_index).data_type(); + if *op_type != DataType::Utf8 { + return Err(DataFusionError::Internal( + "op column must be a string".to_string(), + )); + } + let DataType::Struct(fields) = before_type else { + return Err(DataFusionError::Internal( + "before and after columns must be structs".to_string(), + )); + }; + let mut fields = fields.to_vec(); + fields.push(updating_meta_field()); + fields.push(Arc::new(Field::new( + TIMESTAMP_FIELD, + DataType::Timestamp(TimeUnit::Nanosecond, None), + false, + ))); + + let schema = Arc::new(Schema::new(fields)); + Ok(Self { + input, + schema: schema.clone(), + properties: make_stream_properties(schema), + primary_keys, + }) + } + + pub(crate) fn from_decoded_parts( + input: Arc, + schema: SchemaRef, + primary_keys: Vec, + ) -> Self { + Self { + properties: make_stream_properties(schema.clone()), + input, + schema, + primary_keys, + } + } + + pub fn primary_key_indices(&self) -> &[usize] { + &self.primary_keys + } +} + +impl DisplayAs for DebeziumUnrollingExec { + fn fmt_as( + &self, + _t: datafusion::physical_plan::DisplayFormatType, + f: &mut std::fmt::Formatter, + ) -> std::fmt::Result { + write!(f, "DebeziumUnrollingExec") + } +} + +impl ExecutionPlan for DebeziumUnrollingExec { + 
fn name(&self) -> &str { + physical_plan_node_name::DEBEZIUM_UNROLLING_EXEC + } + + fn as_any(&self) -> &dyn Any { + self as &dyn Any + } + + fn schema(&self) -> SchemaRef { + self.schema.clone() + } + + fn properties(&self) -> &PlanProperties { + &self.properties + } + + fn children(&self) -> Vec<&Arc> { + vec![&self.input] + } + + fn with_new_children( + self: Arc, + children: Vec>, + ) -> Result> { + if children.len() != 1 { + return Err(DataFusionError::Internal( + "DebeziumUnrollingExec wrong number of children".to_string(), + )); + } + Ok(Arc::new(DebeziumUnrollingExec { + input: children[0].clone(), + schema: self.schema.clone(), + properties: self.properties.clone(), + primary_keys: self.primary_keys.clone(), + })) + } + + fn execute( + &self, + partition: usize, + context: Arc, + ) -> Result { + Ok(Box::pin(DebeziumUnrollingStream::try_new( + self.input.execute(partition, context)?, + self.schema.clone(), + self.primary_keys.clone(), + )?)) + } + + fn reset(&self) -> Result<()> { + self.input.reset() + } +} + +struct DebeziumUnrollingStream { + input: SendableRecordBatchStream, + schema: SchemaRef, + before_index: usize, + after_index: usize, + op_index: usize, + timestamp_index: usize, + primary_keys: Vec, +} + +impl DebeziumUnrollingStream { + fn try_new( + input: SendableRecordBatchStream, + schema: SchemaRef, + primary_keys: Vec, + ) -> Result { + if primary_keys.is_empty() { + return plan_err!("there must be at least one primary key for a Debezium source"); + } + let input_schema = input.schema(); + let before_index = input_schema.index_of(cdc::BEFORE)?; + let after_index = input_schema.index_of(cdc::AFTER)?; + let op_index = input_schema.index_of(cdc::OP)?; + let timestamp_index = input_schema.index_of(TIMESTAMP_FIELD)?; + + Ok(Self { + input, + schema, + before_index, + after_index, + op_index, + timestamp_index, + primary_keys, + }) + } + + fn unroll_batch(&self, batch: &RecordBatch) -> Result { + let before = 
batch.column(self.before_index).as_ref(); + let after = batch.column(self.after_index).as_ref(); + let op = batch + .column(self.op_index) + .as_any() + .downcast_ref::() + .ok_or_else(|| DataFusionError::Internal("op column is not a string".to_string()))?; + + let timestamp = batch + .column(self.timestamp_index) + .as_any() + .downcast_ref::() + .ok_or_else(|| { + DataFusionError::Internal("timestamp column is not a timestamp".to_string()) + })?; + + let num_rows = batch.num_rows(); + let combined_array = concat(&[before, after])?; + let mut take_indices = UInt32Builder::with_capacity(num_rows); + let mut is_retract_builder = BooleanBuilder::with_capacity(num_rows); + + let mut timestamp_builder = TimestampNanosecondBuilder::with_capacity(2 * num_rows); + for i in 0..num_rows { + let op = op.value(i); + match op { + debezium_op_short::CREATE | debezium_op_short::READ => { + take_indices.append_value((i + num_rows) as u32); + is_retract_builder.append_value(false); + timestamp_builder.append_value(timestamp.value(i)); + } + debezium_op_short::UPDATE => { + take_indices.append_value(i as u32); + is_retract_builder.append_value(true); + timestamp_builder.append_value(timestamp.value(i)); + take_indices.append_value((i + num_rows) as u32); + is_retract_builder.append_value(false); + timestamp_builder.append_value(timestamp.value(i)); + } + debezium_op_short::DELETE => { + take_indices.append_value(i as u32); + is_retract_builder.append_value(true); + timestamp_builder.append_value(timestamp.value(i)); + } + _ => { + return Err(DataFusionError::Internal(format!( + "unexpected op value: {op}" + ))); + } + } + } + let take_indices = take_indices.finish(); + let unrolled_array = take(&combined_array, &take_indices, None)?; + + let mut columns = unrolled_array.as_struct().columns().to_vec(); + + let hash = MultiHashFunction::default().invoke( + &self + .primary_keys + .iter() + .map(|i| ColumnarValue::Array(columns[*i].clone())) + .collect::>(), + )?; + + let ids = 
hash.into_array(num_rows)?; + + let meta = StructArray::try_new( + updating_meta_fields(), + vec![Arc::new(is_retract_builder.finish()), ids], + None, + )?; + columns.push(Arc::new(meta)); + columns.push(Arc::new(timestamp_builder.finish())); + Ok(RecordBatch::try_new(self.schema.clone(), columns)?) + } +} + +impl Stream for DebeziumUnrollingStream { + type Item = Result; + + fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context) -> Poll> { + let result = + ready!(self.input.poll_next_unpin(cx)).map(|result| self.unroll_batch(&result?)); + Poll::Ready(result) + } +} + +impl RecordBatchStream for DebeziumUnrollingStream { + fn schema(&self) -> SchemaRef { + self.schema.clone() + } +} diff --git a/src/sql/physical/codec.rs b/src/sql/physical/codec.rs new file mode 100644 index 00000000..c8349dc6 --- /dev/null +++ b/src/sql/physical/codec.rs @@ -0,0 +1,271 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ + +use std::sync::Arc; + +use datafusion::arrow::array::RecordBatch; +use datafusion::arrow::datatypes::Schema; +use datafusion::common::{DataFusionError, Result, UnnestOptions, not_impl_err}; +use datafusion::execution::FunctionRegistry; +use datafusion::logical_expr::ScalarUDF; +use datafusion::physical_plan::ExecutionPlan; +use datafusion::physical_plan::unnest::{ListUnnest, UnnestExec}; +use datafusion_proto::physical_plan::PhysicalExtensionCodec; +use prost::Message; +use protocol::grpc::api::{ + DebeziumDecodeNode, DebeziumEncodeNode, FsExecNode, MemExecNode, UnnestExecNode, + fs_exec_node::Node, +}; +use tokio::sync::mpsc::UnboundedReceiver; + +use crate::sql::analysis::UNNESTED_COL; +use crate::sql::common::constants::{mem_exec_join_side, window_function_udf}; +use crate::sql::physical::udfs::window; +use crate::sql::physical::cdc::{DebeziumUnrollingExec, ToDebeziumExec}; +use crate::sql::physical::readers::{ + FsMemExec, RecordBatchVecReader, RwLockRecordBatchReader, UnboundedRecordBatchReader, +}; + +#[derive(Debug)] +pub struct FsPhysicalExtensionCodec { + pub context: DecodingContext, +} + +impl Default for FsPhysicalExtensionCodec { + fn default() -> Self { + Self { + context: DecodingContext::None, + } + } +} + +#[derive(Debug)] +pub enum DecodingContext { + None, + Planning, + SingleLockedBatch(Arc>>), + UnboundedBatchStream(Arc>>>), + LockedBatchVec(Arc>>), + LockedJoinPair { + left: Arc>>, + right: Arc>>, + }, + LockedJoinStream { + left: Arc>>>, + right: Arc>>>, + }, +} + +impl PhysicalExtensionCodec for FsPhysicalExtensionCodec { + fn try_decode( + &self, + buf: &[u8], + inputs: &[Arc], + _registry: &dyn FunctionRegistry, + ) -> Result> { + let exec: FsExecNode = Message::decode(buf) + .map_err(|err| DataFusionError::Internal(format!("couldn't deserialize: {err}")))?; + + let node = exec + .node + .ok_or_else(|| DataFusionError::Internal("exec node is empty".to_string()))?; + + match node { + Node::MemExec(mem) => self.decode_mem_exec(mem), + 
Node::UnnestExec(unnest) => decode_unnest_exec(unnest, inputs), + Node::DebeziumDecode(debezium) => decode_debezium_decode(debezium, inputs), + Node::DebeziumEncode(debezium) => decode_debezium_encode(debezium, inputs), + } + } + + fn try_encode(&self, node: Arc, buf: &mut Vec) -> Result<()> { + let mut proto = None; + + if let Some(table) = node.as_any().downcast_ref::() { + proto = Some(FsExecNode { + node: Some(Node::MemExec(MemExecNode { + table_name: table.table_name.clone(), + schema: serde_json::to_string(&table.schema).unwrap(), + })), + }); + } + + if let Some(unnest) = node.as_any().downcast_ref::() { + proto = Some(FsExecNode { + node: Some(Node::UnnestExec(UnnestExecNode { + schema: serde_json::to_string(&unnest.schema()).unwrap(), + })), + }); + } + + if let Some(decode) = node.as_any().downcast_ref::() { + proto = Some(FsExecNode { + node: Some(Node::DebeziumDecode(DebeziumDecodeNode { + schema: serde_json::to_string(decode.schema().as_ref()).unwrap(), + primary_keys: decode + .primary_key_indices() + .iter() + .map(|c| *c as u64) + .collect(), + })), + }); + } + + if let Some(encode) = node.as_any().downcast_ref::() { + proto = Some(FsExecNode { + node: Some(Node::DebeziumEncode(DebeziumEncodeNode { + schema: serde_json::to_string(encode.schema().as_ref()).unwrap(), + })), + }); + } + + if let Some(node) = proto { + node.encode(buf).map_err(|err| { + DataFusionError::Internal(format!("couldn't serialize exec node {err}")) + })?; + Ok(()) + } else { + Err(DataFusionError::Internal(format!( + "cannot serialize {node:?}" + ))) + } + } + + fn try_decode_udf(&self, name: &str, _buf: &[u8]) -> Result> { + if name == window_function_udf::NAME { + return Ok(window()); + } + not_impl_err!("PhysicalExtensionCodec is not provided for scalar function {name}") + } +} + +impl FsPhysicalExtensionCodec { + fn decode_mem_exec(&self, mem_exec: MemExecNode) -> Result> { + let schema: Schema = serde_json::from_str(&mem_exec.schema).map_err(|e| { + 
DataFusionError::Internal(format!("invalid schema in exec codec: {e:?}")) + })?; + let schema = Arc::new(schema); + match &self.context { + DecodingContext::SingleLockedBatch(single_batch) => Ok(Arc::new( + RwLockRecordBatchReader::new(schema, single_batch.clone()), + )), + DecodingContext::UnboundedBatchStream(unbounded_stream) => Ok(Arc::new( + UnboundedRecordBatchReader::new(schema, unbounded_stream.clone()), + )), + DecodingContext::LockedBatchVec(locked_batches) => Ok(Arc::new( + RecordBatchVecReader::new(schema, locked_batches.clone()), + )), + DecodingContext::Planning => Ok(Arc::new(FsMemExec::new(mem_exec.table_name, schema))), + DecodingContext::None => Err(DataFusionError::Internal( + "Need an internal context to decode".into(), + )), + DecodingContext::LockedJoinPair { left, right } => { + match mem_exec.table_name.as_str() { + mem_exec_join_side::LEFT => { + Ok(Arc::new(RwLockRecordBatchReader::new(schema, left.clone()))) + } + mem_exec_join_side::RIGHT => Ok(Arc::new(RwLockRecordBatchReader::new( + schema, + right.clone(), + ))), + _ => Err(DataFusionError::Internal(format!( + "unknown table name {}", + mem_exec.table_name + ))), + } + } + DecodingContext::LockedJoinStream { left, right } => { + match mem_exec.table_name.as_str() { + mem_exec_join_side::LEFT => Ok(Arc::new(UnboundedRecordBatchReader::new( + schema, + left.clone(), + ))), + mem_exec_join_side::RIGHT => Ok(Arc::new(UnboundedRecordBatchReader::new( + schema, + right.clone(), + ))), + _ => Err(DataFusionError::Internal(format!( + "unknown table name {}", + mem_exec.table_name + ))), + } + } + } + } +} + +fn decode_unnest_exec( + unnest: UnnestExecNode, + inputs: &[Arc], +) -> Result> { + let schema: Schema = serde_json::from_str(&unnest.schema).map_err(|e| { + DataFusionError::Internal(format!("invalid schema in exec codec: {e:?}")) + })?; + + let column = schema.index_of(UNNESTED_COL).map_err(|_| { + DataFusionError::Internal(format!( + "unnest node schema does not contain {UNNESTED_COL} 
col" + )) + })?; + + Ok(Arc::new(UnnestExec::new( + inputs + .first() + .ok_or_else(|| DataFusionError::Internal("no input for unnest node".to_string()))? + .clone(), + vec![ListUnnest { + index_in_input_schema: column, + depth: 1, + }], + vec![], + Arc::new(schema), + UnnestOptions::default(), + ))) +} + +fn decode_debezium_decode( + debezium: DebeziumDecodeNode, + inputs: &[Arc], +) -> Result> { + let schema = Arc::new(serde_json::from_str::(&debezium.schema).map_err(|e| { + DataFusionError::Internal(format!("invalid schema in exec codec: {e:?}")) + })?); + let input = inputs + .first() + .ok_or_else(|| DataFusionError::Internal("no input for debezium node".to_string()))? + .clone(); + let primary_keys = debezium + .primary_keys + .into_iter() + .map(|c| c as usize) + .collect(); + Ok(Arc::new(DebeziumUnrollingExec::from_decoded_parts( + input, + schema.clone(), + primary_keys, + ))) +} + +fn decode_debezium_encode( + debezium: DebeziumEncodeNode, + inputs: &[Arc], +) -> Result> { + let schema = Arc::new(serde_json::from_str::(&debezium.schema).map_err(|e| { + DataFusionError::Internal(format!("invalid schema in exec codec: {e:?}")) + })?); + let input = inputs + .first() + .ok_or_else(|| DataFusionError::Internal("no input for debezium node".to_string()))? + .clone(); + Ok(Arc::new(ToDebeziumExec::from_decoded_parts(input, schema))) +} diff --git a/src/sql/physical/meta.rs b/src/sql/physical/meta.rs new file mode 100644 index 00000000..95dd8fd8 --- /dev/null +++ b/src/sql/physical/meta.rs @@ -0,0 +1,52 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + + +use std::sync::{Arc, OnceLock}; + +use datafusion::arrow::datatypes::{DataType, Field, Fields}; + +use crate::sql::common::constants::updating_state_field; +use crate::sql::common::UPDATING_META_FIELD; + +pub fn updating_meta_fields() -> Fields { + static FIELDS: OnceLock = OnceLock::new(); + FIELDS + .get_or_init(|| { + Fields::from(vec![ + Field::new( + updating_state_field::IS_RETRACT, + DataType::Boolean, + true, + ), + Field::new( + updating_state_field::ID, + DataType::FixedSizeBinary(16), + true, + ), + ]) + }) + .clone() +} + +pub fn updating_meta_field() -> Arc { + static FIELD: OnceLock> = OnceLock::new(); + FIELD + .get_or_init(|| { + Arc::new(Field::new( + UPDATING_META_FIELD, + DataType::Struct(updating_meta_fields()), + false, + )) + }) + .clone() +} diff --git a/src/sql/physical/mod.rs b/src/sql/physical/mod.rs new file mode 100644 index 00000000..7cbb3231 --- /dev/null +++ b/src/sql/physical/mod.rs @@ -0,0 +1,24 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ + +mod cdc; +mod codec; +mod meta; +mod readers; +mod udfs; + +pub use cdc::{DebeziumUnrollingExec, ToDebeziumExec}; +pub use codec::{DecodingContext, FsPhysicalExtensionCodec}; +pub use meta::{updating_meta_field, updating_meta_fields}; +pub use readers::FsMemExec; +pub use udfs::window; diff --git a/src/sql/physical/readers.rs b/src/sql/physical/readers.rs new file mode 100644 index 00000000..1c785464 --- /dev/null +++ b/src/sql/physical/readers.rs @@ -0,0 +1,371 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ + +use std::any::Any; +use std::mem; +use std::sync::Arc; + +use datafusion::arrow::array::RecordBatch; +use datafusion::arrow::datatypes::SchemaRef; +use datafusion::catalog::memory::MemorySourceConfig; +use datafusion::common::{DataFusionError, Result, Statistics, not_impl_err, plan_err}; +use datafusion::datasource::memory::DataSourceExec; +use datafusion::execution::{SendableRecordBatchStream, TaskContext}; +use datafusion::physical_expr::EquivalenceProperties; +use datafusion::physical_plan::execution_plan::{Boundedness, EmissionType}; +use datafusion::physical_plan::memory::MemoryStream; +use datafusion::physical_plan::stream::RecordBatchStreamAdapter; +use datafusion::physical_plan::{ + DisplayAs, ExecutionPlan, Partitioning, PlanProperties, +}; +use futures::StreamExt; +use tokio::sync::mpsc::UnboundedReceiver; +use tokio_stream::wrappers::UnboundedReceiverStream; + +use crate::sql::common::constants::physical_plan_node_name; + +pub(crate) fn make_stream_properties(schema: SchemaRef) -> PlanProperties { + PlanProperties::new( + EquivalenceProperties::new(schema), + Partitioning::UnknownPartitioning(1), + EmissionType::Incremental, + Boundedness::Unbounded { + requires_infinite_memory: false, + }, + ) +} + +#[derive(Debug)] +pub(crate) struct RwLockRecordBatchReader { + schema: SchemaRef, + locked_batch: Arc>>, + properties: PlanProperties, +} + +impl RwLockRecordBatchReader { + pub(crate) fn new( + schema: SchemaRef, + locked_batch: Arc>>, + ) -> Self { + Self { + schema: schema.clone(), + locked_batch, + properties: make_stream_properties(schema), + } + } +} + +impl DisplayAs for RwLockRecordBatchReader { + fn fmt_as( + &self, + _t: datafusion::physical_plan::DisplayFormatType, + f: &mut std::fmt::Formatter, + ) -> std::fmt::Result { + write!(f, "RW Lock RecordBatchReader") + } +} + +impl ExecutionPlan for RwLockRecordBatchReader { + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> SchemaRef { + self.schema.clone() + } + + fn 
children(&self) -> Vec<&Arc> { + vec![] + } + + fn with_new_children( + self: Arc, + _children: Vec>, + ) -> Result> { + Err(DataFusionError::Internal("not supported".into())) + } + + fn execute( + &self, + _partition: usize, + _context: Arc, + ) -> Result { + let result = self + .locked_batch + .write() + .unwrap() + .take() + .expect("should have set a record batch before calling execute()"); + Ok(Box::pin(MemoryStream::try_new( + vec![result], + self.schema.clone(), + None, + )?)) + } + + fn statistics(&self) -> Result { + Ok(Statistics::new_unknown(&self.schema)) + } + + fn reset(&self) -> Result<()> { + Ok(()) + } + + fn properties(&self) -> &PlanProperties { + &self.properties + } + + fn name(&self) -> &str { + physical_plan_node_name::RW_LOCK_READER + } +} + +#[derive(Debug)] +pub(crate) struct UnboundedRecordBatchReader { + schema: SchemaRef, + receiver: Arc>>>, + properties: PlanProperties, +} + +impl UnboundedRecordBatchReader { + pub(crate) fn new( + schema: SchemaRef, + receiver: Arc>>>, + ) -> Self { + Self { + schema: schema.clone(), + receiver, + properties: make_stream_properties(schema), + } + } +} + +impl DisplayAs for UnboundedRecordBatchReader { + fn fmt_as( + &self, + _t: datafusion::physical_plan::DisplayFormatType, + f: &mut std::fmt::Formatter, + ) -> std::fmt::Result { + write!(f, "unbounded record batch reader") + } +} + +impl ExecutionPlan for UnboundedRecordBatchReader { + fn name(&self) -> &str { + physical_plan_node_name::UNBOUNDED_READER + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> SchemaRef { + self.schema.clone() + } + + fn properties(&self) -> &PlanProperties { + &self.properties + } + + fn children(&self) -> Vec<&Arc> { + vec![] + } + + fn with_new_children( + self: Arc, + _children: Vec>, + ) -> Result> { + Err(DataFusionError::Internal("not supported".into())) + } + + fn execute( + &self, + _partition: usize, + _context: Arc, + ) -> Result { + Ok(Box::pin(RecordBatchStreamAdapter::new( + 
self.schema.clone(), + UnboundedReceiverStream::new( + self.receiver + .write() + .unwrap() + .take() + .expect("unbounded receiver should be present before calling exec"), + ) + .map(Ok), + ))) + } + + fn statistics(&self) -> Result { + Ok(Statistics::new_unknown(&self.schema)) + } + + fn reset(&self) -> Result<()> { + Ok(()) + } +} + +#[derive(Debug)] +pub(crate) struct RecordBatchVecReader { + schema: SchemaRef, + receiver: Arc>>, + properties: PlanProperties, +} + +impl RecordBatchVecReader { + pub(crate) fn new( + schema: SchemaRef, + receiver: Arc>>, + ) -> Self { + Self { + schema: schema.clone(), + receiver, + properties: make_stream_properties(schema), + } + } +} + +impl DisplayAs for RecordBatchVecReader { + fn fmt_as( + &self, + _t: datafusion::physical_plan::DisplayFormatType, + f: &mut std::fmt::Formatter, + ) -> std::fmt::Result { + write!(f, "record batch vec reader") + } +} + +impl ExecutionPlan for RecordBatchVecReader { + fn name(&self) -> &str { + physical_plan_node_name::VEC_READER + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> SchemaRef { + self.schema.clone() + } + + fn properties(&self) -> &PlanProperties { + &self.properties + } + + fn children(&self) -> Vec<&Arc> { + vec![] + } + + fn with_new_children( + self: Arc, + _children: Vec>, + ) -> Result> { + Err(DataFusionError::Internal("not supported".into())) + } + + fn execute( + &self, + partition: usize, + context: Arc, + ) -> Result { + let memory = MemorySourceConfig::try_new( + &[mem::take(self.receiver.write().unwrap().as_mut())], + self.schema.clone(), + None, + )?; + + DataSourceExec::new(Arc::new(memory)).execute(partition, context) + } + + fn statistics(&self) -> Result { + Ok(Statistics::new_unknown(&self.schema)) + } + + fn reset(&self) -> Result<()> { + Ok(()) + } +} + +#[derive(Debug, Clone)] +pub struct FsMemExec { + pub table_name: String, + pub schema: SchemaRef, + properties: PlanProperties, +} + +impl DisplayAs for FsMemExec { + fn fmt_as( + 
&self, + _t: datafusion::physical_plan::DisplayFormatType, + f: &mut std::fmt::Formatter, + ) -> std::fmt::Result { + write!(f, "EmptyPartitionStream: schema={}", self.schema) + } +} + +impl FsMemExec { + pub fn new(table_name: String, schema: SchemaRef) -> Self { + Self { + schema: schema.clone(), + table_name, + properties: make_stream_properties(schema), + } + } +} + +impl ExecutionPlan for FsMemExec { + fn name(&self) -> &str { + physical_plan_node_name::MEM_EXEC + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> SchemaRef { + self.schema.clone() + } + + fn properties(&self) -> &PlanProperties { + &self.properties + } + + fn children(&self) -> Vec<&Arc> { + vec![] + } + + fn with_new_children( + self: Arc, + _children: Vec>, + ) -> Result> { + not_impl_err!("with_new_children is not implemented for mem_exec; should not be called") + } + + fn execute( + &self, + _partition: usize, + _context: Arc, + ) -> Result { + plan_err!( + "EmptyPartitionStream cannot be executed, this is only used for physical planning before serialization" + ) + } + + fn statistics(&self) -> Result { + Ok(Statistics::new_unknown(&self.schema)) + } + + fn reset(&self) -> Result<()> { + Ok(()) + } +} diff --git a/src/sql/physical/udfs.rs b/src/sql/physical/udfs.rs new file mode 100644 index 00000000..03895fda --- /dev/null +++ b/src/sql/physical/udfs.rs @@ -0,0 +1,131 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ + +use std::any::Any; +use std::sync::Arc; + +use datafusion::arrow::array::StructArray; +use datafusion::arrow::datatypes::{DataType, Field, TimeUnit}; +use datafusion::common::{Result, ScalarValue, plan_err}; +use datafusion::logical_expr::{ + ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, TypeSignature, Volatility, +}; + +use crate::make_udf_function; +use crate::sql::common::constants::{window_function_udf, window_interval_field}; +use crate::sql::schema::utils::window_arrow_struct; + +#[derive(Debug)] +pub struct WindowFunctionUdf { + signature: Signature, +} + +impl Default for WindowFunctionUdf { + fn default() -> Self { + Self { + signature: Signature::new( + TypeSignature::Exact(vec![ + DataType::Timestamp(TimeUnit::Nanosecond, None), + DataType::Timestamp(TimeUnit::Nanosecond, None), + ]), + Volatility::Immutable, + ), + } + } +} + +impl ScalarUDFImpl for WindowFunctionUdf { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + window_function_udf::NAME + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _: &[DataType]) -> Result { + Ok(window_arrow_struct()) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + let columns = args.args; + if columns.len() != 2 { + return plan_err!( + "window function expected 2 arguments, got {}", + columns.len() + ); + } + if columns[0].data_type() != DataType::Timestamp(TimeUnit::Nanosecond, None) { + return plan_err!( + "window function expected first argument to be a timestamp, got {:?}", + columns[0].data_type() + ); + } + if columns[1].data_type() != DataType::Timestamp(TimeUnit::Nanosecond, None) { + return plan_err!( + "window function expected second argument to be a timestamp, got {:?}", + columns[1].data_type() + ); + } + let fields = vec![ + Arc::new(Field::new( + window_interval_field::START, + DataType::Timestamp(TimeUnit::Nanosecond, None), + false, + )), + Arc::new(Field::new( + window_interval_field::END, + 
DataType::Timestamp(TimeUnit::Nanosecond, None), + false, + )), + ] + .into(); + + match (&columns[0], &columns[1]) { + (ColumnarValue::Array(start), ColumnarValue::Array(end)) => { + Ok(ColumnarValue::Array(Arc::new(StructArray::new( + fields, + vec![start.clone(), end.clone()], + None, + )))) + } + (ColumnarValue::Array(start), ColumnarValue::Scalar(end)) => { + let end = end.to_array_of_size(start.len())?; + Ok(ColumnarValue::Array(Arc::new(StructArray::new( + fields, + vec![start.clone(), end], + None, + )))) + } + (ColumnarValue::Scalar(start), ColumnarValue::Array(end)) => { + let start = start.to_array_of_size(end.len())?; + Ok(ColumnarValue::Array(Arc::new(StructArray::new( + fields, + vec![start, end.clone()], + None, + )))) + } + (ColumnarValue::Scalar(start), ColumnarValue::Scalar(end)) => { + Ok(ColumnarValue::Scalar(ScalarValue::Struct( + StructArray::new(fields, vec![start.to_array()?, end.to_array()?], None).into(), + ))) + } + } + } +} + +make_udf_function!(WindowFunctionUdf, WINDOW_FUNCTION, window); diff --git a/src/sql/schema/catalog_ddl.rs b/src/sql/schema/catalog_ddl.rs new file mode 100644 index 00000000..3729c99c --- /dev/null +++ b/src/sql/schema/catalog_ddl.rs @@ -0,0 +1,253 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Best-effort SQL text for catalog introspection (`SHOW CREATE TABLE`). 
+ +use std::collections::BTreeMap; + +use datafusion::arrow::datatypes::{DataType, TimeUnit}; + +use super::schema_provider::StreamTable; +use super::table::Table as CatalogTable; +use crate::sql::logical_node::logical::LogicalProgram; + +fn data_type_sql(dt: &DataType) -> String { + match dt { + DataType::Null => "NULL".to_string(), + DataType::Boolean => "BOOLEAN".to_string(), + DataType::Int8 => "TINYINT".to_string(), + DataType::Int16 => "SMALLINT".to_string(), + DataType::Int32 => "INT".to_string(), + DataType::Int64 => "BIGINT".to_string(), + DataType::UInt8 => "TINYINT UNSIGNED".to_string(), + DataType::UInt16 => "SMALLINT UNSIGNED".to_string(), + DataType::UInt32 => "INT UNSIGNED".to_string(), + DataType::UInt64 => "BIGINT UNSIGNED".to_string(), + DataType::Float16 => "FLOAT".to_string(), + DataType::Float32 => "REAL".to_string(), + DataType::Float64 => "DOUBLE".to_string(), + DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View => "VARCHAR".to_string(), + DataType::Binary | DataType::LargeBinary => "VARBINARY".to_string(), + DataType::Date32 => "DATE".to_string(), + DataType::Date64 => "DATE".to_string(), + DataType::Timestamp(unit, tz) => match (unit, tz) { + (TimeUnit::Second, None) => "TIMESTAMP(0)".to_string(), + (TimeUnit::Millisecond, None) => "TIMESTAMP(3)".to_string(), + (TimeUnit::Microsecond, None) => "TIMESTAMP(6)".to_string(), + (TimeUnit::Nanosecond, None) => "TIMESTAMP(9)".to_string(), + (_, Some(_)) => "TIMESTAMP WITH TIME ZONE".to_string(), + }, + DataType::Decimal128(p, s) => format!("DECIMAL({p},{s})"), + DataType::Decimal256(p, s) => format!("DECIMAL({p},{s})"), + _ => dt.to_string(), + } +} + +fn format_columns(schema: &datafusion::arrow::datatypes::Schema) -> Vec { + schema + .fields() + .iter() + .map(|f| { + let null = if f.is_nullable() { + "" + } else { + " NOT NULL" + }; + format!(" {} {}{}", f.name(), data_type_sql(f.data_type()), null) + }) + .collect() +} + +fn format_with_clause(opts: &BTreeMap) -> String { + if 
opts.is_empty() { + return "WITH ('connector' = '...', 'format' = '...');\n/* Original WITH options are not persisted in the stream catalog. */\n" + .to_string(); + } + let pairs: Vec = opts + .iter() + .map(|(k, v)| { + let k_esc = k.replace('\'', "''"); + let v_esc = v.replace('\'', "''"); + format!(" '{k_esc}' = '{v_esc}'") + }) + .collect(); + format!("WITH (\n{}\n);\n", pairs.join(",\n")) +} + +/// Single-line `col:TYPE` list for result grids. +pub fn schema_columns_one_line(schema: &datafusion::arrow::datatypes::Schema) -> String { + schema + .fields() + .iter() + .map(|f| format!("{}:{}", f.name(), data_type_sql(f.data_type()))) + .collect::>() + .join(", ") +} + +fn pipeline_summary_short(program: &LogicalProgram) -> String { + let mut parts: Vec = Vec::new(); + parts.push(format!("tasks={}", program.task_count())); + parts.push(format!("hash={}", program.get_hash())); + for nw in program.graph.node_weights() { + let chain = nw + .operator_chain + .operators + .iter() + .map(|o| format!("{}", o.operator_name)) + .collect::>() + .join("->"); + parts.push(format!("n{}:{}", nw.node_id, chain)); + } + parts.join(" | ") +} + +/// Extra fields for `SHOW TABLES` result grid (pipeline summary; no full Graphviz). +pub fn stream_table_row_detail(table: &StreamTable) -> String { + match table { + StreamTable::Source { + connector, + event_time_field, + watermark_field, + with_options, + .. + } => { + format!( + "connector={}, event_time={:?}, watermark={:?}, with_options={}", + connector, + event_time_field, + watermark_field, + with_options.len() + ) + } + StreamTable::Sink { program, .. 
} => pipeline_summary_short(program), + } +} + +fn pipeline_text(program: &LogicalProgram) -> String { + let mut lines: Vec = Vec::new(); + lines.push(format!("tasks_total: {}", program.task_count())); + lines.push(format!("program_hash: {}", program.get_hash())); + for nw in program.graph.node_weights() { + let chain = nw + .operator_chain + .operators + .iter() + .map(|o| format!("{}[{}]", o.operator_name, o.operator_id)) + .collect::>() + .join(" -> "); + lines.push(format!( + "node {} (parallelism={}): {chain}", + nw.node_id, nw.parallelism + )); + } + let dot = program.dot(); + const MAX_DOT: usize = 12_000; + if dot.len() > MAX_DOT { + lines.push(format!( + "graphviz_dot_truncated:\n{}... [{} more bytes]", + &dot[..MAX_DOT], + dot.len() - MAX_DOT + )); + } else { + lines.push(format!("graphviz_dot:\n{dot}")); + } + lines.join("\n") +} + +/// Human-readable `SHOW CREATE TABLE` text (sink `AS SELECT` is not stored). +pub fn show_create_stream_table(table: &StreamTable) -> String { + match table { + StreamTable::Source { + name, + connector, + schema, + event_time_field, + watermark_field, + with_options, + } => { + let cols = format_columns(schema); + let mut ddl = format!("CREATE TABLE {name} (\n{}\n)", cols.join(",\n")); + if let Some(e) = event_time_field { + ddl.push_str(&format!("\n/* EVENT TIME COLUMN: {e} */\n")); + } + if let Some(w) = watermark_field { + ddl.push_str(&format!("/* WATERMARK: {w} */\n")); + } + let mut merged_opts = with_options.clone(); + merged_opts + .entry("connector".to_string()) + .or_insert_with(|| connector.clone()); + ddl.push_str(&format_with_clause(&merged_opts)); + ddl + } + StreamTable::Sink { name, program } => { + let schema = program + .egress_arrow_schema() + .unwrap_or_else(|| std::sync::Arc::new(datafusion::arrow::datatypes::Schema::empty())); + let cols = format_columns(&schema); + let mut ddl = format!( + "CREATE STREAMING TABLE {name}\nWITH ('connector' = '...') AS SELECT ...\n/* Sink WITH / AS SELECT text is not 
stored. Output schema:\n{}\n*/\n\n", + cols.join(",\n") + ); + ddl.push_str("-- Resolved logical pipeline:\n"); + ddl.push_str(&pipeline_text(program)); + ddl.push('\n'); + ddl + } + } +} + +/// Extra fields for `SHOW TABLES` result grid for persisted catalog rows. +pub fn catalog_table_row_detail(table: &CatalogTable) -> String { + match table { + CatalogTable::ConnectorTable(source) => format!( + "kind=connector, connector={}, event_time={:?}, watermark={:?}, with_options={}", + source.connector(), + source.event_time_field(), + source.temporal_config.watermark_strategy_column, + source.catalog_with_options().len() + ), + CatalogTable::LookupTable(source) => format!( + "kind=lookup, connector={}, event_time={:?}, watermark={:?}, with_options={}", + source.connector(), + source.event_time_field(), + source.temporal_config.watermark_strategy_column, + source.catalog_with_options().len() + ), + CatalogTable::TableFromQuery { .. } => "kind=query".to_string(), + } +} + +/// Human-readable `SHOW CREATE TABLE` text for persisted catalog rows. +pub fn show_create_catalog_table(table: &CatalogTable) -> String { + match table { + CatalogTable::ConnectorTable(source) | CatalogTable::LookupTable(source) => { + let schema = source.produce_physical_schema(); + let cols = format_columns(&schema); + let mut ddl = format!("CREATE TABLE {} (\n{}\n)", source.name(), cols.join(",\n")); + if let Some(e) = source.event_time_field() { + ddl.push_str(&format!("\n/* EVENT TIME COLUMN: {e} */\n")); + } + if let Some(w) = source.temporal_config.watermark_strategy_column.as_deref() { + ddl.push_str(&format!("/* WATERMARK: {w} */\n")); + } + let mut opts = source.catalog_with_options().clone(); + opts.entry("connector".to_string()) + .or_insert_with(|| source.connector().to_string()); + ddl.push_str(&format_with_clause(&opts)); + ddl + } + CatalogTable::TableFromQuery { name, .. 
} => { + format!("CREATE TABLE {name} AS SELECT ...;\n/* logical query text is not persisted */\n") + } + } +} diff --git a/src/sql/schema/column_descriptor.rs b/src/sql/schema/column_descriptor.rs new file mode 100644 index 00000000..533708cc --- /dev/null +++ b/src/sql/schema/column_descriptor.rs @@ -0,0 +1,146 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use datafusion::arrow::datatypes::{DataType, Field, TimeUnit}; +use datafusion::logical_expr::Expr; + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub enum ColumnDescriptor { + Physical(Field), + SystemMeta { + field: Field, + meta_key: String, + }, + Computed { + field: Field, + logic: Box, + }, +} + +impl ColumnDescriptor { + #[inline] + pub fn new_physical(field: Field) -> Self { + Self::Physical(field) + } + + #[inline] + pub fn new_system_meta(field: Field, meta_key: impl Into) -> Self { + Self::SystemMeta { + field, + meta_key: meta_key.into(), + } + } + + #[inline] + pub fn new_computed(field: Field, logic: Expr) -> Self { + Self::Computed { + field, + logic: Box::new(logic), + } + } + + #[inline] + pub fn arrow_field(&self) -> &Field { + match self { + Self::Physical(f) => f, + Self::SystemMeta { field: f, .. } => f, + Self::Computed { field: f, .. } => f, + } + } + + #[inline] + pub fn into_arrow_field(self) -> Field { + match self { + Self::Physical(f) => f, + Self::SystemMeta { field: f, .. } => f, + Self::Computed { field: f, .. 
} => f, + } + } + + #[inline] + pub fn is_computed(&self) -> bool { + matches!(self, Self::Computed { .. }) + } + + #[inline] + pub fn is_physical(&self) -> bool { + matches!(self, Self::Physical(_)) + } + + #[inline] + pub fn system_meta_key(&self) -> Option<&str> { + if let Self::SystemMeta { meta_key, .. } = self { + Some(meta_key.as_str()) + } else { + None + } + } + + #[inline] + pub fn computation_logic(&self) -> Option<&Expr> { + if let Self::Computed { logic, .. } = self { + Some(logic) + } else { + None + } + } + + #[inline] + pub fn data_type(&self) -> &DataType { + self.arrow_field().data_type() + } + + pub fn set_nullable(&mut self, nullable: bool) { + let f = match self { + Self::Physical(f) => f, + Self::SystemMeta { field, .. } => field, + Self::Computed { field, .. } => field, + }; + *f = Field::new(f.name(), f.data_type().clone(), nullable) + .with_metadata(f.metadata().clone()); + } + + pub fn force_precision(&mut self, unit: TimeUnit) { + match self { + Self::Physical(f) => { + if let DataType::Timestamp(_, tz) = f.data_type() { + *f = Field::new(f.name(), DataType::Timestamp(unit, tz.clone()), f.is_nullable()); + } + } + Self::SystemMeta { field, .. } => { + if let DataType::Timestamp(_, tz) = field.data_type() { + *field = Field::new( + field.name(), + DataType::Timestamp(unit, tz.clone()), + field.is_nullable(), + ); + } + } + Self::Computed { field, .. 
} => { + if let DataType::Timestamp(_, tz) = field.data_type() { + *field = Field::new( + field.name(), + DataType::Timestamp(unit, tz.clone()), + field.is_nullable(), + ); + } + } + } + } +} + +impl From for ColumnDescriptor { + #[inline] + fn from(field: Field) -> Self { + Self::Physical(field) + } +} diff --git a/src/sql/schema/connection_type.rs b/src/sql/schema/connection_type.rs new file mode 100644 index 00000000..06a3df92 --- /dev/null +++ b/src/sql/schema/connection_type.rs @@ -0,0 +1,31 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::fmt; + +/// Describes the role of a connection in the streaming pipeline. +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub enum ConnectionType { + Source, + Sink, + Lookup, +} + +impl fmt::Display for ConnectionType { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + ConnectionType::Source => write!(f, "source"), + ConnectionType::Sink => write!(f, "sink"), + ConnectionType::Lookup => write!(f, "lookup"), + } + } +} diff --git a/src/sql/schema/connector_config.rs b/src/sql/schema/connector_config.rs new file mode 100644 index 00000000..f47e05d9 --- /dev/null +++ b/src/sql/schema/connector_config.rs @@ -0,0 +1,82 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// +// Strongly-typed in-memory connector configuration for the SQL catalog layer. 
+// Maps 1:1 to the `ConnectorOp.oneof config` proto variants. + +use std::collections::HashMap; + +use protocol::grpc::api::{ + connector_op, GenericConnectorConfig, KafkaSinkConfig, KafkaSourceConfig, +}; + +/// Strongly-typed connector configuration stored in [`super::SourceTable`]. +/// +/// Each variant corresponds directly to a proto `ConnectorOp.oneof config` branch. +/// Adding a new connector (e.g. MySQL CDC) means adding a variant here and a proto message — +/// the Rust compiler will then guide you to every call-site that needs updating. +#[derive(Debug, Clone)] +pub enum ConnectorConfig { + KafkaSource(KafkaSourceConfig), + KafkaSink(KafkaSinkConfig), + /// Fallback for connectors not yet strongly typed (e.g. future Redis, JDBC). + Generic(HashMap), +} + +impl ConnectorConfig { + /// Convert to the proto `ConnectorOp.oneof config` representation — zero JSON involved. + pub fn to_proto_config(&self) -> connector_op::Config { + match self { + ConnectorConfig::KafkaSource(cfg) => { + connector_op::Config::KafkaSource(cfg.clone()) + } + ConnectorConfig::KafkaSink(cfg) => { + connector_op::Config::KafkaSink(cfg.clone()) + } + ConnectorConfig::Generic(props) => { + connector_op::Config::Generic(GenericConnectorConfig { + properties: props.clone(), + }) + } + } + } +} + +// Proto-generated types do not derive Eq/Hash/PartialEq since they contain f32/f64 +// in the general case. For our subset (Kafka configs) all fields are integers, strings, +// and maps — logically hashable. We impl the traits via serialized proto bytes so the +// SourceTable derive chain stays intact. 
+ +impl PartialEq for ConnectorConfig { + fn eq(&self, other: &Self) -> bool { + use prost::Message; + match (self, other) { + (ConnectorConfig::KafkaSource(a), ConnectorConfig::KafkaSource(b)) => { + a.encode_to_vec() == b.encode_to_vec() + } + (ConnectorConfig::KafkaSink(a), ConnectorConfig::KafkaSink(b)) => { + a.encode_to_vec() == b.encode_to_vec() + } + (ConnectorConfig::Generic(a), ConnectorConfig::Generic(b)) => a == b, + _ => false, + } + } +} + +impl Eq for ConnectorConfig {} + +impl std::hash::Hash for ConnectorConfig { + fn hash(&self, state: &mut H) { + use prost::Message; + std::mem::discriminant(self).hash(state); + match self { + ConnectorConfig::KafkaSource(cfg) => cfg.encode_to_vec().hash(state), + ConnectorConfig::KafkaSink(cfg) => cfg.encode_to_vec().hash(state), + ConnectorConfig::Generic(m) => { + let mut pairs: Vec<_> = m.iter().collect(); + pairs.sort_by_key(|(k, _)| (*k).clone()); + pairs.hash(state); + } + } + } +} diff --git a/src/sql/schema/data_encoding_format.rs b/src/sql/schema/data_encoding_format.rs new file mode 100644 index 00000000..29828c86 --- /dev/null +++ b/src/sql/schema/data_encoding_format.rs @@ -0,0 +1,88 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::collections::HashMap; + +use datafusion::arrow::datatypes::{DataType, Field}; +use datafusion::common::{Result, plan_err}; + +use super::column_descriptor::ColumnDescriptor; +use crate::sql::common::constants::{cdc, connection_format_value, with_opt_bool_str}; +use crate::sql::common::with_option_keys as opt; +use crate::sql::common::Format; + +/// High-level payload encoding (orthogonal to `Format` wire details in `ConnectionSchema`). +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum DataEncodingFormat { + StandardJson, + DebeziumJson, + Avro, + Parquet, + Raw, +} + +impl DataEncodingFormat { + pub fn extract_from_map(opts: &HashMap) -> Result { + let format_str = opts + .get(opt::FORMAT) + .map(|s| s.as_str()) + .unwrap_or(opt::DEFAULT_FORMAT_VALUE); + let is_debezium = opts + .get(opt::FORMAT_DEBEZIUM_FLAG) + .or_else(|| opts.get(opt::JSON_DEBEZIUM)) + .map(|s| s == with_opt_bool_str::TRUE) + .unwrap_or(false); + + match (format_str, is_debezium) { + (f, true) if f == connection_format_value::JSON => Ok(Self::DebeziumJson), + (f, _) if f == connection_format_value::DEBEZIUM_JSON => Ok(Self::DebeziumJson), + (f, false) if f == connection_format_value::JSON => Ok(Self::StandardJson), + (f, _) if f == connection_format_value::AVRO => Ok(Self::Avro), + (f, _) if f == connection_format_value::PARQUET => Ok(Self::Parquet), + _ => Ok(Self::Raw), + } + } + + pub fn from_connection_format(format: &Format) -> Self { + match format { + Format::Json(j) if j.debezium => Self::DebeziumJson, + Format::Json(_) => Self::StandardJson, + Format::Avro(_) => Self::Avro, + Format::Parquet(_) => Self::Parquet, + Format::Protobuf(_) | Format::RawString(_) | Format::RawBytes(_) => Self::Raw, + } + } + + pub fn supports_delta_updates(&self) -> bool { + matches!(self, Self::DebeziumJson) + } + + pub fn apply_envelope(self, columns: Vec) -> Result> { + if !self.supports_delta_updates() { + return Ok(columns); + } + if columns.iter().any(|c| c.is_computed()) { + 
return plan_err!("Virtual fields are not supported with CDC envelope"); + } + if columns.is_empty() { + return Ok(columns); + } + let fields: Vec = columns.into_iter().map(|c| c.into_arrow_field()).collect(); + let struct_type = DataType::Struct(fields.into()); + + Ok(vec![ + ColumnDescriptor::new_physical(Field::new(cdc::BEFORE, struct_type.clone(), true)), + ColumnDescriptor::new_physical(Field::new(cdc::AFTER, struct_type.clone(), true)), + ColumnDescriptor::new_physical(Field::new(cdc::OP, DataType::Utf8, true)), + ]) + } +} diff --git a/src/sql/schema/kafka_operator_config.rs b/src/sql/schema/kafka_operator_config.rs new file mode 100644 index 00000000..4dd70906 --- /dev/null +++ b/src/sql/schema/kafka_operator_config.rs @@ -0,0 +1,250 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// +// Builds strongly-typed proto Kafka configs from SQL DDL WITH options. + +use std::collections::HashMap; + +use datafusion::arrow::datatypes::Schema; +use datafusion::common::{Result as DFResult, plan_datafusion_err, plan_err}; + +use protocol::grpc::api::connector_op::Config as ProtoConfig; +use protocol::grpc::api::{ + BadDataPolicy, DecimalEncodingProto, FormatConfig, JsonFormatConfig, KafkaAuthConfig, + KafkaAuthNone, KafkaOffsetMode, KafkaReadMode, KafkaSinkCommitMode, KafkaSinkConfig, + KafkaSourceConfig, RawBytesFormatConfig, RawStringFormatConfig, TimestampFormatProto, +}; + +use crate::sql::common::constants::{connection_table_role, kafka_with_value}; +use crate::sql::common::connector_options::ConnectorOptions; +use crate::sql::common::formats::{ + BadData, DecimalEncoding as SqlDecimalEncoding, Format as SqlFormat, + TimestampFormat as SqlTimestampFormat, +}; +use crate::sql::common::with_option_keys as opt; +use crate::sql::schema::table_role::TableRole; + +fn sql_format_to_proto(fmt: &SqlFormat) -> DFResult { + match fmt { + SqlFormat::Json(j) => Ok(FormatConfig { + 
format: Some(protocol::grpc::api::format_config::Format::Json( + JsonFormatConfig { + timestamp_format: match j.timestamp_format { + SqlTimestampFormat::RFC3339 => TimestampFormatProto::TimestampRfc3339 as i32, + SqlTimestampFormat::UnixMillis => { + TimestampFormatProto::TimestampUnixMillis as i32 + } + }, + decimal_encoding: match j.decimal_encoding { + SqlDecimalEncoding::Number => DecimalEncodingProto::DecimalNumber as i32, + SqlDecimalEncoding::String => DecimalEncodingProto::DecimalString as i32, + SqlDecimalEncoding::Bytes => DecimalEncodingProto::DecimalBytes as i32, + }, + include_schema: j.include_schema, + confluent_schema_registry: j.confluent_schema_registry, + schema_id: j.schema_id, + debezium: j.debezium, + unstructured: j.unstructured, + }, + )), + }), + SqlFormat::RawString(_) => Ok(FormatConfig { + format: Some(protocol::grpc::api::format_config::Format::RawString( + RawStringFormatConfig {}, + )), + }), + SqlFormat::RawBytes(_) => Ok(FormatConfig { + format: Some(protocol::grpc::api::format_config::Format::RawBytes( + RawBytesFormatConfig {}, + )), + }), + other => plan_err!( + "Kafka connector: format '{}' is not supported yet", + other.name() + ), + } +} + +fn sql_bad_data_to_proto(bad: &BadData) -> i32 { + match bad { + BadData::Fail {} => BadDataPolicy::BadDataFail as i32, + BadData::Drop {} => BadDataPolicy::BadDataDrop as i32, + } +} + +/// Build Kafka proto config from a flat string map (catalog rebuild path). 
+pub fn build_kafka_proto_config_from_string_map( + map: HashMap, + _physical_schema: &Schema, +) -> DFResult { + let mut options = ConnectorOptions::from_flat_string_map(map)?; + let format = crate::sql::common::formats::Format::from_opts(&mut options) + .map_err(|e| datafusion::error::DataFusionError::Plan(format!("invalid format: {e}")))?; + let bad_data = BadData::from_opts(&mut options) + .map_err(|e| datafusion::error::DataFusionError::Plan(format!("Invalid bad_data: '{e}'")))?; + let _framing = crate::sql::common::formats::Framing::from_opts(&mut options) + .map_err(|e| datafusion::error::DataFusionError::Plan(format!("invalid framing: '{e}'")))?; + + let role = match options.pull_opt_str(opt::TYPE)?.as_deref() { + None | Some(connection_table_role::SOURCE) => TableRole::Ingestion, + Some(connection_table_role::SINK) => TableRole::Egress, + Some(connection_table_role::LOOKUP) => TableRole::Reference, + Some(other) => { + return plan_err!("invalid connection type '{other}' in WITH options"); + } + }; + + build_kafka_proto_config(&mut options, role, &format, bad_data) +} + +/// Core builder shared by SQL DDL and catalog reload paths. +pub fn build_kafka_proto_config( + options: &mut ConnectorOptions, + role: TableRole, + format: &Option, + bad_data: BadData, +) -> DFResult { + let bootstrap_servers = match options.pull_opt_str(opt::KAFKA_BOOTSTRAP_SERVERS)? { + Some(s) => s, + None => options + .pull_opt_str(opt::KAFKA_BOOTSTRAP_SERVERS_LEGACY)? + .ok_or_else(|| { + plan_datafusion_err!( + "Kafka connector requires 'bootstrap.servers' in the WITH clause" + ) + })?, + }; + + let topic = options + .pull_opt_str(opt::KAFKA_TOPIC)? + .ok_or_else(|| plan_datafusion_err!("Kafka connector requires 'topic' in the WITH clause"))?; + + let sql_format = format.clone().ok_or_else(|| { + plan_datafusion_err!( + "Kafka connector requires 'format' in the WITH clause (e.g. 
format = 'json')" + ) + })?; + let proto_format = sql_format_to_proto(&sql_format)?; + + let rate_limit = options + .pull_opt_u64(opt::KAFKA_RATE_LIMIT_MESSAGES_PER_SECOND)? + .map(|v| v.clamp(1, u32::MAX as u64) as u32) + .unwrap_or(0); + + let value_subject = options.pull_opt_str(opt::KAFKA_VALUE_SUBJECT)?; + + let auth = Some(KafkaAuthConfig { + auth: Some(protocol::grpc::api::kafka_auth_config::Auth::None( + KafkaAuthNone {}, + )), + }); + + let _ = options.pull_opt_str(opt::TYPE)?; + let _ = options.pull_opt_str(opt::CONNECTOR)?; + + match role { + TableRole::Ingestion => { + let offset_mode = match options.pull_opt_str(opt::KAFKA_SCAN_STARTUP_MODE)?.as_deref() { + Some(s) if s == kafka_with_value::SCAN_LATEST => { + KafkaOffsetMode::KafkaOffsetLatest as i32 + } + Some(s) if s == kafka_with_value::SCAN_EARLIEST => { + KafkaOffsetMode::KafkaOffsetEarliest as i32 + } + Some(s) + if s == kafka_with_value::SCAN_GROUP_OFFSETS + || s == kafka_with_value::SCAN_GROUP => + { + KafkaOffsetMode::KafkaOffsetGroup as i32 + } + None => KafkaOffsetMode::KafkaOffsetGroup as i32, + Some(other) => { + return plan_err!( + "invalid scan.startup.mode '{other}'; expected latest, earliest, or group-offsets" + ); + } + }; + + let read_mode = match options.pull_opt_str(opt::KAFKA_ISOLATION_LEVEL)?.as_deref() { + Some(s) if s == kafka_with_value::ISOLATION_READ_COMMITTED => { + KafkaReadMode::KafkaReadCommitted as i32 + } + Some(s) if s == kafka_with_value::ISOLATION_READ_UNCOMMITTED => { + KafkaReadMode::KafkaReadUncommitted as i32 + } + None => KafkaReadMode::KafkaReadDefault as i32, + Some(other) => { + return plan_err!("invalid isolation.level '{other}'"); + } + }; + + let group_id = match options.pull_opt_str(opt::KAFKA_GROUP_ID)? 
{ + Some(s) => Some(s), + None => options.pull_opt_str(opt::KAFKA_GROUP_ID_LEGACY)?, + }; + let group_id_prefix = options.pull_opt_str(opt::KAFKA_GROUP_ID_PREFIX)?; + + let client_configs = options.drain_remaining_string_values()?; + + Ok(ProtoConfig::KafkaSource(KafkaSourceConfig { + topic, + bootstrap_servers, + group_id, + group_id_prefix, + offset_mode, + read_mode, + auth, + client_configs, + format: Some(proto_format), + bad_data_policy: sql_bad_data_to_proto(&bad_data), + rate_limit_msgs_per_sec: rate_limit, + value_subject, + })) + } + TableRole::Egress => { + let commit_mode = match options.pull_opt_str(opt::KAFKA_SINK_COMMIT_MODE)?.as_deref() { + Some(s) + if s == kafka_with_value::SINK_COMMIT_EXACTLY_ONCE_HYPHEN + || s == kafka_with_value::SINK_COMMIT_EXACTLY_ONCE_UNDERSCORE => + { + KafkaSinkCommitMode::KafkaSinkExactlyOnce as i32 + } + None => KafkaSinkCommitMode::KafkaSinkAtLeastOnce as i32, + Some(s) + if s == kafka_with_value::SINK_COMMIT_AT_LEAST_ONCE_HYPHEN + || s == kafka_with_value::SINK_COMMIT_AT_LEAST_ONCE_UNDERSCORE => + { + KafkaSinkCommitMode::KafkaSinkAtLeastOnce as i32 + } + Some(other) => { + return plan_err!("invalid sink.commit.mode '{other}'"); + } + }; + let key_field = match options.pull_opt_str(opt::KAFKA_SINK_KEY_FIELD)? { + Some(s) => Some(s), + None => options.pull_opt_str(opt::KAFKA_KEY_FIELD_LEGACY)?, + }; + let timestamp_field = match options.pull_opt_str(opt::KAFKA_SINK_TIMESTAMP_FIELD)? 
{ + Some(s) => Some(s), + None => options.pull_opt_str(opt::KAFKA_TIMESTAMP_FIELD_LEGACY)?, + }; + + let client_configs = options.drain_remaining_string_values()?; + + Ok(ProtoConfig::KafkaSink(KafkaSinkConfig { + topic, + bootstrap_servers, + commit_mode, + key_field, + timestamp_field, + auth, + client_configs, + format: Some(proto_format), + value_subject, + })) + } + TableRole::Reference => { + plan_err!("Kafka connector cannot be used as a lookup table in this path") + } + } +} diff --git a/src/sql/schema/mod.rs b/src/sql/schema/mod.rs new file mode 100644 index 00000000..f3bf1946 --- /dev/null +++ b/src/sql/schema/mod.rs @@ -0,0 +1,41 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +pub mod catalog_ddl; +pub mod column_descriptor; +pub mod connection_type; +pub mod connector_config; +pub mod kafka_operator_config; +pub mod source_table; +pub mod data_encoding_format; +pub mod schema_context; +pub mod schema_provider; +pub mod table; +pub mod table_execution_unit; +pub mod table_role; +pub mod temporal_pipeline_config; +pub mod utils; + +pub use catalog_ddl::{ + catalog_table_row_detail, schema_columns_one_line, show_create_catalog_table, +}; +pub use column_descriptor::ColumnDescriptor; +pub use connection_type::ConnectionType; +pub use connector_config::ConnectorConfig; +pub use source_table::SourceTable; + +/// Back-compat alias for [`SourceTable`]. 
+pub type ConnectorTable = SourceTable; +pub use schema_provider::{ + ObjectName, StreamPlanningContext, StreamSchemaProvider, StreamTable, +}; +pub use table::Table; diff --git a/src/sql/schema/schema_context.rs b/src/sql/schema/schema_context.rs new file mode 100644 index 00000000..232fd9e7 --- /dev/null +++ b/src/sql/schema/schema_context.rs @@ -0,0 +1,37 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use datafusion::arrow::datatypes::{DataType, Schema}; +use datafusion::common::{Result, DFSchema}; +use datafusion::logical_expr::Expr; +use datafusion_expr::ExprSchemable; + +pub trait SchemaContext { + fn resolve_expression(&self, expr: &Expr, schema: &Schema) -> Result; + fn extract_datatype(&self, expr: &Expr, schema: &Schema) -> Result; +} + +/// [`SchemaContext`] backed by a [`DFSchema`] built from the physical Arrow schema. 
+pub struct DfSchemaContext; + +impl SchemaContext for DfSchemaContext { + fn resolve_expression(&self, expr: &Expr, schema: &Schema) -> Result { + let df = DFSchema::try_from(schema.clone())?; + let _ = expr.get_type(&df)?; + Ok(expr.clone()) + } + + fn extract_datatype(&self, expr: &Expr, schema: &Schema) -> Result { + let df = DFSchema::try_from(schema.clone())?; + expr.get_type(&df) + } +} diff --git a/src/sql/schema/schema_provider.rs b/src/sql/schema/schema_provider.rs new file mode 100644 index 00000000..bbe03079 --- /dev/null +++ b/src/sql/schema/schema_provider.rs @@ -0,0 +1,430 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::collections::{BTreeMap, HashMap, HashSet}; +use std::sync::Arc; + +use datafusion::arrow::datatypes::{self as datatypes, DataType, Field, Schema}; +use datafusion::common::{DataFusionError, Result}; +use datafusion::datasource::{DefaultTableSource, TableProvider, TableType}; +use datafusion::execution::{FunctionRegistry, SessionStateDefaults}; +use datafusion::logical_expr::expr_rewriter::FunctionRewrite; +use datafusion::logical_expr::planner::ExprPlanner; +use datafusion::logical_expr::{AggregateUDF, Expr, ScalarUDF, TableSource, WindowUDF}; +use datafusion::optimizer::Analyzer; +use datafusion::sql::planner::ContextProvider; +use datafusion::sql::TableReference; +use unicase::UniCase; + +use crate::sql::logical_node::logical::{DylibUdfConfig, LogicalProgram}; +use crate::sql::common::constants::{planning_placeholder_udf, window_fn}; +use crate::sql::schema::table::Table as CatalogTable; +use crate::sql::schema::utils::window_arrow_struct; +use crate::sql::types::{PlaceholderUdf, PlanningOptions}; + +pub type ObjectName = UniCase; + +#[inline] +fn object_name(s: impl Into) -> ObjectName { + UniCase::new(s.into()) +} + +#[derive(Clone, Debug)] +pub enum StreamTable { + Source { + name: String, + connector: String, + schema: Arc, + event_time_field: Option, + watermark_field: Option, + /// Persisted `WITH` options for `SHOW CREATE TABLE`. + with_options: BTreeMap, + }, + Sink { + name: String, + program: LogicalProgram, + }, +} + +impl StreamTable { + pub fn name(&self) -> &str { + match self { + Self::Source { name, .. } | Self::Sink { name, .. } => name, + } + } + + pub fn schema(&self) -> Arc { + match self { + Self::Source { schema, .. } => Arc::clone(schema), + Self::Sink { program, .. 
} => program + .egress_arrow_schema() + .unwrap_or_else(|| Arc::new(Schema::empty())), + } + } +} + +#[derive(Debug, Clone)] +pub struct LogicalBatchInput { + pub table_name: String, + pub schema: Arc, +} + +#[async_trait::async_trait] +impl TableProvider for LogicalBatchInput { + fn as_any(&self) -> &dyn std::any::Any { + self + } + + fn schema(&self) -> Arc { + Arc::clone(&self.schema) + } + + fn table_type(&self) -> TableType { + TableType::Temporary + } + + async fn scan( + &self, + _state: &dyn datafusion::catalog::Session, + _projection: Option<&Vec>, + _filters: &[Expr], + _limit: Option, + ) -> Result> { + Ok(Arc::new(crate::sql::physical::FsMemExec::new( + self.table_name.clone(), + Arc::clone(&self.schema), + ))) + } +} + +#[derive(Clone, Default)] +pub struct FunctionCatalog { + pub scalars: HashMap>, + pub aggregates: HashMap>, + pub windows: HashMap>, + pub planners: Vec>, +} + +#[derive(Clone, Default)] +pub struct TableCatalog { + pub streams: HashMap>, + pub catalogs: HashMap>, + pub source_defs: HashMap, +} + +#[derive(Clone)] +pub struct StreamPlanningContext { + pub tables: TableCatalog, + pub functions: FunctionCatalog, + pub dylib_udfs: HashMap, + pub config_options: datafusion::config::ConfigOptions, + pub planning_options: PlanningOptions, + pub analyzer: Analyzer, +} + +impl Default for StreamPlanningContext { + fn default() -> Self { + Self { + tables: TableCatalog::default(), + functions: FunctionCatalog::default(), + dylib_udfs: HashMap::new(), + config_options: datafusion::config::ConfigOptions::default(), + planning_options: PlanningOptions::default(), + analyzer: Analyzer::default(), + } + } +} + +/// Back-compat name for [`StreamPlanningContext`]. 
+pub type StreamSchemaProvider = StreamPlanningContext; + +impl StreamPlanningContext { + pub fn builder() -> StreamPlanningContextBuilder { + StreamPlanningContextBuilder::default() + } + + /// Same registration order as the historical `StreamSchemaProvider::new` (placeholders, then DataFusion defaults). + pub fn new() -> Self { + Self::builder() + .with_streaming_extensions() + .expect("streaming extensions") + .with_default_functions() + .expect("default functions") + .build() + } + + pub fn register_stream_table(&mut self, table: StreamTable) { + let key = object_name(table.name().to_string()); + self.tables.streams.insert(key, Arc::new(table)); + } + + pub fn get_stream_table(&self, name: &str) -> Option> { + self.tables.streams.get(&object_name(name.to_string())).cloned() + } + + pub fn register_catalog_table(&mut self, table: CatalogTable) { + let key = object_name(table.name().to_string()); + self.tables.catalogs.insert(key, Arc::new(table)); + } + + pub fn get_catalog_table(&self, table_name: impl AsRef) -> Option<&CatalogTable> { + self.tables + .catalogs + .get(&object_name(table_name.as_ref().to_string())) + .map(|t| t.as_ref()) + } + + pub fn get_catalog_table_mut( + &mut self, + table_name: impl AsRef, + ) -> Option<&mut CatalogTable> { + self.tables + .catalogs + .get_mut(&object_name(table_name.as_ref().to_string())) + .map(|t| Arc::make_mut(t)) + } + + pub fn add_source_table( + &mut self, + name: String, + schema: Arc, + event_time_field: Option, + watermark_field: Option, + ) { + self.register_stream_table(StreamTable::Source { + name, + connector: "stream_catalog".to_string(), + schema, + event_time_field, + watermark_field, + with_options: BTreeMap::new(), + }); + } + + pub fn add_sink_table(&mut self, name: String, program: LogicalProgram) { + self.register_stream_table(StreamTable::Sink { name, program }); + } + + pub fn insert_table(&mut self, table: StreamTable) { + self.register_stream_table(table); + } + + /// Alias for 
[`Self::register_catalog_table`]. + pub fn insert_catalog_table(&mut self, table: CatalogTable) { + self.register_catalog_table(table); + } + + pub fn get_table(&self, table_name: impl AsRef) -> Option<&StreamTable> { + self.tables + .streams + .get(&object_name(table_name.as_ref().to_string())) + .map(|a| a.as_ref()) + } + + pub fn get_table_mut(&mut self, table_name: impl AsRef) -> Option<&mut StreamTable> { + self.tables + .streams + .get_mut(&object_name(table_name.as_ref().to_string())) + .map(|a| Arc::make_mut(a)) + } + + pub fn get_async_udf_options(&self, _name: &str) -> Option { + None + } + + fn create_table_source(name: String, schema: Arc) -> Arc { + let provider = LogicalBatchInput { table_name: name, schema }; + Arc::new(DefaultTableSource::new(Arc::new(provider))) + } +} + +impl ContextProvider for StreamPlanningContext { + fn get_table_source(&self, name: TableReference) -> Result> { + let table = self + .get_stream_table(name.table()) + .ok_or_else(|| DataFusionError::Plan(format!("Table {} not found", name)))?; + + Ok(Self::create_table_source(name.to_string(), table.schema())) + } + + fn get_function_meta(&self, name: &str) -> Option> { + self.functions.scalars.get(name).cloned() + } + + fn get_aggregate_meta(&self, name: &str) -> Option> { + self.functions.aggregates.get(name).cloned() + } + + fn get_window_meta(&self, name: &str) -> Option> { + self.functions.windows.get(name).cloned() + } + + fn get_variable_type(&self, _variable_names: &[String]) -> Option { + None + } + + fn options(&self) -> &datafusion::config::ConfigOptions { + &self.config_options + } + + fn udf_names(&self) -> Vec { + self.functions.scalars.keys().cloned().collect() + } + + fn udaf_names(&self) -> Vec { + self.functions.aggregates.keys().cloned().collect() + } + + fn udwf_names(&self) -> Vec { + self.functions.windows.keys().cloned().collect() + } + + fn get_expr_planners(&self) -> &[Arc] { + &self.functions.planners + } +} + +impl FunctionRegistry for 
StreamPlanningContext { + fn udfs(&self) -> HashSet { + self.functions.scalars.keys().cloned().collect() + } + + fn udf(&self, name: &str) -> Result> { + self.functions + .scalars + .get(name) + .cloned() + .ok_or_else(|| DataFusionError::Plan(format!("No UDF with name {name}"))) + } + + fn udaf(&self, name: &str) -> Result> { + self.functions + .aggregates + .get(name) + .cloned() + .ok_or_else(|| DataFusionError::Plan(format!("No UDAF with name {name}"))) + } + + fn udwf(&self, name: &str) -> Result> { + self.functions + .windows + .get(name) + .cloned() + .ok_or_else(|| DataFusionError::Plan(format!("No UDWF with name {name}"))) + } + + fn register_function_rewrite( + &mut self, + rewrite: Arc, + ) -> Result<()> { + self.analyzer.add_function_rewrite(rewrite); + Ok(()) + } + + fn register_udf(&mut self, udf: Arc) -> Result>> { + Ok(self.functions.scalars.insert(udf.name().to_string(), udf)) + } + + fn register_udaf(&mut self, udaf: Arc) -> Result>> { + Ok(self + .functions + .aggregates + .insert(udaf.name().to_string(), udaf)) + } + + fn register_udwf(&mut self, udwf: Arc) -> Result>> { + Ok(self.functions.windows.insert(udwf.name().to_string(), udwf)) + } + + fn register_expr_planner(&mut self, expr_planner: Arc) -> Result<()> { + self.functions.planners.push(expr_planner); + Ok(()) + } + + fn expr_planners(&self) -> Vec> { + self.functions.planners.clone() + } +} + +#[derive(Default)] +pub struct StreamPlanningContextBuilder { + context: StreamPlanningContext, +} + +impl StreamPlanningContextBuilder { + pub fn new() -> Self { + Self::default() + } + + pub fn with_default_functions(mut self) -> Result { + for p in SessionStateDefaults::default_scalar_functions() { + self.context.register_udf(p)?; + } + for p in SessionStateDefaults::default_aggregate_functions() { + self.context.register_udaf(p)?; + } + for p in SessionStateDefaults::default_window_functions() { + self.context.register_udwf(p)?; + } + for p in SessionStateDefaults::default_expr_planners() { + 
self.context.register_expr_planner(p)?; + } + Ok(self) + } + + pub fn with_streaming_extensions(mut self) -> Result { + let extensions = vec![ + PlaceholderUdf::with_return( + window_fn::HOP, + vec![ + DataType::Interval(datatypes::IntervalUnit::MonthDayNano), + DataType::Interval(datatypes::IntervalUnit::MonthDayNano), + ], + window_arrow_struct(), + ), + PlaceholderUdf::with_return( + window_fn::TUMBLE, + vec![DataType::Interval(datatypes::IntervalUnit::MonthDayNano)], + window_arrow_struct(), + ), + PlaceholderUdf::with_return( + window_fn::SESSION, + vec![DataType::Interval(datatypes::IntervalUnit::MonthDayNano)], + window_arrow_struct(), + ), + PlaceholderUdf::with_return( + planning_placeholder_udf::UNNEST, + vec![DataType::List(Arc::new(Field::new( + planning_placeholder_udf::LIST_ELEMENT_FIELD, + DataType::Utf8, + true, + )))], + DataType::Utf8, + ), + PlaceholderUdf::with_return( + planning_placeholder_udf::ROW_TIME, + vec![], + DataType::Timestamp(datatypes::TimeUnit::Nanosecond, None), + ), + ]; + + for ext in extensions { + self.context.register_udf(ext)?; + } + + Ok(self) + } + + pub fn build(self) -> StreamPlanningContext { + self.context + } +} diff --git a/src/sql/schema/source_table.rs b/src/sql/schema/source_table.rs new file mode 100644 index 00000000..fe4411dd --- /dev/null +++ b/src/sql/schema/source_table.rs @@ -0,0 +1,593 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::collections::{BTreeMap, HashMap}; +use std::sync::Arc; +use std::time::Duration; + +use datafusion::arrow::datatypes::{DataType, Field, FieldRef, Schema}; +use datafusion::common::{Column, DFSchema, Result, plan_datafusion_err, plan_err}; +use datafusion::error::DataFusionError; +use datafusion::logical_expr::Expr; +use datafusion_expr::ExprSchemable; +use datafusion::sql::planner::{PlannerContext, SqlToRel}; +use datafusion::sql::sqlparser::ast; +use datafusion::sql::TableReference; +use protocol::grpc::api::ConnectorOp; +use tracing::warn; + +use super::column_descriptor::ColumnDescriptor; +use super::connector_config::ConnectorConfig; +use super::data_encoding_format::DataEncodingFormat; +use super::schema_context::SchemaContext; +use super::table_execution_unit::{EngineDescriptor, SyncMode, TableExecutionUnit}; +use super::table_role::{ + apply_adapter_specific_rules, deduce_role, serialize_backend_params, + validate_adapter_availability, TableRole, +}; +use super::temporal_pipeline_config::{resolve_temporal_logic, TemporalPipelineConfig, TemporalSpec}; +use super::StreamSchemaProvider; +use crate::multifield_partial_ord; +use crate::sql::api::ConnectionProfile; +use crate::sql::common::constants::{ + connection_table_role, connector_type, sql_field, +}; +use crate::sql::common::connector_options::ConnectorOptions; +use crate::sql::common::with_option_keys as opt; +use crate::sql::common::{ + BadData, Format, Framing, FsSchema, JsonCompression, JsonFormat, +}; +use crate::sql::schema::kafka_operator_config::build_kafka_proto_config; +use crate::sql::schema::ConnectionType; +use crate::sql::schema::table::SqlSource; +use crate::sql::types::ProcessingMode; + +/// Connector-backed catalog table (adapter / source-sink model). 
+#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct SourceTable { + pub registry_id: Option, + pub adapter_type: String, + pub table_identifier: String, + pub role: TableRole, + pub schema_specs: Vec, + /// Strongly-typed connector runtime configuration — replaces the legacy `opaque_config: String`. + pub connector_config: ConnectorConfig, + pub temporal_config: TemporalPipelineConfig, + pub key_constraints: Vec, + pub payload_format: Option, + /// Wire [`Format`] when built from SQL `WITH` (updating mode, `ConnectionSchema`). + pub connection_format: Option, + pub description: String, + pub partition_exprs: Arc>>, + pub lookup_cache_max_bytes: Option, + pub lookup_cache_ttl: Option, + pub inferred_fields: Option>, + /// Original `WITH` options for catalog persistence / `SHOW CREATE TABLE`. + pub catalog_with_options: BTreeMap, +} + +multifield_partial_ord!( + SourceTable, + registry_id, + adapter_type, + table_identifier, + role, + description, + key_constraints, + connection_format, + catalog_with_options +); + +impl SourceTable { + #[inline] + pub fn name(&self) -> &str { + self.table_identifier.as_str() + } + + pub fn new( + table_identifier: impl Into, + connector: impl Into, + connection_type: ConnectionType, + ) -> Self { + Self { + registry_id: None, + adapter_type: connector.into(), + table_identifier: table_identifier.into(), + role: connection_type.into(), + schema_specs: Vec::new(), + connector_config: ConnectorConfig::Generic(HashMap::new()), + temporal_config: TemporalPipelineConfig::default(), + key_constraints: Vec::new(), + payload_format: None, + connection_format: None, + description: String::new(), + partition_exprs: Arc::new(None), + lookup_cache_max_bytes: None, + lookup_cache_ttl: None, + inferred_fields: None, + catalog_with_options: BTreeMap::new(), + } + } + + #[inline] + pub fn connector(&self) -> &str { + self.adapter_type.as_str() + } + + #[inline] + pub fn connection_type(&self) -> ConnectionType { + self.role.into() + } + + pub 
fn event_time_field(&self) -> Option<&str> { + self.temporal_config.event_column.as_deref() + } + + pub fn watermark_field(&self) -> Option<&str> { + self.temporal_config.watermark_strategy_column.as_deref() + } + + /// Watermark column name safe to persist for [`StreamTable::Source`]. Omits the computed + /// [`sql_field::COMPUTED_WATERMARK`] column: stream catalog only stores Arrow physical fields, + /// so `__watermark` cannot be resolved when the table is planned from the catalog. + pub fn stream_catalog_watermark_field(&self) -> Option { + self.temporal_config + .watermark_strategy_column + .as_deref() + .filter(|w| *w != sql_field::COMPUTED_WATERMARK) + .map(str::to_string) + } + + #[inline] + pub fn catalog_with_options(&self) -> &BTreeMap { + &self.catalog_with_options + } + + pub fn idle_time(&self) -> Option { + self.temporal_config.liveness_timeout + } + + pub fn initialize_from_params( + id: &str, + adapter: &str, + raw_columns: Vec, + pk_list: Vec, + time_meta: Option, + options: &mut HashMap, + _schema_ctx: &dyn SchemaContext, + ) -> Result { + validate_adapter_availability(adapter)?; + + let catalog_with_options: BTreeMap = options + .iter() + .map(|(k, v)| (k.clone(), v.clone())) + .collect(); + + let encoding = DataEncodingFormat::extract_from_map(options)?; + + let mut refined_columns = apply_adapter_specific_rules(adapter, raw_columns); + refined_columns = encoding.apply_envelope(refined_columns)?; + + let temporal_settings = resolve_temporal_logic(&refined_columns, time_meta)?; + let _finalized_config = serialize_backend_params(adapter, options)?; + let role = deduce_role(options)?; + + if role == TableRole::Ingestion && encoding.supports_delta_updates() && pk_list.is_empty() { + return plan_err!("CDC source requires at least one primary key"); + } + + Ok(Self { + registry_id: None, + adapter_type: adapter.to_string(), + table_identifier: id.to_string(), + role, + schema_specs: refined_columns, + connector_config: 
ConnectorConfig::Generic(catalog_with_options.clone().into_iter().collect()), + temporal_config: temporal_settings, + key_constraints: pk_list, + payload_format: Some(encoding), + connection_format: None, + description: String::new(), + partition_exprs: Arc::new(None), + lookup_cache_max_bytes: None, + lookup_cache_ttl: None, + inferred_fields: None, + catalog_with_options, + }) + } + + pub fn produce_physical_schema(&self) -> Schema { + Schema::new( + self.schema_specs + .iter() + .filter(|c| !c.is_computed()) + .map(|c| c.arrow_field().clone()) + .collect::>(), + ) + } + + #[inline] + pub fn physical_schema(&self) -> Schema { + self.produce_physical_schema() + } + + pub fn convert_to_execution_unit(&self) -> Result { + if self.role == TableRole::Egress { + return plan_err!("Target [{}] is write-only", self.table_identifier); + } + + if self.is_cdc_enabled() && self.schema_specs.iter().any(|c| c.is_computed()) { + return plan_err!("CDC cannot be mixed with computed columns natively"); + } + + let mode = if self.is_cdc_enabled() { + SyncMode::Incremental + } else { + SyncMode::AppendOnly + }; + + Ok(TableExecutionUnit { + label: self.table_identifier.clone(), + engine_meta: EngineDescriptor { + engine_type: self.adapter_type.clone(), + raw_payload: String::new(), + }, + sync_mode: mode, + temporal_offset: self.temporal_config.clone(), + }) + } + + #[inline] + pub fn to_execution_unit(&self) -> Result { + self.convert_to_execution_unit() + } + + fn is_cdc_enabled(&self) -> bool { + self.payload_format + .as_ref() + .is_some_and(|f| f.supports_delta_updates()) + } + + #[allow(clippy::too_many_arguments)] + pub fn from_options( + table_identifier: &str, + connector_name: &str, + temporary: bool, + fields: Vec, + primary_keys: Vec, + watermark: Option<(String, Option)>, + options: &mut ConnectorOptions, + connection_profile: Option<&ConnectionProfile>, + schema_provider: &StreamSchemaProvider, + connection_type_override: Option, + description: String, + ) -> Result { + 
let _ = connection_profile; + + let catalog_with_options = options.snapshot_for_catalog(); + + if let Some(c) = options.pull_opt_str(opt::CONNECTOR)? { + if c != connector_name { + return plan_err!( + "WITH option `connector` is '{c}' but table uses connector '{connector_name}'" + ); + } + } + + validate_adapter_availability(connector_name)?; + + let mut columns = fields; + columns = apply_adapter_specific_rules(connector_name, columns); + + let format = Format::from_opts(options) + .map_err(|e| DataFusionError::Plan(format!("invalid format: '{e}'")))?; + + if let Some(Format::Json(JsonFormat { compression, .. })) = &format + && !matches!(compression, JsonCompression::Uncompressed) + && connector_name != connector_type::FILESYSTEM + { + return plan_err!("'json.compression' is only supported for the filesystem connector"); + } + + let _framing = Framing::from_opts(options) + .map_err(|e| DataFusionError::Plan(format!("invalid framing: '{e}'")))?; + + if temporary + && let Some(t) = options.insert_str(opt::TYPE, connection_table_role::LOOKUP)? 
+ && t != connection_table_role::LOOKUP + { + return plan_err!( + "Cannot have a temporary table with type '{t}'; temporary tables must be type 'lookup'" + ); + } + + let payload_format = format.as_ref().map(DataEncodingFormat::from_connection_format); + let encoding = payload_format.unwrap_or(DataEncodingFormat::Raw); + columns = encoding.apply_envelope(columns)?; + + let bad_data = BadData::from_opts(options) + .map_err(|e| DataFusionError::Plan(format!("Invalid bad_data: '{e}'")))?; + + let role = if let Some(t) = connection_type_override { + t.into() + } else { + match options.pull_opt_str(opt::TYPE)?.as_deref() { + None | Some(connection_table_role::SOURCE) => TableRole::Ingestion, + Some(connection_table_role::SINK) => TableRole::Egress, + Some(connection_table_role::LOOKUP) => TableRole::Reference, + Some(other) => { + return plan_err!("invalid connection type '{other}' in WITH options"); + } + } + }; + + let mut table = SourceTable { + registry_id: None, + adapter_type: connector_name.to_string(), + table_identifier: table_identifier.to_string(), + role, + schema_specs: columns, + connector_config: ConnectorConfig::Generic(HashMap::new()), + temporal_config: TemporalPipelineConfig::default(), + key_constraints: Vec::new(), + payload_format, + connection_format: format.clone(), + description, + partition_exprs: Arc::new(None), + lookup_cache_max_bytes: None, + lookup_cache_ttl: None, + inferred_fields: None, + catalog_with_options, + }; + + if let Some(event_time_field) = options.pull_opt_field(opt::EVENT_TIME_FIELD)? { + warn!("`event_time_field` WITH option is deprecated; use WATERMARK FOR syntax"); + table.temporal_config.event_column = Some(event_time_field); + } + + if let Some(watermark_field) = options.pull_opt_field(opt::WATERMARK_FIELD)? 
{ +            warn!("`watermark_field` WITH option is deprecated; use WATERMARK FOR syntax"); +            table.temporal_config.watermark_strategy_column = Some(watermark_field); +        } + +        if let Some((time_field, watermark_expr)) = watermark { +            let field = table +                .schema_specs +                .iter() +                .find(|c| c.arrow_field().name().as_str() == time_field.as_str()) +                .ok_or_else(|| { +                    plan_datafusion_err!( +                        "WATERMARK FOR field `{}` does not exist in table", +                        time_field +                    ) +                })?; + +            if !matches!(field.arrow_field().data_type(), DataType::Timestamp(_, None)) { +                return plan_err!( +                    "WATERMARK FOR field `{time_field}` has type {}, but expected TIMESTAMP", +                    field.arrow_field().data_type() +                ); +            } + +            // The time column referenced by the watermark is semantically non-null, so force it to NOT NULL +            // here; otherwise omitting NOT NULL in CREATE TABLE would fail later expression nullability checks. +            for col in table.schema_specs.iter_mut() { +                if col.arrow_field().name().as_str() == time_field.as_str() { +                    col.set_nullable(false); +                    break; +                } +            } + +            let table_ref = TableReference::bare(table.table_identifier.as_str()); +            let df_schema = +                DFSchema::try_from_qualified_schema(table_ref, &table.produce_physical_schema())?; + +            table.temporal_config.event_column = Some(time_field.clone()); + +            if let Some(expr) = watermark_expr { +                let logical_expr = plan_generating_expr(&expr, &df_schema, schema_provider) +                    .map_err(|e| { +                        DataFusionError::Plan(format!("could not plan watermark expression: {e}")) +                    })?; + +                let (data_type, _nullable) = logical_expr.data_type_and_nullable(&df_schema)?; +                if !matches!(data_type, DataType::Timestamp(_, _)) { +                    return plan_err!( +                        "the type of the WATERMARK FOR expression must be TIMESTAMP, but was {data_type}" +                    ); +                } + +                table.schema_specs.push(ColumnDescriptor::new_computed( +                    Field::new( +                        sql_field::COMPUTED_WATERMARK, +                        logical_expr.get_type(&df_schema)?, +                        false, +                    ), +                    logical_expr, +                )); +                table.temporal_config.watermark_strategy_column = +                    Some(sql_field::COMPUTED_WATERMARK.to_string()); +            } else { +                table.temporal_config.watermark_strategy_column = 
Some(time_field); + } + } + + let idle_from_micros = options + .pull_opt_i64(opt::IDLE_MICROS)? + .filter(|t| *t > 0) + .map(|t| Duration::from_micros(t as u64)); + let idle_from_duration = options.pull_opt_duration(opt::IDLE_TIME)?; + table.temporal_config.liveness_timeout = idle_from_micros.or(idle_from_duration); + + table.lookup_cache_max_bytes = options.pull_opt_u64(opt::LOOKUP_CACHE_MAX_BYTES)?; + + table.lookup_cache_ttl = options.pull_opt_duration(opt::LOOKUP_CACHE_TTL)?; + + if connector_name.eq_ignore_ascii_case(connector_type::KAFKA) { + let proto_cfg = build_kafka_proto_config(options, role, &format, bad_data)?; + table.connector_config = match proto_cfg { + protocol::grpc::api::connector_op::Config::KafkaSource(cfg) => { + ConnectorConfig::KafkaSource(cfg) + } + protocol::grpc::api::connector_op::Config::KafkaSink(cfg) => { + ConnectorConfig::KafkaSink(cfg) + } + protocol::grpc::api::connector_op::Config::Generic(g) => { + ConnectorConfig::Generic(g.properties) + } + }; + } else { + let extra_opts = options.drain_remaining_string_values()?; + table.connector_config = ConnectorConfig::Generic(extra_opts); + } + + if role == TableRole::Ingestion && encoding.supports_delta_updates() && primary_keys.is_empty() + { + return plan_err!("Debezium source must have at least one PRIMARY KEY field"); + } + + table.key_constraints = primary_keys; + + Ok(table) + } + + pub fn has_virtual_fields(&self) -> bool { + self.schema_specs.iter().any(|c| c.is_computed()) + } + + pub fn is_updating(&self) -> bool { + self.connection_format + .as_ref() + .is_some_and(|f| f.is_updating()) + || self.payload_format == Some(DataEncodingFormat::DebeziumJson) + } + + /// Build strongly-typed `ConnectorOp` protobuf for runtime operator construction. + /// + /// Directly maps the in-memory [`ConnectorConfig`] to the proto `oneof config` — zero JSON, + /// zero re-parsing. 
+ pub fn connector_op(&self) -> ConnectorOp { + let physical = self.produce_physical_schema(); + let fields: Vec = physical + .fields() + .iter() + .map(|f| f.as_ref().clone()) + .collect(); + let fs_schema = FsSchema::from_fields(fields); + + ConnectorOp { + connector: self.adapter_type.clone(), + fs_schema: Some(fs_schema.into()), + name: self.table_identifier.clone(), + description: self.description.clone(), + config: Some(self.connector_config.to_proto_config()), + } + } + + pub fn processing_mode(&self) -> ProcessingMode { + if self.is_updating() { + ProcessingMode::Update + } else { + ProcessingMode::Append + } + } + + pub fn timestamp_override(&self) -> Result> { + if let Some(field_name) = self.temporal_config.event_column.clone() { + if self.is_updating() { + return plan_err!("can't use event_time_field with update mode"); + } + let _field = self.get_time_column(&field_name)?; + Ok(Some(Expr::Column(Column::from_name(field_name.as_str())))) + } else { + Ok(None) + } + } + + fn get_time_column(&self, field_name: &str) -> Result<&ColumnDescriptor> { + self.schema_specs + .iter() + .find(|c| { + c.arrow_field().name() == field_name + && matches!(c.arrow_field().data_type(), DataType::Timestamp(..)) + }) + .ok_or_else(|| { + DataFusionError::Plan(format!( + "field {field_name} not found or not a timestamp" + )) + }) + } + + pub fn watermark_column(&self) -> Result> { + if let Some(field_name) = self.temporal_config.watermark_strategy_column.clone() { + let _field = self.get_time_column(&field_name)?; + Ok(Some(Expr::Column(Column::from_name(field_name.as_str())))) + } else { + Ok(None) + } + } + + pub fn as_sql_source(&self) -> Result { + match self.role { + TableRole::Ingestion => {} + TableRole::Egress | TableRole::Reference => { + return plan_err!("cannot read from sink"); + } + }; + + if self.is_updating() && self.has_virtual_fields() { + return plan_err!("can't read from a source with virtual fields and update mode."); + } + + let timestamp_override = 
self.timestamp_override()?; + let watermark_column = self.watermark_column()?; + + let source = SqlSource { + id: self.registry_id, + struct_def: self + .schema_specs + .iter() + .filter(|c| !c.is_computed()) + .map(|c| Arc::new(c.arrow_field().clone())) + .collect(), + config: self.connector_op(), + processing_mode: self.processing_mode(), + idle_time: self.temporal_config.liveness_timeout, + }; + + Ok(SourceOperator { + name: self.table_identifier.clone(), + source, + timestamp_override, + watermark_column, + }) + } +} + +/// Plan a SQL scalar expression against a table-qualified schema (e.g. watermark `AS` clause). +fn plan_generating_expr( + ast: &ast::Expr, + df_schema: &DFSchema, + schema_provider: &StreamSchemaProvider, +) -> Result { + let planner = SqlToRel::new(schema_provider); + let mut ctx = PlannerContext::new(); + planner.sql_to_expr(ast.clone(), df_schema, &mut ctx) +} + +#[derive(Debug, Clone)] +pub struct SourceOperator { + pub name: String, + pub source: SqlSource, + pub timestamp_override: Option, + pub watermark_column: Option, +} diff --git a/src/sql/schema/table.rs b/src/sql/schema/table.rs new file mode 100644 index 00000000..156e8ffd --- /dev/null +++ b/src/sql/schema/table.rs @@ -0,0 +1,162 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::sync::Arc; +use std::time::Duration; +use datafusion::arrow::datatypes::FieldRef; +use datafusion::common::{Result, plan_err}; +use datafusion::logical_expr::{Extension, LogicalPlan}; +use datafusion::sql::sqlparser::ast::Statement; +use protocol::grpc::api::ConnectorOp; +use super::source_table::SourceTable; +use crate::sql::logical_planner::optimizers::produce_optimized_plan; +use crate::sql::schema::StreamSchemaProvider; +use crate::sql::extensions::remote_table::RemoteTableBoundaryNode; +use crate::sql::analysis::rewrite_plan; +use crate::sql::types::{DFField, ProcessingMode}; + +/// Represents all table types in the FunctionStream SQL catalog. +#[allow(clippy::enum_variant_names)] +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub enum Table { + /// A lookup table backed by an external connector. + LookupTable(SourceTable), + /// A source/sink table backed by an external connector. + ConnectorTable(SourceTable), + /// A table defined by a query (CREATE VIEW / CREATE TABLE AS SELECT). + TableFromQuery { + name: String, + logical_plan: LogicalPlan, + }, +} + +impl Table { + /// Try to construct a Table from a CREATE TABLE or CREATE VIEW statement. + pub fn try_from_statement( + statement: &Statement, + schema_provider: &StreamSchemaProvider, + ) -> Result> { + use datafusion::logical_expr::{CreateMemoryTable, CreateView, DdlStatement}; + use datafusion::sql::sqlparser::ast::CreateTable; + + if let Statement::CreateTable(CreateTable { query: None, .. }) = statement { + return plan_err!( + "CREATE TABLE without AS SELECT is not supported; use CREATE TABLE ... AS SELECT or a connector table" + ); + } + + match produce_optimized_plan(statement, schema_provider) { + Ok(LogicalPlan::Ddl(DdlStatement::CreateView(CreateView { name, input, .. }))) + | Ok(LogicalPlan::Ddl(DdlStatement::CreateMemoryTable(CreateMemoryTable { + name, + input, + .. 
+ }))) => { + let rewritten = rewrite_plan(input.as_ref().clone(), schema_provider)?; + let schema = rewritten.schema().clone(); + let remote = RemoteTableBoundaryNode { + upstream_plan: rewritten, + table_identifier: name.to_owned(), + resolved_schema: schema, + requires_materialization: true, + }; + Ok(Some(Table::TableFromQuery { + name: name.to_string(), + logical_plan: LogicalPlan::Extension(Extension { + node: Arc::new(remote), + }), + })) + } + _ => Ok(None), + } + } + + pub fn name(&self) -> &str { + match self { + Table::TableFromQuery { name, .. } => name.as_str(), + Table::ConnectorTable(c) | Table::LookupTable(c) => c.name(), + } + } + + pub fn get_fields(&self) -> Vec { + match self { + Table::ConnectorTable(SourceTable { + schema_specs, + inferred_fields, + .. + }) + | Table::LookupTable(SourceTable { + schema_specs, + inferred_fields, + .. + }) => inferred_fields.clone().unwrap_or_else(|| { + schema_specs + .iter() + .map(|c| Arc::new(c.arrow_field().clone())) + .collect() + }), + Table::TableFromQuery { logical_plan, .. } => { + logical_plan.schema().fields().iter().cloned().collect() + } + } + } + + pub fn set_inferred_fields(&mut self, fields: Vec) -> Result<()> { + let Table::ConnectorTable(t) = self else { + return Ok(()); + }; + + if !t.schema_specs.is_empty() { + return Ok(()); + } + + if let Some(existing) = &t.inferred_fields { + let matches = existing.len() == fields.len() + && existing + .iter() + .zip(&fields) + .all(|(a, b)| a.name() == b.name() && a.data_type() == b.data_type()); + + if !matches { + return plan_err!("all inserts into a table must share the same schema"); + } + } + + let fields: Vec<_> = fields.into_iter().map(|f| f.field().clone()).collect(); + t.inferred_fields.replace(fields); + + Ok(()) + } + + pub fn connector_op(&self) -> Result { + match self { + Table::ConnectorTable(c) | Table::LookupTable(c) => Ok(c.connector_op()), + Table::TableFromQuery { .. 
} => plan_err!("can't write to a query-defined table"), + } + } + + pub fn partition_exprs(&self) -> Option<&Vec> { + match self { + Table::ConnectorTable(c) => (*c.partition_exprs).as_ref(), + _ => None, + } + } +} + +#[derive(Clone, Debug)] +pub struct SqlSource { + pub id: Option, + pub struct_def: Vec, + pub config: ConnectorOp, + pub processing_mode: ProcessingMode, + pub idle_time: Option, +} diff --git a/src/sql/schema/table_execution_unit.rs b/src/sql/schema/table_execution_unit.rs new file mode 100644 index 00000000..c23dda7a --- /dev/null +++ b/src/sql/schema/table_execution_unit.rs @@ -0,0 +1,33 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use super::temporal_pipeline_config::TemporalPipelineConfig; + +#[derive(Debug, Clone)] +pub struct EngineDescriptor { + pub engine_type: String, + pub raw_payload: String, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum SyncMode { + AppendOnly, + Incremental, +} + +#[derive(Debug, Clone)] +pub struct TableExecutionUnit { + pub label: String, + pub engine_meta: EngineDescriptor, + pub sync_mode: SyncMode, + pub temporal_offset: TemporalPipelineConfig, +} diff --git a/src/sql/schema/table_role.rs b/src/sql/schema/table_role.rs new file mode 100644 index 00000000..bf3fed74 --- /dev/null +++ b/src/sql/schema/table_role.rs @@ -0,0 +1,96 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::collections::HashMap;
+
+use datafusion::arrow::datatypes::{DataType, TimeUnit};
+use datafusion::common::{Result, plan_err};
+use datafusion::error::DataFusionError;
+
+use super::column_descriptor::ColumnDescriptor;
+use super::connection_type::ConnectionType;
+use crate::sql::common::constants::{
+    connection_table_role, connector_type, SUPPORTED_CONNECTOR_ADAPTERS,
+};
+use crate::sql::common::with_option_keys as opt;
+
+/// Role of a connector-backed table in the pipeline (ingest / egress / lookup).
+#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
+pub enum TableRole {
+    Ingestion,
+    Egress,
+    Reference,
+}
+
+impl From<TableRole> for ConnectionType {
+    fn from(r: TableRole) -> Self {
+        match r {
+            TableRole::Ingestion => ConnectionType::Source,
+            TableRole::Egress => ConnectionType::Sink,
+            TableRole::Reference => ConnectionType::Lookup,
+        }
+    }
+}
+
+impl From<ConnectionType> for TableRole {
+    fn from(c: ConnectionType) -> Self {
+        match c {
+            ConnectionType::Source => TableRole::Ingestion,
+            ConnectionType::Sink => TableRole::Egress,
+            ConnectionType::Lookup => TableRole::Reference,
+        }
+    }
+}
+
+pub fn validate_adapter_availability(adapter: &str) -> Result<()> {
+    if !SUPPORTED_CONNECTOR_ADAPTERS.contains(&adapter) {
+        return Err(DataFusionError::Plan(format!("Unknown adapter '{adapter}'")));
+    }
+    Ok(())
+}
+
+pub fn apply_adapter_specific_rules(adapter: &str, mut cols: Vec<ColumnDescriptor>) -> Vec<ColumnDescriptor> {
+    match adapter {
+        a if a == connector_type::DELTA || a == connector_type::ICEBERG => {
+            for c in &mut cols {
+                if matches!(c.data_type(), DataType::Timestamp(_, _)) {
+                    c.force_precision(TimeUnit::Microsecond);
+                }
+            }
+            cols
+        }
+        _ => cols,
+    }
+}
+
+pub fn deduce_role(options: &HashMap<String, String>) -> Result<TableRole> {
+    match options.get(opt::TYPE).map(|s| s.as_str()) {
+        None | Some(connection_table_role::SOURCE) => Ok(TableRole::Ingestion),
+        Some(connection_table_role::SINK) => Ok(TableRole::Egress),
+        Some(connection_table_role::LOOKUP) => Ok(TableRole::Reference),
+        Some(other) => plan_err!("Invalid role '{other}'"),
+    }
+}
+
+pub fn serialize_backend_params(adapter: &str, options: &HashMap<String, String>) -> Result<String> {
+    let mut payload = serde_json::Map::new();
+    payload.insert(
+        opt::ADAPTER.to_string(),
+        serde_json::Value::String(adapter.to_string()),
+    );
+
+    for (k, v) in options {
+        payload.insert(k.clone(), serde_json::Value::String(v.clone()));
+    }
+
+    serde_json::to_string(&payload).map_err(|e| DataFusionError::Plan(e.to_string()))
+}
diff --git a/src/sql/schema/temporal_pipeline_config.rs b/src/sql/schema/temporal_pipeline_config.rs
new file mode 100644
index 00000000..f672e552
--- /dev/null
+++ b/src/sql/schema/temporal_pipeline_config.rs
@@ -0,0 +1,58 @@
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::time::Duration;
+
+use datafusion::common::{Result, plan_err};
+use datafusion::logical_expr::Expr;
+
+use super::column_descriptor::ColumnDescriptor;
+use crate::sql::common::constants::sql_field;
+
+/// Event-time and watermark configuration for streaming tables.
+#[derive(Debug, Clone, Default, PartialEq, Eq, Hash)]
+pub struct TemporalPipelineConfig {
+    pub event_column: Option<String>,
+    pub watermark_strategy_column: Option<String>,
+    pub liveness_timeout: Option<Duration>,
+}
+
+#[derive(Debug, Clone)]
+pub struct TemporalSpec {
+    pub time_field: String,
+    pub watermark_expr: Option<Expr>,
+}
+
+pub fn resolve_temporal_logic(
+    columns: &[ColumnDescriptor],
+    time_meta: Option<TemporalSpec>,
+) -> Result<TemporalPipelineConfig> {
+    let mut config = TemporalPipelineConfig::default();
+
+    if let Some(meta) = time_meta {
+        let field_exists = columns
+            .iter()
+            .any(|c| c.arrow_field().name() == meta.time_field.as_str());
+        if !field_exists {
+            return plan_err!("Temporal field {} does not exist", meta.time_field);
+        }
+        config.event_column = Some(meta.time_field.clone());
+
+        if meta.watermark_expr.is_some() {
+            config.watermark_strategy_column = Some(sql_field::COMPUTED_WATERMARK.to_string());
+        } else {
+            config.watermark_strategy_column = Some(meta.time_field);
+        }
+    }
+
+    Ok(config)
+}
diff --git a/src/sql/schema/utils.rs b/src/sql/schema/utils.rs
new file mode 100644
index 00000000..ba408f22
--- /dev/null
+++ b/src/sql/schema/utils.rs
@@ -0,0 +1,79 @@
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +use std::collections::HashMap; +use std::sync::Arc; + +use datafusion::arrow::datatypes::{DataType, Field, Schema, SchemaRef, TimeUnit}; +use datafusion::common::{DFSchema, DFSchemaRef, Result as DFResult, TableReference}; + +use crate::sql::common::constants::window_interval_field; +use crate::sql::types::{DFField, TIMESTAMP_FIELD}; + +/// Returns the Arrow struct type for a window (start, end) pair. +pub fn window_arrow_struct() -> DataType { + DataType::Struct( + vec![ + Arc::new(Field::new( + window_interval_field::START, + DataType::Timestamp(TimeUnit::Nanosecond, None), + false, + )), + Arc::new(Field::new( + window_interval_field::END, + DataType::Timestamp(TimeUnit::Nanosecond, None), + false, + )), + ] + .into(), + ) +} + +/// Adds a `_timestamp` field to a DFSchema if it doesn't already have one. +pub fn add_timestamp_field( + schema: DFSchemaRef, + qualifier: Option, +) -> DFResult { + if has_timestamp_field(&schema) { + return Ok(schema); + } + + let timestamp_field = DFField::new( + qualifier, + TIMESTAMP_FIELD, + DataType::Timestamp(TimeUnit::Nanosecond, None), + false, + ); + Ok(Arc::new(schema.join(&DFSchema::new_with_metadata( + vec![timestamp_field.into()], + HashMap::new(), + )?)?)) +} + +/// Checks whether a DFSchema contains a `_timestamp` field. +pub fn has_timestamp_field(schema: &DFSchemaRef) -> bool { + schema + .fields() + .iter() + .any(|field| field.name() == TIMESTAMP_FIELD) +} + +/// Adds a `_timestamp` field to an Arrow Schema, returning a new SchemaRef. 
+pub fn add_timestamp_field_arrow(schema: Schema) -> SchemaRef { + let mut fields = schema.fields().to_vec(); + fields.push(Arc::new(Field::new( + TIMESTAMP_FIELD, + DataType::Timestamp(TimeUnit::Nanosecond, None), + false, + ))); + Arc::new(Schema::new(fields)) +} diff --git a/src/sql/types/data_type.rs b/src/sql/types/data_type.rs new file mode 100644 index 00000000..4736f812 --- /dev/null +++ b/src/sql/types/data_type.rs @@ -0,0 +1,157 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::sync::Arc; + +use datafusion::arrow::datatypes::{ + DECIMAL_DEFAULT_SCALE, DECIMAL128_MAX_PRECISION, DataType, Field, IntervalUnit, TimeUnit, +}; +use datafusion::common::{Result, plan_datafusion_err, plan_err}; + +use crate::sql::common::constants::planning_placeholder_udf; +use crate::sql::common::FsExtensionType; + +pub fn convert_data_type( + sql_type: &datafusion::sql::sqlparser::ast::DataType, +) -> Result<(DataType, Option)> { + use datafusion::sql::sqlparser::ast::ArrayElemTypeDef; + use datafusion::sql::sqlparser::ast::DataType as SQLDataType; + + match sql_type { + SQLDataType::Array(ArrayElemTypeDef::AngleBracket(inner_sql_type)) + | SQLDataType::Array(ArrayElemTypeDef::SquareBracket(inner_sql_type, _)) => { + let (data_type, extension) = convert_simple_data_type(inner_sql_type)?; + + Ok(( + DataType::List(Arc::new(FsExtensionType::add_metadata( + extension, + Field::new(planning_placeholder_udf::LIST_ELEMENT_FIELD, data_type, true), + ))), + None, + )) + } + SQLDataType::Array(ArrayElemTypeDef::None) => { + plan_err!("Arrays with unspecified type is not supported") + } + other => convert_simple_data_type(other), + } +} + +fn convert_simple_data_type( + sql_type: &datafusion::sql::sqlparser::ast::DataType, +) -> Result<(DataType, Option)> { + use datafusion::sql::sqlparser::ast::DataType as SQLDataType; + use datafusion::sql::sqlparser::ast::{ExactNumberInfo, TimezoneInfo}; + + if matches!(sql_type, SQLDataType::JSON) { + return Ok((DataType::Utf8, Some(FsExtensionType::JSON))); + } + + let dt = match sql_type { + SQLDataType::Boolean | SQLDataType::Bool => Ok(DataType::Boolean), + SQLDataType::TinyInt(_) => Ok(DataType::Int8), + SQLDataType::SmallInt(_) | SQLDataType::Int2(_) => Ok(DataType::Int16), + SQLDataType::Int(_) | SQLDataType::Integer(_) | SQLDataType::Int4(_) => Ok(DataType::Int32), + SQLDataType::BigInt(_) | SQLDataType::Int8(_) => Ok(DataType::Int64), + SQLDataType::TinyIntUnsigned(_) => Ok(DataType::UInt8), + 
SQLDataType::SmallIntUnsigned(_) | SQLDataType::Int2Unsigned(_) => Ok(DataType::UInt16), + SQLDataType::IntUnsigned(_) + | SQLDataType::UnsignedInteger + | SQLDataType::Int4Unsigned(_) => Ok(DataType::UInt32), + SQLDataType::BigIntUnsigned(_) | SQLDataType::Int8Unsigned(_) => Ok(DataType::UInt64), + SQLDataType::Float(_) => Ok(DataType::Float32), + SQLDataType::Real | SQLDataType::Float4 => Ok(DataType::Float32), + SQLDataType::Double(_) | SQLDataType::DoublePrecision | SQLDataType::Float8 => { + Ok(DataType::Float64) + } + SQLDataType::Char(_) + | SQLDataType::Varchar(_) + | SQLDataType::Text + | SQLDataType::String(_) => Ok(DataType::Utf8), + SQLDataType::Timestamp(None, TimezoneInfo::None) | SQLDataType::Datetime(_) => { + Ok(DataType::Timestamp(TimeUnit::Nanosecond, None)) + } + SQLDataType::Timestamp(Some(precision), TimezoneInfo::None) => match *precision { + 0 => Ok(DataType::Timestamp(TimeUnit::Second, None)), + 3 => Ok(DataType::Timestamp(TimeUnit::Millisecond, None)), + 6 => Ok(DataType::Timestamp(TimeUnit::Microsecond, None)), + 9 => Ok(DataType::Timestamp(TimeUnit::Nanosecond, None)), + _ => { + return plan_err!( + "unsupported precision {} -- supported precisions are 0 (seconds), \ + 3 (milliseconds), 6 (microseconds), and 9 (nanoseconds)", + precision + ); + } + }, + SQLDataType::Date => Ok(DataType::Date32), + SQLDataType::Time(None, tz_info) => { + if matches!(tz_info, TimezoneInfo::None) + || matches!(tz_info, TimezoneInfo::WithoutTimeZone) + { + Ok(DataType::Time64(TimeUnit::Nanosecond)) + } else { + return plan_err!("Unsupported SQL type {sql_type:?}"); + } + } + SQLDataType::Numeric(exact_number_info) | SQLDataType::Decimal(exact_number_info) => { + let (precision, scale) = match *exact_number_info { + ExactNumberInfo::None => (None, None), + ExactNumberInfo::Precision(precision) => (Some(precision), None), + ExactNumberInfo::PrecisionAndScale(precision, scale) => { + (Some(precision), Some(scale)) + } + }; + make_decimal_type(precision, scale) 
+ } + SQLDataType::Bytea => Ok(DataType::Binary), + SQLDataType::Interval => Ok(DataType::Interval(IntervalUnit::MonthDayNano)), + SQLDataType::Struct(fields, _) => { + let fields: Vec<_> = fields + .iter() + .map(|f| { + Ok::<_, datafusion::error::DataFusionError>(Arc::new(Field::new( + f.field_name + .as_ref() + .ok_or_else(|| { + plan_datafusion_err!("anonymous struct fields are not allowed") + })? + .to_string(), + convert_data_type(&f.field_type)?.0, + true, + ))) + }) + .collect::>()?; + Ok(DataType::Struct(fields.into())) + } + _ => return plan_err!("Unsupported SQL type {sql_type:?}"), + }; + + Ok((dt?, None)) +} + +fn make_decimal_type(precision: Option, scale: Option) -> Result { + let (precision, scale) = match (precision, scale) { + (Some(p), Some(s)) => (p as u8, s as i8), + (Some(p), None) => (p as u8, 0), + (None, Some(_)) => return plan_err!("Cannot specify only scale for decimal data type"), + (None, None) => (DECIMAL128_MAX_PRECISION, DECIMAL_DEFAULT_SCALE), + }; + + if precision == 0 || precision > DECIMAL128_MAX_PRECISION || scale.unsigned_abs() > precision { + plan_err!( + "Decimal(precision = {precision}, scale = {scale}) should satisfy `0 < precision <= 38`, and `scale <= precision`." + ) + } else { + Ok(DataType::Decimal128(precision, scale)) + } +} diff --git a/src/sql/types/df_field.rs b/src/sql/types/df_field.rs new file mode 100644 index 00000000..435ae30a --- /dev/null +++ b/src/sql/types/df_field.rs @@ -0,0 +1,153 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::HashMap; +use std::sync::Arc; + +use datafusion::arrow::datatypes::{DataType, Field, FieldRef}; +use datafusion::common::{Column, DFSchema, Result, TableReference}; + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct DFField { + qualifier: Option, + field: FieldRef, +} + +impl From<(Option, FieldRef)> for DFField { + fn from(value: (Option, FieldRef)) -> Self { + Self { + qualifier: value.0, + field: value.1, + } + } +} + +impl From<(Option<&TableReference>, &Field)> for DFField { + fn from(value: (Option<&TableReference>, &Field)) -> Self { + Self { + qualifier: value.0.cloned(), + field: Arc::new(value.1.clone()), + } + } +} + +impl From for (Option, FieldRef) { + fn from(value: DFField) -> Self { + (value.qualifier, value.field) + } +} + +impl DFField { + pub fn new( + qualifier: Option, + name: impl Into, + data_type: DataType, + nullable: bool, + ) -> Self { + Self { + qualifier, + field: Arc::new(Field::new(name, data_type, nullable)), + } + } + + pub fn new_unqualified(name: &str, data_type: DataType, nullable: bool) -> Self { + DFField { + qualifier: None, + field: Arc::new(Field::new(name, data_type, nullable)), + } + } + + pub fn name(&self) -> &String { + self.field.name() + } + + pub fn data_type(&self) -> &DataType { + self.field.data_type() + } + + pub fn is_nullable(&self) -> bool { + self.field.is_nullable() + } + + pub fn metadata(&self) -> &HashMap { + self.field.metadata() + } + + pub fn qualified_name(&self) -> String { + if let Some(qualifier) = &self.qualifier { + format!("{}.{}", qualifier, self.field.name()) + } else { + self.field.name().to_owned() + } + } + + pub fn qualified_column(&self) -> Column { + Column { + relation: self.qualifier.clone(), + name: self.field.name().to_string(), + spans: Default::default(), + } + } + + pub fn unqualified_column(&self) -> Column { + Column { + relation: None, + 
name: self.field.name().to_string(), + spans: Default::default(), + } + } + + pub fn qualifier(&self) -> Option<&TableReference> { + self.qualifier.as_ref() + } + + pub fn field(&self) -> &FieldRef { + &self.field + } + + pub fn strip_qualifier(mut self) -> Self { + self.qualifier = None; + self + } + + pub fn with_nullable(mut self, nullable: bool) -> Self { + let f = self.field().as_ref().clone().with_nullable(nullable); + self.field = f.into(); + self + } + + pub fn with_metadata(mut self, metadata: HashMap) -> Self { + let f = self.field().as_ref().clone().with_metadata(metadata); + self.field = f.into(); + self + } +} + +pub fn fields_with_qualifiers(schema: &DFSchema) -> Vec { + schema + .fields() + .iter() + .enumerate() + .map(|(i, f)| (schema.qualified_field(i).0.cloned(), f.clone()).into()) + .collect() +} + +pub fn schema_from_df_fields(fields: &[DFField]) -> Result { + schema_from_df_fields_with_metadata(fields, HashMap::new()) +} + +pub fn schema_from_df_fields_with_metadata( + fields: &[DFField], + metadata: HashMap, +) -> Result { + DFSchema::new_with_metadata(fields.iter().map(|t| t.clone().into()).collect(), metadata) +} diff --git a/src/sql/types/mod.rs b/src/sql/types/mod.rs new file mode 100644 index 00000000..4c99d08f --- /dev/null +++ b/src/sql/types/mod.rs @@ -0,0 +1,62 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +mod data_type; +mod df_field; +pub(crate) mod placeholder_udf; +mod stream_schema; +mod window; + +use std::time::Duration; + +use crate::sql::common::constants::sql_planning_default; + +pub use df_field::{ + DFField, fields_with_qualifiers, schema_from_df_fields, schema_from_df_fields_with_metadata, +}; +pub(crate) use placeholder_udf::PlaceholderUdf; +pub(crate) use window::WindowBehavior; +pub use window::{WindowType, find_window}; + +pub use crate::sql::common::constants::sql_field::TIMESTAMP_FIELD; + +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum ProcessingMode { + Append, + Update, +} + +#[derive(Clone, Debug)] +pub struct SqlConfig { + pub default_parallelism: usize, +} + +impl Default for SqlConfig { + fn default() -> Self { + Self { + default_parallelism: sql_planning_default::DEFAULT_PARALLELISM, + } + } +} + +#[derive(Clone)] +pub struct PlanningOptions { + pub ttl: Duration, +} + +impl Default for PlanningOptions { + fn default() -> Self { + Self { + ttl: Duration::from_secs(sql_planning_default::PLANNING_TTL_SECS), + } + } +} diff --git a/src/sql/types/placeholder_udf.rs b/src/sql/types/placeholder_udf.rs new file mode 100644 index 00000000..0bdf17e6 --- /dev/null +++ b/src/sql/types/placeholder_udf.rs @@ -0,0 +1,70 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+use std::any::Any;
+use std::fmt::{Debug, Formatter};
+use std::sync::Arc;
+
+use datafusion::arrow::datatypes::DataType;
+use datafusion::common::Result;
+use datafusion::logical_expr::{
+    ColumnarValue, ScalarFunctionArgs, ScalarUDF, ScalarUDFImpl, Signature, Volatility,
+};
+
+#[allow(clippy::type_complexity)]
+pub(crate) struct PlaceholderUdf {
+    name: String,
+    signature: Signature,
+    return_type: Arc<dyn Fn(&[DataType]) -> Result<DataType> + Send + Sync + 'static>,
+}
+
+impl Debug for PlaceholderUdf {
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        write!(f, "PlaceholderUDF<{}>", self.name)
+    }
+}
+
+impl ScalarUDFImpl for PlaceholderUdf {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        &self.name
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, args: &[DataType]) -> Result<DataType> {
+        (self.return_type)(args)
+    }
+
+    fn invoke_with_args(&self, _args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        unimplemented!("PlaceholderUdf should never be called at execution time");
+    }
+}
+
+impl PlaceholderUdf {
+    pub fn with_return(
+        name: impl Into<String>,
+        args: Vec<DataType>,
+        ret: DataType,
+    ) -> Arc<ScalarUDF> {
+        Arc::new(ScalarUDF::new_from_impl(PlaceholderUdf {
+            name: name.into(),
+            signature: Signature::exact(args, Volatility::Volatile),
+            return_type: Arc::new(move |_| Ok(ret.clone())),
+        }))
+    }
+}
diff --git a/src/sql/types/stream_schema.rs b/src/sql/types/stream_schema.rs
new file mode 100644
index 00000000..4b63182d
--- /dev/null
+++ b/src/sql/types/stream_schema.rs
@@ -0,0 +1,88 @@
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::Arc; + +use datafusion::arrow::datatypes::{Field, Schema, SchemaRef}; +use datafusion::common::Result; + +use super::TIMESTAMP_FIELD; + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct StreamSchema { + pub schema: SchemaRef, + pub timestamp_index: usize, + pub key_indices: Option>, +} + +impl StreamSchema { + pub fn new(schema: SchemaRef, timestamp_index: usize, key_indices: Option>) -> Self { + Self { + schema, + timestamp_index, + key_indices, + } + } + + pub fn new_unkeyed(schema: SchemaRef, timestamp_index: usize) -> Self { + Self { + schema, + timestamp_index, + key_indices: None, + } + } + + pub fn from_fields(fields: Vec) -> Self { + let schema = Arc::new(Schema::new(fields)); + let timestamp_index = schema + .column_with_name(TIMESTAMP_FIELD) + .map(|(i, _)| i) + .unwrap_or(0); + Self { + schema, + timestamp_index, + key_indices: None, + } + } + + pub fn from_schema_keys(schema: SchemaRef, key_indices: Vec) -> Result { + let timestamp_index = schema + .column_with_name(TIMESTAMP_FIELD) + .ok_or_else(|| { + datafusion::error::DataFusionError::Plan(format!( + "no {TIMESTAMP_FIELD} field in schema, schema is {schema:?}" + )) + })? + .0; + Ok(Self { + schema, + timestamp_index, + key_indices: Some(key_indices), + }) + } + + pub fn from_schema_unkeyed(schema: SchemaRef) -> Result { + let timestamp_index = schema + .column_with_name(TIMESTAMP_FIELD) + .ok_or_else(|| { + datafusion::error::DataFusionError::Plan(format!( + "no {TIMESTAMP_FIELD} field in schema" + )) + })? 
+ .0; + Ok(Self { + schema, + timestamp_index, + key_indices: None, + }) + } +} diff --git a/src/sql/types/window.rs b/src/sql/types/window.rs new file mode 100644 index 00000000..7934bc1d --- /dev/null +++ b/src/sql/types/window.rs @@ -0,0 +1,109 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::time::Duration; + +use datafusion::common::{Result, plan_err}; +use datafusion::logical_expr::Expr; + +use crate::sql::common::constants::window_fn; + +use super::DFField; + +#[derive(Clone, Debug, PartialEq, Eq, Hash)] +pub enum WindowType { + Tumbling { width: Duration }, + Sliding { width: Duration, slide: Duration }, + Session { gap: Duration }, + Instant, +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub(crate) enum WindowBehavior { + FromOperator { + window: WindowType, + window_field: DFField, + window_index: usize, + is_nested: bool, + }, + InData, +} + +pub fn get_duration(expression: &Expr) -> Result { + use datafusion::common::ScalarValue; + + match expression { + Expr::Literal(ScalarValue::IntervalDayTime(Some(val)), _) => { + Ok(Duration::from_secs((val.days as u64) * 24 * 60 * 60) + + Duration::from_millis(val.milliseconds as u64)) + } + Expr::Literal(ScalarValue::IntervalMonthDayNano(Some(val)), _) => { + if val.months != 0 { + return datafusion::common::not_impl_err!( + "Windows do not support durations specified as months" + ); + } + Ok(Duration::from_secs((val.days as u64) * 24 * 60 * 60) + + Duration::from_nanos(val.nanoseconds as 
u64)) + } + _ => plan_err!( + "unsupported Duration expression, expect duration literal, not {}", + expression + ), + } +} + +pub fn find_window(expression: &Expr) -> Result> { + use datafusion::logical_expr::expr::Alias; + use datafusion::logical_expr::expr::ScalarFunction; + + match expression { + Expr::ScalarFunction(ScalarFunction { func: fun, args }) => match fun.name() { + name if name == window_fn::HOP => { + if args.len() != 2 { + unreachable!(); + } + let slide = get_duration(&args[0])?; + let width = get_duration(&args[1])?; + if width.as_nanos() % slide.as_nanos() != 0 { + return plan_err!( + "hop() width {:?} must be a multiple of slide {:?}", + width, + slide + ); + } + if slide == width { + Ok(Some(WindowType::Tumbling { width })) + } else { + Ok(Some(WindowType::Sliding { width, slide })) + } + } + name if name == window_fn::TUMBLE => { + if args.len() != 1 { + unreachable!("wrong number of arguments for tumble(), expect one"); + } + let width = get_duration(&args[0])?; + Ok(Some(WindowType::Tumbling { width })) + } + name if name == window_fn::SESSION => { + if args.len() != 1 { + unreachable!("wrong number of arguments for session(), expected one"); + } + let gap = get_duration(&args[0])?; + Ok(Some(WindowType::Session { gap })) + } + _ => Ok(None), + }, + Expr::Alias(Alias { expr, .. }) => find_window(expr), + _ => Ok(None), + } +} diff --git a/src/storage/mod.rs b/src/storage/mod.rs index a4898619..823425d2 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -11,4 +11,5 @@ // limitations under the License. pub mod state_backend; +pub mod stream_catalog; pub mod task; diff --git a/src/storage/stream_catalog/codec.rs b/src/storage/stream_catalog/codec.rs new file mode 100644 index 00000000..34c2c4ba --- /dev/null +++ b/src/storage/stream_catalog/codec.rs @@ -0,0 +1,57 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Arrow Schema IPC and [`LogicalProgram`] bincode payloads for stream catalog rows. + +use std::io::Cursor; +use std::sync::Arc; + +use datafusion::arrow::datatypes::Schema; +use datafusion::arrow::ipc::reader::StreamReader; +use datafusion::arrow::ipc::writer::StreamWriter; +use datafusion::arrow::record_batch::RecordBatch; +use datafusion::common::{DataFusionError, Result}; + +use crate::sql::logical_node::logical::LogicalProgram; + +pub struct CatalogCodec; + +impl CatalogCodec { + pub fn encode_schema(schema: &Arc) -> Result> { + let mut buffer = Vec::new(); + let empty_batch = RecordBatch::new_empty(Arc::clone(schema)); + let mut writer = StreamWriter::try_new(&mut buffer, schema.as_ref()) + .map_err(|e| DataFusionError::External(Box::new(e)))?; + writer + .write(&empty_batch) + .map_err(|e| DataFusionError::External(Box::new(e)))?; + writer + .finish() + .map_err(|e| DataFusionError::External(Box::new(e)))?; + Ok(buffer) + } + + pub fn decode_schema(bytes: &[u8]) -> Result> { + let cursor = Cursor::new(bytes); + let reader = StreamReader::try_new(cursor, None) + .map_err(|e| DataFusionError::External(Box::new(e)))?; + Ok(reader.schema()) + } + + pub fn encode_logical_program(program: &LogicalProgram) -> Result> { + program.encode_for_catalog() + } + + pub fn decode_logical_program(bytes: &[u8]) -> Result { + LogicalProgram::decode_for_catalog(bytes) + } +} diff --git a/src/storage/stream_catalog/manager.rs b/src/storage/stream_catalog/manager.rs new file mode 100644 index 00000000..a0eb9b49 --- /dev/null +++ 
b/src/storage/stream_catalog/manager.rs @@ -0,0 +1,624 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::{Arc, OnceLock}; + +use anyhow::{anyhow, bail, Context}; +use datafusion::common::{internal_err, plan_err, Result as DFResult}; +use prost::Message; +use protocol::grpc::api::FsProgram; +use protocol::storage::{self as pb, table_definition}; +use tracing::{info, warn}; +use unicase::UniCase; + +use crate::sql::common::constants::sql_field; +use crate::sql::schema::column_descriptor::ColumnDescriptor; +use crate::sql::schema::connection_type::ConnectionType; +use crate::sql::schema::source_table::SourceTable; +use crate::sql::schema::table::Table as CatalogTable; +use crate::sql::schema::{StreamPlanningContext, StreamTable}; + +use super::codec::CatalogCodec; +use super::meta_store::MetaStore; + +const CATALOG_KEY_PREFIX: &str = "catalog:stream_table:"; +const STREAMING_JOB_KEY_PREFIX: &str = "streaming_job:"; + +pub struct CatalogManager { + store: Arc, +} + +static GLOBAL_CATALOG: OnceLock> = OnceLock::new(); + +impl CatalogManager { + pub fn new(store: Arc) -> Self { + Self { store } + } + + pub fn init_global_in_memory() -> anyhow::Result<()> { + Self::init_global(Arc::new(super::InMemoryMetaStore::new())) + } + + pub fn init_global(store: Arc) -> anyhow::Result<()> { + if GLOBAL_CATALOG.get().is_some() { + bail!("CatalogManager already initialized"); + } + + let mgr = Arc::new(CatalogManager::new(store)); + GLOBAL_CATALOG + .set(mgr) + 
.map_err(|_| anyhow!("CatalogManager global install failed"))?; + + Ok(()) + } + + pub fn try_global() -> Option> { + GLOBAL_CATALOG.get().cloned() + } + + pub fn global() -> anyhow::Result> { + Self::try_global().ok_or_else(|| anyhow!("CatalogManager not initialized")) + } + + #[inline] + fn build_store_key(table_name: &str) -> String { + format!("{CATALOG_KEY_PREFIX}{}", table_name.to_lowercase()) + } + + #[inline] + fn build_streaming_job_key(table_name: &str) -> String { + format!("{STREAMING_JOB_KEY_PREFIX}{}", table_name.to_lowercase()) + } + + // ======================================================================== + // Streaming job persistence (CREATE STREAMING TABLE / DROP STREAMING TABLE) + // ======================================================================== + + pub fn persist_streaming_job( + &self, + table_name: &str, + fs_program: &FsProgram, + comment: &str, + ) -> DFResult<()> { + let program_bytes = fs_program.encode_to_vec(); + let def = pb::StreamingTableDefinition { + table_name: table_name.to_string(), + created_at_millis: chrono::Utc::now().timestamp_millis(), + fs_program_bytes: program_bytes, + comment: comment.to_string(), + }; + let payload = def.encode_to_vec(); + let key = Self::build_streaming_job_key(table_name); + self.store.put(&key, payload)?; + info!(table = %table_name, "Streaming job definition persisted"); + Ok(()) + } + + pub fn remove_streaming_job(&self, table_name: &str) -> DFResult<()> { + let key = Self::build_streaming_job_key(table_name); + self.store.delete(&key)?; + info!(table = %table_name, "Streaming job definition removed from store"); + Ok(()) + } + + pub fn load_streaming_job_definitions( + &self, + ) -> DFResult> { + let records = self.store.scan_prefix(STREAMING_JOB_KEY_PREFIX)?; + let mut out = Vec::with_capacity(records.len()); + for (key, payload) in records { + let def = match pb::StreamingTableDefinition::decode(payload.as_slice()) { + Ok(v) => v, + Err(e) => { + warn!( + key = %key, + error = 
%e, + "Skipping corrupted streaming job record" + ); + continue; + } + }; + let program = match FsProgram::decode(def.fs_program_bytes.as_slice()) { + Ok(v) => v, + Err(e) => { + warn!( + table = %def.table_name, + error = %e, + "Skipping streaming job with corrupted FsProgram" + ); + continue; + } + }; + out.push((def.table_name, program)); + } + Ok(out) + } + + // ======================================================================== + // Catalog table persistence (CREATE TABLE / DROP TABLE) + // ======================================================================== + + pub fn add_catalog_table(&self, table: CatalogTable) -> DFResult<()> { + let proto_def = self.encode_catalog_table(&table)?; + let payload = proto_def.encode_to_vec(); + let key = Self::build_store_key(table.name()); + + self.store.put(&key, payload)?; + Ok(()) + } + + pub fn has_catalog_table(&self, name: &str) -> bool { + let key = Self::build_store_key(name); + self.store.get(&key).ok().flatten().is_some() + } + + pub fn drop_catalog_table(&self, table_name: &str, if_exists: bool) -> DFResult<()> { + let key = Self::build_store_key(table_name); + let exists = self.store.get(&key)?.is_some(); + if !exists { + if if_exists { + return Ok(()); + } + return plan_err!("Table '{table_name}' not found"); + } + self.store.delete(&key)?; + Ok(()) + } + + pub fn restore_from_store(&self) -> DFResult<()> { + // No-op by design: the catalog is read-through from storage. + Ok(()) + } + + pub fn acquire_planning_context(&self) -> StreamPlanningContext { + let mut ctx = StreamPlanningContext::new(); + let catalogs = self.load_catalog_tables_map().unwrap_or_default(); + ctx.tables.catalogs = catalogs.clone(); + + for (name, table) in catalogs { + let source = match table.as_ref() { + CatalogTable::ConnectorTable(s) | CatalogTable::LookupTable(s) => s, + CatalogTable::TableFromQuery { .. 
} => continue, + }; + + let schema = Arc::new(source.produce_physical_schema()); + ctx.tables.streams.insert( + name, + Arc::new(StreamTable::Source { + name: source.name().to_string(), + connector: source.connector().to_string(), + schema, + event_time_field: source.event_time_field().map(str::to_string), + watermark_field: source.stream_catalog_watermark_field(), + with_options: source.catalog_with_options().clone(), + }), + ); + } + ctx + } + + /// All persisted catalog tables, sorted by table name. + pub fn list_catalog_tables(&self) -> DFResult>> { + let mut out: Vec> = + self.load_catalog_tables_map()?.into_values().collect(); + out.sort_by(|a, b| a.name().cmp(b.name())); + Ok(out) + } + + pub fn get_catalog_table(&self, name: &str) -> DFResult>> { + let key = UniCase::new(name.to_string()); + Ok(self.load_catalog_tables_map()?.get(&key).cloned()) + } + + pub fn add_table(&self, table: StreamTable) -> DFResult<()> { + match table { + StreamTable::Source { + name, + connector, + schema, + event_time_field, + watermark_field, + with_options, + } => { + let mut source = SourceTable::new(name, connector, ConnectionType::Source); + source.schema_specs = schema + .fields() + .iter() + .map(|f| ColumnDescriptor::new_physical((**f).clone())) + .collect(); + source.inferred_fields = Some(schema.fields().iter().cloned().collect()); + source.temporal_config.event_column = event_time_field; + source.temporal_config.watermark_strategy_column = watermark_field; + source.catalog_with_options = with_options; + self.add_catalog_table(CatalogTable::ConnectorTable(source)) + } + StreamTable::Sink { name, .. 
} => plan_err!( + "Persisting streaming sink '{name}' in stream catalog is no longer supported" + ), + } + } + + pub fn has_stream_table(&self, name: &str) -> bool { + self.has_catalog_table(name) + } + + pub fn drop_table(&self, table_name: &str, if_exists: bool) -> DFResult<()> { + self.drop_catalog_table(table_name, if_exists) + } + + pub fn list_stream_tables(&self) -> Vec> { + self.list_catalog_tables() + .unwrap_or_default() + .into_iter() + .filter_map(|t| match t.as_ref() { + CatalogTable::ConnectorTable(s) | CatalogTable::LookupTable(s) => { + Some(Arc::new(StreamTable::Source { + name: s.name().to_string(), + connector: s.connector().to_string(), + schema: Arc::new(s.produce_physical_schema()), + event_time_field: s.event_time_field().map(str::to_string), + watermark_field: s.stream_catalog_watermark_field(), + with_options: s.catalog_with_options().clone(), + })) + } + CatalogTable::TableFromQuery { .. } => None, + }) + .collect() + } + + pub fn get_stream_table(&self, name: &str) -> Option> { + self.get_catalog_table(name) + .ok() + .flatten() + .and_then(|t| match t.as_ref() { + CatalogTable::ConnectorTable(s) | CatalogTable::LookupTable(s) => { + Some(Arc::new(StreamTable::Source { + name: s.name().to_string(), + connector: s.connector().to_string(), + schema: Arc::new(s.produce_physical_schema()), + event_time_field: s.event_time_field().map(str::to_string), + watermark_field: s.stream_catalog_watermark_field(), + with_options: s.catalog_with_options().clone(), + })) + } + CatalogTable::TableFromQuery { .. 
} => None, + }) + } + + fn encode_catalog_table(&self, table: &CatalogTable) -> DFResult { + let table_type = match table { + CatalogTable::ConnectorTable(source) | CatalogTable::LookupTable(source) => { + let mut opts = source.catalog_with_options().clone(); + opts.entry("connector".to_string()) + .or_insert_with(|| source.connector().to_string()); + let catalog_row = pb::CatalogSourceTable { + arrow_schema_ipc: CatalogCodec::encode_schema(&Arc::new( + source.produce_physical_schema(), + ))?, + event_time_field: source.event_time_field().map(str::to_string), + watermark_field: source.stream_catalog_watermark_field(), + with_options: opts.into_iter().collect(), + connector: source.connector().to_string(), + description: source.description.clone(), + }; + if matches!(table, CatalogTable::LookupTable(_)) { + table_definition::TableType::LookupTable(catalog_row) + } else { + table_definition::TableType::ConnectorTable(catalog_row) + } + } + CatalogTable::TableFromQuery { name, .. } => return plan_err!( + "Persisting query-defined table '{}' is not supported by stream catalog storage", + name + ), + }; + + Ok(pb::TableDefinition { + table_name: table.name().to_string(), + updated_at_millis: chrono::Utc::now().timestamp_millis(), + table_type: Some(table_type), + }) + } + + fn decode_catalog_source_table( + &self, + table_name: String, + source_row: pb::CatalogSourceTable, + as_lookup: bool, + ) -> DFResult { + let connector = if source_row.connector.is_empty() { + source_row + .with_options + .get("connector") + .cloned() + .unwrap_or_else(|| "stream_catalog".to_string()) + } else { + source_row.connector.clone() + }; + let mut source = SourceTable::new( + table_name, + connector, + if as_lookup { + ConnectionType::Lookup + } else { + ConnectionType::Source + }, + ); + let schema = CatalogCodec::decode_schema(&source_row.arrow_schema_ipc)?; + source.schema_specs = schema + .fields() + .iter() + .map(|f| ColumnDescriptor::new_physical((**f).clone())) + .collect(); + 
source.inferred_fields = Some(schema.fields().iter().cloned().collect()); + source.temporal_config.event_column = source_row.event_time_field; + source.temporal_config.watermark_strategy_column = source_row + .watermark_field + .filter(|w| w != sql_field::COMPUTED_WATERMARK); + source.catalog_with_options = source_row.with_options.into_iter().collect(); + source.description = source_row.description; + + // Rebuild strongly-typed ConnectorConfig from persisted WITH options. + if source.connector().eq_ignore_ascii_case("kafka") { + use crate::sql::schema::kafka_operator_config::build_kafka_proto_config_from_string_map; + use crate::sql::schema::ConnectorConfig; + let opts_map: std::collections::HashMap = + source.catalog_with_options.iter().map(|(k, v)| (k.clone(), v.clone())).collect(); + let physical = source.produce_physical_schema(); + if let Ok(proto_cfg) = build_kafka_proto_config_from_string_map(opts_map, &physical) { + source.connector_config = match proto_cfg { + protocol::grpc::api::connector_op::Config::KafkaSource(cfg) => { + ConnectorConfig::KafkaSource(cfg) + } + protocol::grpc::api::connector_op::Config::KafkaSink(cfg) => { + ConnectorConfig::KafkaSink(cfg) + } + protocol::grpc::api::connector_op::Config::Generic(g) => { + ConnectorConfig::Generic(g.properties) + } + }; + } + } else { + use crate::sql::schema::ConnectorConfig; + source.connector_config = ConnectorConfig::Generic( + source.catalog_with_options.iter().map(|(k, v)| (k.clone(), v.clone())).collect(), + ); + } + + if as_lookup { + Ok(CatalogTable::LookupTable(source)) + } else { + Ok(CatalogTable::ConnectorTable(source)) + } + } + + fn decode_catalog_table(&self, proto_def: pb::TableDefinition) -> DFResult { + let Some(table_type) = proto_def.table_type else { + return internal_err!( + "Corrupted catalog row: missing table_type for {}", + proto_def.table_name + ); + }; + + match table_type { + table_definition::TableType::ConnectorTable(src) => { + 
self.decode_catalog_source_table(proto_def.table_name, src, false) + } + table_definition::TableType::LookupTable(src) => { + self.decode_catalog_source_table(proto_def.table_name, src, true) + } + } + } + + fn load_catalog_tables_map( + &self, + ) -> DFResult>> { + let mut out = std::collections::HashMap::new(); + let records = self.store.scan_prefix(CATALOG_KEY_PREFIX)?; + for (key, payload) in records { + let proto_def = match pb::TableDefinition::decode(payload.as_slice()) { + Ok(v) => v, + Err(e) => { + warn!( + catalog_key = %key, + error = %e, + "Skipping corrupted stream catalog row: protobuf decode failed" + ); + continue; + } + }; + let table = match self.decode_catalog_table(proto_def) { + Ok(v) => v, + Err(e) => { + warn!( + catalog_key = %key, + error = %e, + "Skipping unsupported/corrupted stream catalog row" + ); + continue; + } + }; + let object_name = UniCase::new(table.name().to_string()); + out.insert(object_name, Arc::new(table)); + } + Ok(out) + } +} + +pub fn restore_global_catalog_from_store() { + let Some(mgr) = CatalogManager::try_global() else { + return; + }; + match mgr.restore_from_store() { + Ok(()) => { + let n = mgr.list_catalog_tables().map(|t| t.len()).unwrap_or(0); + info!(catalog_tables = n, "Catalog loaded from durable store"); + } + Err(e) => warn!("Stream catalog restore_from_store failed: {e:#}"), + } +} + +pub fn restore_streaming_jobs_from_store() { + use crate::runtime::streaming::job::JobManager; + + let Some(catalog) = CatalogManager::try_global() else { + warn!("CatalogManager not available; skipping streaming job restore"); + return; + }; + let job_manager = match JobManager::global() { + Ok(jm) => jm, + Err(e) => { + warn!(error = %e, "JobManager not available; skipping streaming job restore"); + return; + } + }; + + let definitions = match catalog.load_streaming_job_definitions() { + Ok(defs) => defs, + Err(e) => { + warn!(error = %e, "Failed to load streaming job definitions from store"); + return; + } + }; + + if 
definitions.is_empty() { + info!("No persisted streaming jobs to restore"); + return; + } + + let total = definitions.len(); + info!(count = total, "Restoring persisted streaming jobs"); + + let rt = tokio::runtime::Handle::current(); + let mut restored = 0usize; + let mut failed = 0usize; + + for (table_name, fs_program) in definitions { + let jm = job_manager.clone(); + let name = table_name.clone(); + match rt.block_on(jm.submit_job(name.clone(), fs_program)) { + Ok(job_id) => { + info!(table = %table_name, job_id = %job_id, "Streaming job restored"); + restored += 1; + } + Err(e) => { + warn!(table = %table_name, error = %e, "Failed to restore streaming job"); + failed += 1; + } + } + } + + info!( + restored = restored, + failed = failed, + total = total, + "Streaming job restore complete" + ); +} + +pub fn initialize_stream_catalog(config: &crate::config::GlobalConfig) -> anyhow::Result<()> { + if !config.stream_catalog.persist { + return CatalogManager::init_global_in_memory() + .context("Stream catalog (CatalogManager) in-memory init failed"); + } + + let path = config + .stream_catalog + .db_path + .as_ref() + .map(|p| crate::config::resolve_path(p)) + .unwrap_or_else(|| crate::config::get_data_dir().join("stream_catalog")); + + std::fs::create_dir_all(&path).with_context(|| { + format!( + "Failed to create stream catalog directory {}", + path.display() + ) + })?; + + let store = std::sync::Arc::new( + super::RocksDbMetaStore::open(&path).with_context(|| { + format!( + "Failed to open stream catalog RocksDB at {}", + path.display() + ) + })?, + ); + + CatalogManager::init_global(store).context("Stream catalog (CatalogManager) init failed") +} + +pub fn planning_schema_provider() -> StreamPlanningContext { + CatalogManager::try_global() + .map(|m| m.acquire_planning_context()) + .unwrap_or_else(StreamPlanningContext::new) +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use datafusion::arrow::datatypes::{DataType, Field}; + + use 
crate::sql::schema::column_descriptor::ColumnDescriptor; + use crate::sql::schema::connection_type::ConnectionType; + use crate::sql::schema::source_table::SourceTable; + use crate::sql::schema::table::Table as CatalogTable; + use crate::storage::stream_catalog::InMemoryMetaStore; + + use super::CatalogManager; + + fn create_test_manager() -> CatalogManager { + CatalogManager::new(Arc::new(InMemoryMetaStore::new())) + } + + #[test] + fn add_table_roundtrip_snapshot() { + let mgr = create_test_manager(); + let mut source = SourceTable::new("t1", "kafka", ConnectionType::Source); + source.schema_specs = vec![ColumnDescriptor::new_physical(Field::new( + "a", + DataType::Int32, + false, + ))]; + source.temporal_config.event_column = Some("ts".into()); + let table = CatalogTable::ConnectorTable(source); + + mgr.add_catalog_table(table).unwrap(); + + let got = mgr + .get_catalog_table("t1") + .unwrap() + .expect("table present"); + assert_eq!(got.name(), "t1"); + } + + #[test] + fn drop_table_if_exists() { + let mgr = create_test_manager(); + let mut source = SourceTable::new("t_drop", "kafka", ConnectionType::Source); + source.schema_specs = vec![ColumnDescriptor::new_physical(Field::new( + "a", + DataType::Int32, + false, + ))]; + mgr.add_catalog_table(CatalogTable::ConnectorTable(source)) + .unwrap(); + + mgr.drop_catalog_table("t_drop", false).unwrap(); + assert!(!mgr.has_catalog_table("t_drop")); + + mgr.drop_catalog_table("t_drop", true).unwrap(); + assert!(mgr.drop_catalog_table("nope", false).is_err()); + mgr.drop_catalog_table("nope", true).unwrap(); + } +} diff --git a/src/storage/stream_catalog/meta_store.rs b/src/storage/stream_catalog/meta_store.rs new file mode 100644 index 00000000..6f61b3f7 --- /dev/null +++ b/src/storage/stream_catalog/meta_store.rs @@ -0,0 +1,70 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Pluggable metadata KV backend (memory, etcd, Redis, …). + +use std::collections::HashMap; + +use datafusion::common::Result; +use parking_lot::RwLock; + +/// Synchronous metadata store for catalog records. +pub trait MetaStore: Send + Sync { + fn put(&self, key: &str, value: Vec) -> Result<()>; + fn get(&self, key: &str) -> Result>>; + fn delete(&self, key: &str) -> Result<()>; + fn scan_prefix(&self, prefix: &str) -> Result)>>; +} + +/// In-process KV store for single-node deployments and tests. +pub struct InMemoryMetaStore { + db: RwLock>>, +} + +impl InMemoryMetaStore { + pub fn new() -> Self { + Self { + db: RwLock::new(HashMap::new()), + } + } +} + +impl Default for InMemoryMetaStore { + fn default() -> Self { + Self::new() + } +} + +impl MetaStore for InMemoryMetaStore { + fn put(&self, key: &str, value: Vec) -> Result<()> { + self.db.write().insert(key.to_string(), value); + Ok(()) + } + + fn get(&self, key: &str) -> Result>> { + Ok(self.db.read().get(key).cloned()) + } + + fn delete(&self, key: &str) -> Result<()> { + self.db.write().remove(key); + Ok(()) + } + + fn scan_prefix(&self, prefix: &str) -> Result)>> { + let db = self.db.read(); + Ok(db + .iter() + .filter(|(k, _)| k.starts_with(prefix)) + .map(|(k, v)| (k.clone(), v.clone())) + .collect()) + } +} diff --git a/src/storage/stream_catalog/mod.rs b/src/storage/stream_catalog/mod.rs new file mode 100644 index 00000000..b99f3080 --- /dev/null +++ b/src/storage/stream_catalog/mod.rs @@ -0,0 +1,26 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may 
not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Stream table catalog: protobuf persistence, MVCC-style planning snapshots for the coordinator. + +mod codec; +mod manager; +mod meta_store; +mod rocksdb_meta_store; + +pub use manager::{ + CatalogManager, initialize_stream_catalog, + restore_global_catalog_from_store, + restore_streaming_jobs_from_store, +}; +pub use meta_store::{InMemoryMetaStore, MetaStore}; +pub use rocksdb_meta_store::RocksDbMetaStore; diff --git a/src/storage/stream_catalog/rocksdb_meta_store.rs b/src/storage/stream_catalog/rocksdb_meta_store.rs new file mode 100644 index 00000000..98a518a3 --- /dev/null +++ b/src/storage/stream_catalog/rocksdb_meta_store.rs @@ -0,0 +1,131 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! RocksDB-backed [`super::MetaStore`] for durable stream catalog rows. 
+ +use std::path::Path; +use std::sync::Arc; + +use anyhow::Context; +use datafusion::common::Result; +use rocksdb::{DB, Direction, IteratorMode, Options}; + +use super::MetaStore; + +/// Single-node durable KV used by [`crate::storage::stream_catalog::CatalogManager`]. +pub struct RocksDbMetaStore { + db: Arc, +} + +impl RocksDbMetaStore { + pub fn open>(path: P) -> anyhow::Result { + let path = path.as_ref(); + if let Some(parent) = path.parent() { + std::fs::create_dir_all(parent).with_context(|| { + format!("stream catalog: create parent directory {parent:?}") + })?; + } + let mut opts = Options::default(); + opts.create_if_missing(true); + let db = DB::open(&opts, path).with_context(|| { + format!("stream catalog: open RocksDB at {}", path.display()) + })?; + Ok(Self { db: Arc::new(db) }) + } +} + +impl MetaStore for RocksDbMetaStore { + fn put(&self, key: &str, value: Vec) -> Result<()> { + self.db + .put(key.as_bytes(), value.as_slice()) + .map_err(|e| datafusion::common::DataFusionError::Execution(format!( + "stream catalog store put: {e}" + ))) + } + + fn get(&self, key: &str) -> Result>> { + self.db + .get(key.as_bytes()) + .map_err(|e| datafusion::common::DataFusionError::Execution(format!( + "stream catalog store get: {e}" + ))) + } + + fn delete(&self, key: &str) -> Result<()> { + self.db + .delete(key.as_bytes()) + .map_err(|e| datafusion::common::DataFusionError::Execution(format!( + "stream catalog store delete: {e}" + ))) + } + + fn scan_prefix(&self, prefix: &str) -> Result)>> { + let mut out = Vec::new(); + let iter = self + .db + .iterator(IteratorMode::From(prefix.as_bytes(), Direction::Forward)); + for item in iter { + let (k, v) = item.map_err(|e| { + datafusion::common::DataFusionError::Execution(format!( + "stream catalog store scan: {e}" + )) + })?; + let key = String::from_utf8(k.to_vec()).map_err(|e| { + datafusion::common::DataFusionError::Execution(format!( + "stream catalog store: invalid utf8 key: {e}" + )) + })?; + if 
!key.starts_with(prefix) { + break; + } + out.push((key, v.to_vec())); + } + Ok(out) + } +} + +#[cfg(test)] +mod tests { + use std::path::PathBuf; + + use uuid::Uuid; + + use super::*; + + #[test] + fn put_get_scan_roundtrip() { + let dir: PathBuf = std::env::temp_dir().join(format!( + "fs_stream_catalog_test_{}", + Uuid::new_v4() + )); + let _ = std::fs::remove_dir_all(&dir); + + let store = RocksDbMetaStore::open(&dir).expect("open"); + store.put("catalog:stream_table:a", vec![1, 2, 3]).unwrap(); + store.put("catalog:stream_table:b", vec![4]).unwrap(); + store.put("other:x", vec![9]).unwrap(); + + assert_eq!( + store.get("catalog:stream_table:a").unwrap(), + Some(vec![1, 2, 3]) + ); + + let prefixed = store.scan_prefix("catalog:stream_table:").unwrap(); + assert_eq!(prefixed.len(), 2); + assert!(prefixed.iter().any(|(k, _)| k.ends_with(":a"))); + assert!(prefixed.iter().any(|(k, _)| k.ends_with(":b"))); + + store.delete("catalog:stream_table:a").unwrap(); + assert!(store.get("catalog:stream_table:a").unwrap().is_none()); + + let _ = std::fs::remove_dir_all(&dir); + } +} diff --git a/src/storage/task/mod.rs b/src/storage/task/mod.rs index b4b3680f..3123415a 100644 --- a/src/storage/task/mod.rs +++ b/src/storage/task/mod.rs @@ -16,6 +16,7 @@ pub mod factory; mod function_info; +mod proto_codec; mod rocksdb_storage; pub mod storage; diff --git a/src/storage/task/proto_codec.rs b/src/storage/task/proto_codec.rs new file mode 100644 index 00000000..1e0bedb3 --- /dev/null +++ b/src/storage/task/proto_codec.rs @@ -0,0 +1,271 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Protobuf wire format for RocksDB task rows, with legacy bincode read support. + +use anyhow::{Context, Result, anyhow}; +use prost::Message; +use protocol::storage::{ + ComponentStateKind, ComponentStateProto, TaskMetadataProto, TaskModulePayloadProto, + TaskModulePython, TaskModuleWasm, task_module_payload_proto, +}; +use serde::{Deserialize, Serialize}; + +use crate::runtime::common::ComponentState; + +use super::storage::TaskModuleBytes; + +/// Magic prefix for protobuf-encoded task values (meta + payload). Legacy rows have no prefix. +pub const TASK_STORAGE_PROTO_MAGIC: &[u8; 4] = b"FSP1"; + +#[derive(Debug, Clone, Serialize, Deserialize)] +struct LegacyTaskMetadata { + task_type: String, + state: ComponentState, + created_at: u64, + checkpoint_id: Option, +} + +fn component_state_to_proto(state: &ComponentState) -> ComponentStateProto { + let (kind, error_message) = match state { + ComponentState::Uninitialized => (ComponentStateKind::Uninitialized, String::new()), + ComponentState::Initialized => (ComponentStateKind::Initialized, String::new()), + ComponentState::Starting => (ComponentStateKind::Starting, String::new()), + ComponentState::Running => (ComponentStateKind::Running, String::new()), + ComponentState::Checkpointing => (ComponentStateKind::Checkpointing, String::new()), + ComponentState::Stopping => (ComponentStateKind::Stopping, String::new()), + ComponentState::Stopped => (ComponentStateKind::Stopped, String::new()), + ComponentState::Closing => (ComponentStateKind::Closing, String::new()), + ComponentState::Closed => 
(ComponentStateKind::Closed, String::new()), + ComponentState::Error { error } => (ComponentStateKind::Error, error.clone()), + }; + ComponentStateProto { + kind: kind as i32, + error_message, + } +} + +fn component_state_from_proto(p: &ComponentStateProto) -> ComponentState { + let kind = ComponentStateKind::try_from(p.kind).unwrap_or(ComponentStateKind::Unspecified); + match kind { + ComponentStateKind::Unspecified | ComponentStateKind::Uninitialized => { + ComponentState::Uninitialized + } + ComponentStateKind::Initialized => ComponentState::Initialized, + ComponentStateKind::Starting => ComponentState::Starting, + ComponentStateKind::Running => ComponentState::Running, + ComponentStateKind::Checkpointing => ComponentState::Checkpointing, + ComponentStateKind::Stopping => ComponentState::Stopping, + ComponentStateKind::Stopped => ComponentState::Stopped, + ComponentStateKind::Closing => ComponentState::Closing, + ComponentStateKind::Closed => ComponentState::Closed, + ComponentStateKind::Error => ComponentState::Error { + error: if p.error_message.is_empty() { + "unknown error".to_string() + } else { + p.error_message.clone() + }, + }, + } +} + +/// Encode task metadata for `task_meta` column family (always protobuf + magic). +pub fn encode_task_metadata_bytes( + task_type: &str, + state: &ComponentState, + created_at: u64, + checkpoint_id: Option, +) -> Result> { + let proto = TaskMetadataProto { + task_type: task_type.to_string(), + state: Some(component_state_to_proto(state)), + created_at, + checkpoint_id, + }; + let mut out = TASK_STORAGE_PROTO_MAGIC.to_vec(); + proto + .encode(&mut out) + .context("encode TaskMetadataProto")?; + Ok(out) +} + +pub struct DecodedTaskMetadata { + pub task_type: String, + pub state: ComponentState, + pub created_at: u64, + pub checkpoint_id: Option, +} + +/// Decode metadata written by this version (protobuf) or legacy bincode+serde. 
+pub fn decode_task_metadata_bytes(raw: &[u8]) -> Result { + if raw.len() >= TASK_STORAGE_PROTO_MAGIC.len() + && &raw[..TASK_STORAGE_PROTO_MAGIC.len()] == TASK_STORAGE_PROTO_MAGIC.as_slice() + { + let proto = TaskMetadataProto::decode(&raw[TASK_STORAGE_PROTO_MAGIC.len()..]) + .context("decode TaskMetadataProto")?; + let state = proto + .state + .as_ref() + .map(component_state_from_proto) + .unwrap_or_default(); + return Ok(DecodedTaskMetadata { + task_type: proto.task_type, + state, + created_at: proto.created_at, + checkpoint_id: proto.checkpoint_id, + }); + } + + let (legacy, _): (LegacyTaskMetadata, _) = bincode::serde::decode_from_slice( + raw, + bincode::config::standard(), + ) + .map_err(|e| anyhow!("legacy task metadata bincode decode failed: {e}"))?; + Ok(DecodedTaskMetadata { + task_type: legacy.task_type, + state: legacy.state, + created_at: legacy.created_at, + checkpoint_id: legacy.checkpoint_id, + }) +} + +fn module_to_proto(module: &TaskModuleBytes) -> TaskModulePayloadProto { + match module { + TaskModuleBytes::Wasm(bytes) => TaskModulePayloadProto { + payload: Some(task_module_payload_proto::Payload::Wasm(TaskModuleWasm { + wasm_binary: bytes.clone(), + })), + }, + TaskModuleBytes::Python { + class_name, + module, + bytes, + } => TaskModulePayloadProto { + payload: Some(task_module_payload_proto::Payload::Python(TaskModulePython { + class_name: class_name.clone(), + module_path: module.clone(), + embedded_code: bytes.clone(), + })), + }, + } +} + +/// Encode module payload for `task_payload` column family (always protobuf + magic). +pub fn encode_task_module_bytes(module: &TaskModuleBytes) -> Result> { + let proto = module_to_proto(module); + let mut out = TASK_STORAGE_PROTO_MAGIC.to_vec(); + proto + .encode(&mut out) + .context("encode TaskModulePayloadProto")?; + Ok(out) +} + +/// Decode module payload: protobuf+magic or legacy bincode+serde [`TaskModuleBytes`]. 
+pub fn decode_task_module_bytes(raw: &[u8]) -> Result { + if raw.len() >= TASK_STORAGE_PROTO_MAGIC.len() + && &raw[..TASK_STORAGE_PROTO_MAGIC.len()] == TASK_STORAGE_PROTO_MAGIC.as_slice() + { + let proto = TaskModulePayloadProto::decode(&raw[TASK_STORAGE_PROTO_MAGIC.len()..]) + .context("decode TaskModulePayloadProto")?; + return proto.try_into_task_module(); + } + + let (legacy, _): (TaskModuleBytes, _) = bincode::serde::decode_from_slice( + raw, + bincode::config::standard(), + ) + .map_err(|e| anyhow!("legacy task module bincode decode failed: {e}"))?; + Ok(legacy) +} + +trait TryIntoTaskModule { + fn try_into_task_module(self) -> Result; +} + +impl TryIntoTaskModule for TaskModulePayloadProto { + fn try_into_task_module(self) -> Result { + match self.payload { + Some(task_module_payload_proto::Payload::Wasm(w)) => { + Ok(TaskModuleBytes::Wasm(w.wasm_binary)) + } + Some(task_module_payload_proto::Payload::Python(p)) => Ok(TaskModuleBytes::Python { + class_name: p.class_name, + module: p.module_path, + bytes: p.embedded_code, + }), + None => Err(anyhow!("TaskModulePayloadProto missing payload")), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn metadata_roundtrip_proto() { + let enc = encode_task_metadata_bytes( + "wasm", + &ComponentState::Running, + 42, + Some(7), + ) + .unwrap(); + let dec = decode_task_metadata_bytes(&enc).unwrap(); + assert_eq!(dec.task_type, "wasm"); + assert_eq!(dec.state, ComponentState::Running); + assert_eq!(dec.created_at, 42); + assert_eq!(dec.checkpoint_id, Some(7)); + } + + #[test] + fn module_roundtrip_wasm_proto() { + let m = TaskModuleBytes::Wasm(vec![1, 2, 3]); + let enc = encode_task_module_bytes(&m).unwrap(); + let dec = decode_task_module_bytes(&enc).unwrap(); + assert_eq!(dec, m); + } + + #[test] + fn module_roundtrip_python_proto() { + let m = TaskModuleBytes::Python { + class_name: "C".into(), + module: "m".into(), + bytes: Some(vec![9]), + }; + let enc = encode_task_module_bytes(&m).unwrap(); + 
let dec = decode_task_module_bytes(&enc).unwrap(); + assert_eq!(dec, m); + } + + #[test] + fn legacy_bincode_metadata_still_decodes() { + let legacy = LegacyTaskMetadata { + task_type: "legacy".into(), + state: ComponentState::Stopped, + created_at: 99, + checkpoint_id: None, + }; + let raw = bincode::serde::encode_to_vec(&legacy, bincode::config::standard()).unwrap(); + let dec = decode_task_metadata_bytes(&raw).unwrap(); + assert_eq!(dec.task_type, "legacy"); + assert_eq!(dec.state, ComponentState::Stopped); + assert_eq!(dec.created_at, 99); + } + + #[test] + fn legacy_bincode_module_still_decodes() { + let m = TaskModuleBytes::Wasm(vec![8, 9]); + let raw = bincode::serde::encode_to_vec(&m, bincode::config::standard()).unwrap(); + assert_eq!(decode_task_module_bytes(&raw).unwrap(), m); + } +} diff --git a/src/storage/task/rocksdb_storage.rs b/src/storage/task/rocksdb_storage.rs index 31709a51..cea0ceb9 100644 --- a/src/storage/task/rocksdb_storage.rs +++ b/src/storage/task/rocksdb_storage.rs @@ -14,12 +14,15 @@ //! //! Uses three column families: task_meta, task_config, task_payload. 
-use super::storage::{StoredTaskInfo, TaskModuleBytes, TaskStorage}; +use super::proto_codec::{ + decode_task_metadata_bytes, decode_task_module_bytes, encode_task_metadata_bytes, + encode_task_module_bytes, +}; +use super::storage::{StoredTaskInfo, TaskStorage}; use crate::config::storage::RocksDBStorageConfig; use crate::runtime::common::ComponentState; use anyhow::{Context, Result, anyhow}; use rocksdb::{ColumnFamilyDescriptor, DB, IteratorMode, Options, WriteBatch}; -use serde::{Deserialize, Serialize}; use std::path::Path; use std::sync::Arc; @@ -27,14 +30,6 @@ const CF_METADATA: &str = "task_meta"; const CF_CONFIG: &str = "task_config"; const CF_PAYLOAD: &str = "task_payload"; -#[derive(Debug, Clone, Serialize, Deserialize)] -struct TaskMetadata { - task_type: String, - state: ComponentState, - created_at: u64, - checkpoint_id: Option, -} - pub struct RocksDBTaskStorage { db: Arc, } @@ -95,19 +90,19 @@ impl TaskStorage for RocksDBTaskStorage { return Err(anyhow!("Task uniqueness violation: {}", task_info.name)); } - let meta = TaskMetadata { - task_type: task_info.task_type.clone(), - state: task_info.state.clone(), - created_at: task_info.created_at, - checkpoint_id: task_info.checkpoint_id, - }; + let meta_bytes = encode_task_metadata_bytes( + &task_info.task_type, + &task_info.state, + task_info.created_at, + task_info.checkpoint_id, + )?; let mut batch = WriteBatch::default(); - batch.put_cf(&cf_meta, key, bincode::serialize(&meta)?); + batch.put_cf(&cf_meta, key, meta_bytes); batch.put_cf(&cf_conf, key, &task_info.config_bytes); if let Some(ref module) = task_info.module_bytes { - batch.put_cf(&cf_payl, key, bincode::serialize(module)?); + batch.put_cf(&cf_payl, key, encode_task_module_bytes(module)?); } self.db @@ -124,10 +119,19 @@ impl TaskStorage for RocksDBTaskStorage { .get_cf(&cf, key)? 
.ok_or_else(|| anyhow!("Task {} not found", task_name))?; - let mut meta: TaskMetadata = bincode::deserialize(&raw)?; - meta.state = new_state; - - self.db.put_cf(&cf, key, bincode::serialize(&meta)?)?; + let mut decoded = decode_task_metadata_bytes(&raw)?; + decoded.state = new_state; + + self.db.put_cf( + &cf, + key, + encode_task_metadata_bytes( + &decoded.task_type, + &decoded.state, + decoded.created_at, + decoded.checkpoint_id, + )?, + )?; Ok(()) } @@ -140,10 +144,19 @@ impl TaskStorage for RocksDBTaskStorage { .get_cf(&cf, key)? .ok_or_else(|| anyhow!("Task {} not found", task_name))?; - let mut meta: TaskMetadata = bincode::deserialize(&raw)?; - meta.checkpoint_id = checkpoint_id; - - self.db.put_cf(&cf, key, bincode::serialize(&meta)?)?; + let mut decoded = decode_task_metadata_bytes(&raw)?; + decoded.checkpoint_id = checkpoint_id; + + self.db.put_cf( + &cf, + key, + encode_task_metadata_bytes( + &decoded.task_type, + &decoded.state, + decoded.created_at, + decoded.checkpoint_id, + )?, + )?; Ok(()) } @@ -171,12 +184,12 @@ impl TaskStorage for RocksDBTaskStorage { .get_cf(&self.get_cf(CF_CONFIG)?, key)? .ok_or_else(|| anyhow!("Config missing: {}", task_name))?; - let module_bytes = self - .db - .get_cf(&self.get_cf(CF_PAYLOAD)?, key)? - .and_then(|b| bincode::deserialize::(&b).ok()); + let module_bytes = match self.db.get_cf(&self.get_cf(CF_PAYLOAD)?, key)? 
{ + None => None, + Some(b) => Some(decode_task_module_bytes(&b)?), + }; - let meta: TaskMetadata = bincode::deserialize(&meta_raw)?; + let meta = decode_task_metadata_bytes(&meta_raw)?; Ok(StoredTaskInfo { name: task_name.to_string(), diff --git a/src/storage/task/storage.rs b/src/storage/task/storage.rs index 3c9e4080..156ee5d8 100644 --- a/src/storage/task/storage.rs +++ b/src/storage/task/storage.rs @@ -15,7 +15,7 @@ use anyhow::Result; use serde::{Deserialize, Serialize}; #[allow(dead_code)] -#[derive(Debug, Clone, Serialize, Deserialize)] +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] pub enum TaskModuleBytes { Wasm(Vec), Python {