diff --git a/Cargo.lock b/Cargo.lock index 26f07400..9cdca7e8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -11,18 +11,24 @@ dependencies = [ "gimli", ] +[[package]] +name = "adler2" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" + [[package]] name = "ahash" -version = "0.8.12" +version = "0.8.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" +checksum = "77c3a9648d43b9cd48db467b3f87fdd6e146bcc88ab0180006cef2179fe11d01" dependencies = [ "cfg-if", "const-random", - "getrandom 0.3.4", + "getrandom 0.2.16", "once_cell", "version_check", - "zerocopy", + "zerocopy 0.7.35", ] [[package]] @@ -34,6 +40,21 @@ dependencies = [ "memchr", ] +[[package]] +name = "alloc-no-stdlib" +version = "2.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc7bb162ec39d46ab1ca8c77bf72e890535becd1751bb45f64c597edb4c8c6b3" + +[[package]] +name = "alloc-stdlib" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94fb8275041c72129eb51b7d0322c29b8387a0386127718b096429201a5d6ece" +dependencies = [ + "alloc-no-stdlib", +] + [[package]] name = "allocator-api2" version = "0.2.21" @@ -111,12 +132,68 @@ version = "1.0.100" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a23eb6b1614318a8071c9b2521f36b424b2c83db5eb3a0fead4a6c0809af6e61" +[[package]] +name = "ar_archive_writer" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7eb93bbb63b9c227414f6eb3a0adfddca591a8ce1e9b60661bb08969b87e340b" +dependencies = [ + "object", +] + [[package]] name = "arbitrary" version = "1.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c3d036a3c4ab069c7b410a2ce876bd74808d2d0888a82667669f8e783a898bf1" +[[package]] +name = 
"arrayref" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76a2e8124351fda1ef8aaaa3bbd7ebbcb486bbcd4225aca0aa0d84bb2db8fecb" + +[[package]] +name = "arrayvec" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" + +[[package]] +name = "arrow" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3f15b4c6b148206ff3a2b35002e08929c2462467b62b9c02036d9c34f9ef994" +dependencies = [ + "arrow-arith", + "arrow-array 55.2.0", + "arrow-buffer 55.2.0", + "arrow-cast 55.2.0", + "arrow-csv", + "arrow-data 55.2.0", + "arrow-ipc 55.2.0", + "arrow-json 55.2.0 (registry+https://github.com/rust-lang/crates.io-index)", + "arrow-ord", + "arrow-row", + "arrow-schema 55.2.0", + "arrow-select 55.2.0", + "arrow-string", +] + +[[package]] +name = "arrow-arith" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30feb679425110209ae35c3fbf82404a39a4c0436bb3ec36164d8bffed2a4ce4" +dependencies = [ + "arrow-array 55.2.0", + "arrow-buffer 55.2.0", + "arrow-data 55.2.0", + "arrow-schema 55.2.0", + "chrono", + "num", +] + [[package]] name = "arrow-array" version = "52.2.0" @@ -124,15 +201,32 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "16f4a9468c882dc66862cef4e1fd8423d47e67972377d85d80e022786427768c" dependencies = [ "ahash", - "arrow-buffer", - "arrow-data", - "arrow-schema", + "arrow-buffer 52.2.0", + "arrow-data 52.2.0", + "arrow-schema 52.2.0", "chrono", "half", "hashbrown 0.14.5", "num", ] +[[package]] +name = "arrow-array" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70732f04d285d49054a48b72c54f791bb3424abae92d27aafdf776c98af161c8" +dependencies = [ + "ahash", + "arrow-buffer 55.2.0", + "arrow-data 55.2.0", + "arrow-schema 55.2.0", + "chrono", + 
"chrono-tz", + "half", + "hashbrown 0.15.5", + "num", +] + [[package]] name = "arrow-buffer" version = "52.2.0" @@ -144,34 +238,93 @@ dependencies = [ "num", ] +[[package]] +name = "arrow-buffer" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "169b1d5d6cb390dd92ce582b06b23815c7953e9dfaaea75556e89d890d19993d" +dependencies = [ + "bytes", + "half", + "num", +] + [[package]] name = "arrow-cast" version = "52.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "da26719e76b81d8bc3faad1d4dbdc1bcc10d14704e63dc17fc9f3e7e1e567c8e" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "arrow-select", + "arrow-array 52.2.0", + "arrow-buffer 52.2.0", + "arrow-data 52.2.0", + "arrow-schema 52.2.0", + "arrow-select 52.2.0", + "atoi", + "base64", + "chrono", + "half", + "lexical-core 0.8.5", + "num", + "ryu", +] + +[[package]] +name = "arrow-cast" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4f12eccc3e1c05a766cafb31f6a60a46c2f8efec9b74c6e0648766d30686af8" +dependencies = [ + "arrow-array 55.2.0", + "arrow-buffer 55.2.0", + "arrow-data 55.2.0", + "arrow-schema 55.2.0", + "arrow-select 55.2.0", "atoi", "base64", "chrono", + "comfy-table", "half", - "lexical-core", + "lexical-core 1.0.6", "num", "ryu", ] +[[package]] +name = "arrow-csv" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "012c9fef3f4a11573b2c74aec53712ff9fdae4a95f4ce452d1bbf088ee00f06b" +dependencies = [ + "arrow-array 55.2.0", + "arrow-cast 55.2.0", + "arrow-schema 55.2.0", + "chrono", + "csv", + "csv-core", + "regex", +] + [[package]] name = "arrow-data" version = "52.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dd9d6f18c65ef7a2573ab498c374d8ae364b4a4edf67105357491c031f716ca5" dependencies = [ - "arrow-buffer", - "arrow-schema", + "arrow-buffer 52.2.0", + 
"arrow-schema 52.2.0", + "half", + "num", +] + +[[package]] +name = "arrow-data" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8de1ce212d803199684b658fc4ba55fb2d7e87b213de5af415308d2fee3619c2" +dependencies = [ + "arrow-buffer 55.2.0", + "arrow-schema 55.2.0", "half", "num", ] @@ -182,12 +335,96 @@ version = "52.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e786e1cdd952205d9a8afc69397b317cfbb6e0095e445c69cda7e8da5c1eeb0f" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-cast", - "arrow-data", - "arrow-schema", - "flatbuffers", + "arrow-array 52.2.0", + "arrow-buffer 52.2.0", + "arrow-cast 52.2.0", + "arrow-data 52.2.0", + "arrow-schema 52.2.0", + "flatbuffers 24.12.23", +] + +[[package]] +name = "arrow-ipc" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9ea5967e8b2af39aff5d9de2197df16e305f47f404781d3230b2dc672da5d92" +dependencies = [ + "arrow-array 55.2.0", + "arrow-buffer 55.2.0", + "arrow-data 55.2.0", + "arrow-schema 55.2.0", + "flatbuffers 25.12.19", + "lz4_flex", +] + +[[package]] +name = "arrow-json" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5709d974c4ea5be96d900c01576c7c0b99705f4a3eec343648cb1ca863988a9c" +dependencies = [ + "arrow-array 55.2.0", + "arrow-buffer 55.2.0", + "arrow-cast 55.2.0", + "arrow-data 55.2.0", + "arrow-schema 55.2.0", + "chrono", + "half", + "indexmap 2.12.1", + "lexical-core 1.0.6", + "memchr", + "num", + "serde", + "serde_json", + "simdutf8", +] + +[[package]] +name = "arrow-json" +version = "55.2.0" +source = "git+https://github.com/ArroyoSystems/arrow-rs?branch=55.2.0%2Fjson#d31f8d8f97c6e1394b52927cd8c23c14fec6ba16" +dependencies = [ + "arrow-array 55.2.0", + "arrow-buffer 55.2.0", + "arrow-cast 55.2.0", + "arrow-data 55.2.0", + "arrow-schema 55.2.0", + "base64", + "chrono", + "half", + "indexmap 2.12.1", + "lexical-core 
1.0.6", + "memchr", + "num", + "serde", + "serde_json", + "simdutf8", +] + +[[package]] +name = "arrow-ord" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6506e3a059e3be23023f587f79c82ef0bcf6d293587e3272d20f2d30b969b5a7" +dependencies = [ + "arrow-array 55.2.0", + "arrow-buffer 55.2.0", + "arrow-data 55.2.0", + "arrow-schema 55.2.0", + "arrow-select 55.2.0", +] + +[[package]] +name = "arrow-row" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52bf7393166beaf79b4bed9bfdf19e97472af32ce5b6b48169d321518a08cae2" +dependencies = [ + "arrow-array 55.2.0", + "arrow-buffer 55.2.0", + "arrow-data 55.2.0", + "arrow-schema 55.2.0", + "half", ] [[package]] @@ -196,6 +433,16 @@ version = "52.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9e972cd1ff4a4ccd22f86d3e53e835c2ed92e0eea6a3e8eadb72b4f1ac802cf8" +[[package]] +name = "arrow-schema" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af7686986a3bf2254c9fb130c623cdcb2f8e1f15763e7c71c310f0834da3d292" +dependencies = [ + "serde", + "serde_json", +] + [[package]] name = "arrow-select" version = "52.2.0" @@ -203,11 +450,59 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "600bae05d43483d216fb3494f8c32fdbefd8aa4e1de237e790dbb3d9f44690a3" dependencies = [ "ahash", - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", + "arrow-array 52.2.0", + "arrow-buffer 52.2.0", + "arrow-data 52.2.0", + "arrow-schema 52.2.0", + "num", +] + +[[package]] +name = "arrow-select" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dd2b45757d6a2373faa3352d02ff5b54b098f5e21dccebc45a21806bc34501e5" +dependencies = [ + "ahash", + "arrow-array 55.2.0", + "arrow-buffer 55.2.0", + "arrow-data 55.2.0", + "arrow-schema 55.2.0", + "num", +] + +[[package]] +name = "arrow-string" +version 
= "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0377d532850babb4d927a06294314b316e23311503ed580ec6ce6a0158f49d40" +dependencies = [ + "arrow-array 55.2.0", + "arrow-buffer 55.2.0", + "arrow-data 55.2.0", + "arrow-schema 55.2.0", + "arrow-select 55.2.0", + "memchr", "num", + "regex", + "regex-syntax", +] + +[[package]] +name = "async-compression" +version = "0.4.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06575e6a9673580f52661c92107baabffbf41e2141373441cbcdc47cb733003c" +dependencies = [ + "bzip2", + "flate2", + "futures-core", + "memchr", + "pin-project-lite", + "tokio", + "xz2", + "zstd", + "zstd-safe", ] [[package]] @@ -317,13 +612,37 @@ version = "0.22.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" +[[package]] +name = "bigdecimal" +version = "0.4.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4d6867f1565b3aad85681f1015055b087fcfd840d6aeee6eee7f2da317603695" +dependencies = [ + "autocfg", + "libm", + "num-bigint", + "num-integer", + "num-traits", +] + [[package]] name = "bincode" -version = "1.3.3" +version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad" +checksum = "36eaf5d7b090263e8150820482d5d93cd964a81e4019913c972f4edcc6edb740" dependencies = [ + "bincode_derive", "serde", + "unty", +] + +[[package]] +name = "bincode_derive" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf95709a440f45e986983918d0e8a1f30a9b1df04918fc828670606804ac3c09" +dependencies = [ + "virtue", ] [[package]] @@ -386,6 +705,29 @@ dependencies = [ "typenum", ] +[[package]] +name = "blake2" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"46502ad458c9a52b69d4d4d32775c788b7a1b85e8bc9d482d92250fc0e3f8efe" +dependencies = [ + "digest", +] + +[[package]] +name = "blake3" +version = "1.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2468ef7d57b3fb7e16b576e8377cdbde2320c60e1491e961d11da40fc4f02a2d" +dependencies = [ + "arrayref", + "arrayvec", + "cc", + "cfg-if", + "constant_time_eq", + "cpufeatures", +] + [[package]] name = "block-buffer" version = "0.10.4" @@ -395,6 +737,27 @@ dependencies = [ "generic-array", ] +[[package]] +name = "brotli" +version = "8.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4bd8b9603c7aa97359dbd97ecf258968c95f3adddd6db2f7e7a5bef101c84560" +dependencies = [ + "alloc-no-stdlib", + "alloc-stdlib", + "brotli-decompressor", +] + +[[package]] +name = "brotli-decompressor" +version = "5.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "874bb8112abecc98cbd6d81ea4fa7e94fb9449648c93cc89aa40c81c24d7de03" +dependencies = [ + "alloc-no-stdlib", + "alloc-stdlib", +] + [[package]] name = "bumpalo" version = "3.19.1" @@ -404,11 +767,32 @@ dependencies = [ "allocator-api2", ] +[[package]] +name = "bytecount" +version = "0.6.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "175812e0be2bccb6abe50bb8d566126198344f707e304f45c648fd8f2cc0365e" + +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + [[package]] name = "bytes" -version = "1.11.0" +version = "1.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33" + +[[package]] +name = "bzip2" +version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b35204fbdc0b3f4446b89fc1ac2cf84a8a68971995d0bf2e925ec7cd960f9cb3" +checksum = 
"49ecfb22d906f800d4fe833b6282cf4dc1c298f5057ca0b5445e5c209735ca47" +dependencies = [ + "bzip2-sys", +] [[package]] name = "bzip2-sys" @@ -420,6 +804,15 @@ dependencies = [ "pkg-config", ] +[[package]] +name = "camino" +version = "1.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e629a66d692cb9ff1a1c664e41771b3dcaf961985a9774c0eb0bd1b51cf60a48" +dependencies = [ + "serde_core", +] + [[package]] name = "cap-fs-ext" version = "3.4.5" @@ -469,7 +862,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d8144c22e24bbcf26ade86cb6501a0916c46b7e4787abdb0045a467eb1645a1d" dependencies = [ "ambient-authority", - "rand", + "rand 0.8.5", ] [[package]] @@ -498,6 +891,28 @@ dependencies = [ "winx", ] +[[package]] +name = "cargo-platform" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e35af189006b9c0f00a064685c727031e3ed2d8020f7ba284d78cc2671bd36ea" +dependencies = [ + "serde", +] + +[[package]] +name = "cargo_metadata" +version = "0.14.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4acbb09d9ee8e23699b9634375c72795d095bf268439da88562cf9b501f181fa" +dependencies = [ + "camino", + "cargo-platform", + "semver", + "serde", + "serde_json", +] + [[package]] name = "cc" version = "1.2.51" @@ -538,10 +953,22 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "145052bdd345b87320e369255277e3fb5152762ad123a901ef5c262dd38fe8d2" dependencies = [ "iana-time-zone", + "js-sys", "num-traits", + "wasm-bindgen", "windows-link", ] +[[package]] +name = "chrono-tz" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6139a8597ed92cf816dfb33f5dd6cf0bb93a6adc938f11039f371bc5bcd26c3" +dependencies = [ + "chrono", + "phf", +] + [[package]] name = "clang-sys" version = "1.8.1" @@ -581,7 +1008,7 @@ version = "4.5.49" source = "registry+https://github.com/rust-lang/crates.io-index" 
checksum = "2a0b5487afeab2deb2ff4e03a807ad1a03ac532ff5a2cee5d86884440c7f7671" dependencies = [ - "heck", + "heck 0.5.0", "proc-macro2", "quote", "syn", @@ -657,6 +1084,12 @@ dependencies = [ "tiny-keccak", ] +[[package]] +name = "constant_time_eq" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d52eff69cd5e647efe296129160853a42795992097e8af39800e1060caeea9b" + [[package]] name = "core-foundation-sys" version = "0.8.7" @@ -757,7 +1190,7 @@ dependencies = [ "cranelift-assembler-x64-meta", "cranelift-codegen-shared", "cranelift-srcgen", - "heck", + "heck 0.5.0", "pulley-interpreter", ] @@ -904,6 +1337,649 @@ dependencies = [ "typenum", ] +[[package]] +name = "csv" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52cd9d68cf7efc6ddfaaee42e7288d3a99d613d4b50f76ce9827ae0c6e14f938" +dependencies = [ + "csv-core", + "itoa", + "ryu", + "serde_core", +] + +[[package]] +name = "csv-core" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "704a3c26996a80471189265814dbc2c257598b96b8a7feae2d31ace646bb9782" +dependencies = [ + "memchr", +] + +[[package]] +name = "dashmap" +version = "5.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "978747c1d849a7d2ee5e8adc0159961c48fb7e5db2f06af6723b80123bb53856" +dependencies = [ + "cfg-if", + "hashbrown 0.14.5", + "lock_api", + "once_cell", + "parking_lot_core", +] + +[[package]] +name = "dashmap" +version = "6.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5041cc499144891f3790297212f32a74fb938e5136a14943f338ef9e0ae276cf" +dependencies = [ + "cfg-if", + "crossbeam-utils", + "hashbrown 0.14.5", + "lock_api", + "once_cell", + "parking_lot_core", +] + +[[package]] +name = "datafusion" +version = "48.0.1" +source = "git+https://github.com/ArroyoSystems/arrow-datafusion?branch=48.0.1%2Farroyo#916b45f5c28d94765ae4a6393c5e126b2ea55e1c" 
+dependencies = [ + "arrow", + "arrow-ipc 55.2.0", + "arrow-schema 55.2.0", + "async-trait", + "bytes", + "bzip2", + "chrono", + "datafusion-catalog", + "datafusion-catalog-listing", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", + "datafusion-datasource-csv", + "datafusion-datasource-json", + "datafusion-datasource-parquet", + "datafusion-execution", + "datafusion-expr", + "datafusion-expr-common", + "datafusion-functions", + "datafusion-functions-aggregate", + "datafusion-functions-nested", + "datafusion-functions-table", + "datafusion-functions-window", + "datafusion-optimizer", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-optimizer", + "datafusion-physical-plan", + "datafusion-session", + "datafusion-sql", + "flate2", + "futures", + "itertools 0.14.0", + "log", + "object_store", + "parking_lot", + "parquet 55.2.0 (registry+https://github.com/rust-lang/crates.io-index)", + "rand 0.9.2", + "regex", + "sqlparser", + "tempfile", + "tokio", + "url", + "uuid", + "xz2", + "zstd", +] + +[[package]] +name = "datafusion-catalog" +version = "48.0.1" +source = "git+https://github.com/ArroyoSystems/arrow-datafusion?branch=48.0.1%2Farroyo#916b45f5c28d94765ae4a6393c5e126b2ea55e1c" +dependencies = [ + "arrow", + "async-trait", + "dashmap 6.1.0", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-plan", + "datafusion-session", + "datafusion-sql", + "futures", + "itertools 0.14.0", + "log", + "object_store", + "parking_lot", + "tokio", +] + +[[package]] +name = "datafusion-catalog-listing" +version = "48.0.1" +source = "git+https://github.com/ArroyoSystems/arrow-datafusion?branch=48.0.1%2Farroyo#916b45f5c28d94765ae4a6393c5e126b2ea55e1c" +dependencies = [ + "arrow", + "async-trait", + "datafusion-catalog", + "datafusion-common", + "datafusion-datasource", + 
"datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-session", + "futures", + "log", + "object_store", + "tokio", +] + +[[package]] +name = "datafusion-common" +version = "48.0.1" +source = "git+https://github.com/ArroyoSystems/arrow-datafusion?branch=48.0.1%2Farroyo#916b45f5c28d94765ae4a6393c5e126b2ea55e1c" +dependencies = [ + "ahash", + "arrow", + "arrow-ipc 55.2.0", + "base64", + "half", + "hashbrown 0.14.5", + "indexmap 2.12.1", + "libc", + "log", + "object_store", + "parquet 55.2.0 (registry+https://github.com/rust-lang/crates.io-index)", + "paste", + "recursive", + "sqlparser", + "tokio", + "web-time", +] + +[[package]] +name = "datafusion-common-runtime" +version = "48.0.1" +source = "git+https://github.com/ArroyoSystems/arrow-datafusion?branch=48.0.1%2Farroyo#916b45f5c28d94765ae4a6393c5e126b2ea55e1c" +dependencies = [ + "futures", + "log", + "tokio", +] + +[[package]] +name = "datafusion-datasource" +version = "48.0.1" +source = "git+https://github.com/ArroyoSystems/arrow-datafusion?branch=48.0.1%2Farroyo#916b45f5c28d94765ae4a6393c5e126b2ea55e1c" +dependencies = [ + "arrow", + "async-compression", + "async-trait", + "bytes", + "bzip2", + "chrono", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-session", + "flate2", + "futures", + "glob", + "itertools 0.14.0", + "log", + "object_store", + "parquet 55.2.0 (registry+https://github.com/rust-lang/crates.io-index)", + "rand 0.9.2", + "tempfile", + "tokio", + "tokio-util", + "url", + "xz2", + "zstd", +] + +[[package]] +name = "datafusion-datasource-csv" +version = "48.0.1" +source = "git+https://github.com/ArroyoSystems/arrow-datafusion?branch=48.0.1%2Farroyo#916b45f5c28d94765ae4a6393c5e126b2ea55e1c" +dependencies = [ + "arrow", + "async-trait", + 
"bytes", + "datafusion-catalog", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-session", + "futures", + "object_store", + "regex", + "tokio", +] + +[[package]] +name = "datafusion-datasource-json" +version = "48.0.1" +source = "git+https://github.com/ArroyoSystems/arrow-datafusion?branch=48.0.1%2Farroyo#916b45f5c28d94765ae4a6393c5e126b2ea55e1c" +dependencies = [ + "arrow", + "async-trait", + "bytes", + "datafusion-catalog", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-session", + "futures", + "object_store", + "serde_json", + "tokio", +] + +[[package]] +name = "datafusion-datasource-parquet" +version = "48.0.1" +source = "git+https://github.com/ArroyoSystems/arrow-datafusion?branch=48.0.1%2Farroyo#916b45f5c28d94765ae4a6393c5e126b2ea55e1c" +dependencies = [ + "arrow", + "async-trait", + "bytes", + "datafusion-catalog", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-functions-aggregate", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-optimizer", + "datafusion-physical-plan", + "datafusion-session", + "futures", + "itertools 0.14.0", + "log", + "object_store", + "parking_lot", + "parquet 55.2.0 (registry+https://github.com/rust-lang/crates.io-index)", + "rand 0.9.2", + "tokio", +] + +[[package]] +name = "datafusion-doc" +version = "48.0.1" +source = "git+https://github.com/ArroyoSystems/arrow-datafusion?branch=48.0.1%2Farroyo#916b45f5c28d94765ae4a6393c5e126b2ea55e1c" + +[[package]] +name = "datafusion-execution" +version = "48.0.1" +source = 
"git+https://github.com/ArroyoSystems/arrow-datafusion?branch=48.0.1%2Farroyo#916b45f5c28d94765ae4a6393c5e126b2ea55e1c" +dependencies = [ + "arrow", + "dashmap 6.1.0", + "datafusion-common", + "datafusion-expr", + "futures", + "log", + "object_store", + "parking_lot", + "rand 0.9.2", + "tempfile", + "url", +] + +[[package]] +name = "datafusion-expr" +version = "48.0.1" +source = "git+https://github.com/ArroyoSystems/arrow-datafusion?branch=48.0.1%2Farroyo#916b45f5c28d94765ae4a6393c5e126b2ea55e1c" +dependencies = [ + "arrow", + "chrono", + "datafusion-common", + "datafusion-doc", + "datafusion-expr-common", + "datafusion-functions-aggregate-common", + "datafusion-functions-window-common", + "datafusion-physical-expr-common", + "indexmap 2.12.1", + "paste", + "recursive", + "serde_json", + "sqlparser", +] + +[[package]] +name = "datafusion-expr-common" +version = "48.0.1" +source = "git+https://github.com/ArroyoSystems/arrow-datafusion?branch=48.0.1%2Farroyo#916b45f5c28d94765ae4a6393c5e126b2ea55e1c" +dependencies = [ + "arrow", + "datafusion-common", + "indexmap 2.12.1", + "itertools 0.14.0", + "paste", +] + +[[package]] +name = "datafusion-functions" +version = "48.0.1" +source = "git+https://github.com/ArroyoSystems/arrow-datafusion?branch=48.0.1%2Farroyo#916b45f5c28d94765ae4a6393c5e126b2ea55e1c" +dependencies = [ + "arrow", + "arrow-buffer 55.2.0", + "base64", + "blake2", + "blake3", + "chrono", + "datafusion-common", + "datafusion-doc", + "datafusion-execution", + "datafusion-expr", + "datafusion-expr-common", + "datafusion-macros", + "hex", + "itertools 0.14.0", + "log", + "md-5", + "rand 0.9.2", + "regex", + "sha2", + "unicode-segmentation", + "uuid", +] + +[[package]] +name = "datafusion-functions-aggregate" +version = "48.0.1" +source = "git+https://github.com/ArroyoSystems/arrow-datafusion?branch=48.0.1%2Farroyo#916b45f5c28d94765ae4a6393c5e126b2ea55e1c" +dependencies = [ + "ahash", + "arrow", + "datafusion-common", + "datafusion-doc", + 
"datafusion-execution", + "datafusion-expr", + "datafusion-functions-aggregate-common", + "datafusion-macros", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "half", + "log", + "paste", +] + +[[package]] +name = "datafusion-functions-aggregate-common" +version = "48.0.1" +source = "git+https://github.com/ArroyoSystems/arrow-datafusion?branch=48.0.1%2Farroyo#916b45f5c28d94765ae4a6393c5e126b2ea55e1c" +dependencies = [ + "ahash", + "arrow", + "datafusion-common", + "datafusion-expr-common", + "datafusion-physical-expr-common", +] + +[[package]] +name = "datafusion-functions-nested" +version = "48.0.1" +source = "git+https://github.com/ArroyoSystems/arrow-datafusion?branch=48.0.1%2Farroyo#916b45f5c28d94765ae4a6393c5e126b2ea55e1c" +dependencies = [ + "arrow", + "arrow-ord", + "datafusion-common", + "datafusion-doc", + "datafusion-execution", + "datafusion-expr", + "datafusion-functions", + "datafusion-functions-aggregate", + "datafusion-macros", + "datafusion-physical-expr-common", + "itertools 0.14.0", + "log", + "paste", +] + +[[package]] +name = "datafusion-functions-table" +version = "48.0.1" +source = "git+https://github.com/ArroyoSystems/arrow-datafusion?branch=48.0.1%2Farroyo#916b45f5c28d94765ae4a6393c5e126b2ea55e1c" +dependencies = [ + "arrow", + "async-trait", + "datafusion-catalog", + "datafusion-common", + "datafusion-expr", + "datafusion-physical-plan", + "parking_lot", + "paste", +] + +[[package]] +name = "datafusion-functions-window" +version = "48.0.1" +source = "git+https://github.com/ArroyoSystems/arrow-datafusion?branch=48.0.1%2Farroyo#916b45f5c28d94765ae4a6393c5e126b2ea55e1c" +dependencies = [ + "arrow", + "datafusion-common", + "datafusion-doc", + "datafusion-expr", + "datafusion-functions-window-common", + "datafusion-macros", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "log", + "paste", +] + +[[package]] +name = "datafusion-functions-window-common" +version = "48.0.1" +source = 
"git+https://github.com/ArroyoSystems/arrow-datafusion?branch=48.0.1%2Farroyo#916b45f5c28d94765ae4a6393c5e126b2ea55e1c" +dependencies = [ + "datafusion-common", + "datafusion-physical-expr-common", +] + +[[package]] +name = "datafusion-macros" +version = "48.0.1" +source = "git+https://github.com/ArroyoSystems/arrow-datafusion?branch=48.0.1%2Farroyo#916b45f5c28d94765ae4a6393c5e126b2ea55e1c" +dependencies = [ + "datafusion-expr", + "quote", + "syn", +] + +[[package]] +name = "datafusion-optimizer" +version = "48.0.1" +source = "git+https://github.com/ArroyoSystems/arrow-datafusion?branch=48.0.1%2Farroyo#916b45f5c28d94765ae4a6393c5e126b2ea55e1c" +dependencies = [ + "arrow", + "chrono", + "datafusion-common", + "datafusion-expr", + "datafusion-physical-expr", + "indexmap 2.12.1", + "itertools 0.14.0", + "log", + "recursive", + "regex", + "regex-syntax", +] + +[[package]] +name = "datafusion-physical-expr" +version = "48.0.1" +source = "git+https://github.com/ArroyoSystems/arrow-datafusion?branch=48.0.1%2Farroyo#916b45f5c28d94765ae4a6393c5e126b2ea55e1c" +dependencies = [ + "ahash", + "arrow", + "datafusion-common", + "datafusion-expr", + "datafusion-expr-common", + "datafusion-functions-aggregate-common", + "datafusion-physical-expr-common", + "half", + "hashbrown 0.14.5", + "indexmap 2.12.1", + "itertools 0.14.0", + "log", + "paste", + "petgraph 0.8.3", +] + +[[package]] +name = "datafusion-physical-expr-common" +version = "48.0.1" +source = "git+https://github.com/ArroyoSystems/arrow-datafusion?branch=48.0.1%2Farroyo#916b45f5c28d94765ae4a6393c5e126b2ea55e1c" +dependencies = [ + "ahash", + "arrow", + "datafusion-common", + "datafusion-expr-common", + "hashbrown 0.14.5", + "itertools 0.14.0", +] + +[[package]] +name = "datafusion-physical-optimizer" +version = "48.0.1" +source = "git+https://github.com/ArroyoSystems/arrow-datafusion?branch=48.0.1%2Farroyo#916b45f5c28d94765ae4a6393c5e126b2ea55e1c" +dependencies = [ + "arrow", + "datafusion-common", + 
"datafusion-execution", + "datafusion-expr", + "datafusion-expr-common", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "itertools 0.14.0", + "log", + "recursive", +] + +[[package]] +name = "datafusion-physical-plan" +version = "48.0.1" +source = "git+https://github.com/ArroyoSystems/arrow-datafusion?branch=48.0.1%2Farroyo#916b45f5c28d94765ae4a6393c5e126b2ea55e1c" +dependencies = [ + "ahash", + "arrow", + "arrow-ord", + "arrow-schema 55.2.0", + "async-trait", + "chrono", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-execution", + "datafusion-expr", + "datafusion-functions-window-common", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "futures", + "half", + "hashbrown 0.14.5", + "indexmap 2.12.1", + "itertools 0.14.0", + "log", + "parking_lot", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "datafusion-proto" +version = "48.0.1" +source = "git+https://github.com/ArroyoSystems/arrow-datafusion?branch=48.0.1%2Farroyo#916b45f5c28d94765ae4a6393c5e126b2ea55e1c" +dependencies = [ + "arrow", + "chrono", + "datafusion", + "datafusion-common", + "datafusion-expr", + "datafusion-proto-common", + "object_store", + "prost", +] + +[[package]] +name = "datafusion-proto-common" +version = "48.0.1" +source = "git+https://github.com/ArroyoSystems/arrow-datafusion?branch=48.0.1%2Farroyo#916b45f5c28d94765ae4a6393c5e126b2ea55e1c" +dependencies = [ + "arrow", + "datafusion-common", + "prost", +] + +[[package]] +name = "datafusion-session" +version = "48.0.1" +source = "git+https://github.com/ArroyoSystems/arrow-datafusion?branch=48.0.1%2Farroyo#916b45f5c28d94765ae4a6393c5e126b2ea55e1c" +dependencies = [ + "arrow", + "async-trait", + "dashmap 6.1.0", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-plan", + "datafusion-sql", + "futures", + "itertools 0.14.0", + "log", + 
"object_store", + "parking_lot", + "tokio", +] + +[[package]] +name = "datafusion-sql" +version = "48.0.1" +source = "git+https://github.com/ArroyoSystems/arrow-datafusion?branch=48.0.1%2Farroyo#916b45f5c28d94765ae4a6393c5e126b2ea55e1c" +dependencies = [ + "arrow", + "bigdecimal", + "datafusion-common", + "datafusion-expr", + "indexmap 2.12.1", + "log", + "recursive", + "regex", + "sqlparser", +] + [[package]] name = "debugid" version = "0.8.0" @@ -930,6 +2006,7 @@ checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" dependencies = [ "block-buffer", "crypto-common", + "subtle", ] [[package]] @@ -985,6 +2062,12 @@ dependencies = [ "shared_child", ] +[[package]] +name = "dyn-clone" +version = "1.0.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0881ea181b1df73ff77ffaaf9c7544ecc11e82fba9b5f27b262a3c73a332555" + [[package]] name = "either" version = "1.15.0" @@ -1047,6 +2130,15 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "error-chain" +version = "0.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d2f06b9cac1506ece98fe3231e3cc9c4410ec3d5b1f24ae1c8946f0742cdefc" +dependencies = [ + "version_check", +] + [[package]] name = "error-code" version = "3.3.2" @@ -1104,6 +2196,27 @@ dependencies = [ "rustc_version", ] +[[package]] +name = "flatbuffers" +version = "25.12.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35f6839d7b3b98adde531effaf34f0c2badc6f4735d26fe74709d8e513a96ef3" +dependencies = [ + "bitflags 2.10.0", + "rustc_version", +] + +[[package]] +name = "flate2" +version = "1.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "843fba2746e448b37e26a819579957415c8cef339bf08564fe8b7ddbd959573c" +dependencies = [ + "crc32fast", + "miniz_oxide", + "zlib-rs", +] + [[package]] name = "fnv" version = "1.0.7" @@ -1140,28 +2253,51 @@ dependencies = [ name = "function-stream" version = "0.6.0" 
dependencies = [ + "ahash", "anyhow", - "arrow-array", - "arrow-ipc", - "arrow-schema", + "arrow", + "arrow-array 55.2.0", + "arrow-ipc 55.2.0", + "arrow-json 55.2.0 (git+https://github.com/ArroyoSystems/arrow-rs?branch=55.2.0%2Fjson)", + "arrow-schema 55.2.0", "async-trait", "base64", "bincode", - "clap", + "chrono", "crossbeam-channel", + "datafusion", + "datafusion-common", + "datafusion-execution", + "datafusion-expr", + "datafusion-functions", + "datafusion-functions-window", + "datafusion-physical-expr", + "datafusion-physical-plan", + "datafusion-proto", + "futures", + "governor", + "hex", + "itertools 0.14.0", "log", "lru", + "mini-moka", "num_cpus", "parking_lot", - "pest", - "pest_derive", + "parquet 55.2.0 (git+https://github.com/ArroyoSystems/arrow-rs?branch=55.2.0%2Fparquet)", + "petgraph 0.7.1", "proctitle", + "prost", "protocol", + "rand 0.8.5", "rdkafka", "rocksdb", "serde", "serde_json", + "serde_json_path", "serde_yaml", + "sha2", + "sqlparser", + "strum", "thiserror 2.0.17", "tokio", "tokio-stream", @@ -1169,24 +2305,25 @@ dependencies = [ "tracing", "tracing-appender", "tracing-subscriber", + "typify", + "unicase", "uuid", "wasmtime", "wasmtime-wasi", + "xxhash-rust", ] [[package]] name = "function-stream-cli" version = "0.1.0" dependencies = [ - "arrow-array", - "arrow-ipc", - "arrow-schema", + "arrow-array 52.2.0", + "arrow-ipc 52.2.0", + "arrow-schema 52.2.0", "clap", "comfy-table", - "function-stream", "protocol", "rustyline", - "thiserror 2.0.17", "tokio", "tonic", ] @@ -1199,6 +2336,7 @@ checksum = "65bc07b1a8bc7c85c5f2e110c476c7389b4554ba72af57d8445ea63a576b0876" dependencies = [ "futures-channel", "futures-core", + "futures-executor", "futures-io", "futures-sink", "futures-task", @@ -1221,12 +2359,34 @@ version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e" +[[package]] +name = "futures-executor" +version = "0.3.31" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e28d1d997f585e54aebc3f97d39e72338912123a67330d723fdbb564d646c9f" +dependencies = [ + "futures-core", + "futures-task", + "futures-util", +] + [[package]] name = "futures-io" version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6" +[[package]] +name = "futures-macro" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "futures-sink" version = "0.3.31" @@ -1239,6 +2399,12 @@ version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988" +[[package]] +name = "futures-timer" +version = "3.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f288b0a4f20f9a56b5d1da57e2227c661b7b16168e2f72365f57b63326e29b24" + [[package]] name = "futures-util" version = "0.3.31" @@ -1248,6 +2414,7 @@ dependencies = [ "futures-channel", "futures-core", "futures-io", + "futures-macro", "futures-sink", "futures-task", "memchr", @@ -1298,9 +2465,11 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" dependencies = [ "cfg-if", + "js-sys", "libc", "r-efi", "wasip2", + "wasm-bindgen", ] [[package]] @@ -1320,6 +2489,29 @@ version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280" +[[package]] +name = "governor" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be93b4ec2e4710b04d9264c0c7350cdd62a8c20e5e4ac732552ebb8f0debe8eb" +dependencies = [ + "cfg-if", + "dashmap 
6.1.0", + "futures-sink", + "futures-timer", + "futures-util", + "getrandom 0.3.4", + "no-std-compat", + "nonzero_ext", + "parking_lot", + "portable-atomic", + "quanta", + "rand 0.9.2", + "smallvec", + "spinning_top", + "web-time", +] + [[package]] name = "h2" version = "0.4.12" @@ -1348,7 +2540,7 @@ dependencies = [ "cfg-if", "crunchy", "num-traits", - "zerocopy", + "zerocopy 0.8.31", ] [[package]] @@ -1357,11 +2549,24 @@ version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" +[[package]] +name = "hashbrown" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43a3c133739dddd0d2990f9a4bdf8eb4b21ef50e4851ca85ab661199821d510e" +dependencies = [ + "ahash", +] + [[package]] name = "hashbrown" version = "0.14.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" +dependencies = [ + "ahash", + "allocator-api2", +] [[package]] name = "hashbrown" @@ -1379,7 +2584,13 @@ dependencies = [ name = "hashbrown" version = "0.16.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" +checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" + +[[package]] +name = "heck" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" [[package]] name = "heck" @@ -1393,6 +2604,12 @@ version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c" +[[package]] +name = "hex" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" + 
[[package]] name = "home" version = "0.5.12" @@ -1649,7 +2866,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "af1955a75fa080c677d3972822ec4bad316169ab1cfc6c257a942c2265dbe5fe" dependencies = [ "bitmaps", - "rand_core", + "rand_core 0.6.4", "rand_xoshiro", "sized-chunks", "typenum", @@ -1678,6 +2895,21 @@ dependencies = [ "serde_core", ] +[[package]] +name = "integer-encoding" +version = "3.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" + +[[package]] +name = "inventory" +version = "0.3.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "009ae045c87e7082cb72dab0ccd01ae075dd00141ddc108f43a0ea150a9e7227" +dependencies = [ + "rustversion", +] + [[package]] name = "io-extras" version = "0.18.4" @@ -1811,11 +3043,24 @@ version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2cde5de06e8d4c2faabc400238f9ae1c74d5412d03a7bd067645ccbc47070e46" dependencies = [ - "lexical-parse-float", - "lexical-parse-integer", - "lexical-util", - "lexical-write-float", - "lexical-write-integer", + "lexical-parse-float 0.8.5", + "lexical-parse-integer 0.8.6", + "lexical-util 0.8.5", + "lexical-write-float 0.8.5", + "lexical-write-integer 0.8.5", +] + +[[package]] +name = "lexical-core" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d8d125a277f807e55a77304455eb7b1cb52f2b18c143b60e766c120bd64a594" +dependencies = [ + "lexical-parse-float 1.0.6", + "lexical-parse-integer 1.0.6", + "lexical-util 1.0.7", + "lexical-write-float 1.0.6", + "lexical-write-integer 1.0.6", ] [[package]] @@ -1824,21 +3069,40 @@ version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "683b3a5ebd0130b8fb52ba0bdc718cc56815b6a097e28ae5a6997d0ad17dc05f" dependencies = [ - "lexical-parse-integer", - "lexical-util", + 
"lexical-parse-integer 0.8.6", + "lexical-util 0.8.5", "static_assertions", ] +[[package]] +name = "lexical-parse-float" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52a9f232fbd6f550bc0137dcb5f99ab674071ac2d690ac69704593cb4abbea56" +dependencies = [ + "lexical-parse-integer 1.0.6", + "lexical-util 1.0.7", +] + [[package]] name = "lexical-parse-integer" version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6d0994485ed0c312f6d965766754ea177d07f9c00c9b82a5ee62ed5b47945ee9" dependencies = [ - "lexical-util", + "lexical-util 0.8.5", "static_assertions", ] +[[package]] +name = "lexical-parse-integer" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a7a039f8fb9c19c996cd7b2fcce303c1b2874fe1aca544edc85c4a5f8489b34" +dependencies = [ + "lexical-util 1.0.7", +] + [[package]] name = "lexical-util" version = "0.8.5" @@ -1848,27 +3112,52 @@ dependencies = [ "static_assertions", ] +[[package]] +name = "lexical-util" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2604dd126bb14f13fb5d1bd6a66155079cb9fa655b37f875b3a742c705dbed17" + [[package]] name = "lexical-write-float" version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "accabaa1c4581f05a3923d1b4cfd124c329352288b7b9da09e766b0668116862" dependencies = [ - "lexical-util", - "lexical-write-integer", + "lexical-util 0.8.5", + "lexical-write-integer 0.8.5", "static_assertions", ] +[[package]] +name = "lexical-write-float" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50c438c87c013188d415fbabbb1dceb44249ab81664efbd31b14ae55dabb6361" +dependencies = [ + "lexical-util 1.0.7", + "lexical-write-integer 1.0.6", +] + [[package]] name = "lexical-write-integer" version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"e1b6f3d1f4422866b68192d62f77bc5c700bee84f3069f2469d7bc8c77852446" dependencies = [ - "lexical-util", + "lexical-util 0.8.5", "static_assertions", ] +[[package]] +name = "lexical-write-integer" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "409851a618475d2d5796377cad353802345cba92c867d9fbcde9cf4eac4e14df" +dependencies = [ + "lexical-util 1.0.7", +] + [[package]] name = "libc" version = "0.2.179" @@ -1987,6 +3276,26 @@ dependencies = [ "libc", ] +[[package]] +name = "lz4_flex" +version = "0.11.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "373f5eceeeab7925e0c1098212f2fbc4d416adec9d35051a6ab251e824c1854a" +dependencies = [ + "twox-hash", +] + +[[package]] +name = "lzma-sys" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fda04ab3764e6cde78b9974eec4f779acaba7c4e84b36eca3cf77c581b85d27" +dependencies = [ + "cc", + "libc", + "pkg-config", +] + [[package]] name = "mach2" version = "0.4.3" @@ -2017,6 +3326,16 @@ version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4facc753ae494aeb6e3c22f839b158aebd4f9270f55cd3c79906c45476c47ab4" +[[package]] +name = "md-5" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d89e7ee0cfbedfc4da3340218492196241d89eefb6dab27de5df917a6d2e78cf" +dependencies = [ + "cfg-if", + "digest", +] + [[package]] name = "memchr" version = "2.7.6" @@ -2038,12 +3357,37 @@ version = "0.3.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" +[[package]] +name = "mini-moka" +version = "0.10.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c325dfab65f261f386debee8b0969da215b3fa0037e74c8a1234db7ba986d803" +dependencies = [ + "crossbeam-channel", + "crossbeam-utils", + "dashmap 5.5.3", + "skeptic", + "smallvec", + 
"tagptr", + "triomphe", +] + [[package]] name = "minimal-lexical" version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" +[[package]] +name = "miniz_oxide" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316" +dependencies = [ + "adler2", + "simd-adler32", +] + [[package]] name = "mio" version = "1.1.1" @@ -2082,6 +3426,12 @@ dependencies = [ "libc", ] +[[package]] +name = "no-std-compat" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b93853da6d84c2e3c7d730d6473e8817692dd89be387eb01b94d7f108ecb5b8c" + [[package]] name = "nom" version = "7.1.3" @@ -2092,6 +3442,12 @@ dependencies = [ "minimal-lexical", ] +[[package]] +name = "nonzero_ext" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38bf9645c8b145698bb0b18a4637dcacbc421ea49bef2317e4fd8065a387cf21" + [[package]] name = "nu-ansi-term" version = "0.50.3" @@ -2225,6 +3581,30 @@ dependencies = [ "memchr", ] +[[package]] +name = "object_store" +version = "0.12.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fbfbfff40aeccab00ec8a910b57ca8ecf4319b335c542f2edcd19dd25a1e2a00" +dependencies = [ + "async-trait", + "bytes", + "chrono", + "futures", + "http", + "humantime", + "itertools 0.14.0", + "parking_lot", + "percent-encoding", + "thiserror 2.0.17", + "tokio", + "tracing", + "url", + "walkdir", + "wasm-bindgen-futures", + "web-time", +] + [[package]] name = "once_cell" version = "1.21.3" @@ -2249,6 +3629,15 @@ dependencies = [ "vcpkg", ] +[[package]] +name = "ordered-float" +version = "2.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68f19d67e5a2795c94e73e0bb1cc1a7edeb2e28efd39e2e1c9b7a40c1108b11c" +dependencies = [ + "num-traits", +] + 
[[package]] name = "os_pipe" version = "1.2.3" @@ -2282,6 +3671,80 @@ dependencies = [ "windows-link", ] +[[package]] +name = "parquet" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b17da4150748086bd43352bc77372efa9b6e3dbd06a04831d2a98c041c225cfa" +dependencies = [ + "ahash", + "arrow-array 55.2.0", + "arrow-buffer 55.2.0", + "arrow-cast 55.2.0", + "arrow-data 55.2.0", + "arrow-ipc 55.2.0", + "arrow-schema 55.2.0", + "arrow-select 55.2.0", + "base64", + "brotli", + "bytes", + "chrono", + "flate2", + "futures", + "half", + "hashbrown 0.15.5", + "lz4_flex", + "num", + "num-bigint", + "object_store", + "paste", + "seq-macro", + "simdutf8", + "snap", + "thrift", + "tokio", + "twox-hash", + "zstd", +] + +[[package]] +name = "parquet" +version = "55.2.0" +source = "git+https://github.com/ArroyoSystems/arrow-rs?branch=55.2.0%2Fparquet#d1d2dd8edf673cddc79ba6403dc6508263a2ddda" +dependencies = [ + "ahash", + "arrow-array 55.2.0", + "arrow-buffer 55.2.0", + "arrow-cast 55.2.0", + "arrow-data 55.2.0", + "arrow-ipc 55.2.0", + "arrow-schema 55.2.0", + "arrow-select 55.2.0", + "base64", + "brotli", + "bytes", + "chrono", + "flate2", + "half", + "hashbrown 0.15.5", + "lz4_flex", + "num", + "num-bigint", + "paste", + "seq-macro", + "simdutf8", + "snap", + "thrift", + "twox-hash", + "zstd", +] + +[[package]] +name = "paste" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" + [[package]] name = "peeking_take_while" version = "0.1.2" @@ -2295,66 +3758,53 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220" [[package]] -name = "pest" -version = "2.8.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c9eb05c21a464ea704b53158d358a31e6425db2f63a1a7312268b05fe2b75f7" -dependencies = [ - "memchr", - 
"ucd-trie", -] - -[[package]] -name = "pest_derive" -version = "2.8.5" +name = "petgraph" +version = "0.6.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68f9dbced329c441fa79d80472764b1a2c7e57123553b8519b36663a2fb234ed" +checksum = "b4c5cc86750666a3ed20bdaf5ca2a0344f9c67674cae0515bec2da16fbaa47db" dependencies = [ - "pest", - "pest_generator", + "fixedbitset 0.4.2", + "indexmap 2.12.1", ] [[package]] -name = "pest_generator" -version = "2.8.5" +name = "petgraph" +version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3bb96d5051a78f44f43c8f712d8e810adb0ebf923fc9ed2655a7f66f63ba8ee5" +checksum = "3672b37090dbd86368a4145bc067582552b29c27377cad4e0a306c97f9bd7772" dependencies = [ - "pest", - "pest_meta", - "proc-macro2", - "quote", - "syn", + "fixedbitset 0.5.7", + "indexmap 2.12.1", ] [[package]] -name = "pest_meta" -version = "2.8.5" +name = "petgraph" +version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "602113b5b5e8621770cfd490cfd90b9f84ab29bd2b0e49ad83eb6d186cef2365" +checksum = "8701b58ea97060d5e5b155d383a69952a60943f0e6dfe30b04c287beb0b27455" dependencies = [ - "pest", - "sha2", + "fixedbitset 0.5.7", + "hashbrown 0.15.5", + "indexmap 2.12.1", + "serde", ] [[package]] -name = "petgraph" -version = "0.6.5" +name = "phf" +version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4c5cc86750666a3ed20bdaf5ca2a0344f9c67674cae0515bec2da16fbaa47db" +checksum = "913273894cec178f401a31ec4b656318d95473527be05c0752cc41cdc32be8b7" dependencies = [ - "fixedbitset 0.4.2", - "indexmap 2.12.1", + "phf_shared", ] [[package]] -name = "petgraph" -version = "0.7.1" +name = "phf_shared" +version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3672b37090dbd86368a4145bc067582552b29c27377cad4e0a306c97f9bd7772" +checksum = "06005508882fb681fd97892ecff4b7fd0fee13ef1aa569f8695dae7ab9099981" 
dependencies = [ - "fixedbitset 0.5.7", - "indexmap 2.12.1", + "siphasher", ] [[package]] @@ -2395,6 +3845,12 @@ version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" +[[package]] +name = "portable-atomic" +version = "1.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49" + [[package]] name = "postcard" version = "1.1.3" @@ -2428,7 +3884,7 @@ version = "0.2.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" dependencies = [ - "zerocopy", + "zerocopy 0.8.31", ] [[package]] @@ -2486,7 +3942,7 @@ version = "0.13.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "be769465445e8c1474e9c5dac2018218498557af32d9ed057325ec9a41ae81bf" dependencies = [ - "heck", + "heck 0.5.0", "itertools 0.14.0", "log", "multimap", @@ -2529,10 +3985,32 @@ dependencies = [ "env_logger", "log", "prost", + "serde", "tonic", "tonic-build", ] +[[package]] +name = "psm" +version = "0.1.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3852766467df634d74f0b2d7819bf8dc483a0eb2e3b0f50f756f9cfe8b0d18d8" +dependencies = [ + "ar_archive_writer", + "cc", +] + +[[package]] +name = "pulldown-cmark" +version = "0.9.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57206b407293d2bcd3af849ce869d52068623f19e1b5ff8e8778e3309439682b" +dependencies = [ + "bitflags 2.10.0", + "memchr", + "unicase", +] + [[package]] name = "pulley-interpreter" version = "41.0.3" @@ -2556,6 +4034,21 @@ dependencies = [ "syn", ] +[[package]] +name = "quanta" +version = "0.12.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3ab5a9d756f0d97bdc89019bd2e4ea098cf9cde50ee7564dde6b81ccc8f06c7" +dependencies = [ + 
"crossbeam-utils", + "libc", + "once_cell", + "raw-cpuid", + "wasi", + "web-sys", + "winapi", +] + [[package]] name = "quote" version = "1.0.42" @@ -2588,8 +4081,18 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" dependencies = [ "libc", - "rand_chacha", - "rand_core", + "rand_chacha 0.3.1", + "rand_core 0.6.4", +] + +[[package]] +name = "rand" +version = "0.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1" +dependencies = [ + "rand_chacha 0.9.0", + "rand_core 0.9.5", ] [[package]] @@ -2599,7 +4102,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" dependencies = [ "ppv-lite86", - "rand_core", + "rand_core 0.6.4", +] + +[[package]] +name = "rand_chacha" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" +dependencies = [ + "ppv-lite86", + "rand_core 0.9.5", ] [[package]] @@ -2611,13 +4124,31 @@ dependencies = [ "getrandom 0.2.16", ] +[[package]] +name = "rand_core" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76afc826de14238e6e8c374ddcc1fa19e374fd8dd986b0d2af0d02377261d83c" +dependencies = [ + "getrandom 0.3.4", +] + [[package]] name = "rand_xoshiro" version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6f97cdb2a36ed4183de61b2f824cc45c9f1037f28afe0a322e9fff4c108b5aaa" dependencies = [ - "rand_core", + "rand_core 0.6.4", +] + +[[package]] +name = "raw-cpuid" +version = "11.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "498cd0dc59d73224351ee52a95fee0f1a617a2eae0e7d9d720cc622c73a54186" +dependencies = [ + "bitflags 2.10.0", ] [[package]] 
@@ -2673,6 +4204,26 @@ dependencies = [ "sasl2-sys", ] +[[package]] +name = "recursive" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0786a43debb760f491b1bc0269fe5e84155353c67482b9e60d0cfb596054b43e" +dependencies = [ + "recursive-proc-macro-impl", + "stacker", +] + +[[package]] +name = "recursive-proc-macro-impl" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b" +dependencies = [ + "quote", + "syn", +] + [[package]] name = "redox_syscall" version = "0.5.18" @@ -2735,7 +4286,17 @@ dependencies = [ name = "regex-syntax" version = "0.8.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58" +checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58" + +[[package]] +name = "regress" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "82a9ecfa0cb04d0b04dddb99b8ccf4f66bc8dfd23df694b398570bd8ae3a50fb" +dependencies = [ + "hashbrown 0.13.2", + "memchr", +] [[package]] name = "rocksdb" @@ -2844,6 +4405,15 @@ version = "1.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a50f4cf475b65d88e057964e0e9bb1f0aa9bbb2036dc65c64596b42932536984" +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + [[package]] name = "sasl2-sys" version = "0.1.22+2.1.28" @@ -2856,6 +4426,30 @@ dependencies = [ "pkg-config", ] +[[package]] +name = "schemars" +version = "0.8.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fbf2ae1b8bc8e02df939598064d22402220cd5bbcca1c76f7d6a310974d5615" +dependencies = [ + "dyn-clone", + "schemars_derive", 
+ "serde", + "serde_json", +] + +[[package]] +name = "schemars_derive" +version = "0.8.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32e265784ad618884abaea0600a9adf15393368d840e0222d101a072f3f7534d" +dependencies = [ + "proc-macro2", + "quote", + "serde_derive_internals", + "syn", +] + [[package]] name = "scopeguard" version = "1.2.0" @@ -2872,6 +4466,12 @@ dependencies = [ "serde_core", ] +[[package]] +name = "seq-macro" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bc711410fbe7399f390ca1c3b60ad0f53f80e95c5eb935e52268a0e2cd49acc" + [[package]] name = "serde" version = "1.0.228" @@ -2902,6 +4502,17 @@ dependencies = [ "syn", ] +[[package]] +name = "serde_derive_internals" +version = "0.29.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "18d26a20a969b9e3fdf2fc2d9f21eda6c40e2de84c9408bb5d3b05d499aae711" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "serde_json" version = "1.0.148" @@ -2915,6 +4526,56 @@ dependencies = [ "zmij", ] +[[package]] +name = "serde_json_path" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b992cea3194eea663ba99a042d61cea4bd1872da37021af56f6a37e0359b9d33" +dependencies = [ + "inventory", + "nom", + "regex", + "serde", + "serde_json", + "serde_json_path_core", + "serde_json_path_macros", + "thiserror 2.0.17", +] + +[[package]] +name = "serde_json_path_core" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dde67d8dfe7d4967b5a95e247d4148368ddd1e753e500adb34b3ffe40c6bc1bc" +dependencies = [ + "inventory", + "serde", + "serde_json", + "thiserror 2.0.17", +] + +[[package]] +name = "serde_json_path_macros" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "517acfa7f77ddaf5c43d5f119c44a683774e130b4247b7d3210f8924506cfac8" +dependencies = [ + "inventory", 
+ "serde_json_path_core", + "serde_json_path_macros_internal", +] + +[[package]] +name = "serde_json_path_macros_internal" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aafbefbe175fa9bf03ca83ef89beecff7d2a95aaacd5732325b90ac8c3bd7b90" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "serde_spanned" version = "1.0.4" @@ -2924,6 +4585,18 @@ dependencies = [ "serde_core", ] +[[package]] +name = "serde_tokenstream" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7c49585c52c01f13c5c2ebb333f14f6885d76daa768d8a037d28017ec538c69" +dependencies = [ + "proc-macro2", + "quote", + "serde", + "syn", +] + [[package]] name = "serde_yaml" version = "0.9.34+deprecated" @@ -3005,6 +4678,24 @@ dependencies = [ "libc", ] +[[package]] +name = "simd-adler32" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e320a6c5ad31d271ad523dcf3ad13e2767ad8b1cb8f047f75a8aeaf8da139da2" + +[[package]] +name = "simdutf8" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3a9fe34e3e7a50316060351f37187a3f546bce95496156754b601a5fa71b76e" + +[[package]] +name = "siphasher" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2aa850e253778c88a04c3d7323b043aeda9d3e30d5971937c1855769763678e" + [[package]] name = "sized-chunks" version = "0.6.5" @@ -3015,6 +4706,21 @@ dependencies = [ "typenum", ] +[[package]] +name = "skeptic" +version = "0.13.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16d23b015676c90a0f01c197bfdc786c20342c73a0afdda9025adb0bc42940a8" +dependencies = [ + "bytecount", + "cargo_metadata", + "error-chain", + "glob", + "pulldown-cmark", + "tempfile", + "walkdir", +] + [[package]] name = "slab" version = "0.4.11" @@ -3030,6 +4736,12 @@ dependencies = [ "serde", ] +[[package]] +name = "snap" 
+version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b6b67fb9a61334225b5b790716f609cd58395f895b3fe8b328786812a40bc3b" + [[package]] name = "socket2" version = "0.5.10" @@ -3050,12 +4762,54 @@ dependencies = [ "windows-sys 0.60.2", ] +[[package]] +name = "spinning_top" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d96d2d1d716fb500937168cc09353ffdc7a012be8475ac7308e1bdf0e3923300" +dependencies = [ + "lock_api", +] + +[[package]] +name = "sqlparser" +version = "0.55.0" +source = "git+https://github.com/FunctionStream/sqlparser-rs?branch=0.6.0%2Ffunction-sql-parser#9783cf9e3e6b61c763f78bcdd460e85edec22250" +dependencies = [ + "log", + "recursive", + "sqlparser_derive", +] + +[[package]] +name = "sqlparser_derive" +version = "0.3.0" +source = "git+https://github.com/FunctionStream/sqlparser-rs?branch=0.6.0%2Ffunction-sql-parser#9783cf9e3e6b61c763f78bcdd460e85edec22250" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "stable_deref_trait" version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" +[[package]] +name = "stacker" +version = "0.1.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d74a23609d509411d10e2176dc2a4346e3b4aea2e7b1869f19fdedbc71c013" +dependencies = [ + "cc", + "cfg-if", + "libc", + "psm", + "windows-sys 0.59.0", +] + [[package]] name = "static_assertions" version = "1.1.0" @@ -3068,6 +4822,34 @@ version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" +[[package]] +name = "strum" +version = "0.26.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fec0f0aef304996cf250b31b5a10dee7980c85da9d759361292b8bca5a18f06" +dependencies = [ + "strum_macros", +] + 
+[[package]] +name = "strum_macros" +version = "0.26.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be" +dependencies = [ + "heck 0.5.0", + "proc-macro2", + "quote", + "rustversion", + "syn", +] + +[[package]] +name = "subtle" +version = "2.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" + [[package]] name = "syn" version = "2.0.113" @@ -3112,6 +4894,12 @@ dependencies = [ "winx", ] +[[package]] +name = "tagptr" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b2093cf4c8eb1e67749a6762251bc9cd836b6fc171623bd0a9d324d37af2417" + [[package]] name = "target-lexicon" version = "0.13.4" @@ -3189,6 +4977,17 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "thrift" +version = "0.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e54bc85fc7faa8bc175c4bab5b92ba8d9a3ce893d0e9f42cc455c8ab16a9e09" +dependencies = [ + "byteorder", + "integer-encoding", + "ordered-float", +] + [[package]] name = "time" version = "0.3.44" @@ -3397,7 +5196,7 @@ dependencies = [ "indexmap 1.9.3", "pin-project", "pin-project-lite", - "rand", + "rand 0.8.5", "slab", "tokio", "tokio-util", @@ -3519,12 +5318,24 @@ dependencies = [ "tracing-serde", ] +[[package]] +name = "triomphe" +version = "0.1.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dd69c5aa8f924c7519d6372789a74eac5b94fb0f8fcf0d4a97eb0bfc3e785f39" + [[package]] name = "try-lock" version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" +[[package]] +name = "twox-hash" +version = "2.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"9ea3136b675547379c4bd395ca6b938e5ad3c3d20fad76e7fe85f9e0d011419c" + [[package]] name = "typenum" version = "1.19.0" @@ -3532,10 +5343,51 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb" [[package]] -name = "ucd-trie" -version = "0.1.7" +name = "typify" +version = "0.0.13" +source = "git+https://github.com/ArroyoSystems/typify.git?branch=arroyo#d14b6fc016bf9d63618d8b43b4d74a648980737b" +dependencies = [ + "typify-impl", + "typify-macro", +] + +[[package]] +name = "typify-impl" +version = "0.0.13" +source = "git+https://github.com/ArroyoSystems/typify.git?branch=arroyo#d14b6fc016bf9d63618d8b43b4d74a648980737b" +dependencies = [ + "heck 0.4.1", + "log", + "proc-macro2", + "quote", + "regress", + "schemars", + "serde_json", + "syn", + "thiserror 1.0.69", + "unicode-ident", +] + +[[package]] +name = "typify-macro" +version = "0.0.13" +source = "git+https://github.com/ArroyoSystems/typify.git?branch=arroyo#d14b6fc016bf9d63618d8b43b4d74a648980737b" +dependencies = [ + "proc-macro2", + "quote", + "schemars", + "serde", + "serde_json", + "serde_tokenstream", + "syn", + "typify-impl", +] + +[[package]] +name = "unicase" +version = "2.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2896d95c02a80c6d6a5d6e953d479f5ddf2dfdb6a244441010e373ac0fb88971" +checksum = "dbc4bc3a9f746d862c45cb89d705aa10f187bb96c76001afab07a0d35ce60142" [[package]] name = "unicode-ident" @@ -3573,6 +5425,12 @@ version = "0.2.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "673aac59facbab8a9007c7f6108d11f63b603f7cabff99fabf650fea5c32b861" +[[package]] +name = "unty" +version = "0.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d49784317cd0d1ee7ec5c716dd598ec5b4483ea832a2dced265471cc0f690ae" + [[package]] name = "url" version = "2.5.7" @@ -3626,6 +5484,22 @@ version = "0.9.5" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" +[[package]] +name = "virtue" +version = "0.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "051eb1abcf10076295e815102942cc58f9d5e3b4560e46e53c21e8ff6f3af7b1" + +[[package]] +name = "walkdir" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" +dependencies = [ + "same-file", + "winapi-util", +] + [[package]] name = "want" version = "0.3.1" @@ -3663,6 +5537,19 @@ dependencies = [ "wasm-bindgen-shared", ] +[[package]] +name = "wasm-bindgen-futures" +version = "0.4.56" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "836d9622d604feee9e5de25ac10e3ea5f2d65b41eac0d9ce72eb5deae707ce7c" +dependencies = [ + "cfg-if", + "js-sys", + "once_cell", + "wasm-bindgen", + "web-sys", +] + [[package]] name = "wasm-bindgen-macro" version = "0.2.106" @@ -3702,7 +5589,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "af801b6f36459023eaec63fdbaedad2fd5a4ab7dc74ecc110a8b5d375c5775e4" dependencies = [ "anyhow", - "heck", + "heck 0.5.0", "im-rc", "indexmap 2.12.1", "log", @@ -4005,7 +5892,7 @@ checksum = "87acbd416227cdd279565ba49e57cf7f08d112657c3b3f39b70250acdfd094fe" dependencies = [ "anyhow", "bitflags 2.10.0", - "heck", + "heck 0.5.0", "indexmap 2.12.1", "wit-parser", ] @@ -4085,6 +5972,26 @@ dependencies = [ "wast 243.0.0", ] +[[package]] +name = "web-sys" +version = "0.3.83" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b32828d774c412041098d182a8b38b16ea816958e07cf40eec2bc080ae137ac" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "web-time" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + [[package]] name = "wiggle" version = "41.0.3" @@ -4106,7 +6013,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "57f773d51c1696bd7d028aa35c884d9fc58f48d79a1176dfbad6c908de314235" dependencies = [ "anyhow", - "heck", + "heck 0.5.0", "proc-macro2", "quote", "syn", @@ -4461,6 +6368,21 @@ version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9" +[[package]] +name = "xxhash-rust" +version = "0.8.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fdd20c5420375476fbd4394763288da7eb0cc0b8c11deed431a91562af7335d3" + +[[package]] +name = "xz2" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "388c44dc09d76f1536602ead6d325eb532f5c122f17782bd57fb47baeeb767e2" +dependencies = [ + "lzma-sys", +] + [[package]] name = "yoke" version = "0.8.1" @@ -4484,13 +6406,33 @@ dependencies = [ "synstructure", ] +[[package]] +name = "zerocopy" +version = "0.7.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0" +dependencies = [ + "zerocopy-derive 0.7.35", +] + [[package]] name = "zerocopy" version = "0.8.31" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fd74ec98b9250adb3ca554bdde269adf631549f51d8a8f8f0a10b50f1cb298c3" dependencies = [ - "zerocopy-derive", + "zerocopy-derive 0.8.31", +] + +[[package]] +name = "zerocopy-derive" +version = "0.7.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" +dependencies = [ + "proc-macro2", + "quote", + "syn", ] [[package]] @@ -4558,6 +6500,12 @@ dependencies = [ "syn", ] +[[package]] +name = "zlib-rs" 
+version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3be3d40e40a133f9c916ee3f9f4fa2d9d63435b5fbe1bfc6d9dae0aa0ada1513" + [[package]] name = "zmij" version = "1.0.10" diff --git a/Cargo.toml b/Cargo.toml index 4b855aa9..7c49d04c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -35,25 +35,57 @@ tonic = { version = "0.12", features = ["default"] } async-trait = "0.1" num_cpus = "1.0" protocol = { path = "./protocol" } +prost = "0.13" rdkafka = { version = "0.38", features = ["cmake-build", "ssl", "gssapi"] } crossbeam-channel = "0.5" -pest = "2.7" -pest_derive = "2.7" -clap = { version = "4.5", features = ["derive"] } wasmtime = { version = "41.0.3", features = ["component-model", "async"] } base64 = "0.22" wasmtime-wasi = "41.0.3" rocksdb = { version = "0.21", features = ["multi-threaded-cf", "lz4"] } -bincode = "1.3" +bincode = { version = "2", features = ["serde"] } +chrono = "0.4" tokio-stream = "0.1.18" lru = "0.12" parking_lot = "0.12" -arrow-array = "52" -arrow-ipc = "52" -arrow-schema = "52" +arrow = { version = "55", default-features = false } +arrow-array = "55" +arrow-ipc = "55" +arrow-schema = { version = "55", features = ["serde"] } +futures = "0.3" +serde_json_path = "0.7" +xxhash-rust = { version = "0.8", features = ["xxh3"] } proctitle = "0.1" +unicase = "2.7" +petgraph = "0.7" +rand = { version = "0.8", features = ["small_rng"] } +itertools = "0.14" +strum = { version = "0.26", features = ["derive"] } + +typify = { git = 'https://github.com/ArroyoSystems/typify.git', branch = 'arroyo' } +parquet = {git = 'https://github.com/ArroyoSystems/arrow-rs', branch = '55.2.0/parquet'} +arrow-json = {git = 'https://github.com/ArroyoSystems/arrow-rs', branch = '55.2.0/json'} +datafusion = {git = 'https://github.com/ArroyoSystems/arrow-datafusion', branch = '48.0.1/arroyo'} +datafusion-common = {git = 'https://github.com/ArroyoSystems/arrow-datafusion', branch = '48.0.1/arroyo'} +datafusion-execution = {git = 
'https://github.com/ArroyoSystems/arrow-datafusion', branch = '48.0.1/arroyo'} +datafusion-expr = {git = 'https://github.com/ArroyoSystems/arrow-datafusion', branch = '48.0.1/arroyo'} +datafusion-physical-expr = {git = 'https://github.com/ArroyoSystems/arrow-datafusion', branch = '48.0.1/arroyo'} +datafusion-physical-plan = {git = 'https://github.com/ArroyoSystems/arrow-datafusion', branch = '48.0.1/arroyo'} +datafusion-proto = {git = 'https://github.com/ArroyoSystems/arrow-datafusion', branch = '48.0.1/arroyo'} +datafusion-functions = {git = 'https://github.com/ArroyoSystems/arrow-datafusion', branch = '48.0.1/arroyo'} +datafusion-functions-window = {git = 'https://github.com/ArroyoSystems/arrow-datafusion', branch = '48.0.1/arroyo'} + +sqlparser = { git = "https://github.com/FunctionStream/sqlparser-rs", branch = "0.6.0/function-sql-parser" } + +ahash = "0.8" +governor = "0.8.0" +mini-moka = "0.10" +sha2 = "0.10" +hex = "0.4" [features] default = ["incremental-cache", "python"] incremental-cache = ["wasmtime/incremental-cache"] python = [] + +[patch."https://github.com/ArroyoSystems/sqlparser-rs"] +sqlparser = { git = "https://github.com/FunctionStream/sqlparser-rs", branch = "0.6.0/function-sql-parser" } diff --git a/README-zh.md b/README-zh.md index b1d68eac..a15bfcc5 100644 --- a/README-zh.md +++ b/README-zh.md @@ -23,7 +23,7 @@ [中文](README-zh.md) | [English](README.md) -**Function Stream** 是一个基于 Rust 构建的高性能、事件驱动的流处理框架。它提供了一个模块化的运行时,用于编排编译为 **WebAssembly (WASM)** 的 Serverless 风格处理函数,支持使用 **Go、Python 和 Rust** 编写函数。 +**Function Stream** 是一个基于 Rust 构建的高性能、事件驱动的流处理框架。它提供了一个模块化的运行时,用于编排编译为 **WebAssembly (WASM)** 的 Serverless 风格处理函数,支持使用 **Go、Python 和 Rust** 编写函数。同时内置 **Streaming SQL** 引擎,可通过纯声明式 SQL 构建实时数据管道 — 包括时间窗口聚合、多流关联和持续 ETL。 ## 目录 @@ -46,6 +46,7 @@ ## 核心特性 +- **Streaming SQL 引擎**:使用纯 SQL 构建实时管道 — 注册数据源(`CREATE TABLE`)、启动持续计算(`CREATE STREAMING TABLE ... 
AS SELECT`)、管理生命周期(`SHOW` / `DROP`)。支持滚动窗口、滑动窗口、窗口关联等丰富语义。 - **事件驱动的 WASM 运行时**:以接近原生的性能和沙箱隔离的方式执行多语言函数(Go、Python、Rust)。 - **持久化状态管理**:内置支持基于 RocksDB 的状态存储,用于有状态流处理。 - **SQL 驱动的 CLI**:使用类 SQL 命令进行作业管理和流检测的交互式 REPL。 @@ -200,14 +201,16 @@ function-stream-/ ## 文档 -| 文档 | 描述 | -|------------------------------------------------------|---------------| -| [服务端配置与运维指南](docs/server-configuration-zh.md) | 服务端配置与运维操作 | -| [Function 任务配置规范](docs/function-configuration-zh.md) | 任务定义规范 | -| [SQL CLI 交互式管理指南](docs/sql-cli-guide-zh.md) | 交互式管理指南 | -| [Function 管理与开发指南](docs/function-development-zh.md) | 管理与开发指南 | -| [Go SDK 开发与交互指南](docs/Go-SDK/go-sdk-guide-zh.md) | Go SDK 指南 | -| [Python SDK 开发与交互指南](docs/Python-SDK/python-sdk-guide-zh.md) | Python SDK 指南 | +| 文档 | 描述 | +|------------------------------------------------------------------------|--------------------------| +| [Streaming SQL 使用指南](docs/streaming-sql-guide-zh.md) | 声明式 SQL 实时流处理指南 | +| [连接器、格式与类型参考](docs/connectors-and-formats-zh.md) | 支持的 Source/Sink、格式与数据类型 | +| [服务端配置与运维指南](docs/server-configuration-zh.md) | 服务端配置与运维操作 | +| [Function 任务配置规范](docs/function-configuration-zh.md) | 任务定义规范 | +| [SQL CLI 交互式管理指南](docs/sql-cli-guide-zh.md) | 交互式管理指南 | +| [Function 管理与开发指南](docs/function-development-zh.md) | 管理与开发指南 | +| [Go SDK 开发与交互指南](docs/Go-SDK/go-sdk-guide-zh.md) | Go SDK 指南 | +| [Python SDK 开发与交互指南](docs/Python-SDK/python-sdk-guide-zh.md) | Python SDK 指南 | ## 配置 diff --git a/README.md b/README.md index 51a69de1..f74bee33 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,7 @@ [中文](README-zh.md) | [English](README.md) -**Function Stream** is a high-performance, event-driven stream processing framework built in Rust. It provides a modular runtime to orchestrate serverless-style processing functions compiled to **WebAssembly (WASM)**, supporting functions written in **Go, Python, and Rust**. +**Function Stream** is a high-performance, event-driven stream processing framework built in Rust. 
It provides a modular runtime to orchestrate serverless-style processing functions compiled to **WebAssembly (WASM)**, supporting functions written in **Go, Python, and Rust**. It also features a **Streaming SQL** engine that lets you build real-time data pipelines — including time-windowed aggregations, multi-stream joins, and continuous ETL — using pure declarative SQL. ## Table of Contents @@ -46,6 +46,7 @@ ## Key Features +* **Streaming SQL Engine**: Build real-time pipelines with pure SQL — register sources (`CREATE TABLE`), launch continuous computations (`CREATE STREAMING TABLE ... AS SELECT`), and manage lifecycle (`SHOW` / `DROP`). Supports tumbling windows, hopping windows, window joins, and more. * **Event-Driven WASM Runtime**: Executes polyglot functions (Go, Python, Rust) with near-native performance and sandboxed isolation. * **Durable State Management**: Built-in support for RocksDB-backed state stores for stateful stream processing. * **SQL-Powered CLI**: Interactive REPL for job management and stream inspection using SQL-like commands. 
@@ -199,14 +200,16 @@ We provide a robust shell script to manage the server process, capable of handli ## Documentation -| Document | Description | -|----------------------------------------------------------|-----------------------------------| -| [Server Configuration](docs/server-configuration.md) | Server Configuration & Operations | -| [Function Configuration](docs/function-configuration.md) | Task Definition Specification | -| [SQL CLI Guide](docs/sql-cli-guide.md) | Interactive Management Guide | -| [Function Development](docs/function-development.md) | Management & Development Guide | -| [Go SDK Guide](docs/Go-SDK/go-sdk-guide.md) | Go SDK Guide | -| [Python SDK Guide](docs/Python-SDK/python-sdk-guide.md) | Python SDK Guide | +| Document | Description | +|----------------------------------------------------------------|-------------------------------------------------| +| [Streaming SQL Guide](docs/streaming-sql-guide.md) | Declarative SQL for Real-Time Stream Processing | +| [Connectors, Formats & Types](docs/connectors-and-formats.md) | Supported Sources, Sinks, Formats & Data Types | +| [Server Configuration](docs/server-configuration.md) | Server Configuration & Operations | +| [Function Configuration](docs/function-configuration.md) | Task Definition Specification | +| [SQL CLI Guide](docs/sql-cli-guide.md) | Interactive Management Guide | +| [Function Development](docs/function-development.md) | Management & Development Guide | +| [Go SDK Guide](docs/Go-SDK/go-sdk-guide.md) | Go SDK Guide | +| [Python SDK Guide](docs/Python-SDK/python-sdk-guide.md) | Python SDK Guide | ## Configuration diff --git a/cli/cli/Cargo.toml b/cli/cli/Cargo.toml index 72352995..3c05d6b4 100644 --- a/cli/cli/Cargo.toml +++ b/cli/cli/Cargo.toml @@ -12,10 +12,8 @@ arrow-array = "52" arrow-ipc = "52" arrow-schema = "52" comfy-table = "7" -function-stream = { path = "../../" } protocol = { path = "../../protocol" } clap = { version = "4.5", features = ["derive"] } -thiserror = "2" 
tokio = { version = "1.0", features = ["full", "signal"] } tonic = { version = "0.12", features = ["default"] } rustyline = { version = "14.0", features = ["with-dirs"] } diff --git a/cli/cli/src/repl.rs b/cli/cli/src/repl.rs index 7f8087b3..8c3882b2 100644 --- a/cli/cli/src/repl.rs +++ b/cli/cli/src/repl.rs @@ -20,26 +20,62 @@ use comfy_table::{Attribute, Cell, Color, ContentArrangement, Table, TableCompon use protocol::cli::{function_stream_service_client::FunctionStreamServiceClient, SqlRequest}; use rustyline::error::ReadlineError; use rustyline::{Config, DefaultEditor, EditMode}; +use std::fmt; use std::io::{self, Cursor, Write}; use std::sync::Arc; use tokio::sync::Mutex; use tonic::Request; -#[derive(Debug, thiserror::Error)] +/// CLI errors. +/// +/// **Important:** [`tonic::Status`] must not be formatted with `{}` — its [`fmt::Display`] dumps +/// `details` / `metadata` (e.g. HTTP headers). Only [`tonic::Status::message`] is stored in +/// [`ReplError::Rpc`]. +#[derive(Debug)] pub enum ReplError { - #[error("RPC error: {0}")] - Rpc(Box<tonic::Status>), - #[error("Connection failed: {0}")] + Rpc(String), Connection(String), - #[error("Internal error: {0}")] Internal(String), - #[error("IO error: {0}")] - Io(#[from] io::Error), + Io(io::Error), +} + +impl fmt::Display for ReplError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + ReplError::Rpc(s) => f.write_str(s), + ReplError::Connection(s) => f.write_str(s), + ReplError::Internal(s) => write!(f, "Internal error: {s}"), + ReplError::Io(e) => write!(f, "IO error: {e}"), + } + } +} + +impl std::error::Error for ReplError { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + match self { + ReplError::Io(e) => Some(e), + _ => None, + } + } +} + +impl From<io::Error> for ReplError { + fn from(e: io::Error) -> Self { + ReplError::Io(e) + } } impl From<tonic::Status> for ReplError { fn from(s: tonic::Status) -> Self { - ReplError::Rpc(Box::new(s)) + let msg = s.message(); + if msg.is_empty() { +
ReplError::Rpc(format!( + "gRPC {} (server returned no message)", + s.code() + )) + } else { + ReplError::Rpc(msg.to_string()) + } } } diff --git a/conf/config.yaml b/conf/config.yaml index 3f19493d..9d0f625e 100644 --- a/conf/config.yaml +++ b/conf/config.yaml @@ -117,3 +117,10 @@ task_storage: # Maximum bytes for level base in bytes (optional) max_bytes_for_level_base: 268435456 + +# Stream table catalog (SQL: CREATE TABLE connector sources, SHOW TABLES, SHOW CREATE TABLE). +# When persist is true (default), metadata is stored under RocksDB at db_path (default: data/stream_catalog) +# and reloaded after process restart. Set persist: false only for tests/ephemeral nodes. +stream_catalog: + persist: true + # db_path: data/stream_catalog diff --git a/docs/connectors-and-formats-zh.md b/docs/connectors-and-formats-zh.md new file mode 100644 index 00000000..8f25a7dc --- /dev/null +++ b/docs/connectors-and-formats-zh.md @@ -0,0 +1,197 @@ + + +# 连接器、数据格式与 SQL 类型参考 + +[中文](connectors-and-formats-zh.md) | [English](connectors-and-formats.md) + +本文档是 Function Stream Streaming SQL 引擎所支持的连接器(Source / Sink)、序列化格式以及 SQL 数据类型的权威参考。 + +--- + +## 目录 + +- [1. 连接器 (Connector)](#1-连接器-connector) + - [1.1 Kafka Source(数据源)](#11-kafka-source数据源) + - [1.2 Kafka Sink(数据汇)](#12-kafka-sink数据汇) +- [2. 数据格式 (Format)](#2-数据格式-format) +- [3. SQL 数据类型](#3-sql-数据类型) +- [4. 完整示例](#4-完整示例) + +--- + +## 1. 
连接器 (Connector) + +当前 Function Stream 支持 **Kafka** 作为生产可用的连接器,同时可作为数据源(Source)和数据汇(Sink)。 + +### 1.1 Kafka Source(数据源) + +Kafka Source 从一个或多个 Kafka Topic 分区读取消息。在 `CREATE TABLE` 中使用以注册输入流。 + +**必填属性:** + +| 属性 | 说明 | 示例 | +|------|------|------| +| `connector` | 必须为 `kafka`。 | `'kafka'` | +| `topic` | 要消费的 Kafka Topic。 | `'raw_events'` | +| `format` | 消息的序列化格式。 | `'json'` | +| `bootstrap.servers` | Kafka Broker 地址列表,逗号分隔。 | `'broker1:9092,broker2:9092'` | + +**示例:** + +```sql +CREATE TABLE page_views ( + user_id VARCHAR, + page_url VARCHAR, + view_time TIMESTAMP NOT NULL, + WATERMARK FOR view_time AS view_time - INTERVAL '3' SECOND +) WITH ( + 'connector' = 'kafka', + 'topic' = 'page_views', + 'format' = 'json', + 'bootstrap.servers' = 'localhost:9092' +); +``` + +### 1.2 Kafka Sink(数据汇) + +Kafka Sink 将计算结果写入 Kafka Topic。在 `CREATE STREAMING TABLE` 的 `WITH` 子句中配置。 + +**必填属性:** + +| 属性 | 说明 | 示例 | +|------|------|------| +| `connector` | 必须为 `kafka`。 | `'kafka'` | +| `topic` | 要写入的 Kafka Topic。 | `'sink_results'` | +| `format` | 输出消息的序列化格式。 | `'json'` | +| `bootstrap.servers` | Kafka Broker 地址列表。 | `'broker1:9092'` | + +**示例:** + +```sql +CREATE STREAMING TABLE enriched_clicks WITH ( + 'connector' = 'kafka', + 'topic' = 'enriched_clicks', + 'format' = 'json', + 'bootstrap.servers' = 'localhost:9092' +) AS +SELECT click_id, user_id, click_time +FROM ad_clicks; +``` + +--- + +## 2. 数据格式 (Format) + +当前唯一支持的序列化格式是 **JSON**。每条 Kafka 消息应为一个自描述的 JSON 对象,其字段直接映射到 `CREATE TABLE` 中定义的列。 + +在 `WITH` 子句中设置 `'format' = 'json'`(省略时也默认为 JSON)。 + +--- + +## 3. 
SQL 数据类型 + +以下是 `CREATE TABLE` 列定义中支持的 SQL 数据类型: + +### 数值类型 + +| SQL 类型 | 别名 | Arrow 类型 | 说明 | +|----------|------|-----------|------| +| `BOOLEAN` | `BOOL` | Boolean | 布尔值。 | +| `TINYINT` | — | Int8 | 8 位有符号整数。 | +| `SMALLINT` | `INT2` | Int16 | 16 位有符号整数。 | +| `INT` | `INTEGER`、`INT4` | Int32 | 32 位有符号整数。 | +| `BIGINT` | `INT8` | Int64 | 64 位有符号整数。 | +| `TINYINT UNSIGNED` | — | UInt8 | 8 位无符号整数。 | +| `SMALLINT UNSIGNED` | `INT2 UNSIGNED` | UInt16 | 16 位无符号整数。 | +| `INT UNSIGNED` | `INT4 UNSIGNED` | UInt32 | 32 位无符号整数。 | +| `BIGINT UNSIGNED` | `INT8 UNSIGNED` | UInt64 | 64 位无符号整数。 | +| `FLOAT` | `REAL`、`FLOAT4` | Float32 | 32 位 IEEE 754 浮点数。 | +| `DOUBLE` | `DOUBLE PRECISION`、`FLOAT8` | Float64 | 64 位 IEEE 754 浮点数。 | +| `DECIMAL(p, s)` | `NUMERIC(p, s)` | Decimal128 | 定点小数。精度 1–38,标度 <= 精度。 | + +### 字符串与二进制类型 + +| SQL 类型 | 别名 | Arrow 类型 | 说明 | +|----------|------|-----------|------| +| `VARCHAR` | `TEXT`、`STRING`、`CHAR` | Utf8 | 可变长度 UTF-8 字符串。 | +| `BYTEA` | — | Binary | 可变长度字节数组。 | +| `JSON` | — | Utf8(JSON 扩展) | 带有 FunctionStream 扩展元数据的 JSON 类型字符串。 | + +### 日期与时间类型 + +| SQL 类型 | Arrow 类型 | 说明 | +|----------|-----------|------| +| `TIMESTAMP` | Timestamp(Nanosecond) | 不含时区的日期时间(纳秒精度)。 | +| `TIMESTAMP(0)` | Timestamp(Second) | 秒精度。 | +| `TIMESTAMP(3)` | Timestamp(Millisecond) | 毫秒精度。 | +| `TIMESTAMP(6)` | Timestamp(Microsecond) | 微秒精度。 | +| `TIMESTAMP(9)` | Timestamp(Nanosecond) | 纳秒精度(与 `TIMESTAMP` 相同)。 | +| `DATE` | Date32 | 日历日期(年、月、日)。 | +| `DATETIME` | Timestamp(Nanosecond) | `TIMESTAMP` 的别名。 | +| `TIME` | Time64(Nanosecond) | 不含时区的时刻。 | +| `INTERVAL` | Interval(MonthDayNano) | 时间间隔 / 持续时间。 | + +### 复合类型 + +| SQL 类型 | Arrow 类型 | 说明 | +|----------|-----------|------| +| `STRUCT` | Struct | 命名组合字段。 | +| `ARRAY` | List | 相同类型元素的有序列表。也支持 `element_type[]` 语法。 | + +--- + +## 4. 
完整示例 + +以下是一个结合 Kafka Source、Kafka Sink、JSON 格式和多种 SQL 数据类型的完整示例: + +```sql +-- Source:从 Kafka 读取用户活动事件 +CREATE TABLE user_activity ( + event_id VARCHAR, + user_id BIGINT, + action VARCHAR, + amount DECIMAL(10, 2), + tags ARRAY, + event_time TIMESTAMP NOT NULL, + WATERMARK FOR event_time AS event_time - INTERVAL '5' SECOND +) WITH ( + 'connector' = 'kafka', + 'topic' = 'user_activity', + 'format' = 'json', + 'bootstrap.servers' = 'localhost:9092' +); + +-- Sink:1 分钟滚动窗口聚合 +CREATE STREAMING TABLE activity_stats_1m WITH ( + 'connector' = 'kafka', + 'topic' = 'activity_stats_1m', + 'format' = 'json', + 'bootstrap.servers' = 'localhost:9092' +) AS +SELECT + TUMBLE(INTERVAL '1' MINUTE) AS time_window, + action, + COUNT(*) AS event_count, + SUM(amount) AS total_amount +FROM user_activity +GROUP BY 1, action; +``` diff --git a/docs/connectors-and-formats.md b/docs/connectors-and-formats.md new file mode 100644 index 00000000..46d0d964 --- /dev/null +++ b/docs/connectors-and-formats.md @@ -0,0 +1,197 @@ + + +# Connectors, Formats & Data Types + +[中文](connectors-and-formats-zh.md) | [English](connectors-and-formats.md) + +This document is the authoritative reference for connectors (sources & sinks), serialization formats, and SQL data types supported by Function Stream's Streaming SQL engine. + +--- + +## Table of Contents + +- [1. Connectors](#1-connectors) + - [1.1 Kafka (Source)](#11-kafka-source) + - [1.2 Kafka (Sink)](#12-kafka-sink) +- [2. Data Format](#2-data-format) +- [3. SQL Data Types](#3-sql-data-types) +- [4. Full Example](#4-full-example) + +--- + +## 1. Connectors + +Currently Function Stream supports **Kafka** as the production-ready connector for both source (ingestion) and sink (egress). + +### 1.1 Kafka (Source) + +A Kafka source reads records from one or more Kafka topic partitions. Use it in `CREATE TABLE` to register an input stream. 
+ +**Required Properties:** + +| Property | Description | Example | +|----------|-------------|---------| +| `connector` | Must be `kafka`. | `'kafka'` | +| `topic` | Kafka topic to consume from. | `'raw_events'` | +| `format` | Serialization format of messages. | `'json'` | +| `bootstrap.servers` | Comma-separated list of Kafka broker addresses. | `'broker1:9092,broker2:9092'` | + +**Example:** + +```sql +CREATE TABLE page_views ( + user_id VARCHAR, + page_url VARCHAR, + view_time TIMESTAMP NOT NULL, + WATERMARK FOR view_time AS view_time - INTERVAL '3' SECOND +) WITH ( + 'connector' = 'kafka', + 'topic' = 'page_views', + 'format' = 'json', + 'bootstrap.servers' = 'localhost:9092' +); +``` + +### 1.2 Kafka (Sink) + +A Kafka sink writes records into a Kafka topic. It is configured in the `WITH` clause of a `CREATE STREAMING TABLE` statement. + +**Required Properties:** + +| Property | Description | Example | +|----------|-------------|---------| +| `connector` | Must be `kafka`. | `'kafka'` | +| `topic` | Kafka topic to write to. | `'sink_results'` | +| `format` | Serialization format of output messages. | `'json'` | +| `bootstrap.servers` | Comma-separated Kafka broker addresses. | `'broker1:9092'` | + +**Example:** + +```sql +CREATE STREAMING TABLE enriched_clicks WITH ( + 'connector' = 'kafka', + 'topic' = 'enriched_clicks', + 'format' = 'json', + 'bootstrap.servers' = 'localhost:9092' +) AS +SELECT click_id, user_id, click_time +FROM ad_clicks; +``` + +--- + +## 2. Data Format + +Currently the only supported serialization format is **JSON**. Each Kafka message is expected to be a self-describing JSON object whose fields map directly to the columns defined in `CREATE TABLE`. + +Set `'format' = 'json'` in the `WITH` clause (this is also the default when omitted). + +--- + +## 3. 
SQL Data Types + +The following SQL data types are supported in `CREATE TABLE` column definitions: + +### Numeric Types + +| SQL Type | Aliases | Arrow Type | Description | +|----------|---------|------------|-------------| +| `BOOLEAN` | `BOOL` | Boolean | True / false. | +| `TINYINT` | — | Int8 | 8-bit signed integer. | +| `SMALLINT` | `INT2` | Int16 | 16-bit signed integer. | +| `INT` | `INTEGER`, `INT4` | Int32 | 32-bit signed integer. | +| `BIGINT` | `INT8` | Int64 | 64-bit signed integer. | +| `TINYINT UNSIGNED` | — | UInt8 | 8-bit unsigned integer. | +| `SMALLINT UNSIGNED` | `INT2 UNSIGNED` | UInt16 | 16-bit unsigned integer. | +| `INT UNSIGNED` | `INT4 UNSIGNED` | UInt32 | 32-bit unsigned integer. | +| `BIGINT UNSIGNED` | `INT8 UNSIGNED` | UInt64 | 64-bit unsigned integer. | +| `FLOAT` | `REAL`, `FLOAT4` | Float32 | 32-bit IEEE 754 floating point. | +| `DOUBLE` | `DOUBLE PRECISION`, `FLOAT8` | Float64 | 64-bit IEEE 754 floating point. | +| `DECIMAL(p, s)` | `NUMERIC(p, s)` | Decimal128 | Fixed-point decimal. Precision 1–38, scale <= precision. | + +### String & Binary Types + +| SQL Type | Aliases | Arrow Type | Description | +|----------|---------|------------|-------------| +| `VARCHAR` | `TEXT`, `STRING`, `CHAR` | Utf8 | Variable-length UTF-8 string. | +| `BYTEA` | — | Binary | Variable-length byte array. | +| `JSON` | — | Utf8 (JSON extension) | JSON-typed string with FunctionStream extension metadata. | + +### Date & Time Types + +| SQL Type | Arrow Type | Description | +|----------|------------|-------------| +| `TIMESTAMP` | Timestamp(Nanosecond) | Date and time without timezone (nanosecond precision). | +| `TIMESTAMP(0)` | Timestamp(Second) | Second precision. | +| `TIMESTAMP(3)` | Timestamp(Millisecond) | Millisecond precision. | +| `TIMESTAMP(6)` | Timestamp(Microsecond) | Microsecond precision. | +| `TIMESTAMP(9)` | Timestamp(Nanosecond) | Nanosecond precision (same as `TIMESTAMP`). | +| `DATE` | Date32 | Calendar date (year, month, day). 
| +| `DATETIME` | Timestamp(Nanosecond) | Alias for `TIMESTAMP`. | +| `TIME` | Time64(Nanosecond) | Time of day without timezone. | +| `INTERVAL` | Interval(MonthDayNano) | Time duration / interval. | + +### Composite Types + +| SQL Type | Arrow Type | Description | +|----------|------------|-------------| +| `STRUCT` | Struct | Named composite fields. | +| `ARRAY` | List | Ordered list of elements of the same type. Also supports `element_type[]` syntax. | + +--- + +## 4. Full Example + +Below is a complete example combining a Kafka source, a Kafka sink, JSON format, and various SQL data types: + +```sql +-- Source: user activity events from Kafka +CREATE TABLE user_activity ( + event_id VARCHAR, + user_id BIGINT, + action VARCHAR, + amount DECIMAL(10, 2), + tags ARRAY, + event_time TIMESTAMP NOT NULL, + WATERMARK FOR event_time AS event_time - INTERVAL '5' SECOND +) WITH ( + 'connector' = 'kafka', + 'topic' = 'user_activity', + 'format' = 'json', + 'bootstrap.servers' = 'localhost:9092' +); + +-- Sink: 1-minute tumbling window aggregation +CREATE STREAMING TABLE activity_stats_1m WITH ( + 'connector' = 'kafka', + 'topic' = 'activity_stats_1m', + 'format' = 'json', + 'bootstrap.servers' = 'localhost:9092' +) AS +SELECT + TUMBLE(INTERVAL '1' MINUTE) AS time_window, + action, + COUNT(*) AS event_count, + SUM(amount) AS total_amount +FROM user_activity +GROUP BY 1, action; +``` diff --git a/docs/sql-cli-guide-zh.md b/docs/sql-cli-guide-zh.md index 8352dea1..bff05932 100644 --- a/docs/sql-cli-guide-zh.md +++ b/docs/sql-cli-guide-zh.md @@ -129,7 +129,69 @@ DROP FUNCTION go_processor_demo; --- -## 三、REPL 内建辅助指令 +## 三、Streaming SQL:TABLE 与 STREAMING TABLE + +除了 Function 管理之外,CLI 还支持一整套 **Streaming SQL** 命令,用于声明数据源和构建实时管道。完整示例请参阅 [Streaming SQL 使用指南](streaming-sql-guide-zh.md)。 + +### 3.1 注册数据源:CREATE TABLE + +声明外部数据源(如 Kafka),包含 Schema、事件时间和水位线策略。此操作仅创建**静态目录条目**,不消耗计算资源。 + +```sql +CREATE TABLE ad_impressions ( + impression_id VARCHAR, + ad_id BIGINT, + campaign_id 
BIGINT, + user_id VARCHAR, + impression_time TIMESTAMP NOT NULL, + WATERMARK FOR impression_time AS impression_time - INTERVAL '2' SECOND +) WITH ( + 'connector' = 'kafka', + 'topic' = 'raw_ad_impressions', + 'format' = 'json', + 'bootstrap.servers' = 'localhost:9092' +); +``` + +### 3.2 创建流计算管道:CREATE STREAMING TABLE + +使用 CTAS 语法启动持续运行的分布式计算管道。结果以纯追加模式写入目标连接器。 + +```sql +CREATE STREAMING TABLE metric_tumble_impressions_1m WITH ( + 'connector' = 'kafka', + 'topic' = 'sink_impressions_1m', + 'format' = 'json', + 'bootstrap.servers' = 'localhost:9092' +) AS +SELECT + TUMBLE(INTERVAL '1' MINUTE) AS time_window, + campaign_id, + COUNT(*) AS total_impressions +FROM ad_impressions +GROUP BY 1, campaign_id; +``` + +### 3.3 查看与监控 + +| 命令 | 说明 | +|------|------| +| `SHOW TABLES` | 列出所有已注册的数据源表。 | +| `SHOW CREATE TABLE ` | 显示某张表的建表 DDL。 | +| `SHOW STREAMING TABLES` | 列出所有正在运行的流计算管道及其状态。 | +| `SHOW CREATE STREAMING TABLE ` | 查看某条管道的物理执行拓扑图(ASCII 格式)。 | + +### 3.4 销毁流计算管道:DROP STREAMING TABLE + +停止并释放某条流计算管道的所有资源: + +```sql +DROP STREAMING TABLE metric_tumble_impressions_1m; +``` + +--- + +## 四、REPL 内建辅助指令 在 `function-stream>` 提示符下,支持以下便捷指令: @@ -141,7 +203,7 @@ DROP FUNCTION go_processor_demo; --- -## 四、技术约束与注意事项 +## 五、技术约束与注意事项 - **路径隔离**:SQL CLI 本身不负责上传文件。function_path 指向的文件必须预先存在于**服务端机器**的磁盘上。若需远程上传打包,请使用 Python SDK。 - **Python 函数限制**:由于 Python 函数涉及动态依赖分析与代码打包,目前**不支持**通过 SQL CLI 创建,仅能通过 CLI 进行 START / STOP / SHOW 等生命周期管理。 diff --git a/docs/sql-cli-guide.md b/docs/sql-cli-guide.md index be42a37e..a7f36a88 100644 --- a/docs/sql-cli-guide.md +++ b/docs/sql-cli-guide.md @@ -129,7 +129,69 @@ DROP FUNCTION go_processor_demo; --- -## 3. REPL Built-in Auxiliary Commands +## 3. Streaming SQL: TABLE & STREAMING TABLE + +In addition to Function management, the CLI supports a full set of **Streaming SQL** commands for declaring data sources and building real-time pipelines. For a comprehensive guide with examples, see [Streaming SQL Guide](streaming-sql-guide.md). 
+ +### 3.1 Register Data Source: CREATE TABLE + +Declare an external data source (e.g. Kafka) with schema, event time, and watermark strategy. This creates a **static catalog entry** that consumes no compute resources. + +```sql +CREATE TABLE ad_impressions ( + impression_id VARCHAR, + ad_id BIGINT, + campaign_id BIGINT, + user_id VARCHAR, + impression_time TIMESTAMP NOT NULL, + WATERMARK FOR impression_time AS impression_time - INTERVAL '2' SECOND +) WITH ( + 'connector' = 'kafka', + 'topic' = 'raw_ad_impressions', + 'format' = 'json', + 'bootstrap.servers' = 'localhost:9092' +); +``` + +### 3.2 Create Streaming Pipeline: CREATE STREAMING TABLE + +Launch a continuous, distributed compute pipeline using CTAS syntax. Results are written to the target connector in append-only mode. + +```sql +CREATE STREAMING TABLE metric_tumble_impressions_1m WITH ( + 'connector' = 'kafka', + 'topic' = 'sink_impressions_1m', + 'format' = 'json', + 'bootstrap.servers' = 'localhost:9092' +) AS +SELECT + TUMBLE(INTERVAL '1' MINUTE) AS time_window, + campaign_id, + COUNT(*) AS total_impressions +FROM ad_impressions +GROUP BY 1, campaign_id; +``` + +### 3.3 Inspect & Monitor + +| Command | Description | +|---------|-------------| +| `SHOW TABLES` | List all registered source tables. | +| `SHOW CREATE TABLE ` | Display the DDL of a registered table. | +| `SHOW STREAMING TABLES` | List all running streaming pipelines with status. | +| `SHOW CREATE STREAMING TABLE ` | Inspect the physical execution graph (ASCII topology). | + +### 3.4 Destroy Streaming Pipeline: DROP STREAMING TABLE + +Stop and release all resources for a streaming pipeline: + +```sql +DROP STREAMING TABLE metric_tumble_impressions_1m; +``` + +--- + +## 4. REPL Built-in Auxiliary Commands At the `function-stream>` prompt, the following convenient commands are supported: @@ -141,7 +203,7 @@ At the `function-stream>` prompt, the following convenient commands are supporte --- -## 4. Technical Constraints and Notes +## 5. 
Technical Constraints and Notes - **Path Isolation**: The SQL CLI itself is not responsible for uploading files. The file pointed to by function_path must pre-exist on the **Server machine's** disk. If remote upload packaging is required, please use the Python SDK. - **Python Function Limitations**: Since Python functions involve dynamic dependency analysis and code packaging, they are currently **not supported** for creation via SQL CLI; only lifecycle management such as START / STOP / SHOW via CLI is supported. diff --git a/docs/streaming-sql-guide-zh.md b/docs/streaming-sql-guide-zh.md new file mode 100644 index 00000000..98842614 --- /dev/null +++ b/docs/streaming-sql-guide-zh.md @@ -0,0 +1,284 @@ + + +# Streaming SQL 使用指南 + +[中文](streaming-sql-guide-zh.md) | [English](streaming-sql-guide.md) + +Function Stream 提供了声明式 SQL 接口来构建实时流处理管道。通过 Streaming SQL,您可以轻松应对无界数据流(Unbounded Data)的摄取、时间窗口聚合、流式关联以及任务生命周期管理 — 无需编写任何命令式代码。 + +--- + +## 目录 + +- [核心概念](#核心概念) +- [第一部分:注册数据源 (TABLE)](#第一部分注册数据源-table) +- [第二部分:构建实时 Pipeline (STREAMING TABLE)](#第二部分构建实时-pipeline-streaming-table) + - [滚动窗口 (Tumbling Window)](#场景-1滚动窗口-tumbling-window) + - [滑动窗口 (Hopping Window)](#场景-2滑动窗口-hopping-window) + - [会话窗口 (Session Window)](#场景-3会话窗口-session-window) + - [窗口双流关联 (Window Join)](#场景-4窗口双流关联-window-join) +- [第三部分:生命周期与流任务管理](#第三部分生命周期与流任务管理) + - [数据源管理](#1-数据源与元数据管理) + - [Pipeline 监控](#2-实时-pipeline-监控与排障) + - [停止与释放](#3-安全停止与释放资源) +- [SQL 语法速查表](#sql-语法速查表) + +--- + +## 核心概念 + +| 概念 | SQL 关键字 | 说明 | +|------|-----------|------| +| **TABLE** | `CREATE TABLE` | 系统目录(Catalog)中的静态逻辑定义。只记录外部数据源的连接信息、格式和 Schema,不消耗任何计算资源。 | +| **STREAMING TABLE** | `CREATE STREAMING TABLE ... 
AS SELECT` | 持续运行的物理数据管道。引擎会在后台拉起真实的分布式计算任务,并将结果以纯追加(Append-only)方式持续写入外部系统。 | +| **事件时间 (Event Time)** | `WATERMARK FOR ` | 引擎内部用于推进时间进度的时间戳列。 | +| **水位线 (Watermark)** | `AS - INTERVAL ...` | 对迟到乱序数据的容忍度。超过水位线的事件将被丢弃。 | + +> 支持的连接器、数据格式和 SQL 数据类型的完整参考,请参阅 [连接器、格式与类型参考](connectors-and-formats-zh.md)。 + +--- + +## 第一部分:注册数据源 (TABLE) + +`TABLE` 是系统目录(Catalog)中的静态逻辑定义。它只记录外部数据源(如 Kafka)的连接信息、格式和 Schema,**不消耗任何计算资源**。 + +在流计算中,我们必须为输入流指定**事件时间(Event Time)**和**水位线(Watermark)**,以此作为引擎内部推进时间、触发计算的唯一依据。 + +### 示例:注册广告曝光流与点击流 + +```sql +-- 1. 注册广告曝光流 +CREATE TABLE ad_impressions ( + impression_id VARCHAR, + ad_id BIGINT, + campaign_id BIGINT, + user_id VARCHAR, + impression_time TIMESTAMP NOT NULL, + -- 核心:将 impression_time 设为事件时间,并容忍最多 2 秒的数据迟到乱序 + WATERMARK FOR impression_time AS impression_time - INTERVAL '2' SECOND +) WITH ( + 'connector' = 'kafka', + 'topic' = 'raw_ad_impressions', + 'format' = 'json', + 'bootstrap.servers' = 'localhost:9092' +); + +-- 2. 注册广告点击流 +CREATE TABLE ad_clicks ( + click_id VARCHAR, + impression_id VARCHAR, + ad_id BIGINT, + click_time TIMESTAMP NOT NULL, + WATERMARK FOR click_time AS click_time - INTERVAL '5' SECOND +) WITH ( + 'connector' = 'kafka', + 'topic' = 'raw_ad_clicks', + 'format' = 'json', + 'bootstrap.servers' = 'localhost:9092' +); +``` + +**关键要素:** + +- `WATERMARK FOR <列> AS <列> - INTERVAL '' SECOND`:声明事件时间列以及允许的最大乱序延迟。 +- `WITH (...)`:连接器属性 — 类型、Topic、格式、Broker 地址。 + +--- + +## 第二部分:构建实时 Pipeline (STREAMING TABLE) + +`STREAMING TABLE` 是持续运行的物理数据管道。使用 `CREATE STREAMING TABLE ... 
AS SELECT`(CTAS)语法,引擎会在后台拉起真实的分布式计算任务,并将结果以**纯追加(Append-only)**的方式持续写入外部系统。 + +### 场景 1:滚动窗口 (Tumbling Window) + +将时间切分为互不重叠的固定窗口。 + +```sql +-- 需求:每 1 分钟统计一次各广告计划的曝光总量 +CREATE STREAMING TABLE metric_tumble_impressions_1m WITH ( + 'connector' = 'kafka', + 'topic' = 'sink_impressions_1m', + 'format' = 'json', + 'bootstrap.servers' = 'localhost:9092' +) AS +SELECT + TUMBLE(INTERVAL '1' MINUTE) AS time_window, + campaign_id, + COUNT(*) AS total_impressions +FROM ad_impressions +GROUP BY + 1, -- 指代 SELECT 中的第一个字段 (time_window) + campaign_id; +``` + +### 场景 2:滑动窗口 (Hopping Window) + +窗口之间存在重叠,用于平滑趋势监控。 + +```sql +-- 需求:统计过去 10 分钟内各广告的独立访客数(UV),每 1 分钟刷新一次 +CREATE STREAMING TABLE metric_hop_uv_10m WITH ( + 'connector' = 'kafka', + 'topic' = 'sink_uv_10m_step_1m', + 'format' = 'json', + 'bootstrap.servers' = 'localhost:9092' +) AS +SELECT + HOP(INTERVAL '1' MINUTE, INTERVAL '10' MINUTE) AS time_window, + ad_id, + COUNT(DISTINCT user_id) AS unique_users +FROM ad_impressions +GROUP BY + 1, + ad_id; +``` + +### 场景 3:会话窗口 (Session Window) + +会话窗口根据指定的不活跃间隔(Gap)对事件进行分组。如果在 Gap 时间内没有新事件到达,窗口关闭并输出结果。会话窗口非常适合用户行为会话分析。 + +```sql +-- 需求:按用户检测广告曝光会话,30 秒无活动则会话结束 +CREATE STREAMING TABLE metric_session_impressions WITH ( + 'connector' = 'kafka', + 'topic' = 'sink_session_impressions', + 'format' = 'json', + 'bootstrap.servers' = 'localhost:9092' +) AS +SELECT + SESSION(INTERVAL '30' SECOND) AS time_window, + user_id, + COUNT(*) AS impressions_in_session +FROM ad_impressions +GROUP BY + 1, + user_id; +``` + +### 场景 4:窗口双流关联 (Window Join) + +将两条流在完全相同的时间窗口内进行等值关联。因为状态限定在窗口内,水位线越过窗口后状态会自动清理,绝不发生内存泄漏(OOM)。 + +```sql +-- 需求:精确计算 5 分钟级别的点击率 (CTR) +CREATE STREAMING TABLE metric_window_join_ctr_5m WITH ( + 'connector' = 'kafka', + 'topic' = 'sink_ctr_5m', + 'format' = 'json', + 'bootstrap.servers' = 'localhost:9092' +) AS +SELECT + imp.time_window, + imp.ad_id, + imp.impressions, + COALESCE(clk.clicks, 0) AS clicks +FROM ( + SELECT TUMBLE(INTERVAL '5' MINUTE) AS time_window, ad_id, COUNT(*) AS 
impressions + FROM ad_impressions + GROUP BY 1, ad_id +) imp +LEFT JOIN ( + SELECT TUMBLE(INTERVAL '5' MINUTE) AS time_window, ad_id, COUNT(*) AS clicks + FROM ad_clicks + GROUP BY 1, ad_id +) clk +ON imp.time_window = clk.time_window AND imp.ad_id = clk.ad_id; +``` + +> **要求:**关联条件**必须**包含相同的时间窗口列,以确保状态有界。 + +--- + +## 第三部分:生命周期与流任务管理 + +Function Stream 提供了一套完整的运维指令,帮助您管理元数据目录、排查物理执行图以及销毁流计算任务。 + +### 1. 数据源与元数据管理 + +**查看所有已注册的数据源表:** + +```sql +SHOW TABLES; +``` + +列出当前 Catalog 中的所有静态表定义及其对应的 Event Time 与 Watermark 策略。 + +**查看原始建表语句(DDL):** + +```sql +SHOW CREATE TABLE ad_clicks; +``` + +用于导出或排查某张表的底层连接参数(如 Kafka Topic、Format 等)。 + +### 2. 实时 Pipeline 监控与排障 + +**查看当前运行的计算流:** + +```sql +SHOW STREAMING TABLES; +``` + +输出字段说明: + +| 字段 | 说明 | +|------|------| +| `job_id` | 计算流的名称(如 `metric_tumble_impressions_1m`)。 | +| `status` | 当前生命周期状态(如 `RUNNING`、`FAILED`)。 | +| `pipeline_count` | 该任务在底层被拆分成的并行算子链数量。 | +| `uptime` | 任务已持续运行的时长。 | + +**洞察物理执行拓扑 (Execution Graph):** + +```sql +SHOW CREATE STREAMING TABLE metric_tumble_impressions_1m; +``` + +这是 Function Stream 极其强大的排障指令。它会以 ASCII 格式打印出一条 SQL 是如何在底层被转化为真实分布式计算图的: + +- `[Source]` — 从连接器读取数据。 +- `[Operator] ExpressionWatermark` — 注入水位线。 +- `[Shuffle]` — 重分布网络数据。 +- `[Operator] TumblingWindowAggregate` — 执行真正的窗口聚合。 +- `[Sink] ConnectorSink` — 将结果发往目标连接器(如 Kafka)。 + +### 3. 安全停止与释放资源 + +当某个实时大屏活动结束,或者您需要更新计算逻辑时,必须显式销毁旧的流任务: + +```sql +DROP STREAMING TABLE metric_tumble_impressions_1m; +``` + +--- + +## SQL 语法速查表 + +| 语句 | 说明 | +|------|------| +| `CREATE TABLE ... WITH (...)` | 注册外部数据源,声明 Schema、事件时间和水位线。 | +| `CREATE STREAMING TABLE ... WITH (...) 
AS SELECT ...` | 创建并启动持续运行的流计算管道。 | +| `SHOW TABLES` | 列出所有已注册的数据源表。 | +| `SHOW CREATE TABLE ` | 显示某张表的建表 DDL。 | +| `SHOW STREAMING TABLES` | 列出所有正在运行的流计算管道及其状态。 | +| `SHOW CREATE STREAMING TABLE ` | 查看某条管道的物理执行拓扑图。 | +| `DROP STREAMING TABLE ` | 销毁流计算管道并释放所有资源。 | diff --git a/docs/streaming-sql-guide.md b/docs/streaming-sql-guide.md new file mode 100644 index 00000000..cafaf887 --- /dev/null +++ b/docs/streaming-sql-guide.md @@ -0,0 +1,283 @@ + + +# Streaming SQL Guide + +[中文](streaming-sql-guide-zh.md) | [English](streaming-sql-guide.md) + +Function Stream provides a declarative SQL interface for building real-time stream processing pipelines. With Streaming SQL you can ingest unbounded data streams, perform time-windowed aggregations, join multiple streams, and manage pipeline lifecycles — all without writing imperative code. + +--- + +## Table of Contents + +- [Core Concepts](#core-concepts) +- [Part 1: Registering Data Sources (TABLE)](#part-1-registering-data-sources-table) +- [Part 2: Building Real-Time Pipelines (STREAMING TABLE)](#part-2-building-real-time-pipelines-streaming-table) + - [Tumbling Window](#scenario-1-tumbling-window) + - [Hopping Window](#scenario-2-hopping-window) + - [Session Window](#scenario-3-session-window) + - [Window Join](#scenario-4-window-join) +- [Part 3: Lifecycle & Pipeline Management](#part-3-lifecycle--pipeline-management) + - [Data Source Management](#1-data-source--metadata-management) + - [Pipeline Monitoring](#2-real-time-pipeline-monitoring--troubleshooting) + - [Stopping & Cleanup](#3-safe-shutdown--resource-release) +- [SQL Reference Summary](#sql-reference-summary) + +--- + +## Core Concepts + +| Concept | SQL Keyword | Description | +|---------|-------------|-------------| +| **TABLE** | `CREATE TABLE` | A static logical definition in the catalog. Records external source connection info, format, and schema. Consumes no compute resources. | +| **STREAMING TABLE** | `CREATE STREAMING TABLE ... 
AS SELECT` | A physically running data pipeline. The engine allocates distributed compute tasks and continuously writes results to external systems in append-only mode. | +| **Event Time** | `WATERMARK FOR ` | The timestamp column used by the engine to track the progression of time within a stream. | +| **Watermark** | `AS - INTERVAL ...` | A tolerance for late-arriving, out-of-order data. Events arriving after the watermark are dropped. | + +> For the full reference on supported connectors, data formats, and SQL data types, see [Connectors, Formats & Data Types](connectors-and-formats.md). + +--- + +## Part 1: Registering Data Sources (TABLE) + +A `TABLE` is a static logical definition in the system catalog. It only records the connection information (e.g. Kafka broker, topic), data format, and schema of an external data source. **It does not consume any compute resources.** + +In stream processing, you must specify an **Event Time** column and a **Watermark** strategy for each input stream. The engine uses these as the sole basis for advancing time and triggering computations. + +### Example: Register an Ad-Impressions Stream and a Clicks Stream + +```sql +-- 1. Register the ad-impressions stream +CREATE TABLE ad_impressions ( + impression_id VARCHAR, + ad_id BIGINT, + campaign_id BIGINT, + user_id VARCHAR, + impression_time TIMESTAMP NOT NULL, + WATERMARK FOR impression_time AS impression_time - INTERVAL '2' SECOND +) WITH ( + 'connector' = 'kafka', + 'topic' = 'raw_ad_impressions', + 'format' = 'json', + 'bootstrap.servers' = 'localhost:9092' +); + +-- 2. 
Register the ad-clicks stream +CREATE TABLE ad_clicks ( + click_id VARCHAR, + impression_id VARCHAR, + ad_id BIGINT, + click_time TIMESTAMP NOT NULL, + WATERMARK FOR click_time AS click_time - INTERVAL '5' SECOND +) WITH ( + 'connector' = 'kafka', + 'topic' = 'raw_ad_clicks', + 'format' = 'json', + 'bootstrap.servers' = 'localhost:9092' +); +``` + +**Key elements:** + +- `WATERMARK FOR AS - INTERVAL '' SECOND`: declares the event-time column and the maximum tolerated out-of-order delay. +- `WITH (...)`: connector properties — type, topic, format, and broker address. + +--- + +## Part 2: Building Real-Time Pipelines (STREAMING TABLE) + +A `STREAMING TABLE` is a continuously running physical data pipeline. Using the `CREATE STREAMING TABLE ... AS SELECT` (CTAS) syntax, the engine launches real distributed compute tasks in the background and continuously writes results to an external system in **append-only** mode. + +### Scenario 1: Tumbling Window + +Divides time into fixed, non-overlapping windows. + +```sql +-- Count total impressions per campaign every 1 minute +CREATE STREAMING TABLE metric_tumble_impressions_1m WITH ( + 'connector' = 'kafka', + 'topic' = 'sink_impressions_1m', + 'format' = 'json', + 'bootstrap.servers' = 'localhost:9092' +) AS +SELECT + TUMBLE(INTERVAL '1' MINUTE) AS time_window, + campaign_id, + COUNT(*) AS total_impressions +FROM ad_impressions +GROUP BY + 1, + campaign_id; +``` + +### Scenario 2: Hopping Window + +Windows overlap, useful for smoothed trend monitoring. 
+ +```sql +-- Count distinct visitors (UV) per ad over the last 10 minutes, refreshed every 1 minute +CREATE STREAMING TABLE metric_hop_uv_10m WITH ( + 'connector' = 'kafka', + 'topic' = 'sink_uv_10m_step_1m', + 'format' = 'json', + 'bootstrap.servers' = 'localhost:9092' +) AS +SELECT + HOP(INTERVAL '1' MINUTE, INTERVAL '10' MINUTE) AS time_window, + ad_id, + COUNT(DISTINCT user_id) AS unique_users +FROM ad_impressions +GROUP BY + 1, + ad_id; +``` + +### Scenario 3: Session Window + +A session window groups events that arrive within a specified gap of inactivity. If no new event arrives within the gap duration, the window closes and emits results. Session windows are ideal for user-session analysis. + +```sql +-- Detect ad-impression sessions per user; a session ends after 30 seconds of inactivity +CREATE STREAMING TABLE metric_session_impressions WITH ( + 'connector' = 'kafka', + 'topic' = 'sink_session_impressions', + 'format' = 'json', + 'bootstrap.servers' = 'localhost:9092' +) AS +SELECT + SESSION(INTERVAL '30' SECOND) AS time_window, + user_id, + COUNT(*) AS impressions_in_session +FROM ad_impressions +GROUP BY + 1, + user_id; +``` + +### Scenario 4: Window Join + +Join two streams within exactly the same time window. Because state is bounded by the window, memory is automatically reclaimed once the watermark advances past the window boundary — eliminating the risk of OOM. 
+ +```sql +-- Calculate 5-minute click-through rate (CTR) +CREATE STREAMING TABLE metric_window_join_ctr_5m WITH ( + 'connector' = 'kafka', + 'topic' = 'sink_ctr_5m', + 'format' = 'json', + 'bootstrap.servers' = 'localhost:9092' +) AS +SELECT + imp.time_window, + imp.ad_id, + imp.impressions, + COALESCE(clk.clicks, 0) AS clicks +FROM ( + SELECT TUMBLE(INTERVAL '5' MINUTE) AS time_window, ad_id, COUNT(*) AS impressions + FROM ad_impressions + GROUP BY 1, ad_id +) imp +LEFT JOIN ( + SELECT TUMBLE(INTERVAL '5' MINUTE) AS time_window, ad_id, COUNT(*) AS clicks + FROM ad_clicks + GROUP BY 1, ad_id +) clk +ON imp.time_window = clk.time_window AND imp.ad_id = clk.ad_id; +``` + +> **Requirement:** The join condition **must** include the same time-window column to ensure bounded state. + +--- + +## Part 3: Lifecycle & Pipeline Management + +Function Stream provides a complete set of operational commands for managing the metadata catalog, inspecting physical execution graphs, and destroying streaming pipelines. + +### 1. Data Source & Metadata Management + +**List all registered source tables:** + +```sql +SHOW TABLES; +``` + +Lists all static table definitions in the current catalog along with their Event Time and Watermark strategies. + +**Show the original DDL of a table:** + +```sql +SHOW CREATE TABLE ad_clicks; +``` + +Useful for exporting or auditing the underlying connection parameters (Kafka topic, format, etc.). + +### 2. Real-Time Pipeline Monitoring & Troubleshooting + +**List all running streaming pipelines:** + +```sql +SHOW STREAMING TABLES; +``` + +Output columns: + +| Column | Description | +|--------|-------------| +| `job_id` | Pipeline name (e.g. `metric_tumble_impressions_1m`). | +| `status` | Lifecycle state (`RUNNING`, `FAILED`, etc.). | +| `pipeline_count` | Number of parallel operator chains the engine split the job into. | +| `uptime` | How long the pipeline has been running. 
| + +**Inspect the physical execution topology:** + +```sql +SHOW CREATE STREAMING TABLE metric_tumble_impressions_1m; +``` + +This prints an ASCII representation of how the SQL was translated into a distributed execution graph: + +- `[Source]` — reads from the connector. +- `[Operator] ExpressionWatermark` — injects watermarks. +- `[Shuffle]` — redistributes data across the network. +- `[Operator] TumblingWindowAggregate` — performs the actual windowed aggregation. +- `[Sink] ConnectorSink` — writes results to the target connector (e.g. Kafka). + +### 3. Safe Shutdown & Resource Release + +When a campaign ends or you need to update the pipeline logic, explicitly destroy the old streaming pipeline: + +```sql +DROP STREAMING TABLE metric_tumble_impressions_1m; +``` + +--- + +## SQL Reference Summary + +| Statement | Description | +|-----------|-------------| +| `CREATE TABLE ... WITH (...)` | Register an external data source with schema, event time, and watermark. | +| `CREATE STREAMING TABLE ... WITH (...) AS SELECT ...` | Create and launch a continuous streaming pipeline. | +| `SHOW TABLES` | List all registered source tables. | +| `SHOW CREATE TABLE ` | Display the DDL of a registered table. | +| `SHOW STREAMING TABLES` | List all running streaming pipelines with status. | +| `SHOW CREATE STREAMING TABLE ` | Inspect the physical execution graph of a pipeline. | +| `DROP STREAMING TABLE ` | Destroy a streaming pipeline and release all resources. 
| diff --git a/protocol/Cargo.toml b/protocol/Cargo.toml index fde9de52..5fa7d0f0 100644 --- a/protocol/Cargo.toml +++ b/protocol/Cargo.toml @@ -9,6 +9,7 @@ repository = "https://github.com/your-username/rust-function-stream" [dependencies] prost = "0.13" tonic = { version = "0.12", features = ["default"] } +serde = { version = "1.0", features = ["derive"] } log = "0.4" [build-dependencies] diff --git a/protocol/build.rs b/protocol/build.rs index 17e77d30..d3943f53 100644 --- a/protocol/build.rs +++ b/protocol/build.rs @@ -10,54 +10,65 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::path::Path; +use std::path::{Path, PathBuf}; fn main() -> Result<(), Box> { - // Initialize logger for build script env_logger::init(); - // Create output directories in the protocol package directory - // Use CARGO_MANIFEST_DIR to get the package root directory let manifest_dir = std::env::var("CARGO_MANIFEST_DIR")?; let out_dir = Path::new(&manifest_dir).join("generated"); - let proto_file = Path::new(&manifest_dir).join("proto/function_stream.proto"); - - // Note: Cargo doesn't directly support cleaning custom directories via cargo clean. - // The generated directory will be automatically regenerated on each build if needed. - // To clean it manually, use: ./clean.sh or make clean or rm -rf protocol/generated log::info!("Generated code will be placed in: {}", out_dir.display()); - log::info!("Proto file: {}", proto_file.display()); - // Create output directories let cli_dir = out_dir.join("cli"); let service_dir = out_dir.join("service"); std::fs::create_dir_all(&cli_dir)?; std::fs::create_dir_all(&service_dir)?; - log::info!( - "Created output directories: {} and {}", - cli_dir.display(), - service_dir.display() - ); - // Generate code for CLI - only client code needed + // 1. 
function_stream.proto → CLI (client) and Service (server) tonic_build::configure() .out_dir(&cli_dir) - .build_client(true) // Enable client code generation - .build_server(false) // Disable server code generation + .build_client(true) + .build_server(false) .compile_protos(&["proto/function_stream.proto"], &["proto"])?; - // Generate code for Service - only server code needed tonic_build::configure() .out_dir(&service_dir) - .build_client(false) // Disable client code generation - .build_server(true) // Enable server code generation + .build_client(false) + .build_server(true) .compile_protos(&["proto/function_stream.proto"], &["proto"])?; + let api_dir = out_dir.join("api"); + std::fs::create_dir_all(&api_dir)?; + + let descriptor_path = + PathBuf::from(std::env::var("OUT_DIR").unwrap()).join("fs_api_descriptor.bin"); + + tonic_build::configure() + .out_dir(&api_dir) + .protoc_arg("--experimental_allow_proto3_optional") + .file_descriptor_set_path(&descriptor_path) + .type_attribute(".", "#[derive(serde::Serialize, serde::Deserialize)]") + .type_attribute(".", "#[serde(rename_all = \"camelCase\")]") + .build_client(false) + .build_server(false) + .compile_protos(&["proto/fs_api.proto"], &["proto"])?; + + let storage_dir = out_dir.join("storage"); + std::fs::create_dir_all(&storage_dir)?; + tonic_build::configure() + .out_dir(&storage_dir) + .protoc_arg("--experimental_allow_proto3_optional") + .build_client(false) + .build_server(false) + .compile_protos(&["proto/storage.proto"], &["proto"])?; + log::info!("Protocol Buffers code generated successfully"); println!("cargo:rustc-env=PROTO_GEN_DIR={}", out_dir.display()); - println!("cargo:rerun-if-changed={}", proto_file.display()); + println!("cargo:rerun-if-changed=proto/function_stream.proto"); + println!("cargo:rerun-if-changed=proto/fs_api.proto"); + println!("cargo:rerun-if-changed=proto/storage.proto"); Ok(()) } diff --git a/protocol/proto/fs_api.proto b/protocol/proto/fs_api.proto new file mode 100644 index 
00000000..1f578ffe --- /dev/null +++ b/protocol/proto/fs_api.proto @@ -0,0 +1,423 @@ +// Licensed under the Apache License, Version 2.0 +// Adapted from Arroyo's api.proto for FunctionStream + +syntax = "proto3"; +package fs_api; + +// ─────────────────────── Operators ─────────────────────── + +message ConnectorOp { + string connector = 1; + reserved 2; // removed: map config_map + optional FsSchema fs_schema = 3; + string name = 4; + string description = 5; + + oneof config { + KafkaSourceConfig kafka_source = 6; + KafkaSinkConfig kafka_sink = 7; + GenericConnectorConfig generic = 8; + } +} + +// ─────────────────────── Kafka Connector Configs ─────────────────────── + +message KafkaSourceConfig { + string topic = 1; + string bootstrap_servers = 2; + optional string group_id = 3; + optional string group_id_prefix = 4; + KafkaOffsetMode offset_mode = 5; + KafkaReadMode read_mode = 6; + KafkaAuthConfig auth = 7; + map client_configs = 8; + FormatConfig format = 9; + BadDataPolicy bad_data_policy = 10; + uint32 rate_limit_msgs_per_sec = 11; + optional string value_subject = 12; +} + +message KafkaSinkConfig { + string topic = 1; + string bootstrap_servers = 2; + KafkaSinkCommitMode commit_mode = 3; + optional string key_field = 4; + optional string timestamp_field = 5; + KafkaAuthConfig auth = 6; + map client_configs = 7; + FormatConfig format = 8; + optional string value_subject = 9; +} + +// Fallback for non-Kafka connectors that are not yet strongly typed. 
+message GenericConnectorConfig { + map properties = 1; +} + +// ─────────────────────── Kafka Auth ─────────────────────── + +message KafkaAuthConfig { + oneof auth { + KafkaAuthNone none = 1; + KafkaAuthSasl sasl = 2; + KafkaAuthAwsMskIam aws_msk_iam = 3; + } +} + +message KafkaAuthNone {} + +message KafkaAuthSasl { + string protocol = 1; + string mechanism = 2; + string username = 3; + string password = 4; +} + +message KafkaAuthAwsMskIam { + string region = 1; +} + +// ─────────────────────── Format & Data-Quality ─────────────────────── + +message FormatConfig { + oneof format { + JsonFormatConfig json = 1; + RawStringFormatConfig raw_string = 2; + RawBytesFormatConfig raw_bytes = 3; + } +} + +message JsonFormatConfig { + TimestampFormatProto timestamp_format = 1; + DecimalEncodingProto decimal_encoding = 2; + bool include_schema = 3; + bool confluent_schema_registry = 4; + optional uint32 schema_id = 5; + bool debezium = 6; + bool unstructured = 7; +} + +message RawStringFormatConfig {} +message RawBytesFormatConfig {} + +// ─────────────────────── Kafka Enums ─────────────────────── + +enum TimestampFormatProto { + TIMESTAMP_RFC3339 = 0; + TIMESTAMP_UNIX_MILLIS = 1; +} + +enum DecimalEncodingProto { + DECIMAL_NUMBER = 0; + DECIMAL_STRING = 1; + DECIMAL_BYTES = 2; +} + +enum BadDataPolicy { + BAD_DATA_FAIL = 0; + BAD_DATA_DROP = 1; +} + +enum KafkaOffsetMode { + KAFKA_OFFSET_EARLIEST = 0; + KAFKA_OFFSET_LATEST = 1; + KAFKA_OFFSET_GROUP = 2; +} + +enum KafkaReadMode { + KAFKA_READ_DEFAULT = 0; + KAFKA_READ_COMMITTED = 1; + KAFKA_READ_UNCOMMITTED = 2; +} + +enum KafkaSinkCommitMode { + KAFKA_SINK_AT_LEAST_ONCE = 0; + KAFKA_SINK_EXACTLY_ONCE = 1; +} + +message ValuePlanOperator { + string name = 1; + bytes physical_plan = 2; +} + +message KeyPlanOperator { + string name = 1; + bytes physical_plan = 2; + repeated uint64 key_fields = 3; +} + +message ProjectionOperator { + string name = 1; + FsSchema input_schema = 2; + FsSchema output_schema = 3; + repeated bytes 
exprs = 4; +} + +message TumblingWindowAggregateOperator { + string name = 1; + uint64 width_micros = 2; + bytes binning_function = 3; + FsSchema input_schema = 4; + FsSchema partial_schema = 5; + bytes partial_aggregation_plan = 6; + bytes final_aggregation_plan = 7; + optional bytes final_projection = 8; +} + +message SlidingWindowAggregateOperator { + string name = 1; + uint64 width_micros = 2; + uint64 slide_micros = 3; + bytes binning_function = 4; + FsSchema input_schema = 5; + FsSchema partial_schema = 6; + bytes partial_aggregation_plan = 7; + bytes final_aggregation_plan = 8; + bytes final_projection = 9; +} + +message SessionWindowAggregateOperator { + string name = 1; + uint64 gap_micros = 2; + string window_field_name = 3; + uint64 window_index = 4; + FsSchema input_schema = 5; + FsSchema unkeyed_aggregate_schema = 6; + bytes partial_aggregation_plan = 7; + bytes final_aggregation_plan = 8; +} + +message JoinOperator { + string name = 1; + FsSchema left_schema = 2; + FsSchema right_schema = 3; + FsSchema output_schema = 4; + bytes join_plan = 5; + optional uint64 ttl_micros = 6; +} + +message LookupJoinCondition { + bytes left_expr = 1; + string right_key = 2; +} + +message LookupJoinOperator { + FsSchema input_schema = 1; + FsSchema lookup_schema = 2; + ConnectorOp connector = 3; + repeated LookupJoinCondition key_exprs = 4; + JoinType join_type = 5; + optional uint64 ttl_micros = 6; + optional uint64 max_capacity_bytes = 7; +} + +message WindowFunctionOperator { + string name = 1; + FsSchema input_schema = 2; + bytes binning_function = 3; + bytes window_function_plan = 4; +} + +enum AsyncUdfOrdering { + UNORDERED = 0; + ORDERED = 1; +} + +message AsyncUdfOperator { + string name = 1; + DylibUdfConfig udf = 2; + repeated bytes arg_exprs = 3; + repeated bytes final_exprs = 4; + AsyncUdfOrdering ordering = 5; + uint32 max_concurrency = 6; + uint64 timeout_micros = 7; +} + +message UpdatingAggregateOperator { + string name = 1; + FsSchema input_schema = 
2; + FsSchema final_schema = 3; + bytes aggregate_exec = 5; + bytes metadata_expr = 6; + uint64 flush_interval_micros = 7; + uint64 ttl_micros = 8; +} + +// ─────────────────────── Watermark ─────────────────────── + +message ExpressionWatermarkConfig { + uint64 period_micros = 1; + optional uint64 idle_time_micros = 2; + FsSchema input_schema = 3; + bytes expression = 4; +} + +// ─────────────────────── Windows ─────────────────────── + +message Window { + oneof window { + SlidingWindow sliding_window = 2; + TumblingWindow tumbling_window = 3; + InstantWindow instant_window = 4; + SessionWindow session_window = 5; + } +} + +message SlidingWindow { + uint64 size_micros = 1; + uint64 slide_micros = 2; +} + +message TumblingWindow { + uint64 size_micros = 1; +} + +message InstantWindow {} + +message SessionWindow { + uint64 gap_micros = 1; +} + +// ─────────────────────── Enums ─────────────────────── + +enum JoinType { + INNER = 0; + LEFT = 1; + RIGHT = 2; + FULL = 3; +} + +enum OffsetMode { + EARLIEST = 0; + LATEST = 1; +} + +enum EdgeType { + UNUSED = 0; + FORWARD = 1; + SHUFFLE = 2; + LEFT_JOIN = 3; + RIGHT_JOIN = 4; +} + +// ─────────────────── Physical Extension Nodes ─────────────────── + +message MemExecNode { + string table_name = 1; + string schema = 2; // json-encoded +} + +message UnnestExecNode { + string schema = 1; // json-encoded +} + +message DebeziumDecodeNode { + string schema = 1; // json-encoded + repeated uint64 primary_keys = 2; +} + +message DebeziumEncodeNode { + string schema = 1; // json-encoded +} + +message FsExecNode { + oneof node { + MemExecNode mem_exec = 1; + UnnestExecNode unnest_exec = 2; + DebeziumDecodeNode debezium_decode = 3; + DebeziumEncodeNode debezium_encode = 4; + } +} + +// ─────────────────── Checkpoints ─────────────────── + +enum TaskCheckpointEventType { + ALIGNMENT_STARTED = 0; + CHECKPOINT_STARTED = 1; + CHECKPOINT_OPERATOR_SETUP_FINISHED = 2; + CHECKPOINT_SYNC_FINISHED = 3; + CHECKPOINT_PRE_COMMIT = 4; +} + 
+message TaskCheckpointEvent { + uint64 time = 1; + TaskCheckpointEventType event_type = 2; +} + +message TaskCheckpointDetail { + uint32 subtask_index = 1; + uint64 start_time = 2; + optional uint64 finish_time = 3; + optional uint64 bytes = 4; + repeated TaskCheckpointEvent events = 5; +} + +message OperatorCheckpointDetail { + string operator_id = 1; + uint64 start_time = 2; + optional uint64 finish_time = 3; + bool has_state = 4; + optional uint64 started_metadata_write = 6; + map tasks = 5; +} + +// ─────────────────── UDF Config ─────────────────── + +message DylibUdfConfig { + string dylib_path = 1; + repeated bytes arg_types = 2; + bytes return_type = 3; + bool aggregate = 4; + bool is_async = 5; +} + +message PythonUdfConfig { + string name = 1; + repeated bytes arg_types = 2; + bytes return_type = 3; + string definition = 4; +} + +message FsProgramConfig { + map udf_dylibs = 1; + map python_udfs = 2; +} + +// ─────────────────── Arrow Program ─────────────────── + +message FsProgram { + repeated FsNode nodes = 1; + repeated FsEdge edges = 2; + FsProgramConfig program_config = 3; +} + +message FsSchema { + string arrow_schema = 1; // json-encoded Arrow Schema + uint32 timestamp_index = 2; + repeated uint32 key_indices = 3; + bool has_keys = 4; + repeated uint32 routing_key_indices = 5; + bool has_routing_keys = 6; +} + +message ChainedOperator { + string operator_id = 1; + string operator_name = 2; + bytes operator_config = 3; +} + +message FsNode { + int32 node_index = 1; + uint32 node_id = 2; + uint32 parallelism = 3; + string description = 4; + repeated ChainedOperator operators = 5; + repeated FsSchema edges = 6; +} + +message FsEdge { + int32 source = 1; + int32 target = 2; + FsSchema schema = 4; + EdgeType edge_type = 5; +} diff --git a/protocol/proto/storage.proto b/protocol/proto/storage.proto new file mode 100644 index 00000000..f107d472 --- /dev/null +++ b/protocol/proto/storage.proto @@ -0,0 +1,107 @@ +// Licensed under the Apache License, 
Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// +// All durable / persisted payloads for FunctionStream (single source of truth for storage wire format). +// - Stream table catalog (MetaStore KV) +// - Task rows (RocksDB task_meta / task_payload; values may be prefixed — see runtime codec) + +syntax = "proto3"; + +package function_stream.storage; + +// ============================================================================= +// Catalog table storage (coordinator SQL catalog) +// ============================================================================= + +// Top-level persisted record for one catalog table. +message TableDefinition { + string table_name = 1; + int64 updated_at_millis = 2; + oneof table_type { + // Connector-backed ingestion/egress table definition. + CatalogSourceTable connector_table = 3; + // Connector-backed lookup table definition. + CatalogSourceTable lookup_table = 5; + } +} + +// Shared connector-backed table payload for connector/lookup entries. +message CatalogSourceTable { + bytes arrow_schema_ipc = 1; + optional string event_time_field = 2; + optional string watermark_field = 3; + // Original CREATE TABLE ... WITH ('k'='v', ...) pairs — single source of truth. + map with_options = 4; + // Canonical connector identifier (e.g. kafka, postgres-cdc). + string connector = 5; + reserved 6; // removed: string opaque_config (JSON blob no longer needed) + // Human-readable note from DDL (ConnectorOp.description). + string description = 7; +} + +// ============================================================================= +// Streaming table storage (CREATE STREAMING TABLE persistence) +// ============================================================================= + +// Persisted record for one streaming table (CREATE STREAMING TABLE). +// On restart, the engine re-submits each record to JobManager to resume the pipeline. 
+message StreamingTableDefinition { + string table_name = 1; + int64 created_at_millis = 2; + // Serialized function_stream.api.FsProgram — the full execution graph. + // Stored as opaque bytes to avoid coupling storage schema with runtime API protos. + bytes fs_program_bytes = 3; + string comment = 4; +} + +// ============================================================================= +// Task storage (RocksDB metadata + module payload) +// ============================================================================= + +// Lifecycle state persisted for task recovery. New enum values MUST be appended +// with new numbers (never renumber) for forward compatibility. +enum ComponentStateKind { + COMPONENT_STATE_KIND_UNSPECIFIED = 0; + UNINITIALIZED = 1; + INITIALIZED = 2; + STARTING = 3; + RUNNING = 4; + CHECKPOINTING = 5; + STOPPING = 6; + STOPPED = 7; + CLOSING = 8; + CLOSED = 9; + ERROR = 10; +} + +message ComponentStateProto { + ComponentStateKind kind = 1; + // Set when kind == ERROR + string error_message = 2; +} + +// Stored in CF task_meta (after magic prefix FSP1). +message TaskMetadataProto { + string task_type = 1; + ComponentStateProto state = 2; + uint64 created_at = 3; + optional uint64 checkpoint_id = 4; +} + +message TaskModuleWasm { + bytes wasm_binary = 1; +} + +message TaskModulePython { + string class_name = 1; + string module_path = 2; + optional bytes embedded_code = 3; +} + +// Stored in CF task_payload (after magic prefix FSP1). +message TaskModulePayloadProto { + oneof payload { + TaskModuleWasm wasm = 1; + TaskModulePython python = 2; + } +} diff --git a/protocol/src/lib.rs b/protocol/src/lib.rs index b0c6da06..d1bdfff9 100644 --- a/protocol/src/lib.rs +++ b/protocol/src/lib.rs @@ -10,25 +10,39 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-// Protocol Buffers protocol definitions for function stream -// This module exports the generated Protocol Buffers code +// ─────────────── FunctionStream Service (original) ─────────────── -// CLI module - exports client code #[path = "../generated/cli/function_stream.rs"] pub mod cli; -// Service module - exports server code #[path = "../generated/service/function_stream.rs"] pub mod service; -// Re-export commonly used types from both modules -// Data structures are the same in both, so we can re-export from either pub use cli::function_stream_service_client; - -// Re-export client-specific types pub use cli::function_stream_service_client::FunctionStreamServiceClient; - -// Re-export server-specific types pub use service::function_stream_service_server::{ FunctionStreamService, FunctionStreamServiceServer, }; + +// ─────────────── Streaming Pipeline API (fs_api.proto) ─────────────── + +pub mod grpc { + /// Serde-annotated API types for streaming operators, schemas, programs. + #[allow(clippy::all)] + pub mod api { + include!("../generated/api/fs_api.rs"); + } +} + +/// File descriptor set for fs_api.proto (for gRPC reflection / REST gateway). +pub const FS_API_FILE_DESCRIPTOR_SET: &[u8] = + tonic::include_file_descriptor_set!("fs_api_descriptor"); + +// ─────────────── Durable storage (storage.proto: catalog + task rows) ─────────────── + +/// Prost types for persisted stream catalog and task storage (`proto/storage.proto`). +pub mod storage { + #![allow(clippy::all)] + #![allow(warnings)] + include!("../generated/storage/function_stream.storage.rs"); +} diff --git a/src/common/fs_schema.rs b/src/common/fs_schema.rs new file mode 100644 index 00000000..4229b957 --- /dev/null +++ b/src/common/fs_schema.rs @@ -0,0 +1,456 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! FunctionStream table/stream schema: Arrow [`Schema`] plus timestamp index and optional key columns. +//! +//! [`Schema`]: datafusion::arrow::datatypes::Schema + +use datafusion::arrow::array::builder::{ArrayBuilder, make_builder}; +use datafusion::arrow::array::{RecordBatch, TimestampNanosecondArray}; +use datafusion::arrow::datatypes::{DataType, Field, FieldRef, Schema, SchemaBuilder, TimeUnit}; +use datafusion::arrow::error::ArrowError; +use datafusion::common::{DataFusionError, Result as DFResult}; +use std::sync::Arc; +use std::time::SystemTime; +use arrow::compute::{filter_record_batch, lexsort_to_indices, partition, take, SortColumn}; +use arrow::compute::kernels::cmp::gt_eq; +use arrow::compute::kernels::numeric::div; +use arrow::row::SortField; +use arrow_array::{PrimitiveArray, UInt64Array}; +use arrow_array::types::UInt64Type; +use protocol::grpc::api; +use super::{to_nanos, TIMESTAMP_FIELD}; +use std::ops::Range; +use crate::common::converter::Converter; + +pub type FsSchemaRef = Arc; + +#[derive(Debug, Clone, Eq, PartialEq, Hash)] +pub struct FsSchema { + pub schema: Arc, + pub timestamp_index: usize, + key_indices: Option>, + /// If defined, these indices are used for routing (i.e., which subtask gets which piece of data) + routing_key_indices: Option>, +} + +impl TryFrom for FsSchema { + type Error = DataFusionError; + fn try_from(schema_proto: api::FsSchema) -> Result { + let schema: Schema = serde_json::from_str(&schema_proto.arrow_schema) + .map_err(|e| DataFusionError::Plan(format!("Invalid arrow schema: {e}")))?; + let 
timestamp_index = schema_proto.timestamp_index as usize; + + let key_indices = schema_proto.has_keys.then(|| { + schema_proto + .key_indices + .into_iter() + .map(|index| index as usize) + .collect() + }); + + let routing_key_indices = schema_proto.has_routing_keys.then(|| { + schema_proto + .routing_key_indices + .into_iter() + .map(|index| index as usize) + .collect() + }); + + Ok(Self { + schema: Arc::new(schema), + timestamp_index, + key_indices, + routing_key_indices, + }) + } +} + +impl From for api::FsSchema { + fn from(schema: FsSchema) -> Self { + let arrow_schema = serde_json::to_string(schema.schema.as_ref()).unwrap(); + let timestamp_index = schema.timestamp_index as u32; + + let has_keys = schema.key_indices.is_some(); + let key_indices = schema + .key_indices + .map(|ks| ks.into_iter().map(|index| index as u32).collect()) + .unwrap_or_default(); + + let has_routing_keys = schema.routing_key_indices.is_some(); + let routing_key_indices = schema + .routing_key_indices + .map(|ks| ks.into_iter().map(|index| index as u32).collect()) + .unwrap_or_default(); + + Self { + arrow_schema, + timestamp_index, + key_indices, + has_keys, + routing_key_indices, + has_routing_keys, + } + } +} + +impl FsSchema { + pub fn new( + schema: Arc, + timestamp_index: usize, + key_indices: Option>, + routing_key_indices: Option>, + ) -> Self { + Self { + schema, + timestamp_index, + key_indices, + routing_key_indices, + } + } + pub fn new_unkeyed(schema: Arc, timestamp_index: usize) -> Self { + Self { + schema, + timestamp_index, + key_indices: None, + routing_key_indices: None, + } + } + pub fn new_keyed(schema: Arc, timestamp_index: usize, key_indices: Vec) -> Self { + Self { + schema, + timestamp_index, + key_indices: Some(key_indices), + routing_key_indices: None, + } + } + + pub fn from_fields(mut fields: Vec) -> Self { + if !fields.iter().any(|f| f.name() == TIMESTAMP_FIELD) { + fields.push(Field::new( + TIMESTAMP_FIELD, + DataType::Timestamp(TimeUnit::Nanosecond, None), 
+ false, + )); + } + + Self::from_schema_keys(Arc::new(Schema::new(fields)), vec![]).unwrap() + } + + pub fn from_schema_unkeyed(schema: Arc) -> DFResult { + let timestamp_index = schema + .column_with_name(TIMESTAMP_FIELD) + .ok_or_else(|| { + DataFusionError::Plan(format!( + "no {TIMESTAMP_FIELD} field in schema, schema is {schema:?}" + )) + })? + .0; + + Ok(Self { + schema, + timestamp_index, + key_indices: None, + routing_key_indices: None, + }) + } + + pub fn from_schema_keys(schema: Arc, key_indices: Vec) -> DFResult { + let timestamp_index = schema + .column_with_name(TIMESTAMP_FIELD) + .ok_or_else(|| { + DataFusionError::Plan(format!( + "no {TIMESTAMP_FIELD} field in schema, schema is {schema:?}" + )) + })? + .0; + + Ok(Self { + schema, + timestamp_index, + key_indices: Some(key_indices), + routing_key_indices: None, + }) + } + + pub fn schema_without_timestamp(&self) -> Schema { + let mut builder = SchemaBuilder::from(self.schema.fields()); + builder.remove(self.timestamp_index); + builder.finish() + } + + pub fn remove_timestamp_column(&self, batch: &mut RecordBatch) { + batch.remove_column(self.timestamp_index); + } + + pub fn builders(&self) -> Vec> { + self.schema + .fields + .iter() + .map(|f| make_builder(f.data_type(), 8)) + .collect() + } + + pub fn timestamp_column<'a>(&self, batch: &'a RecordBatch) -> &'a TimestampNanosecondArray { + batch + .column(self.timestamp_index) + .as_any() + .downcast_ref::() + .unwrap() + } + + pub fn has_routing_keys(&self) -> bool { + self.routing_keys().map(|k| !k.is_empty()).unwrap_or(false) + } + + pub fn routing_keys(&self) -> Option<&Vec> { + self.routing_key_indices + .as_ref() + .or(self.key_indices.as_ref()) + } + + pub fn storage_keys(&self) -> Option<&Vec> { + self.key_indices.as_ref() + } + + pub fn filter_by_time( + &self, + batch: RecordBatch, + cutoff: Option, + ) -> Result { + let Some(cutoff) = cutoff else { + // no watermark, so we just return the same batch. 
+ return Ok(batch); + }; + // filter out late data + let timestamp_column = batch + .column(self.timestamp_index) + .as_any() + .downcast_ref::() + .ok_or_else(|| ArrowError::CastError( + format!("failed to downcast column {} of {:?} to timestamp. Schema is supposed to be {:?}", + self.timestamp_index, batch, self.schema)))?; + let cutoff_scalar = TimestampNanosecondArray::new_scalar(to_nanos(cutoff) as i64); + let on_time = gt_eq(timestamp_column, &cutoff_scalar)?; + filter_record_batch(&batch, &on_time) + } + + pub fn sort_columns(&self, batch: &RecordBatch, with_timestamp: bool) -> Vec { + let mut columns = vec![]; + if let Some(keys) = &self.key_indices { + columns.extend(keys.iter().map(|index| SortColumn { + values: batch.column(*index).clone(), + options: None, + })); + } + if with_timestamp { + columns.push(SortColumn { + values: batch.column(self.timestamp_index).clone(), + options: None, + }); + } + columns + } + + pub fn sort_fields(&self, with_timestamp: bool) -> Vec { + let mut sort_fields = vec![]; + if let Some(keys) = &self.key_indices { + sort_fields.extend(keys.iter()); + } + if with_timestamp { + sort_fields.push(self.timestamp_index); + } + self.sort_fields_by_indices(&sort_fields) + } + + fn sort_fields_by_indices(&self, indices: &[usize]) -> Vec { + indices + .iter() + .map(|index| SortField::new(self.schema.field(*index).data_type().clone())) + .collect() + } + + pub fn converter(&self, with_timestamp: bool) -> Result { + Converter::new(self.sort_fields(with_timestamp)) + } + + pub fn value_converter( + &self, + with_timestamp: bool, + generation_index: usize, + ) -> Result { + match &self.key_indices { + None => { + let mut indices = (0..self.schema.fields().len()).collect::>(); + indices.remove(generation_index); + if !with_timestamp { + indices.remove(self.timestamp_index); + } + Converter::new(self.sort_fields_by_indices(&indices)) + } + Some(keys) => { + let indices = (0..self.schema.fields().len()) + .filter(|index| { + 
!keys.contains(index) + && (with_timestamp || *index != self.timestamp_index) + && *index != generation_index + }) + .collect::>(); + Converter::new(self.sort_fields_by_indices(&indices)) + } + } + } + + pub fn value_indices(&self, with_timestamp: bool) -> Vec { + let field_count = self.schema.fields().len(); + match &self.key_indices { + None => { + let mut indices = (0..field_count).collect::>(); + + if !with_timestamp { + indices.remove(self.timestamp_index); + } + indices + } + Some(keys) => (0..field_count) + .filter(|index| { + !keys.contains(index) && (with_timestamp || *index != self.timestamp_index) + }) + .collect::>(), + } + } + + pub fn sort( + &self, + batch: RecordBatch, + with_timestamp: bool, + ) -> Result { + if self.key_indices.is_none() && !with_timestamp { + return Ok(batch); + } + let sort_columns = self.sort_columns(&batch, with_timestamp); + let sort_indices = lexsort_to_indices(&sort_columns, None).expect("should be able to sort"); + let columns = batch + .columns() + .iter() + .map(|c| take(c, &sort_indices, None).unwrap()) + .collect(); + + RecordBatch::try_new(batch.schema(), columns) + } + + pub fn partition( + &self, + batch: &RecordBatch, + with_timestamp: bool, + ) -> Result>, ArrowError> { + if self.key_indices.is_none() && !with_timestamp { + #[allow(clippy::single_range_in_vec_init)] + return Ok(vec![0..batch.num_rows()]); + } + + let mut partition_columns = vec![]; + + if let Some(keys) = &self.routing_keys() { + partition_columns.extend(keys.iter().map(|index| batch.column(*index).clone())); + } + if with_timestamp { + partition_columns.push(batch.column(self.timestamp_index).clone()); + } + + Ok(partition(&partition_columns)?.ranges()) + } + + pub fn unkeyed_batch(&self, batch: &RecordBatch) -> Result { + if self.key_indices.is_none() { + return Ok(batch.clone()); + } + let columns: Vec<_> = (0..batch.num_columns()) + .filter(|index| !self.key_indices.as_ref().unwrap().contains(index)) + .collect(); + batch.project(&columns) + } 
+ + pub fn schema_without_keys(&self) -> Result { + if self.key_indices.is_none() { + return Ok(self.clone()); + } + let key_indices = self.key_indices.as_ref().unwrap(); + let unkeyed_schema = Schema::new( + self.schema + .fields() + .iter() + .enumerate() + .filter(|(index, _field)| !key_indices.contains(index)) + .map(|(_, field)| field.as_ref().clone()) + .collect::>(), + ); + let timestamp_index = unkeyed_schema.index_of(TIMESTAMP_FIELD)?; + Ok(Self { + schema: Arc::new(unkeyed_schema), + timestamp_index, + key_indices: None, + routing_key_indices: None, + }) + } + + pub fn with_fields(&self, fields: Vec) -> Result { + let schema = Arc::new(Schema::new_with_metadata( + fields, + self.schema.metadata.clone(), + )); + + let timestamp_index = schema.index_of(TIMESTAMP_FIELD)?; + let max_index = *[&self.key_indices, &self.routing_key_indices] + .iter() + .map(|indices| indices.as_ref().and_then(|k| k.iter().max())) + .max() + .flatten() + .unwrap_or(&0); + + if schema.fields.len() - 1 < max_index { + return Err(ArrowError::InvalidArgumentError(format!( + "expected at least {} fields, but were only {}", + max_index + 1, + schema.fields.len() + ))); + } + + Ok(Self { + schema, + timestamp_index, + key_indices: self.key_indices.clone(), + routing_key_indices: self.routing_key_indices.clone(), + }) + } + + pub fn with_additional_fields( + &self, + new_fields: impl Iterator, + ) -> Result { + let mut fields = self.schema.fields.to_vec(); + fields.extend(new_fields.map(Arc::new)); + + self.with_fields(fields) + } +} + +pub fn server_for_hash_array( + hash: &PrimitiveArray, + n: usize, +) -> Result, ArrowError> { + let range_size = u64::MAX / (n as u64) + 1; + let range_scalar = UInt64Array::new_scalar(range_size); + let division = div(hash, &range_scalar)?; + let result: &PrimitiveArray = division.as_any().downcast_ref().unwrap(); + Ok(result.clone()) +} diff --git a/src/common/mod.rs b/src/common/mod.rs new file mode 100644 index 00000000..e0eb8d7a --- /dev/null +++ 
b/src/common/mod.rs @@ -0,0 +1,72 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Shared core types and constants for FunctionStream (`crate::common`). +//! +//! Used by the runtime, SQL planner, coordinator, and other subsystems — +//! analogous to `arroyo-types` + `arroyo-rpc` in Arroyo. + +pub mod arrow_ext; +pub mod control; +pub mod date; +pub mod debezium; +pub mod fs_schema; +pub mod errors; +pub mod formats; +pub mod hash; +pub mod message; +pub mod operator_config; +pub mod task_info; +pub mod time_utils; +pub mod worker; +mod converter; + +// ── Re-exports from existing modules ── +pub use arrow_ext::{DisplayAsSql, FsExtensionType, GetArrowSchema, GetArrowType}; +pub use date::{DatePart, DateTruncPrecision}; +pub use debezium::{Debezium, DebeziumOp, UpdatingData}; +pub use hash::{range_for_server, server_for_hash, HASH_SEEDS}; +pub use message::{ArrowMessage, CheckpointBarrier, SignalMessage, Watermark}; +pub use task_info::{ChainInfo, TaskInfo}; +pub use time_utils::{from_micros, from_millis, from_nanos, to_micros, to_millis, to_nanos}; +pub use worker::{MachineId, WorkerId}; + +// ── Re-exports from new modules ── +pub use control::{ + CheckpointCompleted, CheckpointEvent, CompactionResult, ControlMessage, ControlResp, + ErrorDomain, RetryHint, StopMode, TaskCheckpointEventType, TaskError, +}; +pub use fs_schema::{FsSchema, FsSchemaRef}; +pub use errors::DataflowError; +pub use formats::{BadData, Format, Framing, JsonFormat}; +pub use 
operator_config::MetadataField; + +// ── Well-known column names ── +pub const TIMESTAMP_FIELD: &str = "_timestamp"; +pub const UPDATING_META_FIELD: &str = "_updating_meta"; + +// ── Environment variables ── +pub const JOB_ID_ENV: &str = "JOB_ID"; +pub const RUN_ID_ENV: &str = "RUN_ID"; + +// ── Metric names ── +pub const MESSAGES_RECV: &str = "fs_worker_messages_recv"; +pub const MESSAGES_SENT: &str = "fs_worker_messages_sent"; +pub const BYTES_RECV: &str = "fs_worker_bytes_recv"; +pub const BYTES_SENT: &str = "fs_worker_bytes_sent"; +pub const BATCHES_RECV: &str = "fs_worker_batches_recv"; +pub const BATCHES_SENT: &str = "fs_worker_batches_sent"; +pub const TX_QUEUE_SIZE: &str = "fs_worker_tx_queue_size"; +pub const TX_QUEUE_REM: &str = "fs_worker_tx_queue_rem"; +pub const DESERIALIZATION_ERRORS: &str = "fs_worker_deserialization_errors"; + +pub const LOOKUP_KEY_INDEX_FIELD: &str = "__lookup_key_index"; diff --git a/src/config/global_config.rs b/src/config/global_config.rs index b4f92edd..c76bf4b0 100644 --- a/src/config/global_config.rs +++ b/src/config/global_config.rs @@ -19,6 +19,13 @@ use crate::config::python_config::PythonConfig; use crate::config::service_config::ServiceConfig; use crate::config::wasm_config::WasmConfig; +#[derive(Debug, Clone, Serialize, Deserialize, Default)] +pub struct StreamingConfig { + /// Maximum heap memory (in bytes) available to the streaming runtime's memory pool. + /// Defaults to 256 MiB when absent. 
+ pub max_memory_bytes: Option, +} + #[derive(Debug, Clone, Serialize, Deserialize, Default)] pub struct GlobalConfig { pub service: ServiceConfig, @@ -31,6 +38,10 @@ pub struct GlobalConfig { pub state_storage: crate::config::storage::StateStorageConfig, #[serde(default)] pub task_storage: crate::config::storage::TaskStorageConfig, + #[serde(default)] + pub streaming: StreamingConfig, + #[serde(default)] + pub stream_catalog: crate::config::storage::StreamCatalogConfig, } impl GlobalConfig { diff --git a/src/config/storage.rs b/src/config/storage.rs index e5186648..28396d7d 100644 --- a/src/config/storage.rs +++ b/src/config/storage.rs @@ -118,3 +118,27 @@ impl Default for TaskStorageConfig { } } } + +/// Stream table catalog (`CREATE TABLE` / `SHOW TABLES`) storage. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct StreamCatalogConfig { + /// When `false`, the catalog is in-memory only and is **lost on process restart**. + #[serde(default = "default_stream_catalog_persist")] + pub persist: bool, + /// RocksDB directory for persisted catalog. Default: `{data_dir}/stream_catalog`. 
+ #[serde(default)] + pub db_path: Option, +} + +fn default_stream_catalog_persist() -> bool { + true +} + +impl Default for StreamCatalogConfig { + fn default() -> Self { + Self { + persist: default_stream_catalog_persist(), + db_path: None, + } + } +} diff --git a/src/coordinator/analyze/analyzer.rs b/src/coordinator/analyze/analyzer.rs index 30552191..878a9481 100644 --- a/src/coordinator/analyze/analyzer.rs +++ b/src/coordinator/analyze/analyzer.rs @@ -13,8 +13,11 @@ use super::Analysis; use crate::coordinator::execution_context::ExecutionContext; use crate::coordinator::statement::{ - CreateFunction, CreatePythonFunction, DropFunction, ShowFunctions, StartFunction, Statement, - StatementVisitor, StatementVisitorContext, StatementVisitorResult, StopFunction, + CreateFunction, CreatePythonFunction, CreateTable, DropFunction, + DropStreamingTableStatement, DropTableStatement, ShowCatalogTables, + ShowCreateStreamingTable, ShowCreateTable, ShowFunctions, ShowStreamingTables, + StartFunction, Statement, StatementVisitor, StatementVisitorContext, + StatementVisitorResult, StopFunction, StreamingTableStatement, }; use std::fmt; @@ -108,6 +111,22 @@ impl StatementVisitor for Analyzer<'_> { StatementVisitorResult::Analyze(Box::new(stmt.clone())) } + fn visit_show_catalog_tables( + &self, + stmt: &ShowCatalogTables, + _context: &StatementVisitorContext, + ) -> StatementVisitorResult { + StatementVisitorResult::Analyze(Box::new(stmt.clone())) + } + + fn visit_show_create_table( + &self, + stmt: &ShowCreateTable, + _context: &StatementVisitorContext, + ) -> StatementVisitorResult { + StatementVisitorResult::Analyze(Box::new(stmt.clone())) + } + fn visit_create_python_function( &self, stmt: &CreatePythonFunction, @@ -115,4 +134,54 @@ impl StatementVisitor for Analyzer<'_> { ) -> StatementVisitorResult { StatementVisitorResult::Analyze(Box::new(stmt.clone())) } + + fn visit_create_table( + &self, + stmt: &CreateTable, + _context: &StatementVisitorContext, + ) -> 
StatementVisitorResult { + StatementVisitorResult::Analyze(Box::new(CreateTable::new(stmt.statement.clone()))) + } + + fn visit_streaming_table_statement( + &self, + stmt: &StreamingTableStatement, + _context: &StatementVisitorContext, + ) -> StatementVisitorResult { + StatementVisitorResult::Analyze(Box::new(StreamingTableStatement::new( + stmt.statement.clone(), + ))) + } + + fn visit_drop_table_statement( + &self, + stmt: &DropTableStatement, + _context: &StatementVisitorContext, + ) -> StatementVisitorResult { + StatementVisitorResult::Analyze(Box::new(DropTableStatement::new(stmt.statement.clone()))) + } + + fn visit_show_streaming_tables( + &self, + stmt: &ShowStreamingTables, + _context: &StatementVisitorContext, + ) -> StatementVisitorResult { + StatementVisitorResult::Analyze(Box::new(stmt.clone())) + } + + fn visit_show_create_streaming_table( + &self, + stmt: &ShowCreateStreamingTable, + _context: &StatementVisitorContext, + ) -> StatementVisitorResult { + StatementVisitorResult::Analyze(Box::new(stmt.clone())) + } + + fn visit_drop_streaming_table( + &self, + stmt: &DropStreamingTableStatement, + _context: &StatementVisitorContext, + ) -> StatementVisitorResult { + StatementVisitorResult::Analyze(Box::new(stmt.clone())) + } } diff --git a/src/coordinator/coordinator.rs b/src/coordinator/coordinator.rs index 4ad766d5..b86b1070 100644 --- a/src/coordinator/coordinator.rs +++ b/src/coordinator/coordinator.rs @@ -10,128 +10,139 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+use std::sync::Arc; use std::time::Instant; use anyhow::{Context, Result}; -use crate::coordinator::analyze::{Analysis, Analyzer}; +use crate::coordinator::analyze::Analyzer; use crate::coordinator::dataset::ExecuteResult; use crate::coordinator::execution::Executor; use crate::coordinator::plan::{LogicalPlanVisitor, LogicalPlanner, PlanNode}; use crate::coordinator::statement::Statement; -use crate::runtime::taskexecutor::TaskManager; +use crate::sql::schema::StreamSchemaProvider; use super::execution_context::ExecutionContext; +use super::runtime_context::CoordinatorRuntimeContext; +#[derive(Default)] pub struct Coordinator {} -impl Default for Coordinator { - fn default() -> Self { - Self::new() - } -} - impl Coordinator { pub fn new() -> Self { Self {} } - pub fn execute(&self, stmt: &dyn Statement) -> ExecuteResult { - let start_time = Instant::now(); - let context = ExecutionContext::new(); - let execution_id = context.execution_id; + // ======================================================================== + // Plan compilation + // ======================================================================== - match self.execute_pipeline(&context, stmt) { - Ok(result) => { - log::debug!( - "[{}] Execution completed in {}ms", - execution_id, - start_time.elapsed().as_millis() - ); - result - } - Err(e) => { - log::error!( - "[{}] Execution failed after {}ms. Error: {:#}", - execution_id, - start_time.elapsed().as_millis(), - e - ); - ExecuteResult::err(format!("Execution failed: {:#}", e)) - } - } + pub fn compile_plan( + &self, + stmt: &dyn Statement, + schema_provider: StreamSchemaProvider, + ) -> Result> { + self.compile_plan_internal(&ExecutionContext::new(), stmt, schema_provider) } - fn execute_pipeline( + /// Internal pipeline: Analyze → build logical plan → optimize. 
+ fn compile_plan_internal( &self, context: &ExecutionContext, stmt: &dyn Statement, - ) -> Result { - let analysis = self.step_analyze(context, stmt)?; - let plan = self.step_build_logical_plan(&analysis)?; - let optimized_plan = self.step_optimize(&analysis, plan)?; - self.step_execute(optimized_plan) - } - - fn step_analyze(&self, context: &ExecutionContext, stmt: &dyn Statement) -> Result { + schema_provider: StreamSchemaProvider, + ) -> Result> { + let exec_id = context.execution_id; let start = Instant::now(); - let analyzer = Analyzer::new(context); - let result = analyzer + + let analysis = Analyzer::new(context) .analyze(stmt) .map_err(|e| anyhow::anyhow!(e)) - .context("Analyzer phase failed"); - + .context("Analyzer phase failed")?; log::debug!( "[{}] Analyze phase finished in {}ms", - context.execution_id, + exec_id, start.elapsed().as_millis() ); - result - } - - fn step_build_logical_plan(&self, analysis: &Analysis) -> Result> { - let visitor = LogicalPlanVisitor::new(); - let plan = visitor.visit(analysis); - Ok(plan) - } - fn step_optimize( - &self, - analysis: &Analysis, - plan: Box, - ) -> Result> { - let start = Instant::now(); - let planner = LogicalPlanner::new(); - let optimized = planner.optimize(plan, analysis); + let plan = LogicalPlanVisitor::new(schema_provider).visit(&analysis); + let opt_start = Instant::now(); + let optimized = LogicalPlanner::new().optimize(plan, &analysis); log::debug!( - "Optimizer phase finished in {}ms", - start.elapsed().as_millis() + "[{}] Optimizer phase finished in {}ms", + exec_id, + opt_start.elapsed().as_millis() ); + Ok(optimized) } - fn step_execute(&self, plan: Box) -> Result { + // ======================================================================== + // Execution + // ======================================================================== + + pub fn execute(&self, stmt: &dyn Statement) -> ExecuteResult { + match CoordinatorRuntimeContext::try_from_globals() { + Ok(ctx) => 
self.execute_with_runtime_context(stmt, &ctx), + Err(e) => ExecuteResult::err(e.to_string()), + } + } + + pub async fn execute_with_stream_catalog(&self, stmt: &dyn Statement) -> ExecuteResult { + self.execute(stmt) + } + + /// Same as [`Self::execute`], but uses an explicit [`CoordinatorRuntimeContext`] (e.g. tests or custom wiring). + pub fn execute_with_runtime_context( + &self, + stmt: &dyn Statement, + runtime: &CoordinatorRuntimeContext, + ) -> ExecuteResult { let start = Instant::now(); - let task_manager = match TaskManager::get() { - Ok(tm) => tm, + let context = ExecutionContext::new(); + let exec_id = context.execution_id; + let schema_provider = runtime.planning_schema_provider(); + + let result = (|| -> Result { + let plan = self.compile_plan_internal(&context, stmt, schema_provider)?; + + let exec_start = Instant::now(); + let res = Executor::new( + Arc::clone(&runtime.task_manager), + runtime.catalog_manager.clone(), + Arc::clone(&runtime.job_manager), + ) + .execute(plan.as_ref()) + .map_err(|e| anyhow::anyhow!(e)) + .context("Executor phase failed")?; + + log::debug!( + "[{}] Executor phase finished in {}ms", + exec_id, + exec_start.elapsed().as_millis() + ); + Ok(res) + })(); + + match result { + Ok(res) => { + log::debug!( + "[{}] Execution completed in {}ms", + exec_id, + start.elapsed().as_millis() + ); + res + } Err(e) => { - return Ok(ExecuteResult::err(format!( - "Failed to get TaskManager: {}", + log::error!( + "[{}] Execution failed after {}ms. 
Error: {:#}", + exec_id, + start.elapsed().as_millis(), e - ))); + ); + ExecuteResult::err(format!("Execution failed: {:#}", e)) } - }; - let executor = Executor::new(task_manager.clone()); - let result = executor - .execute(plan.as_ref()) - .map_err(|e| anyhow::anyhow!(e)) - .context("Executor phase failed"); - - log::debug!( - "Executor phase finished in {}ms", - start.elapsed().as_millis() - ); - result + } } -} +} \ No newline at end of file diff --git a/src/coordinator/dataset/mod.rs b/src/coordinator/dataset/mod.rs index b72613da..bbcac6f0 100644 --- a/src/coordinator/dataset/mod.rs +++ b/src/coordinator/dataset/mod.rs @@ -12,8 +12,16 @@ mod data_set; mod execute_result; +mod show_catalog_tables_result; +mod show_create_streaming_table_result; +mod show_create_table_result; mod show_functions_result; +mod show_streaming_tables_result; pub use data_set::{DataSet, empty_record_batch}; pub use execute_result::ExecuteResult; +pub use show_catalog_tables_result::ShowCatalogTablesResult; +pub use show_create_streaming_table_result::ShowCreateStreamingTableResult; +pub use show_create_table_result::ShowCreateTableResult; pub use show_functions_result::ShowFunctionsResult; +pub use show_streaming_tables_result::ShowStreamingTablesResult; diff --git a/src/coordinator/dataset/show_catalog_tables_result.rs b/src/coordinator/dataset/show_catalog_tables_result.rs new file mode 100644 index 00000000..74a8cd2d --- /dev/null +++ b/src/coordinator/dataset/show_catalog_tables_result.rs @@ -0,0 +1,100 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::Arc; + +use arrow_array::{Int32Array, StringArray}; +use arrow_schema::{DataType, Field, Schema}; +use datafusion::arrow::datatypes::Schema as DfSchema; + +use super::DataSet; +use crate::sql::schema::table::Table as CatalogTable; +use crate::sql::schema::{catalog_table_row_detail, schema_columns_one_line}; + +#[derive(Clone, Debug)] +pub struct ShowCatalogTablesResult { + names: Vec, + kinds: Vec, + column_counts: Vec, + schema_lines: Vec, + details: Vec, +} + +impl ShowCatalogTablesResult { + pub fn from_tables(tables: &[Arc]) -> Self { + let mut names = Vec::with_capacity(tables.len()); + let mut kinds = Vec::with_capacity(tables.len()); + let mut column_counts = Vec::with_capacity(tables.len()); + let mut schema_lines = Vec::with_capacity(tables.len()); + let mut details = Vec::with_capacity(tables.len()); + + for t in tables { + let schema = match t.as_ref() { + CatalogTable::ConnectorTable(source) | CatalogTable::LookupTable(source) => { + source.produce_physical_schema() + } + CatalogTable::TableFromQuery { .. } => DfSchema::new(t.get_fields()), + }; + let ncols = schema.fields().len() as i32; + names.push(t.name().to_string()); + kinds.push(match t.as_ref() { + CatalogTable::ConnectorTable(_) => "SOURCE", + CatalogTable::LookupTable(_) => "LOOKUP", + CatalogTable::TableFromQuery { .. 
} => "QUERY", + } + .to_string()); + column_counts.push(ncols); + schema_lines.push(schema_columns_one_line(&schema)); + details.push(catalog_table_row_detail(t.as_ref())); + } + + Self { + names, + kinds, + column_counts, + schema_lines, + details, + } + } +} + +impl DataSet for ShowCatalogTablesResult { + fn to_record_batch(&self) -> arrow_array::RecordBatch { + let schema = Arc::new(Schema::new(vec![ + Field::new("table_name", DataType::Utf8, false), + Field::new("kind", DataType::Utf8, false), + Field::new("column_count", DataType::Int32, false), + Field::new("schema_columns", DataType::Utf8, false), + Field::new("details", DataType::Utf8, false), + ])); + + arrow_array::RecordBatch::try_new( + schema, + vec![ + Arc::new(StringArray::from( + self.names.iter().map(|s| s.as_str()).collect::>(), + )), + Arc::new(StringArray::from( + self.kinds.iter().map(|s| s.as_str()).collect::>(), + )), + Arc::new(Int32Array::from(self.column_counts.clone())), + Arc::new(StringArray::from( + self.schema_lines.iter().map(|s| s.as_str()).collect::>(), + )), + Arc::new(StringArray::from( + self.details.iter().map(|s| s.as_str()).collect::>(), + )), + ], + ) + .unwrap_or_else(|_| arrow_array::RecordBatch::new_empty(Arc::new(Schema::empty()))) + } +} diff --git a/src/coordinator/dataset/show_create_streaming_table_result.rs b/src/coordinator/dataset/show_create_streaming_table_result.rs new file mode 100644 index 00000000..ed3ec600 --- /dev/null +++ b/src/coordinator/dataset/show_create_streaming_table_result.rs @@ -0,0 +1,69 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::Arc; + +use arrow_array::StringArray; +use arrow_schema::{DataType, Field, Schema}; +use protocol::grpc::api::FsProgram; + +use crate::sql::common::render_program_topology; + +use super::DataSet; + +#[derive(Clone, Debug)] +pub struct ShowCreateStreamingTableResult { + table_name: String, + status: String, + pipeline_detail: String, + program: FsProgram, +} + +impl ShowCreateStreamingTableResult { + pub fn new( + table_name: String, + status: String, + pipeline_detail: String, + program: FsProgram, + ) -> Self { + Self { + table_name, + status, + pipeline_detail, + program, + } + } +} + +impl DataSet for ShowCreateStreamingTableResult { + fn to_record_batch(&self) -> arrow_array::RecordBatch { + let topology = render_program_topology(&self.program); + + let schema = Arc::new(Schema::new(vec![ + Field::new("Streaming Table", DataType::Utf8, false), + Field::new("Status", DataType::Utf8, false), + Field::new("Pipelines", DataType::Utf8, false), + Field::new("Topology", DataType::Utf8, false), + ])); + + arrow_array::RecordBatch::try_new( + schema, + vec![ + Arc::new(StringArray::from(vec![self.table_name.as_str()])), + Arc::new(StringArray::from(vec![self.status.as_str()])), + Arc::new(StringArray::from(vec![self.pipeline_detail.as_str()])), + Arc::new(StringArray::from(vec![topology.as_str()])), + ], + ) + .unwrap_or_else(|_| arrow_array::RecordBatch::new_empty(Arc::new(Schema::empty()))) + } +} diff --git a/src/coordinator/dataset/show_create_table_result.rs b/src/coordinator/dataset/show_create_table_result.rs new file mode 100644 index 00000000..47f49d59 --- /dev/null +++ b/src/coordinator/dataset/show_create_table_result.rs @@ -0,0 +1,51 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::Arc; + +use arrow_array::StringArray; +use arrow_schema::{DataType, Field, Schema}; + +use super::DataSet; + +#[derive(Clone, Debug)] +pub struct ShowCreateTableResult { + table_name: String, + create_sql: String, +} + +impl ShowCreateTableResult { + pub fn new(table_name: String, create_sql: String) -> Self { + Self { + table_name, + create_sql, + } + } +} + +impl DataSet for ShowCreateTableResult { + fn to_record_batch(&self) -> arrow_array::RecordBatch { + let schema = Arc::new(Schema::new(vec![ + Field::new("Table", DataType::Utf8, false), + Field::new("Create Table", DataType::Utf8, false), + ])); + + arrow_array::RecordBatch::try_new( + schema, + vec![ + Arc::new(StringArray::from(vec![self.table_name.as_str()])), + Arc::new(StringArray::from(vec![self.create_sql.as_str()])), + ], + ) + .unwrap_or_else(|_| arrow_array::RecordBatch::new_empty(Arc::new(Schema::empty()))) + } +} diff --git a/src/coordinator/dataset/show_streaming_tables_result.rs b/src/coordinator/dataset/show_streaming_tables_result.rs new file mode 100644 index 00000000..a992d1b9 --- /dev/null +++ b/src/coordinator/dataset/show_streaming_tables_result.rs @@ -0,0 +1,75 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::Arc; + +use arrow_array::{Int32Array, StringArray}; +use arrow_schema::{DataType, Field, Schema}; + +use super::DataSet; +use crate::runtime::streaming::job::StreamingJobSummary; + +#[derive(Clone, Debug)] +pub struct ShowStreamingTablesResult { + jobs: Vec, +} + +impl ShowStreamingTablesResult { + pub fn new(jobs: Vec) -> Self { + Self { jobs } + } +} + +impl DataSet for ShowStreamingTablesResult { + fn to_record_batch(&self) -> arrow_array::RecordBatch { + let schema = Arc::new(Schema::new(vec![ + Field::new("job_id", DataType::Utf8, false), + Field::new("status", DataType::Utf8, false), + Field::new("pipeline_count", DataType::Int32, false), + Field::new("uptime", DataType::Utf8, false), + ])); + + let job_ids: Vec<&str> = self.jobs.iter().map(|j| j.job_id.as_str()).collect(); + let statuses: Vec<&str> = self.jobs.iter().map(|j| j.status.as_str()).collect(); + let pipeline_counts: Vec = self.jobs.iter().map(|j| j.pipeline_count).collect(); + let uptimes: Vec = self.jobs.iter().map(|j| format_duration(j.uptime_secs)).collect(); + let uptime_refs: Vec<&str> = uptimes.iter().map(|s| s.as_str()).collect(); + + arrow_array::RecordBatch::try_new( + schema, + vec![ + Arc::new(StringArray::from(job_ids)), + Arc::new(StringArray::from(statuses)), + Arc::new(Int32Array::from(pipeline_counts)), + Arc::new(StringArray::from(uptime_refs)), + ], + ) + .unwrap_or_else(|_| arrow_array::RecordBatch::new_empty(Arc::new(Schema::empty()))) + } +} + +fn format_duration(total_secs: u64) -> String { + let days = total_secs / 86400; + let hours = 
(total_secs % 86400) / 3600; + let mins = (total_secs % 3600) / 60; + let secs = total_secs % 60; + + if days > 0 { + format!("{days}d {hours}h {mins}m {secs}s") + } else if hours > 0 { + format!("{hours}h {mins}m {secs}s") + } else if mins > 0 { + format!("{mins}m {secs}s") + } else { + format!("{secs}s") + } +} diff --git a/src/coordinator/execution/executor.rs b/src/coordinator/execution/executor.rs index 7e44217e..c24a4cda 100644 --- a/src/coordinator/execution/executor.rs +++ b/src/coordinator/execution/executor.rs @@ -10,16 +10,32 @@ // See the License for the specific language governing permissions and // limitations under the License. -use crate::coordinator::dataset::{ExecuteResult, ShowFunctionsResult, empty_record_batch}; +use std::sync::Arc; + +use protocol::grpc::api::FsProgram; +use thiserror::Error; +use tracing::{debug, info, warn}; + +use crate::coordinator::dataset::{ + empty_record_batch, ExecuteResult, ShowCatalogTablesResult, + ShowCreateStreamingTableResult, ShowCreateTableResult, ShowFunctionsResult, + ShowStreamingTablesResult, +}; use crate::coordinator::plan::{ - CreateFunctionPlan, CreatePythonFunctionPlan, DropFunctionPlan, PlanNode, PlanVisitor, - PlanVisitorContext, PlanVisitorResult, ShowFunctionsPlan, StartFunctionPlan, StopFunctionPlan, + CreateFunctionPlan, CreatePythonFunctionPlan, CreateTablePlan, CreateTablePlanBody, + DropFunctionPlan, DropStreamingTablePlan, DropTablePlan, LookupTablePlan, PlanNode, + PlanVisitor, PlanVisitorContext, PlanVisitorResult, ShowCatalogTablesPlan, + ShowCreateStreamingTablePlan, ShowCreateTablePlan, ShowFunctionsPlan, + ShowStreamingTablesPlan, StartFunctionPlan, StopFunctionPlan, StreamingTable, + StreamingTableConnectorPlan, }; use crate::coordinator::statement::{ConfigSource, FunctionSource}; +use crate::runtime::streaming::job::JobManager; +use crate::runtime::streaming::protocol::control::StopMode; use crate::runtime::taskexecutor::TaskManager; -use std::sync::Arc; -use thiserror::Error; -use 
tracing::{debug, info}; +use crate::sql::schema::table::Table as CatalogTable; +use crate::sql::schema::show_create_catalog_table; +use crate::storage::stream_catalog::CatalogManager; #[derive(Error, Debug)] pub enum ExecuteError { @@ -35,11 +51,21 @@ pub enum ExecuteError { pub struct Executor { task_manager: Arc, + catalog_manager: Arc, + job_manager: Arc, } impl Executor { - pub fn new(task_manager: Arc) -> Self { - Self { task_manager } + pub fn new( + task_manager: Arc, + catalog_manager: Arc, + job_manager: Arc, + ) -> Self { + Self { + task_manager, + catalog_manager, + job_manager, + } } pub fn execute(&self, plan: &dyn PlanNode) -> Result { @@ -50,32 +76,35 @@ impl Executor { match visitor_result { PlanVisitorResult::Execute(result) => { - let elapsed = timer.elapsed(); - debug!(target: "executor", elapsed_ms = elapsed.as_millis(), "Execution completed"); + debug!( + target: "executor", + elapsed_ms = timer.elapsed().as_millis(), + "Execution completed" + ); result } } } } + impl PlanVisitor for Executor { - #[allow(clippy::redundant_closure_call)] fn visit_create_function( &self, plan: &CreateFunctionPlan, _context: &PlanVisitorContext, ) -> PlanVisitorResult { - let result = (|| -> Result { + let execute = || -> Result { let function_bytes = match &plan.function_source { FunctionSource::Path(path) => std::fs::read(path).map_err(|e| { - ExecuteError::Validation(format!("Failed to read function at {}: {}", path, e)) + ExecuteError::Validation(format!("Failed to read function at {path}: {e}")) })?, FunctionSource::Bytes(bytes) => bytes.clone(), }; let config_bytes = match &plan.config_source { Some(ConfigSource::Path(path)) => std::fs::read(path).map_err(|e| { - ExecuteError::Validation(format!("Failed to read config at {}: {}", path, e)) + ExecuteError::Validation(format!("Failed to read config at {path}: {e}")) })?, Some(ConfigSource::Bytes(bytes)) => bytes.clone(), None => { @@ -88,35 +117,34 @@ impl PlanVisitor for Executor { info!(config_size = 
config_bytes.len(), "Registering Wasm task"); self.task_manager .register_task(&config_bytes, &function_bytes) - .map_err(|e| ExecuteError::Task(format!("Registration failed: {:?}", e)))?; + .map_err(|e| ExecuteError::Task(format!("Registration failed: {e:?}")))?; Ok(ExecuteResult::ok_with_data( "Function registered successfully", empty_record_batch(), )) - })(); + }; - PlanVisitorResult::Execute(result) + PlanVisitorResult::Execute(execute()) } - #[allow(clippy::redundant_closure_call)] fn visit_drop_function( &self, plan: &DropFunctionPlan, _context: &PlanVisitorContext, ) -> PlanVisitorResult { - let result = (|| -> Result { + let execute = || -> Result { self.task_manager .remove_task(&plan.name) - .map_err(|e| ExecuteError::Task(format!("Removal failed: {}", e)))?; + .map_err(|e| ExecuteError::Task(format!("Removal failed: {e}")))?; Ok(ExecuteResult::ok_with_data( format!("Function '{}' dropped", plan.name), empty_record_batch(), )) - })(); + }; - PlanVisitorResult::Execute(result) + PlanVisitorResult::Execute(execute()) } fn visit_start_function( @@ -138,48 +166,85 @@ impl PlanVisitor for Executor { PlanVisitorResult::Execute(result) } - #[allow(clippy::redundant_closure_call)] fn visit_show_functions( &self, _plan: &ShowFunctionsPlan, _context: &PlanVisitorContext, ) -> PlanVisitorResult { - let result = { - let functions = self.task_manager.list_all_functions(); + let functions = self.task_manager.list_all_functions(); + let result = ExecuteResult::ok_with_data( + format!("Found {} task(s)", functions.len()), + ShowFunctionsResult::new(functions), + ); + + PlanVisitorResult::Execute(Ok(result)) + } + fn visit_show_catalog_tables( + &self, + _plan: &ShowCatalogTablesPlan, + _context: &PlanVisitorContext, + ) -> PlanVisitorResult { + let tables = match self.catalog_manager.list_catalog_tables() { + Ok(tables) => tables, + Err(e) => return PlanVisitorResult::Execute(Err(ExecuteError::Internal(e.to_string()))), + }; + let n = tables.len(); + let result = 
ExecuteResult::ok_with_data( + format!("{n} stream catalog table(s)"), + ShowCatalogTablesResult::from_tables(&tables), + ); + PlanVisitorResult::Execute(Ok(result)) + } + + fn visit_show_create_table( + &self, + plan: &ShowCreateTablePlan, + _context: &PlanVisitorContext, + ) -> PlanVisitorResult { + let execute = || -> Result { + let t = self + .catalog_manager + .get_catalog_table(&plan.table_name) + .map_err(|e| ExecuteError::Internal(e.to_string()))? + .ok_or_else(|| { + ExecuteError::Validation(format!( + "Table '{}' not found in stream catalog", + plan.table_name + )) + })?; + let ddl = show_create_catalog_table(t.as_ref()); Ok(ExecuteResult::ok_with_data( - format!("Found {} task(s)", functions.len()), - ShowFunctionsResult::new(functions), + format!("SHOW CREATE TABLE {}", plan.table_name), + ShowCreateTableResult::new(plan.table_name.clone(), ddl), )) }; - - PlanVisitorResult::Execute(result) + PlanVisitorResult::Execute(execute()) } - #[allow(clippy::redundant_closure_call)] fn visit_create_python_function( &self, plan: &CreatePythonFunctionPlan, _context: &PlanVisitorContext, ) -> PlanVisitorResult { - let result = (|| -> Result { - let modules: Vec<(String, Vec)> = plan + let execute = || -> Result { + let modules = plan .modules .iter() .map(|m| (m.name.clone(), m.bytes.clone())) - .collect(); + .collect::>(); self.task_manager .register_python_task(plan.config_content.as_bytes(), &modules) - .map_err(|e| ExecuteError::Task(format!("Python registration failed: {}", e)))?; + .map_err(|e| ExecuteError::Task(format!("Python registration failed: {e}")))?; Ok(ExecuteResult::ok_with_data( format!("Python function '{}' deployed", plan.class_name), empty_record_batch(), )) - })(); + }; - PlanVisitorResult::Execute(result) + PlanVisitorResult::Execute(execute()) } fn visit_stop_function( @@ -200,4 +265,252 @@ impl PlanVisitor for Executor { PlanVisitorResult::Execute(result) } + + fn visit_create_table_plan( + &self, + plan: &CreateTablePlan, + _context: 
&PlanVisitorContext, + ) -> PlanVisitorResult { + let execute = || -> Result { + let (table_name, if_not_exists, catalog_table) = match &plan.body { + CreateTablePlanBody::ConnectorSource { + source_table, + if_not_exists, + } => { + let table_name = source_table.name().to_string(); + let table_instance = CatalogTable::ConnectorTable(source_table.clone()); + (table_name, *if_not_exists, table_instance) + } + CreateTablePlanBody::DataFusion(_) => { + return Err(ExecuteError::Internal( + "Operation not supported: Currently, the system strictly supports creating tables backed by an external Connector Source (e.g., Kafka, Postgres). In-memory tables, Views, or CTAS (Create Table As Select) are not supported." + .into(), + )); + } + }; + + if if_not_exists && self.catalog_manager.has_catalog_table(&table_name) { + return Ok(ExecuteResult::ok(format!( + "Table '{table_name}' already exists (skipped)" + ))); + } + + self.catalog_manager + .add_catalog_table(catalog_table) + .map_err(|e| { + ExecuteError::Internal(format!( + "Failed to register connector source table '{}': {}", + table_name, e + )) + })?; + + Ok(ExecuteResult::ok(format!( + "Created connector source table '{table_name}'" + ))) + }; + + PlanVisitorResult::Execute(execute()) + } + + fn visit_streaming_table( + &self, + plan: &StreamingTable, + _context: &PlanVisitorContext, + ) -> PlanVisitorResult { + let execute = || -> Result { + let fs_program: FsProgram = plan.program.clone().into(); + let job_manager: Arc = Arc::clone(&self.job_manager); + + let job_id = plan.name.clone(); + let job_id = tokio::task::block_in_place(|| { + tokio::runtime::Handle::current() + .block_on(job_manager.submit_job(job_id, fs_program.clone())) + }) + .map_err(|e| ExecuteError::Internal(format!("Failed to submit streaming job: {e}")))?; + + self.catalog_manager + .persist_streaming_job( + &plan.name, + &fs_program, + plan.comment.as_deref().unwrap_or(""), + ) + .map_err(|e| { + ExecuteError::Internal(format!( + "Streaming job 
'{}' submitted but persistence failed: {e}", + plan.name + )) + })?; + + info!( + job_id = %job_id, + table = %plan.name, + "Streaming job submitted and persisted" + ); + + Ok(ExecuteResult::ok_with_data( + format!("Streaming table '{}' created, job_id = {}", plan.name, job_id), + empty_record_batch(), + )) + }; + + PlanVisitorResult::Execute(execute()) + } + + fn visit_lookup_table( + &self, + _plan: &LookupTablePlan, + _context: &PlanVisitorContext, + ) -> PlanVisitorResult { + PlanVisitorResult::Execute(Err(ExecuteError::Internal( + "LookupTable execution not yet implemented".to_string(), + ))) + } + + fn visit_streaming_connector_table( + &self, + _plan: &StreamingTableConnectorPlan, + _context: &PlanVisitorContext, + ) -> PlanVisitorResult { + PlanVisitorResult::Execute(Err(ExecuteError::Internal( + "StreamingTableConnector execution not yet implemented".to_string(), + ))) + } + + fn visit_drop_table_plan( + &self, + plan: &DropTablePlan, + _context: &PlanVisitorContext, + ) -> PlanVisitorResult { + let execute = || -> Result { + self.catalog_manager + .drop_catalog_table(&plan.table_name, plan.if_exists) + .map_err(|e| ExecuteError::Internal(e.to_string()))?; + + Ok(ExecuteResult::ok(format!( + "Dropped table '{}'", + plan.table_name + ))) + }; + + PlanVisitorResult::Execute(execute()) + } + + fn visit_show_streaming_tables( + &self, + _plan: &ShowStreamingTablesPlan, + _context: &PlanVisitorContext, + ) -> PlanVisitorResult { + let execute = || -> Result { + let jobs = self.job_manager.list_jobs(); + let n = jobs.len(); + Ok(ExecuteResult::ok_with_data( + format!("{n} streaming table(s)"), + ShowStreamingTablesResult::new(jobs), + )) + }; + PlanVisitorResult::Execute(execute()) + } + + fn visit_show_create_streaming_table( + &self, + plan: &ShowCreateStreamingTablePlan, + _context: &PlanVisitorContext, + ) -> PlanVisitorResult { + let execute = || -> Result { + let detail = self + .job_manager + .get_job_detail(&plan.table_name) + .ok_or_else(|| { + 
ExecuteError::Validation(format!( + "Streaming table '{}' not found in active jobs", + plan.table_name + )) + })?; + + let pipeline_lines: Vec = detail + .pipelines + .iter() + .map(|p| format!(" pipeline[{}]: {}", p.pipeline_id, p.status)) + .collect(); + let pipeline_detail = if pipeline_lines.is_empty() { + "(no pipelines)".to_string() + } else { + pipeline_lines.join("\n") + }; + + Ok(ExecuteResult::ok_with_data( + format!("SHOW CREATE STREAMING TABLE {}", plan.table_name), + ShowCreateStreamingTableResult::new( + plan.table_name.clone(), + detail.status, + pipeline_detail, + detail.program, + ), + )) + }; + PlanVisitorResult::Execute(execute()) + } + + fn visit_drop_streaming_table( + &self, + plan: &DropStreamingTablePlan, + _context: &PlanVisitorContext, + ) -> PlanVisitorResult { + let execute = || -> Result { + let job_exists = self.job_manager.has_job(&plan.table_name); + + if !job_exists && !plan.if_exists { + return Err(ExecuteError::Validation(format!( + "Streaming table '{}' not found in active jobs", + plan.table_name + ))); + } + + if job_exists { + let job_manager = Arc::clone(&self.job_manager); + let table_name = plan.table_name.clone(); + tokio::task::block_in_place(|| { + tokio::runtime::Handle::current() + .block_on(job_manager.remove_job(&table_name, StopMode::Graceful)) + }) + .map_err(|e| { + ExecuteError::Internal(format!( + "Failed to stop streaming job '{}': {}", + plan.table_name, e + )) + })?; + + info!( + table = %plan.table_name, + "Streaming job stopped and removed" + ); + } + + if let Err(e) = self.catalog_manager.remove_streaming_job(&plan.table_name) { + warn!( + table = %plan.table_name, + error = %e, + "Failed to remove streaming job persisted definition (non-fatal)" + ); + } + + let _ = self + .catalog_manager + .drop_catalog_table(&plan.table_name, true); + + if job_exists { + Ok(ExecuteResult::ok(format!( + "Dropped streaming table '{}'", + plan.table_name + ))) + } else { + Ok(ExecuteResult::ok(format!( + "Streaming table 
'{}' does not exist (skipped)", + plan.table_name + ))) + } + }; + + PlanVisitorResult::Execute(execute()) + } } diff --git a/src/coordinator/mod.rs b/src/coordinator/mod.rs index 0b94d4bf..922b793f 100644 --- a/src/coordinator/mod.rs +++ b/src/coordinator/mod.rs @@ -17,11 +17,15 @@ mod dataset; mod execution; mod execution_context; mod plan; +mod runtime_context; mod statement; +mod tool; pub use coordinator::Coordinator; pub use dataset::{DataSet, ShowFunctionsResult}; pub use statement::{ - CreateFunction, CreatePythonFunction, DropFunction, PythonModule, ShowFunctions, StartFunction, - Statement, StopFunction, + CreateFunction, CreatePythonFunction, CreateTable, DropFunction, + DropStreamingTableStatement, DropTableStatement, PythonModule, ShowCatalogTables, + ShowCreateStreamingTable, ShowCreateTable, ShowFunctions, ShowStreamingTables, + StartFunction, Statement, StopFunction, StreamingTableStatement, }; diff --git a/src/coordinator/plan/create_table_plan.rs b/src/coordinator/plan/create_table_plan.rs new file mode 100644 index 00000000..7ad82bb3 --- /dev/null +++ b/src/coordinator/plan/create_table_plan.rs @@ -0,0 +1,55 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use datafusion::logical_expr::LogicalPlan; + +use crate::sql::schema::SourceTable; + +use super::{PlanNode, PlanVisitor, PlanVisitorContext, PlanVisitorResult}; + +/// Payload for [`CreateTablePlan`]: either a DataFusion DDL plan or a connector `CREATE TABLE` (no `AS SELECT`). 
+#[derive(Debug, Clone)] +pub enum CreateTablePlanBody { + DataFusion(LogicalPlan), + ConnectorSource { + source_table: SourceTable, + if_not_exists: bool, + }, +} + +#[derive(Debug, Clone)] +pub struct CreateTablePlan { + pub body: CreateTablePlanBody, +} + +impl CreateTablePlan { + pub fn new(logical_plan: LogicalPlan) -> Self { + Self { + body: CreateTablePlanBody::DataFusion(logical_plan), + } + } + + pub fn connector_source(source_table: SourceTable, if_not_exists: bool) -> Self { + Self { + body: CreateTablePlanBody::ConnectorSource { + source_table, + if_not_exists, + }, + } + } +} + +impl PlanNode for CreateTablePlan { + fn accept(&self, visitor: &dyn PlanVisitor, context: &PlanVisitorContext) -> PlanVisitorResult { + visitor.visit_create_table_plan(self, context) + } +} diff --git a/src/sql/parser/mod.rs b/src/coordinator/plan/drop_streaming_table_plan.rs similarity index 52% rename from src/sql/parser/mod.rs rename to src/coordinator/plan/drop_streaming_table_plan.rs index 11f4b18e..d06dc836 100644 --- a/src/sql/parser/mod.rs +++ b/src/coordinator/plan/drop_streaming_table_plan.rs @@ -10,33 +10,25 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-mod sql_parser; +use super::{PlanNode, PlanVisitor, PlanVisitorContext, PlanVisitorResult}; -pub use sql_parser::SqlParser; - -#[derive(Debug)] -pub struct ParseError { - pub message: String, +#[derive(Debug, Clone)] +pub struct DropStreamingTablePlan { + pub table_name: String, + pub if_exists: bool, } -impl std::fmt::Display for ParseError { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "Parse error: {}", self.message) - } -} - -impl std::error::Error for ParseError {} - -impl From for ParseError { - fn from(message: String) -> Self { - ParseError { message } +impl DropStreamingTablePlan { + pub fn new(table_name: String, if_exists: bool) -> Self { + Self { + table_name, + if_exists, + } } } -impl ParseError { - pub fn new(message: impl Into) -> Self { - Self { - message: message.into(), - } +impl PlanNode for DropStreamingTablePlan { + fn accept(&self, visitor: &dyn PlanVisitor, context: &PlanVisitorContext) -> PlanVisitorResult { + visitor.visit_drop_streaming_table(self, context) } } diff --git a/src/coordinator/plan/drop_table_plan.rs b/src/coordinator/plan/drop_table_plan.rs new file mode 100644 index 00000000..7d80a7b7 --- /dev/null +++ b/src/coordinator/plan/drop_table_plan.rs @@ -0,0 +1,34 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use super::{PlanNode, PlanVisitor, PlanVisitorContext, PlanVisitorResult}; + +#[derive(Debug, Clone)] +pub struct DropTablePlan { + pub table_name: String, + pub if_exists: bool, +} + +impl DropTablePlan { + pub fn new(table_name: String, if_exists: bool) -> Self { + Self { + table_name, + if_exists, + } + } +} + +impl PlanNode for DropTablePlan { + fn accept(&self, visitor: &dyn PlanVisitor, context: &PlanVisitorContext) -> PlanVisitorResult { + visitor.visit_drop_table_plan(self, context) + } +} diff --git a/src/coordinator/plan/logical_plan_visitor.rs b/src/coordinator/plan/logical_plan_visitor.rs index 536fec37..77fa9eb4 100644 --- a/src/coordinator/plan/logical_plan_visitor.rs +++ b/src/coordinator/plan/logical_plan_visitor.rs @@ -10,34 +10,311 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::sync::Arc; + +use datafusion::common::{plan_datafusion_err, plan_err, Result}; +use datafusion::execution::SessionStateBuilder; +use datafusion::sql::sqlparser::ast::{ + CreateTable as SqlCreateTable, Expr as SqlExpr, ObjectType, SqlOption, Statement as DFStatement, + TableConstraint, +}; +use datafusion_common::TableReference; +use datafusion_execution::config::SessionConfig; +use datafusion_expr::{col, Extension, Expr, LogicalPlan}; +use sqlparser::ast::Statement; +use tracing::debug; + use crate::coordinator::analyze::analysis::Analysis; use crate::coordinator::plan::{ - CreateFunctionPlan, CreatePythonFunctionPlan, DropFunctionPlan, PlanNode, ShowFunctionsPlan, - StartFunctionPlan, StopFunctionPlan, + CreateFunctionPlan, CreatePythonFunctionPlan, CreateTablePlan, DropFunctionPlan, + DropStreamingTablePlan, DropTablePlan, PlanNode, ShowCatalogTablesPlan, + ShowCreateStreamingTablePlan, ShowCreateTablePlan, ShowFunctionsPlan, + ShowStreamingTablesPlan, StartFunctionPlan, StopFunctionPlan, StreamingTable, }; use crate::coordinator::statement::{ - CreateFunction, CreatePythonFunction, DropFunction, 
ShowFunctions, StartFunction, - StatementVisitor, StatementVisitorContext, StatementVisitorResult, StopFunction, + CreateFunction, CreatePythonFunction, CreateTable, DropFunction, + DropStreamingTableStatement, DropTableStatement, ShowCatalogTables, + ShowCreateStreamingTable, ShowCreateTable, ShowFunctions, ShowStreamingTables, + StartFunction, StatementVisitor, StatementVisitorContext, StatementVisitorResult, + StopFunction, StreamingTableStatement, +}; +use crate::coordinator::tool::ConnectorOptions; +use crate::sql::analysis::{ + maybe_add_key_extension_to_sink, rewrite_sinks, StreamSchemaProvider, }; +use crate::sql::common::with_option_keys as opt; +use crate::sql::extensions::sink::StreamEgressNode; +use crate::sql::functions::{is_json_union, serialize_outgoing_json}; +use crate::sql::logical_node::logical::{LogicalProgram, ProgramConfig}; +use crate::sql::logical_planner::optimizers::{produce_optimized_plan, ChainingOptimizer}; +use crate::sql::logical_planner::planner::PlanToGraphVisitor; +use crate::sql::rewrite_plan; +use crate::sql::schema::source_table::SourceTable; +use crate::sql::schema::{ColumnDescriptor, ConnectionType, Table}; -#[derive(Debug, Default)] -pub struct LogicalPlanVisitor; +#[derive(Clone)] +pub struct LogicalPlanVisitor { + schema_provider: StreamSchemaProvider, +} impl LogicalPlanVisitor { - pub fn new() -> Self { - Self + pub fn new(schema_provider: StreamSchemaProvider) -> Self { + Self { schema_provider } } pub fn visit(&self, analysis: &Analysis) -> Box { - let context = StatementVisitorContext::Empty; let stmt = analysis.statement(); + let context = StatementVisitorContext::Empty; - let result = stmt.accept(self, &context); - - match result { + match stmt.accept(self, &context) { StatementVisitorResult::Plan(plan) => plan, - _ => panic!("LogicalPlanVisitor should return Plan"), + _ => panic!("Fatal: LogicalPlanVisitor must yield a PlanNode variant"), + } + } + + pub fn build_streaming_table( + schema_provider: 
&StreamSchemaProvider, + stmt: &StreamingTableStatement, + ) -> Result { + Self::new(schema_provider.clone()).compile_streaming_sink(stmt) + } + + fn compile_streaming_sink( + &self, + stmt: &StreamingTableStatement, + ) -> Result { + let DFStatement::CreateStreamingTable { + name, + with_options, + comment, + query, + } = &stmt.statement + else { + return plan_err!("Statement mismatch: Expected CREATE STREAMING TABLE AST node"); + }; + + let sink_table_name = name.to_string(); + debug!("Initiating streaming sink compilation for identifier: {}", sink_table_name); + + let mut sink_properties = ConnectorOptions::new(with_options, &None)?; + let connector_type = sink_properties.pull_opt_str(opt::CONNECTOR)?.ok_or_else(|| { + plan_datafusion_err!( + "Validation Error: Streaming table '{}' requires the '{}' property", + sink_table_name, + opt::CONNECTOR + ) + })?; + + let partition_keys = Self::extract_partitioning_keys(&mut sink_properties)?; + + let sink_description = comment + .as_deref() + .map(str::trim) + .filter(|s| !s.is_empty()) + .map(str::to_string) + .unwrap_or_else(|| format!("sink `{}` ({connector_type})", sink_table_name)); + + let mut query_logical_plan = rewrite_plan( + produce_optimized_plan(&Statement::Query(query.clone()), &self.schema_provider)?, + &self.schema_provider, + )?; + + if query_logical_plan.schema().fields().iter().any(|f| is_json_union(f.data_type())) { + query_logical_plan = serialize_outgoing_json(&self.schema_provider, Arc::new(query_logical_plan)); + } + + let output_schema_fields = query_logical_plan + .schema() + .fields() + .iter() + .map(|f| ColumnDescriptor::from((**f).clone())) + .collect::>(); + + let mut sink_definition = SourceTable::from_options( + &sink_table_name, + &connector_type, + false, + output_schema_fields, + vec![], + None, + &mut sink_properties, + None, + &self.schema_provider, + Some(ConnectionType::Sink), + sink_description, + )?; + sink_definition.partition_exprs = Arc::new(partition_keys); + + let 
output_schema = query_logical_plan.schema().clone(); + let sink_plan_node = StreamEgressNode::try_new( + TableReference::bare(sink_table_name.clone()), + Table::ConnectorTable(sink_definition.clone()), + output_schema, + query_logical_plan, + )?; + + let mut rewritten_plans = rewrite_sinks(vec![maybe_add_key_extension_to_sink( + LogicalPlan::Extension(Extension { + node: Arc::new(sink_plan_node), + }), + )?])?; + + let final_logical_plan = rewritten_plans.remove(0); + + let validated_program = self.validate_graph_topology(&final_logical_plan)?; + + Ok(StreamingTable { + name: sink_table_name, + comment: comment.clone(), + program: validated_program, + }) + } + + fn validate_graph_topology(&self, logical_plan: &LogicalPlan) -> Result { + let mut session_config = SessionConfig::new(); + let opts = session_config.options_mut(); + opts.optimizer.enable_round_robin_repartition = false; + opts.optimizer.repartition_aggregations = false; + opts.optimizer.repartition_windows = false; + opts.optimizer.repartition_sorts = false; + opts.optimizer.repartition_joins = false; + opts.execution.target_partitions = 1; + + let session_state = SessionStateBuilder::new() + .with_config(session_config) + .with_default_features() + .with_physical_optimizer_rules(vec![]) + .build(); + + let mut graph_compiler = PlanToGraphVisitor::new(&self.schema_provider, &session_state); + graph_compiler.add_plan(logical_plan.clone())?; + + let mut executable_program = + LogicalProgram::new(graph_compiler.into_graph(), ProgramConfig::default()); + executable_program.optimize(&ChainingOptimizer {}); + + Ok(executable_program) + } + + fn extract_partitioning_keys( + options: &mut ConnectorOptions, + ) -> Result>> { + options + .pull_opt_str(opt::PARTITION_BY)? + .map(|raw_cols| raw_cols.split(',').map(|c| col(c.trim())).collect()) + .map(Ok) + .transpose() + } + + fn contains_connector_property(options: &[SqlOption]) -> bool { + options.iter().any(|opt| match opt { + SqlOption::KeyValue { key, .. 
} => key.value.eq_ignore_ascii_case(opt::CONNECTOR), + _ => false, + }) + } + + fn parse_primary_keys(constraints: &[TableConstraint]) -> Result> { + let mut keys = None; + for constraint in constraints { + if let TableConstraint::PrimaryKey { columns, .. } = constraint { + if keys.is_some() { + return plan_err!( + "Constraint Violation: Multiple PRIMARY KEY constraints are forbidden" + ); + } + keys = Some(columns.iter().map(|ident| ident.value.clone()).collect()); + } + } + Ok(keys.unwrap_or_default()) + } + + fn parse_watermark_strategy( + constraints: &[TableConstraint], + ) -> Result)>> { + let mut strategy = None; + for constraint in constraints { + if let TableConstraint::Watermark { + column_name, + watermark_expr, + } = constraint + { + if strategy.is_some() { + return plan_err!( + "Constraint Violation: Only a single WATERMARK FOR clause is permitted" + ); + } + strategy = Some((column_name.value.clone(), watermark_expr.clone())); + } } + Ok(strategy) + } + + fn compile_connector_source_plan( + &self, + stmt: &SqlCreateTable, + ) -> Result { + if stmt.query.is_some() { + return plan_err!("Syntax Error: CREATE TABLE ... AS SELECT combined with WITH ('connector'=...) is invalid. Use CREATE STREAMING TABLE instead."); + } + if stmt.or_replace { + return plan_err!( + "Syntax Error: OR REPLACE is not supported for external connector tables." + ); + } + if stmt.temporary { + return plan_err!( + "Syntax Error: TEMPORARY is not supported for external connector tables." 
+ ); + } + if stmt.external { + return plan_err!("Syntax Error: EXTERNAL keyword is redundant and unsupported for connector configurations."); + } + + let target_name = stmt.name.to_string(); + let table_description = stmt + .comment + .clone() + .map(|c| c.to_string()) + .unwrap_or_default(); + + let schema_compiler = datafusion::sql::planner::SqlToRel::new(&self.schema_provider); + let arrow_schema = schema_compiler.build_schema(stmt.columns.clone())?; + + let schema_descriptors = arrow_schema + .fields() + .iter() + .map(|f| ColumnDescriptor::from((**f).clone())) + .collect::>(); + + let mut connector_options = ConnectorOptions::new(&stmt.with_options, &None)?; + let adapter_type = connector_options.pull_opt_str(opt::CONNECTOR)?.ok_or_else(|| { + plan_datafusion_err!( + "Configuration Error: Missing required property '{}' in WITH clause", + opt::CONNECTOR + ) + })?; + + let pk_constraints = Self::parse_primary_keys(&stmt.constraints)?; + let watermark_strategy = Self::parse_watermark_strategy(&stmt.constraints)?; + + let source_definition = SourceTable::from_options( + &target_name, + &adapter_type, + false, + schema_descriptors, + pk_constraints, + watermark_strategy, + &mut connector_options, + None, + &self.schema_provider, + Some(ConnectionType::Source), + table_description, + )?; + + Ok(CreateTablePlan::connector_source( + source_definition, + stmt.if_not_exists, + )) } } @@ -45,24 +322,19 @@ impl StatementVisitor for LogicalPlanVisitor { fn visit_create_function( &self, stmt: &CreateFunction, - _context: &StatementVisitorContext, + _ctx: &StatementVisitorContext, ) -> StatementVisitorResult { - let function_source = stmt.get_function_source().clone(); - let config_source = stmt.get_config_source().cloned(); - let extra_props = stmt.get_extra_properties().clone(); - - // Name will be read from config file during execution StatementVisitorResult::Plan(Box::new(CreateFunctionPlan::new( - function_source, - config_source, - extra_props, + 
stmt.get_function_source().clone(), + stmt.get_config_source().cloned(), + stmt.get_extra_properties().clone(), ))) } fn visit_drop_function( &self, stmt: &DropFunction, - _context: &StatementVisitorContext, + _ctx: &StatementVisitorContext, ) -> StatementVisitorResult { StatementVisitorResult::Plan(Box::new(DropFunctionPlan::new(stmt.name.clone()))) } @@ -70,7 +342,7 @@ impl StatementVisitor for LogicalPlanVisitor { fn visit_start_function( &self, stmt: &StartFunction, - _context: &StatementVisitorContext, + _ctx: &StatementVisitorContext, ) -> StatementVisitorResult { StatementVisitorResult::Plan(Box::new(StartFunctionPlan::new(stmt.name.clone()))) } @@ -78,7 +350,7 @@ impl StatementVisitor for LogicalPlanVisitor { fn visit_stop_function( &self, stmt: &StopFunction, - _context: &StatementVisitorContext, + _ctx: &StatementVisitorContext, ) -> StatementVisitorResult { StatementVisitorResult::Plan(Box::new(StopFunctionPlan::new(stmt.name.clone()))) } @@ -86,24 +358,137 @@ impl StatementVisitor for LogicalPlanVisitor { fn visit_show_functions( &self, _stmt: &ShowFunctions, - _context: &StatementVisitorContext, + _ctx: &StatementVisitorContext, ) -> StatementVisitorResult { StatementVisitorResult::Plan(Box::new(ShowFunctionsPlan::new())) } + fn visit_show_catalog_tables( + &self, + _stmt: &ShowCatalogTables, + _ctx: &StatementVisitorContext, + ) -> StatementVisitorResult { + StatementVisitorResult::Plan(Box::new(ShowCatalogTablesPlan::new())) + } + + fn visit_show_create_table( + &self, + stmt: &ShowCreateTable, + _ctx: &StatementVisitorContext, + ) -> StatementVisitorResult { + StatementVisitorResult::Plan(Box::new(ShowCreateTablePlan::new( + stmt.table_name.clone(), + ))) + } + fn visit_create_python_function( &self, stmt: &CreatePythonFunction, - _context: &StatementVisitorContext, + _ctx: &StatementVisitorContext, ) -> StatementVisitorResult { - let class_name = stmt.get_class_name().to_string(); - let modules = stmt.get_modules().to_vec(); - let config_content = 
stmt.get_config_content().to_string(); - StatementVisitorResult::Plan(Box::new(CreatePythonFunctionPlan::new( - class_name, - modules, - config_content, + stmt.get_class_name().to_string(), + stmt.get_modules().to_vec(), + stmt.get_config_content().to_string(), ))) } -} + + fn visit_create_table( + &self, + stmt: &CreateTable, + _ctx: &StatementVisitorContext, + ) -> StatementVisitorResult { + if let Statement::CreateTable(ast_node) = &stmt.statement { + if ast_node.query.is_none() + && Self::contains_connector_property(&ast_node.with_options) + { + let execution_plan = self.compile_connector_source_plan(ast_node).unwrap_or_else( + |err| { + panic!("Fatal Compiler Error: Connector source resolution failed - {err:#}"); + }, + ); + return StatementVisitorResult::Plan(Box::new(execution_plan)); + } + } + + let schema_compiler = datafusion::sql::planner::SqlToRel::new(&self.schema_provider); + match schema_compiler.sql_statement_to_plan(stmt.statement.clone()) { + Ok(logical_plan) => { + debug!( + "Successfully compiled logical DDL topology:\n{}", + logical_plan.display_graphviz() + ); + StatementVisitorResult::Plan(Box::new(CreateTablePlan::new(logical_plan))) + } + Err(err) => panic!("Fatal Compiler Error: Logical plan translation failed - {err}"), + } + } + + fn visit_streaming_table_statement( + &self, + stmt: &StreamingTableStatement, + _ctx: &StatementVisitorContext, + ) -> StatementVisitorResult { + let execution_plan = self.compile_streaming_sink(stmt).unwrap_or_else(|err| { + panic!("Fatal Compiler Error: Streaming sink compilation aborted - {err}"); + }); + StatementVisitorResult::Plan(Box::new(execution_plan)) + } + + fn visit_drop_table_statement( + &self, + stmt: &DropTableStatement, + _ctx: &StatementVisitorContext, + ) -> StatementVisitorResult { + let DFStatement::Drop { + object_type, + if_exists, + names, + .. 
+ } = &stmt.statement + else { + panic!("Fatal Compiler Error: AST mismatch on DropTableStatement"); + }; + + if *object_type != ObjectType::Table { + panic!("Fatal Compiler Error: Drop target must be of type TABLE"); + } + if names.len() != 1 { + panic!("Fatal Compiler Error: Bulk drop operations are not supported. Specify exactly one table."); + } + + StatementVisitorResult::Plan(Box::new(DropTablePlan::new( + names[0].to_string(), + *if_exists, + ))) + } + + fn visit_show_streaming_tables( + &self, + _stmt: &ShowStreamingTables, + _ctx: &StatementVisitorContext, + ) -> StatementVisitorResult { + StatementVisitorResult::Plan(Box::new(ShowStreamingTablesPlan::new())) + } + + fn visit_show_create_streaming_table( + &self, + stmt: &ShowCreateStreamingTable, + _ctx: &StatementVisitorContext, + ) -> StatementVisitorResult { + StatementVisitorResult::Plan(Box::new(ShowCreateStreamingTablePlan::new( + stmt.table_name.clone(), + ))) + } + + fn visit_drop_streaming_table( + &self, + stmt: &DropStreamingTableStatement, + _ctx: &StatementVisitorContext, + ) -> StatementVisitorResult { + StatementVisitorResult::Plan(Box::new(DropStreamingTablePlan::new( + stmt.table_name.clone(), + stmt.if_exists, + ))) + } +} \ No newline at end of file diff --git a/src/coordinator/plan/lookup_table_plan.rs b/src/coordinator/plan/lookup_table_plan.rs new file mode 100644 index 00000000..65103b61 --- /dev/null +++ b/src/coordinator/plan/lookup_table_plan.rs @@ -0,0 +1,27 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +use crate::sql::schema::source_table::SourceTable; + +use super::{PlanNode, PlanVisitor, PlanVisitorContext, PlanVisitorResult}; + +/// Plan node that exposes a lookup table config as a logical plan input. +#[derive(Debug)] +pub struct LookupTablePlan { + pub table: SourceTable, +} + +impl PlanNode for LookupTablePlan { + fn accept(&self, visitor: &dyn PlanVisitor, context: &PlanVisitorContext) -> PlanVisitorResult { + visitor.visit_lookup_table(self, context) + } +} diff --git a/src/coordinator/plan/mod.rs b/src/coordinator/plan/mod.rs index 9aa403b5..8166d444 100644 --- a/src/coordinator/plan/mod.rs +++ b/src/coordinator/plan/mod.rs @@ -12,22 +12,42 @@ mod create_function_plan; mod create_python_function_plan; +mod create_table_plan; mod drop_function_plan; +mod drop_streaming_table_plan; +mod drop_table_plan; mod logical_plan_visitor; +mod lookup_table_plan; mod optimizer; +mod show_catalog_tables_plan; +mod show_create_streaming_table_plan; +mod show_create_table_plan; mod show_functions_plan; +mod show_streaming_tables_plan; mod start_function_plan; mod stop_function_plan; +mod streaming_table_connector_plan; +mod streaming_table_plan; mod visitor; pub use create_function_plan::CreateFunctionPlan; pub use create_python_function_plan::CreatePythonFunctionPlan; +pub use create_table_plan::{CreateTablePlan, CreateTablePlanBody}; pub use drop_function_plan::DropFunctionPlan; +pub use drop_streaming_table_plan::DropStreamingTablePlan; +pub use drop_table_plan::DropTablePlan; pub use logical_plan_visitor::LogicalPlanVisitor; +pub use lookup_table_plan::LookupTablePlan; pub use optimizer::LogicalPlanner; +pub use show_catalog_tables_plan::ShowCatalogTablesPlan; +pub use show_create_streaming_table_plan::ShowCreateStreamingTablePlan; +pub use show_create_table_plan::ShowCreateTablePlan; pub use show_functions_plan::ShowFunctionsPlan; +pub use 
show_streaming_tables_plan::ShowStreamingTablesPlan; pub use start_function_plan::StartFunctionPlan; pub use stop_function_plan::StopFunctionPlan; +pub use streaming_table_connector_plan::StreamingTableConnectorPlan; +pub use streaming_table_plan::StreamingTable; pub use visitor::{PlanVisitor, PlanVisitorContext, PlanVisitorResult}; use std::fmt; diff --git a/src/coordinator/plan/show_catalog_tables_plan.rs b/src/coordinator/plan/show_catalog_tables_plan.rs new file mode 100644 index 00000000..420fdb40 --- /dev/null +++ b/src/coordinator/plan/show_catalog_tables_plan.rs @@ -0,0 +1,28 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use super::{PlanNode, PlanVisitor, PlanVisitorContext, PlanVisitorResult}; + +#[derive(Debug, Default)] +pub struct ShowCatalogTablesPlan; + +impl ShowCatalogTablesPlan { + pub fn new() -> Self { + Self + } +} + +impl PlanNode for ShowCatalogTablesPlan { + fn accept(&self, visitor: &dyn PlanVisitor, context: &PlanVisitorContext) -> PlanVisitorResult { + visitor.visit_show_catalog_tables(self, context) + } +} diff --git a/src/coordinator/plan/show_create_streaming_table_plan.rs b/src/coordinator/plan/show_create_streaming_table_plan.rs new file mode 100644 index 00000000..8d63c0d5 --- /dev/null +++ b/src/coordinator/plan/show_create_streaming_table_plan.rs @@ -0,0 +1,30 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use super::{PlanNode, PlanVisitor, PlanVisitorContext, PlanVisitorResult}; + +#[derive(Debug, Clone)] +pub struct ShowCreateStreamingTablePlan { + pub table_name: String, +} + +impl ShowCreateStreamingTablePlan { + pub fn new(table_name: String) -> Self { + Self { table_name } + } +} + +impl PlanNode for ShowCreateStreamingTablePlan { + fn accept(&self, visitor: &dyn PlanVisitor, context: &PlanVisitorContext) -> PlanVisitorResult { + visitor.visit_show_create_streaming_table(self, context) + } +} diff --git a/src/coordinator/plan/show_create_table_plan.rs b/src/coordinator/plan/show_create_table_plan.rs new file mode 100644 index 00000000..c5fe6376 --- /dev/null +++ b/src/coordinator/plan/show_create_table_plan.rs @@ -0,0 +1,30 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use super::{PlanNode, PlanVisitor, PlanVisitorContext, PlanVisitorResult}; + +#[derive(Debug, Clone)] +pub struct ShowCreateTablePlan { + pub table_name: String, +} + +impl ShowCreateTablePlan { + pub fn new(table_name: String) -> Self { + Self { table_name } + } +} + +impl PlanNode for ShowCreateTablePlan { + fn accept(&self, visitor: &dyn PlanVisitor, context: &PlanVisitorContext) -> PlanVisitorResult { + visitor.visit_show_create_table(self, context) + } +} diff --git a/src/coordinator/plan/show_streaming_tables_plan.rs b/src/coordinator/plan/show_streaming_tables_plan.rs new file mode 100644 index 00000000..08410115 --- /dev/null +++ b/src/coordinator/plan/show_streaming_tables_plan.rs @@ -0,0 +1,28 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use super::{PlanNode, PlanVisitor, PlanVisitorContext, PlanVisitorResult}; + +#[derive(Debug, Default)] +pub struct ShowStreamingTablesPlan; + +impl ShowStreamingTablesPlan { + pub fn new() -> Self { + Self + } +} + +impl PlanNode for ShowStreamingTablesPlan { + fn accept(&self, visitor: &dyn PlanVisitor, context: &PlanVisitorContext) -> PlanVisitorResult { + visitor.visit_show_streaming_tables(self, context) + } +} diff --git a/src/coordinator/plan/streaming_table_connector_plan.rs b/src/coordinator/plan/streaming_table_connector_plan.rs new file mode 100644 index 00000000..214e2e15 --- /dev/null +++ b/src/coordinator/plan/streaming_table_connector_plan.rs @@ -0,0 +1,27 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use crate::sql::schema::source_table::SourceTable; + +use super::{PlanNode, PlanVisitor, PlanVisitorContext, PlanVisitorResult}; + +/// Plan node that exposes a connector table config as a logical plan input. 
+#[derive(Debug)] +pub struct StreamingTableConnectorPlan { + pub table: SourceTable, +} + +impl PlanNode for StreamingTableConnectorPlan { + fn accept(&self, visitor: &dyn PlanVisitor, context: &PlanVisitorContext) -> PlanVisitorResult { + visitor.visit_streaming_connector_table(self, context) + } +} diff --git a/src/coordinator/plan/streaming_table_plan.rs b/src/coordinator/plan/streaming_table_plan.rs new file mode 100644 index 00000000..512ec266 --- /dev/null +++ b/src/coordinator/plan/streaming_table_plan.rs @@ -0,0 +1,28 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use super::{PlanNode, PlanVisitor, PlanVisitorContext, PlanVisitorResult}; +use crate::sql::logical_node::logical::LogicalProgram; + +/// Plan node representing a fully resolved streaming table (DDL). +#[derive(Debug)] +pub struct StreamingTable { + pub name: String, + pub comment: Option, + pub program: LogicalProgram, +} + +impl PlanNode for StreamingTable { + fn accept(&self, visitor: &dyn PlanVisitor, context: &PlanVisitorContext) -> PlanVisitorResult { + visitor.visit_streaming_table(self, context) + } +} diff --git a/src/coordinator/plan/visitor.rs b/src/coordinator/plan/visitor.rs index 44059c67..bba44a1f 100644 --- a/src/coordinator/plan/visitor.rs +++ b/src/coordinator/plan/visitor.rs @@ -11,8 +11,11 @@ // limitations under the License. 
use super::{ - CreateFunctionPlan, CreatePythonFunctionPlan, DropFunctionPlan, ShowFunctionsPlan, - StartFunctionPlan, StopFunctionPlan, + CreateFunctionPlan, CreatePythonFunctionPlan, CreateTablePlan, DropFunctionPlan, + DropStreamingTablePlan, DropTablePlan, LookupTablePlan, ShowCatalogTablesPlan, + ShowCreateStreamingTablePlan, ShowCreateTablePlan, ShowFunctionsPlan, + ShowStreamingTablesPlan, StartFunctionPlan, StopFunctionPlan, StreamingTable, + StreamingTableConnectorPlan, }; /// Context passed to PlanVisitor methods @@ -79,9 +82,69 @@ pub trait PlanVisitor { context: &PlanVisitorContext, ) -> PlanVisitorResult; + fn visit_show_catalog_tables( + &self, + plan: &ShowCatalogTablesPlan, + context: &PlanVisitorContext, + ) -> PlanVisitorResult; + + fn visit_show_create_table( + &self, + plan: &ShowCreateTablePlan, + context: &PlanVisitorContext, + ) -> PlanVisitorResult; + fn visit_create_python_function( &self, plan: &CreatePythonFunctionPlan, context: &PlanVisitorContext, ) -> PlanVisitorResult; + + fn visit_create_table_plan( + &self, + plan: &CreateTablePlan, + context: &PlanVisitorContext, + ) -> PlanVisitorResult; + + fn visit_streaming_table( + &self, + plan: &StreamingTable, + context: &PlanVisitorContext, + ) -> PlanVisitorResult; + + fn visit_lookup_table( + &self, + plan: &LookupTablePlan, + context: &PlanVisitorContext, + ) -> PlanVisitorResult; + + fn visit_streaming_connector_table( + &self, + plan: &StreamingTableConnectorPlan, + context: &PlanVisitorContext, + ) -> PlanVisitorResult; + + fn visit_drop_table_plan( + &self, + plan: &DropTablePlan, + context: &PlanVisitorContext, + ) -> PlanVisitorResult; + + fn visit_show_streaming_tables( + &self, + plan: &ShowStreamingTablesPlan, + context: &PlanVisitorContext, + ) -> PlanVisitorResult; + + fn visit_show_create_streaming_table( + &self, + plan: &ShowCreateStreamingTablePlan, + context: &PlanVisitorContext, + ) -> PlanVisitorResult; + + fn visit_drop_streaming_table( + &self, + plan: 
&DropStreamingTablePlan, + context: &PlanVisitorContext, + ) -> PlanVisitorResult; } diff --git a/src/coordinator/runtime_context.rs b/src/coordinator/runtime_context.rs new file mode 100644 index 00000000..5d671b98 --- /dev/null +++ b/src/coordinator/runtime_context.rs @@ -0,0 +1,61 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Runtime resources for a single coordinator run: [`TaskManager`], [`CatalogManager`], and [`JobManager`]. + +use std::sync::Arc; + +use anyhow::Result; + +use crate::runtime::streaming::job::JobManager; +use crate::runtime::taskexecutor::TaskManager; +use crate::sql::schema::StreamSchemaProvider; +use crate::storage::stream_catalog::CatalogManager; + +/// Dependencies shared by analyze / plan / execute, analogous to installing globals in +/// [`TaskManager`], [`CatalogManager`], and [`JobManager`]. 
+#[derive(Clone)] +pub struct CoordinatorRuntimeContext { + pub task_manager: Arc, + pub catalog_manager: Arc, + pub job_manager: Arc, +} + +impl CoordinatorRuntimeContext { + pub fn try_from_globals() -> Result { + Ok(Self { + task_manager: TaskManager::get() + .map_err(|e| anyhow::anyhow!("Failed to get TaskManager: {}", e))?, + catalog_manager: CatalogManager::global() + .map_err(|e| anyhow::anyhow!("Failed to get CatalogManager: {}", e))?, + job_manager: JobManager::global() + .map_err(|e| anyhow::anyhow!("Failed to get JobManager: {}", e))?, + }) + } + + pub fn new( + task_manager: Arc, + catalog_manager: Arc, + job_manager: Arc, + ) -> Self { + Self { + task_manager, + catalog_manager, + job_manager, + } + } + + /// Schema provider for [`LogicalPlanVisitor`] / [`SqlToRel`]. + pub fn planning_schema_provider(&self) -> StreamSchemaProvider { + self.catalog_manager.acquire_planning_context() + } +} diff --git a/src/coordinator/statement/create_table.rs b/src/coordinator/statement/create_table.rs new file mode 100644 index 00000000..67a500d1 --- /dev/null +++ b/src/coordinator/statement/create_table.rs @@ -0,0 +1,44 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use datafusion::sql::sqlparser::ast::Statement as DFStatement; + +use super::{Statement, StatementVisitor, StatementVisitorContext, StatementVisitorResult}; + +/// Represents a CREATE TABLE or CREATE VIEW statement. 
+/// +/// This wraps the raw SQL AST node so the coordinator pipeline can +/// distinguish table/view creation from other streaming SQL operations. +#[derive(Debug)] +pub struct CreateTable { + pub statement: DFStatement, +} + +impl CreateTable { + pub fn new(statement: DFStatement) -> Self { + Self { statement } + } +} + +impl Statement for CreateTable { + fn accept( + &self, + visitor: &dyn StatementVisitor, + context: &StatementVisitorContext, + ) -> StatementVisitorResult { + visitor.visit_create_table(self, context) + } + + fn as_create_table(&self) -> Option<&CreateTable> { + Some(self) + } +} diff --git a/src/coordinator/statement/drop_streaming_table.rs b/src/coordinator/statement/drop_streaming_table.rs new file mode 100644 index 00000000..309abd97 --- /dev/null +++ b/src/coordinator/statement/drop_streaming_table.rs @@ -0,0 +1,40 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use super::{Statement, StatementVisitor, StatementVisitorContext, StatementVisitorResult}; + +/// `DROP STREAMING TABLE [IF EXISTS] ` — stops and removes the streaming +/// job from `JobManager`, then drops the corresponding catalog entry if present. 
+#[derive(Debug, Clone)] +pub struct DropStreamingTableStatement { + pub table_name: String, + pub if_exists: bool, +} + +impl DropStreamingTableStatement { + pub fn new(table_name: String, if_exists: bool) -> Self { + Self { + table_name, + if_exists, + } + } +} + +impl Statement for DropStreamingTableStatement { + fn accept( + &self, + visitor: &dyn StatementVisitor, + context: &StatementVisitorContext, + ) -> StatementVisitorResult { + visitor.visit_drop_streaming_table(self, context) + } +} diff --git a/src/coordinator/statement/drop_table.rs b/src/coordinator/statement/drop_table.rs new file mode 100644 index 00000000..fa547dca --- /dev/null +++ b/src/coordinator/statement/drop_table.rs @@ -0,0 +1,41 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use datafusion::sql::sqlparser::ast::Statement as DFStatement; + +use super::{Statement, StatementVisitor, StatementVisitorContext, StatementVisitorResult}; + +/// `DROP TABLE` / `DROP TABLE IF EXISTS` (and `DROP STREAMING TABLE`, normalized at parse time). 
+#[derive(Debug, Clone)] +pub struct DropTableStatement { + pub statement: DFStatement, +} + +impl DropTableStatement { + pub fn new(statement: DFStatement) -> Self { + Self { statement } + } +} + +impl Statement for DropTableStatement { + fn accept( + &self, + visitor: &dyn StatementVisitor, + context: &StatementVisitorContext, + ) -> StatementVisitorResult { + visitor.visit_drop_table_statement(self, context) + } + + fn as_drop_table_statement(&self) -> Option<&DropTableStatement> { + Some(self) + } +} diff --git a/src/coordinator/statement/mod.rs b/src/coordinator/statement/mod.rs index f887209c..80d9c320 100644 --- a/src/coordinator/statement/mod.rs +++ b/src/coordinator/statement/mod.rs @@ -12,18 +12,34 @@ mod create_function; mod create_python_function; +mod create_table; mod drop_function; +mod drop_streaming_table; +mod drop_table; +mod show_catalog_tables; +mod show_create_streaming_table; +mod show_create_table; mod show_functions; +mod show_streaming_tables; mod start_function; mod stop_function; +mod streaming_table; mod visitor; pub use create_function::{ConfigSource, CreateFunction, FunctionSource}; pub use create_python_function::{CreatePythonFunction, PythonModule}; +pub use create_table::CreateTable; pub use drop_function::DropFunction; +pub use drop_streaming_table::DropStreamingTableStatement; +pub use drop_table::DropTableStatement; +pub use show_catalog_tables::ShowCatalogTables; +pub use show_create_streaming_table::ShowCreateStreamingTable; +pub use show_create_table::ShowCreateTable; pub use show_functions::ShowFunctions; +pub use show_streaming_tables::ShowStreamingTables; pub use start_function::StartFunction; pub use stop_function::StopFunction; +pub use streaming_table::StreamingTableStatement; pub use visitor::{StatementVisitor, StatementVisitorContext, StatementVisitorResult}; use std::fmt; @@ -34,4 +50,20 @@ pub trait Statement: fmt::Debug + Send + Sync { visitor: &dyn StatementVisitor, context: &StatementVisitorContext, ) -> 
StatementVisitorResult; + + fn as_create_table(&self) -> Option<&CreateTable> { + None + } + + fn as_drop_table_statement(&self) -> Option<&DropTableStatement> { + None + } + + fn as_streaming_table_statement(&self) -> Option<&StreamingTableStatement> { + None + } + + fn as_drop_streaming_table_statement(&self) -> Option<&DropStreamingTableStatement> { + None + } } diff --git a/src/coordinator/statement/show_catalog_tables.rs b/src/coordinator/statement/show_catalog_tables.rs new file mode 100644 index 00000000..1f034562 --- /dev/null +++ b/src/coordinator/statement/show_catalog_tables.rs @@ -0,0 +1,33 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use super::{Statement, StatementVisitor, StatementVisitorContext, StatementVisitorResult}; + +/// `SHOW TABLES` over the stream catalog (connector sources + streaming sinks). 
+#[derive(Debug, Clone, Default)] +pub struct ShowCatalogTables; + +impl ShowCatalogTables { + pub fn new() -> Self { + Self + } +} + +impl Statement for ShowCatalogTables { + fn accept( + &self, + visitor: &dyn StatementVisitor, + context: &StatementVisitorContext, + ) -> StatementVisitorResult { + visitor.visit_show_catalog_tables(self, context) + } +} diff --git a/src/coordinator/statement/show_create_streaming_table.rs b/src/coordinator/statement/show_create_streaming_table.rs new file mode 100644 index 00000000..73f16870 --- /dev/null +++ b/src/coordinator/statement/show_create_streaming_table.rs @@ -0,0 +1,36 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use super::{Statement, StatementVisitor, StatementVisitorContext, StatementVisitorResult}; + +/// `SHOW CREATE STREAMING TABLE ` — displays the pipeline topology and +/// runtime metadata for the named streaming job. 
+#[derive(Debug, Clone)]
+pub struct ShowCreateStreamingTable {
+    // Name of the streaming job whose definition is to be displayed.
+    pub table_name: String,
+}
+
+impl ShowCreateStreamingTable {
+    pub fn new(table_name: String) -> Self {
+        Self { table_name }
+    }
+}
+
+impl Statement for ShowCreateStreamingTable {
+    fn accept(
+        &self,
+        visitor: &dyn StatementVisitor,
+        context: &StatementVisitorContext,
+    ) -> StatementVisitorResult {
+        visitor.visit_show_create_streaming_table(self, context)
+    }
+}
diff --git a/src/coordinator/statement/show_create_table.rs b/src/coordinator/statement/show_create_table.rs
new file mode 100644
index 00000000..5b54a726
--- /dev/null
+++ b/src/coordinator/statement/show_create_table.rs
@@ -0,0 +1,35 @@
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use super::{Statement, StatementVisitor, StatementVisitorContext, StatementVisitorResult};
+
+/// `SHOW CREATE TABLE <table_name>` for a stream-catalog table.
+#[derive(Debug, Clone)] +pub struct ShowCreateTable { + pub table_name: String, +} + +impl ShowCreateTable { + pub fn new(table_name: String) -> Self { + Self { table_name } + } +} + +impl Statement for ShowCreateTable { + fn accept( + &self, + visitor: &dyn StatementVisitor, + context: &StatementVisitorContext, + ) -> StatementVisitorResult { + visitor.visit_show_create_table(self, context) + } +} diff --git a/src/coordinator/statement/show_streaming_tables.rs b/src/coordinator/statement/show_streaming_tables.rs new file mode 100644 index 00000000..cedf3610 --- /dev/null +++ b/src/coordinator/statement/show_streaming_tables.rs @@ -0,0 +1,33 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use super::{Statement, StatementVisitor, StatementVisitorContext, StatementVisitorResult}; + +/// `SHOW STREAMING TABLES` — lists all active streaming jobs managed by `JobManager`. 
+#[derive(Debug, Clone, Default)] +pub struct ShowStreamingTables; + +impl ShowStreamingTables { + pub fn new() -> Self { + Self + } +} + +impl Statement for ShowStreamingTables { + fn accept( + &self, + visitor: &dyn StatementVisitor, + context: &StatementVisitorContext, + ) -> StatementVisitorResult { + visitor.visit_show_streaming_tables(self, context) + } +} diff --git a/src/coordinator/statement/streaming_table.rs b/src/coordinator/statement/streaming_table.rs new file mode 100644 index 00000000..bfef3503 --- /dev/null +++ b/src/coordinator/statement/streaming_table.rs @@ -0,0 +1,44 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use datafusion::sql::sqlparser::ast::Statement as DFStatement; + +use super::{Statement, StatementVisitor, StatementVisitorContext, StatementVisitorResult}; + +/// Wrapper for **`CREATE STREAMING TABLE ... WITH (...) AS SELECT ...`** (parsed AST). +/// +/// The coordinator `parse_sql` frontend does **not** support `INSERT`; streaming sinks are +/// defined only via **`CREATE STREAMING TABLE`** (and regular tables via **`CREATE TABLE`**). 
+#[derive(Debug)] +pub struct StreamingTableStatement { + pub statement: DFStatement, +} + +impl StreamingTableStatement { + pub fn new(statement: DFStatement) -> Self { + Self { statement } + } +} + +impl Statement for StreamingTableStatement { + fn accept( + &self, + visitor: &dyn StatementVisitor, + context: &StatementVisitorContext, + ) -> StatementVisitorResult { + visitor.visit_streaming_table_statement(self, context) + } + + fn as_streaming_table_statement(&self) -> Option<&StreamingTableStatement> { + Some(self) + } +} diff --git a/src/coordinator/statement/visitor.rs b/src/coordinator/statement/visitor.rs index 13ce2cfc..c3cf153a 100644 --- a/src/coordinator/statement/visitor.rs +++ b/src/coordinator/statement/visitor.rs @@ -11,7 +11,10 @@ // limitations under the License. use super::{ - CreateFunction, CreatePythonFunction, DropFunction, ShowFunctions, StartFunction, StopFunction, + CreateFunction, CreatePythonFunction, CreateTable, DropFunction, + DropStreamingTableStatement, DropTableStatement, ShowCatalogTables, + ShowCreateStreamingTable, ShowCreateTable, ShowFunctions, ShowStreamingTables, + StartFunction, StopFunction, StreamingTableStatement, }; use crate::coordinator::plan::PlanNode; use crate::coordinator::statement::Statement; @@ -82,9 +85,57 @@ pub trait StatementVisitor { context: &StatementVisitorContext, ) -> StatementVisitorResult; + fn visit_show_catalog_tables( + &self, + stmt: &ShowCatalogTables, + context: &StatementVisitorContext, + ) -> StatementVisitorResult; + + fn visit_show_create_table( + &self, + stmt: &ShowCreateTable, + context: &StatementVisitorContext, + ) -> StatementVisitorResult; + fn visit_create_python_function( &self, stmt: &CreatePythonFunction, context: &StatementVisitorContext, ) -> StatementVisitorResult; + + fn visit_create_table( + &self, + stmt: &CreateTable, + context: &StatementVisitorContext, + ) -> StatementVisitorResult; + + fn visit_streaming_table_statement( + &self, + stmt: &StreamingTableStatement, + 
context: &StatementVisitorContext, + ) -> StatementVisitorResult; + + fn visit_drop_table_statement( + &self, + stmt: &DropTableStatement, + context: &StatementVisitorContext, + ) -> StatementVisitorResult; + + fn visit_show_streaming_tables( + &self, + stmt: &ShowStreamingTables, + context: &StatementVisitorContext, + ) -> StatementVisitorResult; + + fn visit_show_create_streaming_table( + &self, + stmt: &ShowCreateStreamingTable, + context: &StatementVisitorContext, + ) -> StatementVisitorResult; + + fn visit_drop_streaming_table( + &self, + stmt: &DropStreamingTableStatement, + context: &StatementVisitorContext, + ) -> StatementVisitorResult; } diff --git a/src/coordinator/tool/mod.rs b/src/coordinator/tool/mod.rs new file mode 100644 index 00000000..6b48aa0e --- /dev/null +++ b/src/coordinator/tool/mod.rs @@ -0,0 +1,13 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +pub use crate::sql::common::ConnectorOptions; diff --git a/src/main.rs b/src/main.rs index 562b1526..1faf45f1 100644 --- a/src/main.rs +++ b/src/main.rs @@ -179,9 +179,7 @@ fn main() -> Result<()> { ); // 2. Component Initialization - let registry = server::register_components(); - registry - .initialize_all(&config) + server::bootstrap_system(&config) .context("Component initialization failed")?; // 3. 
Server Startup diff --git a/src/runtime/mod.rs b/src/runtime/mod.rs index f69ad017..61b67e1f 100644 --- a/src/runtime/mod.rs +++ b/src/runtime/mod.rs @@ -14,10 +14,12 @@ pub mod buffer_and_event; pub mod common; -pub mod input; -pub mod output; -pub mod processor; -pub mod sink; -pub mod source; +pub mod streaming; +pub mod util; pub mod task; pub mod taskexecutor; +pub mod wasm; + +pub use wasm::input; +pub use wasm::output; +pub use wasm::processor; diff --git a/src/runtime/streaming/api/context.rs b/src/runtime/streaming/api/context.rs new file mode 100644 index 00000000..f0c3dfcb --- /dev/null +++ b/src/runtime/streaming/api/context.rs @@ -0,0 +1,124 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+use crate::runtime::streaming::memory::MemoryPool;
+use crate::runtime::streaming::protocol::event::StreamEvent;
+use crate::runtime::streaming::protocol::tracked::TrackedEvent;
+use crate::runtime::streaming::network::endpoint::PhysicalSender;
+
+use arrow_array::RecordBatch;
+use std::sync::Arc;
+
+/// Per-subtask execution context: identity, downstream output channels,
+/// memory accounting, and the subtask's current watermark.
+pub struct TaskContext {
+    pub job_id: String,
+    pub vertex_id: u32,
+    pub subtask_idx: u32,
+    pub parallelism: u32,
+
+    // One sender per downstream physical channel.
+    pub outboxes: Vec<PhysicalSender>,
+
+    // Shared pool used to account for in-flight batch memory before sending.
+    memory_pool: Arc<MemoryPool>,
+
+    // Highest watermark observed so far; never regresses (see advance_watermark).
+    current_watermark: Option<std::time::SystemTime>,
+}
+
+impl TaskContext {
+    pub fn new(
+        job_id: String,
+        vertex_id: u32,
+        subtask_idx: u32,
+        parallelism: u32,
+        outboxes: Vec<PhysicalSender>,
+        memory_pool: Arc<MemoryPool>,
+    ) -> Self {
+        Self {
+            job_id,
+            vertex_id,
+            subtask_idx,
+            parallelism,
+            outboxes,
+            memory_pool,
+            current_watermark: None,
+        }
+    }
+
+    // ========================================================================
+    // Watermark bookkeeping
+    // ========================================================================
+
+    /// Last watermark seen by this subtask, if any.
+    pub fn last_present_watermark(&self) -> Option<std::time::SystemTime> {
+        self.current_watermark
+    }
+
+    /// Advances the watermark; older or equal values are ignored so it is
+    /// monotonically non-decreasing.
+    pub fn advance_watermark(&mut self, watermark: std::time::SystemTime) {
+        if let Some(current) = self.current_watermark {
+            if watermark > current {
+                self.current_watermark = Some(watermark);
+            }
+        } else {
+            self.current_watermark = Some(watermark);
+        }
+    }
+
+    // ========================================================================
+    // Identity
+    // ========================================================================
+
+    /// Human-readable identity string used in logs.
+    pub fn task_identity(&self) -> String {
+        format!(
+            "Job[{}], Vertex[{}], Subtask[{}/{}]",
+            self.job_id, self.vertex_id, self.subtask_idx, self.parallelism
+        )
+    }
+
+    // ========================================================================
+    // Emission
+    // ========================================================================
+
+    /// Sends `batch` to every outbox, first reserving memory from the pool.
+    pub async fn collect(&self, batch: RecordBatch) -> anyhow::Result<()> {
+        if self.outboxes.is_empty() {
+            return Ok(());
+        }
+
+        let bytes_required = batch.get_array_memory_size();
+        let ticket = self.memory_pool.request_memory(bytes_required).await;
+        let tracked_event = TrackedEvent::new(StreamEvent::Data(batch), Some(ticket));
+
+        for outbox in &self.outboxes {
+            outbox.send(tracked_event.clone()).await?;
+        }
+        Ok(())
+    }
+
+    /// Sends `batch` to the single outbox selected by `key_hash` (hash partitioning).
+    pub async fn collect_keyed(
+        &self,
+        key_hash: u64,
+        batch: RecordBatch,
+    ) -> anyhow::Result<()> {
+        if self.outboxes.is_empty() {
+            return Ok(());
+        }
+
+        let bytes_required = batch.get_array_memory_size();
+        let ticket = self.memory_pool.request_memory(bytes_required).await;
+        let tracked_event = TrackedEvent::new(StreamEvent::Data(batch), Some(ticket));
+
+        let target_idx = (key_hash as usize) % self.outboxes.len();
+        self.outboxes[target_idx].send(tracked_event).await?;
+        Ok(())
+    }
+
+    /// Sends a control event (watermark/barrier/EOS) to every outbox;
+    /// control events carry no memory ticket.
+    pub async fn broadcast(&self, event: StreamEvent) -> anyhow::Result<()> {
+        let tracked_event = TrackedEvent::control(event);
+        for outbox in &self.outboxes {
+            outbox.send(tracked_event.clone()).await?;
+        }
+        Ok(())
+    }
+}
diff --git a/src/runtime/sink/mod.rs b/src/runtime/streaming/api/mod.rs
similarity index 91%
rename from src/runtime/sink/mod.rs
rename to src/runtime/streaming/api/mod.rs
index a0a2a6fc..e78ba371 100644
--- a/src/runtime/sink/mod.rs
+++ b/src/runtime/streaming/api/mod.rs
@@ -10,6 +10,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// Sink module
-// TODO: Add sink implementation here
+pub mod context;
+pub mod operator;
+pub mod source;
+
diff --git a/src/runtime/streaming/api/operator.rs b/src/runtime/streaming/api/operator.rs
new file mode 100644
index 00000000..9acc6e06
--- /dev/null
+++ b/src/runtime/streaming/api/operator.rs
@@ -0,0 +1,80 @@
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use crate::runtime::streaming::api::context::TaskContext;
+use crate::runtime::streaming::api::source::SourceOperator;
+use crate::runtime::streaming::protocol::stream_out::StreamOutput;
+use arrow_array::RecordBatch;
+use async_trait::async_trait;
+use std::time::Duration;
+use crate::sql::common::{CheckpointBarrier, Watermark};
+
+// ---------------------------------------------------------------------------
+// ConstructedOperator
+// ---------------------------------------------------------------------------
+
+/// A fully-built vertex operator: either a source or a regular operator.
+pub enum ConstructedOperator {
+    Source(Box<dyn SourceOperator>),
+    Operator(Box<dyn Operator>),
+}
+
+/// A non-source streaming operator. Every hook receives the subtask's
+/// [`TaskContext`]; data/watermark hooks return outputs to be dispatched.
+#[async_trait]
+pub trait Operator: Send + 'static {
+    fn name(&self) -> &str;
+
+    /// Called once before the first event; default is a no-op.
+    async fn on_start(&mut self, _ctx: &mut TaskContext) -> anyhow::Result<()> {
+        Ok(())
+    }
+
+    /// Processes one batch arriving on input `input_idx`.
+    async fn process_data(
+        &mut self,
+        input_idx: usize,
+        batch: RecordBatch,
+        ctx: &mut TaskContext,
+    ) -> anyhow::Result<Vec<StreamOutput>>;
+
+    /// Reacts to an (already aligned) watermark.
+    async fn process_watermark(
+        &mut self,
+        watermark: Watermark,
+        ctx: &mut TaskContext,
+    ) -> anyhow::Result<Vec<StreamOutput>>;
+
+    /// Persists operator state for the checkpoint identified by `barrier`.
+    async fn snapshot_state(
+        &mut self,
+        barrier: CheckpointBarrier,
+        ctx: &mut TaskContext,
+    ) -> anyhow::Result<()>;
+
+    /// Called once the checkpoint for `_epoch` is globally committed; no-op by default.
+    async fn commit_checkpoint(
+        &mut self,
+        _epoch: u32,
+        _ctx: &mut TaskContext,
+    ) -> anyhow::Result<()> {
+        Ok(())
+    }
+
+    /// If `Some`, the runner invokes `process_tick` at this interval.
+    fn tick_interval(&self) -> Option<Duration> {
+        None
+    }
+
+    async fn process_tick(
+        &mut self,
+        _tick_index: u64,
+        _ctx: &mut TaskContext,
+    ) -> anyhow::Result<Vec<StreamOutput>> {
+        Ok(vec![])
+    }
+
+    /// Final flush on shutdown; returned outputs are still dispatched downstream.
+    async fn on_close(&mut self, _ctx: &mut TaskContext) -> anyhow::Result<Vec<StreamOutput>> {
+        Ok(vec![])
+    }
+}
diff --git
a/src/runtime/streaming/api/source.rs b/src/runtime/streaming/api/source.rs
new file mode 100644
index 00000000..f46f3de7
--- /dev/null
+++ b/src/runtime/streaming/api/source.rs
@@ -0,0 +1,58 @@
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+use crate::runtime::streaming::api::context::TaskContext;
+use arrow_array::RecordBatch;
+use async_trait::async_trait;
+use crate::sql::common::{CheckpointBarrier, Watermark};
+
+/// Starting-offset policy for a source (defaults to the consumer group's offset).
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
+pub enum SourceOffset {
+    Earliest,
+    Latest,
+    #[default]
+    Group,
+}
+
+/// One unit of progress reported by [`SourceOperator::fetch_next`].
+#[derive(Debug)]
+pub enum SourceEvent {
+    Data(RecordBatch),
+    Watermark(Watermark),
+    Idle,
+    EndOfStream,
+}
+
+/// A streaming source. Driven by the source runner's fetch loop.
+#[async_trait]
+pub trait SourceOperator: Send + 'static {
+    fn name(&self) -> &str;
+
+    /// Called once before the first fetch; default is a no-op.
+    async fn on_start(&mut self, _ctx: &mut TaskContext) -> anyhow::Result<()> {
+        Ok(())
+    }
+
+    /// Pulls the next event; `Idle` tells the runner to back off briefly.
+    async fn fetch_next(&mut self, ctx: &mut TaskContext) -> anyhow::Result<SourceEvent>;
+
+    /// Optional periodic watermark, polled by the runner on a timer.
+    fn poll_watermark(&mut self) -> Option<Watermark> {
+        None
+    }
+
+    /// Persists source state (e.g. offsets) for the checkpoint in `barrier`.
+    async fn snapshot_state(
+        &mut self,
+        barrier: CheckpointBarrier,
+        ctx: &mut TaskContext,
+    ) -> anyhow::Result<()>;
+
+    async fn on_close(&mut self, _ctx: &mut TaskContext) -> anyhow::Result<()> {
+        Ok(())
+    }
+}
diff --git a/src/runtime/streaming/error.rs b/src/runtime/streaming/error.rs
new file mode 100644
index 00000000..178f5bbb
--- /dev/null
+++ b/src/runtime/streaming/error.rs
@@ -0,0 +1,46 @@
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use
this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::fmt::Display;
+use thiserror::Error;
+
+/// Errors surfaced by the streaming runtime's execution layer.
+#[derive(Debug, Error)]
+pub enum RunError {
+    #[error("Operator execution failed: {0:#}")]
+    Operator(#[from] anyhow::Error),
+
+    #[error("Downstream send failed: {0}")]
+    DownstreamSend(String),
+
+    #[error("Internal engine error: {0}")]
+    Internal(String),
+
+    #[error("State backend error: {0}")]
+    State(String),
+
+    #[error("I/O error: {0}")]
+    Io(#[from] std::io::Error),
+}
+
+impl RunError {
+    /// Convenience constructor for [`RunError::Internal`].
+    pub fn internal<T: Display>(msg: T) -> Self {
+        Self::Internal(msg.to_string())
+    }
+
+    /// Convenience constructor for [`RunError::DownstreamSend`].
+    pub fn downstream<T: Display>(msg: T) -> Self {
+        Self::DownstreamSend(msg.to_string())
+    }
+
+    /// Convenience constructor for [`RunError::State`].
+    pub fn state<T: Display>(msg: T) -> Self {
+        Self::State(msg.to_string())
+    }
+}
\ No newline at end of file
diff --git a/src/runtime/source/mod.rs b/src/runtime/streaming/execution/mod.rs
similarity index 90%
rename from src/runtime/source/mod.rs
rename to src/runtime/streaming/execution/mod.rs
index 8a05bf30..1a8401ef 100644
--- a/src/runtime/source/mod.rs
+++ b/src/runtime/streaming/execution/mod.rs
@@ -10,6 +10,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-// Source module
-// TODO: Add source implementation here
+pub mod runner;
+pub mod source;
+pub mod tracker;
+
diff --git a/src/runtime/streaming/execution/runner.rs b/src/runtime/streaming/execution/runner.rs
new file mode 100644
index 00000000..c4981d93
--- /dev/null
+++ b/src/runtime/streaming/execution/runner.rs
@@ -0,0 +1,375 @@
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use async_trait::async_trait;
+use tokio::sync::mpsc::Receiver;
+use tokio_stream::{StreamExt, StreamMap};
+use tracing::{info, info_span, Instrument};
+
+use crate::runtime::streaming::api::context::TaskContext;
+use crate::runtime::streaming::api::operator::Operator;
+use crate::runtime::streaming::error::RunError;
+use crate::runtime::streaming::network::endpoint::BoxedEventStream;
+use crate::runtime::streaming::protocol::{
+    control::{ControlCommand, StopMode},
+    event::StreamEvent,
+    stream_out::StreamOutput,
+    tracked::TrackedEvent,
+};
+use crate::runtime::streaming::execution::tracker::{
+    barrier_aligner::{AlignmentStatus, BarrierAligner},
+    watermark_tracker::WatermarkTracker,
+};
+use crate::sql::common::{CheckpointBarrier, Watermark};
+
+// ==========================================
+// OperatorDrive
+// ==========================================
+
+/// Driver abstraction over one operator chain link.
+#[async_trait]
+pub trait OperatorDrive: Send {
+    async fn on_start(&mut self, ctx: &mut TaskContext) -> Result<(), RunError>;
+    /// Returns `Ok(true)` when the pipeline should stop.
+    async fn process_event(
+        &mut self,
+        input_idx: usize,
+        event: TrackedEvent,
+        ctx: &mut TaskContext,
+    ) -> Result<bool, RunError>;
+    async fn handle_control(
+        &mut self,
+        cmd: ControlCommand,
+        ctx: &mut TaskContext,
+    ) -> Result<bool, RunError>;
+    async fn on_close(&mut self, ctx: &mut TaskContext) -> Result<(), RunError>;
+}
+
+/// One link of an operator chain: runs `operator` and feeds results to `next`,
+/// or to the TaskContext outboxes when it is the chain tail.
+pub struct ChainedDriver {
+    operator: Box<dyn Operator>,
+    next: Option<Box<dyn OperatorDrive>>,
+}
+
+impl ChainedDriver {
+    pub fn new(operator: Box<dyn Operator>, next: Option<Box<dyn OperatorDrive>>) -> Self {
+        Self { operator, next }
+    }
+
+    /// Folds `operators` (head..tail) into a linked chain; `None` when empty.
+    pub fn build_chain(mut operators: Vec<Box<dyn Operator>>) -> Option<Box<dyn OperatorDrive>> {
+        if operators.is_empty() {
+            return None;
+        }
+        let mut next_driver: Option<Box<dyn OperatorDrive>> = None;
+        while let Some(op) = operators.pop() {
+            let current = ChainedDriver::new(op, next_driver);
+            next_driver = Some(Box::new(current));
+        }
+        next_driver
+    }
+
+    /// Routes operator outputs: forwarded data goes to the next link (or the
+    /// outboxes at the tail); keyed/broadcast outputs are only legal at the tail.
+    async fn dispatch_outputs(
+        &mut self,
+        outputs: Vec<StreamOutput>,
+        ctx: &mut TaskContext,
+    ) -> Result<(), RunError> {
+        for out in outputs {
+            match out {
+                StreamOutput::Forward(b) => {
+                    if let Some(next) = &mut self.next {
+                        next.process_event(0, TrackedEvent::control(StreamEvent::Data(b)), ctx)
+                            .await?;
+                    } else {
+                        ctx.collect(b).await?;
+                    }
+                }
+                StreamOutput::Keyed(hash, b) => {
+                    if self.next.is_some() {
+                        return Err(RunError::internal(format!(
+                            "Topology Error: Keyed output emitted in the middle of chain by '{}'",
+                            self.operator.name()
+                        )));
+                    }
+                    ctx.collect_keyed(hash, b).await?;
+                }
+                StreamOutput::Broadcast(b) => {
+                    if self.next.is_some() {
+                        return Err(RunError::internal(format!(
+                            "Topology Error: Broadcast output emitted in the middle of chain by '{}'",
+                            self.operator.name()
+                        )));
+                    }
+                    ctx.collect(b).await?;
+                }
+                StreamOutput::Watermark(wm) => {
+                    if let Some(next) = &mut self.next {
+                        next.process_event(
+                            0,
+                            TrackedEvent::control(StreamEvent::Watermark(wm)),
+                            ctx,
+                        )
+                        .await?;
+                    } else {
+                        ctx.broadcast(StreamEvent::Watermark(wm)).await?;
+                    }
+                }
+            }
+        }
+        Ok(())
+    }
+
+    /// Propagates a control-plane event down the chain, broadcasting at the tail.
+    async fn forward_signal(
+        &mut self,
+        event: StreamEvent,
+        ctx: &mut TaskContext,
+    ) -> Result<(), RunError> {
+        if let Some(next) = &mut self.next {
+            next.process_event(0, TrackedEvent::control(event), ctx).await?;
+        } else {
+            match event {
+                StreamEvent::Watermark(wm) => ctx.broadcast(StreamEvent::Watermark(wm)).await?,
+                StreamEvent::Barrier(b) => ctx.broadcast(StreamEvent::Barrier(b)).await?,
+                StreamEvent::EndOfStream => ctx.broadcast(StreamEvent::EndOfStream).await?,
+                // Data is always routed via dispatch_outputs, never as a signal.
+                StreamEvent::Data(_) => unreachable!(),
+            }
+        }
+        Ok(())
+    }
+}
+
+#[async_trait]
+impl OperatorDrive for ChainedDriver {
+    async fn on_start(&mut self, ctx: &mut TaskContext) -> Result<(), RunError> {
+        self.operator.on_start(ctx).await?;
+        if let Some(next) = &mut self.next {
+            next.on_start(ctx).await?;
+        }
+        Ok(())
+    }
+
+    async fn process_event(
+        &mut self,
+        input_idx: usize,
+        tracked: TrackedEvent,
+        ctx: &mut TaskContext,
+    ) -> Result<bool, RunError> {
+        let mut should_stop = false;
+        match tracked.event {
+            StreamEvent::Data(batch) => {
+                let outputs = self.operator.process_data(input_idx, batch, ctx).await?;
+                self.dispatch_outputs(outputs, ctx).await?;
+            }
+            StreamEvent::Watermark(wm) => {
+                let outputs = self.operator.process_watermark(wm.clone(), ctx).await?;
+                self.dispatch_outputs(outputs, ctx).await?;
+                self.forward_signal(StreamEvent::Watermark(wm), ctx).await?;
+            }
+            StreamEvent::Barrier(barrier) => {
+                self.operator.snapshot_state(barrier.clone(), ctx).await?;
+                self.forward_signal(StreamEvent::Barrier(barrier), ctx).await?;
+            }
+            StreamEvent::EndOfStream => {
+                should_stop = true;
+                self.forward_signal(StreamEvent::EndOfStream, ctx).await?;
+            }
+        }
+        Ok(should_stop)
+    }
+
+    async fn handle_control(
+        &mut self,
+        cmd: ControlCommand,
+        ctx: &mut TaskContext,
+    ) -> Result<bool, RunError> {
+        let mut stop = false;
+        match &cmd {
+            ControlCommand::TriggerCheckpoint { barrier } => {
+                let b: CheckpointBarrier = barrier.clone().into();
+                self.operator.snapshot_state(b, ctx).await?;
+            }
+            ControlCommand::Commit { epoch } => {
+                self.operator.commit_checkpoint(*epoch, ctx).await?;
+            }
+            ControlCommand::Stop { mode } => {
+                if *mode == StopMode::Immediate {
+                    stop = true;
+                }
+            }
+            ControlCommand::DropState | ControlCommand::Start | ControlCommand::UpdateConfig { .. } => {}
+        }
+
+        if let Some(next) = &mut self.next {
+            if next.handle_control(cmd, ctx).await? {
+                stop = true;
+            }
+        } else if let ControlCommand::TriggerCheckpoint { barrier } = cmd {
+            // Chain tail: the barrier enters the data plane for downstream tasks.
+            ctx.broadcast(StreamEvent::Barrier(barrier.into())).await?;
+        }
+
+        Ok(stop)
+    }
+
+    async fn on_close(&mut self, ctx: &mut TaskContext) -> Result<(), RunError> {
+        let close_outs = self.operator.on_close(ctx).await?;
+        self.dispatch_outputs(close_outs, ctx).await?;
+        if let Some(next) = &mut self.next {
+            next.on_close(ctx).await?;
+        }
+        Ok(())
+    }
+}
+
+// ==========================================
+// Pipeline
+// ==========================================
+
+/// Event loop for one non-source subtask: consumes inbox streams, aligns
+/// barriers/watermarks across inputs, and drives the operator chain.
+pub struct Pipeline {
+    chain_head: Box<dyn OperatorDrive>,
+    ctx: TaskContext,
+    inboxes: Vec<BoxedEventStream>,
+    control_rx: Receiver<ControlCommand>,
+
+    wm_tracker: WatermarkTracker,
+    barrier_aligner: BarrierAligner,
+    // Inputs parked during barrier alignment, indexed by input position.
+    paused_streams: Vec<Option<BoxedEventStream>>,
+}
+
+impl Pipeline {
+    pub fn new(
+        operators: Vec<Box<dyn Operator>>,
+        ctx: TaskContext,
+        inboxes: Vec<BoxedEventStream>,
+        control_rx: Receiver<ControlCommand>,
+    ) -> Result<Self, RunError> {
+        let input_count = inboxes.len();
+        let chain_head = ChainedDriver::build_chain(operators)
+            .ok_or_else(|| RunError::internal("Cannot build pipeline with empty operators"))?;
+
+        let paused_streams = (0..input_count).map(|_| None).collect();
+
+        Ok(Self {
+            chain_head,
+            ctx,
+            inboxes,
+            control_rx,
+            wm_tracker: WatermarkTracker::new(input_count),
+            barrier_aligner: BarrierAligner::new(input_count),
+            paused_streams,
+        })
+    }
+
+    pub async fn run(mut self) -> Result<(), RunError> {
+        let span = info_span!(
+            "pipeline_run",
+            job_id = %self.ctx.job_id,
+            vertex = self.ctx.vertex_id
+        );
+
+        async move {
+            info!("Pipeline initializing...");
+            self.chain_head.on_start(&mut self.ctx).await?;
+
+            let mut active_streams = StreamMap::new();
+            for (i, stream) in std::mem::take(&mut self.inboxes).into_iter().enumerate() {
+                active_streams.insert(i, stream);
+            }
+
+            loop {
+                tokio::select! {
+                    // Control commands take priority over data.
+                    biased;
+
+                    Some(cmd) = self.control_rx.recv() => {
+                        if self.chain_head.handle_control(cmd, &mut self.ctx).await? {
+                            break;
+                        }
+                    }
+
+                    Some((idx, tracked_event)) = active_streams.next() => {
+                        match tracked_event.event {
+                            StreamEvent::Data(batch) => {
+                                self.chain_head
+                                    .process_event(
+                                        idx,
+                                        TrackedEvent::control(StreamEvent::Data(batch)),
+                                        &mut self.ctx,
+                                    )
+                                    .await?;
+                            }
+
+                            StreamEvent::Barrier(barrier) => {
+                                match self.barrier_aligner.mark(idx, &barrier) {
+                                    AlignmentStatus::Pending => {
+                                        // Park this input until all inputs reach the barrier.
+                                        if let Some(stream) = active_streams.remove(&idx) {
+                                            self.paused_streams[idx] = Some(stream);
+                                        }
+                                    }
+                                    AlignmentStatus::Complete => {
+                                        self.chain_head
+                                            .process_event(
+                                                idx,
+                                                TrackedEvent::control(StreamEvent::Barrier(barrier)),
+                                                &mut self.ctx,
+                                            )
+                                            .await?;
+
+                                        for i in 0..self.paused_streams.len() {
+                                            if let Some(stream) = self.paused_streams[i].take() {
+                                                active_streams.insert(i, stream);
+                                            }
+                                        }
+                                    }
+                                }
+                            }
+
+                            StreamEvent::Watermark(wm) => {
+                                if let Some(aligned_wm) = self.wm_tracker.update(idx, wm) {
+                                    if let Watermark::EventTime(t) = aligned_wm {
+                                        self.ctx.advance_watermark(t);
+                                    }
+                                    self.chain_head
+                                        .process_event(
+                                            idx,
+                                            TrackedEvent::control(StreamEvent::Watermark(aligned_wm)),
+                                            &mut self.ctx,
+                                        )
+                                        .await?;
+                                }
+                            }
+
+                            StreamEvent::EndOfStream => {
+                                if self.wm_tracker.increment_eof() == self.wm_tracker.input_count() {
+                                    self.chain_head
+                                        .process_event(
+                                            idx,
+                                            TrackedEvent::control(StreamEvent::EndOfStream),
+                                            &mut self.ctx,
+                                        )
+                                        .await?;
+                                    break;
+                                }
+                            }
+                        }
+                    }
+
+                    else => break,
+                }
+            }
+
+            self.teardown().await
+        }
+        .instrument(span)
+        .await
+    }
+
+    async fn teardown(mut self) -> Result<(), RunError> {
+        info!("Pipeline tearing down...");
+        self.chain_head.on_close(&mut self.ctx).await?;
+        Ok(())
+    }
+}
+
+pub type SubtaskRunner = Pipeline;
diff --git a/src/runtime/streaming/execution/source.rs b/src/runtime/streaming/execution/source.rs
new file mode 100644
index 00000000..a85b0839
--- /dev/null
+++
b/src/runtime/streaming/execution/source.rs
@@ -0,0 +1,180 @@
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+use crate::runtime::streaming::api::context::TaskContext;
+use crate::runtime::streaming::api::source::{SourceEvent, SourceOperator};
+use crate::runtime::streaming::error::RunError;
+use crate::runtime::streaming::execution::runner::OperatorDrive;
+use crate::runtime::streaming::protocol::control::ControlCommand;
+use crate::runtime::streaming::protocol::event::StreamEvent;
+use crate::runtime::streaming::protocol::tracked::TrackedEvent;
+use crate::sql::common::CheckpointBarrier;
+use std::time::Duration;
+use tokio::sync::mpsc::Receiver;
+use tokio::time::{interval, MissedTickBehavior};
+use tracing::{info, info_span, warn, Instrument};
+
+// Back-off period after a source reports Idle.
+pub const SOURCE_IDLE_SLEEP: Duration = Duration::from_millis(50);
+// How often poll_watermark is consulted.
+pub const WATERMARK_EMIT_INTERVAL: Duration = Duration::from_millis(200);
+
+/// Event loop for one source subtask: fetches from the source operator and
+/// feeds an optional chained operator head (or the outboxes directly).
+pub struct SourceRunner {
+    operator: Box<dyn SourceOperator>,
+    chain_head: Option<Box<dyn OperatorDrive>>,
+    ctx: TaskContext,
+    control_rx: Receiver<ControlCommand>,
+}
+
+impl SourceRunner {
+    pub fn new(
+        operator: Box<dyn SourceOperator>,
+        chain_head: Option<Box<dyn OperatorDrive>>,
+        ctx: TaskContext,
+        control_rx: Receiver<ControlCommand>,
+    ) -> Self {
+        Self {
+            operator,
+            chain_head,
+            ctx,
+            control_rx,
+        }
+    }
+
+    pub async fn run(mut self) -> Result<(), RunError> {
+        let span = info_span!(
+            "source_run",
+            vertex = self.ctx.vertex_id,
+            op = self.operator.name()
+        );
+
+        async move {
+            info!("Source subtask starting");
+            self.operator.on_start(&mut self.ctx).await?;
+            if let Some(chain) = &mut self.chain_head {
+                chain.on_start(&mut self.ctx).await?;
+            }
+
+            let mut idle_timer = interval(SOURCE_IDLE_SLEEP);
+            idle_timer.set_missed_tick_behavior(MissedTickBehavior::Skip);
+
+            let mut wm_timer = interval(WATERMARK_EMIT_INTERVAL);
+            wm_timer.set_missed_tick_behavior(MissedTickBehavior::Skip);
+
+            let mut is_idle = false;
+            let mut is_running = true;
+
+            while is_running {
+                tokio::select! {
+                    // Control commands take priority over fetching.
+                    biased;
+
+                    cmd_opt = self.control_rx.recv() => {
+                        match cmd_opt {
+                            // Channel closed: coordinator is gone, shut down.
+                            None => is_running = false,
+                            Some(cmd) => {
+                                if self.handle_control(cmd).await? {
+                                    is_running = false;
+                                }
+                            }
+                        }
+                    }
+
+                    _ = wm_timer.tick() => {
+                        if let Some(wm) = self.operator.poll_watermark() {
+                            self.dispatch_event(StreamEvent::Watermark(wm)).await?;
+                        }
+                    }
+
+                    // Idle back-off: resume fetching after one idle period.
+                    _ = idle_timer.tick(), if is_idle => {
+                        is_idle = false;
+                    }
+
+                    fetch_res = self.operator.fetch_next(&mut self.ctx), if !is_idle => {
+                        match fetch_res {
+                            Ok(SourceEvent::Data(batch)) => {
+                                self.dispatch_event(StreamEvent::Data(batch)).await?;
+                            }
+                            Ok(SourceEvent::Watermark(wm)) => {
+                                self.dispatch_event(StreamEvent::Watermark(wm)).await?;
+                            }
+                            Ok(SourceEvent::Idle) => {
+                                is_idle = true;
+                                idle_timer.reset();
+                            }
+                            Ok(SourceEvent::EndOfStream) => {
+                                self.dispatch_event(StreamEvent::EndOfStream).await?;
+                                is_running = false;
+                            }
+                            Err(e) => {
+                                warn!("fetch_next error: {}", e);
+                                return Err(RunError::Operator(e));
+                            }
+                        }
+                    }
+                }
+            }
+
+            self.teardown().await
+        }
+        .instrument(span)
+        .await
+    }
+
+    /// Pushes an event into the chain if present, otherwise straight to the outboxes.
+    async fn dispatch_event(&mut self, event: StreamEvent) -> Result<(), RunError> {
+        if let Some(chain) = &mut self.chain_head {
+            let _stop = chain
+                .process_event(0, TrackedEvent::control(event), &mut self.ctx)
+                .await?;
+        } else {
+            match event {
+                StreamEvent::Data(b) => self.ctx.collect(b).await?,
+                StreamEvent::Watermark(w) => {
+                    self.ctx.broadcast(StreamEvent::Watermark(w)).await?;
+                }
+                StreamEvent::Barrier(b) => {
+                    self.ctx.broadcast(StreamEvent::Barrier(b)).await?;
+                }
+                StreamEvent::EndOfStream => {
+                    self.ctx.broadcast(StreamEvent::EndOfStream).await?;
+                }
+            }
+        }
+        Ok(())
+    }
+
+    /// Returns `Ok(true)` when the runner should stop.
+    async fn handle_control(&mut self, cmd: ControlCommand) -> Result<bool, RunError> {
+        match cmd {
+            ControlCommand::TriggerCheckpoint { barrier } => {
+                let b: CheckpointBarrier = barrier.into();
+                self.operator.snapshot_state(b.clone(), &mut self.ctx).await?;
+                self.dispatch_event(StreamEvent::Barrier(b)).await?;
+            }
+            ControlCommand::Stop { .. } => return Ok(true),
+            other => {
+                if let Some(chain) = &mut self.chain_head {
+                    if chain.handle_control(other, &mut self.ctx).await? {
+                        return Ok(true);
+                    }
+                }
+            }
+        }
+        Ok(false)
+    }
+
+    async fn teardown(mut self) -> Result<(), RunError> {
+        self.operator.on_close(&mut self.ctx).await?;
+        if let Some(chain) = &mut self.chain_head {
+            chain.on_close(&mut self.ctx).await?;
+        }
+        info!("Source subtask shutdown");
+        Ok(())
+    }
+}
diff --git a/src/runtime/streaming/execution/tracker/barrier_aligner.rs b/src/runtime/streaming/execution/tracker/barrier_aligner.rs
new file mode 100644
index 00000000..b227e439
--- /dev/null
+++ b/src/runtime/streaming/execution/tracker/barrier_aligner.rs
@@ -0,0 +1,56 @@
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ + +use std::collections::HashSet; + +use crate::sql::common::CheckpointBarrier; + +#[derive(Debug)] +pub enum AlignmentStatus { + Pending, + Complete, +} + +#[derive(Debug)] +pub struct BarrierAligner { + input_count: usize, + current_epoch: Option, + reached_inputs: HashSet, +} + +impl BarrierAligner { + pub fn new(input_count: usize) -> Self { + Self { + input_count, + current_epoch: None, + reached_inputs: HashSet::new(), + } + } + + pub fn mark(&mut self, input_idx: usize, barrier: &CheckpointBarrier) -> AlignmentStatus { + if self.current_epoch != Some(barrier.epoch) { + self.current_epoch = Some(barrier.epoch); + self.reached_inputs.clear(); + } + + self.reached_inputs.insert(input_idx); + + if self.reached_inputs.len() == self.input_count { + self.current_epoch = None; + self.reached_inputs.clear(); + AlignmentStatus::Complete + } else { + AlignmentStatus::Pending + } + } +} diff --git a/src/runtime/streaming/execution/tracker/mod.rs b/src/runtime/streaming/execution/tracker/mod.rs new file mode 100644 index 00000000..3206f352 --- /dev/null +++ b/src/runtime/streaming/execution/tracker/mod.rs @@ -0,0 +1,16 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ + +pub mod barrier_aligner; +pub mod watermark_tracker; + diff --git a/src/runtime/streaming/execution/tracker/watermark_tracker.rs b/src/runtime/streaming/execution/tracker/watermark_tracker.rs new file mode 100644 index 00000000..6304b4c3 --- /dev/null +++ b/src/runtime/streaming/execution/tracker/watermark_tracker.rs @@ -0,0 +1,109 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use crate::runtime::streaming::protocol::watermark::{merge_watermarks, watermark_strictly_advances}; +use crate::sql::common::Watermark; + +#[derive(Debug)] +pub struct WatermarkTracker { + watermarks: Vec>, + current_min_watermark: Option, + eof_count: usize, +} + +impl WatermarkTracker { + pub fn new(input_count: usize) -> Self { + Self { + watermarks: vec![None; input_count], + current_min_watermark: None, + eof_count: 0, + } + } + + pub fn update(&mut self, input_idx: usize, wm: Watermark) -> Option { + self.watermarks[input_idx] = Some(wm); + + if self.watermarks.iter().any(|w| w.is_none()) { + return None; + } + + let new_min = merge_watermarks(&self.watermarks)?; + + if !watermark_strictly_advances(new_min, self.current_min_watermark) { + return None; + } + + self.current_min_watermark = Some(new_min); + Some(new_min) + } + + pub fn increment_eof(&mut self) -> usize { + self.eof_count += 1; + self.eof_count + } + + pub fn input_count(&self) -> usize { + self.watermarks.len() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::time::{Duration, SystemTime}; + 
+ #[test] + fn no_emit_until_all_inputs_seen() { + let mut t = WatermarkTracker::new(2); + let w = Watermark::EventTime(SystemTime::UNIX_EPOCH + Duration::from_secs(3)); + assert!(t.update(0, w).is_none()); + let w2 = Watermark::EventTime(SystemTime::UNIX_EPOCH + Duration::from_secs(1)); + assert_eq!(t.update(1, w2), Some(w2)); + } + + #[test] + fn dedup_same_aligned() { + let mut t = WatermarkTracker::new(1); + let w = Watermark::EventTime(SystemTime::UNIX_EPOCH + Duration::from_secs(1)); + assert_eq!(t.update(0, w), Some(w)); + assert!(t.update(0, w).is_none()); + } + + #[test] + fn advances_only_when_min_strictly_increases() { + let mut t = WatermarkTracker::new(2); + let t1 = SystemTime::UNIX_EPOCH + Duration::from_secs(1); + let t5 = SystemTime::UNIX_EPOCH + Duration::from_secs(5); + assert!(t.update(0, Watermark::EventTime(t5)).is_none()); + assert_eq!(t.update(1, Watermark::EventTime(t1)), Some(Watermark::EventTime(t1))); + let t3 = SystemTime::UNIX_EPOCH + Duration::from_secs(3); + assert_eq!( + t.update(1, Watermark::EventTime(t3)), + Some(Watermark::EventTime(t3)) + ); + assert!(t.update(1, Watermark::EventTime(t3)).is_none()); + } + + #[test] + fn backward_aligned_min_is_ignored() { + let mut t = WatermarkTracker::new(2); + let t5 = SystemTime::UNIX_EPOCH + Duration::from_secs(5); + let t10 = SystemTime::UNIX_EPOCH + Duration::from_secs(10); + assert!(t.update(0, Watermark::EventTime(t10)).is_none()); + assert_eq!( + t.update(1, Watermark::EventTime(t5)), + Some(Watermark::EventTime(t5)) + ); + let t2 = SystemTime::UNIX_EPOCH + Duration::from_secs(2); + assert!(t.update(0, Watermark::EventTime(t2)).is_none()); + } +} diff --git a/src/runtime/streaming/factory/connector/dispatchers.rs b/src/runtime/streaming/factory/connector/dispatchers.rs new file mode 100644 index 00000000..40e7242c --- /dev/null +++ b/src/runtime/streaming/factory/connector/dispatchers.rs @@ -0,0 +1,37 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may 
not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::Arc; + +use anyhow::Result; + +use crate::runtime::streaming::api::operator::ConstructedOperator; +use crate::runtime::streaming::factory::global::Registry; +use crate::runtime::streaming::factory::operator_constructor::OperatorConstructor; + +use super::kafka::ConnectorDispatcher; + +pub struct ConnectorSourceDispatcher; + +impl OperatorConstructor for ConnectorSourceDispatcher { + fn with_config(&self, config: &[u8], registry: Arc) -> Result { + ConnectorDispatcher.with_config(config, registry) + } +} + +pub struct ConnectorSinkDispatcher; + +impl OperatorConstructor for ConnectorSinkDispatcher { + fn with_config(&self, config: &[u8], registry: Arc) -> Result { + ConnectorDispatcher.with_config(config, registry) + } +} diff --git a/src/runtime/streaming/factory/connector/kafka.rs b/src/runtime/streaming/factory/connector/kafka.rs new file mode 100644 index 00000000..a55ef477 --- /dev/null +++ b/src/runtime/streaming/factory/connector/kafka.rs @@ -0,0 +1,262 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +use anyhow::{bail, Context, Result}; +use prost::Message; +use std::collections::HashMap; +use std::num::NonZeroU32; +use std::sync::Arc; + +use protocol::grpc::api::connector_op::Config; +use protocol::grpc::api::{ + BadDataPolicy, ConnectorOp, DecimalEncodingProto, FormatConfig, + KafkaAuthConfig, KafkaOffsetMode, KafkaReadMode, KafkaSinkCommitMode, KafkaSinkConfig, + KafkaSourceConfig, TimestampFormatProto, +}; +use tracing::info; + +use crate::runtime::streaming::api::operator::ConstructedOperator; +use crate::runtime::streaming::api::source::SourceOffset; +use crate::runtime::streaming::factory::global::Registry; +use crate::runtime::streaming::factory::operator_constructor::OperatorConstructor; +use crate::runtime::streaming::format::{ + BadDataPolicy as RtBadDataPolicy, DataSerializer, DecimalEncoding as RtDecimalEncoding, + Format as RuntimeFormat, JsonFormat as RuntimeJsonFormat, + TimestampFormat as RtTimestampFormat, +}; +use crate::runtime::streaming::operators::sink::kafka::{ConsistencyMode, KafkaSinkOperator}; +use crate::runtime::streaming::operators::source::kafka::{ + BufferedDeserializer, KafkaSourceOperator, +}; +use crate::sql::common::FsSchema; + +const DEFAULT_SOURCE_BATCH_SIZE: usize = 1024; + +// ─────────────── Proto → Runtime type conversions ─────────────── + +fn proto_format_to_runtime(fmt: &Option) -> Result { + let cfg = fmt.as_ref().context("FormatConfig is required")?; + match &cfg.format { + Some(protocol::grpc::api::format_config::Format::Json(j)) => { + Ok(RuntimeFormat::Json(RuntimeJsonFormat { + timestamp_format: match j.timestamp_format() { + TimestampFormatProto::TimestampRfc3339 => RtTimestampFormat::RFC3339, + TimestampFormatProto::TimestampUnixMillis => RtTimestampFormat::UnixMillis, + }, + decimal_encoding: match j.decimal_encoding() { + DecimalEncodingProto::DecimalNumber => RtDecimalEncoding::Number, + 
DecimalEncodingProto::DecimalString => RtDecimalEncoding::String, + DecimalEncodingProto::DecimalBytes => RtDecimalEncoding::Bytes, + }, + include_schema: j.include_schema, + })) + } + Some(protocol::grpc::api::format_config::Format::RawString(_)) => { + Ok(RuntimeFormat::RawString) + } + Some(protocol::grpc::api::format_config::Format::RawBytes(_)) => { + Ok(RuntimeFormat::RawBytes) + } + None => bail!("FormatConfig has no format variant set"), + } +} + +fn proto_bad_data_to_runtime(policy: i32) -> RtBadDataPolicy { + match BadDataPolicy::try_from(policy) { + Ok(BadDataPolicy::BadDataDrop) => RtBadDataPolicy::Drop, + _ => RtBadDataPolicy::Fail, + } +} + +fn proto_offset_to_runtime(mode: i32) -> SourceOffset { + match KafkaOffsetMode::try_from(mode) { + Ok(KafkaOffsetMode::KafkaOffsetLatest) => SourceOffset::Latest, + Ok(KafkaOffsetMode::KafkaOffsetEarliest) => SourceOffset::Earliest, + _ => SourceOffset::Group, + } +} + +fn build_auth_client_configs(auth: &Option) -> HashMap { + let mut out = HashMap::new(); + let Some(auth) = auth else { return out }; + match &auth.auth { + Some(protocol::grpc::api::kafka_auth_config::Auth::Sasl(sasl)) => { + out.insert("security.protocol".to_string(), sasl.protocol.clone()); + out.insert("sasl.mechanism".to_string(), sasl.mechanism.clone()); + out.insert("sasl.username".to_string(), sasl.username.clone()); + out.insert("sasl.password".to_string(), sasl.password.clone()); + } + Some(protocol::grpc::api::kafka_auth_config::Auth::AwsMskIam(iam)) => { + out.insert("security.protocol".to_string(), "SASL_SSL".to_string()); + out.insert("sasl.mechanism".to_string(), "OAUTHBEARER".to_string()); + out.insert( + "sasl.oauthbearer.extensions".to_string(), + format!("logicalCluster=aws_msk;aws_region={}", iam.region), + ); + } + _ => {} + } + out +} + +fn merge_client_configs( + auth: &Option, + extra: &HashMap, +) -> HashMap { + let mut configs = build_auth_client_configs(auth); + for (k, v) in extra { + configs.insert(k.clone(), 
v.clone()); + } + configs +} + +// ─────────────── Unified Connector Dispatcher ─────────────── + +pub struct ConnectorDispatcher; + +impl OperatorConstructor for ConnectorDispatcher { + fn with_config(&self, payload: &[u8], _registry: Arc) -> Result { + let op = ConnectorOp::decode(payload) + .context("Failed to decode ConnectorOp protobuf")?; + + let fs_schema = op + .fs_schema + .as_ref() + .map(|fs| FsSchema::try_from(fs.clone())) + .transpose() + .map_err(|e| anyhow::anyhow!("{e}"))?; + + match op.config { + Some(Config::KafkaSource(ref cfg)) => { + Self::build_kafka_source(&op.name, cfg, fs_schema) + } + Some(Config::KafkaSink(ref cfg)) => { + Self::build_kafka_sink(&op.name, cfg, fs_schema) + } + Some(Config::Generic(_)) => bail!( + "ConnectorOp '{}': GenericConnectorConfig dispatch not yet implemented", + op.name + ), + None => bail!("ConnectorOp '{}' has no configuration payload", op.name), + } + } +} + +impl ConnectorDispatcher { + fn build_kafka_source( + _name: &str, + cfg: &KafkaSourceConfig, + fs_schema: Option, + ) -> Result { + info!(topic = %cfg.topic, "Constructing Kafka Source"); + + let fs = fs_schema.context("fs_schema is required for Kafka Source")?; + let client_configs = merge_client_configs(&cfg.auth, &cfg.client_configs); + + let mut final_configs = client_configs; + if cfg.read_mode() == KafkaReadMode::KafkaReadCommitted { + final_configs.insert("isolation.level".to_string(), "read_committed".to_string()); + } + + let runtime_format = proto_format_to_runtime(&cfg.format)?; + let bad_data = proto_bad_data_to_runtime(cfg.bad_data_policy); + + let deserializer = Box::new(BufferedDeserializer::new( + runtime_format, + fs.schema.clone(), + bad_data, + DEFAULT_SOURCE_BATCH_SIZE, + )); + + let rate = NonZeroU32::new(cfg.rate_limit_msgs_per_sec.max(1)) + .unwrap_or_else(|| NonZeroU32::new(1_000_000).expect("nonzero")); + + let source_op = KafkaSourceOperator::new( + cfg.topic.clone(), + cfg.bootstrap_servers.clone(), + cfg.group_id.clone(), + 
cfg.group_id_prefix.clone(), + proto_offset_to_runtime(cfg.offset_mode), + final_configs, + rate, + vec![], + deserializer, + ); + + Ok(ConstructedOperator::Source(Box::new(source_op))) + } + + fn build_kafka_sink( + _name: &str, + cfg: &KafkaSinkConfig, + fs_schema: Option, + ) -> Result { + info!(topic = %cfg.topic, "Constructing Kafka Sink"); + + let fs_in = fs_schema.context("fs_schema is required for Kafka Sink")?; + let client_configs = merge_client_configs(&cfg.auth, &cfg.client_configs); + + let consistency = match cfg.commit_mode() { + KafkaSinkCommitMode::KafkaSinkExactlyOnce => ConsistencyMode::ExactlyOnce, + KafkaSinkCommitMode::KafkaSinkAtLeastOnce => ConsistencyMode::AtLeastOnce, + }; + + let runtime_format = proto_format_to_runtime(&cfg.format)?; + let fs = sink_fs_schema_adjusted(fs_in, &cfg.key_field, &cfg.timestamp_field)?; + let serializer = DataSerializer::new(runtime_format, fs.schema.clone()); + + let sink_op = KafkaSinkOperator::new( + cfg.topic.clone(), + cfg.bootstrap_servers.clone(), + consistency, + client_configs, + fs, + serializer, + ); + + Ok(ConstructedOperator::Operator(Box::new(sink_op))) + } +} + +fn sink_fs_schema_adjusted( + fs: FsSchema, + key_field: &Option, + timestamp_field: &Option, +) -> Result { + if key_field.is_none() && timestamp_field.is_none() { + return Ok(fs); + } + let schema = fs.schema.clone(); + let ts = if let Some(name) = timestamp_field { + schema + .column_with_name(name) + .ok_or_else(|| anyhow::anyhow!("timestamp column '{name}' not found in schema"))? + .0 + } else { + fs.timestamp_index + }; + let keys = fs.clone_storage_key_indices(); + let routing = if let Some(name) = key_field { + let k = schema + .column_with_name(name) + .ok_or_else(|| anyhow::anyhow!("key column '{name}' not found in schema"))? 
+ .0; + Some(vec![k]) + } else { + fs.clone_routing_key_indices() + }; + Ok(FsSchema::new(schema, ts, keys, routing)) +} + +// Legacy dispatcher aliases kept for backward compatibility with factory registration. +pub type KafkaSourceDispatcher = ConnectorDispatcher; +pub type KafkaSinkDispatcher = ConnectorDispatcher; diff --git a/src/runtime/streaming/factory/connector/mod.rs b/src/runtime/streaming/factory/connector/mod.rs new file mode 100644 index 00000000..be63478d --- /dev/null +++ b/src/runtime/streaming/factory/connector/mod.rs @@ -0,0 +1,18 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + +mod dispatchers; +pub mod kafka; + +pub use dispatchers::{ConnectorSinkDispatcher, ConnectorSourceDispatcher}; +pub use kafka::{KafkaSinkDispatcher, KafkaSourceDispatcher}; diff --git a/src/runtime/streaming/factory/global/mod.rs b/src/runtime/streaming/factory/global/mod.rs new file mode 100644 index 00000000..0dc2130e --- /dev/null +++ b/src/runtime/streaming/factory/global/mod.rs @@ -0,0 +1,16 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + + +mod session_registry; + +pub use session_registry::Registry; diff --git a/src/runtime/streaming/factory/global/session_registry.rs b/src/runtime/streaming/factory/global/session_registry.rs new file mode 100644 index 00000000..4b7895a2 --- /dev/null +++ b/src/runtime/streaming/factory/global/session_registry.rs @@ -0,0 +1,60 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ + +use std::collections::HashSet; +use std::sync::Arc; + +use datafusion::common::Result as DfResult; +use datafusion::execution::context::SessionContext; +use datafusion::execution::FunctionRegistry; +use datafusion::logical_expr::planner::ExprPlanner; +use datafusion::logical_expr::{AggregateUDF, ScalarUDF, WindowUDF}; + +/// +pub struct Registry { + ctx: SessionContext, +} + +impl Registry { + pub fn new() -> Self { + Self { + ctx: SessionContext::new(), + } + } + + pub fn session_context(&self) -> &SessionContext { + &self.ctx + } +} + +impl FunctionRegistry for Registry { + fn udfs(&self) -> HashSet { + self.ctx.udfs() + } + + fn udf(&self, name: &str) -> DfResult> { + self.ctx.udf(name) + } + + fn udaf(&self, name: &str) -> DfResult> { + self.ctx.udaf(name) + } + + fn udwf(&self, name: &str) -> DfResult> { + self.ctx.udwf(name) + } + + fn expr_planners(&self) -> Vec> { + self.ctx.expr_planners() + } +} diff --git a/src/runtime/streaming/factory/mod.rs b/src/runtime/streaming/factory/mod.rs new file mode 100644 index 00000000..f02ec955 --- /dev/null +++ b/src/runtime/streaming/factory/mod.rs @@ -0,0 +1,56 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ + +pub mod connector; +pub mod global; + +mod operator_constructor; +mod operator_factory; + +use tracing::info; + +use crate::sql::common::constants::factory_operator_name; + +#[allow(unused_imports)] +pub use connector::{ + ConnectorSinkDispatcher, ConnectorSourceDispatcher, KafkaSinkDispatcher, KafkaSourceDispatcher, +}; +pub use global::Registry; +pub use operator_factory::OperatorFactory; + +fn register_builtin_connectors(factory: &mut OperatorFactory) { + factory.register( + factory_operator_name::CONNECTOR_SOURCE, + Box::new(connector::ConnectorSourceDispatcher), + ); + factory.register( + factory_operator_name::CONNECTOR_SINK, + Box::new(connector::ConnectorSinkDispatcher), + ); +} + +fn register_kafka_connector_plugins(factory: &mut OperatorFactory) { + factory.register( + factory_operator_name::KAFKA_SOURCE, + Box::new(connector::kafka::ConnectorDispatcher), + ); + factory.register( + factory_operator_name::KAFKA_SINK, + Box::new(connector::kafka::ConnectorDispatcher), + ); + info!( + "Registered Kafka connector plugins ({}, {})", + factory_operator_name::KAFKA_SOURCE, + factory_operator_name::KAFKA_SINK + ); +} \ No newline at end of file diff --git a/src/runtime/streaming/factory/operator_constructor.rs b/src/runtime/streaming/factory/operator_constructor.rs new file mode 100644 index 00000000..832fe734 --- /dev/null +++ b/src/runtime/streaming/factory/operator_constructor.rs @@ -0,0 +1,23 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ + +use anyhow::Result; +use std::sync::Arc; + +use crate::runtime::streaming::api::operator::ConstructedOperator; +use crate::runtime::streaming::factory::global::Registry; + +/// +pub trait OperatorConstructor: Send + Sync { + fn with_config(&self, config: &[u8], registry: Arc) -> Result; +} diff --git a/src/runtime/streaming/factory/operator_factory.rs b/src/runtime/streaming/factory/operator_factory.rs new file mode 100644 index 00000000..5a2dc26f --- /dev/null +++ b/src/runtime/streaming/factory/operator_factory.rs @@ -0,0 +1,261 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ + +use anyhow::{anyhow, Result}; +use prost::Message; +use std::collections::HashMap; +use std::sync::Arc; +use protocol::grpc::api::ProjectionOperator as ProjectionOperatorProto; +use super::operator_constructor::OperatorConstructor; +use crate::runtime::streaming::api::operator::ConstructedOperator; +use crate::runtime::streaming::factory::connector::{ + ConnectorSinkDispatcher, ConnectorSourceDispatcher, +}; +use crate::runtime::streaming::factory::global::Registry; +use crate::runtime::streaming::operators::grouping::IncrementalAggregatingConstructor; +use crate::runtime::streaming::operators::joins::{ + InstantJoinConstructor, JoinWithExpirationConstructor, +}; +use crate::runtime::streaming::operators::key_by::KeyByConstructor; +use crate::runtime::streaming::operators::watermark::WatermarkGeneratorConstructor; +use crate::runtime::streaming::operators::windows::{ + SessionAggregatingWindowConstructor, SlidingAggregatingWindowConstructor, + TumblingAggregateWindowConstructor, WindowFunctionConstructor, +}; +use crate::runtime::streaming::operators::{ProjectionOperator, StatelessPhysicalExecutor, ValueExecutionOperator}; +use protocol::grpc::api::{ + ExpressionWatermarkConfig, JoinOperator as JoinOperatorProto, + KeyPlanOperator as KeyByProto, + SessionWindowAggregateOperator, SlidingWindowAggregateOperator, TumblingWindowAggregateOperator, + UpdatingAggregateOperator, ValuePlanOperator, WindowFunctionOperator as WindowFunctionProto, +}; + +use crate::sql::logical_node::logical::OperatorName; + +/// +pub struct OperatorFactory { + constructors: HashMap>, + registry: Arc, +} + +impl OperatorFactory { + pub fn new(registry: Arc) -> Self { + let mut factory = Self { + constructors: HashMap::new(), + registry, + }; + factory.register_builtins(); + factory + } + + pub fn register(&mut self, name: &str, constructor: Box) { + self.constructors.insert(name.to_string(), constructor); + } + + pub fn register_named(&mut self, name: OperatorName, constructor: Box) { + 
self.register(name.as_registry_key(), constructor); + } + + pub fn create_operator(&self, name: &str, payload: &[u8]) -> Result { + let ctor = self + .constructors + .get(name) + .ok_or_else(|| { + anyhow!( + "FATAL: Operator '{}' not found in Factory Registry. \ + Ensure the worker is compiled with the correct plugins.", + name + ) + })?; + + ctor.with_config(payload, self.registry.clone()) + } + + pub fn registered_operators(&self) -> Vec<&str> { + self.constructors.keys().map(|s| s.as_str()).collect() + } + + fn register_builtins(&mut self) { + self.register_named(OperatorName::TumblingWindowAggregate, Box::new(TumblingWindowBridge)); + self.register_named(OperatorName::SlidingWindowAggregate, Box::new(SlidingWindowBridge)); + self.register_named(OperatorName::SessionWindowAggregate, Box::new(SessionWindowBridge)); + + self.register_named(OperatorName::ExpressionWatermark, Box::new(WatermarkBridge)); + + // ─── SQL Window Function ─── + self.register_named(OperatorName::WindowFunction, Box::new(WindowFunctionBridge)); + + // ─── Join ─── + self.register_named(OperatorName::Join, Box::new(JoinWithExpirationBridge)); + self.register_named(OperatorName::InstantJoin, Box::new(InstantJoinBridge)); + self.register_named(OperatorName::LookupJoin, Box::new(LookupJoinBridge)); + + self.register_named(OperatorName::UpdatingAggregate, Box::new(IncrementalAggregateBridge)); + + self.register_named(OperatorName::KeyBy, Box::new(KeyByBridge)); + + self.register_named(OperatorName::Projection, Box::new(ProjectionConstructor)); + self.register_named(OperatorName::Value, Box::new(ValueBridge)); + self.register_named(OperatorName::ConnectorSource, Box::new(ConnectorSourceBridge)); + self.register_named(OperatorName::ConnectorSink, Box::new(ConnectorSinkBridge)); + + crate::runtime::streaming::factory::register_kafka_connector_plugins(self); + } +} + +struct TumblingWindowBridge; +impl OperatorConstructor for TumblingWindowBridge { + fn with_config(&self, config: &[u8], registry: 
Arc) -> Result { + let proto = TumblingWindowAggregateOperator::decode(config) + .map_err(|e| anyhow!("Decode TumblingWindowAggregateOperator failed: {e}"))?; + let op = TumblingAggregateWindowConstructor.with_config(proto, registry)?; + Ok(ConstructedOperator::Operator(Box::new(op))) + } +} + +struct SlidingWindowBridge; +impl OperatorConstructor for SlidingWindowBridge { + fn with_config(&self, config: &[u8], registry: Arc) -> Result { + let proto = SlidingWindowAggregateOperator::decode(config) + .map_err(|e| anyhow!("Decode SlidingWindowAggregateOperator failed: {e}"))?; + let op = SlidingAggregatingWindowConstructor.with_config(proto, registry)?; + Ok(ConstructedOperator::Operator(Box::new(op))) + } +} + +struct SessionWindowBridge; +impl OperatorConstructor for SessionWindowBridge { + fn with_config(&self, config: &[u8], registry: Arc) -> Result { + let proto = SessionWindowAggregateOperator::decode(config) + .map_err(|e| anyhow!("Decode SessionWindowAggregateOperator failed: {e}"))?; + let op = SessionAggregatingWindowConstructor.with_config(proto, registry)?; + Ok(ConstructedOperator::Operator(Box::new(op))) + } +} + +struct WatermarkBridge; +impl OperatorConstructor for WatermarkBridge { + fn with_config(&self, config: &[u8], registry: Arc) -> Result { + let proto = ExpressionWatermarkConfig::decode(config) + .map_err(|e| anyhow!("Decode ExpressionWatermarkConfig failed: {e}"))?; + let op = WatermarkGeneratorConstructor.with_config(proto, registry)?; + Ok(ConstructedOperator::Operator(Box::new(op))) + } +} + +struct WindowFunctionBridge; +impl OperatorConstructor for WindowFunctionBridge { + fn with_config(&self, config: &[u8], registry: Arc) -> Result { + let proto = WindowFunctionProto::decode(config) + .map_err(|e| anyhow!("Decode WindowFunctionOperator failed: {e}"))?; + let op = WindowFunctionConstructor.with_config(proto, registry)?; + Ok(ConstructedOperator::Operator(Box::new(op))) + } +} + +struct JoinWithExpirationBridge; +impl OperatorConstructor 
for JoinWithExpirationBridge { + fn with_config(&self, config: &[u8], registry: Arc) -> Result { + let proto = JoinOperatorProto::decode(config) + .map_err(|e| anyhow!("Decode JoinOperator (expiration) failed: {e}"))?; + let op = JoinWithExpirationConstructor.with_config(proto, registry)?; + Ok(ConstructedOperator::Operator(Box::new(op))) + } +} + +struct InstantJoinBridge; +impl OperatorConstructor for InstantJoinBridge { + fn with_config(&self, config: &[u8], registry: Arc) -> Result { + let proto = JoinOperatorProto::decode(config) + .map_err(|e| anyhow!("Decode JoinOperator (instant) failed: {e}"))?; + let op = InstantJoinConstructor.with_config(proto, registry)?; + Ok(ConstructedOperator::Operator(Box::new(op))) + } +} + +struct LookupJoinBridge; +impl OperatorConstructor for LookupJoinBridge { + fn with_config(&self, _config: &[u8], _registry: Arc) -> Result { + Err(anyhow!("LookupJoin is not supported in the current runtime")) + } +} + +struct IncrementalAggregateBridge; +impl OperatorConstructor for IncrementalAggregateBridge { + fn with_config(&self, config: &[u8], registry: Arc) -> Result { + let proto = UpdatingAggregateOperator::decode(config) + .map_err(|e| anyhow!("Decode UpdatingAggregateOperator failed: {e}"))?; + let op = IncrementalAggregatingConstructor.with_config(proto, registry)?; + Ok(ConstructedOperator::Operator(Box::new(op))) + } +} + +struct KeyByBridge; +impl OperatorConstructor for KeyByBridge { + fn with_config(&self, config: &[u8], _registry: Arc) -> Result { + let proto = KeyByProto::decode(config) + .map_err(|e| anyhow!("Decode KeyPlanOperator failed: {e}"))?; + let op = KeyByConstructor.with_config(proto)?; + Ok(ConstructedOperator::Operator(Box::new(op))) + } +} + +pub struct ProjectionConstructor; + +impl OperatorConstructor for ProjectionConstructor { + fn with_config(&self, payload: &[u8], registry: Arc) -> Result { + let proto = ProjectionOperatorProto::decode(payload)?; + let op = ProjectionOperator::from_proto(proto, 
registry)?; + Ok(ConstructedOperator::Operator(Box::new(op))) + } +} + +struct ValueBridge; +impl OperatorConstructor for ValueBridge { + fn with_config(&self, config: &[u8], registry: Arc) -> Result { + let proto = ValuePlanOperator::decode(config) + .map_err(|e| anyhow!("Decode ValuePlanOperator failed: {e}"))?; + let op = ValueExecutionConstructor.with_config(proto, registry)?; + Ok(ConstructedOperator::Operator(Box::new(op))) + } +} + +/// Generic connector source constructor: decodes `ConnectorOp` and dispatches by connector type. +struct ConnectorSourceBridge; +impl OperatorConstructor for ConnectorSourceBridge { + fn with_config(&self, config: &[u8], registry: Arc) -> Result { + ConnectorSourceDispatcher.with_config(config, registry) + } +} + +/// Generic connector sink constructor: decodes `ConnectorOp` and dispatches by connector type. +struct ConnectorSinkBridge; +impl OperatorConstructor for ConnectorSinkBridge { + fn with_config(&self, config: &[u8], registry: Arc) -> Result { + ConnectorSinkDispatcher.with_config(config, registry) + } +} + + +struct ValueExecutionConstructor; +impl ValueExecutionConstructor { + fn with_config( + &self, + config: ValuePlanOperator, + registry: Arc, + ) -> Result { + let executor = StatelessPhysicalExecutor::new(&config.physical_plan, registry.as_ref()) + .map_err(|e| anyhow!("build value execution plan '{}': {e}", config.name))?; + Ok(ValueExecutionOperator::new(config.name, executor)) + } +} \ No newline at end of file diff --git a/src/runtime/streaming/format/config.rs b/src/runtime/streaming/format/config.rs new file mode 100644 index 00000000..15a58008 --- /dev/null +++ b/src/runtime/streaming/format/config.rs @@ -0,0 +1,47 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use serde::{Deserialize, Serialize}; + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub enum TimestampFormat { + RFC3339, + UnixMillis, + UnixSeconds, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub enum DecimalEncoding { + String, + Number, + Bytes, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub enum BadDataPolicy { + Fail, + Drop, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct JsonFormat { + pub timestamp_format: TimestampFormat, + pub decimal_encoding: DecimalEncoding, + pub include_schema: bool, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub enum Format { + Json(JsonFormat), + RawString, + RawBytes, +} diff --git a/src/runtime/streaming/format/deserializer.rs b/src/runtime/streaming/format/deserializer.rs new file mode 100644 index 00000000..3e9e6d66 --- /dev/null +++ b/src/runtime/streaming/format/deserializer.rs @@ -0,0 +1,95 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ + +use anyhow::{anyhow, Result}; +use arrow_array::builder::StringBuilder; +use arrow_array::RecordBatch; +use arrow_json::reader::ReaderBuilder; +use arrow_schema::SchemaRef; +use std::sync::Arc; + +use super::config::{BadDataPolicy, Format}; + +pub struct DataDeserializer { + format: Format, + schema: SchemaRef, + bad_data_policy: BadDataPolicy, +} + +impl DataDeserializer { + pub fn new(format: Format, schema: SchemaRef, bad_data_policy: BadDataPolicy) -> Self { + Self { + format, + schema, + bad_data_policy, + } + } + + pub fn deserialize_batch(&self, messages: &[&[u8]]) -> Result { + match &self.format { + Format::Json(_) => self.deserialize_json(messages), + Format::RawString => self.deserialize_raw_string(messages), + Format::RawBytes => self.deserialize_raw_bytes(messages), + } + } + + fn deserialize_json(&self, messages: &[&[u8]]) -> Result { + let mut buffer = Vec::with_capacity(messages.len() * 256); + for msg in messages { + buffer.extend_from_slice(msg); + buffer.push(b'\n'); + } + + let allow_bad_data = self.bad_data_policy == BadDataPolicy::Drop; + let mut decoder = ReaderBuilder::new(self.schema.clone()) + .with_strict_mode(!allow_bad_data) + .build_decoder()?; + + decoder.decode(&buffer)?; + + let batch = if allow_bad_data { + let (batch, _mask, _, _errors) = decoder.flush_with_bad_data()?.unwrap(); + batch + } else { + decoder + .flush()? + .ok_or_else(|| anyhow!("JSON decoder returned no batch"))? 
+ }; + + Ok(batch) + } + + fn deserialize_raw_string(&self, messages: &[&[u8]]) -> Result { + let mut builder = StringBuilder::with_capacity(messages.len(), messages.len() * 64); + for msg in messages { + builder.append_value(String::from_utf8_lossy(msg)); + } + + let array = Arc::new(builder.finish()); + RecordBatch::try_new(self.schema.clone(), vec![array]) + .map_err(|e| anyhow!("build RawString batch: {e}")) + } + + fn deserialize_raw_bytes(&self, messages: &[&[u8]]) -> Result { + use arrow_array::builder::BinaryBuilder; + + let mut builder = BinaryBuilder::with_capacity(messages.len(), messages.len() * 64); + for msg in messages { + builder.append_value(msg); + } + + let array = Arc::new(builder.finish()); + RecordBatch::try_new(self.schema.clone(), vec![array]) + .map_err(|e| anyhow!("build RawBytes batch: {e}")) + } +} diff --git a/src/runtime/streaming/format/json_encoder.rs b/src/runtime/streaming/format/json_encoder.rs new file mode 100644 index 00000000..f834a192 --- /dev/null +++ b/src/runtime/streaming/format/json_encoder.rs @@ -0,0 +1,175 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! 
+ +use arrow_array::{ + Array, Decimal128Array, TimestampMicrosecondArray, + TimestampMillisecondArray, TimestampNanosecondArray, TimestampSecondArray, +}; +use arrow_json::writer::NullableEncoder; +use arrow_json::{Encoder, EncoderFactory, EncoderOptions}; +use arrow_schema::{ArrowError, DataType, FieldRef, TimeUnit}; +use base64::prelude::BASE64_STANDARD; +use base64::Engine; + +use super::config::{DecimalEncoding, TimestampFormat}; + +#[derive(Debug)] +pub struct CustomEncoderFactory { + pub timestamp_format: TimestampFormat, + pub decimal_encoding: DecimalEncoding, +} + +impl EncoderFactory for CustomEncoderFactory { + fn make_default_encoder<'a>( + &self, + _field: &'a FieldRef, + array: &'a dyn Array, + _options: &'a EncoderOptions, + ) -> Result>, ArrowError> { + let encoder: Box = match ( + &self.decimal_encoding, + &self.timestamp_format, + array.data_type(), + ) { + (_, TimestampFormat::UnixMillis, DataType::Timestamp(TimeUnit::Nanosecond, _)) => { + let arr = array + .as_any() + .downcast_ref::() + .unwrap() + .clone(); + Box::new(UnixMillisEncoder::Nanos(arr)) + } + (_, TimestampFormat::UnixMillis, DataType::Timestamp(TimeUnit::Microsecond, _)) => { + let arr = array + .as_any() + .downcast_ref::() + .unwrap() + .clone(); + Box::new(UnixMillisEncoder::Micros(arr)) + } + (_, TimestampFormat::UnixMillis, DataType::Timestamp(TimeUnit::Millisecond, _)) => { + let arr = array + .as_any() + .downcast_ref::() + .unwrap() + .clone(); + Box::new(UnixMillisEncoder::Millis(arr)) + } + (_, TimestampFormat::UnixMillis, DataType::Timestamp(TimeUnit::Second, _)) => { + let arr = array + .as_any() + .downcast_ref::() + .unwrap() + .clone(); + Box::new(UnixMillisEncoder::Seconds(arr)) + } + + // ── Decimal128 → String / Bytes ── + (DecimalEncoding::String, _, DataType::Decimal128(_, _)) => { + let arr = array + .as_any() + .downcast_ref::() + .unwrap() + .clone(); + Box::new(DecimalEncoder::StringEncoder(arr)) + } + (DecimalEncoding::Bytes, _, DataType::Decimal128(_, 
_)) => { + let arr = array + .as_any() + .downcast_ref::() + .unwrap() + .clone(); + Box::new(DecimalEncoder::BytesEncoder(arr)) + } + + // ── Binary → Base64 ── + (_, _, DataType::Binary) => { + let arr = array + .as_any() + .downcast_ref::() + .unwrap() + .clone(); + Box::new(BinaryEncoder(arr)) + } + + _ => return Ok(None), + }; + + Ok(Some(NullableEncoder::new(encoder, array.nulls().cloned()))) + } +} + +// --------------------------------------------------------------------------- +// --------------------------------------------------------------------------- + +enum UnixMillisEncoder { + Nanos(TimestampNanosecondArray), + Micros(TimestampMicrosecondArray), + Millis(TimestampMillisecondArray), + Seconds(TimestampSecondArray), +} + +impl Encoder for UnixMillisEncoder { + fn encode(&mut self, idx: usize, out: &mut Vec) { + let millis = match self { + Self::Nanos(arr) => arr.value(idx) / 1_000_000, + Self::Micros(arr) => arr.value(idx) / 1_000, + Self::Millis(arr) => arr.value(idx), + Self::Seconds(arr) => arr.value(idx) * 1_000, + }; + out.extend_from_slice(millis.to_string().as_bytes()); + } +} + +// --------------------------------------------------------------------------- +// --------------------------------------------------------------------------- + +enum DecimalEncoder { + StringEncoder(Decimal128Array), + BytesEncoder(Decimal128Array), +} + +impl Encoder for DecimalEncoder { + fn encode(&mut self, idx: usize, out: &mut Vec) { + match self { + Self::StringEncoder(arr) => { + out.push(b'"'); + out.extend_from_slice(arr.value_as_string(idx).as_bytes()); + out.push(b'"'); + } + Self::BytesEncoder(arr) => { + out.push(b'"'); + out.extend_from_slice( + BASE64_STANDARD + .encode(arr.value(idx).to_be_bytes()) + .as_bytes(), + ); + out.push(b'"'); + } + } + } +} + +// --------------------------------------------------------------------------- +// --------------------------------------------------------------------------- + +struct 
BinaryEncoder(arrow_array::BinaryArray); + +impl Encoder for BinaryEncoder { + fn encode(&mut self, idx: usize, out: &mut Vec) { + out.push(b'"'); + out.extend_from_slice(BASE64_STANDARD.encode(self.0.value(idx)).as_bytes()); + out.push(b'"'); + } +} diff --git a/src/runtime/streaming/format/mod.rs b/src/runtime/streaming/format/mod.rs new file mode 100644 index 00000000..d5e63a9d --- /dev/null +++ b/src/runtime/streaming/format/mod.rs @@ -0,0 +1,20 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +pub mod config; +pub mod deserializer; +pub mod json_encoder; +pub mod serializer; + +pub use config::{BadDataPolicy, DecimalEncoding, Format, JsonFormat, TimestampFormat}; +pub use deserializer::DataDeserializer; +pub use serializer::DataSerializer; diff --git a/src/runtime/streaming/format/serializer.rs b/src/runtime/streaming/format/serializer.rs new file mode 100644 index 00000000..bb123499 --- /dev/null +++ b/src/runtime/streaming/format/serializer.rs @@ -0,0 +1,140 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + + +use anyhow::{anyhow, Result}; +use arrow_array::{Array, RecordBatch, StructArray}; +use arrow_json::writer::make_encoder; +use arrow_json::EncoderOptions; +use arrow_schema::{DataType, Field, SchemaRef}; +use std::sync::Arc; + +use super::config::{Format, JsonFormat}; +use super::json_encoder::CustomEncoderFactory; + +pub struct DataSerializer { + format: Format, + projection_indices: Vec, +} + +impl DataSerializer { + pub fn new(format: Format, schema: SchemaRef) -> Self { + let projection_indices: Vec = schema + .fields() + .iter() + .enumerate() + .filter(|(_, f)| !f.name().starts_with('_')) + .map(|(i, _)| i) + .collect(); + + Self { + format, + projection_indices, + } + } + + pub fn serialize(&self, batch: &RecordBatch) -> Result>> { + let projected_batch = batch.project(&self.projection_indices)?; + + match &self.format { + Format::Json(config) => self.serialize_json(config, &projected_batch), + Format::RawString => self.serialize_raw_string(&projected_batch), + Format::RawBytes => self.serialize_raw_bytes(&projected_batch), + } + } + + fn serialize_json(&self, config: &JsonFormat, batch: &RecordBatch) -> Result>> { + let array = StructArray::from(batch.clone()); + let field = Arc::new(Field::new_struct( + "", + batch.schema().fields().clone(), + false, + )); + + let options = EncoderOptions::default() + .with_explicit_nulls(true) + .with_encoder_factory(Arc::new(CustomEncoderFactory { + timestamp_format: config.timestamp_format.clone(), + decimal_encoding: config.decimal_encoding.clone(), + })); + + let mut encoder = make_encoder(&field, &array, &options)?; + let mut results = Vec::with_capacity(batch.num_rows()); + + for idx in 0..array.len() { + let mut buffer = Vec::with_capacity(128); + encoder.encode(idx, &mut buffer); + if !buffer.is_empty() { + results.push(buffer); + } + } + Ok(results) + } + + fn serialize_raw_string(&self, batch: 
&RecordBatch) -> Result>> { + let value_idx = batch + .schema() + .index_of("value") + .map_err(|_| anyhow!("RawString format requires a 'value' column"))?; + + if *batch.schema().field(value_idx).data_type() != DataType::Utf8 { + return Err(anyhow!("RawString 'value' column must be Utf8")); + } + + let string_array = batch + .column(value_idx) + .as_any() + .downcast_ref::() + .unwrap(); + + let values: Vec> = (0..string_array.len()) + .map(|i| { + if string_array.is_null(i) { + vec![] + } else { + string_array.value(i).as_bytes().to_vec() + } + }) + .collect(); + + Ok(values) + } + + fn serialize_raw_bytes(&self, batch: &RecordBatch) -> Result>> { + let value_idx = batch + .schema() + .index_of("value") + .map_err(|_| anyhow!("RawBytes format requires a 'value' column"))?; + + if *batch.schema().field(value_idx).data_type() != DataType::Binary { + return Err(anyhow!("RawBytes 'value' column must be Binary")); + } + + let binary_array = batch + .column(value_idx) + .as_any() + .downcast_ref::() + .unwrap(); + + let values: Vec> = (0..binary_array.len()) + .map(|i| { + if binary_array.is_null(i) { + vec![] + } else { + binary_array.value(i).to_vec() + } + }) + .collect(); + + Ok(values) + } +} diff --git a/src/runtime/streaming/job/edge_manager.rs b/src/runtime/streaming/job/edge_manager.rs new file mode 100644 index 00000000..b57b761f --- /dev/null +++ b/src/runtime/streaming/job/edge_manager.rs @@ -0,0 +1,52 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::collections::HashMap; + +use protocol::grpc::api::{FsEdge, FsNode}; +use tokio::sync::mpsc; + +use crate::runtime::streaming::protocol::tracked::TrackedEvent; + +pub struct EdgeManager { + endpoints: HashMap>, Vec>)>, +} + +impl EdgeManager { + pub fn build(nodes: &[FsNode], edges: &[FsEdge]) -> Self { + let mut tx_map: HashMap>> = HashMap::new(); + let mut rx_map: HashMap> = HashMap::new(); + + for edge in edges { + let (tx, rx) = mpsc::channel(2048); + tx_map.entry(edge.source as u32).or_default().push(tx); + rx_map.insert(edge.target as u32, rx); + } + + let mut endpoints = HashMap::new(); + for node in nodes { + let id = node.node_index as u32; + endpoints.insert(id, (rx_map.remove(&id), tx_map.remove(&id).unwrap_or_default())); + } + + Self { endpoints } + } + + pub fn take_endpoints( + &mut self, + id: u32, + ) -> (Option>, Vec>) { + self.endpoints + .remove(&id) + .expect("Critical: Execution Graph Inconsistent") + } +} diff --git a/src/runtime/streaming/job/job_manager.rs b/src/runtime/streaming/job/job_manager.rs new file mode 100644 index 00000000..19a8a26e --- /dev/null +++ b/src/runtime/streaming/job/job_manager.rs @@ -0,0 +1,481 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::collections::HashMap; +use std::sync::{Arc, OnceLock, RwLock}; + +use anyhow::anyhow; +use tokio::sync::mpsc; +use tokio_stream::wrappers::ReceiverStream; +use tracing::{error, info, warn}; + +use protocol::grpc::api::{ChainedOperator, FsProgram}; + +use crate::runtime::streaming::api::context::TaskContext; +use crate::runtime::streaming::api::operator::{ConstructedOperator, Operator}; +use crate::runtime::streaming::api::source::SourceOperator; +use crate::runtime::streaming::execution::runner::{ChainedDriver, Pipeline}; +use crate::runtime::streaming::execution::source::SourceRunner; +use crate::runtime::streaming::factory::OperatorFactory; +use crate::runtime::streaming::job::edge_manager::EdgeManager; +use crate::runtime::streaming::job::models::{PhysicalExecutionGraph, PhysicalPipeline, PipelineStatus}; +use crate::runtime::streaming::memory::MemoryPool; +use crate::runtime::streaming::network::endpoint::{BoxedEventStream, PhysicalSender}; +use crate::runtime::streaming::protocol::control::{ControlCommand, StopMode}; + +#[derive(Debug, Clone)] +pub struct StreamingJobSummary { + pub job_id: String, + pub status: String, + pub pipeline_count: i32, + pub uptime_secs: u64, +} + +#[derive(Debug, Clone)] +pub struct PipelineDetail { + pub pipeline_id: u32, + pub status: String, +} + +#[derive(Debug, Clone)] +pub struct StreamingJobDetail { + pub job_id: String, + pub status: String, + pub pipeline_count: i32, + pub uptime_secs: u64, + pub pipelines: Vec, + pub program: FsProgram, +} + +static GLOBAL_JOB_MANAGER: OnceLock> = OnceLock::new(); + +pub struct JobManager { + active_jobs: Arc>>, + operator_factory: Arc, + memory_pool: Arc, +} + +struct PreparedChain { + source: Option>, + operators: Vec>, +} + +impl JobManager { + pub fn new(operator_factory: Arc, max_memory_bytes: usize) -> Self { + Self { + active_jobs: Arc::new(RwLock::new(HashMap::new())), + operator_factory, + memory_pool: MemoryPool::new(max_memory_bytes), + } + } + + pub fn 
init(operator_factory: Arc, max_memory_bytes: usize) -> anyhow::Result<()> { + let manager = Arc::new(Self::new(operator_factory, max_memory_bytes)); + GLOBAL_JOB_MANAGER + .set(manager) + .map_err(|_| anyhow!("JobManager singleton already initialized")) + } + + pub fn global() -> anyhow::Result> { + GLOBAL_JOB_MANAGER + .get() + .cloned() + .ok_or_else(|| anyhow!("JobManager not initialized. Call init() first.")) + } + + /// + pub async fn submit_job(&self, job_id: String, program: FsProgram) -> anyhow::Result { + let mut edge_manager = EdgeManager::build(&program.nodes, &program.edges); + let mut pipelines = HashMap::new(); + + for node in &program.nodes { + let pipeline_id = node.node_index as u32; + + let (raw_inboxes, raw_outboxes) = edge_manager.take_endpoints(pipeline_id); + let physical_outboxes = raw_outboxes.into_iter().map(PhysicalSender::Local).collect(); + let physical_inboxes: Vec = raw_inboxes + .into_iter() + .map(|rx| Box::pin(ReceiverStream::new(rx)) as _) + .collect(); + + let chain = self.build_operator_chain(&node.operators)?; + if chain.source.is_none() && physical_inboxes.is_empty() { + anyhow::bail!( + "Topology Error: pipeline '{}' contains no source operator and has no upstream inputs.", + pipeline_id + ); + } + if chain.source.is_some() && !physical_inboxes.is_empty() { + anyhow::bail!( + "Topology Error: source pipeline '{}' should not have upstream inputs.", + pipeline_id + ); + } + + let (control_tx, control_rx) = mpsc::channel(64); + let status = Arc::new(RwLock::new(PipelineStatus::Initializing)); + + let handle = if let Some(source) = chain.source { + self.spawn_source_pipeline_thread( + job_id.clone(), + pipeline_id, + source, + chain.operators, + physical_outboxes, + control_rx, + Arc::clone(&status), + )? + } else { + self.spawn_pipeline_thread( + job_id.clone(), + pipeline_id, + chain.operators, + physical_inboxes, + physical_outboxes, + control_rx, + Arc::clone(&status), + )? 
+ }; + + pipelines.insert( + pipeline_id, + PhysicalPipeline { + pipeline_id, + handle: Some(handle), + status, + control_tx, + }, + ); + } + + let graph = PhysicalExecutionGraph { + job_id: job_id.clone(), + program, + pipelines, + start_time: std::time::Instant::now(), + }; + + self.active_jobs.write().unwrap().insert(job_id.clone(), graph); + info!(job_id = %job_id, "Job submitted successfully."); + + Ok(job_id) + } + + pub async fn stop_job(&self, job_id: &str, mode: StopMode) -> anyhow::Result<()> { + let control_senders: Vec<_> = { + let jobs_guard = self.active_jobs.read().unwrap(); + let graph = jobs_guard + .get(job_id) + .ok_or_else(|| anyhow::anyhow!("Job not found: {job_id}"))?; + + graph.pipelines.values().map(|p| p.control_tx.clone()).collect() + }; + + for tx in control_senders { + let _ = tx.send(ControlCommand::Stop { mode: mode.clone() }).await; + } + + info!(job_id = %job_id, mode = ?mode, "Job stop signal dispatched."); + Ok(()) + } + + pub fn get_pipeline_statuses(&self, job_id: &str) -> Option> { + let jobs_guard = self.active_jobs.read().unwrap(); + let graph = jobs_guard.get(job_id)?; + + Some( + graph.pipelines + .iter() + .map(|(id, pipeline)| { + (*id, pipeline.status.read().unwrap().clone()) + }) + .collect(), + ) + } + + pub fn list_jobs(&self) -> Vec { + let jobs_guard = self.active_jobs.read().unwrap(); + jobs_guard + .values() + .map(|graph| { + let pipeline_count = graph.pipelines.len() as i32; + let uptime_secs = graph.start_time.elapsed().as_secs(); + let status = Self::aggregate_pipeline_status(&graph.pipelines); + StreamingJobSummary { + job_id: graph.job_id.clone(), + status, + pipeline_count, + uptime_secs, + } + }) + .collect() + } + + pub fn get_job_detail(&self, job_id: &str) -> Option { + let jobs_guard = self.active_jobs.read().unwrap(); + let graph = jobs_guard.get(job_id)?; + + let uptime_secs = graph.start_time.elapsed().as_secs(); + let overall_status = Self::aggregate_pipeline_status(&graph.pipelines); + + let 
pipeline_details: Vec = graph + .pipelines + .iter() + .map(|(id, pipeline)| { + let status = pipeline.status.read().unwrap().clone(); + PipelineDetail { + pipeline_id: *id, + status: format!("{status:?}"), + } + }) + .collect(); + + Some(StreamingJobDetail { + job_id: graph.job_id.clone(), + status: overall_status, + pipeline_count: graph.pipelines.len() as i32, + uptime_secs, + pipelines: pipeline_details, + program: graph.program.clone(), + }) + } + + pub fn has_job(&self, job_id: &str) -> bool { + self.active_jobs.read().unwrap().contains_key(job_id) + } + + pub async fn remove_job(&self, job_id: &str, mode: StopMode) -> anyhow::Result<()> { + { + let jobs_guard = self.active_jobs.read().unwrap(); + if !jobs_guard.contains_key(job_id) { + anyhow::bail!("Job not found: {job_id}"); + } + let graph = &jobs_guard[job_id]; + let control_senders: Vec<_> = + graph.pipelines.values().map(|p| p.control_tx.clone()).collect(); + + drop(jobs_guard); + + for tx in control_senders { + let _ = tx.send(ControlCommand::Stop { mode: mode.clone() }).await; + } + } + + self.active_jobs.write().unwrap().remove(job_id); + info!(job_id = %job_id, "Job stopped and removed."); + Ok(()) + } + + fn aggregate_pipeline_status( + pipelines: &HashMap, + ) -> String { + let mut running = 0u32; + let mut failed = 0u32; + let mut finished = 0u32; + let mut initializing = 0u32; + + for pipeline in pipelines.values() { + match &*pipeline.status.read().unwrap() { + PipelineStatus::Running => running += 1, + PipelineStatus::Failed { .. 
} => failed += 1, + PipelineStatus::Finished => finished += 1, + PipelineStatus::Initializing => initializing += 1, + PipelineStatus::Stopping => {} + } + } + + if failed > 0 { + "DEGRADED".to_string() + } else if running > 0 && running == pipelines.len() as u32 { + "RUNNING".to_string() + } else if finished == pipelines.len() as u32 { + "FINISHED".to_string() + } else if initializing > 0 { + "INITIALIZING".to_string() + } else { + "PARTIAL".to_string() + } + } + + // ======================================================================== + + fn build_operator_chain( + &self, + operator_configs: &[ChainedOperator], + ) -> anyhow::Result { + let mut source: Option> = None; + let mut chain = Vec::with_capacity(operator_configs.len()); + + for op_config in operator_configs { + let constructed = self.operator_factory + .create_operator(&op_config.operator_name, &op_config.operator_config)?; + + match constructed { + ConstructedOperator::Operator(msg_op) => chain.push(msg_op), + ConstructedOperator::Source(src_op) => { + if source.is_some() { + anyhow::bail!( + "Topology Error: Multiple source operators detected in one physical chain." 
+ ); + } + if !chain.is_empty() { + anyhow::bail!( + "Topology Error: Source operator '{}' cannot be scheduled inside a MessageOperator physical chain.", + op_config.operator_name + ); + } + source = Some(src_op); + } + } + } + Ok(PreparedChain { + source, + operators: chain, + }) + } + + fn spawn_pipeline_thread( + &self, + job_id: String, + pipeline_id: u32, + operators: Vec>, + inboxes: Vec, + outboxes: Vec, + control_rx: mpsc::Receiver, + status: Arc>, + ) -> anyhow::Result> { + let memory_pool = Arc::clone(&self.memory_pool); + let thread_name = format!("Task-{job_id}-{pipeline_id}"); + + let handle = std::thread::Builder::new() + .name(thread_name) + .spawn(move || { + *status.write().unwrap() = PipelineStatus::Running; + + let rt = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .expect("Failed to build current-thread Tokio runtime for pipeline"); + + let job_id_inner = job_id.clone(); + let execution_result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| { + rt.block_on(async move { + let ctx = TaskContext::new( + job_id_inner, + pipeline_id, + 0, + 1, + outboxes, + memory_pool, + ); + + let pipeline = Pipeline::new(operators, ctx, inboxes, control_rx) + .map_err(|e| anyhow::anyhow!("Pipeline init failed: {e}"))?; + + pipeline.run().await.map_err(|e| anyhow::anyhow!("Pipeline execution failed: {e}")) + }) + })); + + Self::handle_pipeline_exit(&job_id, pipeline_id, execution_result, &status); + })?; + + Ok(handle) + } + + fn spawn_source_pipeline_thread( + &self, + job_id: String, + pipeline_id: u32, + source: Box, + operators: Vec>, + outboxes: Vec, + control_rx: mpsc::Receiver, + status: Arc>, + ) -> anyhow::Result> { + let memory_pool = Arc::clone(&self.memory_pool); + let thread_name = format!("Task-{job_id}-{pipeline_id}"); + + let handle = std::thread::Builder::new() + .name(thread_name) + .spawn(move || { + *status.write().unwrap() = PipelineStatus::Running; + + let rt = 
tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .expect("Failed to build current-thread Tokio runtime for source pipeline"); + + let job_id_inner = job_id.clone(); + let execution_result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| { + rt.block_on(async move { + let ctx = TaskContext::new( + job_id_inner, + pipeline_id, + 0, + 1, + outboxes, + memory_pool, + ); + + let chain_head = ChainedDriver::build_chain(operators); + let runner = SourceRunner::new(source, chain_head, ctx, control_rx); + + runner + .run() + .await + .map_err(|e| anyhow::anyhow!("Source pipeline execution failed: {e}")) + }) + })); + + Self::handle_pipeline_exit(&job_id, pipeline_id, execution_result, &status); + })?; + + Ok(handle) + } + + fn handle_pipeline_exit( + job_id: &str, + pipeline_id: u32, + thread_result: std::thread::Result>, + status: &RwLock, + ) { + let mut is_fatal = false; + let final_status = match thread_result { + Ok(Ok(_)) => { + info!(job_id = %job_id, pipeline_id = pipeline_id, "Pipeline finished gracefully."); + PipelineStatus::Finished + } + Ok(Err(e)) => { + error!(job_id = %job_id, pipeline_id = pipeline_id, error = %e, "Pipeline failed."); + is_fatal = true; + PipelineStatus::Failed { + error: e.to_string(), + is_panic: false, + } + } + Err(_) => { + error!(job_id = %job_id, pipeline_id = pipeline_id, "Pipeline thread panicked!"); + is_fatal = true; + PipelineStatus::Failed { + error: "Task thread encountered an unexpected panic".into(), + is_panic: true, + } + } + }; + + *status.write().unwrap() = final_status; + + if is_fatal { + warn!(job_id = %job_id, pipeline_id = pipeline_id, "Pipeline failure detected, Job should be aborted or recovered."); + } + } +} diff --git a/src/runtime/streaming/job/mod.rs b/src/runtime/streaming/job/mod.rs new file mode 100644 index 00000000..02e0343c --- /dev/null +++ b/src/runtime/streaming/job/mod.rs @@ -0,0 +1,17 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you 
may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +pub mod edge_manager; +pub mod job_manager; +pub mod models; + +pub use job_manager::{JobManager, StreamingJobSummary}; diff --git a/src/runtime/streaming/job/models.rs b/src/runtime/streaming/job/models.rs new file mode 100644 index 00000000..45ea3bb7 --- /dev/null +++ b/src/runtime/streaming/job/models.rs @@ -0,0 +1,44 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::collections::HashMap; +use std::sync::{Arc, RwLock}; +use std::thread::JoinHandle; +use std::time::Instant; + +use protocol::grpc::api::FsProgram; +use tokio::sync::mpsc; + +use crate::runtime::streaming::protocol::control::ControlCommand; + +#[derive(Debug, Clone, PartialEq)] +pub enum PipelineStatus { + Initializing, + Running, + Failed { error: String, is_panic: bool }, + Finished, + Stopping, +} + +pub struct PhysicalPipeline { + pub pipeline_id: u32, + pub handle: Option>, + pub status: Arc>, + pub control_tx: mpsc::Sender, +} + +pub struct PhysicalExecutionGraph { + pub job_id: String, + pub program: FsProgram, + pub pipelines: HashMap, + pub start_time: Instant, +} diff --git a/src/runtime/streaming/memory/mod.rs b/src/runtime/streaming/memory/mod.rs new file mode 100644 index 00000000..45fc3194 --- /dev/null +++ b/src/runtime/streaming/memory/mod.rs @@ -0,0 +1,17 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +pub mod pool; +pub mod ticket; + +pub use pool::MemoryPool; +pub use ticket::MemoryTicket; diff --git a/src/runtime/streaming/memory/pool.rs b/src/runtime/streaming/memory/pool.rs new file mode 100644 index 00000000..4813a63e --- /dev/null +++ b/src/runtime/streaming/memory/pool.rs @@ -0,0 +1,86 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use parking_lot::Mutex; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::Arc; +use tokio::sync::Notify; +use tracing::{debug, warn}; + +use super::ticket::MemoryTicket; + +#[derive(Debug)] +pub struct MemoryPool { + max_bytes: usize, + used_bytes: AtomicUsize, + available_bytes: Mutex, + notify: Notify, +} + +impl MemoryPool { + pub fn new(max_bytes: usize) -> Arc { + Arc::new(Self { + max_bytes, + used_bytes: AtomicUsize::new(0), + available_bytes: Mutex::new(max_bytes), + notify: Notify::new(), + }) + } + + pub fn usage_metrics(&self) -> (usize, usize) { + (self.used_bytes.load(Ordering::Relaxed), self.max_bytes) + } + + pub async fn request_memory(self: &Arc, bytes: usize) -> MemoryTicket { + if bytes == 0 { + return MemoryTicket::new(0, self.clone()); + } + + if bytes > self.max_bytes { + warn!( + "Requested memory ({} B) exceeds total pool size ({} B)! 
\ + Permitting to avoid pipeline deadlock, but OOM risk is critical.", + bytes, self.max_bytes + ); + self.used_bytes.fetch_add(bytes, Ordering::Relaxed); + return MemoryTicket::new(bytes, self.clone()); + } + + loop { + { + let mut available = self.available_bytes.lock(); + if *available >= bytes { + *available -= bytes; + self.used_bytes.fetch_add(bytes, Ordering::Relaxed); + return MemoryTicket::new(bytes, self.clone()); + } + } + + debug!("Backpressure engaged: waiting for {} bytes to be freed...", bytes); + self.notify.notified().await; + } + } + + pub(crate) fn release(&self, bytes: usize) { + if bytes == 0 { + return; + } + + { + let mut available = self.available_bytes.lock(); + *available += bytes; + } + + self.used_bytes.fetch_sub(bytes, Ordering::Relaxed); + self.notify.notify_waiters(); + } +} diff --git a/src/runtime/streaming/memory/ticket.rs b/src/runtime/streaming/memory/ticket.rs new file mode 100644 index 00000000..cb105be0 --- /dev/null +++ b/src/runtime/streaming/memory/ticket.rs @@ -0,0 +1,33 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::sync::Arc; + +use super::pool::MemoryPool; + +#[derive(Debug)] +pub struct MemoryTicket { + bytes: usize, + pool: Arc, +} + +impl MemoryTicket { + pub(crate) fn new(bytes: usize, pool: Arc) -> Self { + Self { bytes, pool } + } +} + +impl Drop for MemoryTicket { + fn drop(&mut self) { + self.pool.release(self.bytes); + } +} diff --git a/src/runtime/streaming/mod.rs b/src/runtime/streaming/mod.rs new file mode 100644 index 00000000..7e0ba57a --- /dev/null +++ b/src/runtime/streaming/mod.rs @@ -0,0 +1,27 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Streaming actor runtime (vendored from Arroyo `arroyo-actor-runtime`). + +pub mod api; +pub mod error; +pub mod execution; +pub mod factory; +pub mod format; +pub mod job; +pub mod memory; +pub mod network; +pub mod operators; +pub mod protocol; + +pub use protocol::StreamOutput; diff --git a/src/runtime/streaming/network/endpoint.rs b/src/runtime/streaming/network/endpoint.rs new file mode 100644 index 00000000..7448e9cd --- /dev/null +++ b/src/runtime/streaming/network/endpoint.rs @@ -0,0 +1,64 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use crate::runtime::streaming::protocol::event::StreamEvent; +use crate::runtime::streaming::protocol::tracked::TrackedEvent; +use anyhow::{anyhow, Result}; +use std::pin::Pin; +use tokio::sync::mpsc; +use tokio_stream::Stream; +use tracing::debug; + +// ======================================================================== +// ======================================================================== + +#[derive(Clone)] +pub struct RemoteSenderStub { + pub target_addr: String, +} + +impl RemoteSenderStub { + pub async fn send_over_network(&self, _event: &StreamEvent) -> Result<()> { + unimplemented!("Remote network transport is not yet implemented") + } +} + +// ======================================================================== +// ======================================================================== + +#[derive(Clone)] +pub enum PhysicalSender { + Local(mpsc::Sender), + Remote(RemoteSenderStub), +} + +impl PhysicalSender { + pub async fn send(&self, tracked_event: TrackedEvent) -> Result<()> { + match self { + PhysicalSender::Local(tx) => { + tx.send(tracked_event) + .await + .map_err(|_| anyhow!("Local channel closed! 
Downstream task may have crashed."))?; + } + PhysicalSender::Remote(stub) => { + stub.send_over_network(&tracked_event.event).await?; + debug!("Sent event over network, local memory ticket will be released."); + } + } + Ok(()) + } +} + +// ======================================================================== +// ======================================================================== + +pub type BoxedEventStream = Pin + Send>>; diff --git a/src/runtime/streaming/network/environment.rs b/src/runtime/streaming/network/environment.rs new file mode 100644 index 00000000..fe8544c5 --- /dev/null +++ b/src/runtime/streaming/network/environment.rs @@ -0,0 +1,51 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use super::endpoint::{BoxedEventStream, PhysicalSender}; +use std::collections::HashMap; + +pub type VertexId = u32; +pub type SubtaskIndex = u32; + +pub struct NetworkEnvironment { + pub outboxes: HashMap<(VertexId, SubtaskIndex), Vec>, + pub inboxes: HashMap<(VertexId, SubtaskIndex), Vec>, +} + +impl NetworkEnvironment { + pub fn new() -> Self { + Self { + outboxes: HashMap::new(), + inboxes: HashMap::new(), + } + } + + pub fn take_outboxes( + &mut self, + vertex_id: VertexId, + subtask_idx: SubtaskIndex, + ) -> Vec { + self.outboxes + .remove(&(vertex_id, subtask_idx)) + .unwrap_or_default() + } + + pub fn take_inboxes( + &mut self, + vertex_id: VertexId, + subtask_idx: SubtaskIndex, + ) -> Vec { + self.inboxes + .remove(&(vertex_id, subtask_idx)) + .unwrap_or_default() + } +} diff --git a/src/runtime/streaming/network/mod.rs b/src/runtime/streaming/network/mod.rs new file mode 100644 index 00000000..16100133 --- /dev/null +++ b/src/runtime/streaming/network/mod.rs @@ -0,0 +1,15 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +pub mod endpoint; +pub mod environment; + diff --git a/src/runtime/streaming/operators/grouping/incremental_aggregate.rs b/src/runtime/streaming/operators/grouping/incremental_aggregate.rs new file mode 100644 index 00000000..f895c173 --- /dev/null +++ b/src/runtime/streaming/operators/grouping/incremental_aggregate.rs @@ -0,0 +1,726 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use anyhow::{bail, Result}; +use arrow::compute::max_array; +use arrow::row::{RowConverter, SortField}; +use arrow_array::builder::{ + BinaryBuilder, TimestampNanosecondBuilder, UInt32Builder, UInt64Builder, +}; +use arrow_array::cast::AsArray; +use arrow_array::types::UInt64Type; +use arrow_array::{ + Array, ArrayRef, BooleanArray, RecordBatch, StructArray, +}; +use arrow_schema::{DataType, Field, FieldRef, Schema, SchemaBuilder, TimeUnit}; +use datafusion::common::{Result as DFResult, ScalarValue}; +use datafusion::physical_expr::aggregate::AggregateFunctionExpr; +use datafusion::physical_plan::{Accumulator, PhysicalExpr}; +use crate::sql::common::constants::updating_state_field; +use datafusion_proto::physical_plan::DefaultPhysicalExtensionCodec; +use datafusion_proto::physical_plan::from_proto::parse_physical_expr; +use datafusion_proto::protobuf::PhysicalExprNode; +use datafusion_proto::protobuf::PhysicalPlanNode; +use datafusion_proto::protobuf::physical_plan_node::PhysicalPlanType; +use itertools::Itertools; +use prost::Message; +use std::collections::HashSet; +use 
std::sync::LazyLock; +use std::time::{Duration, Instant, SystemTime}; +use std::{collections::HashMap, mem, sync::Arc}; +use tracing::{debug, warn}; +use protocol::grpc::api::UpdatingAggregateOperator; +// ========================================================================= +// ========================================================================= +use crate::runtime::streaming::api::context::TaskContext; +use crate::runtime::streaming::api::operator::Operator; +use crate::runtime::streaming::factory::Registry; +use crate::runtime::util::decode_aggregate; +use crate::runtime::streaming::operators::{Key, UpdatingCache}; +use crate::runtime::streaming::StreamOutput; +use crate::sql::common::{to_nanos, CheckpointBarrier, FsSchema, Watermark, TIMESTAMP_FIELD, UPDATING_META_FIELD}; +use crate::sql::physical::updating_meta_fields; + +#[derive(Debug, Copy, Clone)] +struct BatchData { + count: u64, + generation: u64, +} + +impl BatchData { + fn new(generation: u64) -> Self { + Self { count: 1, generation } + } + + fn inc(&mut self) { + self.count += 1; + self.generation += 1; + } + + fn dec(&mut self) { + self.count = self.count.checked_sub(1).unwrap_or_default(); + self.generation += 1; + } +} + +#[derive(Debug)] +enum IncrementalState { + Sliding { + expr: Arc, + accumulator: Box, + }, + Batch { + expr: Arc, + data: HashMap, + row_converter: Arc, + changed_values: HashSet, + }, +} + +impl IncrementalState { + fn update_batch(&mut self, new_generation: u64, batch: &[ArrayRef]) -> DFResult<()> { + match self { + IncrementalState::Sliding { accumulator, .. } => { + accumulator.update_batch(batch)?; + } + IncrementalState::Batch { data, row_converter, changed_values, .. 
} => { + for r in row_converter.convert_columns(batch)?.iter() { + if data.contains_key(r.as_ref()) { + data.get_mut(r.as_ref()).unwrap().inc(); + changed_values.insert(data.get_key_value(r.as_ref()).unwrap().0.clone()); + } else { + let key = Key(Arc::new(r.as_ref().to_vec())); + data.insert(key.clone(), BatchData::new(new_generation)); + changed_values.insert(key); + } + } + } + } + Ok(()) + } + + fn retract_batch(&mut self, batch: &[ArrayRef]) -> DFResult<()> { + match self { + IncrementalState::Sliding { accumulator, .. } => accumulator.retract_batch(batch), + IncrementalState::Batch { data, row_converter, changed_values, .. } => { + for r in row_converter.convert_columns(batch)?.iter() { + match data.get(r.as_ref()).map(|d| d.count) { + Some(0) => { + debug!("tried to retract value for key with count 0; implies append lost"); + } + Some(_) => { + data.get_mut(r.as_ref()).unwrap().dec(); + changed_values.insert(data.get_key_value(r.as_ref()).unwrap().0.clone()); + } + None => { + debug!("tried to retract value for missing key: implies append lost"); + } + } + } + Ok(()) + } + } + } + + fn evaluate(&mut self) -> DFResult { + match self { + IncrementalState::Sliding { accumulator, .. } => accumulator.evaluate(), + IncrementalState::Batch { expr, data, row_converter, .. 
} => { + let parser = row_converter.parser(); + let input = row_converter.convert_rows( + data.iter() + .filter(|(_, c)| c.count > 0) + .map(|(v, _)| parser.parse(&v.0)), + )?; + let mut acc = expr.create_accumulator()?; + acc.update_batch(&input)?; + acc.evaluate_mut() + } + } + } +} + +#[derive(Debug, Copy, Clone, Eq, PartialEq)] +enum AccumulatorType { + Sliding, + Batch, +} + +impl AccumulatorType { + fn state_fields(&self, agg: &AggregateFunctionExpr) -> DFResult> { + Ok(match self { + AccumulatorType::Sliding => agg.sliding_state_fields()?, + AccumulatorType::Batch => vec![], + }) + } +} + +#[derive(Debug)] +struct Aggregator { + func: Arc, + accumulator_type: AccumulatorType, + row_converter: Arc, + state_cols: Vec, +} + +// ========================================================================= +// ========================================================================= + +pub struct IncrementalAggregatingFunc { + flush_interval: Duration, + metadata_expr: Arc, + aggregates: Vec, + accumulators: UpdatingCache>, + updated_keys: HashMap>>, + + input_schema: Arc, + has_routing_keys: bool, + + sliding_state_schema: Arc, + batch_state_schema: Arc, + schema_without_metadata: Arc, + final_output_schema: Arc, + ttl: Duration, + key_converter: RowConverter, + new_generation: u64, +} + +static GLOBAL_KEY: LazyLock>> = LazyLock::new(|| Arc::new(Vec::new())); + +impl IncrementalAggregatingFunc { + fn update_batch(&mut self, key: &[u8], batch: &[Vec], idx: Option) -> DFResult<()> { + self.accumulators + .modify_and_update(key, Instant::now(), |values| { + for (inputs, accs) in batch.iter().zip(values.iter_mut()) { + let values = if let Some(idx) = idx { + &inputs.iter().map(|c| c.slice(idx, 1)).collect() + } else { + inputs + }; + accs.update_batch(self.new_generation, values)?; + } + Ok(()) + }) + .expect("tried to update for non-existent key") + } + + fn retract_batch(&mut self, key: &[u8], batch: &[Vec], idx: Option) -> DFResult<()> { + self.accumulators + 
.modify(key, |values| { + for (inputs, accs) in batch.iter().zip(values.iter_mut()) { + let values = if let Some(idx) = idx { + &inputs.iter().map(|c| c.slice(idx, 1)).collect() + } else { + inputs + }; + accs.retract_batch(values)?; + } + Ok::<(), datafusion::common::DataFusionError>(()) + }) + .expect("tried to retract state for non-existent key")?; + Ok(()) + } + + fn evaluate(&mut self, key: &[u8]) -> DFResult> { + self.accumulators + .get_mut(key) + .expect("tried to evaluate non-existent key") + .iter_mut() + .map(|s| s.evaluate()) + .collect::>() + } + + fn get_retracts(batch: &RecordBatch) -> Option<&BooleanArray> { + if let Some(meta_col) = batch.column_by_name(UPDATING_META_FIELD) { + let meta_struct = meta_col + .as_any() + .downcast_ref::() + .expect("_updating_meta must be StructArray"); + + let is_retract_array = meta_struct + .column_by_name(updating_state_field::IS_RETRACT) + .expect("meta struct must have is_retract"); + + Some(is_retract_array.as_any().downcast_ref::().expect("is_retract must be BooleanArray")) + } else { + None + } + } + + fn make_accumulators(&self) -> Vec { + self.aggregates + .iter() + .map(|agg| match agg.accumulator_type { + AccumulatorType::Sliding => IncrementalState::Sliding { + expr: agg.func.clone(), + accumulator: agg.func.create_sliding_accumulator().unwrap(), + }, + AccumulatorType::Batch => IncrementalState::Batch { + expr: agg.func.clone(), + data: Default::default(), + row_converter: agg.row_converter.clone(), + changed_values: Default::default(), + }, + }) + .collect() + } + + fn compute_inputs(&self, batch: &RecordBatch) -> Vec> { + self.aggregates + .iter() + .map(|agg| { + agg.func + .expressions() + .iter() + .map(|ex| ex.evaluate(batch).unwrap().into_array(batch.num_rows()).unwrap()) + .collect::>() + }) + .collect::>() + } + + fn global_aggregate(&mut self, batch: &RecordBatch) -> Result<()> { + let retracts = Self::get_retracts(batch); + let aggregate_input_cols = self.compute_inputs(&batch); + + let mut 
first = false; + if !self + .accumulators + .contains_key(GLOBAL_KEY.as_ref().as_slice()) + { + first = true; + self.accumulators.insert( + GLOBAL_KEY.clone(), + Instant::now(), + self.new_generation, + self.make_accumulators(), + ); + } + + if !self + .updated_keys + .contains_key(GLOBAL_KEY.as_ref().as_slice()) + { + if first { + self.updated_keys.insert(Key(GLOBAL_KEY.clone()), None); + } else { + let v = Some(self.evaluate(GLOBAL_KEY.as_ref().as_slice())?); + self.updated_keys.insert(Key(GLOBAL_KEY.clone()), v); + } + } + + if let Some(retracts) = retracts { + for (i, r) in retracts.iter().enumerate() { + if r.unwrap_or_default() { + self.retract_batch( + GLOBAL_KEY.as_ref().as_slice(), + &aggregate_input_cols, + Some(i), + )?; + } else { + self.update_batch( + GLOBAL_KEY.as_ref().as_slice(), + &aggregate_input_cols, + Some(i), + )?; + } + } + } else { + self.update_batch( + GLOBAL_KEY.as_ref().as_slice(), + &aggregate_input_cols, + None, + ) + .unwrap(); + } + Ok(()) + } + + fn keyed_aggregate(&mut self, batch: &RecordBatch) -> Result<()> { + let retracts = Self::get_retracts(batch); + + let sort_columns = &self.input_schema + .sort_columns(batch, false) + .into_iter() + .map(|e| e.values) + .collect::>(); + + let keys = self.key_converter.convert_columns(sort_columns).unwrap(); + + for k in &keys { + if !self.updated_keys.contains_key(k.as_ref()) { + if let Some((key, accs)) = self.accumulators.get_mut_key_value(k.as_ref()) { + self.updated_keys.insert(key, Some(accs.iter_mut().map(|s| s.evaluate()).collect::>()?)); + } else { + self.updated_keys.insert(Key(Arc::new(k.as_ref().to_vec())), None); + } + } + } + + let aggregate_input_cols = self.compute_inputs(&batch); + + for (i, key) in keys.iter().enumerate() { + if !self.accumulators.contains_key(key.as_ref()) { + self.accumulators.insert(Arc::new(key.as_ref().to_vec()), Instant::now(), 0, self.make_accumulators()); + }; + + let retract = retracts.map(|r| r.value(i)).unwrap_or_default(); + if retract { + 
self.retract_batch(key.as_ref(), &aggregate_input_cols, Some(i))?; + } else { + self.update_batch(key.as_ref(), &aggregate_input_cols, Some(i))?; + } + } + Ok(()) + } + + // ========================================================================= + // ========================================================================= + + fn checkpoint_sliding(&mut self) -> DFResult>> { + if self.updated_keys.is_empty() { return Ok(None); } + + let mut states = vec![vec![]; self.sliding_state_schema.schema.fields.len()]; + let parser = self.key_converter.parser(); + let mut generation_builder = UInt64Builder::with_capacity(self.updated_keys.len()); + + let mut cols = self.key_converter.convert_rows(self.updated_keys.keys().map(|k| { + let (accumulators, generation) = self.accumulators.get_mut_generation(k.0.as_ref()).unwrap(); + generation_builder.append_value(generation); + + for (state, agg) in accumulators.iter_mut().zip(self.aggregates.iter()) { + let IncrementalState::Sliding { expr, accumulator } = state else { continue; }; + let state = accumulator.state().unwrap_or_else(|_| { + let state = accumulator.state().unwrap(); + *accumulator = expr.create_sliding_accumulator().unwrap(); + let states: Vec<_> = state.iter().map(|s| s.to_array()).try_collect().unwrap(); + accumulator.merge_batch(&states).unwrap(); + state + }); + + for (idx, v) in agg.state_cols.iter().zip(state.into_iter()) { + states[*idx].push(v); + } + } + parser.parse(k.0.as_ref()) + }))?; + + cols.extend(states.into_iter().skip(cols.len()).map(|c| ScalarValue::iter_to_array(c).unwrap())); + + let generations = generation_builder.finish(); + self.new_generation = self.new_generation.max(max_array::(&generations).unwrap()); + cols.push(Arc::new(generations)); + + Ok(Some(cols)) + } + + fn checkpoint_batch(&mut self) -> DFResult>> { + if self.aggregates.iter().all(|agg| agg.accumulator_type == AccumulatorType::Sliding) { return Ok(None); } + if self.updated_keys.is_empty() { return Ok(None); } + + let size = 
self.updated_keys.len(); + let mut rows = Vec::with_capacity(size); + let mut accumulator_builder = UInt32Builder::with_capacity(size); + let mut args_row_builder = BinaryBuilder::with_capacity(size, size * 4); + let mut count_builder = UInt64Builder::with_capacity(size); + let mut timestamp_builder = TimestampNanosecondBuilder::with_capacity(size); + let mut generation_builder = UInt64Builder::with_capacity(size); + + let now = to_nanos(SystemTime::now()) as i64; + let parser = self.key_converter.parser(); + + for k in self.updated_keys.keys() { + let row = parser.parse(&k.0); + for (i, state) in self.accumulators.get_mut(k.0.as_ref()).unwrap().iter_mut().enumerate() { + let IncrementalState::Batch { data, changed_values, .. } = state else { continue; }; + + for vk in changed_values.iter() { + if let Some(count) = data.get(vk) { + accumulator_builder.append_value(i as u32); + args_row_builder.append_value(&*vk.0); + count_builder.append_value(count.count); + generation_builder.append_value(count.generation); + timestamp_builder.append_value(now); + rows.push(row.to_owned()) + } + } + data.retain(|_, v| v.count > 0); + } + } + + let mut cols = self.key_converter.convert_rows(rows.into_iter())?; + cols.push(Arc::new(accumulator_builder.finish())); + cols.push(Arc::new(args_row_builder.finish())); + cols.push(Arc::new(count_builder.finish())); + cols.push(Arc::new(timestamp_builder.finish())); + + let generations = generation_builder.finish(); + self.new_generation = self.new_generation.max(max_array::(&generations).unwrap()); + cols.push(Arc::new(generations)); + + Ok(Some(cols)) + } + + fn restore_sliding(&mut self, key: &[u8], now: Instant, i: usize, aggregate_states: &Vec>, generation: u64) -> Result<()> { + let mut accumulators = self.make_accumulators(); + for ((_, state_cols), acc) in self.aggregates.iter().zip(aggregate_states.iter()).zip(accumulators.iter_mut()) { + if let IncrementalState::Sliding { accumulator, .. 
} = acc { + accumulator.merge_batch(&state_cols.iter().map(|c| c.slice(i, 1)).collect_vec())? + } + } + self.accumulators.insert(Arc::new(key.to_vec()), now, generation, accumulators); + Ok(()) + } + + async fn initialize(&mut self, _ctx: &mut TaskContext) -> Result<()> { + + let mut deleted_keys = vec![]; + for (k, v) in self.accumulators.iter_mut() { + let is_deleted = v.last_mut().unwrap().evaluate()?.is_null(); + if is_deleted { deleted_keys.push(k.clone()); } + else { + for is in v { + if let IncrementalState::Batch { data, .. } = is { data.retain(|_, v| v.count > 0); } + } + } + } + for k in deleted_keys { self.accumulators.remove(&k.0); } + Ok(()) + } + + fn generate_changelog(&mut self) -> Result> { + let mut output_keys = Vec::with_capacity(self.updated_keys.len() * 2); + let mut output_values = vec![Vec::with_capacity(self.updated_keys.len() * 2); self.aggregates.len()]; + let mut is_retracts = Vec::with_capacity(self.updated_keys.len() * 2); + + let (updated_keys, updated_values): (Vec<_>, Vec<_>) = mem::take(&mut self.updated_keys).into_iter().unzip(); + let mut deleted_keys = vec![]; + + for (k, retract) in updated_keys.iter().zip(updated_values.into_iter()) { + let append = self.evaluate(&k.0)?; + + if let Some(v) = retract { + if v.iter().zip(append.iter()).take(v.len() - 1).all(|(a, b)| a == b) { continue; } + is_retracts.push(true); + output_keys.push(k.clone()); + for (out, val) in output_values.iter_mut().zip(v) { out.push(val); } + } + + if !append.last().unwrap().is_null() { + is_retracts.push(false); + output_keys.push(k.clone()); + for (out, val) in output_values.iter_mut().zip(append) { out.push(val); } + } else { + deleted_keys.push(k); + } + } + + for k in deleted_keys { self.accumulators.remove(&k.0); } + + let mut ttld_keys = vec![]; + for (k, mut v) in self.accumulators.time_out(Instant::now()) { + is_retracts.push(true); + ttld_keys.push(k); + for (out, val) in output_values.iter_mut().zip(v.iter_mut().map(|s| s.evaluate())) { 
out.push(val?); } + } + + if output_keys.is_empty() && ttld_keys.is_empty() { return Ok(None); } + + let row_parser = self.key_converter.parser(); + let mut result_cols = self.key_converter.convert_rows( + output_keys.iter().map(|k| row_parser.parse(k.0.as_slice())) + .chain(ttld_keys.iter().map(|k| row_parser.parse(k.as_slice()))) + )?; + + for acc in output_values.into_iter() { result_cols.push(ScalarValue::iter_to_array(acc).unwrap()); } + + let record_batch = RecordBatch::try_new(self.schema_without_metadata.clone(), result_cols).unwrap(); + + let metadata = self.metadata_expr.evaluate(&record_batch).unwrap().into_array(record_batch.num_rows()).unwrap(); + let metadata = set_retract_metadata(metadata, Arc::new(BooleanArray::from(is_retracts))); + + let mut final_batch = record_batch.columns().to_vec(); + final_batch.push(metadata); + + Ok(Some(RecordBatch::try_new( + self.final_output_schema.clone(), + final_batch, + )?)) + } +} + +fn set_retract_metadata(metadata: ArrayRef, is_retract: Arc) -> ArrayRef { + let metadata = metadata.as_struct(); + let arrays: Vec> = vec![is_retract, metadata.column(1).clone()]; + Arc::new(StructArray::new(updating_meta_fields(), arrays, None)) +} + +// ========================================================================= +// ========================================================================= + +#[async_trait::async_trait] +impl Operator for IncrementalAggregatingFunc { + fn name(&self) -> &str { + "UpdatingAggregatingFunc" + } + + async fn on_start(&mut self, ctx: &mut TaskContext) -> Result<()> { + self.initialize(ctx).await?; + Ok(()) + } + + async fn process_data( + &mut self, + _input_idx: usize, + batch: RecordBatch, + _ctx: &mut TaskContext, + ) -> Result> { + if self.has_routing_keys { + self.keyed_aggregate(&batch)?; + } else { + self.global_aggregate(&batch)?; + } + + Ok(vec![]) + } + + async fn process_watermark( + &mut self, + _watermark: Watermark, + _ctx: &mut TaskContext, + ) -> Result> { + if let 
Some(changelog_batch) = self.generate_changelog()? { + Ok(vec![StreamOutput::Forward(changelog_batch)]) + } else { + Ok(vec![]) + } + } + + async fn snapshot_state( + &mut self, + _barrier: CheckpointBarrier, + _ctx: &mut TaskContext, + ) -> Result<()> { + Ok(()) + } + + async fn on_close(&mut self, _ctx: &mut TaskContext) -> Result> { + Ok(vec![]) + } +} + +// ========================================================================= +// ========================================================================= + +pub struct IncrementalAggregatingConstructor; + +impl IncrementalAggregatingConstructor { + pub fn with_config( + &self, + config: UpdatingAggregateOperator, + registry: Arc, + ) -> anyhow::Result { + let ttl = Duration::from_micros(if config.ttl_micros == 0 { + warn!("ttl was not set for updating aggregate"); + 24 * 60 * 60 * 1000 * 1000 + } else { + config.ttl_micros + }); + + let input_schema: FsSchema = config.input_schema.unwrap().try_into()?; + let final_schema: FsSchema = config.final_schema.unwrap().try_into()?; + let mut schema_without_metadata = SchemaBuilder::from((*final_schema.schema).clone()); + schema_without_metadata.remove(final_schema.schema.index_of(UPDATING_META_FIELD).unwrap()); + + let metadata_expr = parse_physical_expr( + &PhysicalExprNode::decode(&mut config.metadata_expr.as_slice())?, + registry.as_ref(), + &input_schema.schema, + &DefaultPhysicalExtensionCodec {}, + )?; + + let aggregate_exec = PhysicalPlanNode::decode(&mut config.aggregate_exec.as_ref())?; + let PhysicalPlanType::Aggregate(aggregate_exec) = aggregate_exec.physical_plan_type.unwrap() else { bail!("invalid proto"); }; + + let mut sliding_state_fields = input_schema.routing_keys() + .map(|v| v.iter().map(|idx| input_schema.schema.field(*idx).clone()).collect_vec()) + .unwrap_or_default(); + + let has_routing_keys = input_schema.routing_keys().is_some(); + let mut batch_state_fields = sliding_state_fields.clone(); + let key_fields = 
(0..sliding_state_fields.len()).collect_vec(); + + let aggregates: Vec<_> = aggregate_exec.aggr_expr.iter().zip(aggregate_exec.aggr_expr_name.iter()) + .map(|(expr, name)| Ok(decode_aggregate(&input_schema.schema, name, expr, registry.as_ref())?)) + .map_ok(|agg| { + let retract = match agg.create_sliding_accumulator() { Ok(s) => s.supports_retract_batch(), _ => false }; + (agg, if retract { AccumulatorType::Sliding } else { AccumulatorType::Batch }) + }) + .map_ok(|(agg, t)| { + let row_converter = Arc::new(RowConverter::new( + agg.expressions().iter().map(|ex| Ok(SortField::new(ex.data_type(&input_schema.schema)?))).collect::>()? + )?); + let fields = t.state_fields(&agg)?; + let field_names = fields.iter().map(|f| f.name().to_string()).collect_vec(); + sliding_state_fields.extend(fields.into_iter().map(|f| (*f).clone())); + Ok::<_, anyhow::Error>((agg, t, row_converter, field_names)) + }) + .flatten_ok() + .collect::>()?; + + let state_schema = Schema::new(sliding_state_fields); + + let aggregates = aggregates.into_iter().map(|(agg, t, row_converter, field_names)| Aggregator { + func: agg, accumulator_type: t, row_converter, + state_cols: field_names.iter().map(|f| state_schema.index_of(f).unwrap()).collect(), + }).collect(); + + let mut state_fields = state_schema.fields().to_vec(); + let timestamp_field = state_fields.pop().unwrap(); + state_fields.push(Arc::new((*timestamp_field).clone().with_name(TIMESTAMP_FIELD))); + + let sliding_state_schema = Arc::new(FsSchema::from_schema_keys(Arc::new(Schema::new(state_fields)), key_fields.clone())?); + + batch_state_fields.push(Field::new("accumulator", DataType::UInt32, false)); + batch_state_fields.push(Field::new("args_row", DataType::Binary, false)); + batch_state_fields.push(Field::new("count", DataType::UInt64, false)); + batch_state_fields.push(Field::new(TIMESTAMP_FIELD, DataType::Timestamp(TimeUnit::Nanosecond, None), false)); + let timestamp_index = batch_state_fields.len() - 1; + + let mut 
storage_key_fields = key_fields.clone(); + storage_key_fields.push(storage_key_fields.len()); + storage_key_fields.push(storage_key_fields.len()); + + let batch_state_schema = Arc::new(FsSchema::new( + Arc::new(Schema::new(batch_state_fields)), + timestamp_index, + Some(storage_key_fields), + Some(key_fields), + )); + + Ok(IncrementalAggregatingFunc { + flush_interval: Duration::from_micros(config.flush_interval_micros), + metadata_expr, + ttl, + aggregates, + accumulators: UpdatingCache::with_time_to_idle(ttl), + schema_without_metadata: Arc::new(schema_without_metadata.finish()), + final_output_schema: final_schema.schema.clone(), + updated_keys: Default::default(), + input_schema: Arc::new(input_schema.clone()), + has_routing_keys, + key_converter: RowConverter::new(input_schema.sort_fields(false))?, + sliding_state_schema, + batch_state_schema, + new_generation: 0, + }) + } +} \ No newline at end of file diff --git a/src/runtime/streaming/operators/grouping/mod.rs b/src/runtime/streaming/operators/grouping/mod.rs new file mode 100644 index 00000000..2a17a49d --- /dev/null +++ b/src/runtime/streaming/operators/grouping/mod.rs @@ -0,0 +1,17 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +pub mod incremental_aggregate; +pub mod updating_cache; + +pub use incremental_aggregate::IncrementalAggregatingConstructor; +pub use updating_cache::{Key, UpdatingCache}; diff --git a/src/runtime/streaming/operators/grouping/updating_cache.rs b/src/runtime/streaming/operators/grouping/updating_cache.rs new file mode 100644 index 00000000..2172535b --- /dev/null +++ b/src/runtime/streaming/operators/grouping/updating_cache.rs @@ -0,0 +1,508 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
use std::borrow::Borrow;
use std::collections::HashMap;
use std::sync::Arc;
use std::time::{Duration, Instant};

/// Shared, immutable cache key (raw encoded key bytes).
///
/// The `Arc` lets the same buffer back both the `HashMap` index and the
/// arena node without copying.
#[derive(Hash, Eq, PartialEq, Clone, Debug)]
pub struct Key(pub Arc<Vec<u8>>);

impl Borrow<[u8]> for Key {
    /// Enables `HashMap<Key, _>` lookups with a plain `&[u8]`.
    fn borrow(&self) -> &[u8] {
        &self.0
    }
}

/// Arena slot: the cached entry plus the prev/next links of an intrusive
/// doubly-linked list ordered least- to most-recently updated.
struct Node<T> {
    key: Key,
    /// `None` only while the slot sits on the free list.
    data: Option<T>,
    /// Monotonic version used to reject stale `insert`s.
    generation: u64,
    updated: Instant,
    prev: Option<usize>,
    next: Option<usize>,
}

/// LRU-style cache with time-to-idle expiration and generation-based
/// conflict resolution.
///
/// Entries live in a `Vec` arena (`nodes`) indexed by a `HashMap`;
/// recency order is an intrusive linked list where `head` is the least
/// recently updated entry and `tail` the most recent. Freed slots are
/// recycled through `free_list`.
pub struct UpdatingCache<T: Send + Sync> {
    map: HashMap<Key, usize>,
    nodes: Vec<Node<T>>,
    free_list: Vec<usize>,
    head: Option<usize>,
    tail: Option<usize>,
    ttl: Duration,
}

/// Draining iterator over entries that have been idle for at least `ttl`
/// as of `now`. Stops at the first still-fresh entry (list is in update
/// order, so everything behind it is fresh too).
struct TTLIter<'a, T: Send + Sync> {
    now: Instant,
    cache: &'a mut UpdatingCache<T>,
}

impl<T: Send + Sync> Iterator for TTLIter<'_, T> {
    type Item = (Arc<Vec<u8>>, T);

    fn next(&mut self) -> Option<Self::Item> {
        let head_idx = self.cache.head?;
        let node = &self.cache.nodes[head_idx];

        // `saturating_duration_since` guards against clocks handed in
        // out of order (e.g. `now` earlier than the last update).
        if self.now.saturating_duration_since(node.updated) < self.cache.ttl {
            return None;
        }

        let (k, v) = self.cache.pop_front()?;
        Some((k.0, v))
    }
}

impl<T: Send + Sync> UpdatingCache<T> {
    /// Creates an empty cache whose entries expire after being idle
    /// (not inserted/updated) for `ttl`.
    pub fn with_time_to_idle(ttl: Duration) -> Self {
        Self {
            map: HashMap::new(),
            nodes: Vec::new(),
            free_list: Vec::new(),
            head: None,
            tail: None,
            ttl,
        }
    }

    /// Inserts or replaces `key` with `value` at version `generation`.
    ///
    /// If the key already exists with a generation `>= generation`, the
    /// insert is a stale write and is ignored. A successful insert marks
    /// the entry most-recently-updated.
    pub fn insert(&mut self, key: Arc<Vec<u8>>, now: Instant, generation: u64, value: T) {
        let key_obj = Key(key);

        if let Some(&idx) = self.map.get(&key_obj) {
            if self.nodes[idx].generation >= generation {
                // Stale write: an equal-or-newer version is already cached.
                return;
            }
            self.nodes[idx].data = Some(value);
            self.nodes[idx].generation = generation;
            self.nodes[idx].updated = now;
            self.move_to_tail(idx);
            return;
        }

        let idx = self.allocate_node(key_obj.clone(), value, generation, now);
        self.map.insert(key_obj, idx);
        self.push_back(idx);
    }

    /// Drains entries idle for at least `ttl` as of `now`, oldest first.
    pub fn time_out(&mut self, now: Instant) -> impl Iterator<Item = (Arc<Vec<u8>>, T)> + '_ {
        TTLIter { now, cache: self }
    }

    /// Iterates over all live entries in arena (not recency) order.
    pub fn iter_mut(&mut self) -> impl Iterator<Item = (&Key, &mut T)> {
        self.nodes
            .iter_mut()
            .filter_map(|n| n.data.as_mut().map(|data| (&n.key, data)))
    }

    /// Runs `f` on the entry for `key`; on success bumps the generation,
    /// refreshes the idle timer to `now`, and marks the entry
    /// most-recently-updated.
    ///
    /// Returns `None` if the key is absent, otherwise `f`'s result.
    pub fn modify_and_update<E, F: FnOnce(&mut T) -> Result<(), E>>(
        &mut self,
        key: &[u8],
        now: Instant,
        f: F,
    ) -> Option<Result<(), E>> {
        let &idx = self.map.get(key)?;
        let node = &mut self.nodes[idx];

        if let Err(e) = f(node.data.as_mut().unwrap()) {
            return Some(Err(e));
        }

        node.generation += 1;
        node.updated = now;
        self.move_to_tail(idx);

        Some(Ok(()))
    }

    /// Runs `f` on the entry for `key` and bumps the generation on
    /// success, without touching the idle timer or recency order.
    ///
    /// Returns `None` if the key is absent, otherwise `f`'s result.
    pub fn modify<E, F: FnOnce(&mut T) -> Result<(), E>>(
        &mut self,
        key: &[u8],
        f: F,
    ) -> Option<Result<(), E>> {
        let &idx = self.map.get(key)?;
        let node = &mut self.nodes[idx];

        if let Err(e) = f(node.data.as_mut().unwrap()) {
            return Some(Err(e));
        }

        // Bug fix: bump the generation only after `f` succeeds, matching
        // `modify_and_update`. Previously the generation was advanced even
        // when `f` failed, which could make a subsequent `insert` carrying
        // the next legitimate generation be dropped as stale.
        node.generation += 1;

        Some(Ok(()))
    }

    /// Returns true if `k` is currently cached.
    pub fn contains_key(&self, k: &[u8]) -> bool {
        self.map.contains_key(k)
    }

    /// Mutable access to the value for `key`, if present. Does not touch
    /// generation, idle timer, or recency order.
    pub fn get_mut(&mut self, key: &[u8]) -> Option<&mut T> {
        let &idx = self.map.get(key)?;
        self.nodes[idx].data.as_mut()
    }

    /// Like [`get_mut`], also returning the entry's current generation.
    pub fn get_mut_generation(&mut self, key: &[u8]) -> Option<(&mut T, u64)> {
        let &idx = self.map.get(key)?;
        let node = &mut self.nodes[idx];
        Some((node.data.as_mut().unwrap(), node.generation))
    }

    /// Like [`get_mut`], also returning a clone of the stored key.
    pub fn get_mut_key_value(&mut self, key: &[u8]) -> Option<(Key, &mut T)> {
        let &idx = self.map.get(key)?;
        let node = &mut self.nodes[idx];
        Some((node.key.clone(), node.data.as_mut().unwrap()))
    }

    /// Removes `key` and returns its value; the arena slot is recycled.
    pub fn remove(&mut self, key: &[u8]) -> Option<T> {
        let &idx = self.map.get(key)?;
        self.map.remove(key);
        self.remove_node(idx);

        let data = self.nodes[idx].data.take().unwrap();
        self.free_list.push(idx);

        Some(data)
    }

    /// Removes and returns the least-recently-updated entry.
    fn pop_front(&mut self) -> Option<(Key, T)> {
        let head_idx = self.head?;
        self.remove_node(head_idx);

        let node = &mut self.nodes[head_idx];
        let key = node.key.clone();
        let data = node.data.take().unwrap();
        self.map.remove(&key);
        self.free_list.push(head_idx);

        Some((key, data))
    }

    /// Places a node in a recycled slot if one is free, else grows the arena.
    /// Returns the slot index; the node is not yet linked into the list.
    fn allocate_node(&mut self, key: Key, data: T, generation: u64, updated: Instant) -> usize {
        let new_node = Node {
            key,
            data: Some(data),
            generation,
            updated,
            prev: None,
            next: None,
        };

        if let Some(idx) = self.free_list.pop() {
            self.nodes[idx] = new_node;
            idx
        } else {
            let idx = self.nodes.len();
            self.nodes.push(new_node);
            idx
        }
    }

    /// Links `index` at the tail (most-recently-updated end) of the list.
    fn push_back(&mut self, index: usize) {
        self.nodes[index].prev = self.tail;
        self.nodes[index].next = None;

        if let Some(tail_idx) = self.tail {
            self.nodes[tail_idx].next = Some(index);
        } else {
            self.head = Some(index);
        }
        self.tail = Some(index);
    }

    /// Unlinks `index` from the list, patching head/tail as needed.
    fn remove_node(&mut self, index: usize) {
        let prev = self.nodes[index].prev;
        let next = self.nodes[index].next;

        if let Some(p) = prev {
            self.nodes[p].next = next;
        } else {
            self.head = next;
        }

        if let Some(n) = next {
            self.nodes[n].prev = prev;
        } else {
            self.tail = prev;
        }

        self.nodes[index].prev = None;
        self.nodes[index].next = None;
    }

    /// Marks `index` most-recently-updated by relinking it at the tail.
    fn move_to_tail(&mut self, index: usize) {
        if self.tail == Some(index) {
            return;
        }
        self.remove_node(index);
        self.push_back(index);
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_insert_and_modify() {
        let mut cache = UpdatingCache::with_time_to_idle(Duration::from_secs(60));

        let key = Arc::new(vec![1, 2, 3]);
        let now = Instant::now();
        cache.insert(key.clone(), now, 1, 42);

        assert!(
            cache
                .modify(key.as_ref(), |x| {
                    *x = 43;
                    Ok::<(), ()>(())
                })
                .unwrap()
                .is_ok()
        );

        assert_eq!(*cache.get_mut(key.as_ref()).unwrap(), 43);
    }

    #[test]
    fn test_timeout() {
        let mut cache = UpdatingCache::with_time_to_idle(Duration::from_millis(10));

        let key1 = Arc::new(vec![1]);
        let key2 = Arc::new(vec![2]);

        let start = Instant::now();
        cache.insert(key1.clone(), start, 1, "value1");
        cache.insert(key2.clone(), start + Duration::from_millis(5), 2, "value2");

        let check_time = start + Duration::from_millis(11);
        let timed_out: Vec<_> = cache.time_out(check_time).collect();
        assert_eq!(timed_out.len(), 1);
        assert_eq!(&*timed_out[0].0, &*key1);

        assert!(cache.contains_key(key2.as_ref()));
        assert!(!cache.contains_key(key1.as_ref()));
    }

    #[test]
    fn test_update_keeps_alive() {
        let mut cache = UpdatingCache::with_time_to_idle(Duration::from_millis(10));

        let key = Arc::new(vec![1]);
        let start = Instant::now();
        cache.insert(key.clone(), start, 1, "value");

        let update_time = start + Duration::from_millis(5);
        cache
            .modify_and_update(key.as_ref(), update_time, |_| Ok::<(), ()>(()))
            .unwrap()
            .unwrap();

        let check_time = start + Duration::from_millis(11);
        let timed_out: Vec<_> = cache.time_out(check_time).collect();
        assert!(timed_out.is_empty());
        assert!(cache.contains_key(key.as_ref()));
    }

    #[test]
    fn test_lru_eviction_order_matches_insertion() {
        let mut cache = UpdatingCache::with_time_to_idle(Duration::from_secs(60));
        let key1 = Arc::new(vec![1]);
        let key2 = Arc::new(vec![2]);
        let key3 = Arc::new(vec![3]);
        let now = Instant::now();
        cache.insert(key1.clone(), now, 1, 1);
        cache.insert(key2.clone(), now, 2, 2);
        cache.insert(key3.clone(), now, 3, 3);

        let evicted: Vec<_> = cache.time_out(now + Duration::from_secs(61)).collect();
        assert_eq!(evicted.len(), 3);
        assert_eq!(evicted[0].0.as_ref(), &*key1);
        assert_eq!(evicted[1].0.as_ref(), &*key2);
        assert_eq!(evicted[2].0.as_ref(), &*key3);
    }

    #[test]
    fn test_remove_middle_key() {
        let mut cache = UpdatingCache::with_time_to_idle(Duration::from_secs(60));
        let key1 = Arc::new(vec![1]);
        let key2 = Arc::new(vec![2]);
        let key3 = Arc::new(vec![3]);
        let now = Instant::now();
        cache.insert(key1.clone(), now, 1, 1);
        cache.insert(key2.clone(), now, 2, 2);
        cache.insert(key3.clone(), now, 3, 3);

        assert_eq!(cache.remove(&[2]).unwrap(), 2);
        assert!(cache.contains_key(&[1]));
        assert!(!cache.contains_key(&[2]));
        assert!(cache.contains_key(&[3]));

        let evicted: Vec<_> = cache.time_out(now + Duration::from_secs(61)).collect();
        assert_eq!(evicted.len(), 2);
        assert_eq!(evicted[0].0.as_ref(), &*key1);
        assert_eq!(evicted[1].0.as_ref(), &*key3);
    }

    #[test]
    fn reorder_with_update() {
        let mut cache = UpdatingCache::<u64>::with_time_to_idle(Duration::from_secs(10));
        let key1 = Arc::new(vec![1]);
        let key2 = Arc::new(vec![2]);
        let now = Instant::now();

        cache.insert(key1.clone(), now, 1, 100);
        cache.insert(key2.clone(), now, 2, 200);

        cache
            .modify_and_update(&[1], now + Duration::from_secs(1), |v| {
                *v += 1;
                Ok::<(), ()>(())
            })
            .unwrap()
            .unwrap();

        let _ = cache.modify_and_update(&[1], now + Duration::from_secs(2), |v| {
            *v += 1;
            Ok::<(), ()>(())
        });
    }

    #[test]
    fn test_ttl_eviction() {
        let ttl = Duration::from_millis(100);
        let mut cache = UpdatingCache::with_time_to_idle(ttl);
        let now = Instant::now();
        let key1 = Arc::new(vec![1]);
        let key2 = Arc::new(vec![2]);
        cache.insert(key1.clone(), now, 1, 10);
        cache.insert(key2.clone(), now, 2, 20);

        cache
            .modify_and_update(&[2], now + Duration::from_millis(50), |v| {
                *v += 1;
                Ok::<(), ()>(())
            })
            .unwrap()
            .unwrap();

        let now2 = now + Duration::from_millis(150);
        let evicted: Vec<_> = cache.time_out(now2).collect();
        assert_eq!(evicted.len(), 2);
        assert_eq!(evicted[0].0.as_ref(), &[1]);
        assert_eq!(evicted[1].0.as_ref(), &[2]);
    }

    #[test]
    fn test_remove_key() {
        let ttl = Duration::from_millis(100);
        let mut cache = UpdatingCache::with_time_to_idle(ttl);
        let now = Instant::now();
        let key = Arc::new(vec![1]);
        cache.insert(key.clone(), now, 1, 42);
        let value = cache.remove(&[1]).unwrap();
        assert_eq!(value, 42);
        assert!(!cache.contains_key(&[1]));
        let evicted: Vec<_> = cache.time_out(now + Duration::from_millis(200)).collect();
        assert!(evicted.is_empty());
    }

    #[test]
    fn test_update_order() {
        let ttl = Duration::from_secs(1);
        let mut cache = UpdatingCache::with_time_to_idle(ttl);
        let base = Instant::now();
        let key_a = Arc::new(vec![b'A']);
        let key_b = Arc::new(vec![b'B']);
        let key_c = Arc::new(vec![b'C']);
        cache.insert(key_a.clone(), base, 1, 1);
        cache.insert(key_b.clone(), base, 2, 2);
        cache.insert(key_c.clone(), base, 3, 3);

        let t_update = base + Duration::from_millis(500);
        cache
            .modify_and_update(b"B", t_update, |v| {
                *v += 10;
                Ok::<(), ()>(())
            })
            .unwrap()
            .unwrap();

        let t_eviction = base + Duration::from_secs(2);
        let evicted: Vec<_> = cache.time_out(t_eviction).collect();
        assert_eq!(evicted.len(), 3);
        assert_eq!(evicted[0].0.as_ref(), b"A");
        assert_eq!(evicted[1].0.as_ref(), b"C");
        assert_eq!(evicted[2].0.as_ref(), b"B");
    }

    #[test]
    fn test_get_mut_key_value() {
        let ttl = Duration::from_secs(1);
        let mut cache = UpdatingCache::with_time_to_idle(ttl);
        let base = Instant::now();
        let key = Arc::new(vec![1, 2, 3]);
        cache.insert(key.clone(), base, 1, 42);
        if let Some((k, v)) = cache.get_mut_key_value(&[1, 2, 3]) {
            *v += 1;
            assert_eq!(*v, 43);
            assert_eq!(k.0.as_ref(), &[1, 2, 3]);
        } else {
            panic!("Key not found");
        }
    }

    #[test]
    fn test_modify_error() {
        let ttl = Duration::from_secs(1);
        let mut cache = UpdatingCache::with_time_to_idle(ttl);
        let base = Instant::now();
        let key = Arc::new(vec![1]);
        cache.insert(key.clone(), base, 1, 42);
        let res = cache.modify(&[1], |_v| Err("error"));
        assert!(res.unwrap().is_err());
    }

    #[test]
    fn test_drop_cleanup() {
        let ttl = Duration::from_secs(1);
        {
            let mut cache = UpdatingCache::with_time_to_idle(ttl);
            let base = Instant::now();
            for i in 0..10 {
                cache.insert(Arc::new(vec![i as u8]), base, i as u64, i);
            }
        }
    }

    #[test]
    fn test_generational_replacement() {
        let ttl = Duration::from_secs(1);
        let mut cache = UpdatingCache::with_time_to_idle(ttl);
        let base = Instant::now();
        let key = Arc::new(vec![1]);

        cache.insert(key.clone(), base, 1, "first");
        assert_eq!(cache.get_mut(&[1]), Some(&mut "first"));

        cache.insert(key.clone(), base, 2, "second");
        assert_eq!(cache.get_mut(&[1]), Some(&mut "second"));

        cache.insert(key.clone(), base, 1, "third");
        assert_eq!(cache.get_mut(&[1]), Some(&mut "second"));
    }

    #[test]
    fn test_failed_modify_does_not_consume_generation() {
        let mut cache = UpdatingCache::with_time_to_idle(Duration::from_secs(60));
        let base = Instant::now();
        let key = Arc::new(vec![7]);

        cache.insert(key.clone(), base, 1, 10);
        // A failing closure must leave the generation untouched …
        assert!(cache.modify(&[7], |_| Err::<(), &str>("boom")).unwrap().is_err());
        // … so the next legitimate generation is still accepted.
        cache.insert(key.clone(), base, 2, 11);
        assert_eq!(cache.get_mut(&[7]), Some(&mut 11));
    }
}
crate::sql::common::constants::mem_exec_join_side; +use crate::sql::common::{from_nanos, CheckpointBarrier, FsSchema, FsSchemaRef, Watermark}; +use crate::sql::physical::{DecodingContext, FsPhysicalExtensionCodec}; + +#[derive(Debug, Copy, Clone, Eq, PartialEq)] +enum JoinSide { + Left, + Right, +} + +impl JoinSide { + #[allow(dead_code)] + fn name(&self) -> &'static str { + match self { + JoinSide::Left => mem_exec_join_side::LEFT, + JoinSide::Right => mem_exec_join_side::RIGHT, + } + } +} + +struct JoinInstance { + left_tx: UnboundedSender, + right_tx: UnboundedSender, + result_stream: SendableRecordBatchStream, +} + +impl JoinInstance { + fn feed_data(&self, batch: RecordBatch, side: JoinSide) -> Result<()> { + match side { + JoinSide::Left => self + .left_tx + .send(batch) + .map_err(|e| anyhow!("Left send err: {}", e)), + JoinSide::Right => self + .right_tx + .send(batch) + .map_err(|e| anyhow!("Right send err: {}", e)), + } + } + + async fn close_and_drain(self) -> Result> { + drop(self.left_tx); + drop(self.right_tx); + + let mut outputs = Vec::new(); + let mut stream = self.result_stream; + + while let Some(result_batch) = stream.next().await { + outputs.push(result_batch?); + } + + Ok(outputs) + } +} + +pub struct InstantJoinOperator { + left_input_schema: FsSchemaRef, + right_input_schema: FsSchemaRef, + active_joins: BTreeMap, + left_receiver_hook: Arc>>>, + right_receiver_hook: Arc>>>, + join_exec_plan: Arc, +} + +impl InstantJoinOperator { + fn input_schema(&self, side: JoinSide) -> FsSchemaRef { + match side { + JoinSide::Left => self.left_input_schema.clone(), + JoinSide::Right => self.right_input_schema.clone(), + } + } + + fn get_or_create_join_instance(&mut self, time: SystemTime) -> Result<&mut JoinInstance> { + use std::collections::btree_map::Entry; + + if let Entry::Vacant(e) = self.active_joins.entry(time) { + let (left_tx, left_rx) = unbounded_channel(); + let (right_tx, right_rx) = unbounded_channel(); + + 
*self.left_receiver_hook.write().unwrap() = Some(left_rx); + *self.right_receiver_hook.write().unwrap() = Some(right_rx); + + self.join_exec_plan.reset().map_err(|e| anyhow!("{e}"))?; + let result_stream = self + .join_exec_plan + .execute(0, SessionContext::new().task_ctx()) + .map_err(|e| anyhow!("{e}"))?; + + e.insert(JoinInstance { + left_tx, + right_tx, + result_stream, + }); + } + + self.active_joins + .get_mut(&time) + .ok_or_else(|| anyhow!("join instance missing after insert")) + } + + async fn process_side_internal( + &mut self, + side: JoinSide, + batch: RecordBatch, + ctx: &mut TaskContext, + ) -> Result<()> { + if batch.num_rows() == 0 { + return Ok(()); + } + + let time_column = batch + .column(self.input_schema(side).timestamp_index) + .as_any() + .downcast_ref::() + .ok_or_else(|| anyhow!("Missing timestamp column"))?; + + let min_timestamp = min(time_column).ok_or_else(|| anyhow!("empty timestamp column"))?; + let max_timestamp = max(time_column).ok_or_else(|| anyhow!("empty timestamp column"))?; + + if let Some(watermark) = ctx.last_present_watermark() { + if watermark > from_nanos(min_timestamp as u128) { + warn!("Dropped late batch from {:?} before watermark", side); + return Ok(()); + } + } + + let unkeyed_batch = self.input_schema(side).unkeyed_batch(&batch)?; + + if max_timestamp == min_timestamp { + let time_key = from_nanos(max_timestamp as u128); + let join_instance = self.get_or_create_join_instance(time_key)?; + join_instance.feed_data(unkeyed_batch, side)?; + return Ok(()); + } + + let indices = sort_to_indices(time_column, None, None)?; + let columns: Vec<_> = unkeyed_batch + .columns() + .iter() + .map(|c| take(c, &indices, None).unwrap()) + .collect(); + let sorted_batch = RecordBatch::try_new(unkeyed_batch.schema(), columns)?; + let sorted_timestamps = take(time_column, &indices, None).unwrap(); + let typed_timestamps = sorted_timestamps + .as_any() + .downcast_ref::() + .ok_or_else(|| anyhow!("sorted timestamps downcast failed"))?; 
+ let ranges = partition(std::slice::from_ref(&sorted_timestamps)) + .unwrap() + .ranges(); + + for range in ranges { + let sub_batch = sorted_batch.slice(range.start, range.end - range.start); + let time_key = from_nanos(typed_timestamps.value(range.start) as u128); + let join_instance = self.get_or_create_join_instance(time_key)?; + join_instance.feed_data(sub_batch, side)?; + } + + Ok(()) + } +} + +#[async_trait] +impl Operator for InstantJoinOperator { + fn name(&self) -> &str { + "InstantJoin" + } + + async fn on_start(&mut self, _ctx: &mut TaskContext) -> Result<()> { + Ok(()) + } + + async fn process_data( + &mut self, + input_idx: usize, + batch: RecordBatch, + ctx: &mut TaskContext, + ) -> Result> { + let side = if input_idx == 0 { + JoinSide::Left + } else { + JoinSide::Right + }; + self.process_side_internal(side, batch, ctx).await?; + Ok(vec![]) + } + + async fn process_watermark( + &mut self, + watermark: Watermark, + _ctx: &mut TaskContext, + ) -> Result> { + let Watermark::EventTime(current_time) = watermark else { + return Ok(vec![]); + }; + let mut emit_outputs = Vec::new(); + + let mut expired_times = Vec::new(); + for key in self.active_joins.keys() { + if *key < current_time { + expired_times.push(*key); + } else { + break; + } + } + + for time_key in expired_times { + if let Some(join_instance) = self.active_joins.remove(&time_key) { + let joined_batches = join_instance.close_and_drain().await?; + for batch in joined_batches { + emit_outputs.push(StreamOutput::Forward(batch)); + } + } + } + + Ok(emit_outputs) + } + + async fn snapshot_state( + &mut self, + _barrier: CheckpointBarrier, + _ctx: &mut TaskContext, + ) -> Result<()> { + Ok(()) + } +} + +pub struct InstantJoinConstructor; + +impl InstantJoinConstructor { + pub fn with_config( + &self, + config: JoinOperator, + registry: Arc, + ) -> anyhow::Result { + let join_physical_plan_node = PhysicalPlanNode::decode(&mut config.join_plan.as_slice())?; + + let left_input_schema: Arc = + 
Arc::new(config.left_schema.unwrap().try_into()?); + let right_input_schema: Arc = + Arc::new(config.right_schema.unwrap().try_into()?); + + let left_receiver_hook = Arc::new(RwLock::new(None)); + let right_receiver_hook = Arc::new(RwLock::new(None)); + + let codec = FsPhysicalExtensionCodec { + context: DecodingContext::LockedJoinStream { + left: left_receiver_hook.clone(), + right: right_receiver_hook.clone(), + }, + }; + + let join_exec_plan = join_physical_plan_node.try_into_physical_plan( + registry.as_ref(), + &RuntimeEnvBuilder::new().build()?, + &codec, + )?; + + Ok(InstantJoinOperator { + left_input_schema, + right_input_schema, + active_joins: BTreeMap::new(), + left_receiver_hook, + right_receiver_hook, + join_exec_plan, + }) + } +} diff --git a/src/runtime/streaming/operators/joins/join_with_expiration.rs b/src/runtime/streaming/operators/joins/join_with_expiration.rs new file mode 100644 index 00000000..212cfaad --- /dev/null +++ b/src/runtime/streaming/operators/joins/join_with_expiration.rs @@ -0,0 +1,283 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ + +use anyhow::{anyhow, Result}; +use arrow::compute::concat_batches; +use arrow_array::RecordBatch; +use datafusion::execution::context::SessionContext; +use datafusion::execution::runtime_env::RuntimeEnvBuilder; +use datafusion::physical_plan::ExecutionPlan; +use datafusion_proto::{physical_plan::AsExecutionPlan, protobuf::PhysicalPlanNode}; +use futures::StreamExt; +use prost::Message; +use std::collections::VecDeque; +use std::sync::{Arc, RwLock}; +use std::time::{Duration, SystemTime}; +use tracing::warn; + +use crate::runtime::streaming::api::context::TaskContext; +use crate::runtime::streaming::api::operator::Operator; +use crate::runtime::streaming::factory::Registry; +use async_trait::async_trait; +use protocol::grpc::api::JoinOperator; +use crate::runtime::streaming::StreamOutput; +use crate::sql::common::{CheckpointBarrier, FsSchema, Watermark}; +use crate::sql::physical::{DecodingContext, FsPhysicalExtensionCodec}; + +#[derive(Debug, Copy, Clone, Eq, PartialEq)] +enum JoinSide { + Left, + Right, +} + +// ============================================================================ +// ============================================================================ + +struct StateBuffer { + batches: VecDeque<(SystemTime, RecordBatch)>, + ttl: Duration, +} + +impl StateBuffer { + fn new(ttl: Duration) -> Self { + Self { + batches: VecDeque::new(), + ttl, + } + } + + fn insert(&mut self, batch: RecordBatch, time: SystemTime) { + self.batches.push_back((time, batch)); + } + + fn expire(&mut self, current_time: SystemTime) { + let cutoff = current_time + .checked_sub(self.ttl) + .unwrap_or(SystemTime::UNIX_EPOCH); + while let Some((time, _)) = self.batches.front() { + if *time < cutoff { + self.batches.pop_front(); + } else { + break; + } + } + } + + fn get_all_batches(&self) -> Vec { + self.batches.iter().map(|(_, b)| b.clone()).collect() + } +} + +// ============================================================================ +// 
============================================================================ + +pub struct JoinWithExpirationOperator { + left_input_schema: FsSchema, + right_input_schema: FsSchema, + left_schema: FsSchema, + right_schema: FsSchema, + + left_passer: Arc>>, + right_passer: Arc>>, + join_exec_plan: Arc, + + left_state: StateBuffer, + right_state: StateBuffer, +} + +impl JoinWithExpirationOperator { + async fn compute_pair( + &mut self, + left: RecordBatch, + right: RecordBatch, + ) -> Result> { + if left.num_rows() == 0 || right.num_rows() == 0 { + return Ok(vec![]); + } + + { + self.left_passer.write().unwrap().replace(left); + self.right_passer.write().unwrap().replace(right); + } + + self.join_exec_plan + .reset() + .map_err(|e| anyhow!("join plan reset: {e}"))?; + + let mut result_stream = self + .join_exec_plan + .execute(0, SessionContext::new().task_ctx()) + .map_err(|e| anyhow!("join execute: {e}"))?; + + let mut outputs = Vec::new(); + while let Some(batch) = result_stream.next().await { + outputs.push(batch.map_err(|e| anyhow!("{e}"))?); + } + + Ok(outputs) + } + + async fn process_side( + &mut self, + side: JoinSide, + batch: RecordBatch, + ctx: &mut TaskContext, + ) -> Result> { + let current_time = ctx + .last_present_watermark() + .unwrap_or_else(SystemTime::now); + + self.left_state.expire(current_time); + self.right_state.expire(current_time); + + match side { + JoinSide::Left => self.left_state.insert(batch.clone(), current_time), + JoinSide::Right => self.right_state.insert(batch.clone(), current_time), + } + + let opposite_batches = match side { + JoinSide::Left => self.right_state.get_all_batches(), + JoinSide::Right => self.left_state.get_all_batches(), + }; + + if opposite_batches.is_empty() { + return Ok(vec![]); + } + + let opposite_schema = match side { + JoinSide::Left => &self.right_schema.schema, + JoinSide::Right => &self.left_schema.schema, + }; + let combined_opposite_batch = concat_batches(opposite_schema, opposite_batches.iter())?; + 
+ let unkeyed_target_batch = match side { + JoinSide::Left => self.left_input_schema.unkeyed_batch(&batch)?, + JoinSide::Right => self.right_input_schema.unkeyed_batch(&batch)?, + }; + + let (left_input, right_input) = match side { + JoinSide::Left => (unkeyed_target_batch, combined_opposite_batch), + JoinSide::Right => (combined_opposite_batch, unkeyed_target_batch), + }; + + let result_batches = self.compute_pair(left_input, right_input).await?; + + Ok(result_batches + .into_iter() + .map(StreamOutput::Forward) + .collect()) + } +} + +#[async_trait] +impl Operator for JoinWithExpirationOperator { + fn name(&self) -> &str { + "JoinWithExpiration" + } + + async fn on_start(&mut self, _ctx: &mut TaskContext) -> Result<()> { + Ok(()) + } + + async fn process_data( + &mut self, + input_idx: usize, + batch: RecordBatch, + ctx: &mut TaskContext, + ) -> Result> { + let side = if input_idx == 0 { + JoinSide::Left + } else { + JoinSide::Right + }; + self.process_side(side, batch, ctx).await + } + + async fn process_watermark( + &mut self, + _watermark: Watermark, + _ctx: &mut TaskContext, + ) -> Result> { + Ok(vec![]) + } + + async fn snapshot_state( + &mut self, + _barrier: CheckpointBarrier, + _ctx: &mut TaskContext, + ) -> Result<()> { + Ok(()) + } + + async fn on_close(&mut self, _ctx: &mut TaskContext) -> Result> { + Ok(vec![]) + } +} + +// ============================================================================ +// ============================================================================ + +pub struct JoinWithExpirationConstructor; + +impl JoinWithExpirationConstructor { + pub fn with_config( + &self, + config: JoinOperator, + registry: Arc, + ) -> anyhow::Result { + let left_passer = Arc::new(RwLock::new(None)); + let right_passer = Arc::new(RwLock::new(None)); + + let codec = FsPhysicalExtensionCodec { + context: DecodingContext::LockedJoinPair { + left: left_passer.clone(), + right: right_passer.clone(), + }, + }; + + let join_physical_plan_node = 
PhysicalPlanNode::decode(&mut config.join_plan.as_slice())?; + let join_exec_plan = join_physical_plan_node.try_into_physical_plan( + registry.as_ref(), + &RuntimeEnvBuilder::new().build()?, + &codec, + )?; + + let left_input_schema: FsSchema = config.left_schema.unwrap().try_into()?; + let right_input_schema: FsSchema = config.right_schema.unwrap().try_into()?; + let left_schema = left_input_schema.schema_without_keys()?; + let right_schema = right_input_schema.schema_without_keys()?; + + let mut ttl = Duration::from_micros( + config + .ttl_micros + .expect("ttl must be set for non-instant join"), + ); + + if ttl == Duration::ZERO { + warn!("TTL was not set for join with expiration, defaulting to 24 hours."); + ttl = Duration::from_secs(24 * 60 * 60); + } + + Ok(JoinWithExpirationOperator { + left_input_schema, + right_input_schema, + left_schema, + right_schema, + left_passer, + right_passer, + join_exec_plan, + left_state: StateBuffer::new(ttl), + right_state: StateBuffer::new(ttl), + }) + } +} diff --git a/src/runtime/streaming/operators/joins/mod.rs b/src/runtime/streaming/operators/joins/mod.rs new file mode 100644 index 00000000..1cc83d36 --- /dev/null +++ b/src/runtime/streaming/operators/joins/mod.rs @@ -0,0 +1,17 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +pub mod join_instance; +pub mod join_with_expiration; + +pub use join_instance::InstantJoinConstructor; +pub use join_with_expiration::JoinWithExpirationConstructor; diff --git a/src/runtime/streaming/operators/key_by.rs b/src/runtime/streaming/operators/key_by.rs new file mode 100644 index 00000000..edafc063 --- /dev/null +++ b/src/runtime/streaming/operators/key_by.rs @@ -0,0 +1,165 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + +use anyhow::{anyhow, Result}; +use arrow_array::{Array, RecordBatch, UInt64Array}; +use arrow::compute::{sort_to_indices, take}; +use async_trait::async_trait; +use datafusion::physical_expr::PhysicalExpr; +use datafusion_physical_expr::expressions::Column; +use datafusion_common::hash_utils::create_hashes; +use std::sync::Arc; + +use crate::runtime::streaming::api::context::TaskContext; +use crate::runtime::streaming::api::operator::Operator; +use crate::runtime::streaming::StreamOutput; +use crate::sql::common::{CheckpointBarrier, Watermark}; + +use protocol::grpc::api::KeyPlanOperator; + +pub struct KeyByOperator { + name: String, + key_extractors: Vec>, + random_state: ahash::RandomState, +} + +impl KeyByOperator { + pub fn new(name: String, key_extractors: Vec>) -> Self { + Self { + name, + key_extractors, + random_state: ahash::RandomState::new(), + } + } +} + +#[async_trait] +impl Operator for KeyByOperator { + fn name(&self) -> &str { + &self.name + } + + async fn on_start(&mut self, _ctx: &mut TaskContext) -> 
Result<()> { + Ok(()) + } + + async fn process_data( + &mut self, + _input_idx: usize, + batch: RecordBatch, + _ctx: &mut TaskContext, + ) -> Result> { + let num_rows = batch.num_rows(); + if num_rows == 0 { + return Ok(vec![]); + } + + let mut key_columns = Vec::with_capacity(self.key_extractors.len()); + for expr in &self.key_extractors { + let column_array = expr + .evaluate(&batch) + .map_err(|e| anyhow!("Failed to evaluate key expr: {}", e))? + .into_array(num_rows) + .map_err(|e| anyhow!("Failed to convert into array: {}", e))?; + key_columns.push(column_array); + } + + let mut hash_buffer = vec![0u64; num_rows]; + create_hashes(&key_columns, &self.random_state, &mut hash_buffer) + .map_err(|e| anyhow!("Failed to compute hashes: {}", e))?; + + let hash_array = UInt64Array::from(hash_buffer); + + let sorted_indices = sort_to_indices(&hash_array, None, None) + .map_err(|e| anyhow!("Failed to sort hashes: {}", e))?; + + let sorted_hashes_ref = take(&hash_array, &sorted_indices, None)?; + let sorted_hashes = sorted_hashes_ref + .as_any() + .downcast_ref::() + .unwrap(); + + let sorted_columns: std::result::Result, _> = batch + .columns() + .iter() + .map(|col| take(col, &sorted_indices, None)) + .collect(); + let sorted_batch = RecordBatch::try_new(batch.schema(), sorted_columns?)?; + + let mut outputs = Vec::new(); + let mut start_idx = 0; + + while start_idx < num_rows { + let current_hash = sorted_hashes.value(start_idx); + let mut end_idx = start_idx + 1; + while end_idx < num_rows && sorted_hashes.value(end_idx) == current_hash { + end_idx += 1; + } + + let sub_batch = sorted_batch.slice(start_idx, end_idx - start_idx); + outputs.push(StreamOutput::Keyed(current_hash, sub_batch)); + start_idx = end_idx; + } + + Ok(outputs) + } + + async fn process_watermark( + &mut self, + watermark: Watermark, + _ctx: &mut TaskContext, + ) -> Result> { + Ok(vec![StreamOutput::Watermark(watermark)]) + } + + async fn snapshot_state( + &mut self, + _barrier: CheckpointBarrier, 
+ _ctx: &mut TaskContext, + ) -> Result<()> { + Ok(()) + } + + async fn on_close(&mut self, _ctx: &mut TaskContext) -> Result> { + Ok(vec![]) + } +} + +// --------------------------------------------------------------------------- +// Constructor +// --------------------------------------------------------------------------- + +pub struct KeyByConstructor; + +impl KeyByConstructor { + pub fn with_config(&self, config: KeyPlanOperator) -> Result { + let mut key_extractors: Vec> = + Vec::with_capacity(config.key_fields.len()); + + for field_idx in &config.key_fields { + let idx = *field_idx as usize; + let expr = Arc::new(Column::new(&format!("col_{}", idx), idx)) + as Arc; + key_extractors.push(expr); + } + + let name = if config.name.is_empty() { + "KeyBy".to_string() + } else { + config.name.clone() + }; + + Ok(KeyByOperator::new(name, key_extractors)) + } +} + diff --git a/src/runtime/streaming/operators/key_operator.rs b/src/runtime/streaming/operators/key_operator.rs new file mode 100644 index 00000000..4a3942e0 --- /dev/null +++ b/src/runtime/streaming/operators/key_operator.rs @@ -0,0 +1,283 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! 
+ +use anyhow::{anyhow, Result}; +use arrow_array::{Array, ArrayRef, RecordBatch, UInt64Array}; +use arrow::compute::{sort_to_indices, take}; +use async_trait::async_trait; +use datafusion::physical_expr::PhysicalExpr; +use datafusion_physical_expr::expressions::Column; +use datafusion_common::hash_utils::create_hashes; +use futures::StreamExt; +use std::sync::Arc; + +use crate::runtime::streaming::api::context::TaskContext; +use crate::runtime::streaming::api::operator::Operator; +use crate::runtime::streaming::operators::StatelessPhysicalExecutor; +use crate::runtime::streaming::StreamOutput; +use crate::sql::common::{CheckpointBarrier, Watermark}; + +use protocol::grpc::api::KeyPlanOperator; + +pub struct KeyByOperator { + name: String, + key_extractors: Vec>, + random_state: ahash::RandomState, +} + +impl KeyByOperator { + pub fn new(name: String, key_extractors: Vec>) -> Self { + Self { + name, + key_extractors, + random_state: ahash::RandomState::new(), + } + } +} + +#[async_trait] +impl Operator for KeyByOperator { + fn name(&self) -> &str { + &self.name + } + + async fn on_start(&mut self, _ctx: &mut TaskContext) -> Result<()> { + Ok(()) + } + + async fn process_data( + &mut self, + _input_idx: usize, + batch: RecordBatch, + _ctx: &mut TaskContext, + ) -> Result> { + let num_rows = batch.num_rows(); + if num_rows == 0 { + return Ok(vec![]); + } + + let mut key_columns = Vec::with_capacity(self.key_extractors.len()); + for expr in &self.key_extractors { + let column_array = expr + .evaluate(&batch) + .map_err(|e| anyhow!("Failed to evaluate key expr: {}", e))? 
+ .into_array(num_rows) + .map_err(|e| anyhow!("Failed to convert into array: {}", e))?; + key_columns.push(column_array); + } + + let mut hash_buffer = vec![0u64; num_rows]; + create_hashes(&key_columns, &self.random_state, &mut hash_buffer) + .map_err(|e| anyhow!("Failed to compute hashes: {}", e))?; + + let hash_array = UInt64Array::from(hash_buffer); + + let sorted_indices = sort_to_indices(&hash_array, None, None) + .map_err(|e| anyhow!("Failed to sort hashes: {}", e))?; + + let sorted_hashes_ref = take(&hash_array, &sorted_indices, None)?; + let sorted_hashes = sorted_hashes_ref + .as_any() + .downcast_ref::() + .unwrap(); + + let sorted_columns: std::result::Result, _> = batch + .columns() + .iter() + .map(|col| take(col, &sorted_indices, None)) + .collect(); + let sorted_batch = RecordBatch::try_new(batch.schema(), sorted_columns?)?; + + let mut outputs = Vec::new(); + let mut start_idx = 0; + + while start_idx < num_rows { + let current_hash = sorted_hashes.value(start_idx); + let mut end_idx = start_idx + 1; + while end_idx < num_rows && sorted_hashes.value(end_idx) == current_hash { + end_idx += 1; + } + + let sub_batch = sorted_batch.slice(start_idx, end_idx - start_idx); + outputs.push(StreamOutput::Keyed(current_hash, sub_batch)); + start_idx = end_idx; + } + + Ok(outputs) + } + + async fn process_watermark( + &mut self, + watermark: Watermark, + _ctx: &mut TaskContext, + ) -> Result> { + Ok(vec![StreamOutput::Watermark(watermark)]) + } + + async fn snapshot_state( + &mut self, + _barrier: CheckpointBarrier, + _ctx: &mut TaskContext, + ) -> Result<()> { + Ok(()) + } + + async fn on_close(&mut self, _ctx: &mut TaskContext) -> Result> { + Ok(vec![]) + } +} + +// --------------------------------------------------------------------------- +// Constructor +// --------------------------------------------------------------------------- + +pub struct KeyByConstructor; + +impl KeyByConstructor { + pub fn with_config(&self, config: KeyPlanOperator) -> Result { 
+ let mut key_extractors: Vec> = + Vec::with_capacity(config.key_fields.len()); + + for field_idx in &config.key_fields { + let idx = *field_idx as usize; + let expr = Arc::new(Column::new(&format!("col_{}", idx), idx)) + as Arc; + key_extractors.push(expr); + } + + let name = if config.name.is_empty() { + "KeyBy".to_string() + } else { + config.name.clone() + }; + + Ok(KeyByOperator::new(name, key_extractors)) + } +} + +// =========================================================================== +// =========================================================================== + +pub struct KeyExecutionOperator { + name: String, + executor: StatelessPhysicalExecutor, + key_fields: Vec, + random_state: ahash::RandomState, +} + +impl KeyExecutionOperator { + pub fn new( + name: String, + executor: StatelessPhysicalExecutor, + key_fields: Vec, + ) -> Self { + Self { + name, + executor, + key_fields, + random_state: ahash::RandomState::new(), + } + } +} + +#[async_trait] +impl Operator for KeyExecutionOperator { + fn name(&self) -> &str { + &self.name + } + + async fn process_data( + &mut self, + _input_idx: usize, + batch: RecordBatch, + _ctx: &mut TaskContext, + ) -> Result> { + let mut outputs = Vec::new(); + + let mut stream = self.executor.process_batch(batch).await?; + + while let Some(batch_result) = stream.next().await { + let out_batch = batch_result?; + let num_rows = out_batch.num_rows(); + if num_rows == 0 { + continue; + } + + let key_columns: Vec = self + .key_fields + .iter() + .map(|&idx| out_batch.column(idx).clone()) + .collect(); + + let mut hash_buffer = vec![0u64; num_rows]; + create_hashes(&key_columns, &self.random_state, &mut hash_buffer) + .map_err(|e| anyhow!("hash compute: {e}"))?; + let hash_array = UInt64Array::from(hash_buffer); + + let sorted_indices = sort_to_indices(&hash_array, None, None) + .map_err(|e| anyhow!("sort hashes: {e}"))?; + + let sorted_hashes_ref = take(&hash_array, &sorted_indices, None)?; + let sorted_hashes = 
sorted_hashes_ref + .as_any() + .downcast_ref::() + .unwrap(); + + let sorted_columns: std::result::Result, _> = out_batch + .columns() + .iter() + .map(|col| take(col, &sorted_indices, None)) + .collect(); + let sorted_batch = + RecordBatch::try_new(out_batch.schema(), sorted_columns?)?; + + let mut start_idx = 0; + while start_idx < num_rows { + let current_hash = sorted_hashes.value(start_idx); + let mut end_idx = start_idx + 1; + while end_idx < num_rows + && sorted_hashes.value(end_idx) == current_hash + { + end_idx += 1; + } + + let sub_batch = sorted_batch.slice(start_idx, end_idx - start_idx); + outputs.push(StreamOutput::Keyed(current_hash, sub_batch)); + start_idx = end_idx; + } + } + Ok(outputs) + } + + async fn process_watermark( + &mut self, + watermark: Watermark, + _ctx: &mut TaskContext, + ) -> Result> { + Ok(vec![StreamOutput::Watermark(watermark)]) + } + + async fn snapshot_state( + &mut self, + _barrier: CheckpointBarrier, + _ctx: &mut TaskContext, + ) -> Result<()> { + Ok(()) + } + + async fn on_close(&mut self, _ctx: &mut TaskContext) -> Result> { + Ok(vec![]) + } +} + diff --git a/src/runtime/streaming/operators/mod.rs b/src/runtime/streaming/operators/mod.rs new file mode 100644 index 00000000..ffe1c101 --- /dev/null +++ b/src/runtime/streaming/operators/mod.rs @@ -0,0 +1,30 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ + +pub mod grouping; +pub mod joins; +pub mod key_by; +pub mod sink; +pub mod source; +pub mod watermark; +pub mod windows; +mod key_operator; +pub mod projection; +mod stateless_physical_executor; +mod value_execution; + +pub use stateless_physical_executor::StatelessPhysicalExecutor; +pub use projection::ProjectionOperator; +pub use value_execution::ValueExecutionOperator; + +pub use grouping::{Key, UpdatingCache}; diff --git a/src/runtime/streaming/operators/projection.rs b/src/runtime/streaming/operators/projection.rs new file mode 100644 index 00000000..0136e18e --- /dev/null +++ b/src/runtime/streaming/operators/projection.rs @@ -0,0 +1,140 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use anyhow::{anyhow, Result}; +use arrow_array::RecordBatch; +use async_trait::async_trait; +use datafusion::physical_expr::PhysicalExpr; +use datafusion_proto::physical_plan::DefaultPhysicalExtensionCodec; +use datafusion_proto::physical_plan::from_proto::parse_physical_expr; +use datafusion_proto::protobuf::PhysicalExprNode; +use prost::Message; +use std::sync::Arc; + +use protocol::grpc::api::ProjectionOperator as ProjectionOperatorProto; + +use crate::runtime::streaming::api::context::TaskContext; +use crate::runtime::streaming::api::operator::Operator; +use crate::runtime::streaming::factory::global::Registry; +use crate::runtime::streaming::StreamOutput; +use crate::sql::common::{CheckpointBarrier, FsSchema, FsSchemaRef, Watermark}; +use crate::sql::logical_node::logical::OperatorName; + +pub struct ProjectionOperator { + name: String, + output_schema: FsSchemaRef, + exprs: Vec>, +} + +impl ProjectionOperator { + pub fn new( + name: String, + output_schema: FsSchemaRef, + exprs: Vec>, + ) -> Self { + Self { + name, + output_schema, + exprs, + } + } + + pub fn from_proto( + config: ProjectionOperatorProto, + registry: Arc, + ) -> Result { + let input_schema: FsSchema = config + .input_schema + .ok_or_else(|| anyhow!("missing projection input_schema"))? + .try_into() + .map_err(|e| anyhow!("projection input_schema: {e}"))?; + + let output_schema: FsSchema = config + .output_schema + .ok_or_else(|| anyhow!("missing projection output_schema"))? 
+ .try_into() + .map_err(|e| anyhow!("projection output_schema: {e}"))?; + + let exprs = config + .exprs + .iter() + .map(|raw| { + let expr_node = PhysicalExprNode::decode(&mut raw.as_slice()) + .map_err(|e| anyhow!("decode projection expr: {e}"))?; + parse_physical_expr( + &expr_node, + registry.as_ref(), + &input_schema.schema, + &DefaultPhysicalExtensionCodec {}, + ) + .map_err(|e| anyhow!("parse projection expr: {e}")) + }) + .collect::>>()?; + + let name = if config.name.is_empty() { + OperatorName::Projection.as_registry_key().to_string() + } else { + config.name + }; + + Ok(Self::new(name, Arc::new(output_schema), exprs)) + + } +} + +#[async_trait] +impl Operator for ProjectionOperator { + fn name(&self) -> &str { + &self.name + } + + async fn process_data( + &mut self, + _input_idx: usize, + batch: RecordBatch, + _ctx: &mut TaskContext, + ) -> Result> { + if batch.num_rows() == 0 { + return Ok(vec![]); + } + + let projected_columns = self + .exprs + .iter() + .map(|expr| { + expr.evaluate(&batch) + .and_then(|val| val.into_array(batch.num_rows())) + }) + .collect::>>()?; + + let out_batch = + RecordBatch::try_new(self.output_schema.schema.clone(), projected_columns)?; + + Ok(vec![StreamOutput::Forward(out_batch)]) + } + + async fn process_watermark( + &mut self, + watermark: Watermark, + _ctx: &mut TaskContext, + ) -> Result> { + Ok(vec![StreamOutput::Watermark(watermark)]) + } + + async fn snapshot_state( + &mut self, + _barrier: CheckpointBarrier, + _ctx: &mut TaskContext, + ) -> Result<()> { + Ok(()) + } +} diff --git a/src/runtime/streaming/operators/sink/kafka/mod.rs b/src/runtime/streaming/operators/sink/kafka/mod.rs new file mode 100644 index 00000000..4b6d48cb --- /dev/null +++ b/src/runtime/streaming/operators/sink/kafka/mod.rs @@ -0,0 +1,358 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + +use anyhow::{anyhow, bail, Result}; +use arrow_array::cast::AsArray; +use arrow_array::Array; +use arrow_array::RecordBatch; +use arrow_schema::{DataType, TimeUnit}; +use async_trait::async_trait; +use rdkafka::error::{KafkaError, RDKafkaErrorCode}; +use rdkafka::producer::{DeliveryFuture, FutureProducer, FutureRecord, Producer}; +use rdkafka::util::Timeout; +use rdkafka::ClientConfig; +use std::collections::HashMap; +use std::time::Duration; +use tokio::time::sleep; +use tracing::{info, warn}; + +use crate::runtime::streaming::api::context::TaskContext; +use crate::runtime::streaming::api::operator::Operator; +use crate::runtime::streaming::format::DataSerializer; +use crate::runtime::streaming::StreamOutput; +use crate::sql::common::constants::factory_operator_name; +use crate::sql::common::{CheckpointBarrier, FsSchema, Watermark}; +// ============================================================================ +// ============================================================================ + +#[derive(Debug, Clone)] +pub enum ConsistencyMode { + AtLeastOnce, + ExactlyOnce, +} + +struct TransactionalState { + next_transaction_index: usize, + active_producer: FutureProducer, + producer_awaiting_commit: Option, +} + +// ============================================================================ +// ============================================================================ + +pub struct KafkaSinkOperator { + pub topic: String, + pub bootstrap_servers: String, + pub consistency_mode: ConsistencyMode, + pub client_config: HashMap, + + pub 
input_schema: FsSchema, + pub timestamp_col_idx: Option, + pub key_col_idx: Option, + + pub serializer: DataSerializer, + + at_least_once_producer: Option, + transactional_state: Option, + + write_futures: Vec, +} + +impl KafkaSinkOperator { + pub fn new( + topic: String, + bootstrap_servers: String, + consistency_mode: ConsistencyMode, + client_config: HashMap, + input_schema: FsSchema, + serializer: DataSerializer, + ) -> Self { + Self { + topic, + bootstrap_servers, + consistency_mode, + client_config, + input_schema, + timestamp_col_idx: None, + key_col_idx: None, + serializer, + at_least_once_producer: None, + transactional_state: None, + write_futures: Vec::new(), + } + } + + fn resolve_schema_indices(&mut self) { + self.timestamp_col_idx = Some(self.input_schema.timestamp_index); + + if let Some(routing_keys) = self.input_schema.routing_keys() { + if !routing_keys.is_empty() { + self.key_col_idx = Some(routing_keys[0]); + } + } + } + + fn create_producer(&self, ctx: &TaskContext, tx_index: Option) -> Result { + let mut config = ClientConfig::new(); + config.set("bootstrap.servers", &self.bootstrap_servers); + + for (k, v) in &self.client_config { + config.set(k, v); + } + + if let Some(idx) = tx_index { + config.set("enable.idempotence", "true"); + let transactional_id = format!( + "fs-tx-{}-{}-{}-{}", + ctx.job_id, self.topic, ctx.subtask_idx, idx + ); + config.set("transactional.id", &transactional_id); + + let producer: FutureProducer = config.create()?; + producer + .init_transactions(Timeout::After(Duration::from_secs(30))) + .map_err(|e| anyhow!("Failed to init Kafka transactions: {}", e))?; + producer + .begin_transaction() + .map_err(|e| anyhow!("Failed to begin Kafka transaction: {}", e))?; + + Ok(producer) + } else { + Ok(config.create()?) + } + } + + async fn flush_to_broker(&mut self) -> Result<()> { + let producer = self.current_producer(); + + producer.poll(Timeout::After(Duration::ZERO)); + + for future in self.write_futures.drain(..) 
{ + match future.await { + Ok(Ok(_)) => continue, + Ok(Err((e, _))) => bail!("Kafka producer delivery failed: {}", e), + Err(_) => bail!("Kafka delivery future canceled"), + } + } + Ok(()) + } + + fn current_producer(&self) -> &FutureProducer { + match &self.consistency_mode { + ConsistencyMode::AtLeastOnce => self.at_least_once_producer.as_ref().unwrap(), + ConsistencyMode::ExactlyOnce => &self.transactional_state.as_ref().unwrap().active_producer, + } + } +} + +fn event_timestamp_ms(batch: &RecordBatch, row: usize, col: usize) -> Option { + let arr = batch.column(col); + match arr.data_type() { + DataType::Timestamp(TimeUnit::Second, _) => { + let a = arr.as_primitive::(); + (!a.is_null(row)).then(|| a.value(row) * 1000) + } + DataType::Timestamp(TimeUnit::Millisecond, _) => { + let a = arr.as_primitive::(); + (!a.is_null(row)).then(|| a.value(row)) + } + DataType::Timestamp(TimeUnit::Microsecond, _) => { + let a = arr.as_primitive::(); + (!a.is_null(row)).then(|| a.value(row) / 1000) + } + DataType::Timestamp(TimeUnit::Nanosecond, _) => { + let a = arr.as_primitive::(); + (!a.is_null(row)).then(|| a.value(row) / 1_000_000) + } + _ => None, + } +} + +fn row_key_bytes(batch: &RecordBatch, row: usize, col: usize) -> Option> { + let arr = batch.column(col); + match arr.data_type() { + DataType::Utf8 => { + let s = arr.as_string::(); + if s.is_null(row) { + None + } else { + Some(s.value(row).as_bytes().to_vec()) + } + } + DataType::LargeUtf8 => { + let s = arr.as_string::(); + if s.is_null(row) { + None + } else { + Some(s.value(row).as_bytes().to_vec()) + } + } + _ => None, + } +} + +// ============================================================================ +// ============================================================================ + +#[async_trait] +impl Operator for KafkaSinkOperator { + fn name(&self) -> &str { + factory_operator_name::KAFKA_SINK + } + + async fn on_start(&mut self, ctx: &mut TaskContext) -> Result<()> { + 
self.resolve_schema_indices(); + + match self.consistency_mode { + ConsistencyMode::AtLeastOnce => { + self.at_least_once_producer = Some(self.create_producer(ctx, None)?); + } + ConsistencyMode::ExactlyOnce => { + let mut next_idx = 0usize; + + let active_producer = self.create_producer(ctx, Some(next_idx))?; + next_idx += 1; + + self.transactional_state = Some(TransactionalState { + next_transaction_index: next_idx, + active_producer, + producer_awaiting_commit: None, + }); + } + } + Ok(()) + } + + async fn process_data( + &mut self, + _input_idx: usize, + batch: RecordBatch, + _ctx: &mut TaskContext, + ) -> Result> { + let payloads = self.serializer.serialize(&batch)?; + let producer = self.current_producer().clone(); + + for (i, payload) in payloads.iter().enumerate() { + let ts_millis = self + .timestamp_col_idx + .and_then(|idx| event_timestamp_ms(&batch, i, idx)); + let key_bytes = self + .key_col_idx + .and_then(|idx| row_key_bytes(&batch, i, idx)); + + let mut record = FutureRecord::, Vec>::to(&self.topic).payload(&payload); + if let Some(ts) = ts_millis { + record = record.timestamp(ts); + } + if let Some(ref k) = key_bytes { + record = record.key(k); + } + + loop { + match producer.send_result(record) { + Ok(delivery_future) => { + self.write_futures.push(delivery_future); + break; + } + Err((KafkaError::MessageProduction(RDKafkaErrorCode::QueueFull), returned_record)) => { + record = returned_record; + sleep(Duration::from_millis(10)).await; + } + Err((e, _)) => bail!("Fatal Kafka send error: {}", e), + } + } + } + + Ok(vec![]) + } + + async fn process_watermark( + &mut self, + _watermark: Watermark, + _ctx: &mut TaskContext, + ) -> Result> { + Ok(vec![]) + } + + async fn snapshot_state( + &mut self, + _barrier: CheckpointBarrier, + ctx: &mut TaskContext, + ) -> Result<()> { + self.flush_to_broker().await?; + + if matches!(self.consistency_mode, ConsistencyMode::ExactlyOnce) { + let next_tx = self + .transactional_state + .as_ref() + .map(|s| 
s.next_transaction_index) + .unwrap(); + let new_producer = self.create_producer(ctx, Some(next_tx))?; + + let state = self.transactional_state.as_mut().unwrap(); + let old_producer = std::mem::replace(&mut state.active_producer, new_producer); + state.producer_awaiting_commit = Some(old_producer); + + state.next_transaction_index += 1; + } + + Ok(()) + } + + async fn commit_checkpoint(&mut self, epoch: u32, _ctx: &mut TaskContext) -> Result<()> { + if matches!(self.consistency_mode, ConsistencyMode::AtLeastOnce) { + return Ok(()); + } + + let state = self.transactional_state.as_mut().unwrap(); + let Some(committing_producer) = state.producer_awaiting_commit.take() else { + warn!( + "Received Commit for epoch {}, but no stashed producer exists. Possibly a recovery duplicate.", + epoch + ); + return Ok(()); + }; + + let mut retries = 0; + loop { + match committing_producer.commit_transaction(Timeout::After(Duration::from_secs(10))) { + Ok(_) => { + info!("Successfully committed Kafka transaction for epoch {}", epoch); + break; + } + Err(e) => { + retries += 1; + if retries >= 5 { + bail!( + "Failed to commit Kafka transaction after 5 retries. Fatal error: {}", + e + ); + } + warn!( + "Failed to commit Kafka transaction (Attempt {}/5): {}. Retrying...", + retries, e + ); + sleep(Duration::from_secs(2)).await; + } + } + } + + Ok(()) + } + + async fn on_close(&mut self, _ctx: &mut TaskContext) -> Result> { + self.flush_to_broker().await?; + info!("Kafka sink shut down gracefully."); + Ok(vec![]) + } +} diff --git a/src/runtime/streaming/operators/sink/mod.rs b/src/runtime/streaming/operators/sink/mod.rs new file mode 100644 index 00000000..aa340614 --- /dev/null +++ b/src/runtime/streaming/operators/sink/mod.rs @@ -0,0 +1,15 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + +pub mod kafka; + diff --git a/src/runtime/streaming/operators/source/kafka/mod.rs b/src/runtime/streaming/operators/source/kafka/mod.rs new file mode 100644 index 00000000..d0de692a --- /dev/null +++ b/src/runtime/streaming/operators/source/kafka/mod.rs @@ -0,0 +1,377 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ + +use anyhow::{anyhow, Context as _, Result}; +use arrow_array::RecordBatch; +use arrow_schema::SchemaRef; +use async_trait::async_trait; +use bincode::{Decode, Encode}; +use governor::{DefaultDirectRateLimiter, Quota, RateLimiter as GovernorRateLimiter}; +use rdkafka::consumer::{CommitMode, Consumer, StreamConsumer}; +use rdkafka::{ClientConfig, Message as KMessage, Offset, TopicPartitionList}; +use std::collections::HashMap; +use std::num::NonZeroU32; +use std::time::{Duration, Instant}; +use tracing::{debug, error, info, warn}; + +use crate::runtime::streaming::api::context::TaskContext; +use crate::runtime::streaming::api::source::{SourceEvent, SourceOffset, SourceOperator}; +use crate::runtime::streaming::format::{BadDataPolicy, DataDeserializer, Format}; +use crate::sql::common::{CheckpointBarrier, MetadataField}; +use crate::sql::common::fs_schema::FieldValueType; +// ============================================================================ +// ============================================================================ + +#[derive(Copy, Clone, Debug, Encode, Decode, PartialEq, PartialOrd)] +pub struct KafkaState { + partition: i32, + offset: i64, +} + +pub trait BatchDeserializer: Send + 'static { + fn deserialize_slice( + &mut self, + payload: &[u8], + timestamp: u64, + metadata: Option>>, + ) -> Result<()>; + + fn should_flush(&self) -> bool; + + fn flush_buffer(&mut self) -> Result>; + + fn is_empty(&self) -> bool; +} + +// --------------------------------------------------------------------------- +// --------------------------------------------------------------------------- + +pub struct BufferedDeserializer { + inner: DataDeserializer, + buffer: Vec>, + batch_size: usize, +} + +impl BufferedDeserializer { + pub fn new(format: Format, schema: SchemaRef, bad_data_policy: BadDataPolicy, batch_size: usize) -> Self { + Self { + inner: DataDeserializer::new(format, schema, bad_data_policy), + buffer: Vec::with_capacity(batch_size), + batch_size, + } 
+ } +} + +impl BatchDeserializer for BufferedDeserializer { + fn deserialize_slice( + &mut self, + payload: &[u8], + _timestamp: u64, + _metadata: Option>>, + ) -> Result<()> { + self.buffer.push(payload.to_vec()); + Ok(()) + } + + fn should_flush(&self) -> bool { + self.buffer.len() >= self.batch_size + } + + fn flush_buffer(&mut self) -> Result> { + if self.buffer.is_empty() { + return Ok(None); + } + + let refs: Vec<&[u8]> = self.buffer.iter().map(|v| v.as_slice()).collect(); + let batch = self.inner.deserialize_batch(&refs)?; + self.buffer.clear(); + Ok(Some(batch)) + } + + fn is_empty(&self) -> bool { + self.buffer.is_empty() + } +} + +impl SourceOffset { + fn rdkafka_offset(self) -> Offset { + match self { + SourceOffset::Earliest => Offset::Beginning, + SourceOffset::Latest => Offset::End, + SourceOffset::Group => Offset::Stored, + } + } +} + +// ============================================================================ +// ============================================================================ + +const KAFKA_POLL_TIMEOUT: Duration = Duration::from_millis(100); +const MAX_BATCH_LINGER_TIME: Duration = Duration::from_millis(500); + +pub struct KafkaSourceOperator { + pub topic: String, + pub bootstrap_servers: String, + pub group_id: Option, + pub group_id_prefix: Option, + pub offset_mode: SourceOffset, + + pub client_configs: HashMap, + pub messages_per_second: NonZeroU32, + pub metadata_fields: Vec, + + consumer: Option, + rate_limiter: Option, + deserializer: Box, + + current_offsets: HashMap, + is_empty_assignment: bool, + + last_flush_time: Instant, +} + +impl KafkaSourceOperator { + pub fn new( + topic: String, + bootstrap_servers: String, + group_id: Option, + group_id_prefix: Option, + offset_mode: SourceOffset, + client_configs: HashMap, + messages_per_second: NonZeroU32, + metadata_fields: Vec, + deserializer: Box, + ) -> Self { + Self { + topic, + bootstrap_servers, + group_id, + group_id_prefix, + offset_mode, + client_configs, + 
messages_per_second, + metadata_fields, + consumer: None, + rate_limiter: None, + deserializer, + current_offsets: HashMap::new(), + is_empty_assignment: false, + last_flush_time: Instant::now(), + } + } + + async fn init_and_assign_consumer(&mut self, ctx: &mut TaskContext) -> Result<()> { + info!("Creating kafka consumer for {}", self.bootstrap_servers); + let mut client_config = ClientConfig::new(); + + let group_id = match (&self.group_id, &self.group_id_prefix) { + (Some(gid), _) => gid.clone(), + (None, Some(prefix)) => { + format!("{}-fs-{}-{}", prefix, ctx.job_id, ctx.subtask_idx) + } + (None, None) => format!("fs-{}-{}-consumer", ctx.job_id, ctx.subtask_idx), + }; + + for (key, value) in &self.client_configs { + client_config.set(key, value); + } + + let consumer: StreamConsumer = client_config + .set("bootstrap.servers", &self.bootstrap_servers) + .set("enable.partition.eof", "false") + .set("enable.auto.commit", "false") + .set("group.id", &group_id) + .create()?; + + let has_state = false; + let state_map: HashMap = HashMap::new(); + + let metadata = consumer + .fetch_metadata(Some(&self.topic), Duration::from_secs(30)) + .context("Failed to fetch Kafka metadata")?; + + let topic_meta = metadata + .topics() + .iter() + .find(|t| t.name() == self.topic) + .ok_or_else(|| anyhow!("topic {} not in metadata", self.topic))?; + + let partitions = topic_meta.partitions(); + let mut our_partitions = HashMap::new(); + let pmax = ctx.parallelism.max(1) as i32; + + for p in partitions { + if p.id().rem_euclid(pmax) == ctx.subtask_idx as i32 { + let offset = state_map + .get(&p.id()) + .map(|s| Offset::Offset(s.offset)) + .unwrap_or_else(|| { + if has_state { + Offset::Beginning + } else { + self.offset_mode.rdkafka_offset() + } + }); + our_partitions.insert((self.topic.clone(), p.id()), offset); + } + } + + if our_partitions.is_empty() { + warn!( + "[Task {}] Subscribed to no partitions. 
Entering idle mode.", + ctx.subtask_idx + ); + self.is_empty_assignment = true; + } else { + let topic_partitions = TopicPartitionList::from_topic_map(&our_partitions)?; + consumer.assign(&topic_partitions)?; + } + + self.consumer = Some(consumer); + Ok(()) + } +} + +// ============================================================================ +// ============================================================================ + +#[async_trait] +impl SourceOperator for KafkaSourceOperator { + fn name(&self) -> &str { + &self.topic + } + + async fn on_start(&mut self, ctx: &mut TaskContext) -> Result<()> { + self.init_and_assign_consumer(ctx).await?; + self.rate_limiter = Some(GovernorRateLimiter::direct(Quota::per_second( + self.messages_per_second, + ))); + Ok(()) + } + + async fn fetch_next(&mut self, _ctx: &mut TaskContext) -> Result { + if self.is_empty_assignment { + return Ok(SourceEvent::Idle); + } + + let consumer = self + .consumer + .as_ref() + .ok_or_else(|| anyhow!("Kafka consumer not initialized"))?; + let rate_limiter = self + .rate_limiter + .as_ref() + .ok_or_else(|| anyhow!("rate limiter not initialized"))?; + + match tokio::time::timeout(KAFKA_POLL_TIMEOUT, consumer.recv()).await { + Ok(Ok(msg)) => { + let partition = msg.partition(); + let offset = msg.offset(); + let timestamp = msg.timestamp().to_millis().unwrap_or(0); + + self.current_offsets.insert(partition, offset); + + if let Some(payload) = msg.payload() { + let topic = msg.topic(); + + let connector_metadata = if !self.metadata_fields.is_empty() { + let mut meta = HashMap::new(); + for f in &self.metadata_fields { + meta.insert( + f.field_name.as_str(), + match f.key.as_str() { + "key" => FieldValueType::Bytes(msg.key()), + "offset_id" => FieldValueType::Int64(Some(msg.offset())), + "partition" => FieldValueType::Int32(Some(msg.partition())), + "topic" => FieldValueType::String(Some(topic)), + "timestamp" => FieldValueType::Int64(Some(timestamp)), + _ => continue, + }, + ); + } + 
Some(meta) + } else { + None + }; + + self.deserializer.deserialize_slice( + payload, + timestamp.max(0) as u64, + connector_metadata, + )?; + } else { + debug!( + "Received tombstone message at partition {} offset {}", + partition, offset + ); + } + + rate_limiter.until_ready().await; + + let should_flush_by_size = self.deserializer.should_flush(); + let should_flush_by_time = self.last_flush_time.elapsed() > MAX_BATCH_LINGER_TIME; + + if !self.deserializer.is_empty() && (should_flush_by_size || should_flush_by_time) { + if let Some(batch) = self.deserializer.flush_buffer()? { + self.last_flush_time = Instant::now(); + return Ok(SourceEvent::Data(batch)); + } + } + + Ok(SourceEvent::Idle) + } + Ok(Err(e)) => { + error!("Kafka recv error: {}", e); + Err(anyhow!("Kafka error: {}", e)) + } + Err(_) => { + if !self.deserializer.is_empty() { + if let Some(batch) = self.deserializer.flush_buffer()? { + self.last_flush_time = Instant::now(); + return Ok(SourceEvent::Data(batch)); + } + } + Ok(SourceEvent::Idle) + } + } + } + + async fn snapshot_state( + &mut self, + _barrier: CheckpointBarrier, + ctx: &mut TaskContext, + ) -> Result<()> { + debug!("Source [{}] executing checkpoint", ctx.subtask_idx); + + let mut topic_partitions = TopicPartitionList::new(); + for (&partition, &offset) in &self.current_offsets { + topic_partitions + .add_partition_offset(&self.topic, partition, Offset::Offset(offset)) + .map_err(|e| anyhow!("add_partition_offset: {e}"))?; + } + + if let Some(consumer) = &self.consumer { + if let Err(e) = consumer.commit(&topic_partitions, CommitMode::Async) { + warn!("Failed to commit async offset to Kafka Broker: {:?}", e); + } + } + + Ok(()) + } + + async fn on_close(&mut self, _ctx: &mut TaskContext) -> Result<()> { + info!("Kafka source shutting down gracefully"); + self.consumer.take(); + Ok(()) + } +} diff --git a/src/runtime/streaming/operators/source/mod.rs b/src/runtime/streaming/operators/source/mod.rs new file mode 100644 index 
00000000..aa340614 --- /dev/null +++ b/src/runtime/streaming/operators/source/mod.rs @@ -0,0 +1,15 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + +pub mod kafka; + diff --git a/src/runtime/streaming/operators/stateless_physical_executor.rs b/src/runtime/streaming/operators/stateless_physical_executor.rs new file mode 100644 index 00000000..6c1e5c90 --- /dev/null +++ b/src/runtime/streaming/operators/stateless_physical_executor.rs @@ -0,0 +1,88 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ + +use std::sync::{Arc, RwLock}; + +use anyhow::{anyhow, Result}; +use arrow_array::RecordBatch; +use datafusion::execution::context::SessionContext; +use datafusion::execution::runtime_env::RuntimeEnvBuilder; +use datafusion::execution::SendableRecordBatchStream; +use datafusion::execution::TaskContext; +use datafusion::physical_plan::ExecutionPlan; +use datafusion_proto::physical_plan::AsExecutionPlan; +use datafusion_proto::protobuf::PhysicalPlanNode; +use futures::StreamExt; +use prost::Message; + +use crate::runtime::streaming::factory::Registry; +use crate::sql::physical::{DecodingContext, FsPhysicalExtensionCodec}; + +pub struct StatelessPhysicalExecutor { + batch: Arc>>, + plan: Arc, + task_context: Arc, +} + +impl StatelessPhysicalExecutor { + pub fn new(mut proto: &[u8], registry: &Registry) -> Result { + let batch = Arc::new(RwLock::default()); + + let plan_node = PhysicalPlanNode::decode(&mut proto) + .map_err(|e| anyhow!("decode PhysicalPlanNode: {e}"))?; + let codec = FsPhysicalExtensionCodec { + context: DecodingContext::SingleLockedBatch(batch.clone()), + }; + + let plan = plan_node.try_into_physical_plan( + registry, + &RuntimeEnvBuilder::new().build()?, + &codec, + )?; + + Ok(Self { + batch, + plan, + task_context: SessionContext::new().task_ctx(), + }) + } + + pub async fn process_batch(&mut self, batch: RecordBatch) -> Result { + { + let mut writer = self + .batch + .write() + .map_err(|e| anyhow!("SingleLockedBatch lock: {e}"))?; + *writer = Some(batch); + } + self.plan + .reset() + .map_err(|e| anyhow!("reset execution plan: {e}"))?; + self.plan + .execute(0, self.task_context.clone()) + .map_err(|e| anyhow!("failed to compute plan: {e}")) + } + + pub async fn process_single(&mut self, batch: RecordBatch) -> Result { + let mut stream = self.process_batch(batch).await?; + let result = stream + .next() + .await + .ok_or_else(|| anyhow!("empty output stream"))??; + anyhow::ensure!( + stream.next().await.is_none(), + "expected exactly one output 
batch" + ); + Ok(result) + } +} diff --git a/src/runtime/streaming/operators/value_execution.rs b/src/runtime/streaming/operators/value_execution.rs new file mode 100644 index 00000000..effdf5f6 --- /dev/null +++ b/src/runtime/streaming/operators/value_execution.rs @@ -0,0 +1,76 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + +use anyhow::Result; +use arrow_array::RecordBatch; +use async_trait::async_trait; +use futures::StreamExt; + +use crate::runtime::streaming::api::context::TaskContext; +use crate::runtime::streaming::api::operator::Operator; +use crate::runtime::streaming::operators::StatelessPhysicalExecutor; +use crate::runtime::streaming::StreamOutput; +use crate::sql::common::{CheckpointBarrier, Watermark}; + +pub struct ValueExecutionOperator { + name: String, + executor: StatelessPhysicalExecutor, +} + +impl ValueExecutionOperator { + pub fn new(name: String, executor: StatelessPhysicalExecutor) -> Self { + Self { name, executor } + } +} + +#[async_trait] +impl Operator for ValueExecutionOperator { + fn name(&self) -> &str { + &self.name + } + + async fn process_data( + &mut self, + _input_idx: usize, + batch: RecordBatch, + _ctx: &mut TaskContext, + ) -> Result> { + let mut outputs = Vec::new(); + + let mut stream = self.executor.process_batch(batch).await?; + + while let Some(batch_result) = stream.next().await { + let out_batch = batch_result?; + if out_batch.num_rows() > 0 { + outputs.push(StreamOutput::Forward(out_batch)); + } + } + 
Ok(outputs) + } + + async fn process_watermark( + &mut self, + watermark: Watermark, + _ctx: &mut TaskContext, + ) -> Result> { + Ok(vec![StreamOutput::Watermark(watermark)]) + } + + async fn snapshot_state( + &mut self, + _barrier: CheckpointBarrier, + _ctx: &mut TaskContext, + ) -> Result<()> { + Ok(()) + } +} diff --git a/src/runtime/streaming/operators/watermark/mod.rs b/src/runtime/streaming/operators/watermark/mod.rs new file mode 100644 index 00000000..3a0a1099 --- /dev/null +++ b/src/runtime/streaming/operators/watermark/mod.rs @@ -0,0 +1,15 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +pub mod watermark_generator; + +pub use watermark_generator::WatermarkGeneratorConstructor; diff --git a/src/runtime/streaming/operators/watermark/watermark_generator.rs b/src/runtime/streaming/operators/watermark/watermark_generator.rs new file mode 100644 index 00000000..0fee4a38 --- /dev/null +++ b/src/runtime/streaming/operators/watermark/watermark_generator.rs @@ -0,0 +1,239 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + + +use anyhow::{anyhow, Result}; +use arrow::compute::kernels::aggregate; +use arrow_array::cast::AsArray; +use arrow_array::types::TimestampNanosecondType; +use arrow_array::{RecordBatch, TimestampNanosecondArray}; +use bincode::{Decode, Encode}; +use datafusion::physical_expr::PhysicalExpr; +use datafusion_proto::physical_plan::DefaultPhysicalExtensionCodec; +use datafusion_proto::physical_plan::from_proto::parse_physical_expr; +use datafusion_proto::protobuf::PhysicalExprNode; +use prost::Message; +use std::sync::Arc; +use std::time::{Duration, SystemTime}; +use tracing::{debug, info}; + +use crate::runtime::streaming::api::context::TaskContext; +use crate::runtime::streaming::api::operator::Operator; +use crate::runtime::streaming::factory::Registry; +use async_trait::async_trait; +use protocol::grpc::api::ExpressionWatermarkConfig; +use crate::runtime::streaming::StreamOutput; +use crate::sql::common::{from_nanos, to_millis, CheckpointBarrier, FsSchema, Watermark}; + +#[derive(Debug, Copy, Clone, Encode, Decode, PartialEq, Eq)] +pub struct WatermarkGeneratorState { + pub last_watermark_emitted_at: SystemTime, + pub max_watermark: SystemTime, +} + +impl Default for WatermarkGeneratorState { + fn default() -> Self { + Self { + last_watermark_emitted_at: SystemTime::UNIX_EPOCH, + max_watermark: SystemTime::UNIX_EPOCH, + } + } +} + +pub struct WatermarkGeneratorOperator { + interval: Duration, + idle_time: Option, + expression: Arc, + timestamp_index: usize, + state: WatermarkGeneratorState, + last_event_wall: SystemTime, + is_idle: bool, +} + +impl WatermarkGeneratorOperator { + pub fn new( + interval: Duration, + idle_time: Option, + expression: Arc, + timestamp_index: usize, + ) -> Self { + Self { + interval, + idle_time, + expression, + timestamp_index, + state: WatermarkGeneratorState::default(), + last_event_wall: SystemTime::now(), + is_idle: false, 
+ } + } + + fn extract_max_timestamp(&self, batch: &RecordBatch) -> Option { + let ts_column = batch.column(self.timestamp_index); + let arr = ts_column.as_primitive::(); + let max_ts = aggregate::max(arr)?; + Some(from_nanos(max_ts as u128)) + } + + fn evaluate_watermark(&self, batch: &RecordBatch) -> Result { + let watermark_array = self + .expression + .evaluate(batch)? + .into_array(batch.num_rows())?; + + let typed_array = watermark_array + .as_any() + .downcast_ref::() + .ok_or_else(|| anyhow!("watermark expression must return TimestampNanosecondArray"))?; + + let max_watermark_nanos = aggregate::max(typed_array) + .ok_or_else(|| anyhow!("failed to extract max watermark from batch"))?; + + Ok(from_nanos(max_watermark_nanos as u128)) + } +} + +#[async_trait] +impl Operator for WatermarkGeneratorOperator { + fn name(&self) -> &str { + "ExpressionWatermarkGenerator" + } + + fn tick_interval(&self) -> Option { + Some(Duration::from_secs(1)) + } + + async fn on_start(&mut self, _ctx: &mut TaskContext) -> Result<()> { + self.last_event_wall = SystemTime::now(); + Ok(()) + } + + async fn process_data( + &mut self, + _input_idx: usize, + batch: RecordBatch, + ctx: &mut TaskContext, + ) -> Result> { + self.last_event_wall = SystemTime::now(); + + let mut outputs = vec![StreamOutput::Forward(batch.clone())]; + + let Some(max_batch_ts) = self.extract_max_timestamp(&batch) else { + return Ok(outputs); + }; + + let new_watermark = self.evaluate_watermark(&batch)?; + + self.state.max_watermark = self.state.max_watermark.max(new_watermark); + + let time_since_last_emit = max_batch_ts + .duration_since(self.state.last_watermark_emitted_at) + .unwrap_or(Duration::ZERO); + + if self.is_idle || time_since_last_emit > self.interval { + debug!( + "[{}] emitting expression watermark {}", + ctx.subtask_idx, + to_millis(self.state.max_watermark) + ); + + outputs.push(StreamOutput::Watermark(Watermark::EventTime( + self.state.max_watermark, + ))); + + 
self.state.last_watermark_emitted_at = max_batch_ts; + self.is_idle = false; + } + + Ok(outputs) + } + + async fn process_watermark( + &mut self, + _watermark: Watermark, + _ctx: &mut TaskContext, + ) -> Result> { + Ok(vec![]) + } + + async fn process_tick( + &mut self, + _tick_index: u64, + ctx: &mut TaskContext, + ) -> Result> { + if let Some(idle_timeout) = self.idle_time { + let elapsed = self + .last_event_wall + .elapsed() + .unwrap_or(Duration::ZERO); + if !self.is_idle && elapsed > idle_timeout { + info!( + "task [{}] entering Idle after {:?}", + ctx.subtask_idx, idle_timeout + ); + self.is_idle = true; + return Ok(vec![StreamOutput::Watermark(Watermark::Idle)]); + } + } + Ok(vec![]) + } + + async fn snapshot_state(&mut self, _barrier: CheckpointBarrier, _ctx: &mut TaskContext) -> Result<()> { + Ok(()) + } + + async fn on_close(&mut self, _ctx: &mut TaskContext) -> Result> { + Ok(vec![StreamOutput::Watermark(Watermark::EventTime(from_nanos( + u64::MAX as u128, + )))]) + } +} + +pub struct WatermarkGeneratorConstructor; + +impl WatermarkGeneratorConstructor { + pub fn with_config( + &self, + config: ExpressionWatermarkConfig, + registry: Arc, + ) -> anyhow::Result { + let input_schema: FsSchema = config + .input_schema + .ok_or_else(|| anyhow!("missing input schema"))? 
+ .try_into() + .map_err(|e| anyhow!("input schema: {e}"))?; + let timestamp_index = input_schema.timestamp_index; + + let expression_node = + PhysicalExprNode::decode(&mut config.expression.as_slice()).map_err(|e| { + anyhow!("decode expression: {e}") + })?; + let expression = parse_physical_expr( + &expression_node, + registry.as_ref(), + &input_schema.schema, + &DefaultPhysicalExtensionCodec {}, + ) + .map_err(|e| anyhow!("parse physical expr: {e}"))?; + + let interval = Duration::from_micros(config.period_micros); + let idle_time = config.idle_time_micros.map(Duration::from_micros); + + Ok(WatermarkGeneratorOperator::new( + interval, + idle_time, + expression, + timestamp_index, + )) + } +} + diff --git a/src/runtime/streaming/operators/windows/mod.rs b/src/runtime/streaming/operators/windows/mod.rs new file mode 100644 index 00000000..f1915f0d --- /dev/null +++ b/src/runtime/streaming/operators/windows/mod.rs @@ -0,0 +1,21 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +pub mod session_aggregating_window; +pub mod sliding_aggregating_window; +pub mod tumbling_aggregating_window; +pub mod window_function; + +pub use session_aggregating_window::SessionAggregatingWindowConstructor; +pub use sliding_aggregating_window::SlidingAggregatingWindowConstructor; +pub use tumbling_aggregating_window::TumblingAggregateWindowConstructor; +pub use window_function::WindowFunctionConstructor; diff --git a/src/runtime/streaming/operators/windows/session_aggregating_window.rs b/src/runtime/streaming/operators/windows/session_aggregating_window.rs new file mode 100644 index 00000000..93376c4c --- /dev/null +++ b/src/runtime/streaming/operators/windows/session_aggregating_window.rs @@ -0,0 +1,740 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ + +use anyhow::{anyhow, bail, Context, Result}; +use arrow::compute::{ + concat_batches, filter_record_batch, kernels::cmp::gt_eq, lexsort_to_indices, partition, take, +}; +use arrow::row::{RowConverter, SortField}; +use arrow_array::types::TimestampNanosecondType; +use arrow_array::{ + Array, BooleanArray, PrimitiveArray, RecordBatch, StructArray, TimestampNanosecondArray, +}; +use arrow_schema::{DataType, Field, FieldRef, Schema}; +use datafusion::execution::context::SessionContext; +use datafusion::execution::runtime_env::RuntimeEnvBuilder; +use datafusion::execution::SendableRecordBatchStream; +use datafusion::physical_plan::ExecutionPlan; +use datafusion_proto::physical_plan::AsExecutionPlan; +use datafusion_proto::protobuf::PhysicalPlanNode; +use futures::StreamExt; +use prost::Message; +use std::collections::{BTreeMap, HashMap, HashSet}; +use std::sync::{Arc, RwLock}; +use std::time::{Duration, SystemTime}; +use tokio::sync::mpsc::{unbounded_channel, UnboundedReceiver, UnboundedSender}; + +use crate::runtime::streaming::api::context::TaskContext; +use crate::runtime::streaming::api::operator::Operator; +use async_trait::async_trait; +use crate::runtime::streaming::factory::Registry; +use protocol::grpc::api::SessionWindowAggregateOperator; +use crate::runtime::streaming::StreamOutput; +use crate::sql::common::{from_nanos, to_nanos, CheckpointBarrier, FsSchema, FsSchemaRef, Watermark}; +use crate::sql::common::converter::Converter; +use crate::sql::physical::{DecodingContext, FsPhysicalExtensionCodec}; +use crate::sql::schema::utils::window_arrow_struct; +// ============================================================================ +// ============================================================================ + +struct SessionWindowConfig { + gap: Duration, + input_schema_ref: FsSchemaRef, + window_field: FieldRef, + window_index: usize, + final_physical_exec: Arc, + receiver_hook: Arc>>>, + output_schema: Arc, +} + +struct ActiveSession { + data_start: 
SystemTime, + data_end: SystemTime, + sender: Option>, + result_stream: SendableRecordBatchStream, +} + +impl ActiveSession { + async fn new( + aggregation_plan: Arc, + initial_timestamp: SystemTime, + sender: UnboundedSender, + ) -> Result { + aggregation_plan.reset()?; + let result_exec = aggregation_plan.execute(0, SessionContext::new().task_ctx())?; + Ok(Self { + data_start: initial_timestamp, + data_end: initial_timestamp, + sender: Some(sender), + result_stream: result_exec, + }) + } + + fn ingest_batch( + &mut self, + batch: RecordBatch, + gap: Duration, + ts_idx: usize, + ) -> Result> { + let ts_col = batch + .column(ts_idx) + .as_any() + .downcast_ref::() + .ok_or_else(|| anyhow!("expected timestamp column"))?; + let start_ts = ts_col.value(0); + let end_ts = ts_col.value(batch.num_rows() - 1); + + let current_end_with_gap = to_nanos(self.data_end + gap) as i64; + + if end_ts < current_end_with_gap { + self.data_end = self.data_end.max(from_nanos(end_ts as u128)); + self.data_start = self.data_start.min(from_nanos(start_ts as u128)); + self.sender + .as_ref() + .ok_or_else(|| anyhow!("session sender already closed"))? + .send(batch) + .map_err(|e| anyhow!("session channel send: {e}"))?; + return Ok(None); + } + + if current_end_with_gap < start_ts { + return Ok(Some((from_nanos(start_ts as u128), batch))); + } + + self.data_start = self.data_start.min(from_nanos(start_ts as u128)); + + let mut split_idx = 1; + while split_idx < batch.num_rows() { + let val = ts_col.value(split_idx); + if val < to_nanos(self.data_end) as i64 { + split_idx += 1; + continue; + } + if val < to_nanos(self.data_end + gap) as i64 { + self.data_end = from_nanos(val as u128); + split_idx += 1; + continue; + } + break; + } + + if split_idx == batch.num_rows() { + self.sender + .as_ref() + .ok_or_else(|| anyhow!("session sender already closed"))? 
+ .send(batch) + .map_err(|e| anyhow!("session channel send: {e}"))?; + return Ok(None); + } + + self.sender + .as_ref() + .ok_or_else(|| anyhow!("session sender already closed"))? + .send(batch.slice(0, split_idx)) + .map_err(|e| anyhow!("session channel send: {e}"))?; + let remaining_batch = batch.slice(split_idx, batch.num_rows() - split_idx); + let new_start_time = from_nanos(ts_col.value(split_idx) as u128); + Ok(Some((new_start_time, remaining_batch))) + } + + async fn close_and_drain(mut self, gap: Duration) -> Result { + self.sender.take(); + + let mut result_batches = Vec::new(); + while let Some(batch) = self.result_stream.next().await { + result_batches.push(batch?); + } + + if result_batches.len() != 1 || result_batches[0].num_rows() != 1 { + bail!("active session must yield exactly one aggregate row"); + } + + Ok(SessionWindowResult { + window_start: self.data_start, + window_end: self.data_end + gap, + batch: result_batches.into_iter().next().unwrap(), + }) + } +} + +struct SessionWindowResult { + window_start: SystemTime, + window_end: SystemTime, + batch: RecordBatch, +} + +struct KeySessionState { + config: Arc, + active_session: Option, + buffered_batches: BTreeMap>, +} + +impl KeySessionState { + fn new(config: Arc) -> Self { + Self { + config, + active_session: None, + buffered_batches: BTreeMap::new(), + } + } + + fn is_empty(&self) -> bool { + self.active_session.is_none() && self.buffered_batches.is_empty() + } + + fn earliest_data_time(&self) -> Option { + self.active_session + .as_ref() + .map(|s| s.data_start) + .or_else(|| self.buffered_batches.keys().next().copied()) + } + + fn next_watermark_action_time(&self) -> Option { + self.active_session + .as_ref() + .map(|s| s.data_end + self.config.gap) + .or_else(|| { + self.buffered_batches + .keys() + .next() + .map(|t| *t - self.config.gap) + }) + } + + async fn advance_by_watermark(&mut self, watermark: SystemTime) -> Result> { + let mut results = vec![]; + + loop { + if let Some(session) 
= &mut self.active_session { + if session.data_end + self.config.gap < watermark { + let closed_session = self + .active_session + .take() + .unwrap() + .close_and_drain(self.config.gap) + .await?; + results.push(closed_session); + } else { + break; + } + } else { + let Some((initial_ts, _)) = self.buffered_batches.first_key_value() else { + break; + }; + if watermark + self.config.gap < *initial_ts { + break; + } + + let (tx, rx) = unbounded_channel(); + *self.config.receiver_hook.write().unwrap() = Some(rx); + + self.active_session = Some( + ActiveSession::new( + self.config.final_physical_exec.clone(), + *initial_ts, + tx, + ) + .await?, + ); + + self.drain_buffer_to_active_session()?; + } + } + Ok(results) + } + + fn drain_buffer_to_active_session(&mut self) -> Result<()> { + let session = self + .active_session + .as_mut() + .ok_or_else(|| anyhow!("drain_buffer_to_active_session without active session"))?; + + while let Some((first_key, _)) = self.buffered_batches.first_key_value() { + if session.data_end + self.config.gap < *first_key { + break; + } + + let (_, batches) = self.buffered_batches.pop_first().unwrap(); + for batch in batches { + if let Some((rem_start, rem_batch)) = session.ingest_batch( + batch, + self.config.gap, + self.config.input_schema_ref.timestamp_index, + )? 
{ + self.buffered_batches + .entry(rem_start) + .or_default() + .push(rem_batch); + } + } + } + Ok(()) + } + + async fn add_data( + &mut self, + start_time: SystemTime, + batch: RecordBatch, + watermark: Option, + ) -> Result<()> { + self.buffered_batches + .entry(start_time) + .or_default() + .push(batch); + + if self.active_session.is_some() { + self.drain_buffer_to_active_session()?; + } + + if let Some(wm) = watermark { + let flushed = self.advance_by_watermark(wm).await?; + if !flushed.is_empty() { + bail!("unexpected flush during data ingestion; session watermark invariant violated"); + } + } + Ok(()) + } +} + +fn start_time_for_sorted_batch(batch: &RecordBatch, schema: &FsSchema) -> SystemTime { + let timestamp_array = batch.column(schema.timestamp_index); + let timestamp_array = timestamp_array + .as_any() + .downcast_ref::>() + .expect("timestamp column"); + from_nanos(timestamp_array.value(0) as u128) +} + +fn build_session_output_schema( + input: &FsSchema, + window_field: FieldRef, + window_index: usize, + agg_schema: &Schema, +) -> Result> { + let key_count = input.routing_keys().map(|k| k.len()).unwrap_or(0); + let mut fields: Vec = (0..key_count) + .map(|i| input.schema.fields()[i].clone()) + .collect(); + fields.insert(window_index, window_field); + fields.extend(agg_schema.fields().iter().cloned()); + fields.push(input.schema.fields()[input.timestamp_index].clone()); + Ok(Arc::new(Schema::new(fields))) +} + +// ============================================================================ +// ============================================================================ + +pub struct SessionWindowOperator { + config: Arc, + row_converter: Converter, + + session_states: HashMap, KeySessionState>, + pq_watermark_actions: BTreeMap>>, + pq_start_times: BTreeMap>>, +} + +impl SessionWindowOperator { + fn filter_batch_by_time(&self, batch: RecordBatch, watermark: Option) -> Result { + let Some(watermark) = watermark else { + return Ok(batch); + }; + + let 
timestamp_column = batch + .column(self.config.input_schema_ref.timestamp_index) + .as_any() + .downcast_ref::() + .ok_or_else(|| anyhow!("expected timestamp column"))?; + + let watermark_scalar = TimestampNanosecondArray::new_scalar(to_nanos(watermark) as i64); + let on_time = gt_eq(timestamp_column, &watermark_scalar)?; + + Ok(filter_record_batch(&batch, &on_time)?) + } + + fn sort_batch(&self, batch: &RecordBatch) -> Result { + let sort_columns = self.config.input_schema_ref.sort_columns(batch, true); + let sort_indices = lexsort_to_indices(&sort_columns, None)?; + + let columns = batch + .columns() + .iter() + .map(|c| take(c, &sort_indices, None).unwrap()) + .collect(); + + Ok(RecordBatch::try_new(batch.schema(), columns)?) + } + + async fn ingest_sorted_batch( + &mut self, + sorted_batch: RecordBatch, + watermark: Option, + ) -> Result<()> { + let partition_ranges = if !self.config.input_schema_ref.has_routing_keys() { + vec![0..sorted_batch.num_rows()] + } else { + let key_len = self + .config + .input_schema_ref + .routing_keys() + .as_ref() + .unwrap() + .len(); + let key_cols = sorted_batch + .columns() + .iter() + .take(key_len) + .cloned() + .collect::>(); + partition(key_cols.as_slice())?.ranges() + }; + + let key_count = self + .config + .input_schema_ref + .routing_keys() + .map(|k| k.len()) + .unwrap_or(0); + + for range in partition_ranges { + let key_batch = sorted_batch.slice(range.start, range.end - range.start); + + let row_key = if key_count == 0 { + Vec::new() + } else { + self.row_converter + .convert_columns(&key_batch.slice(0, 1).columns()[0..key_count]) + .context("row key convert")? 
+ .as_ref() + .to_vec() + }; + + let state = self + .session_states + .entry(row_key.clone()) + .or_insert_with(|| KeySessionState::new(self.config.clone())); + + let initial_action = state.next_watermark_action_time(); + let initial_start = state.earliest_data_time(); + + let batch_start = start_time_for_sorted_batch(&key_batch, &self.config.input_schema_ref); + + state + .add_data(batch_start, key_batch, watermark) + .await?; + + let new_action = state + .next_watermark_action_time() + .ok_or_else(|| anyhow!("missing next watermark action after add_data"))?; + let new_start = state + .earliest_data_time() + .ok_or_else(|| anyhow!("missing earliest data after add_data"))?; + + match initial_action { + Some(ia) => { + if ia != new_action { + self.pq_watermark_actions + .get_mut(&ia) + .expect("pq watermark entry") + .remove(&row_key); + self.pq_watermark_actions + .entry(new_action) + .or_default() + .insert(row_key.clone()); + } + let is = initial_start.expect("initial start"); + if is != new_start { + self.pq_start_times + .get_mut(&is) + .expect("pq start entry") + .remove(&row_key); + self.pq_start_times + .entry(new_start) + .or_default() + .insert(row_key.clone()); + } + } + None => { + self.pq_watermark_actions + .entry(new_action) + .or_default() + .insert(row_key.clone()); + self.pq_start_times + .entry(new_start) + .or_default() + .insert(row_key); + } + } + } + Ok(()) + } + + async fn evaluate_watermark(&mut self, watermark: SystemTime) -> Result> { + let mut emit_results: Vec<(Vec, Vec)> = Vec::new(); + + loop { + let popped_action_time = match self.pq_watermark_actions.first_key_value() { + Some((t, _)) if *t < watermark => *t, + _ => break, + }; + let keys = self + .pq_watermark_actions + .remove(&popped_action_time) + .expect("pop watermark pq"); + + for key in keys { + let state = self + .session_states + .get_mut(&key) + .ok_or_else(|| anyhow!("missing session state for key"))?; + let initial_start = state + .earliest_data_time() + .ok_or_else(|| 
anyhow!("missing earliest data in evaluate_watermark"))?; + + let completed_sessions = state.advance_by_watermark(watermark).await?; + if !completed_sessions.is_empty() { + emit_results.push((key.clone(), completed_sessions)); + } + + self.pq_start_times + .get_mut(&initial_start) + .expect("pq start") + .remove(&key); + + if state.is_empty() { + self.session_states.remove(&key); + } else { + let new_start = state + .earliest_data_time() + .expect("earliest after advance"); + self.pq_start_times + .entry(new_start) + .or_default() + .insert(key.clone()); + + let new_next_action = state + .next_watermark_action_time() + .expect("next action after advance"); + if new_next_action == popped_action_time { + bail!( + "processed watermark at {:?} but next watermark action stayed at {:?}", + watermark, popped_action_time + ); + } + self.pq_watermark_actions + .entry(new_next_action) + .or_default() + .insert(key); + } + } + } + + if emit_results.is_empty() { + return Ok(vec![]); + } + + Ok(vec![self.format_to_arrow(emit_results)?]) + } + + fn format_to_arrow(&self, results: Vec<(Vec, Vec)>) -> Result { + let (rows, session_results): (Vec<_>, Vec<_>) = results + .into_iter() + .flat_map(|(row, s_results)| s_results.into_iter().map(move |res| (row.clone(), res))) + .unzip(); + + let key_columns = if let Some(parser) = self.row_converter.parser() { + self.row_converter.convert_rows( + rows.iter() + .map(|row| parser.parse(row.as_ref())) + .collect(), + )? 
+ } else { + vec![] + }; + + let start_times: Vec = session_results + .iter() + .map(|r| to_nanos(r.window_start) as i64) + .collect(); + let end_times: Vec = session_results + .iter() + .map(|r| to_nanos(r.window_end) as i64) + .collect(); + + let window_start_array = PrimitiveArray::::from(start_times); + let window_end_array = PrimitiveArray::::from(end_times.clone()); + + let result_batches: Vec<&RecordBatch> = session_results.iter().map(|res| &res.batch).collect(); + let merged_batch = concat_batches(&session_results[0].batch.schema(), result_batches)?; + + let DataType::Struct(window_fields) = self.config.window_field.data_type() else { + bail!("expected window field to be a struct"); + }; + + let window_struct_array = StructArray::try_new( + window_fields.clone(), + vec![Arc::new(window_start_array), Arc::new(window_end_array)], + None, + )?; + + let mut columns = key_columns; + columns.insert(self.config.window_index, Arc::new(window_struct_array)); + columns.extend_from_slice(merged_batch.columns()); + + RecordBatch::try_new(self.config.output_schema.clone(), columns) + .context("failed to create session window output batch") + } + + #[allow(dead_code)] + fn earliest_batch_time(&self) -> Option { + self.pq_start_times + .first_key_value() + .map(|(start_time, _keys)| *start_time) + } +} + +#[async_trait] +impl Operator for SessionWindowOperator { + fn name(&self) -> &str { + "SessionWindow" + } + + async fn on_start(&mut self, _ctx: &mut TaskContext) -> Result<()> { + Ok(()) + } + + async fn process_data( + &mut self, + _input_idx: usize, + batch: RecordBatch, + ctx: &mut TaskContext, + ) -> Result> { + let watermark_time = ctx.last_present_watermark(); + + let filtered_batch = self.filter_batch_by_time(batch, watermark_time)?; + if filtered_batch.num_rows() == 0 { + return Ok(vec![]); + } + + let sorted_batch = self.sort_batch(&filtered_batch)?; + + self.ingest_sorted_batch(sorted_batch, watermark_time).await?; + + Ok(vec![]) + } + + async fn 
process_watermark( + &mut self, + watermark: Watermark, + _ctx: &mut TaskContext, + ) -> Result> { + let Watermark::EventTime(current_time) = watermark else { + return Ok(vec![]); + }; + + let output_batches = self.evaluate_watermark(current_time).await?; + Ok(output_batches + .into_iter() + .map(StreamOutput::Forward) + .collect()) + } + + async fn snapshot_state(&mut self, _barrier: CheckpointBarrier, _ctx: &mut TaskContext) -> Result<()> { + Ok(()) + } + + async fn on_close(&mut self, _ctx: &mut TaskContext) -> Result> { + Ok(vec![]) + } +} + +// ============================================================================ +// ============================================================================ + +pub struct SessionAggregatingWindowConstructor; + +impl SessionAggregatingWindowConstructor { + pub fn with_config( + &self, + config: SessionWindowAggregateOperator, + registry: Arc, + ) -> anyhow::Result { + let window_field = Arc::new(Field::new( + config.window_field_name, + window_arrow_struct(), + true, + )); + + let receiver_hook = Arc::new(RwLock::new(None)); + + let codec = FsPhysicalExtensionCodec { + context: DecodingContext::UnboundedBatchStream(receiver_hook.clone()), + }; + + let final_plan = PhysicalPlanNode::decode(&mut config.final_aggregation_plan.as_slice())?; + let final_execution_plan = final_plan.try_into_physical_plan( + registry.as_ref(), + &RuntimeEnvBuilder::new().build()?, + &codec, + )?; + + let input_schema: FsSchema = config + .input_schema + .ok_or_else(|| anyhow!("missing input schema"))? 
+ .try_into()?; + + let row_converter = if input_schema.routing_keys().is_none() { + let array = Arc::new(BooleanArray::from(vec![false])); + Converter::Empty( + RowConverter::new(vec![SortField::new(DataType::Boolean)])?, + array, + ) + } else { + let key_count = input_schema.routing_keys().as_ref().unwrap().len(); + Converter::RowConverter(RowConverter::new( + input_schema + .schema + .fields() + .into_iter() + .take(key_count) + .map(|field| SortField::new(field.data_type().clone())) + .collect(), + )?) + }; + + let output_schema = build_session_output_schema( + &input_schema, + window_field.clone(), + config.window_index as usize, + final_execution_plan.schema().as_ref(), + )?; + + let session_config = Arc::new(SessionWindowConfig { + gap: Duration::from_micros(config.gap_micros), + window_field, + window_index: config.window_index as usize, + input_schema_ref: Arc::new(input_schema), + final_physical_exec: final_execution_plan, + receiver_hook, + output_schema, + }); + + Ok(SessionWindowOperator { + config: session_config, + session_states: HashMap::new(), + pq_start_times: BTreeMap::new(), + pq_watermark_actions: BTreeMap::new(), + row_converter, + }) + } +} + diff --git a/src/runtime/streaming/operators/windows/sliding_aggregating_window.rs b/src/runtime/streaming/operators/windows/sliding_aggregating_window.rs new file mode 100644 index 00000000..19a539f3 --- /dev/null +++ b/src/runtime/streaming/operators/windows/sliding_aggregating_window.rs @@ -0,0 +1,545 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + + +use anyhow::{anyhow, bail, Result}; +use arrow::compute::{partition, sort_to_indices, take}; +use arrow_array::{Array, PrimitiveArray, RecordBatch, types::TimestampNanosecondType}; +use arrow_schema::SchemaRef; +use datafusion::common::ScalarValue; +use datafusion::execution::context::SessionContext; +use datafusion::execution::runtime_env::RuntimeEnvBuilder; +use datafusion::execution::SendableRecordBatchStream; +use datafusion::physical_expr::PhysicalExpr; +use datafusion::physical_plan::ExecutionPlan; +use datafusion_proto::physical_plan::DefaultPhysicalExtensionCodec; +use datafusion_proto::{ + physical_plan::{from_proto::parse_physical_expr, AsExecutionPlan}, + protobuf::{PhysicalExprNode, PhysicalPlanNode}, +}; +use futures::StreamExt; +use prost::Message; +use std::collections::{BTreeMap, VecDeque}; +use std::sync::{Arc, RwLock}; +use std::time::{Duration, SystemTime}; +use tokio::sync::mpsc::{unbounded_channel, UnboundedReceiver, UnboundedSender}; + +use crate::runtime::streaming::api::context::TaskContext; +use crate::runtime::streaming::api::operator::Operator; +use async_trait::async_trait; +use crate::runtime::streaming::factory::Registry; +use protocol::grpc::api::SlidingWindowAggregateOperator; +use crate::runtime::streaming::StreamOutput; +use crate::sql::common::{from_nanos, to_nanos, CheckpointBarrier, FsSchema, Watermark}; +use crate::sql::physical::{DecodingContext, FsPhysicalExtensionCodec}; +// ============================================================================ +// ============================================================================ + +#[derive(Default, Debug)] +struct RecordBatchPane { + batches: Vec, +} + +#[derive(Debug)] +struct RecordBatchTier { + width: Duration, + start_time: Option, + panes: VecDeque, +} + +impl RecordBatchTier { + fn new(width: Duration) -> Self { + Self { + width, + start_time: None, + 
panes: VecDeque::new(), + } + } + + fn bin_start(&self, timestamp: SystemTime) -> SystemTime { + if self.width == Duration::ZERO { + return timestamp; + } + let nanos = to_nanos(timestamp) - (to_nanos(timestamp) % self.width.as_nanos()); + from_nanos(nanos) + } + + fn insert(&mut self, batch: RecordBatch, timestamp: SystemTime) -> Result<()> { + let bin_start = self.bin_start(timestamp); + if self.start_time.is_none() { + self.start_time = Some(bin_start); + self.panes.push_back(RecordBatchPane { + batches: vec![batch], + }); + return Ok(()); + } + + let start_time = self.start_time.unwrap(); + let bin_index = + (bin_start.duration_since(start_time)?.as_nanos() / self.width.as_nanos()) as usize; + while self.panes.len() <= bin_index { + self.panes.push_back(RecordBatchPane::default()); + } + self.panes[bin_index].batches.push(batch); + Ok(()) + } + + fn batches_for_timestamp(&self, bin_start: SystemTime) -> Result> { + if self + .start_time + .map(|st| st > bin_start) + .unwrap_or(true) + { + return Ok(vec![]); + } + let bin_index = (bin_start + .duration_since(self.start_time.unwrap())? 
+ .as_nanos() + / self.width.as_nanos()) as usize; + if self.panes.len() <= bin_index { + return Ok(vec![]); + } + Ok(self.panes[bin_index].batches.clone()) + } + + fn delete_before(&mut self, cutoff: SystemTime) -> Result<()> { + let bin_start = self.bin_start(cutoff); + if self + .start_time + .map(|st| st >= bin_start) + .unwrap_or(true) + { + return Ok(()); + } + let bin_index = (bin_start + .duration_since(self.start_time.unwrap()) + .unwrap() + .as_nanos() + / self.width.as_nanos()) as usize; + + if bin_index >= self.panes.len() { + self.panes.clear(); + } else { + self.panes.drain(0..bin_index); + } + self.start_time = Some(bin_start); + Ok(()) + } +} + +#[derive(Debug)] +struct TieredRecordBatchHolder { + tier_widths: Vec, + tiers: Vec, +} + +impl TieredRecordBatchHolder { + fn new(tier_widths: Vec) -> Result { + for i in 0..tier_widths.len().saturating_sub(1) { + if !tier_widths[i + 1].as_nanos().is_multiple_of(tier_widths[i].as_nanos()) { + bail!( + "tier width {} does not evenly divide next {}", + tier_widths[i].as_nanos(), + tier_widths[i + 1].as_nanos() + ); + } + } + let tiers = tier_widths + .iter() + .map(|w| RecordBatchTier::new(*w)) + .collect(); + Ok(Self { tier_widths, tiers }) + } + + fn insert(&mut self, batch: RecordBatch, timestamp: SystemTime) -> Result<()> { + for tier in self.tiers.iter_mut() { + tier.insert(batch.clone(), timestamp)?; + } + Ok(()) + } + + fn batches_for_interval( + &self, + interval_start: SystemTime, + interval_end: SystemTime, + ) -> Result> { + let mut batches = Vec::new(); + let mut current_tier = 0usize; + let mut current_start = interval_start; + + while current_start < interval_end { + let tier_end = current_start + self.tier_widths[current_tier]; + if tier_end > interval_end { + current_tier = current_tier.saturating_sub(1); + continue; + } + if current_tier < self.tier_widths.len() - 1 { + let next_tier = &self.tiers[current_tier + 1]; + if next_tier.bin_start(current_start) == current_start + && current_start + 
next_tier.width <= interval_end + { + current_tier += 1; + continue; + } + } + batches.extend(self.tiers[current_tier].batches_for_timestamp(current_start)?); + current_start += self.tier_widths[current_tier]; + } + if current_start != interval_end { + bail!( + "interval end {:?} does not match current start {:?}", + interval_end, current_start + ); + } + Ok(batches) + } + + fn delete_before(&mut self, cutoff: SystemTime) -> Result<()> { + for tier in self.tiers.iter_mut() { + tier.delete_before(cutoff)?; + } + Ok(()) + } +} + +// ============================================================================ +// ============================================================================ + +struct ActiveBin { + sender: Option>, + result_stream: Option, + finished_batches: Vec, +} + +impl Default for ActiveBin { + fn default() -> Self { + Self { + sender: None, + result_stream: None, + finished_batches: Vec::new(), + } + } +} + +impl ActiveBin { + fn start_partial( + plan: Arc, + hook: &Arc>>>, + ) -> Result { + let (tx, rx) = unbounded_channel(); + *hook.write().unwrap() = Some(rx); + plan.reset()?; + let result_stream = plan.execute(0, SessionContext::new().task_ctx())?; + Ok(Self { + sender: Some(tx), + result_stream: Some(result_stream), + finished_batches: Vec::new(), + }) + } + + async fn close_and_drain(&mut self) -> Result<()> { + self.sender.take(); + if let Some(mut stream) = self.result_stream.take() { + while let Some(batch) = stream.next().await { + self.finished_batches.push(batch?); + } + } + Ok(()) + } +} + +// ============================================================================ +// ============================================================================ + +pub struct SlidingWindowOperator { + slide: Duration, + width: Duration, + binning_function: Arc, + + partial_aggregation_plan: Arc, + partial_schema: FsSchema, + + finish_execution_plan: Arc, + final_projection: Arc, + projection_input_schema: SchemaRef, + + receiver_hook: Arc>>>, + 
final_batches_passer: Arc>>, + + active_bins: BTreeMap, + tiered_record_batches: TieredRecordBatchHolder, +} + +impl SlidingWindowOperator { + fn bin_start(&self, timestamp: SystemTime) -> SystemTime { + if self.slide == Duration::ZERO { + return timestamp; + } + let nanos = to_nanos(timestamp) - (to_nanos(timestamp) % self.slide.as_nanos()); + from_nanos(nanos) + } + + fn add_bin_start_as_timestamp( + batch: &RecordBatch, + bin_start: SystemTime, + schema: SchemaRef, + ) -> Result { + let bin_start_scalar = ScalarValue::TimestampNanosecond(Some(to_nanos(bin_start) as i64), None); + let timestamp_array = bin_start_scalar.to_array_of_size(batch.num_rows())?; + let mut columns = batch.columns().to_vec(); + columns.push(timestamp_array); + Ok(RecordBatch::try_new(schema, columns)?) + } + + fn ensure_bin_running( + slot: &mut ActiveBin, + plan: Arc, + hook: &Arc>>>, + ) -> Result<()> { + if slot.sender.is_some() { + return Ok(()); + } + let preserved = std::mem::take(&mut slot.finished_batches); + let mut started = ActiveBin::start_partial(plan, hook)?; + started.finished_batches = preserved; + *slot = started; + Ok(()) + } +} + +#[async_trait] +impl Operator for SlidingWindowOperator { + fn name(&self) -> &str { + "SlidingWindow" + } + + async fn on_start(&mut self, _ctx: &mut TaskContext) -> Result<()> { + Ok(()) + } + + async fn process_data( + &mut self, + _input_idx: usize, + batch: RecordBatch, + ctx: &mut TaskContext, + ) -> Result> { + let bin_array = self + .binning_function + .evaluate(&batch)? 
+ .into_array(batch.num_rows())?; + let indices = sort_to_indices(bin_array.as_ref(), None, None)?; + + let columns = batch + .columns() + .iter() + .map(|c| take(c, &indices, None).unwrap()) + .collect(); + let sorted = RecordBatch::try_new(batch.schema(), columns)?; + let sorted_bins = take(bin_array.as_ref(), &indices, None)?; + + let typed_bin = sorted_bins + .as_any() + .downcast_ref::>() + .ok_or_else(|| anyhow!("binning function must produce TimestampNanosecond"))?; + let partition_ranges = partition(std::slice::from_ref(&sorted_bins))?.ranges(); + + let watermark = ctx.last_present_watermark(); + + for range in partition_ranges { + let bin_start = from_nanos(typed_bin.value(range.start) as u128); + + if let Some(wm) = watermark { + if bin_start < self.bin_start(wm) { + continue; + } + } + + let bin_batch = sorted.slice(range.start, range.end - range.start); + let slot = self.active_bins.entry(bin_start).or_default(); + + Self::ensure_bin_running( + slot, + self.partial_aggregation_plan.clone(), + &self.receiver_hook, + )?; + + let sender = slot + .sender + .as_ref() + .ok_or_else(|| anyhow!("partial bin sender missing after ensure"))?; + sender + .send(bin_batch) + .map_err(|e| anyhow!("partial channel send: {e}"))?; + } + + Ok(vec![]) + } + + async fn process_watermark( + &mut self, + watermark: Watermark, + _ctx: &mut TaskContext, + ) -> Result> { + let Watermark::EventTime(current_time) = watermark else { + return Ok(vec![]); + }; + let watermark_bin = self.bin_start(current_time); + + let mut final_outputs = Vec::new(); + + let mut expired_bins = Vec::new(); + for &k in self.active_bins.keys() { + if k + self.slide <= watermark_bin { + expired_bins.push(k); + } else { + break; + } + } + + for bin_start in expired_bins { + let mut bin = self + .active_bins + .remove(&bin_start) + .ok_or_else(|| anyhow!("missing active bin"))?; + let bin_end = bin_start + self.slide; + + bin.close_and_drain().await?; + for b in bin.finished_batches { + 
self.tiered_record_batches.insert(b, bin_start)?; + } + + let interval_start = bin_end - self.width; + let interval_end = bin_end; + + let partials = self + .tiered_record_batches + .batches_for_interval(interval_start, interval_end)?; + *self.final_batches_passer.write().unwrap() = partials; + + self.finish_execution_plan.reset()?; + let mut final_exec = self + .finish_execution_plan + .execute(0, SessionContext::new().task_ctx())?; + + let mut aggregate_results = Vec::new(); + while let Some(batch) = final_exec.next().await { + aggregate_results.push(Self::add_bin_start_as_timestamp( + &batch?, + interval_start, + self.projection_input_schema.clone(), + )?); + } + + *self.final_batches_passer.write().unwrap() = aggregate_results; + self.final_projection.reset()?; + let mut proj_exec = self + .final_projection + .execute(0, SessionContext::new().task_ctx())?; + + while let Some(batch) = proj_exec.next().await { + final_outputs.push(StreamOutput::Forward(batch?)); + } + + self.tiered_record_batches + .delete_before(bin_end + self.slide - self.width)?; + } + + Ok(final_outputs) + } + + async fn snapshot_state(&mut self, _barrier: CheckpointBarrier, _ctx: &mut TaskContext) -> Result<()> { + Ok(()) + } + + async fn on_close(&mut self, _ctx: &mut TaskContext) -> Result> { + Ok(vec![]) + } +} + +// ============================================================================ +// ============================================================================ + +pub struct SlidingAggregatingWindowConstructor; + +impl SlidingAggregatingWindowConstructor { + pub fn with_config( + &self, + config: SlidingWindowAggregateOperator, + registry: Arc, + ) -> anyhow::Result { + let width = Duration::from_micros(config.width_micros); + let slide = Duration::from_micros(config.slide_micros); + let input_schema: FsSchema = config + .input_schema + .ok_or_else(|| anyhow!("missing input schema"))? 
+ .try_into()?; + + let binning_function = parse_physical_expr( + &PhysicalExprNode::decode(&mut config.binning_function.as_slice())?, + registry.as_ref(), + &input_schema.schema, + &DefaultPhysicalExtensionCodec {}, + )?; + + let receiver_hook = Arc::new(RwLock::new(None)); + let final_batches_passer = Arc::new(RwLock::new(Vec::new())); + + let codec = FsPhysicalExtensionCodec { + context: DecodingContext::UnboundedBatchStream(receiver_hook.clone()), + }; + let final_codec = FsPhysicalExtensionCodec { + context: DecodingContext::LockedBatchVec(final_batches_passer.clone()), + }; + + let partial_plan = PhysicalPlanNode::decode(&mut config.partial_aggregation_plan.as_slice())? + .try_into_physical_plan( + registry.as_ref(), + &RuntimeEnvBuilder::new().build()?, + &codec, + )?; + + let finish_plan = PhysicalPlanNode::decode(&mut config.final_aggregation_plan.as_slice())? + .try_into_physical_plan( + registry.as_ref(), + &RuntimeEnvBuilder::new().build()?, + &final_codec, + )?; + + let final_proj = PhysicalPlanNode::decode(&mut config.final_projection.as_slice())? + .try_into_physical_plan( + registry.as_ref(), + &RuntimeEnvBuilder::new().build()?, + &final_codec, + )?; + + let partial_schema: FsSchema = config + .partial_schema + .ok_or_else(|| anyhow!("missing partial schema"))? 
+ .try_into()?; + + Ok(SlidingWindowOperator { + slide, + width, + binning_function, + partial_aggregation_plan: partial_plan, + partial_schema, + finish_execution_plan: finish_plan, + final_projection: final_proj.clone(), + projection_input_schema: final_proj.children()[0].schema().clone(), + receiver_hook, + final_batches_passer, + active_bins: BTreeMap::new(), + tiered_record_batches: TieredRecordBatchHolder::new(vec![slide])?, + }) + } +} + diff --git a/src/runtime/streaming/operators/windows/tumbling_aggregating_window.rs b/src/runtime/streaming/operators/windows/tumbling_aggregating_window.rs new file mode 100644 index 00000000..c0342d66 --- /dev/null +++ b/src/runtime/streaming/operators/windows/tumbling_aggregating_window.rs @@ -0,0 +1,376 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ + +use anyhow::{anyhow, Result}; +use arrow::compute::{partition, sort_to_indices, take}; +use arrow_array::{Array, PrimitiveArray, RecordBatch, types::TimestampNanosecondType}; +use arrow_schema::SchemaRef; +use datafusion::common::ScalarValue; +use datafusion::execution::context::SessionContext; +use datafusion::execution::runtime_env::RuntimeEnvBuilder; +use datafusion::execution::SendableRecordBatchStream; +use datafusion::physical_expr::PhysicalExpr; +use datafusion::physical_plan::ExecutionPlan; +use datafusion_proto::physical_plan::DefaultPhysicalExtensionCodec; +use datafusion_proto::{ + physical_plan::{from_proto::parse_physical_expr, AsExecutionPlan}, + protobuf::{PhysicalExprNode, PhysicalPlanNode}, +}; +use futures::StreamExt; +use prost::Message; +use std::collections::BTreeMap; +use std::mem; +use std::sync::{Arc, RwLock}; +use std::time::{Duration, SystemTime}; +use tokio::sync::mpsc::{unbounded_channel, UnboundedReceiver, UnboundedSender}; +use tracing::warn; + +use crate::runtime::streaming::api::context::TaskContext; +use crate::runtime::streaming::api::operator::Operator; +use async_trait::async_trait; +use crate::runtime::streaming::factory::Registry; +use protocol::grpc::api::TumblingWindowAggregateOperator; +use crate::runtime::streaming::StreamOutput; +use crate::sql::common::{from_nanos, to_nanos, CheckpointBarrier, FsSchema, Watermark}; +use crate::sql::common::time_utils::print_time; +use crate::sql::physical::{DecodingContext, FsPhysicalExtensionCodec}; +use crate::sql::schema::utils::add_timestamp_field_arrow; + +struct ActiveBin { + sender: Option>, + result_stream: Option, + finished_batches: Vec, +} + +impl Default for ActiveBin { + fn default() -> Self { + Self { + sender: None, + result_stream: None, + finished_batches: Vec::new(), + } + } +} + +impl ActiveBin { + fn start_partial( + plan: Arc, + hook: &Arc>>>, + ) -> Result { + let (tx, rx) = unbounded_channel(); + *hook.write().unwrap() = Some(rx); + plan.reset()?; + let 
result_stream = plan.execute(0, SessionContext::new().task_ctx())?; + Ok(Self { + sender: Some(tx), + result_stream: Some(result_stream), + finished_batches: Vec::new(), + }) + } + + async fn close_and_drain(&mut self) -> Result<()> { + self.sender.take(); + if let Some(mut stream) = self.result_stream.take() { + while let Some(batch) = stream.next().await { + self.finished_batches.push(batch?); + } + } + Ok(()) + } +} + +pub struct TumblingWindowOperator { + width: Duration, + binning_function: Arc, + + partial_aggregation_plan: Arc, + partial_schema: FsSchema, + + finish_execution_plan: Arc, + aggregate_with_timestamp_schema: SchemaRef, + final_projection: Option>, + + receiver_hook: Arc>>>, + final_batches_passer: Arc>>, + + active_bins: BTreeMap, +} + +impl TumblingWindowOperator { + fn bin_start(&self, timestamp: SystemTime) -> SystemTime { + if self.width == Duration::ZERO { + return timestamp; + } + let nanos = to_nanos(timestamp) - (to_nanos(timestamp) % self.width.as_nanos()); + from_nanos(nanos) + } + + fn add_bin_start_as_timestamp( + batch: &RecordBatch, + bin_start: SystemTime, + schema: SchemaRef, + ) -> Result { + let bin_start_scalar = ScalarValue::TimestampNanosecond(Some(to_nanos(bin_start) as i64), None); + let timestamp_array = bin_start_scalar.to_array_of_size(batch.num_rows())?; + let mut columns = batch.columns().to_vec(); + columns.push(timestamp_array); + RecordBatch::try_new(schema.clone(), columns) + .map_err(|e| anyhow!("add _timestamp column: {e}")) + } + + fn ensure_bin_running( + slot: &mut ActiveBin, + plan: Arc, + hook: &Arc>>>, + ) -> Result<()> { + if slot.sender.is_some() { + return Ok(()); + } + let preserved = mem::take(&mut slot.finished_batches); + let mut started = ActiveBin::start_partial(plan, hook)?; + started.finished_batches = preserved; + *slot = started; + Ok(()) + } +} + +#[async_trait] +impl Operator for TumblingWindowOperator { + fn name(&self) -> &str { + "TumblingWindow" + } + + async fn on_start(&mut self, _ctx: 
&mut TaskContext) -> Result<()> { + Ok(()) + } + + async fn process_data( + &mut self, + _input_idx: usize, + batch: RecordBatch, + ctx: &mut TaskContext, + ) -> Result> { + let bin_array = self + .binning_function + .evaluate(&batch)? + .into_array(batch.num_rows())?; + let indices = sort_to_indices(bin_array.as_ref(), None, None)?; + + let columns = batch + .columns() + .iter() + .map(|c| take(c, &indices, None).unwrap()) + .collect(); + let sorted = RecordBatch::try_new(batch.schema(), columns)?; + let sorted_bins = take(bin_array.as_ref(), &indices, None)?; + + let typed_bin = sorted_bins + .as_any() + .downcast_ref::>() + .ok_or_else(|| anyhow!("binning function must produce TimestampNanosecond"))?; + let partition_ranges = partition(std::slice::from_ref(&sorted_bins))?.ranges(); + + for range in partition_ranges { + let bin_start = from_nanos(typed_bin.value(range.start) as u128); + + if let Some(watermark) = ctx.last_present_watermark() { + if bin_start < self.bin_start(watermark) { + warn!( + "late data dropped: bin {} < watermark {}", + print_time(bin_start), + print_time(watermark) + ); + continue; + } + } + + let bin_batch = sorted.slice(range.start, range.end - range.start); + let slot = self.active_bins.entry(bin_start).or_default(); + + Self::ensure_bin_running( + slot, + self.partial_aggregation_plan.clone(), + &self.receiver_hook, + )?; + + let sender = slot + .sender + .as_ref() + .ok_or_else(|| anyhow!("tumbling bin sender missing after ensure"))?; + sender + .send(bin_batch) + .map_err(|e| anyhow!("partial channel send: {e}"))?; + } + + Ok(vec![]) + } + + async fn process_watermark( + &mut self, + watermark: Watermark, + _ctx: &mut TaskContext, + ) -> Result> { + let Watermark::EventTime(current_time) = watermark else { + return Ok(vec![]); + }; + + let mut final_outputs = Vec::new(); + + let mut expired_bins = Vec::new(); + for &k in self.active_bins.keys() { + if k + self.width <= current_time { + expired_bins.push(k); + } else { + break; + } + 
} + + for bin_start in expired_bins { + let mut bin = self + .active_bins + .remove(&bin_start) + .ok_or_else(|| anyhow!("missing tumbling bin"))?; + + bin.close_and_drain().await?; + let partial_batches = mem::take(&mut bin.finished_batches); + + if partial_batches.is_empty() { + continue; + } + + *self.final_batches_passer.write().unwrap() = partial_batches; + self.finish_execution_plan.reset()?; + let mut final_exec = self + .finish_execution_plan + .execute(0, SessionContext::new().task_ctx())?; + + let mut aggregate_results = Vec::new(); + while let Some(batch) = final_exec.next().await { + let batch = batch?; + let with_timestamp = Self::add_bin_start_as_timestamp( + &batch, + bin_start, + self.aggregate_with_timestamp_schema.clone(), + )?; + + if self.final_projection.is_none() { + final_outputs.push(StreamOutput::Forward(with_timestamp)); + } else { + aggregate_results.push(with_timestamp); + } + } + + if let Some(final_projection) = &self.final_projection { + *self.final_batches_passer.write().unwrap() = aggregate_results; + final_projection.reset()?; + let mut proj_exec = final_projection.execute(0, SessionContext::new().task_ctx())?; + + while let Some(batch) = proj_exec.next().await { + final_outputs.push(StreamOutput::Forward(batch?)); + } + } + } + + Ok(final_outputs) + } + + async fn snapshot_state(&mut self, _barrier: CheckpointBarrier, _ctx: &mut TaskContext) -> Result<()> { + Ok(()) + } + + async fn on_close(&mut self, _ctx: &mut TaskContext) -> Result> { + Ok(vec![]) + } +} + +pub struct TumblingAggregateWindowConstructor; + +impl TumblingAggregateWindowConstructor { + pub fn with_config( + &self, + config: TumblingWindowAggregateOperator, + registry: Arc, + ) -> anyhow::Result { + let width = Duration::from_micros(config.width_micros); + let input_schema: FsSchema = config + .input_schema + .ok_or_else(|| anyhow!("missing input schema"))? 
+ .try_into()?; + + let binning_function = parse_physical_expr( + &PhysicalExprNode::decode(&mut config.binning_function.as_slice())?, + registry.as_ref(), + &input_schema.schema, + &DefaultPhysicalExtensionCodec {}, + )?; + + let receiver_hook = Arc::new(RwLock::new(None)); + let final_batches_passer = Arc::new(RwLock::new(Vec::new())); + + let codec = FsPhysicalExtensionCodec { + context: DecodingContext::UnboundedBatchStream(receiver_hook.clone()), + }; + let final_codec = FsPhysicalExtensionCodec { + context: DecodingContext::LockedBatchVec(final_batches_passer.clone()), + }; + + let partial_plan = PhysicalPlanNode::decode(&mut config.partial_aggregation_plan.as_slice())? + .try_into_physical_plan( + registry.as_ref(), + &RuntimeEnvBuilder::new().build()?, + &codec, + )?; + + let partial_schema: FsSchema = config + .partial_schema + .ok_or_else(|| anyhow!("missing partial schema"))? + .try_into()?; + + let finish_plan = PhysicalPlanNode::decode(&mut config.final_aggregation_plan.as_slice())?; + let finish_execution_plan = finish_plan.try_into_physical_plan( + registry.as_ref(), + &RuntimeEnvBuilder::new().build()?, + &final_codec, + )?; + + let final_projection_plan = match &config.final_projection { + Some(proto) if !proto.is_empty() => { + let node = PhysicalPlanNode::decode(&mut proto.as_slice()) + .map_err(|e| anyhow!("decode final_projection: {e}"))?; + Some(node.try_into_physical_plan( + registry.as_ref(), + &RuntimeEnvBuilder::new().build()?, + &final_codec, + )?) 
+ } + _ => None, + }; + + let aggregate_with_timestamp_schema = + add_timestamp_field_arrow((*finish_execution_plan.schema()).clone()); + + Ok(TumblingWindowOperator { + width, + binning_function, + partial_aggregation_plan: partial_plan, + partial_schema, + finish_execution_plan, + aggregate_with_timestamp_schema, + final_projection: final_projection_plan, + receiver_hook, + final_batches_passer, + active_bins: BTreeMap::new(), + }) + } +} + diff --git a/src/runtime/streaming/operators/windows/window_function.rs b/src/runtime/streaming/operators/windows/window_function.rs new file mode 100644 index 00000000..4ab68cfd --- /dev/null +++ b/src/runtime/streaming/operators/windows/window_function.rs @@ -0,0 +1,279 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ + +use anyhow::{anyhow, Result}; +use arrow::compute::{max, min}; +use arrow_array::RecordBatch; +use datafusion::execution::context::SessionContext; +use datafusion::execution::runtime_env::RuntimeEnvBuilder; +use datafusion::execution::SendableRecordBatchStream; +use datafusion::physical_plan::ExecutionPlan; +use datafusion_proto::physical_plan::AsExecutionPlan; +use datafusion_proto::protobuf::PhysicalPlanNode; +use futures::StreamExt; +use prost::Message; +use std::collections::BTreeMap; +use std::sync::{Arc, RwLock}; +use std::time::SystemTime; +use tokio::sync::mpsc::{unbounded_channel, UnboundedReceiver, UnboundedSender}; +use tracing::warn; + +use crate::runtime::streaming::api::context::TaskContext; +use crate::runtime::streaming::api::operator::Operator; +use crate::runtime::streaming::factory::Registry; +use async_trait::async_trait; +use crate::runtime::streaming::StreamOutput; +use crate::sql::common::{from_nanos, CheckpointBarrier, FsSchema, FsSchemaRef, Watermark}; +use crate::sql::common::time_utils::print_time; +use crate::sql::physical::{DecodingContext, FsPhysicalExtensionCodec}; + +// ============================================================================ +// ============================================================================ + +struct ActiveWindowExec { + sender: Option>, + result_stream: Option, +} + +impl ActiveWindowExec { + fn new( + plan: Arc, + hook: &Arc>>>, + ) -> Result { + let (tx, rx) = unbounded_channel(); + *hook.write().unwrap() = Some(rx); + plan.reset()?; + let result_stream = plan.execute(0, SessionContext::new().task_ctx())?; + Ok(Self { + sender: Some(tx), + result_stream: Some(result_stream), + }) + } + + async fn close_and_drain(&mut self) -> Result> { + self.sender.take(); + let mut results = Vec::new(); + if let Some(mut stream) = self.result_stream.take() { + while let Some(batch) = stream.next().await { + results.push(batch?); + } + } + Ok(results) + } +} + +// 
============================================================================ +// ============================================================================ + +pub struct WindowFunctionOperator { + input_schema: FsSchemaRef, + input_schema_unkeyed: FsSchemaRef, + window_exec_plan: Arc, + receiver_hook: Arc>>>, + active_execs: BTreeMap, +} + +impl WindowFunctionOperator { + fn filter_and_split_batches( + &self, + batch: RecordBatch, + watermark: Option, + ) -> Result> { + if batch.num_rows() == 0 { + return Ok(vec![]); + } + + let timestamp_column = self.input_schema.timestamp_column(&batch); + let min_timestamp = from_nanos(min(timestamp_column).unwrap() as u128); + let max_timestamp = from_nanos(max(timestamp_column).unwrap() as u128); + + if let Some(wm) = watermark { + if max_timestamp < wm { + warn!( + "dropped late batch: max_ts {} < watermark {}", + print_time(max_timestamp), + print_time(wm) + ); + return Ok(vec![]); + } + } + + if min_timestamp == max_timestamp { + return Ok(vec![(batch, max_timestamp)]); + } + + let sorted_batch = self + .input_schema_unkeyed + .sort(batch, true) + .map_err(|e| anyhow!("sort for window fn: {e}"))?; + let filtered_batch = self + .input_schema_unkeyed + .filter_by_time(sorted_batch, watermark) + .map_err(|e| anyhow!("filter_by_time: {e}"))?; + if filtered_batch.num_rows() == 0 { + return Ok(vec![]); + } + + let filtered_timestamps = self.input_schema.timestamp_column(&filtered_batch); + let ranges = self + .input_schema_unkeyed + .partition(&filtered_batch, true) + .map_err(|e| anyhow!("partition by time: {e}"))?; + + let mut batches = Vec::with_capacity(ranges.len()); + for range in ranges { + let slice = filtered_batch.slice(range.start, range.end - range.start); + let ts = from_nanos(filtered_timestamps.value(range.start) as u128); + batches.push((slice, ts)); + } + Ok(batches) + } + + fn get_or_create_exec(&mut self, timestamp: SystemTime) -> Result<&mut ActiveWindowExec> { + use std::collections::btree_map::Entry; + 
match self.active_execs.entry(timestamp) { + Entry::Vacant(v) => { + let new_exec = + ActiveWindowExec::new(self.window_exec_plan.clone(), &self.receiver_hook)?; + Ok(v.insert(new_exec)) + } + Entry::Occupied(o) => Ok(o.into_mut()), + } + } +} + +#[async_trait] +impl Operator for WindowFunctionOperator { + fn name(&self) -> &str { + "WindowFunction" + } + + async fn on_start(&mut self, _ctx: &mut TaskContext) -> Result<()> { + Ok(()) + } + + async fn process_data( + &mut self, + _input_idx: usize, + batch: RecordBatch, + ctx: &mut TaskContext, + ) -> Result> { + let current_watermark = ctx.last_present_watermark(); + let split_batches = self.filter_and_split_batches(batch, current_watermark)?; + + for (sub_batch, timestamp) in split_batches { + let exec = self.get_or_create_exec(timestamp)?; + exec.sender + .as_ref() + .ok_or_else(|| anyhow!("window exec sender missing"))? + .send(sub_batch) + .map_err(|e| anyhow!("route batch to plan: {e}"))?; + } + + Ok(vec![]) + } + + async fn process_watermark( + &mut self, + watermark: Watermark, + _ctx: &mut TaskContext, + ) -> Result> { + let Watermark::EventTime(current_time) = watermark else { + return Ok(vec![]); + }; + + let mut final_outputs = Vec::new(); + + let mut expired_timestamps = Vec::new(); + for &k in self.active_execs.keys() { + if k < current_time { + expired_timestamps.push(k); + } else { + break; + } + } + + for ts in expired_timestamps { + let mut exec = self + .active_execs + .remove(&ts) + .ok_or_else(|| anyhow!("missing window exec"))?; + let result_batches = exec.close_and_drain().await?; + for batch in result_batches { + final_outputs.push(StreamOutput::Forward(batch)); + } + } + + Ok(final_outputs) + } + + async fn snapshot_state(&mut self, _barrier: CheckpointBarrier, _ctx: &mut TaskContext) -> Result<()> { + Ok(()) + } + + async fn on_close(&mut self, _ctx: &mut TaskContext) -> Result> { + Ok(vec![]) + } +} + +// ============================================================================ +// 
============================================================================ + +pub struct WindowFunctionConstructor; + +impl WindowFunctionConstructor { + pub fn with_config( + &self, + config: protocol::grpc::api::WindowFunctionOperator, + registry: Arc, + ) -> anyhow::Result { + let input_schema = Arc::new( + FsSchema::try_from( + config + .input_schema + .ok_or_else(|| anyhow!("missing input schema"))?, + ) + .map_err(|e| anyhow!("input schema: {e}"))?, + ); + + let input_schema_unkeyed = Arc::new( + FsSchema::from_schema_unkeyed(input_schema.schema.clone()) + .map_err(|e| anyhow!("unkeyed schema: {e}"))?, + ); + + let receiver_hook = Arc::new(RwLock::new(None)); + let codec = FsPhysicalExtensionCodec { + context: DecodingContext::UnboundedBatchStream(receiver_hook.clone()), + }; + + let window_exec_node = + PhysicalPlanNode::decode(&mut config.window_function_plan.as_slice()) + .map_err(|e| anyhow!("decode window_function_plan: {e}"))?; + let window_exec_plan = window_exec_node + .try_into_physical_plan( + registry.as_ref(), + &RuntimeEnvBuilder::new().build()?, + &codec, + ) + .map_err(|e| anyhow!("window physical plan: {e}"))?; + + Ok(WindowFunctionOperator { + input_schema, + input_schema_unkeyed, + window_exec_plan, + receiver_hook, + active_execs: BTreeMap::new(), + }) + } +} + diff --git a/src/runtime/streaming/protocol/control.rs b/src/runtime/streaming/protocol/control.rs new file mode 100644 index 00000000..d337046e --- /dev/null +++ b/src/runtime/streaming/protocol/control.rs @@ -0,0 +1,82 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + + +use serde::{Deserialize, Serialize}; +use std::time::Duration; +use tokio::sync::mpsc::{self, Receiver, Sender}; +use crate::sql::common::CheckpointBarrier; + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct CheckpointBarrierWire { + pub epoch: u32, + pub min_epoch: u32, + pub timestamp_secs: u64, + pub timestamp_subsec_nanos: u32, + pub then_stop: bool, +} + +impl From for CheckpointBarrierWire { + fn from(b: CheckpointBarrier) -> Self { + let d = b + .timestamp + .duration_since(std::time::UNIX_EPOCH) + .unwrap_or_default(); + Self { + epoch: b.epoch, + min_epoch: b.min_epoch, + timestamp_secs: d.as_secs(), + timestamp_subsec_nanos: d.subsec_nanos(), + then_stop: b.then_stop, + } + } +} + +impl From for CheckpointBarrier { + fn from(w: CheckpointBarrierWire) -> Self { + Self { + epoch: w.epoch, + min_epoch: w.min_epoch, + timestamp: std::time::UNIX_EPOCH + + Duration::new(w.timestamp_secs, w.timestamp_subsec_nanos), + then_stop: w.then_stop, + } + } +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub enum ControlCommand { + Start, + Stop { mode: StopMode }, + DropState, + Commit { epoch: u32 }, + UpdateConfig { config_json: String }, + TriggerCheckpoint { barrier: CheckpointBarrierWire }, +} + +impl ControlCommand { + pub fn trigger_checkpoint(barrier: CheckpointBarrier) -> Self { + Self::TriggerCheckpoint { + barrier: barrier.into(), + } + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub enum StopMode { + Graceful, + Immediate, +} + +pub fn control_channel(capacity: usize) -> (Sender, Receiver) { + mpsc::channel(capacity) +} diff --git a/src/runtime/streaming/protocol/event.rs b/src/runtime/streaming/protocol/event.rs new file mode 100644 index 00000000..b78b7fbc --- /dev/null +++ b/src/runtime/streaming/protocol/event.rs @@ -0,0 +1,22 @@ +// Licensed under the Apache License, Version 2.0 
(the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use arrow_array::RecordBatch; +use crate::sql::common::{CheckpointBarrier, Watermark}; + +#[derive(Debug, Clone)] +pub enum StreamEvent { + Data(RecordBatch), + Watermark(Watermark), + Barrier(CheckpointBarrier), + EndOfStream, +} diff --git a/src/runtime/streaming/protocol/mod.rs b/src/runtime/streaming/protocol/mod.rs new file mode 100644 index 00000000..fb20c59e --- /dev/null +++ b/src/runtime/streaming/protocol/mod.rs @@ -0,0 +1,20 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ + +pub mod control; +pub mod event; +pub mod stream_out; +pub mod tracked; +pub mod watermark; + +pub use stream_out::StreamOutput; diff --git a/src/runtime/streaming/protocol/stream_out.rs b/src/runtime/streaming/protocol/stream_out.rs new file mode 100644 index 00000000..fc7b9bba --- /dev/null +++ b/src/runtime/streaming/protocol/stream_out.rs @@ -0,0 +1,22 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use arrow_array::RecordBatch; +use crate::sql::common::Watermark; + +#[derive(Debug, Clone)] +pub enum StreamOutput { + Forward(RecordBatch), + Keyed(u64, RecordBatch), + Broadcast(RecordBatch), + Watermark(Watermark), +} diff --git a/src/runtime/streaming/protocol/tracked.rs b/src/runtime/streaming/protocol/tracked.rs new file mode 100644 index 00000000..d4360627 --- /dev/null +++ b/src/runtime/streaming/protocol/tracked.rs @@ -0,0 +1,39 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::sync::Arc; + +use crate::runtime::streaming::memory::MemoryTicket; +use crate::runtime::streaming::protocol::event::StreamEvent; + +/// +#[derive(Debug, Clone)] +pub struct TrackedEvent { + pub event: StreamEvent, + pub _ticket: Option>, +} + +impl TrackedEvent { + pub fn new(event: StreamEvent, ticket: Option) -> Self { + Self { + event, + _ticket: ticket.map(Arc::new), + } + } + + pub fn control(event: StreamEvent) -> Self { + Self { + event, + _ticket: None, + } + } +} diff --git a/src/runtime/streaming/protocol/watermark.rs b/src/runtime/streaming/protocol/watermark.rs new file mode 100644 index 00000000..f6e8388a --- /dev/null +++ b/src/runtime/streaming/protocol/watermark.rs @@ -0,0 +1,88 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ + +use crate::sql::common::Watermark; + +pub fn merge_watermarks(per_input: &[Option]) -> Option { + if per_input.iter().any(|w| w.is_none()) { + return None; + } + + let mut min_event: Option = None; + let mut all_idle = true; + + for w in per_input.iter().flatten() { + match w { + Watermark::Idle => {} + Watermark::EventTime(t) => { + all_idle = false; + min_event = Some(match min_event { + None => *t, + Some(m) => m.min(*t), + }); + } + } + } + + if all_idle { + Some(Watermark::Idle) + } else { + Some(Watermark::EventTime( + min_event.expect("non-idle alignment must have at least one EventTime"), + )) + } +} + +pub fn watermark_strictly_advances(new: Watermark, previous: Option) -> bool { + match previous { + None => true, + Some(prev) => match (new, prev) { + (Watermark::EventTime(tn), Watermark::EventTime(tp)) => tn > tp, + (Watermark::Idle, Watermark::Idle) => false, + (Watermark::Idle, Watermark::EventTime(_)) => true, + (Watermark::EventTime(_), Watermark::Idle) => true, + }, + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::time::{Duration, SystemTime}; + + #[test] + fn merge_waits_for_all_channels() { + let wms = vec![Some(Watermark::EventTime(SystemTime::UNIX_EPOCH)), None]; + assert!(merge_watermarks(&wms).is_none()); + } + + #[test] + fn merge_min_event_time_ignores_idle() { + let t1 = SystemTime::UNIX_EPOCH + Duration::from_secs(10); + let t2 = SystemTime::UNIX_EPOCH + Duration::from_secs(5); + let wms = vec![Some(Watermark::EventTime(t1)), Some(Watermark::Idle)]; + assert_eq!(merge_watermarks(&wms), Some(Watermark::EventTime(t1))); + + let wms = vec![ + Some(Watermark::EventTime(t1)), + Some(Watermark::EventTime(t2)), + ]; + assert_eq!(merge_watermarks(&wms), Some(Watermark::EventTime(t2))); + } + + #[test] + fn merge_all_idle() { + let wms = vec![Some(Watermark::Idle), Some(Watermark::Idle)]; + assert_eq!(merge_watermarks(&wms), Some(Watermark::Idle)); + } +} diff --git a/src/runtime/util/mod.rs b/src/runtime/util/mod.rs new file mode 
100644 index 00000000..0e3a3f7b --- /dev/null +++ b/src/runtime/util/mod.rs @@ -0,0 +1,16 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + +mod physical_aggregate; + +pub use physical_aggregate::decode_aggregate; diff --git a/src/runtime/util/physical_aggregate.rs b/src/runtime/util/physical_aggregate.rs new file mode 100644 index 00000000..33dd1e9f --- /dev/null +++ b/src/runtime/util/physical_aggregate.rs @@ -0,0 +1,77 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ + +use std::sync::Arc; + +use arrow::datatypes::SchemaRef; +use datafusion::common::internal_err; +use datafusion::common::Result as DFResult; +use datafusion::execution::FunctionRegistry; +use datafusion::physical_expr::aggregate::{AggregateExprBuilder, AggregateFunctionExpr}; +use datafusion::physical_expr::{LexOrdering, PhysicalExpr}; +use datafusion_proto::physical_plan::from_proto::{parse_physical_expr, parse_physical_sort_expr}; +use datafusion_proto::physical_plan::{DefaultPhysicalExtensionCodec, PhysicalExtensionCodec}; +use datafusion_proto::protobuf::physical_aggregate_expr_node::AggregateFunction; +use datafusion_proto::protobuf::physical_expr_node::ExprType; +use datafusion_proto::protobuf::{PhysicalExprNode, proto_error}; + +pub fn decode_aggregate( + schema: &SchemaRef, + name: &str, + expr: &PhysicalExprNode, + registry: &dyn FunctionRegistry, +) -> DFResult> { + let codec = &DefaultPhysicalExtensionCodec {}; + let expr_type = expr + .expr_type + .as_ref() + .ok_or_else(|| proto_error("Unexpected empty aggregate physical expression"))?; + + match expr_type { + ExprType::AggregateExpr(agg_node) => { + let input_phy_expr: Vec> = agg_node + .expr + .iter() + .map(|e| parse_physical_expr(e, registry, schema, codec)) + .collect::>>()?; + let ordering_req: LexOrdering = agg_node + .ordering_req + .iter() + .map(|e| parse_physical_sort_expr(e, registry, schema, codec)) + .collect::>()?; + agg_node + .aggregate_function + .as_ref() + .map(|func| match func { + AggregateFunction::UserDefinedAggrFunction(udaf_name) => { + let agg_udf = match &agg_node.fun_definition { + Some(buf) => codec.try_decode_udaf(udaf_name, buf)?, + None => registry.udaf(udaf_name)?, + }; + + AggregateExprBuilder::new(agg_udf, input_phy_expr) + .schema(Arc::clone(schema)) + .alias(name) + .with_ignore_nulls(agg_node.ignore_nulls) + .with_distinct(agg_node.distinct) + .order_by(ordering_req) + .build() + .map(Arc::new) + } + }) + .transpose()? 
+ .ok_or_else(|| proto_error("Invalid AggregateExpr, missing aggregate_function")) + } + _ => internal_err!("Invalid aggregate expression for AggregateExec"), + } +} diff --git a/src/runtime/input/input_protocol.rs b/src/runtime/wasm/input/input_protocol.rs similarity index 100% rename from src/runtime/input/input_protocol.rs rename to src/runtime/wasm/input/input_protocol.rs diff --git a/src/runtime/input/input_provider.rs b/src/runtime/wasm/input/input_provider.rs similarity index 100% rename from src/runtime/input/input_provider.rs rename to src/runtime/wasm/input/input_provider.rs diff --git a/src/runtime/input/input_runner.rs b/src/runtime/wasm/input/input_runner.rs similarity index 100% rename from src/runtime/input/input_runner.rs rename to src/runtime/wasm/input/input_runner.rs diff --git a/src/runtime/input/interface.rs b/src/runtime/wasm/input/interface.rs similarity index 100% rename from src/runtime/input/interface.rs rename to src/runtime/wasm/input/interface.rs diff --git a/src/runtime/input/mod.rs b/src/runtime/wasm/input/mod.rs similarity index 100% rename from src/runtime/input/mod.rs rename to src/runtime/wasm/input/mod.rs diff --git a/src/runtime/input/protocol/kafka/config.rs b/src/runtime/wasm/input/protocol/kafka/config.rs similarity index 100% rename from src/runtime/input/protocol/kafka/config.rs rename to src/runtime/wasm/input/protocol/kafka/config.rs diff --git a/src/runtime/input/protocol/kafka/kafka_protocol.rs b/src/runtime/wasm/input/protocol/kafka/kafka_protocol.rs similarity index 100% rename from src/runtime/input/protocol/kafka/kafka_protocol.rs rename to src/runtime/wasm/input/protocol/kafka/kafka_protocol.rs diff --git a/src/runtime/input/protocol/kafka/mod.rs b/src/runtime/wasm/input/protocol/kafka/mod.rs similarity index 100% rename from src/runtime/input/protocol/kafka/mod.rs rename to src/runtime/wasm/input/protocol/kafka/mod.rs diff --git a/src/runtime/input/protocol/mod.rs b/src/runtime/wasm/input/protocol/mod.rs 
similarity index 100% rename from src/runtime/input/protocol/mod.rs rename to src/runtime/wasm/input/protocol/mod.rs diff --git a/src/runtime/wasm/mod.rs b/src/runtime/wasm/mod.rs new file mode 100644 index 00000000..b1c82f4c --- /dev/null +++ b/src/runtime/wasm/mod.rs @@ -0,0 +1,18 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! WebAssembly runtime integration. + +pub mod input; +pub mod output; +pub mod processor; diff --git a/src/runtime/output/interface.rs b/src/runtime/wasm/output/interface.rs similarity index 100% rename from src/runtime/output/interface.rs rename to src/runtime/wasm/output/interface.rs diff --git a/src/runtime/output/mod.rs b/src/runtime/wasm/output/mod.rs similarity index 100% rename from src/runtime/output/mod.rs rename to src/runtime/wasm/output/mod.rs diff --git a/src/runtime/output/output_protocol.rs b/src/runtime/wasm/output/output_protocol.rs similarity index 100% rename from src/runtime/output/output_protocol.rs rename to src/runtime/wasm/output/output_protocol.rs diff --git a/src/runtime/output/output_provider.rs b/src/runtime/wasm/output/output_provider.rs similarity index 100% rename from src/runtime/output/output_provider.rs rename to src/runtime/wasm/output/output_provider.rs diff --git a/src/runtime/output/output_runner.rs b/src/runtime/wasm/output/output_runner.rs similarity index 100% rename from src/runtime/output/output_runner.rs rename to src/runtime/wasm/output/output_runner.rs diff --git 
a/src/runtime/output/protocol/kafka/kafka_protocol.rs b/src/runtime/wasm/output/protocol/kafka/kafka_protocol.rs similarity index 100% rename from src/runtime/output/protocol/kafka/kafka_protocol.rs rename to src/runtime/wasm/output/protocol/kafka/kafka_protocol.rs diff --git a/src/runtime/output/protocol/kafka/mod.rs b/src/runtime/wasm/output/protocol/kafka/mod.rs similarity index 100% rename from src/runtime/output/protocol/kafka/mod.rs rename to src/runtime/wasm/output/protocol/kafka/mod.rs diff --git a/src/runtime/output/protocol/kafka/producer_config.rs b/src/runtime/wasm/output/protocol/kafka/producer_config.rs similarity index 100% rename from src/runtime/output/protocol/kafka/producer_config.rs rename to src/runtime/wasm/output/protocol/kafka/producer_config.rs diff --git a/src/runtime/output/protocol/mod.rs b/src/runtime/wasm/output/protocol/mod.rs similarity index 100% rename from src/runtime/output/protocol/mod.rs rename to src/runtime/wasm/output/protocol/mod.rs diff --git a/src/runtime/processor/function_error.rs b/src/runtime/wasm/processor/function_error.rs similarity index 71% rename from src/runtime/processor/function_error.rs rename to src/runtime/wasm/processor/function_error.rs index b38f8dd9..f9b8fe8e 100644 --- a/src/runtime/processor/function_error.rs +++ b/src/runtime/wasm/processor/function_error.rs @@ -1,3 +1,15 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ #[derive(Debug, Clone)] pub enum FunctionErrorStage { Input, diff --git a/src/runtime/processor/mod.rs b/src/runtime/wasm/processor/mod.rs similarity index 100% rename from src/runtime/processor/mod.rs rename to src/runtime/wasm/processor/mod.rs diff --git a/src/runtime/processor/python/mod.rs b/src/runtime/wasm/processor/python/mod.rs similarity index 100% rename from src/runtime/processor/python/mod.rs rename to src/runtime/wasm/processor/python/mod.rs diff --git a/src/runtime/processor/python/python_host.rs b/src/runtime/wasm/processor/python/python_host.rs similarity index 100% rename from src/runtime/processor/python/python_host.rs rename to src/runtime/wasm/processor/python/python_host.rs diff --git a/src/runtime/processor/python/python_service.rs b/src/runtime/wasm/processor/python/python_service.rs similarity index 100% rename from src/runtime/processor/python/python_service.rs rename to src/runtime/wasm/processor/python/python_service.rs diff --git a/src/runtime/processor/wasm/input_strategy.rs b/src/runtime/wasm/processor/wasm/input_strategy.rs similarity index 100% rename from src/runtime/processor/wasm/input_strategy.rs rename to src/runtime/wasm/processor/wasm/input_strategy.rs diff --git a/src/runtime/processor/wasm/mod.rs b/src/runtime/wasm/processor/wasm/mod.rs similarity index 100% rename from src/runtime/processor/wasm/mod.rs rename to src/runtime/wasm/processor/wasm/mod.rs diff --git a/src/runtime/processor/wasm/thread_pool.rs b/src/runtime/wasm/processor/wasm/thread_pool.rs similarity index 100% rename from src/runtime/processor/wasm/thread_pool.rs rename to src/runtime/wasm/processor/wasm/thread_pool.rs diff --git a/src/runtime/processor/wasm/wasm_cache.rs b/src/runtime/wasm/processor/wasm/wasm_cache.rs similarity index 100% rename from src/runtime/processor/wasm/wasm_cache.rs rename to src/runtime/wasm/processor/wasm/wasm_cache.rs diff --git a/src/runtime/processor/wasm/wasm_host.rs b/src/runtime/wasm/processor/wasm/wasm_host.rs similarity 
index 100% rename from src/runtime/processor/wasm/wasm_host.rs rename to src/runtime/wasm/processor/wasm/wasm_host.rs diff --git a/src/runtime/processor/wasm/wasm_processor.rs b/src/runtime/wasm/processor/wasm/wasm_processor.rs similarity index 99% rename from src/runtime/processor/wasm/wasm_processor.rs rename to src/runtime/wasm/processor/wasm/wasm_processor.rs index 1afc9dcf..cd61be98 100644 --- a/src/runtime/processor/wasm/wasm_processor.rs +++ b/src/runtime/wasm/processor/wasm/wasm_processor.rs @@ -679,3 +679,4 @@ impl WasmProcessor for WasmProcessorImpl { Ok(()) } } + diff --git a/src/runtime/processor/wasm/wasm_processor_trait.rs b/src/runtime/wasm/processor/wasm/wasm_processor_trait.rs similarity index 100% rename from src/runtime/processor/wasm/wasm_processor_trait.rs rename to src/runtime/wasm/processor/wasm/wasm_processor_trait.rs diff --git a/src/runtime/processor/wasm/wasm_task.rs b/src/runtime/wasm/processor/wasm/wasm_task.rs similarity index 100% rename from src/runtime/processor/wasm/wasm_task.rs rename to src/runtime/wasm/processor/wasm/wasm_task.rs diff --git a/src/server/handler.rs b/src/server/handler.rs index 4721a5a1..2ef6b529 100644 --- a/src/server/handler.rs +++ b/src/server/handler.rs @@ -14,22 +14,21 @@ use std::sync::Arc; use std::time::Instant; use arrow_ipc::writer::StreamWriter; -use log::{error, info}; use tonic::{Request, Response as TonicResponse, Status}; +use tracing::{debug, error, info, warn}; use protocol::service::FunctionInfo as ProtoFunctionInfo; use protocol::service::{ - CreateFunctionRequest, CreatePythonFunctionRequest, DropFunctionRequest, Response, - ShowFunctionsRequest, ShowFunctionsResponse, SqlRequest, StartFunctionRequest, StatusCode, - StopFunctionRequest, function_stream_service_server::FunctionStreamService, + function_stream_service_server::FunctionStreamService, CreateFunctionRequest, + CreatePythonFunctionRequest, DropFunctionRequest, Response, ShowFunctionsRequest, + ShowFunctionsResponse, SqlRequest, 
StartFunctionRequest, StatusCode, StopFunctionRequest, }; -use crate::coordinator::Coordinator; use crate::coordinator::{ - CreateFunction, CreatePythonFunction, DataSet, DropFunction, ShowFunctions, - ShowFunctionsResult, StartFunction, Statement, StopFunction, + Coordinator, CreateFunction, CreatePythonFunction, DataSet, DropFunction, PythonModule, + ShowFunctions, ShowFunctionsResult, StartFunction, Statement, StopFunction, }; -use crate::sql::SqlParser; +use crate::sql::parse::parse_sql; pub struct FunctionStreamServiceImpl { coordinator: Arc, @@ -40,23 +39,66 @@ impl FunctionStreamServiceImpl { Self { coordinator } } - fn build_response(status_code: StatusCode, message: String, data: Option>) -> Response { + fn serialize_dataset(ds: &dyn DataSet) -> Result, String> { + let batch = ds.to_record_batch(); + let mut buf = Vec::new(); + + let mut writer = StreamWriter::try_new(&mut buf, &batch.schema()) + .map_err(|e| format!("IPC writer initialization failed: {e}"))?; + + writer + .write(&batch) + .map_err(|e| format!("IPC write failed: {e}"))?; + + writer + .finish() + .map_err(|e| format!("IPC finish failed: {e}"))?; + + Ok(buf) + } + + fn build_success_response( + status: StatusCode, + message: String, + data: Option>, + ) -> Response { + let payload = match data { + Some(ds) => match Self::serialize_dataset(ds.as_ref()) { + Ok(bytes) => Some(bytes), + Err(e) => { + error!("Data serialization error: {}", e); + return Self::build_error_response( + StatusCode::InternalServerError, + "Internal data serialization error".to_string(), + ); + } + }, + None => None, + }; + Response { - status_code: status_code as i32, + status_code: status as i32, message, - data, + data: payload, } } - fn data_set_to_ipc_bytes(ds: &dyn DataSet) -> Option> { - let batch = ds.to_record_batch(); - let mut buf = Vec::new(); - { - let mut writer = StreamWriter::try_new(&mut buf, &batch.schema()).ok()?; - writer.write(&batch).ok()?; - writer.finish().ok()?; + fn build_error_response(status: 
StatusCode, message: String) -> Response { + Response { + status_code: status as i32, + message, + data: None, + } + } + + async fn execute_statement(&self, stmt: &dyn Statement, success_status: StatusCode) -> Response { + let result = self.coordinator.execute_with_stream_catalog(stmt).await; + + if result.success { + Self::build_success_response(success_status, result.message, result.data) + } else { + Self::build_error_response(StatusCode::InternalServerError, result.message) } - Some(buf) } } @@ -66,225 +108,134 @@ impl FunctionStreamService for FunctionStreamServiceImpl { &self, request: Request, ) -> Result, Status> { - let start_time = Instant::now(); + let timer = Instant::now(); let req = request.into_inner(); - let parse_start = Instant::now(); - let stmt = match SqlParser::parse(&req.sql) { - Ok(stmt) => { - log::debug!("SQL parsed in {}ms", parse_start.elapsed().as_millis()); - stmt - } - Err(e) => { - return Ok(TonicResponse::new(Self::build_response( - StatusCode::BadRequest, - format!("Parse error: {}", e), - None, + let statements = parse_sql(&req.sql).map_err(|e| { + let detail = e.to_string(); + warn!("SQL parse rejection: {}", detail); + Status::invalid_argument(detail) + })?; + + if statements.is_empty() { + return Ok(TonicResponse::new(Self::build_success_response( + StatusCode::Ok, + "No statements executed".to_string(), + None, + ))); + } + + let mut final_response = None; + + for stmt in statements { + let result = self + .coordinator + .execute_with_stream_catalog(stmt.as_ref()) + .await; + + if !result.success { + error!("SQL execution aborted: {}", result.message); + return Ok(TonicResponse::new(Self::build_error_response( + StatusCode::InternalServerError, + result.message, ))); } - }; - - let exec_start = Instant::now(); - let result = self.coordinator.execute(stmt.as_ref()); - log::debug!( - "Coordinator execution finished in {}ms", - exec_start.elapsed().as_millis() - ); - let status_code = if result.success { - StatusCode::Ok - } else 
{ - error!("Execution failed: {}", result.message); - StatusCode::InternalServerError - }; + final_response = Some(result); + } - log::debug!( - "Total SQL request cost: {}ms", - start_time.elapsed().as_millis() - ); + let result = final_response.unwrap(); + let response = Self::build_success_response(StatusCode::Ok, result.message, result.data); - Ok(TonicResponse::new(Self::build_response( - status_code, - result.message, - result - .data - .as_ref() - .and_then(|ds| Self::data_set_to_ipc_bytes(ds.as_ref())), - ))) + debug!("execute_sql completed in {}ms", timer.elapsed().as_millis()); + Ok(TonicResponse::new(response)) } async fn create_function( &self, request: Request, ) -> Result, Status> { - let start_time = Instant::now(); + let timer = Instant::now(); let req = request.into_inner(); - info!( - "Received CreateFunction request. Config size: {}, Function size: {}", - req.config_bytes.len(), - req.function_bytes.len() - ); - - let config_bytes = if !req.config_bytes.is_empty() { - Some(req.config_bytes) - } else { - None - }; + let config_bytes = (!req.config_bytes.is_empty()).then_some(req.config_bytes); let stmt = CreateFunction::from_bytes(req.function_bytes, config_bytes); - let exec_start = Instant::now(); - let result = self.coordinator.execute(&stmt as &dyn Statement); - info!( - "Coordinator execution finished in {}ms", - exec_start.elapsed().as_millis() - ); - - let status_code = if result.success { - StatusCode::Created - } else { - error!("CreateFunction failed: {}", result.message); - StatusCode::InternalServerError - }; - - info!( - "Total CreateFunction request cost: {}ms", - start_time.elapsed().as_millis() - ); + let response = self.execute_statement(&stmt, StatusCode::Created).await; - Ok(TonicResponse::new(Self::build_response( - status_code, - result.message, - result - .data - .as_ref() - .and_then(|ds| Self::data_set_to_ipc_bytes(ds.as_ref())), - ))) + info!("create_function completed in {}ms", timer.elapsed().as_millis()); + 
Ok(TonicResponse::new(response)) } async fn create_python_function( &self, request: Request, ) -> Result, Status> { - let start_time = Instant::now(); + let timer = Instant::now(); let req = request.into_inner(); - info!( - "Received CreatePythonFunction request. Class name: {}, Modules: {}", - req.class_name, - req.modules.len() - ); - // Convert proto modules to PythonModule - let modules: Vec = req + if req.modules.is_empty() { + return Ok(TonicResponse::new(Self::build_error_response( + StatusCode::BadRequest, + "Python function creation requires at least one module".to_string(), + ))); + } + + let modules: Vec = req .modules .into_iter() - .map(|m| crate::coordinator::PythonModule { + .map(|m| PythonModule { name: m.module_name, bytes: m.module_bytes, }) .collect(); - if modules.is_empty() { - return Ok(TonicResponse::new(Self::build_response( - StatusCode::BadRequest, - "At least one module is required".to_string(), - None, - ))); - } - let stmt = CreatePythonFunction::new(req.class_name, modules, req.config_content); + let response = self.execute_statement(&stmt, StatusCode::Created).await; - let exec_start = Instant::now(); - let result = self.coordinator.execute(&stmt as &dyn Statement); info!( - "Coordinator execution finished in {}ms", - exec_start.elapsed().as_millis() + "create_python_function completed in {}ms", + timer.elapsed().as_millis() ); - - let status_code = if result.success { - StatusCode::Created - } else { - error!("CreatePythonFunction failed: {}", result.message); - StatusCode::InternalServerError - }; - - info!( - "Total CreatePythonFunction request cost: {}ms", - start_time.elapsed().as_millis() - ); - - Ok(TonicResponse::new(Self::build_response( - status_code, - result.message, - result - .data - .as_ref() - .and_then(|ds| Self::data_set_to_ipc_bytes(ds.as_ref())), - ))) + Ok(TonicResponse::new(response)) } async fn drop_function( &self, request: Request, ) -> Result, Status> { - let start_time = Instant::now(); + let timer = 
Instant::now(); let req = request.into_inner(); - info!( - "Received DropFunction request: function_name={}", - req.function_name - ); let stmt = DropFunction::new(req.function_name); - let exec_start = Instant::now(); - let result = self.coordinator.execute(&stmt as &dyn Statement); - info!( - "Coordinator execution finished in {}ms", - exec_start.elapsed().as_millis() - ); + let response = self.execute_statement(&stmt, StatusCode::Ok).await; - let status_code = if result.success { - StatusCode::Ok - } else { - error!("DropFunction failed: {}", result.message); - StatusCode::InternalServerError - }; - - info!( - "Total DropFunction request cost: {}ms", - start_time.elapsed().as_millis() - ); - - Ok(TonicResponse::new(Self::build_response( - status_code, - result.message, - None, - ))) + info!("drop_function completed in {}ms", timer.elapsed().as_millis()); + Ok(TonicResponse::new(response)) } async fn show_functions( &self, - request: Request, + _request: Request, ) -> Result, Status> { - let start_time = Instant::now(); - let _req = request.into_inner(); - info!("Received ShowFunctions request"); - + let timer = Instant::now(); let stmt = ShowFunctions::new(); - let exec_start = Instant::now(); - let result = self.coordinator.execute(&stmt as &dyn Statement); - info!( - "Coordinator execution finished in {}ms", - exec_start.elapsed().as_millis() - ); - let (status_code, message) = if result.success { - (StatusCode::Ok as i32, result.message) - } else { - error!("ShowFunctions failed: {}", result.message); - (StatusCode::InternalServerError as i32, result.message) - }; + let result = self + .coordinator + .execute_with_stream_catalog(&stmt) + .await; + + if !result.success { + error!("show_functions execution failed: {}", result.message); + return Ok(TonicResponse::new(ShowFunctionsResponse { + status_code: StatusCode::InternalServerError as i32, + message: result.message, + functions: vec![], + })); + } - let functions: Vec = result + let functions = result .data 
.as_ref() .and_then(|arc_ds| { @@ -302,15 +253,10 @@ impl FunctionStreamService for FunctionStreamServiceImpl { }) .unwrap_or_default(); - info!( - "Total ShowFunctions request cost: {}ms, count={}", - start_time.elapsed().as_millis(), - functions.len() - ); - + info!("show_functions completed in {}ms", timer.elapsed().as_millis()); Ok(TonicResponse::new(ShowFunctionsResponse { - status_code, - message, + status_code: StatusCode::Ok as i32, + message: result.message, functions, })) } @@ -319,76 +265,28 @@ impl FunctionStreamService for FunctionStreamServiceImpl { &self, request: Request, ) -> Result, Status> { - let start_time = Instant::now(); + let timer = Instant::now(); let req = request.into_inner(); - info!( - "Received StartFunction request: function_name={}", - req.function_name - ); let stmt = StartFunction::new(req.function_name); - let exec_start = Instant::now(); - let result = self.coordinator.execute(&stmt as &dyn Statement); - info!( - "Coordinator execution finished in {}ms", - exec_start.elapsed().as_millis() - ); - - let status_code = if result.success { - StatusCode::Ok - } else { - error!("StartFunction failed: {}", result.message); - StatusCode::InternalServerError - }; + let response = self.execute_statement(&stmt, StatusCode::Ok).await; - info!( - "Total StartFunction request cost: {}ms", - start_time.elapsed().as_millis() - ); - - Ok(TonicResponse::new(Self::build_response( - status_code, - result.message, - None, - ))) + info!("start_function completed in {}ms", timer.elapsed().as_millis()); + Ok(TonicResponse::new(response)) } async fn stop_function( &self, request: Request, ) -> Result, Status> { - let start_time = Instant::now(); + let timer = Instant::now(); let req = request.into_inner(); - info!( - "Received StopFunction request: function_name={}", - req.function_name - ); let stmt = StopFunction::new(req.function_name); - let exec_start = Instant::now(); - let result = self.coordinator.execute(&stmt as &dyn Statement); - info!( - 
"Coordinator execution finished in {}ms", - exec_start.elapsed().as_millis() - ); - - let status_code = if result.success { - StatusCode::Ok - } else { - error!("StopFunction failed: {}", result.message); - StatusCode::InternalServerError - }; - - info!( - "Total StopFunction request cost: {}ms", - start_time.elapsed().as_millis() - ); + let response = self.execute_statement(&stmt, StatusCode::Ok).await; - Ok(TonicResponse::new(Self::build_response( - status_code, - result.message, - None, - ))) + info!("stop_function completed in {}ms", timer.elapsed().as_millis()); + Ok(TonicResponse::new(response)) } } diff --git a/src/server/initializer.rs b/src/server/initializer.rs index ccb02788..70c19685 100644 --- a/src/server/initializer.rs +++ b/src/server/initializer.rs @@ -10,15 +10,23 @@ // See the License for the specific language governing permissions and // limitations under the License. -use crate::config::GlobalConfig; +use std::time::Instant; + use anyhow::{Context, Result}; +use tracing::{debug, info, warn}; + +use crate::config::GlobalConfig; -type InitializerFn = fn(&GlobalConfig) -> Result<()>; +pub type InitializerFn = fn(&GlobalConfig) -> Result<()>; #[derive(Clone)] -struct Component { - name: &'static str, - initializer: InitializerFn, +pub struct Component { + pub name: &'static str, + pub initializer: InitializerFn, +} + +pub struct ComponentRegistry { + components: Vec, } #[derive(Default)] @@ -27,25 +35,17 @@ pub struct ComponentRegistryBuilder { } impl ComponentRegistryBuilder { - #[inline] pub fn new() -> Self { - Self::with_capacity(8) - } - - #[inline] - pub fn with_capacity(capacity: usize) -> Self { Self { - components: Vec::with_capacity(capacity), + components: Vec::with_capacity(8), } } - #[inline] pub fn register(mut self, name: &'static str, initializer: InitializerFn) -> Self { self.components.push(Component { name, initializer }); self } - #[inline] pub fn build(self) -> ComponentRegistry { ComponentRegistry { components: 
self.components, @@ -53,57 +53,71 @@ impl ComponentRegistryBuilder { } } -pub struct ComponentRegistry { - components: Vec, -} - impl ComponentRegistry { pub fn initialize_all(&self, config: &GlobalConfig) -> Result<()> { if self.components.is_empty() { - log::warn!("No components registered for initialization"); + warn!("Component registry is empty; no components to initialize"); return Ok(()); } - log::info!("Initializing {} components...", self.components.len()); + let total = self.components.len(); + info!(total_components = total, "Commencing system initialization sequence"); - for (idx, component) in self.components.iter().enumerate() { - let start = std::time::Instant::now(); - log::debug!( - "[{}/{}] Initializing component: {}", - idx + 1, - self.components.len(), - component.name + for (index, component) in self.components.iter().enumerate() { + let start_time = Instant::now(); + + debug!( + component = component.name, + step = format!("{}/{}", index + 1, total), + "Initializing component" ); - (component.initializer)(config) - .with_context(|| format!("Component '{}' initialization failed", component.name))?; + (component.initializer)(config).with_context(|| { + format!("Fatal error initializing component: {}", component.name) + })?; - let elapsed = start.elapsed(); - log::debug!( - "[{}/{}] Component '{}' initialized successfully in {:?}", - idx + 1, - self.components.len(), - component.name, - elapsed + debug!( + component = component.name, + elapsed_ms = start_time.elapsed().as_millis(), + "Component initialized successfully" ); } - log::info!( - "All {} components initialized successfully", - self.components.len() - ); + info!("System initialization sequence completed successfully"); Ok(()) } +} - #[inline] - pub fn len(&self) -> usize { - self.components.len() - } +pub fn build_core_registry() -> ComponentRegistry { + let builder = { + let b = ComponentRegistryBuilder::new() + .register("WasmCache", initialize_wasm_cache) + .register("TaskManager", 
initialize_task_manager) + .register("JobManager", initialize_job_manager); + #[cfg(feature = "python")] + let b = b.register("PythonService", initialize_python_service); + b + }; - #[inline] - pub fn is_empty(&self) -> bool { - self.components.is_empty() - } + builder + .register( + "StreamCatalog", + crate::storage::stream_catalog::initialize_stream_catalog, + ) + .register("Coordinator", initialize_coordinator) + .build() +} + +pub fn bootstrap_system(config: &GlobalConfig) -> Result<()> { + let registry = build_core_registry(); + + registry.initialize_all(config)?; + + crate::storage::stream_catalog::restore_global_catalog_from_store(); + crate::storage::stream_catalog::restore_streaming_jobs_from_store(); + + info!("System bootstrap finished. Node is ready to accept traffic."); + Ok(()) } fn initialize_wasm_cache(config: &GlobalConfig) -> Result<()> { @@ -114,18 +128,20 @@ fn initialize_wasm_cache(config: &GlobalConfig) -> Result<()> { max_size: config.wasm.max_cache_size, }, ); - log::info!( - "WASM cache configuration: enabled={}, dir={}, max_size={} bytes", - config.wasm.enable_cache, - config.wasm.cache_dir, - config.wasm.max_cache_size + + debug!( + enabled = config.wasm.enable_cache, + dir = %config.wasm.cache_dir, + max_size = config.wasm.max_cache_size, + "WASM cache configured" ); + Ok(()) } fn initialize_task_manager(config: &GlobalConfig) -> Result<()> { crate::runtime::taskexecutor::TaskManager::init(config) - .context("TaskManager initialization failed")?; + .context("TaskManager service failed to start")?; Ok(()) } @@ -136,24 +152,31 @@ fn initialize_python_service(config: &GlobalConfig) -> Result<()> { Ok(()) } -fn initialize_coordinator(_config: &GlobalConfig) -> Result<()> { - crate::runtime::taskexecutor::TaskManager::get() - .context("Coordinator requires TaskManager to be initialized first")?; - log::info!("Coordinator verified and ready"); +fn initialize_job_manager(config: &GlobalConfig) -> Result<()> { + use 
crate::runtime::streaming::factory::Registry; + use crate::runtime::streaming::factory::OperatorFactory; + use crate::runtime::streaming::job::JobManager; + use std::sync::Arc; + + let registry = Arc::new(Registry::new()); + let factory = Arc::new(OperatorFactory::new(registry)); + let max_memory_bytes = config.streaming.max_memory_bytes.unwrap_or(256 * 1024 * 1024); + + JobManager::init(factory, max_memory_bytes) + .context("JobManager service failed to start")?; + Ok(()) } -pub fn register_components() -> ComponentRegistry { - let builder = { - let b = ComponentRegistryBuilder::new() - .register("WasmCache", initialize_wasm_cache) - .register("TaskManager", initialize_task_manager); - #[cfg(feature = "python")] - let b = b.register("PythonService", initialize_python_service); - b - }; +fn initialize_coordinator(_config: &GlobalConfig) -> Result<()> { + crate::runtime::taskexecutor::TaskManager::get() + .context("Dependency violation: Coordinator requires TaskManager")?; - builder - .register("Coordinator", initialize_coordinator) - .build() + crate::storage::stream_catalog::CatalogManager::global() + .context("Dependency violation: Coordinator requires StreamCatalog")?; + + crate::runtime::streaming::job::JobManager::global() + .context("Dependency violation: Coordinator requires JobManager")?; + + Ok(()) } diff --git a/src/server/mod.rs b/src/server/mod.rs index 03254af3..cb7a4a85 100644 --- a/src/server/mod.rs +++ b/src/server/mod.rs @@ -17,5 +17,5 @@ mod initializer; mod service; pub use handler::FunctionStreamServiceImpl; -pub use initializer::register_components; +pub use initializer::bootstrap_system; pub use service::start_server_with_shutdown; diff --git a/src/sql/analysis/aggregate_rewriter.rs b/src/sql/analysis/aggregate_rewriter.rs new file mode 100644 index 00000000..36024ab0 --- /dev/null +++ b/src/sql/analysis/aggregate_rewriter.rs @@ -0,0 +1,274 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file 
except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use datafusion::common::tree_node::{Transformed, TreeNodeRewriter}; +use datafusion::common::{DFSchema, DataFusionError, Result, not_impl_err, plan_err}; +use datafusion::functions_aggregate::expr_fn::max; +use datafusion::logical_expr::{Aggregate, Expr, Extension, LogicalPlan, Projection}; +use datafusion::prelude::col; +use std::sync::Arc; + +use crate::sql::schema::StreamSchemaProvider; +use crate::sql::extensions::aggregate::StreamWindowAggregateNode; +use crate::sql::extensions::key_calculation::{KeyExtractionNode, KeyExtractionStrategy}; +use crate::sql::analysis::streaming_window_analzer::StreamingWindowAnalzer; +use crate::sql::types::{ + DFField, TIMESTAMP_FIELD, WindowBehavior, WindowType, fields_with_qualifiers, find_window, + schema_from_df_fields_with_metadata, +}; + +/// AggregateRewriter transforms batch DataFusion aggregates into streaming stateful operators. +/// It handles windowing (Tumble/Hop/Session), watermarks, and continuous updating aggregates. +pub(crate) struct AggregateRewriter<'a> { + pub schema_provider: &'a StreamSchemaProvider, +} + +impl TreeNodeRewriter for AggregateRewriter<'_> { + type Node = LogicalPlan; + + fn f_up(&mut self, node: Self::Node) -> Result> { + let LogicalPlan::Aggregate(mut agg) = node else { + return Ok(Transformed::no(node)); + }; + + // 1. Identify windowing functions (e.g., tumble, hop) in GROUP BY. 
+ let mut window_exprs: Vec<_> = agg + .group_expr + .iter() + .enumerate() + .filter_map(|(i, e)| find_window(e).map(|opt| opt.map(|w| (i, w))).transpose()) + .collect::>>()?; + + if window_exprs.len() > 1 { + return not_impl_err!("Streaming aggregates support at most one window expression"); + } + + // 2. Prepare internal metadata for Key-based distribution. + let mut key_fields: Vec = fields_with_qualifiers(&agg.schema) + .iter() + .take(agg.group_expr.len()) + .map(|f| { + DFField::new( + f.qualifier().cloned(), + format!("_key_{}", f.name()), + f.data_type().clone(), + f.is_nullable(), + ) + }) + .collect(); + + // 3. Dispatch to Updating Aggregate if no windowing is detected. + let input_window = StreamingWindowAnalzer::get_window(&agg.input)?; + if window_exprs.is_empty() && input_window.is_none() { + return self.rewrite_as_updating_aggregate( + agg.input, + key_fields, + agg.group_expr, + agg.aggr_expr, + agg.schema, + ); + } + + // 4. Resolve Windowing Strategy (InData vs FromOperator). + let behavior = self.resolve_window_context( + &agg.input, + &mut agg.group_expr, + &agg.schema, + &mut window_exprs, + )?; + + // Adjust keys if windowing is handled by the operator. + if let WindowBehavior::FromOperator { window_index, .. } = &behavior { + key_fields.remove(*window_index); + } + + let key_count = key_fields.len(); + let keyed_input = + self.build_keyed_input(agg.input.clone(), &agg.group_expr, &key_fields)?; + + // 5. Build the final StreamWindowAggregateNode for the physical planner. + let mut internal_fields = fields_with_qualifiers(&agg.schema); + if let WindowBehavior::FromOperator { window_index, .. 
} = &behavior { + internal_fields.remove(*window_index); + } + let internal_schema = Arc::new(schema_from_df_fields_with_metadata( + &internal_fields, + agg.schema.metadata().clone(), + )?); + + let rewritten_agg = Aggregate::try_new_with_schema( + Arc::new(keyed_input), + agg.group_expr, + agg.aggr_expr, + internal_schema, + )?; + + let extension = StreamWindowAggregateNode::try_new( + behavior, + LogicalPlan::Aggregate(rewritten_agg), + (0..key_count).collect(), + )?; + + Ok(Transformed::yes(LogicalPlan::Extension(Extension { + node: Arc::new(extension), + }))) + } +} + +impl<'a> AggregateRewriter<'a> { + pub fn new(schema_provider: &'a StreamSchemaProvider) -> Self { + Self { schema_provider } + } + + /// [Internal] Builds the physical Key Calculation layer required for distributed Shuffling. + /// This wraps the input in a Projection and a KeyExtractionNode. + fn build_keyed_input( + &self, + input: Arc, + group_expr: &[Expr], + key_fields: &[DFField], + ) -> Result { + let key_count = group_expr.len(); + let mut projection_fields = key_fields.to_vec(); + projection_fields.extend(fields_with_qualifiers(input.schema())); + + let key_schema = Arc::new(schema_from_df_fields_with_metadata( + &projection_fields, + input.schema().metadata().clone(), + )?); + + // Map group expressions to '_key_' aliases while passing through all original columns. 
+ let mut exprs: Vec<_> = group_expr + .iter() + .zip(key_fields.iter()) + .map(|(expr, f)| expr.clone().alias(f.name().to_string())) + .collect(); + + exprs.extend( + fields_with_qualifiers(input.schema()) + .iter() + .map(|f| Expr::Column(f.qualified_column())), + ); + + let projection = + LogicalPlan::Projection(Projection::try_new_with_schema(exprs, input, key_schema)?); + + Ok(LogicalPlan::Extension(Extension { + node: Arc::new(KeyExtractionNode::new( + projection, + KeyExtractionStrategy::ColumnIndices((0..key_count).collect()), + )), + })) + } + + /// [Strategy] Rewrites standard GROUP BY into a non-windowed updating aggregate. + /// Injected max(_timestamp) ensures the streaming pulse (Watermark) continues to propagate. + fn rewrite_as_updating_aggregate( + &self, + input: Arc, + key_fields: Vec, + group_expr: Vec, + mut aggr_expr: Vec, + schema: Arc, + ) -> Result> { + let keyed_input = self.build_keyed_input(input, &group_expr, &key_fields)?; + + // Ensure the updating stream maintains time awareness. + let timestamp_col = keyed_input + .schema() + .qualified_field_with_unqualified_name(TIMESTAMP_FIELD) + .map_err(|_| { + DataFusionError::Plan( + "Required _timestamp field missing for updating aggregate".to_string(), + ) + })?; + + let timestamp_field: DFField = timestamp_col.into(); + aggr_expr.push(max(col(timestamp_field.qualified_column())).alias(TIMESTAMP_FIELD)); + + let mut output_fields = fields_with_qualifiers(&schema); + output_fields.push(timestamp_field); + + let output_schema = Arc::new(schema_from_df_fields_with_metadata( + &output_fields, + schema.metadata().clone(), + )?); + + let aggregate = Aggregate::try_new_with_schema( + Arc::new(keyed_input), + group_expr, + aggr_expr, + output_schema, + )?; + + Ok(Transformed::yes(LogicalPlan::Aggregate(aggregate))) + } + + /// [Strategy] Reconciles window definitions between the input stream and the current GROUP BY. 
+ fn resolve_window_context( + &self, + input: &LogicalPlan, + group_expr: &mut Vec, + schema: &DFSchema, + window_expr_info: &mut Vec<(usize, WindowType)>, + ) -> Result { + let mut visitor = StreamingWindowAnalzer::default(); + input.visit_with_subqueries(&mut visitor)?; + + let input_window = visitor.window; + let has_group_window = !window_expr_info.is_empty(); + + match (input_window, has_group_window) { + // Re-aggregation or subquery with an existing window. + (Some(i_win), true) => { + let (idx, g_win) = window_expr_info.pop().unwrap(); + if i_win != g_win { + return plan_err!( + "Inconsistent windowing: input is {:?}, but group by is {:?}", + i_win, + g_win + ); + } + + if let Some(field) = visitor.fields.iter().next() { + group_expr[idx] = Expr::Column(field.qualified_column()); + Ok(WindowBehavior::InData) + } else { + if matches!(i_win, WindowType::Session { .. }) { + return plan_err!("Nested session windows are not supported"); + } + group_expr.remove(idx); + Ok(WindowBehavior::FromOperator { + window: i_win, + window_field: schema.qualified_field(idx).into(), + window_index: idx, + is_nested: true, + }) + } + } + // First-time windowing defined in this aggregate. + (None, true) => { + let (idx, g_win) = window_expr_info.pop().unwrap(); + group_expr.remove(idx); + Ok(WindowBehavior::FromOperator { + window: g_win, + window_field: schema.qualified_field(idx).into(), + window_index: idx, + is_nested: false, + }) + } + // Passthrough: input is already windowed, no new window in group by. 
+ (Some(_), false) => Ok(WindowBehavior::InData), + _ => unreachable!("Dispatched to non-windowed path previously"), + } + } +} diff --git a/src/sql/analysis/async_udf_rewriter.rs b/src/sql/analysis/async_udf_rewriter.rs new file mode 100644 index 00000000..073a1f42 --- /dev/null +++ b/src/sql/analysis/async_udf_rewriter.rs @@ -0,0 +1,133 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use crate::sql::extensions::remote_table::RemoteTableBoundaryNode; +use crate::sql::common::constants::sql_field; +use crate::sql::extensions::AsyncFunctionExecutionNode; +use crate::sql::schema::StreamSchemaProvider; +use datafusion::common::tree_node::{Transformed, TreeNode, TreeNodeRewriter}; +use datafusion::common::{Column, Result as DFResult, TableReference, plan_err}; +use datafusion::logical_expr::expr::ScalarFunction; +use datafusion::logical_expr::{Expr, Extension, LogicalPlan}; +use std::sync::Arc; +use std::time::Duration; + +type AsyncSplitResult = (String, AsyncOptions, Vec); + +#[derive(Debug, Clone, Copy)] +pub struct AsyncOptions { + pub ordered: bool, + pub max_concurrency: usize, + pub timeout: Duration, +} + +pub struct AsyncUdfRewriter<'a> { + provider: &'a StreamSchemaProvider, +} + +impl<'a> AsyncUdfRewriter<'a> { + pub fn new(provider: &'a StreamSchemaProvider) -> Self { + Self { provider } + } + + fn split_async( + expr: Expr, + provider: &StreamSchemaProvider, + ) -> DFResult<(Expr, Option)> { + let mut found: Option<(String, AsyncOptions, Vec)> = 
None; + let expr = expr.transform_up(|e| { + if let Expr::ScalarFunction(ScalarFunction { func: udf, args }) = &e { + if let Some(opts) = provider.get_async_udf_options(udf.name()) { + if found + .replace((udf.name().to_string(), opts, args.clone())) + .is_some() + { + return plan_err!( + "multiple async calls in the same expression, which is not allowed" + ); + } + return Ok(Transformed::yes(Expr::Column(Column::new_unqualified( + sql_field::ASYNC_RESULT, + )))); + } + } + Ok(Transformed::no(e)) + })?; + + Ok((expr.data, found)) + } +} + +impl TreeNodeRewriter for AsyncUdfRewriter<'_> { + type Node = LogicalPlan; + + fn f_up(&mut self, node: Self::Node) -> DFResult> { + let LogicalPlan::Projection(mut projection) = node else { + for e in node.expressions() { + if let (_, Some((udf, _, _))) = Self::split_async(e.clone(), self.provider)? { + return plan_err!( + "async UDFs are only supported in projections, but {udf} was called in another context" + ); + } + } + return Ok(Transformed::no(node)); + }; + + let mut args = None; + for e in projection.expr.iter_mut() { + let (new_e, Some(udf)) = Self::split_async(e.clone(), self.provider)? 
else { + continue; + }; + if let Some((prev, _, _)) = args.replace(udf) { + return plan_err!( + "Projection contains multiple async UDFs, which is not supported \ + \n(hint: two async UDF calls, {} and {}, appear in the same SELECT statement)", + prev, + args.unwrap().0 + ); + } + *e = new_e; + } + + let Some((name, opts, arg_exprs)) = args else { + return Ok(Transformed::no(LogicalPlan::Projection(projection))); + }; + let udf = self.provider.dylib_udfs.get(&name).unwrap().clone(); + + let input = if matches!(*projection.input, LogicalPlan::Projection(..)) { + Arc::new(LogicalPlan::Extension(Extension { + node: Arc::new(RemoteTableBoundaryNode { + upstream_plan: (*projection.input).clone(), + table_identifier: TableReference::bare("subquery_projection"), + resolved_schema: projection.input.schema().clone(), + requires_materialization: false, + }), + })) + } else { + projection.input + }; + + Ok(Transformed::yes(LogicalPlan::Extension(Extension { + node: Arc::new(AsyncFunctionExecutionNode { + upstream_plan: input, + operator_name: name, + function_config: udf, + invocation_args: arg_exprs, + result_projections: projection.expr, + preserve_ordering: opts.ordered, + concurrency_limit: opts.max_concurrency, + execution_timeout: opts.timeout, + resolved_schema: projection.schema, + }), + }))) + } +} diff --git a/src/sql/analysis/join_rewriter.rs b/src/sql/analysis/join_rewriter.rs new file mode 100644 index 00000000..058a5bd8 --- /dev/null +++ b/src/sql/analysis/join_rewriter.rs @@ -0,0 +1,237 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +use crate::sql::schema::StreamSchemaProvider; +use crate::sql::extensions::join::StreamingJoinNode; +use crate::sql::extensions::key_calculation::KeyExtractionNode; +use crate::sql::analysis::streaming_window_analzer::StreamingWindowAnalzer; +use crate::sql::types::{WindowType, fields_with_qualifiers, schema_from_df_fields_with_metadata}; +use crate::sql::common::constants::mem_exec_join_side; +use crate::sql::common::TIMESTAMP_FIELD; +use datafusion::common::tree_node::{Transformed, TreeNodeRewriter}; +use datafusion::common::{ + JoinConstraint, JoinType, Result, ScalarValue, TableReference, + not_impl_err, plan_err, +}; +use datafusion::logical_expr::{ + self, BinaryExpr, Case, Expr, Extension, Join, LogicalPlan, Projection, build_join_schema, +}; +use datafusion::prelude::coalesce; +use std::sync::Arc; + +/// JoinRewriter handles the transformation of standard SQL joins into streaming-capable joins. +/// It manages stateful "Updating Joins" and time-aligned "Instant Joins". +pub(crate) struct JoinRewriter<'a> { + pub schema_provider: &'a StreamSchemaProvider, +} + +impl<'a> JoinRewriter<'a> { + pub fn new(schema_provider: &'a StreamSchemaProvider) -> Self { + Self { schema_provider } + } + + /// [Validation] Ensures left and right streams have compatible windowing strategies. + fn validate_join_windows(&self, join: &Join) -> Result { + let left_win = StreamingWindowAnalzer::get_window(&join.left)?; + let right_win = StreamingWindowAnalzer::get_window(&join.right)?; + + match (left_win, right_win) { + (None, None) => { + if join.join_type == JoinType::Inner { + Ok(false) // Standard Updating Join (Inner) + } else { + plan_err!( + "Non-inner joins (e.g., LEFT/RIGHT) require windowing to bound state." + ) + } + } + (Some(l), Some(r)) => { + if l != r { + return plan_err!( + "Join window mismatch: left={:?}, right={:?}. 
Windows must match exactly.", + l, + r + ); + } + if let WindowType::Session { .. } = l { + return plan_err!( + "Session windows are currently not supported in streaming joins." + ); + } + Ok(true) // Instant Windowed Join + } + _ => plan_err!( + "Mixed windowing detected. Both sides of a join must be either windowed or non-windowed." + ), + } + } + + /// [Internal] Wraps a join input in a key-extraction layer to facilitate shuffle / key-by distribution. + fn build_keyed_side( + &self, + input: Arc, + keys: Vec, + side: &str, + ) -> Result { + let key_count = keys.len(); + + let projection_exprs = keys + .into_iter() + .enumerate() + .map(|(i, e)| { + e.alias_qualified(Some(TableReference::bare("_stream")), format!("_key_{i}")) + }) + .chain( + fields_with_qualifiers(input.schema()) + .iter() + .map(|f| Expr::Column(f.qualified_column())), + ) + .collect(); + + let projection = Projection::try_new(projection_exprs, input)?; + let key_ext = KeyExtractionNode::try_new_with_projection( + LogicalPlan::Projection(projection), + (0..key_count).collect(), + side.to_string(), + )?; + + Ok(LogicalPlan::Extension(Extension { + node: Arc::new(key_ext), + })) + } + + /// [Strategy] Resolves the output timestamp of the join. + /// Streaming joins must output the 'max' of the two input timestamps to ensure Watermark progression. + fn apply_timestamp_resolution(&self, join_plan: LogicalPlan) -> Result { + let schema = join_plan.schema(); + let all_fields = fields_with_qualifiers(schema); + + let timestamp_fields: Vec<_> = all_fields + .iter() + .filter(|f| f.name() == "_timestamp") + .cloned() + .collect(); + + if timestamp_fields.len() != 2 { + return plan_err!( + "Streaming join requires exactly two input timestamp fields to resolve output time." 
+ ); + } + + // Project all fields except the two raw timestamps + let mut exprs: Vec<_> = all_fields + .iter() + .filter(|f| f.name() != "_timestamp") + .map(|f| Expr::Column(f.qualified_column())) + .collect(); + + // Calculate: GREATEST(left._timestamp, right._timestamp) + let left_ts = Expr::Column(timestamp_fields[0].qualified_column()); + let right_ts = Expr::Column(timestamp_fields[1].qualified_column()); + + let max_ts_expr = Expr::Case(Case { + expr: Some(Box::new(Expr::BinaryExpr(BinaryExpr { + left: Box::new(left_ts.clone()), + op: logical_expr::Operator::GtEq, + right: Box::new(right_ts.clone()), + }))), + when_then_expr: vec![ + ( + Box::new(Expr::Literal(ScalarValue::Boolean(Some(true)), None)), + Box::new(left_ts.clone()), + ), + ( + Box::new(Expr::Literal(ScalarValue::Boolean(Some(false)), None)), + Box::new(right_ts.clone()), + ), + ], + else_expr: Some(Box::new(coalesce(vec![left_ts, right_ts]))), + }) + .alias(TIMESTAMP_FIELD); + + exprs.push(max_ts_expr); + + let out_fields: Vec<_> = all_fields + .iter() + .filter(|f| f.name() != "_timestamp") + .cloned() + .chain(std::iter::once(timestamp_fields[0].clone())) + .collect(); + + let out_schema = Arc::new(schema_from_df_fields_with_metadata( + &out_fields, + schema.metadata().clone(), + )?); + + Ok(LogicalPlan::Projection(Projection::try_new_with_schema( + exprs, + Arc::new(join_plan), + out_schema, + )?)) + } +} + +impl TreeNodeRewriter for JoinRewriter<'_> { + type Node = LogicalPlan; + + fn f_up(&mut self, node: Self::Node) -> Result> { + let LogicalPlan::Join(join) = node else { + return Ok(Transformed::no(node)); + }; + + // 1. Validate Streaming Context + let is_instant = self.validate_join_windows(&join)?; + if join.join_constraint != JoinConstraint::On { + return not_impl_err!("Only 'ON' join constraints are supported in streaming SQL."); + } + if join.on.is_empty() && !is_instant { + return plan_err!("Updating joins require at least one equality condition (Equijoin)."); + } + + // 2. 
Prepare Keyed Inputs for Shuffle + let (left_on, right_on): (Vec<_>, Vec<_>) = join.on.clone().into_iter().unzip(); + let keyed_left = self.build_keyed_side(join.left, left_on, mem_exec_join_side::LEFT)?; + let keyed_right = self.build_keyed_side(join.right, right_on, mem_exec_join_side::RIGHT)?; + + // 3. Assemble Rewritten Join Node + let join_schema = Arc::new(build_join_schema( + keyed_left.schema(), + keyed_right.schema(), + &join.join_type, + )?); + let rewritten_join = LogicalPlan::Join(Join { + left: Arc::new(keyed_left), + right: Arc::new(keyed_right), + on: join.on, + filter: join.filter, + join_type: join.join_type, + join_constraint: JoinConstraint::On, + schema: join_schema, + null_equals_null: false, + }); + + // 4. Resolve Output Watermark (Timestamp Projection) + let plan_with_timestamp = self.apply_timestamp_resolution(rewritten_join)?; + + // 5. Wrap in StreamingJoinNode for physical planning + let state_retention_ttl = (!is_instant).then_some(self.schema_provider.planning_options.ttl); + let extension = StreamingJoinNode::new( + plan_with_timestamp, + is_instant, + state_retention_ttl, + ); + + Ok(Transformed::yes(LogicalPlan::Extension(Extension { + node: Arc::new(extension), + }))) + } +} diff --git a/src/sql/analysis/mod.rs b/src/sql/analysis/mod.rs new file mode 100644 index 00000000..cd26a4e6 --- /dev/null +++ b/src/sql/analysis/mod.rs @@ -0,0 +1,217 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#![allow(clippy::new_without_default)] + +pub(crate) mod aggregate_rewriter; +pub(crate) mod join_rewriter; +pub(crate) mod row_time_rewriter; +pub(crate) mod stream_rewriter; +pub(crate) mod streaming_window_analzer; +pub(crate) mod window_function_rewriter; + +pub mod async_udf_rewriter; +pub mod sink_input_rewriter; +pub mod source_metadata_visitor; +pub mod source_rewriter; +pub mod time_window; +pub mod unnest_rewriter; + +pub use async_udf_rewriter::AsyncOptions; +pub use sink_input_rewriter::SinkInputRewriter; +pub use time_window::{TimeWindowNullCheckRemover, TimeWindowUdfChecker}; +pub use unnest_rewriter::UNNESTED_COL; + +pub use crate::sql::schema::schema_provider::StreamSchemaProvider; + +use std::collections::HashMap; +use std::sync::Arc; + +use datafusion::common::tree_node::{Transformed, TreeNode}; +use datafusion::common::{Result, plan_err}; +use datafusion::error::DataFusionError; +use datafusion::logical_expr::{Extension, LogicalPlan, UserDefinedLogicalNodeCore}; +use tracing::{debug, info, instrument}; + +use crate::sql::extensions::key_calculation::{KeyExtractionNode, KeyExtractionStrategy}; +use crate::sql::extensions::projection::StreamProjectionNode; +use crate::sql::extensions::sink::StreamEgressNode; +use crate::sql::extensions::StreamingOperatorBlueprint; +use crate::sql::logical_planner::planner::NamedNode; + +fn duration_from_sql_expr( + expr: &datafusion::sql::sqlparser::ast::Expr, +) -> Result { + use datafusion::sql::sqlparser::ast::Expr as SqlExpr; + use datafusion::sql::sqlparser::ast::Value as SqlValue; + use datafusion::sql::sqlparser::ast::ValueWithSpan; + + match expr { + SqlExpr::Interval(interval) => { + let value_str = match interval.value.as_ref() { + SqlExpr::Value(ValueWithSpan { + value: SqlValue::SingleQuotedString(s), + .. 
+ }) => s.clone(), + other => return plan_err!("expected interval string literal, found {other}"), + }; + + parse_interval_to_duration(&value_str) + } + SqlExpr::Value(ValueWithSpan { + value: SqlValue::SingleQuotedString(s), + .. + }) => parse_interval_to_duration(s), + other => plan_err!("expected an interval expression, found {other}"), + } +} + +fn parse_interval_to_duration(s: &str) -> Result { + let parts: Vec<&str> = s.trim().split_whitespace().collect(); + if parts.len() != 2 { + return plan_err!("invalid interval string '{s}'; expected ' '"); + } + let value: u64 = parts[0] + .parse() + .map_err(|_| DataFusionError::Plan(format!("invalid interval number: {}", parts[0])))?; + match parts[1].to_lowercase().as_str() { + "second" | "seconds" | "s" => Ok(std::time::Duration::from_secs(value)), + "minute" | "minutes" | "min" => Ok(std::time::Duration::from_secs(value * 60)), + "hour" | "hours" | "h" => Ok(std::time::Duration::from_secs(value * 3600)), + "day" | "days" | "d" => Ok(std::time::Duration::from_secs(value * 86400)), + unit => plan_err!("unsupported interval unit '{unit}'"), + } +} + +fn build_sink_inputs(extensions: &[LogicalPlan]) -> HashMap> { + let mut sink_inputs = HashMap::>::new(); + for extension in extensions.iter() { + if let LogicalPlan::Extension(ext) = extension { + if let Some(sink_node) = ext.node.as_any().downcast_ref::() { + if let Some(named_node) = sink_node.operator_identity() { + let inputs = sink_node + .inputs() + .into_iter() + .cloned() + .collect::>(); + sink_inputs.entry(named_node).or_default().extend(inputs); + } + } + } + } + sink_inputs +} + +pub(crate) fn maybe_add_key_extension_to_sink(plan: LogicalPlan) -> Result { + let LogicalPlan::Extension(ref ext) = plan else { + return Ok(plan); + }; + + let Some(sink) = ext.node.as_any().downcast_ref::() else { + return Ok(plan); + }; + + let Some(partition_exprs) = sink.destination_table.partition_exprs() else { + return Ok(plan); + }; + + if partition_exprs.is_empty() { + 
return Ok(plan); + } + + let inputs = plan + .inputs() + .into_iter() + .map(|input| { + Ok(LogicalPlan::Extension(Extension { + node: Arc::new(KeyExtractionNode { + operator_label: Some("key-calc-partition".to_string()), + resolved_schema: input.schema().clone(), + upstream_plan: input.clone(), + extraction_strategy: KeyExtractionStrategy::CalculatedExpressions( + partition_exprs.clone(), + ), + }), + })) + }) + .collect::>()?; + + use datafusion::prelude::col; + let unkey = LogicalPlan::Extension(Extension { + node: Arc::new( + StreamProjectionNode::try_new( + inputs, + Some("unkey".to_string()), + sink.schema().iter().map(|(_, f)| col(f.name())).collect(), + )? + .with_shuffle_routing(), + ), + }); + + let node = sink.with_exprs_and_inputs(vec![], vec![unkey])?; + Ok(LogicalPlan::Extension(Extension { + node: Arc::new(node), + })) +} + +pub fn rewrite_sinks(extensions: Vec) -> Result> { + let mut sink_inputs = build_sink_inputs(&extensions); + let mut new_extensions = vec![]; + for extension in extensions { + let mut rewriter = SinkInputRewriter::new(&mut sink_inputs); + let result = extension.rewrite(&mut rewriter)?; + if !rewriter.was_removed { + new_extensions.push(result.data); + } + } + + new_extensions + .into_iter() + .map(maybe_add_key_extension_to_sink) + .collect() + +} + +/// Entry point for transforming a standard DataFusion LogicalPlan into a +/// Streaming-aware LogicalPlan. +/// +/// This function coordinates multiple rewriting passes and ensures the +/// resulting plan satisfies streaming constraints. +#[instrument(skip_all, level = "debug")] +pub fn rewrite_plan( + plan: LogicalPlan, + schema_provider: &StreamSchemaProvider, +) -> Result { + info!("Starting streaming plan rewrite pipeline"); + + let Transformed { + data: plan, .. 
+ } = plan.rewrite_with_subqueries(&mut source_rewriter::SourceRewriter::new(schema_provider))?; + + let mut rewriter = stream_rewriter::StreamRewriter::new(schema_provider); + let Transformed { + data: rewritten_plan, + .. + } = plan.rewrite_with_subqueries(&mut rewriter)?; + + rewritten_plan.visit_with_subqueries(&mut TimeWindowUdfChecker {})?; + + if cfg!(debug_assertions) { + debug!( + "Streaming logical plan graphviz:\n{}", + rewritten_plan.display_graphviz() + ); + } + + info!("Streaming plan rewrite completed successfully"); + Ok(rewritten_plan) +} diff --git a/src/sql/analysis/row_time_rewriter.rs b/src/sql/analysis/row_time_rewriter.rs new file mode 100644 index 00000000..13e2a048 --- /dev/null +++ b/src/sql/analysis/row_time_rewriter.rs @@ -0,0 +1,49 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use datafusion::common::tree_node::{Transformed, TreeNodeRewriter}; +use datafusion::common::{Column, Result as DFResult}; +use datafusion::logical_expr::Expr; + +use crate::sql::common::constants::planning_placeholder_udf; +use crate::sql::types::TIMESTAMP_FIELD; + +/// Replaces the virtual `row_time()` scalar function with a physical reference to `_timestamp`. +/// +/// This is a critical mapping step that allows users to use a friendly SQL function +/// while the engine operates on the mandatory internal streaming timestamp. 
+pub struct RowTimeRewriter; + +impl TreeNodeRewriter for RowTimeRewriter { + type Node = Expr; + + fn f_down(&mut self, node: Self::Node) -> DFResult> { + // Use pattern matching to identify the `row_time` scalar function. + if let Expr::ScalarFunction(func) = &node + && func.name() == planning_placeholder_udf::ROW_TIME + { + // Map the virtual function to the physical internal timestamp column. + // We use .alias() to preserve the original name "row_time()" in the output schema, + // ensuring that user-facing column names do not change unexpectedly. + let physical_col = Expr::Column(Column { + relation: None, + name: TIMESTAMP_FIELD.to_string(), + spans: Default::default(), + }) + .alias("row_time()"); + + return Ok(Transformed::yes(physical_col)); + } + + Ok(Transformed::no(node)) + } +} diff --git a/src/sql/analysis/sink_input_rewriter.rs b/src/sql/analysis/sink_input_rewriter.rs new file mode 100644 index 00000000..ad36046f --- /dev/null +++ b/src/sql/analysis/sink_input_rewriter.rs @@ -0,0 +1,59 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use crate::sql::extensions::sink::StreamEgressNode; +use crate::sql::extensions::StreamingOperatorBlueprint; +use datafusion::common::Result as DFResult; +use datafusion::common::tree_node::{Transformed, TreeNodeRecursion, TreeNodeRewriter}; +use datafusion::logical_expr::{Extension, LogicalPlan, UserDefinedLogicalNodeCore}; +use std::collections::HashMap; +use std::sync::Arc; +use crate::sql::logical_planner::planner::NamedNode; + +type SinkInputs = HashMap>; + +/// Merges inputs for sinks with the same name to avoid duplicate sinks in the plan. +pub struct SinkInputRewriter<'a> { + sink_inputs: &'a mut SinkInputs, + pub was_removed: bool, +} + +impl<'a> SinkInputRewriter<'a> { + pub(crate) fn new(sink_inputs: &'a mut SinkInputs) -> Self { + Self { + sink_inputs, + was_removed: false, + } + } +} + +impl TreeNodeRewriter for SinkInputRewriter<'_> { + type Node = LogicalPlan; + + fn f_down(&mut self, node: Self::Node) -> DFResult> { + if let LogicalPlan::Extension(extension) = &node { + if let Some(sink_node) = extension.node.as_any().downcast_ref::() { + if let Some(named_node) = sink_node.operator_identity() { + if let Some(inputs) = self.sink_inputs.remove(&named_node) { + let new_node = LogicalPlan::Extension(Extension { + node: Arc::new(sink_node.with_exprs_and_inputs(vec![], inputs)?), + }); + return Ok(Transformed::new(new_node, true, TreeNodeRecursion::Jump)); + } else { + self.was_removed = true; + } + } + } + } + Ok(Transformed::no(node)) + } +} diff --git a/src/sql/analysis/source_metadata_visitor.rs b/src/sql/analysis/source_metadata_visitor.rs new file mode 100644 index 00000000..81b9b179 --- /dev/null +++ b/src/sql/analysis/source_metadata_visitor.rs @@ -0,0 +1,69 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use crate::sql::extensions::sink::{StreamEgressNode, STREAM_EGRESS_NODE_NAME}; +use crate::sql::extensions::table_source::{StreamIngestionNode, STREAM_INGESTION_NODE_NAME}; +use crate::sql::schema::StreamSchemaProvider; +use datafusion::common::Result as DFResult; +use datafusion::common::tree_node::{TreeNodeRecursion, TreeNodeVisitor}; +use datafusion::logical_expr::{Extension, LogicalPlan}; +use std::collections::HashSet; + +/// Collects connection IDs from source and sink nodes in the logical plan. +pub struct SourceMetadataVisitor<'a> { + schema_provider: &'a StreamSchemaProvider, + pub connection_ids: HashSet, +} + +impl<'a> SourceMetadataVisitor<'a> { + pub fn new(schema_provider: &'a StreamSchemaProvider) -> Self { + Self { + schema_provider, + connection_ids: HashSet::new(), + } + } + + fn get_connection_id(&self, node: &LogicalPlan) -> Option { + let LogicalPlan::Extension(Extension { node }) = node else { + return None; + }; + + let table_name = match node.name() { + name if name == STREAM_INGESTION_NODE_NAME => { + let ext = node.as_any().downcast_ref::()?; + ext.source_identifier.to_string() + } + name if name == STREAM_EGRESS_NODE_NAME => { + let ext = node.as_any().downcast_ref::()?; + ext.target_identifier.to_string() + } + _ => return None, + }; + + let table = self.schema_provider.get_catalog_table(&table_name)?; + match table { + crate::sql::schema::table::Table::ConnectorTable(t) => t.registry_id, + _ => None, + } + } +} + +impl TreeNodeVisitor<'_> for SourceMetadataVisitor<'_> { + type Node = LogicalPlan; + + fn f_down(&mut 
self, node: &Self::Node) -> DFResult { + if let Some(id) = self.get_connection_id(node) { + self.connection_ids.insert(id); + } + Ok(TreeNodeRecursion::Continue) + } +} diff --git a/src/sql/analysis/source_rewriter.rs b/src/sql/analysis/source_rewriter.rs new file mode 100644 index 00000000..0ade3ea1 --- /dev/null +++ b/src/sql/analysis/source_rewriter.rs @@ -0,0 +1,299 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::Arc; +use std::time::Duration; + +use datafusion::common::ScalarValue; +use datafusion::common::tree_node::{Transformed, TreeNodeRewriter}; +use datafusion::common::{Column, DataFusionError, Result as DFResult, TableReference, plan_err}; +use datafusion::logical_expr::{ + self, BinaryExpr, Expr, Extension, LogicalPlan, Projection, TableScan, +}; + +use crate::sql::schema::source_table::SourceTable; +use crate::sql::schema::ColumnDescriptor; +use crate::sql::schema::table::Table; +use crate::sql::schema::StreamSchemaProvider; +use crate::sql::common::UPDATING_META_FIELD; +use crate::sql::extensions::debezium::UnrollDebeziumPayloadNode; +use crate::sql::extensions::remote_table::RemoteTableBoundaryNode; +use crate::sql::extensions::table_source::StreamIngestionNode; +use crate::sql::extensions::watermark_node::EventTimeWatermarkNode; +use crate::sql::types::TIMESTAMP_FIELD; + +/// Rewrites table scans: projections are lifted out of scans into a dedicated projection node +/// (including virtual fields), using a connector 
table-source extension instead of a bare +/// `TableScan`, optionally with Debezium unrolling for updating sources, then remote boundary and +/// watermark. +pub struct SourceRewriter<'a> { + pub(crate) schema_provider: &'a StreamSchemaProvider, +} + +impl<'a> SourceRewriter<'a> { + pub fn new(schema_provider: &'a StreamSchemaProvider) -> Self { + Self { schema_provider } + } +} + +impl SourceRewriter<'_> { + fn projection_expr_for_column(col: &ColumnDescriptor, qualifier: &TableReference) -> Expr { + if let Some(logic) = col.computation_logic() { + logic + .clone() + .alias_qualified(Some(qualifier.clone()), col.arrow_field().name().to_string()) + } else { + Expr::Column(Column { + relation: Some(qualifier.clone()), + name: col.arrow_field().name().to_string(), + spans: Default::default(), + }) + } + } + + fn watermark_expression(table: &SourceTable) -> DFResult { + match table.temporal_config.watermark_strategy_column.clone() { + Some(watermark_field) => table + .schema_specs + .iter() + .find_map(|c| { + if c.arrow_field().name() == watermark_field.as_str() { + return if let Some(expr) = c.computation_logic() { + Some(expr.clone()) + } else { + Some(Expr::Column(Column { + relation: None, + name: c.arrow_field().name().to_string(), + spans: Default::default(), + })) + }; + } + None + }) + .ok_or_else(|| { + DataFusionError::Plan(format!("Watermark field {watermark_field} not found")) + }), + None => Ok(Expr::BinaryExpr(BinaryExpr { + left: Box::new(Expr::Column(Column { + relation: None, + name: TIMESTAMP_FIELD.to_string(), + spans: Default::default(), + })), + op: logical_expr::Operator::Minus, + right: Box::new(Expr::Literal( + ScalarValue::DurationNanosecond(Some(Duration::from_secs(1).as_nanos() as i64)), + None, + )), + })), + } + } + + fn projection_expressions( + table: &SourceTable, + qualifier: &TableReference, + projection: &Option>, + ) -> DFResult> { + let mut expressions: Vec = table + .schema_specs + .iter() + .map(|col| 
Self::projection_expr_for_column(col, qualifier)) + .collect(); + + if let Some(proj) = projection { + expressions = proj.iter().map(|i| expressions[*i].clone()).collect(); + } + + if let Some(event_time_field) = table.temporal_config.event_column.clone() { + let expr = table + .schema_specs + .iter() + .find_map(|c| { + if c.arrow_field().name() == event_time_field.as_str() { + return Some(Self::projection_expr_for_column(c, qualifier)); + } + None + }) + .ok_or_else(|| { + DataFusionError::Plan(format!("Event time field {event_time_field} not found")) + })?; + + expressions + .push(expr.alias_qualified(Some(qualifier.clone()), TIMESTAMP_FIELD.to_string())); + } else { + let has_ts = table + .schema_specs + .iter() + .any(|c| c.arrow_field().name() == TIMESTAMP_FIELD); + if !has_ts { + return plan_err!( + "Connector table '{}' has no `{}` column; declare WATERMARK FOR AS ... in CREATE TABLE", + table.table_identifier, + TIMESTAMP_FIELD + ); + } + expressions.push(Expr::Column(Column::new( + Some(qualifier.clone()), + TIMESTAMP_FIELD, + ))); + } + + if table.is_updating() { + expressions.push(Expr::Column(Column::new( + Some(qualifier.clone()), + UPDATING_META_FIELD, + ))); + } + + Ok(expressions) + } + + + /// Connector path: `StreamIngestionNode` (table source) → optional `UnrollDebeziumPayloadNode` + /// → `Projection`, mirroring Arroyo `TableSourceExtension` + Debezium unroll + projection. 
+ fn projection(&self, table_scan: &TableScan, table: &SourceTable) -> DFResult { + let qualifier = table_scan.table_name.clone(); + + let table_source = LogicalPlan::Extension(Extension { + node: Arc::new(StreamIngestionNode::try_new( + qualifier.clone(), + table.clone(), + )?), + }); + + let (projection_input, scan_projection) = if table.is_updating() { + if table.key_constraints.is_empty() { + return plan_err!( + "Updating connector table `{}` requires at least one PRIMARY KEY for CDC unrolling", + table.table_identifier + ); + } + let unrolled = LogicalPlan::Extension(Extension { + node: Arc::new(UnrollDebeziumPayloadNode::try_new( + table_source, + Arc::new(table.key_constraints.clone()), + )?), + }); + (unrolled, None) + } else { + (table_source, table_scan.projection.clone()) + }; + + Ok(LogicalPlan::Projection(Projection::try_new( + Self::projection_expressions(table, &qualifier, &scan_projection)?, + Arc::new(projection_input), + )?)) + } + + fn mutate_connector_table( + &self, + table_scan: &TableScan, + table: &SourceTable, + ) -> DFResult> { + let input = self.projection(table_scan, table)?; + + let schema = input.schema().clone(); + let remote = LogicalPlan::Extension(Extension { + node: Arc::new(RemoteTableBoundaryNode { + upstream_plan: input, + table_identifier: table_scan.table_name.to_owned(), + resolved_schema: schema, + requires_materialization: true, + }), + }); + + let watermark_node = EventTimeWatermarkNode::try_new( + remote, + table_scan.table_name.clone(), + Self::watermark_expression(table)?, + ) + .map_err(|err| { + DataFusionError::Internal(format!("failed to create watermark node: {err}")) + })?; + + Ok(Transformed::yes(LogicalPlan::Extension(Extension { + node: Arc::new(watermark_node), + }))) + } + + fn mutate_table_from_query( + &self, + table_scan: &TableScan, + logical_plan: &LogicalPlan, + ) -> DFResult> { + let column_expressions: Vec<_> = if let Some(projection) = &table_scan.projection { + logical_plan + .schema() + .columns() 
+ .into_iter() + .enumerate() + .filter_map(|(i, col)| { + if projection.contains(&i) { + Some(Expr::Column(col)) + } else { + None + } + }) + .collect() + } else { + logical_plan + .schema() + .columns() + .into_iter() + .map(Expr::Column) + .collect() + }; + + let target_columns: Vec<_> = table_scan.projected_schema.columns().into_iter().collect(); + + let expressions = column_expressions + .into_iter() + .zip(target_columns) + .map(|(expr, col)| expr.alias_qualified(col.relation, col.name)) + .collect(); + + let projection = LogicalPlan::Projection(Projection::try_new_with_schema( + expressions, + Arc::new(logical_plan.clone()), + table_scan.projected_schema.clone(), + )?); + + Ok(Transformed::yes(projection)) + } +} + +impl TreeNodeRewriter for SourceRewriter<'_> { + type Node = LogicalPlan; + + fn f_up(&mut self, node: Self::Node) -> DFResult> { + let LogicalPlan::TableScan(table_scan) = node else { + return Ok(Transformed::no(node)); + }; + + let table_name = table_scan.table_name.table(); + let table = self + .schema_provider + .get_catalog_table(table_name) + .ok_or_else(|| DataFusionError::Plan(format!("Table {table_name} not found")))?; + + match table { + Table::ConnectorTable(table) => self.mutate_connector_table(&table_scan, table), + Table::LookupTable(_table) => { + // TODO: implement LookupSource extension + plan_err!("Lookup tables are not yet supported") + } + Table::TableFromQuery { + name: _, + logical_plan, + } => self.mutate_table_from_query(&table_scan, logical_plan), + } + } +} diff --git a/src/sql/analysis/stream_rewriter.rs b/src/sql/analysis/stream_rewriter.rs new file mode 100644 index 00000000..a62a7bd1 --- /dev/null +++ b/src/sql/analysis/stream_rewriter.rs @@ -0,0 +1,231 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::Arc; + +use super::StreamSchemaProvider; +use crate::sql::extensions::StreamingOperatorBlueprint; +use crate::sql::extensions::remote_table::RemoteTableBoundaryNode; +use crate::sql::analysis::row_time_rewriter::RowTimeRewriter; +use crate::sql::analysis::{ + aggregate_rewriter::AggregateRewriter, join_rewriter::JoinRewriter, + window_function_rewriter::WindowFunctionRewriter, +}; +use crate::sql::analysis::TimeWindowNullCheckRemover; +use crate::sql::schema::utils::{add_timestamp_field, has_timestamp_field}; +use crate::sql::types::{DFField, TIMESTAMP_FIELD}; +use datafusion::common::tree_node::{Transformed, TreeNodeRewriter}; +use datafusion::common::{Column, DataFusionError, Result, Spans, TableReference, plan_err}; +use datafusion::logical_expr::{ + Expr, Extension, Filter, LogicalPlan, Projection, SubqueryAlias, Union, +}; +use datafusion_common::tree_node::TreeNode; +use datafusion_expr::{Aggregate, Join}; + +pub struct StreamRewriter<'a> { + pub(crate) schema_provider: &'a StreamSchemaProvider, +} + +impl TreeNodeRewriter for StreamRewriter<'_> { + type Node = LogicalPlan; + + fn f_up(&mut self, node: Self::Node) -> Result> { + match node { + // Logic Delegation + LogicalPlan::Projection(p) => self.rewrite_projection(p), + LogicalPlan::Filter(f) => self.rewrite_filter(f), + LogicalPlan::Union(u) => self.rewrite_union(u), + + // Delegation to specialized sub-rewriters + LogicalPlan::Aggregate(agg) => self.rewrite_aggregate(agg), + LogicalPlan::Join(join) => self.rewrite_join(join), + LogicalPlan::Window(_) => 
self.rewrite_window(node), + LogicalPlan::SubqueryAlias(sa) => self.rewrite_subquery_alias(sa), + + // Explicitly Unsupported Operations + LogicalPlan::Sort(_) => self.unsupported_error("ORDER BY", &node), + LogicalPlan::Limit(_) => self.unsupported_error("LIMIT", &node), + LogicalPlan::Repartition(_) => self.unsupported_error("Repartitions", &node), + LogicalPlan::Explain(_) => self.unsupported_error("EXPLAIN", &node), + LogicalPlan::Analyze(_) => self.unsupported_error("ANALYZE", &node), + + _ => Ok(Transformed::no(node)), + } + } +} + +impl<'a> StreamRewriter<'a> { + pub fn new(schema_provider: &'a StreamSchemaProvider) -> Self { + Self { schema_provider } + } + + /// Delegates to AggregateRewriter to transform batch aggregates into streaming stateful operators. + fn rewrite_aggregate(&self, agg: Aggregate) -> Result> { + AggregateRewriter { + schema_provider: self.schema_provider, + } + .f_up(LogicalPlan::Aggregate(agg)) + } + + /// Delegates to JoinRewriter to handle streaming join semantics (e.g., TTL, state management). + fn rewrite_join(&self, join: Join) -> Result> { + JoinRewriter { + schema_provider: self.schema_provider, + } + .f_up(LogicalPlan::Join(join)) + } + + /// Delegates to WindowFunctionRewriter for stream-aware windowing logic. + fn rewrite_window(&self, node: LogicalPlan) -> Result> { + WindowFunctionRewriter {}.f_up(node) + } + + /// Refreshes SubqueryAlias metadata to align with potentially rewritten internal schemas. + fn rewrite_subquery_alias(&self, sa: SubqueryAlias) -> Result> { + // Since the inner 'sa.input' has been rewritten (bottom-up), we must re-create + // the alias node to ensure the outer schema correctly reflects internal changes. 
+ let new_sa = SubqueryAlias::try_new(sa.input, sa.alias).map_err(|e| { + DataFusionError::Internal(format!("Failed to re-alias subquery: {}", e)) + })?; + + Ok(Transformed::yes(LogicalPlan::SubqueryAlias(new_sa))) + } + + /// Handles timestamp propagation and row_time() mapping for Projections + fn rewrite_projection(&self, mut projection: Projection) -> Result> { + // Check if the current projection already has a timestamp field; + // if not, we must inject it to maintain streaming heartbeats. + if !has_timestamp_field(&projection.schema) { + let input_schema = projection.input.schema(); + + // Resolve the timestamp field from the input schema using the global constant. + let timestamp_field: DFField = input_schema + .qualified_field_with_unqualified_name(TIMESTAMP_FIELD) + .map_err(|_| { + DataFusionError::Plan(format!( + "No timestamp field found in projection input ({})", + projection.input.display() + )) + })? + .into(); + + // Update the logical schema to include the newly injected timestamp. + projection.schema = add_timestamp_field( + projection.schema.clone(), + timestamp_field.qualifier().cloned(), + ) + .expect("Failed to add timestamp to projection schema"); + + // Physically push the timestamp column into the expression list. + projection.expr.push(Expr::Column(Column { + relation: timestamp_field.qualifier().cloned(), + name: TIMESTAMP_FIELD.to_string(), + spans: Spans::default(), + })); + } + + // Map user-friendly row_time() function calls to internal _timestamp column references. + let rewritten = projection + .expr + .iter() + .map(|expr| expr.clone().rewrite(&mut RowTimeRewriter {})) + .collect::>>()?; + + // If any expressions were modified (e.g., row_time() was replaced), update the projection. + if rewritten.iter().any(|r| r.transformed) { + projection.expr = rewritten.into_iter().map(|r| r.data).collect(); + } + + // Return the updated plan node wrapped in a Transformed container. 
+ Ok(Transformed::yes(LogicalPlan::Projection(projection))) + } + + /// Harmonizes schemas across Union branches and wraps them in RemoteTableBoundaryNodes. + /// + /// This ensures that all inputs to a UNION operation share the exact same schema metadata, + /// preventing "Schema Drift" where different branches have different field qualifiers. + fn rewrite_union(&self, mut union: Union) -> Result> { + // Industrial engines use the first branch as the "Master Schema" for the Union. + // We clone it once to ensure all subsequent branches are forced to comply. + let master_schema = union.inputs[0].schema().clone(); + union.schema = master_schema.clone(); + + for input in union.inputs.iter_mut() { + // Optimization: If the node is already a non-transparent Extension, + // we skip wrapping to avoid unnecessary nesting of logical nodes. + if let LogicalPlan::Extension(Extension { node }) = input.as_ref() { + let stream_ext: &dyn StreamingOperatorBlueprint = node.try_into().map_err(|e| { + DataFusionError::Internal(format!("Failed to resolve StreamingOperatorBlueprint: {}", e)) + })?; + + if !stream_ext.is_passthrough_boundary() { + continue; + } + } + + // Wrap each branch in a RemoteTableBoundaryNode. + // This acts as a logical "bridge" that forces the input to adopt the master_schema, + // effectively stripping away branch-specific qualifiers (e.g., table aliases). + let remote_ext = Arc::new(RemoteTableBoundaryNode { + upstream_plan: input.as_ref().clone(), + table_identifier: TableReference::bare("union_input"), + resolved_schema: master_schema.clone(), + requires_materialization: false, // Internal logical boundary only; does not require physical sink. + }); + + // Atomically replace the input with the wrapped version. + *input = Arc::new(LogicalPlan::Extension(Extension { node: remote_ext })); + } + + Ok(Transformed::yes(LogicalPlan::Union(union))) + } + + /// Optimizes Filter nodes by stripping redundant NULL checks on time window expressions. 
+ /// + /// In streaming SQL, DataFusion often injects 'IS NOT NULL' guards for window functions + /// that are redundant or can interfere with watermark propagation. This rewriter + /// cleans those predicates to simplify the physical execution plan. + fn rewrite_filter(&self, filter: Filter) -> Result> { + // We attempt to rewrite the predicate using a specialized sub-rewriter. + // The TimeWindowNullCheckRemover specifically targets expressions like + // `tumble(...) IS NOT NULL` and simplifies them to `TRUE`. + let rewritten_expr = filter + .predicate + .clone() + .rewrite(&mut TimeWindowNullCheckRemover {})?; + + if !rewritten_expr.transformed { + return Ok(Transformed::no(LogicalPlan::Filter(filter))); + } + + // Industrial Guard: Re-validate the predicate against the input schema. + // 'Filter::try_new' ensures that the transformed expression is still semantically + // valid for the underlying data stream. + let new_filter = Filter::try_new(rewritten_expr.data, filter.input).map_err(|e| { + DataFusionError::Internal(format!( + "Failed to re-validate filtered predicate after NULL-check removal: {}", + e + )) + })?; + + Ok(Transformed::yes(LogicalPlan::Filter(new_filter))) + } + + /// Centralized error handler for unsupported streaming operations + fn unsupported_error(&self, op: &str, node: &LogicalPlan) -> Result> { + plan_err!( + "{} is not currently supported in streaming SQL ({})", + op, + node.display() + ) + } +} diff --git a/src/sql/analysis/streaming_window_analzer.rs b/src/sql/analysis/streaming_window_analzer.rs new file mode 100644 index 00000000..609bd2ee --- /dev/null +++ b/src/sql/analysis/streaming_window_analzer.rs @@ -0,0 +1,215 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::HashSet; +use std::sync::Arc; + +use datafusion::common::tree_node::{TreeNodeRecursion, TreeNodeVisitor}; +use datafusion::common::{Column, DFSchema, DataFusionError, Result}; +use datafusion::logical_expr::{Expr, Extension, LogicalPlan, expr::Alias}; + +use crate::sql::extensions::aggregate::{STREAM_AGG_EXTENSION_NAME, StreamWindowAggregateNode}; +use crate::sql::extensions::join::STREAM_JOIN_NODE_TYPE; +use crate::sql::types::{DFField, WindowBehavior, WindowType, fields_with_qualifiers, find_window}; + +/// WindowDetectingVisitor identifies windowing strategies and tracks window-carrying fields +/// as they propagate upward through the logical plan tree. +#[derive(Debug, Default)] +pub(crate) struct StreamingWindowAnalzer { + /// The specific window type discovered (Tumble, Hop, etc.) + pub(crate) window: Option, + /// Set of fields in the current plan node that carry window semantics. + pub(crate) fields: HashSet, +} + +impl StreamingWindowAnalzer { + /// Entry point to resolve the WindowType of a given plan branch. + pub(crate) fn get_window(logical_plan: &LogicalPlan) -> Result> { + let mut visitor = Self::default(); + logical_plan.visit_with_subqueries(&mut visitor)?; + Ok(visitor.window) + } + + /// Resolves whether an expression is a reference to an existing window field + /// or a definition of a new window function. + fn resolve_window_from_expr( + &self, + expr: &Expr, + input_schema: &DFSchema, + ) -> Result> { + // 1. Check if the expression directly references a known window field. 
+ if let Some(col) = extract_column(expr) { + let field = input_schema.field_with_name(col.relation.as_ref(), &col.name)?; + let df_field: DFField = (col.relation.clone(), Arc::new(field.clone())).into(); + + if self.fields.contains(&df_field) { + return Ok(self.window.clone()); + } + } + + // 2. Otherwise, check if it's a new window function call (e.g., tumble(), hop()). + find_window(expr) + } + + /// Updates the internal state with new window findings and maps them to the output schema. + fn update_state( + &mut self, + matched_windows: Vec<(usize, WindowType)>, + schema: &DFSchema, + ) -> Result<()> { + // Clear fields from the previous level to maintain schema strictly for the current node. + self.fields.clear(); + + for (index, window) in matched_windows { + if let Some(existing) = &self.window { + if existing != &window { + return Err(DataFusionError::Plan(format!( + "Conflicting windows in the same operator: expected {:?}, found {:?}", + existing, window + ))); + } + } else { + self.window = Some(window); + } + // Record this specific index in the schema as a window carrier. + self.fields.insert(schema.qualified_field(index).into()); + } + Ok(()) + } +} + +pub(crate) fn extract_column(expr: &Expr) -> Option<&Column> { + match expr { + Expr::Column(column) => Some(column), + Expr::Alias(Alias { expr, .. }) => extract_column(expr), + _ => None, + } +} + +impl TreeNodeVisitor<'_> for StreamingWindowAnalzer { + type Node = LogicalPlan; + + fn f_down(&mut self, node: &Self::Node) -> Result { + // Joins require cross-branch validation to ensure left and right sides align on time. + if let LogicalPlan::Extension(Extension { node }) = node + && node.name() == STREAM_JOIN_NODE_TYPE + { + let mut branch_windows = HashSet::new(); + for input in node.inputs() { + if let Some(w) = Self::get_window(input)? 
{ + branch_windows.insert(w); + } + } + + if branch_windows.len() > 1 { + return Err(DataFusionError::Plan( + "Join inputs have mismatched windowing strategies.".into(), + )); + } + self.window = branch_windows.into_iter().next(); + + // Optimization: No need to recurse manually if we've resolved the join boundary. + return Ok(TreeNodeRecursion::Jump); + } + Ok(TreeNodeRecursion::Continue) + } + + fn f_up(&mut self, node: &Self::Node) -> Result { + match node { + LogicalPlan::Projection(p) => { + let windows = p + .expr + .iter() + .enumerate() + .filter_map(|(i, e)| { + self.resolve_window_from_expr(e, p.input.schema()) + .transpose() + .map(|res| res.map(|w| (i, w))) + }) + .collect::>>()?; + + self.update_state(windows, &p.schema)?; + } + + LogicalPlan::Aggregate(agg) => { + let windows = agg + .group_expr + .iter() + .enumerate() + .filter_map(|(i, e)| { + self.resolve_window_from_expr(e, agg.input.schema()) + .transpose() + .map(|res| res.map(|w| (i, w))) + }) + .collect::>>()?; + + self.update_state(windows, &agg.schema)?; + } + + LogicalPlan::SubqueryAlias(sa) => { + // Map fields through the alias layer by resolving column indices. + let input_schema = sa.input.schema(); + let mapped = self + .fields + .drain() + .map(|f| { + let idx = input_schema.index_of_column(&f.qualified_column())?; + Ok(sa.schema.qualified_field(idx).into()) + }) + .collect::>>()?; + + self.fields = mapped; + } + + LogicalPlan::Extension(Extension { node }) + if node.name() == STREAM_AGG_EXTENSION_NAME => + { + let ext = node + .as_any() + .downcast_ref::() + .ok_or_else(|| { + DataFusionError::Internal("StreamWindowAggregateNode is malformed".into()) + })?; + + match &ext.window_spec { + WindowBehavior::FromOperator { + window, + window_field, + is_nested, + .. 
+ } => { + if self.window.is_some() && !*is_nested { + return Err(DataFusionError::Plan( + "Redundant window definition on an already windowed stream.".into(), + )); + } + self.window = Some(window.clone()); + self.fields.insert(window_field.clone()); + } + WindowBehavior::InData => { + let current_schema_fields: HashSet<_> = + fields_with_qualifiers(node.schema()).into_iter().collect(); + self.fields.retain(|f| current_schema_fields.contains(f)); + + if self.fields.is_empty() { + return Err(DataFusionError::Plan( + "Windowed aggregate missing window metadata from its input.".into(), + )); + } + } + } + } + _ => {} + } + Ok(TreeNodeRecursion::Continue) + } +} diff --git a/src/sql/analysis/time_window.rs b/src/sql/analysis/time_window.rs new file mode 100644 index 00000000..104c0cca --- /dev/null +++ b/src/sql/analysis/time_window.rs @@ -0,0 +1,83 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use datafusion::common::tree_node::{ + Transformed, TreeNodeRecursion, TreeNodeRewriter, TreeNodeVisitor, +}; +use datafusion::common::{DataFusionError, Result as DFResult, ScalarValue, plan_err}; +use datafusion::logical_expr::expr::ScalarFunction; +use datafusion::logical_expr::{Expr, LogicalPlan}; + +/// Returns the time window function name if the expression is one (tumble/hop/session). 
+pub fn is_time_window(expr: &Expr) -> Option<&str> { + if let Expr::ScalarFunction(ScalarFunction { func, args: _ }) = expr { + match func.name() { + "tumble" | "hop" | "session" => return Some(func.name()), + _ => {} + } + } + None +} + +struct TimeWindowExprChecker {} + +impl TreeNodeVisitor<'_> for TimeWindowExprChecker { + type Node = Expr; + + fn f_down(&mut self, node: &Self::Node) -> DFResult { + if let Some(w) = is_time_window(node) { + return plan_err!( + "time window function {} is not allowed in this context. \ + Are you missing a GROUP BY clause?", + w + ); + } + Ok(TreeNodeRecursion::Continue) + } +} + +/// Visitor that checks an entire LogicalPlan for misplaced time window UDFs. +pub struct TimeWindowUdfChecker {} + +impl TreeNodeVisitor<'_> for TimeWindowUdfChecker { + type Node = LogicalPlan; + + fn f_down(&mut self, node: &Self::Node) -> DFResult { + use datafusion::common::tree_node::TreeNode; + node.expressions().iter().try_for_each(|expr| { + let mut checker = TimeWindowExprChecker {}; + expr.visit(&mut checker)?; + Ok::<(), DataFusionError>(()) + })?; + Ok(TreeNodeRecursion::Continue) + } +} + +/// Removes `IS NOT NULL` checks wrapping time window functions, +/// replacing them with `true` since time windows are never null. +pub struct TimeWindowNullCheckRemover {} + +impl TreeNodeRewriter for TimeWindowNullCheckRemover { + type Node = Expr; + + fn f_down(&mut self, node: Self::Node) -> DFResult> { + if let Expr::IsNotNull(expr) = &node + && is_time_window(expr).is_some() + { + return Ok(Transformed::yes(Expr::Literal( + ScalarValue::Boolean(Some(true)), + None, + ))); + } + Ok(Transformed::no(node)) + } +} diff --git a/src/sql/analysis/udafs.rs b/src/sql/analysis/udafs.rs new file mode 100644 index 00000000..73fc062c --- /dev/null +++ b/src/sql/analysis/udafs.rs @@ -0,0 +1,43 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use datafusion::arrow::array::ArrayRef; +use datafusion::error::Result; +use datafusion::physical_plan::Accumulator; +use datafusion::scalar::ScalarValue; +use std::fmt::Debug; + +/// Fake UDAF used just for plan-time placeholder. +#[derive(Debug)] +pub struct EmptyUdaf {} + +impl Accumulator for EmptyUdaf { + fn update_batch(&mut self, _: &[ArrayRef]) -> Result<()> { + unreachable!() + } + + fn evaluate(&self) -> Result { + unreachable!() + } + + fn size(&self) -> usize { + unreachable!() + } + + fn state(&self) -> Result> { + unreachable!() + } + + fn merge_batch(&mut self, _: &[ArrayRef]) -> Result<()> { + unreachable!() + } +} diff --git a/src/sql/analysis/unnest_rewriter.rs b/src/sql/analysis/unnest_rewriter.rs new file mode 100644 index 00000000..535590c8 --- /dev/null +++ b/src/sql/analysis/unnest_rewriter.rs @@ -0,0 +1,179 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::sync::Arc; + +use datafusion::arrow::datatypes::DataType; +use datafusion::common::tree_node::{Transformed, TreeNode, TreeNodeRewriter}; +use datafusion::common::{Column, Result as DFResult, plan_err}; +use datafusion::logical_expr::expr::ScalarFunction; +use datafusion::logical_expr::{ColumnUnnestList, Expr, LogicalPlan, Projection, Unnest}; + +use crate::sql::common::constants::planning_placeholder_udf; +use crate::sql::types::{DFField, fields_with_qualifiers, schema_from_df_fields}; + +pub const UNNESTED_COL: &str = "__unnested"; + +/// Rewrites projections containing `unnest()` calls into proper Unnest logical plans. +pub struct UnnestRewriter {} + +impl UnnestRewriter { + fn split_unnest(expr: Expr) -> DFResult<(Expr, Option)> { + let mut captured: Option = None; + + let expr = expr.transform_up(|e| { + if let Expr::ScalarFunction(ScalarFunction { func: udf, args }) = &e + && udf.name() == planning_placeholder_udf::UNNEST + { + match args.len() { + 1 => { + if captured.replace(args[0].clone()).is_some() { + return plan_err!( + "Multiple unnests in expression, which is not allowed" + ); + } + return Ok(Transformed::yes(Expr::Column(Column::new_unqualified( + UNNESTED_COL, + )))); + } + n => { + panic!("Unnest has wrong number of arguments (expected 1, found {n})"); + } + } + } + Ok(Transformed::no(e)) + })?; + + Ok((expr.data, captured)) + } +} + +impl TreeNodeRewriter for UnnestRewriter { + type Node = LogicalPlan; + + fn f_up(&mut self, node: Self::Node) -> DFResult> { + let LogicalPlan::Projection(projection) = &node else { + if node.expressions().iter().any(|e| { + let e = Self::split_unnest(e.clone()); + e.is_err() || e.unwrap().1.is_some() + }) { + return plan_err!("unnest is only supported in SELECT statements"); + } + return Ok(Transformed::no(node)); + }; + + let mut unnest = None; + let exprs = projection + .expr + .clone() + .into_iter() + .enumerate() + .map(|(i, expr)| { + let (expr, opt) = Self::split_unnest(expr)?; + let is_unnest = 
if let Some(e) = opt { + if let Some(prev) = unnest.replace((e, i)) + && &prev != unnest.as_ref().unwrap() + { + return plan_err!( + "Projection contains multiple unnests, which is not currently supported" + ); + } + true + } else { + false + }; + + Ok((expr, is_unnest)) + }) + .collect::>>()?; + + if let Some((unnest_inner, unnest_idx)) = unnest { + let produce_list = Arc::new(LogicalPlan::Projection(Projection::try_new( + exprs + .iter() + .cloned() + .map(|(e, is_unnest)| { + if is_unnest { + unnest_inner.clone().alias(UNNESTED_COL) + } else { + e + } + }) + .collect(), + projection.input.clone(), + )?)); + + let unnest_fields = fields_with_qualifiers(produce_list.schema()) + .iter() + .enumerate() + .map(|(i, f)| { + if i == unnest_idx { + let DataType::List(inner) = f.data_type() else { + return plan_err!( + "Argument '{}' to unnest is not a List", + f.qualified_name() + ); + }; + Ok(DFField::new_unqualified( + UNNESTED_COL, + inner.data_type().clone(), + inner.is_nullable(), + )) + } else { + Ok((*f).clone()) + } + }) + .collect::>>()?; + + let unnest_node = LogicalPlan::Unnest(Unnest { + exec_columns: vec![ + DFField::from(produce_list.schema().qualified_field(unnest_idx)) + .qualified_column(), + ], + input: produce_list, + list_type_columns: vec![( + unnest_idx, + ColumnUnnestList { + output_column: Column::new_unqualified(UNNESTED_COL), + depth: 1, + }, + )], + struct_type_columns: vec![], + dependency_indices: vec![], + schema: Arc::new(schema_from_df_fields(&unnest_fields)?), + options: Default::default(), + }); + + let output_node = LogicalPlan::Projection(Projection::try_new( + exprs + .iter() + .enumerate() + .map(|(i, (expr, has_unnest))| { + if *has_unnest { + expr.clone() + } else { + Expr::Column( + DFField::from(unnest_node.schema().qualified_field(i)) + .qualified_column(), + ) + } + }) + .collect(), + Arc::new(unnest_node), + )?); + + Ok(Transformed::yes(output_node)) + } else { + 
Ok(Transformed::no(LogicalPlan::Projection(projection.clone()))) + } + } +} diff --git a/src/sql/analysis/window_function_rewriter.rs b/src/sql/analysis/window_function_rewriter.rs new file mode 100644 index 00000000..63c502bf --- /dev/null +++ b/src/sql/analysis/window_function_rewriter.rs @@ -0,0 +1,203 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use datafusion::common::tree_node::Transformed; +use datafusion::common::{Result as DFResult, plan_err, tree_node::TreeNodeRewriter}; +use datafusion::logical_expr::{ + self, Expr, Extension, LogicalPlan, Projection, Sort, Window, expr::WindowFunction, + expr::WindowFunctionParams, +}; +use datafusion_common::DataFusionError; +use std::sync::Arc; +use tracing::debug; + +use crate::sql::extensions::key_calculation::{KeyExtractionNode, KeyExtractionStrategy}; +use crate::sql::extensions::windows_function::StreamingWindowFunctionNode; +use crate::sql::analysis::streaming_window_analzer::{StreamingWindowAnalzer, extract_column}; +use crate::sql::types::{WindowType, fields_with_qualifiers, schema_from_df_fields}; + +/// WindowFunctionRewriter transforms standard SQL Window functions into streaming-compatible +/// stateful operators, ensuring proper data partitioning and sorting for distributed execution. +pub(crate) struct WindowFunctionRewriter; + +impl WindowFunctionRewriter { + /// Recursively unwraps Aliases to find the underlying WindowFunction. 
+ fn resolve_window_function(&self, expr: &Expr) -> DFResult<(WindowFunction, String)> { + match expr { + Expr::Alias(alias) => { + let (func, _) = self.resolve_window_function(&alias.expr)?; + Ok((func, alias.name.clone())) + } + Expr::WindowFunction(wf) => Ok((wf.as_ref().clone(), expr.name_for_alias()?)), + _ => plan_err!("Expected WindowFunction or Alias, found: {:?}", expr), + } + } + + /// Identifies which field in the PARTITION BY clause corresponds to the streaming window. + fn identify_window_partition( + &self, + params: &WindowFunctionParams, + input: &LogicalPlan, + input_window_fields: &std::collections::HashSet, + ) -> DFResult { + let matched: Vec<_> = params + .partition_by + .iter() + .enumerate() + .filter_map(|(i, e)| { + let col = extract_column(e)?; + let field = input + .schema() + .field_with_name(col.relation.as_ref(), &col.name) + .ok()?; + let df_field = (col.relation.clone(), Arc::new(field.clone())).into(); + + if input_window_fields.contains(&df_field) { + Some(i) + } else { + None + } + }) + .collect(); + + if matched.len() != 1 { + return plan_err!( + "Streaming window functions require exactly one window column in PARTITION BY. Found: {}", + matched.len() + ); + } + Ok(matched[0]) + } + + /// Wraps the input in a Projection and KeyExtractionNode to handle data distribution. + fn build_keyed_input( + &self, + input: Arc, + partition_keys: &[Expr], + ) -> DFResult { + let key_count = partition_keys.len(); + + // 1. Build projection: [_key_0, _key_1, ..., original_columns] + let mut exprs: Vec<_> = partition_keys + .iter() + .enumerate() + .map(|(i, e)| e.clone().alias(format!("_key_{i}"))) + .collect(); + + exprs.extend( + fields_with_qualifiers(input.schema()) + .iter() + .map(|f| Expr::Column(f.qualified_column())), + ); + + // 2. 
Derive the keyed schema + let mut keyed_fields = + fields_with_qualifiers(&Projection::try_new(exprs.clone(), input.clone())?.schema) + .iter() + .take(key_count) + .cloned() + .collect::>(); + keyed_fields.extend(fields_with_qualifiers(input.schema())); + + let keyed_schema = Arc::new(schema_from_df_fields(&keyed_fields)?); + + let projection = + LogicalPlan::Projection(Projection::try_new_with_schema(exprs, input, keyed_schema)?); + + // 3. Wrap in KeyExtractionNode for the physical planner + Ok(LogicalPlan::Extension(Extension { + node: Arc::new(KeyExtractionNode::new( + projection, + KeyExtractionStrategy::ColumnIndices((0..key_count).collect()), + )), + })) + } +} + +impl TreeNodeRewriter for WindowFunctionRewriter { + type Node = LogicalPlan; + + fn f_up(&mut self, node: Self::Node) -> DFResult> { + let LogicalPlan::Window(window) = node else { + return Ok(Transformed::no(node)); + }; + + debug!("Rewriting window function for streaming: {:?}", window); + + // 1. Analyze input windowing context + let mut analyzer = StreamingWindowAnalzer::default(); + window.input.visit_with_subqueries(&mut analyzer)?; + + let input_window = analyzer.window.ok_or_else(|| { + DataFusionError::Plan( + "Window functions require a windowed input stream (e.g., TUMBLE/HOP)".into(), + ) + })?; + + if matches!(input_window, WindowType::Session { .. }) { + return plan_err!( + "Streaming window functions (OVER) are not supported on Session windows." + ); + } + + // 2. Validate window expression constraints + if window.window_expr.len() != 1 { + return plan_err!( + "Arroyo currently supports exactly one window expression per OVER clause." + ); + } + + let (mut wf, original_name) = self.resolve_window_function(&window.window_expr[0])?; + + // 3. 
Identify and extract the window column from PARTITION BY + let window_part_idx = + self.identify_window_partition(&wf.params, &window.input, &analyzer.fields)?; + let mut partition_keys = wf.params.partition_by.clone(); + partition_keys.remove(window_part_idx); + + // Update function params to exclude the window column from internal partitioning + // as the streaming engine handles window boundaries natively. + wf.params.partition_by = partition_keys.clone(); + let key_count = partition_keys.len(); + + // 4. Build the data-shuffling pipeline (Projection -> KeyCalc -> Sort) + let keyed_plan = self.build_keyed_input(window.input.clone(), &partition_keys)?; + + let mut sort_exprs: Vec<_> = partition_keys + .iter() + .map(|e| logical_expr::expr::Sort { + expr: e.clone(), + asc: true, + nulls_first: false, + }) + .collect(); + sort_exprs.extend(wf.params.order_by.clone()); + + let sorted_plan = LogicalPlan::Sort(Sort { + expr: sort_exprs, + input: Arc::new(keyed_plan), + fetch: None, + }); + + // 5. Final Assembly + let final_wf_expr = Expr::WindowFunction(Box::new(wf)).alias_if_changed(original_name)?; + let rewritten_window = + LogicalPlan::Window(Window::try_new(vec![final_wf_expr], Arc::new(sorted_plan))?); + + Ok(Transformed::yes(LogicalPlan::Extension(Extension { + node: Arc::new(StreamingWindowFunctionNode::new( + rewritten_window, + (0..key_count).collect(), + )), + }))) + } +} diff --git a/src/sql/api/checkpoints.rs b/src/sql/api/checkpoints.rs new file mode 100644 index 00000000..d9bdc139 --- /dev/null +++ b/src/sql/api/checkpoints.rs @@ -0,0 +1,108 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use crate::sql::common::to_micros; +use serde::{Deserialize, Serialize}; +use std::time::SystemTime; + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct Checkpoint { + pub epoch: u32, + pub backend: String, + pub start_time: u64, + pub finish_time: Option, + pub events: Vec, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct CheckpointEventSpan { + pub start_time: u64, + pub finish_time: u64, + pub event: String, + pub description: String, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct SubtaskCheckpointGroup { + pub index: u32, + pub bytes: u64, + pub event_spans: Vec, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct OperatorCheckpointGroup { + pub operator_id: String, + pub bytes: u64, + pub started_metadata_write: Option, + pub finish_time: Option, + pub subtasks: Vec, +} + +#[derive(Debug, Copy, Clone, Eq, PartialEq, Serialize, Deserialize)] +pub enum JobCheckpointEventType { + Checkpointing, + CheckpointingOperators, + WritingMetadata, + Compacting, + Committing, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct JobCheckpointSpan { + pub event: JobCheckpointEventType, + pub start_time: u64, + pub finish_time: Option, +} + +impl JobCheckpointSpan { + pub fn now(event: JobCheckpointEventType) -> Self { + Self { + event, + start_time: to_micros(SystemTime::now()), + finish_time: None, + } + } + + pub fn finish(&mut self) { 
+ if self.finish_time.is_none() { + self.finish_time = Some(to_micros(SystemTime::now())); + } + } +} + +impl From for CheckpointEventSpan { + fn from(value: JobCheckpointSpan) -> Self { + let description = match value.event { + JobCheckpointEventType::Checkpointing => "The entire checkpointing process", + JobCheckpointEventType::CheckpointingOperators => { + "The time spent checkpointing operator states" + } + JobCheckpointEventType::WritingMetadata => "Writing the final checkpoint metadata", + JobCheckpointEventType::Compacting => "Compacting old checkpoints", + JobCheckpointEventType::Committing => { + "Running two-phase commit for transactional connectors" + } + } + .to_string(); + + Self { + start_time: value.start_time, + finish_time: value.finish_time.unwrap_or_default(), + event: format!("{:?}", value.event), + description, + } + } +} diff --git a/src/sql/api/connections.rs b/src/sql/api/connections.rs new file mode 100644 index 00000000..148df69d --- /dev/null +++ b/src/sql/api/connections.rs @@ -0,0 +1,616 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use crate::sql::common::formats::{BadData, Format, Framing}; +use crate::sql::common::{FsExtensionType, FsSchema}; +use datafusion::arrow::datatypes::{DataType, Field, Fields, TimeUnit}; +use serde::ser::SerializeMap; +use serde::{Deserialize, Serialize, Serializer}; +use std::collections::{BTreeMap, HashMap, HashSet}; +use std::fmt::{Display, Formatter}; +use std::sync::Arc; + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct Connector { + pub id: String, + pub name: String, + pub icon: String, + pub description: String, + pub table_config: String, + pub enabled: bool, + pub source: bool, + pub sink: bool, + pub custom_schemas: bool, + pub testing: bool, + pub hidden: bool, + pub connection_config: Option, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct ConnectionProfile { + pub id: String, + pub name: String, + pub connector: String, + pub config: serde_json::Value, + pub description: String, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct ConnectionProfilePost { + pub name: String, + pub connector: String, + pub config: serde_json::Value, +} + +#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)] +#[serde(rename_all = "snake_case")] +pub enum ConnectionType { + Source, + Sink, + Lookup, +} + +impl Display for ConnectionType { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + ConnectionType::Source => write!(f, "SOURCE"), + ConnectionType::Sink => write!(f, "SINK"), + ConnectionType::Lookup => write!(f, "LOOKUP"), + } + } +} + +impl TryFrom for ConnectionType { + type Error = String; + + fn try_from(value: String) -> Result { + match value.to_lowercase().as_str() { + "source" => Ok(ConnectionType::Source), + "sink" => Ok(ConnectionType::Sink), + "lookup" => Ok(ConnectionType::Lookup), + _ => Err(format!("Invalid connection type: {value}")), + } 
+ } +} + +// ─────────────────── Field Types ─────────────────── + +#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq)] +#[serde(tag = "type", rename_all = "snake_case")] +pub enum FieldType { + Int32, + Int64, + Uint32, + Uint64, + #[serde(alias = "f32")] + Float32, + #[serde(alias = "f64")] + Float64, + Decimal128(DecimalField), + Bool, + #[serde(alias = "utf8")] + String, + #[serde(alias = "binary")] + Bytes, + Timestamp(TimestampField), + Json, + Struct(StructField), + List(ListField), +} + +impl FieldType { + pub fn sql_type(&self) -> String { + match self { + FieldType::Int32 => "INTEGER".into(), + FieldType::Int64 => "BIGINT".into(), + FieldType::Uint32 => "INTEGER UNSIGNED".into(), + FieldType::Uint64 => "BIGINT UNSIGNED".into(), + FieldType::Float32 => "FLOAT".into(), + FieldType::Float64 => "DOUBLE".into(), + FieldType::Decimal128(f) => format!("DECIMAL({}, {})", f.precision, f.scale), + FieldType::Bool => "BOOLEAN".into(), + FieldType::String => "TEXT".into(), + FieldType::Bytes => "BINARY".into(), + FieldType::Timestamp(t) => format!("TIMESTAMP({})", t.unit.precision()), + FieldType::Json => "JSON".into(), + FieldType::List(item) => format!("{}[]", item.items.field_type.sql_type()), + FieldType::Struct(StructField { fields, .. 
}) => { + format!( + "STRUCT <{}>", + fields + .iter() + .map(|f| format!("{} {}", f.name, f.field_type.sql_type())) + .collect::>() + .join(", ") + ) + } + } + } +} + +#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq, Default)] +#[serde(rename_all = "snake_case")] +pub enum TimestampUnit { + #[serde(alias = "s")] + Second, + #[default] + #[serde(alias = "ms")] + Millisecond, + #[serde(alias = "µs", alias = "us")] + Microsecond, + #[serde(alias = "ns")] + Nanosecond, +} + +impl TimestampUnit { + pub fn precision(&self) -> u8 { + match self { + TimestampUnit::Second => 0, + TimestampUnit::Millisecond => 3, + TimestampUnit::Microsecond => 6, + TimestampUnit::Nanosecond => 9, + } + } +} + +#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq)] +#[serde(rename_all = "snake_case")] +pub struct TimestampField { + #[serde(default)] + pub unit: TimestampUnit, +} + +#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq)] +#[serde(rename_all = "snake_case")] +pub struct DecimalField { + pub precision: u8, + pub scale: i8, +} + +#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq)] +#[serde(rename_all = "snake_case")] +pub struct StructField { + pub fields: Vec, +} + +#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq)] +#[serde(rename_all = "snake_case")] +pub struct ListField { + pub items: Box, +} + +fn default_item_name() -> String { + "item".to_string() +} + +#[derive(Deserialize, Clone, Debug, PartialEq, Eq)] +#[serde(rename_all = "snake_case")] +pub struct ListFieldItem { + #[serde(default = "default_item_name")] + pub name: String, + #[serde(flatten)] + pub field_type: FieldType, + #[serde(default)] + pub required: bool, + #[serde(default)] + pub sql_name: Option, +} + +impl From for Field { + fn from(value: ListFieldItem) -> Self { + SourceField { + name: value.name, + field_type: value.field_type, + required: value.required, + sql_name: None, + metadata_key: None, + } + .into() + } +} + +impl Serialize for ListFieldItem 
{ + fn serialize(&self, s: S) -> Result + where + S: Serializer, + { + let mut f = Serializer::serialize_map(s, None)?; + f.serialize_entry("name", &self.name)?; + serialize_field_type_flat(&self.field_type, &mut f)?; + f.serialize_entry("required", &self.required)?; + f.serialize_entry("sql_name", &self.field_type.sql_type())?; + f.end() + } +} + +impl TryFrom for ListFieldItem { + type Error = String; + + fn try_from(value: Field) -> Result { + let source_field: SourceField = value.try_into()?; + Ok(Self { + name: source_field.name, + field_type: source_field.field_type, + required: source_field.required, + sql_name: None, + }) + } +} + +fn serialize_field_type_flat(ft: &FieldType, map: &mut M) -> Result<(), M::Error> { + let type_tag = match ft { + FieldType::Int32 => "int32", + FieldType::Int64 => "int64", + FieldType::Uint32 => "uint32", + FieldType::Uint64 => "uint64", + FieldType::Float32 => "float32", + FieldType::Float64 => "float64", + FieldType::Decimal128(_) => "decimal128", + FieldType::Bool => "bool", + FieldType::String => "string", + FieldType::Bytes => "bytes", + FieldType::Timestamp(_) => "timestamp", + FieldType::Json => "json", + FieldType::Struct(_) => "struct", + FieldType::List(_) => "list", + }; + map.serialize_entry("type", type_tag)?; + + match ft { + FieldType::Decimal128(d) => { + map.serialize_entry("precision", &d.precision)?; + map.serialize_entry("scale", &d.scale)?; + } + FieldType::Timestamp(t) => { + map.serialize_entry("unit", &t.unit)?; + } + FieldType::Struct(s) => { + map.serialize_entry("fields", &s.fields)?; + } + FieldType::List(l) => { + map.serialize_entry("items", &l.items)?; + } + _ => {} + } + Ok(()) +} + +// ─────────────────── Source Field ─────────────────── + +#[derive(Deserialize, Clone, Debug, PartialEq, Eq)] +#[serde(rename_all = "snake_case")] +pub struct SourceField { + pub name: String, + #[serde(flatten)] + pub field_type: FieldType, + #[serde(default)] + pub required: bool, + #[serde(default)] + pub 
sql_name: Option, + #[serde(default)] + pub metadata_key: Option, +} + +impl Serialize for SourceField { + fn serialize(&self, s: S) -> Result + where + S: Serializer, + { + let mut f = Serializer::serialize_map(s, None)?; + f.serialize_entry("name", &self.name)?; + serialize_field_type_flat(&self.field_type, &mut f)?; + f.serialize_entry("required", &self.required)?; + if let Some(metadata_key) = &self.metadata_key { + f.serialize_entry("metadata_key", metadata_key)?; + } + f.serialize_entry("sql_name", &self.field_type.sql_type())?; + f.end() + } +} + +impl From for Field { + fn from(f: SourceField) -> Self { + let (t, ext) = match f.field_type { + FieldType::Int32 => (DataType::Int32, None), + FieldType::Int64 => (DataType::Int64, None), + FieldType::Uint32 => (DataType::UInt32, None), + FieldType::Uint64 => (DataType::UInt64, None), + FieldType::Float32 => (DataType::Float32, None), + FieldType::Float64 => (DataType::Float64, None), + FieldType::Bool => (DataType::Boolean, None), + FieldType::String => (DataType::Utf8, None), + FieldType::Bytes => (DataType::Binary, None), + FieldType::Decimal128(d) => (DataType::Decimal128(d.precision, d.scale), None), + FieldType::Timestamp(TimestampField { + unit: TimestampUnit::Second, + }) => (DataType::Timestamp(TimeUnit::Second, None), None), + FieldType::Timestamp(TimestampField { + unit: TimestampUnit::Millisecond, + }) => (DataType::Timestamp(TimeUnit::Millisecond, None), None), + FieldType::Timestamp(TimestampField { + unit: TimestampUnit::Microsecond, + }) => (DataType::Timestamp(TimeUnit::Microsecond, None), None), + FieldType::Timestamp(TimestampField { + unit: TimestampUnit::Nanosecond, + }) => (DataType::Timestamp(TimeUnit::Nanosecond, None), None), + FieldType::Json => (DataType::Utf8, Some(FsExtensionType::JSON)), + FieldType::Struct(s) => ( + DataType::Struct(Fields::from( + s.fields + .into_iter() + .map(|t| t.into()) + .collect::>(), + )), + None, + ), + FieldType::List(t) => 
(DataType::List(Arc::new((*t.items).into())), None), + }; + + FsExtensionType::add_metadata(ext, Field::new(f.name, t, !f.required)) + } +} + +impl TryFrom for SourceField { + type Error = String; + + fn try_from(f: Field) -> Result { + let field_type = match (f.data_type(), FsExtensionType::from_map(f.metadata())) { + (DataType::Boolean, None) => FieldType::Bool, + (DataType::Int32, None) => FieldType::Int32, + (DataType::Int64, None) => FieldType::Int64, + (DataType::UInt32, None) => FieldType::Uint32, + (DataType::UInt64, None) => FieldType::Uint64, + (DataType::Float32, None) => FieldType::Float32, + (DataType::Float64, None) => FieldType::Float64, + (DataType::Decimal128(p, s), None) => FieldType::Decimal128(DecimalField { + precision: *p, + scale: *s, + }), + (DataType::Binary | DataType::LargeBinary | DataType::BinaryView, None) => FieldType::Bytes, + (DataType::Timestamp(TimeUnit::Second, _), None) => { + FieldType::Timestamp(TimestampField { + unit: TimestampUnit::Second, + }) + } + (DataType::Timestamp(TimeUnit::Millisecond, _), None) => { + FieldType::Timestamp(TimestampField { + unit: TimestampUnit::Millisecond, + }) + } + (DataType::Timestamp(TimeUnit::Microsecond, _), None) => { + FieldType::Timestamp(TimestampField { + unit: TimestampUnit::Microsecond, + }) + } + (DataType::Timestamp(TimeUnit::Nanosecond, _), None) => { + FieldType::Timestamp(TimestampField { + unit: TimestampUnit::Nanosecond, + }) + } + (DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View, None) => FieldType::String, + (DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View, Some(FsExtensionType::JSON)) => FieldType::Json, + (DataType::Struct(fields), None) => { + let fields: Result<_, String> = fields + .into_iter() + .map(|f| (**f).clone().try_into()) + .collect(); + FieldType::Struct(StructField { fields: fields? 
}) + } + (DataType::List(item), None) => FieldType::List(ListField { + items: Box::new((**item).clone().try_into()?), + }), + dt => return Err(format!("Unsupported data type {dt:?}")), + }; + + Ok(SourceField { + name: f.name().clone(), + field_type, + required: !f.is_nullable(), + sql_name: None, + metadata_key: None, + }) + } +} + +// ─────────────────── Schema Definitions ─────────────────── + +#[derive(Serialize, Deserialize, Clone, Debug, PartialEq)] +#[serde(rename_all = "snake_case", tag = "type")] +pub enum SchemaDefinition { + JsonSchema { + schema: String, + }, + ProtobufSchema { + schema: String, + #[serde(default)] + dependencies: HashMap, + }, + AvroSchema { + schema: String, + }, +} + +#[derive(Serialize, Deserialize, Clone, Debug, PartialEq)] +#[serde(rename_all = "snake_case")] +pub struct ConnectionSchema { + pub format: Option, + #[serde(default)] + pub bad_data: Option, + #[serde(default)] + pub framing: Option, + #[serde(default)] + pub fields: Vec, + #[serde(default)] + pub definition: Option, + #[serde(default)] + pub inferred: Option, + #[serde(default)] + pub primary_keys: HashSet, +} + +impl ConnectionSchema { + pub fn try_new( + format: Option, + bad_data: Option, + framing: Option, + fields: Vec, + definition: Option, + inferred: Option, + primary_keys: HashSet, + ) -> anyhow::Result { + let s = ConnectionSchema { + format, + bad_data, + framing, + fields, + definition, + inferred, + primary_keys, + }; + s.validate() + } + + pub fn validate(self) -> anyhow::Result { + let non_metadata_fields: Vec<_> = self + .fields + .iter() + .filter(|f| f.metadata_key.is_none()) + .collect(); + + if let Some(Format::RawString(_)) = &self.format { + if non_metadata_fields.len() != 1 + || non_metadata_fields.first().unwrap().field_type != FieldType::String + || non_metadata_fields.first().unwrap().name != "value" + { + anyhow::bail!( + "raw_string format requires a schema with a single field called `value` of type TEXT" + ); + } + } + + if let 
Some(Format::Json(json_format)) = &self.format { + if json_format.unstructured + && (non_metadata_fields.len() != 1 + || non_metadata_fields.first().unwrap().field_type != FieldType::Json + || non_metadata_fields.first().unwrap().name != "value") + { + anyhow::bail!( + "json format with unstructured flag enabled requires a schema with a single field called `value` of type JSON" + ); + } + } + + Ok(self) + } + + pub fn fs_schema(&self) -> Arc { + let fields: Vec = self.fields.iter().map(|f| f.clone().into()).collect(); + Arc::new(FsSchema::from_fields(fields)) + } +} + +impl From for FsSchema { + fn from(val: ConnectionSchema) -> Self { + let fields: Vec = val.fields.into_iter().map(|f| f.into()).collect(); + FsSchema::from_fields(fields) + } +} + +// ─────────────────── Connection Table ─────────────────── + +#[derive(Serialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct ConnectionTable { + #[serde(skip_serializing)] + pub id: i64, + #[serde(rename = "id")] + pub pub_id: String, + pub name: String, + pub created_at: u64, + pub connector: String, + pub connection_profile: Option, + pub table_type: ConnectionType, + pub config: serde_json::Value, + pub schema: ConnectionSchema, + pub consumers: u32, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct ConnectionTablePost { + pub name: String, + pub connector: String, + pub connection_profile_id: Option, + pub config: serde_json::Value, + pub schema: Option, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct ConnectionAutocompleteResp { + pub values: BTreeMap>, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct TestSourceMessage { + pub error: bool, + pub done: bool, + pub message: String, +} + +impl TestSourceMessage { + pub fn info(message: impl Into) -> Self { + Self { + error: false, + done: false, + message: message.into(), + } + } + pub fn 
error(message: impl Into) -> Self { + Self { + error: true, + done: false, + message: message.into(), + } + } + pub fn done(message: impl Into) -> Self { + Self { + error: false, + done: true, + message: message.into(), + } + } + pub fn fail(message: impl Into) -> Self { + Self { + error: true, + done: true, + message: message.into(), + } + } +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct ConfluentSchema { + pub schema: String, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct ConfluentSchemaQueryParams { + pub endpoint: String, + pub topic: String, +} diff --git a/src/sql/api/metrics.rs b/src/sql/api/metrics.rs new file mode 100644 index 00000000..671b52f6 --- /dev/null +++ b/src/sql/api/metrics.rs @@ -0,0 +1,53 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use serde::{Deserialize, Serialize}; + +#[derive(Serialize, Deserialize, Copy, Clone, Debug, Hash, PartialEq, Eq)] +#[serde(rename_all = "snake_case")] +pub enum MetricName { + BytesRecv, + BytesSent, + MessagesRecv, + MessagesSent, + Backpressure, + TxQueueSize, + TxQueueRem, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct Metric { + pub time: u64, + pub value: f64, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct SubtaskMetrics { + pub index: u32, + pub metrics: Vec, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct MetricGroup { + pub name: MetricName, + pub subtasks: Vec, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct OperatorMetricGroup { + pub node_id: u32, + pub metric_groups: Vec, +} diff --git a/src/sql/api/mod.rs b/src/sql/api/mod.rs new file mode 100644 index 00000000..cdc119b7 --- /dev/null +++ b/src/sql/api/mod.rs @@ -0,0 +1,48 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! REST/RPC API types for the FunctionStream system. +//! +//! Adapted from Arroyo's `arroyo-rpc/src/api_types` and utility modules. 
+ +pub mod checkpoints; +pub mod connections; +pub mod metrics; +pub mod pipelines; +pub mod public_ids; +pub mod schema_resolver; +pub mod udfs; +pub mod var_str; + +use serde::{Deserialize, Serialize}; + +pub use connections::ConnectionProfile; + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "camelCase")] +pub struct PaginatedCollection { + pub data: Vec, + pub has_more: bool, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "camelCase")] +pub struct NonPaginatedCollection { + pub data: Vec, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct PaginationQueryParams { + pub starting_after: Option, + pub limit: Option, +} diff --git a/src/sql/api/pipelines.rs b/src/sql/api/pipelines.rs new file mode 100644 index 00000000..d6cc5253 --- /dev/null +++ b/src/sql/api/pipelines.rs @@ -0,0 +1,168 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use super::udfs::Udf; +use crate::sql::common::control::ErrorDomain; +use serde::{Deserialize, Serialize}; + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct ValidateQueryPost { + pub query: String, + pub udfs: Option>, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct QueryValidationResult { + pub graph: Option, + pub errors: Vec, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct PipelinePost { + pub name: String, + pub query: String, + pub udfs: Option>, + pub parallelism: u64, + pub checkpoint_interval_micros: Option, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct PreviewPost { + pub query: String, + pub udfs: Option>, + #[serde(default)] + pub enable_sinks: bool, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct PipelinePatch { + pub parallelism: Option, + pub checkpoint_interval_micros: Option, + pub stop: Option, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct PipelineRestart { + pub force: Option, + pub ignore_state: Option, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct Pipeline { + pub id: String, + pub name: String, + pub query: String, + pub udfs: Vec, + pub checkpoint_interval_micros: u64, + pub stop: StopType, + pub created_at: u64, + pub action: Option, + pub action_text: String, + pub action_in_progress: bool, + pub graph: PipelineGraph, + pub preview: bool, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct PipelineGraph { + pub nodes: Vec, + pub edges: Vec, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct PipelineNode { + pub node_id: u32, + pub operator: String, + pub 
description: String, + pub parallelism: u32, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct PipelineEdge { + pub src_id: u32, + pub dest_id: u32, + pub key_type: String, + pub value_type: String, + pub edge_type: String, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub enum StopType { + None, + Checkpoint, + Graceful, + Immediate, + Force, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct FailureReason { + pub error: String, + pub domain: ErrorDomain, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct Job { + pub id: String, + pub running_desired: bool, + pub state: String, + pub run_id: u64, + pub start_time: Option, + pub finish_time: Option, + pub tasks: Option, + pub failure_reason: Option, + pub created_at: u64, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub enum JobLogLevel { + Info, + Warn, + Error, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct JobLogMessage { + pub id: String, + pub created_at: u64, + pub operator_id: Option, + pub task_index: Option, + pub level: JobLogLevel, + pub message: String, + pub details: String, + pub error_domain: Option, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct OutputData { + pub operator_id: String, + pub subtask_idx: u32, + pub timestamps: Vec, + pub start_id: u64, + pub batch: String, +} diff --git a/src/sql/api/public_ids.rs b/src/sql/api/public_ids.rs new file mode 100644 index 00000000..33aa6427 --- /dev/null +++ b/src/sql/api/public_ids.rs @@ -0,0 +1,69 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::time::{SystemTime, UNIX_EPOCH}; + +const ID_LENGTH: usize = 10; + +const ALPHABET: &[u8; 62] = b"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; + +pub enum IdTypes { + ApiKey, + ConnectionProfile, + Schema, + Pipeline, + JobConfig, + Checkpoint, + JobStatus, + ClusterInfo, + JobLogMessage, + ConnectionTable, + ConnectionTablePipeline, + Udf, +} + +/// Generates a unique identifier with a type-specific prefix. +/// +/// Uses a simple time + random approach instead of nanoid to avoid an extra dependency. +pub fn generate_id(id_type: IdTypes) -> String { + let prefix = match id_type { + IdTypes::ApiKey => "ak", + IdTypes::ConnectionProfile => "cp", + IdTypes::Schema => "sch", + IdTypes::Pipeline => "pl", + IdTypes::JobConfig => "job", + IdTypes::Checkpoint => "chk", + IdTypes::JobStatus => "js", + IdTypes::ClusterInfo => "ci", + IdTypes::JobLogMessage => "jlm", + IdTypes::ConnectionTable => "ct", + IdTypes::ConnectionTablePipeline => "ctp", + IdTypes::Udf => "udf", + }; + + let nanos = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_nanos(); + + let mut id = String::with_capacity(ID_LENGTH); + let mut seed = nanos; + for _ in 0..ID_LENGTH { + seed ^= seed + .wrapping_mul(6364136223846793005) + .wrapping_add(1442695040888963407); + let idx = (seed % ALPHABET.len() as u128) as usize; + id.push(ALPHABET[idx] as char); + } + + format!("{prefix}_{id}") +} diff --git a/src/sql/api/schema_resolver.rs b/src/sql/api/schema_resolver.rs new file mode 100644 index 00000000..57d3d702 --- /dev/null +++ 
b/src/sql/api/schema_resolver.rs @@ -0,0 +1,94 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use async_trait::async_trait; + +/// Trait for resolving schemas by ID (e.g., from a schema registry). +#[async_trait] +pub trait SchemaResolver: Send { + async fn resolve_schema(&self, id: u32) -> Result, String>; +} + +/// A resolver that always fails — used when no schema registry is configured. +pub struct FailingSchemaResolver; + +impl Default for FailingSchemaResolver { + fn default() -> Self { + Self + } +} + +#[async_trait] +impl SchemaResolver for FailingSchemaResolver { + async fn resolve_schema(&self, id: u32) -> Result, String> { + Err(format!( + "Schema with id {id} not available, and no schema registry configured" + )) + } +} + +/// A resolver that returns a fixed schema for a known ID. +pub struct FixedSchemaResolver { + id: u32, + schema: String, +} + +impl FixedSchemaResolver { + pub fn new(id: u32, schema: String) -> Self { + FixedSchemaResolver { id, schema } + } +} + +#[async_trait] +impl SchemaResolver for FixedSchemaResolver { + async fn resolve_schema(&self, id: u32) -> Result, String> { + if id == self.id { + Ok(Some(self.schema.clone())) + } else { + Err(format!("Unexpected schema id {}, expected {}", id, self.id)) + } + } +} + +/// A caching wrapper around any `SchemaResolver`. 
+pub struct CachingSchemaResolver { + inner: R, + cache: tokio::sync::RwLock>, +} + +impl CachingSchemaResolver { + pub fn new(inner: R) -> Self { + Self { + inner, + cache: tokio::sync::RwLock::new(std::collections::HashMap::new()), + } + } +} + +#[async_trait] +impl SchemaResolver for CachingSchemaResolver { + async fn resolve_schema(&self, id: u32) -> Result, String> { + { + let cache = self.cache.read().await; + if let Some(schema) = cache.get(&id) { + return Ok(Some(schema.clone())); + } + } + + let result = self.inner.resolve_schema(id).await?; + if let Some(ref schema) = result { + let mut cache = self.cache.write().await; + cache.insert(id, schema.clone()); + } + Ok(result) + } +} diff --git a/src/sql/api/udfs.rs b/src/sql/api/udfs.rs new file mode 100644 index 00000000..781d5b07 --- /dev/null +++ b/src/sql/api/udfs.rs @@ -0,0 +1,68 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use serde::{Deserialize, Serialize}; + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct Udf { + pub definition: String, + #[serde(default)] + pub language: UdfLanguage, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct ValidateUdfPost { + pub definition: String, + #[serde(default)] + pub language: UdfLanguage, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct UdfValidationResult { + pub udf_name: Option, + pub errors: Vec, +} + +#[derive(Serialize, Deserialize, Copy, Clone, Debug, Default, Eq, PartialEq)] +#[serde(rename_all = "snake_case")] +pub enum UdfLanguage { + Python, + #[default] + Rust, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct UdfPost { + pub prefix: String, + #[serde(default)] + pub language: UdfLanguage, + pub definition: String, + pub description: Option, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct GlobalUdf { + pub id: String, + pub prefix: String, + pub name: String, + pub language: UdfLanguage, + pub created_at: u64, + pub updated_at: u64, + pub definition: String, + pub description: Option, + pub dylib_url: Option, +} diff --git a/src/sql/api/var_str.rs b/src/sql/api/var_str.rs new file mode 100644 index 00000000..2638cd06 --- /dev/null +++ b/src/sql/api/var_str.rs @@ -0,0 +1,91 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +use serde::{Deserialize, Serialize}; +use std::env; + +/// A string that may contain `{{ VAR }}` placeholders for environment variable substitution. +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +#[serde(transparent)] +pub struct VarStr { + raw_val: String, +} + +impl VarStr { + pub fn new(raw_val: String) -> Self { + VarStr { raw_val } + } + + pub fn raw(&self) -> &str { + &self.raw_val + } + + /// Substitute `{{ VAR_NAME }}` patterns with the corresponding environment variable values. + pub fn sub_env_vars(&self) -> anyhow::Result { + let mut result = self.raw_val.clone(); + let mut start = 0; + + while let Some(open) = result[start..].find("{{") { + let open_abs = start + open; + let Some(close) = result[open_abs..].find("}}") else { + break; + }; + let close_abs = open_abs + close; + + let var_name = result[open_abs + 2..close_abs].trim(); + if var_name.is_empty() { + start = close_abs + 2; + continue; + } + + match env::var(var_name) { + Ok(value) => { + let full_match = &result[open_abs..close_abs + 2]; + let full_match_owned = full_match.to_string(); + result = result.replacen(&full_match_owned, &value, 1); + start = open_abs + value.len(); + } + Err(_) => { + anyhow::bail!("Environment variable {} not found", var_name); + } + } + } + + Ok(result) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_no_placeholders() { + let input = "This is a test string with no placeholders"; + assert_eq!( + VarStr::new(input.to_string()).sub_env_vars().unwrap(), + input + ); + } + + #[test] + fn test_with_placeholders() { + unsafe { env::set_var("FS_TEST_VAR", "environment variable") }; + let input = "This is a {{ FS_TEST_VAR }}"; + let expected = "This is a environment variable"; + assert_eq!( + VarStr::new(input.to_string()).sub_env_vars().unwrap(), + expected + ); + unsafe { env::remove_var("FS_TEST_VAR") }; + } +} diff --git 
a/src/sql/common/arrow_ext.rs b/src/sql/common/arrow_ext.rs new file mode 100644 index 00000000..782f4358 --- /dev/null +++ b/src/sql/common/arrow_ext.rs @@ -0,0 +1,181 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::HashMap; +use std::fmt::{Display, Formatter}; +use std::time::SystemTime; + +use datafusion::arrow::datatypes::{DataType, Field, TimeUnit}; + +pub struct DisplayAsSql<'a>(pub &'a DataType); + +impl Display for DisplayAsSql<'_> { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self.0 { + DataType::Boolean => write!(f, "BOOLEAN"), + DataType::Int8 | DataType::Int16 | DataType::Int32 => write!(f, "INT"), + DataType::Int64 => write!(f, "BIGINT"), + DataType::UInt8 | DataType::UInt16 | DataType::UInt32 => write!(f, "INT UNSIGNED"), + DataType::UInt64 => write!(f, "BIGINT UNSIGNED"), + DataType::Float16 | DataType::Float32 => write!(f, "FLOAT"), + DataType::Float64 => write!(f, "DOUBLE"), + DataType::Timestamp(_, _) => write!(f, "TIMESTAMP"), + DataType::Date32 => write!(f, "DATE"), + DataType::Date64 => write!(f, "DATETIME"), + DataType::Time32(_) => write!(f, "TIME"), + DataType::Time64(_) => write!(f, "TIME"), + DataType::Duration(_) => write!(f, "INTERVAL"), + DataType::Interval(_) => write!(f, "INTERVAL"), + DataType::Binary | DataType::FixedSizeBinary(_) | DataType::LargeBinary => { + write!(f, "BYTEA") + } + DataType::Utf8 | DataType::LargeUtf8 => write!(f, "TEXT"), + DataType::List(inner) => { + 
write!(f, "{}[]", DisplayAsSql(inner.data_type())) + } + dt => write!(f, "{dt}"), + } + } +} + +/// Arrow extension type markers for FunctionStream-specific semantics. +#[derive(Copy, Clone, Eq, PartialEq, Debug)] +pub enum FsExtensionType { + JSON, +} + +impl FsExtensionType { + pub fn from_map(map: &HashMap) -> Option { + match map.get("ARROW:extension:name")?.as_str() { + "functionstream.json" => Some(Self::JSON), + _ => None, + } + } + + pub fn add_metadata(v: Option, field: Field) -> Field { + if let Some(v) = v { + let mut m = HashMap::new(); + match v { + FsExtensionType::JSON => { + m.insert( + "ARROW:extension:name".to_string(), + "functionstream.json".to_string(), + ); + } + } + field.with_metadata(m) + } else { + field + } + } +} + +pub trait GetArrowType { + fn arrow_type() -> DataType; +} + +pub trait GetArrowSchema { + fn arrow_schema() -> datafusion::arrow::datatypes::Schema; +} + +impl GetArrowType for T +where + T: GetArrowSchema, +{ + fn arrow_type() -> DataType { + DataType::Struct(Self::arrow_schema().fields.clone()) + } +} + +impl GetArrowType for bool { + fn arrow_type() -> DataType { + DataType::Boolean + } +} + +impl GetArrowType for i8 { + fn arrow_type() -> DataType { + DataType::Int8 + } +} + +impl GetArrowType for i16 { + fn arrow_type() -> DataType { + DataType::Int16 + } +} + +impl GetArrowType for i32 { + fn arrow_type() -> DataType { + DataType::Int32 + } +} + +impl GetArrowType for i64 { + fn arrow_type() -> DataType { + DataType::Int64 + } +} + +impl GetArrowType for u8 { + fn arrow_type() -> DataType { + DataType::UInt8 + } +} + +impl GetArrowType for u16 { + fn arrow_type() -> DataType { + DataType::UInt16 + } +} + +impl GetArrowType for u32 { + fn arrow_type() -> DataType { + DataType::UInt32 + } +} + +impl GetArrowType for u64 { + fn arrow_type() -> DataType { + DataType::UInt64 + } +} + +impl GetArrowType for f32 { + fn arrow_type() -> DataType { + DataType::Float32 + } +} + +impl GetArrowType for f64 { + fn arrow_type() -> 
DataType { + DataType::Float64 + } +} + +impl GetArrowType for String { + fn arrow_type() -> DataType { + DataType::Utf8 + } +} + +impl GetArrowType for Vec { + fn arrow_type() -> DataType { + DataType::Binary + } +} + +impl GetArrowType for SystemTime { + fn arrow_type() -> DataType { + DataType::Timestamp(TimeUnit::Nanosecond, None) + } +} diff --git a/src/sql/common/connector_options.rs b/src/sql/common/connector_options.rs new file mode 100644 index 00000000..6f82782e --- /dev/null +++ b/src/sql/common/connector_options.rs @@ -0,0 +1,434 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::{BTreeMap, HashMap}; +use std::num::{NonZero, NonZeroU64}; +use std::str::FromStr; +use std::time::Duration; + +use datafusion::common::{Result as DFResult, plan_datafusion_err}; +use datafusion::error::DataFusionError; +use datafusion::sql::sqlparser::ast::{Expr, Ident, SqlOption, Value as SqlValue, ValueWithSpan}; +use tracing::warn; + +use super::constants::{interval_duration_unit, with_opt_bool_str}; + +pub trait FromOpts: Sized { + fn from_opts(opts: &mut ConnectorOptions) -> DFResult; +} + +pub struct ConnectorOptions { + options: HashMap, + partitions: Vec, +} + +fn sql_expr_to_catalog_string(e: &Expr) -> String { + match e { + Expr::Value(ValueWithSpan { value, .. 
}) => match value { + SqlValue::SingleQuotedString(s) | SqlValue::DoubleQuotedString(s) => s.clone(), + SqlValue::NationalStringLiteral(s) => s.clone(), + SqlValue::HexStringLiteral(s) => s.clone(), + SqlValue::Number(n, _) => n.clone(), + SqlValue::Boolean(b) => b.to_string(), + SqlValue::Null => "NULL".to_string(), + other => other.to_string(), + }, + Expr::Identifier(ident) => ident.value.clone(), + other => other.to_string(), + } +} + +impl ConnectorOptions { + /// Build options from persisted catalog string maps (same semantics as SQL `WITH` literals). + pub fn from_flat_string_map(map: HashMap) -> DFResult { + let mut options = HashMap::with_capacity(map.len()); + for (k, v) in map { + options.insert( + k, + Expr::Value(SqlValue::SingleQuotedString(v).with_empty_span()), + ); + } + Ok(Self { + options, + partitions: Vec::new(), + }) + } + + pub fn new(sql_opts: &[SqlOption], partition_by: &Option>) -> DFResult { + let mut options = HashMap::new(); + + for option in sql_opts { + let SqlOption::KeyValue { key, value } = option else { + return Err(plan_datafusion_err!( + "invalid with option: '{}'; expected an `=` delimited key-value pair", + option + )); + }; + + options.insert(key.value.clone(), value.clone()); + } + + Ok(Self { + options, + partitions: partition_by.clone().unwrap_or_default(), + }) + } + + pub fn partitions(&self) -> &[Expr] { + &self.partitions + } + + pub fn pull_struct(&mut self) -> DFResult { + T::from_opts(self) + } + + pub fn pull_opt_str(&mut self, name: &str) -> DFResult> { + match self.options.remove(name) { + Some(Expr::Value(ValueWithSpan { + value: SqlValue::SingleQuotedString(s), + span: _, + })) => Ok(Some(s)), + Some(e) => Err(plan_datafusion_err!( + "expected with option '{}' to be a single-quoted string, but it was `{:?}`", + name, + e + )), + None => Ok(None), + } + } + + pub fn pull_str(&mut self, name: &str) -> DFResult { + self.pull_opt_str(name)? 
+ .ok_or_else(|| plan_datafusion_err!("required option '{}' not set", name)) + } + + pub fn pull_opt_bool(&mut self, name: &str) -> DFResult> { + match self.options.remove(name) { + Some(Expr::Value(ValueWithSpan { + value: SqlValue::Boolean(b), + span: _, + })) => Ok(Some(b)), + Some(Expr::Value(ValueWithSpan { + value: SqlValue::SingleQuotedString(s), + span: _, + })) => match s.as_str() { + with_opt_bool_str::TRUE | with_opt_bool_str::YES => Ok(Some(true)), + with_opt_bool_str::FALSE | with_opt_bool_str::NO => Ok(Some(false)), + _ => Err(plan_datafusion_err!( + "expected with option '{}' to be a boolean, but it was `'{}'`", + name, + s + )), + }, + Some(e) => Err(plan_datafusion_err!( + "expected with option '{}' to be a boolean, but it was `{:?}`", + name, + e + )), + None => Ok(None), + } + } + + pub fn pull_opt_u64(&mut self, name: &str) -> DFResult> { + match self.options.remove(name) { + Some(Expr::Value(ValueWithSpan { + value: SqlValue::Number(s, _), + span: _, + })) + | Some(Expr::Value(ValueWithSpan { + value: SqlValue::SingleQuotedString(s), + span: _, + })) => s.parse::().map(Some).map_err(|_| { + plan_datafusion_err!( + "expected with option '{}' to be an unsigned integer, but it was `{}`", + name, + s + ) + }), + Some(e) => Err(plan_datafusion_err!( + "expected with option '{}' to be an unsigned integer, but it was `{:?}`", + name, + e + )), + None => Ok(None), + } + } + + pub fn pull_opt_nonzero_u64(&mut self, name: &str) -> DFResult>> { + match self.pull_opt_u64(name)? { + Some(0) => Err(plan_datafusion_err!( + "expected with option '{name}' to be greater than 0, but it was 0" + )), + Some(i) => Ok(Some(NonZeroU64::new(i).unwrap())), + None => Ok(None), + } + } + + pub fn pull_opt_data_size_bytes(&mut self, name: &str) -> DFResult> { + self.pull_opt_str(name)? 
+ .map(|s| { + s.parse::().map_err(|_| { + plan_datafusion_err!( + "expected with option '{}' to be a size in bytes (unsigned integer), but it was `{}`", + name, + s + ) + }) + }) + .transpose() + } + + pub fn pull_opt_i64(&mut self, name: &str) -> DFResult> { + match self.options.remove(name) { + Some(Expr::Value(ValueWithSpan { + value: SqlValue::Number(s, _), + span: _, + })) + | Some(Expr::Value(ValueWithSpan { + value: SqlValue::SingleQuotedString(s), + span: _, + })) => s.parse::().map(Some).map_err(|_| { + plan_datafusion_err!( + "expected with option '{}' to be an integer, but it was `{}`", + name, + s + ) + }), + Some(e) => Err(plan_datafusion_err!( + "expected with option '{}' to be an integer, but it was `{:?}`", + name, + e + )), + None => Ok(None), + } + } + + pub fn pull_i64(&mut self, name: &str) -> DFResult { + self.pull_opt_i64(name)? + .ok_or_else(|| plan_datafusion_err!("required option '{}' not set", name)) + } + + pub fn pull_u64(&mut self, name: &str) -> DFResult { + self.pull_opt_u64(name)? + .ok_or_else(|| plan_datafusion_err!("required option '{}' not set", name)) + } + + pub fn pull_opt_f64(&mut self, name: &str) -> DFResult> { + match self.options.remove(name) { + Some(Expr::Value(ValueWithSpan { + value: SqlValue::Number(s, _), + span: _, + })) + | Some(Expr::Value(ValueWithSpan { + value: SqlValue::SingleQuotedString(s), + span: _, + })) => s.parse::().map(Some).map_err(|_| { + plan_datafusion_err!( + "expected with option '{}' to be a double, but it was `{}`", + name, + s + ) + }), + Some(e) => Err(plan_datafusion_err!( + "expected with option '{}' to be a double, but it was `{:?}`", + name, + e + )), + None => Ok(None), + } + } + + pub fn pull_f64(&mut self, name: &str) -> DFResult { + self.pull_opt_f64(name)? + .ok_or_else(|| plan_datafusion_err!("required option '{}' not set", name)) + } + + pub fn pull_bool(&mut self, name: &str) -> DFResult { + self.pull_opt_bool(name)? 
+ .ok_or_else(|| plan_datafusion_err!("required option '{}' not set", name)) + } + + pub fn pull_opt_duration(&mut self, name: &str) -> DFResult> { + match self.options.remove(name) { + Some(e) => Ok(Some(duration_from_sql_expr(&e).map_err(|e| { + plan_datafusion_err!("in with clause '{name}': {}", e) + })?)), + None => Ok(None), + } + } + + pub fn pull_opt_field(&mut self, name: &str) -> DFResult> { + match self.options.remove(name) { + Some(Expr::Value(ValueWithSpan { + value: SqlValue::SingleQuotedString(s), + span: _, + })) => { + warn!( + "Referred to a field in `{name}` with a string—this is deprecated and will be unsupported after Arroyo 0.14" + ); + Ok(Some(s)) + } + Some(Expr::Identifier(Ident { value, .. })) => Ok(Some(value)), + Some(e) => Err(plan_datafusion_err!( + "expected with option '{}' to be a field, but it was `{:?}`", + name, + e + )), + None => Ok(None), + } + } + + pub fn pull_opt_array(&mut self, name: &str) -> Option> { + Some(match self.options.remove(name)? { + Expr::Value(ValueWithSpan { + value: SqlValue::SingleQuotedString(s), + span, + }) => s + .split(',') + .map(|p| { + Expr::Value(ValueWithSpan { + value: SqlValue::SingleQuotedString(p.to_string()), + span: span.clone(), + }) + }) + .collect(), + Expr::Array(a) => a.elem, + e => vec![e], + }) + } + + pub fn pull_opt_parsed(&mut self, name: &str) -> DFResult> { + Ok(match self.pull_opt_str(name)? 
{ + Some(s) => Some( + s.parse() + .map_err(|_| plan_datafusion_err!("invalid value '{s}' for {name}"))?, + ), + None => None, + }) + } + + pub fn keys(&self) -> impl Iterator { + self.options.keys() + } + + pub fn keys_with_prefix<'a, 'b>( + &'a self, + prefix: &'b str, + ) -> impl Iterator + 'b + where + 'a: 'b, + { + self.options.keys().filter(move |k| k.starts_with(prefix)) + } + + pub fn insert_str( + &mut self, + name: impl Into, + value: impl Into, + ) -> DFResult> { + let name = name.into(); + let value = value.into(); + let existing = self.pull_opt_str(&name)?; + self.options.insert( + name, + Expr::Value(SqlValue::SingleQuotedString(value).with_empty_span()), + ); + Ok(existing) + } + + pub fn is_empty(&self) -> bool { + self.options.is_empty() + } + + pub fn contains_key(&self, key: &str) -> bool { + self.options.contains_key(key) + } + + /// Drain all remaining options into string values (for connector runtime config). + pub fn drain_remaining_string_values(&mut self) -> DFResult> { + let taken = std::mem::take(&mut self.options); + let mut out = HashMap::with_capacity(taken.len()); + for (k, v) in taken { + out.insert(k, format!("{v}")); + } + Ok(out) + } + + /// Snapshot of all current `WITH` key/value pairs for catalog persistence (`SHOW CREATE TABLE`). + /// Call before any `pull_*` consumes options. + pub fn snapshot_for_catalog(&self) -> BTreeMap { + self.options + .iter() + .map(|(k, v)| (k.clone(), sql_expr_to_catalog_string(v))) + .collect() + } +} + +fn duration_from_sql_expr(expr: &Expr) -> Result { + match expr { + Expr::Interval(interval) => { + let s = match interval.value.as_ref() { + Expr::Value(ValueWithSpan { + value: SqlValue::SingleQuotedString(s), + .. + }) => s.clone(), + other => { + return Err(DataFusionError::Plan(format!( + "expected interval string literal, found {other}" + ))); + } + }; + parse_interval_to_duration(&s) + } + Expr::Value(ValueWithSpan { + value: SqlValue::SingleQuotedString(s), + .. 
+ }) => parse_interval_to_duration(s), + other => Err(DataFusionError::Plan(format!( + "expected an interval expression, found {other}" + ))), + } +} + +fn parse_interval_to_duration(s: &str) -> Result { + let parts: Vec<&str> = s.trim().split_whitespace().collect(); + if parts.len() != 2 { + return Err(DataFusionError::Plan(format!( + "invalid interval string '{s}'; expected ' '" + ))); + } + let value: u64 = parts[0] + .parse() + .map_err(|_| DataFusionError::Plan(format!("invalid interval number: {}", parts[0])))?; + let unit_lc = parts[1].to_lowercase(); + let unit = unit_lc.as_str(); + let duration = match unit { + interval_duration_unit::SECOND + | interval_duration_unit::SECONDS + | interval_duration_unit::S => Duration::from_secs(value), + interval_duration_unit::MINUTE + | interval_duration_unit::MINUTES + | interval_duration_unit::MIN => Duration::from_secs(value * 60), + interval_duration_unit::HOUR + | interval_duration_unit::HOURS + | interval_duration_unit::H => Duration::from_secs(value * 3600), + interval_duration_unit::DAY + | interval_duration_unit::DAYS + | interval_duration_unit::D => Duration::from_secs(value * 86400), + unit => { + return Err(DataFusionError::Plan(format!( + "unsupported interval unit '{unit}'" + ))); + } + }; + Ok(duration) +} diff --git a/src/sql/common/constants.rs b/src/sql/common/constants.rs new file mode 100644 index 00000000..8eb697e2 --- /dev/null +++ b/src/sql/common/constants.rs @@ -0,0 +1,299 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + + + +pub mod scalar_fn { + pub const GET_FIRST_JSON_OBJECT: &str = "get_first_json_object"; + pub const EXTRACT_JSON: &str = "extract_json"; + pub const EXTRACT_JSON_STRING: &str = "extract_json_string"; + pub const SERIALIZE_JSON_UNION: &str = "serialize_json_union"; + pub const MULTI_HASH: &str = "multi_hash"; +} + + +pub mod window_fn { + pub const HOP: &str = "hop"; + pub const TUMBLE: &str = "tumble"; + pub const SESSION: &str = "session"; +} + + +pub mod planning_placeholder_udf { + pub const UNNEST: &str = "unnest"; + pub const ROW_TIME: &str = "row_time"; + pub const LIST_ELEMENT_FIELD: &str = "field"; +} + + +pub mod operator_feature { + pub const ASYNC_UDF: &str = "async-udf"; + pub const JOIN_WITH_EXPIRATION: &str = "join-with-expiration"; + pub const WINDOWED_JOIN: &str = "windowed-join"; + pub const SQL_WINDOW_FUNCTION: &str = "sql-window-function"; + pub const LOOKUP_JOIN: &str = "lookup-join"; + pub const SQL_TUMBLING_WINDOW_AGGREGATE: &str = "sql-tumbling-window-aggregate"; + pub const SQL_SLIDING_WINDOW_AGGREGATE: &str = "sql-sliding-window-aggregate"; + pub const SQL_SESSION_WINDOW_AGGREGATE: &str = "sql-session-window-aggregate"; + pub const SQL_UPDATING_AGGREGATE: &str = "sql-updating-aggregate"; + pub const KEY_BY_ROUTING: &str = "key-by-routing"; + pub const CONNECTOR_SOURCE: &str = "connector-source"; + pub const CONNECTOR_SINK: &str = "connector-sink"; +} + + +pub mod extension_node { + pub const STREAM_WINDOW_AGGREGATE: &str = "StreamWindowAggregateNode"; + pub const STREAMING_WINDOW_FUNCTION: &str = "StreamingWindowFunctionNode"; + pub const EVENT_TIME_WATERMARK: &str = "EventTimeWatermarkNode"; + pub const CONTINUOUS_AGGREGATE: &str = "ContinuousAggregateNode"; + pub const SYSTEM_TIMESTAMP_INJECTOR: &str = "SystemTimestampInjectorNode"; + pub const STREAM_INGESTION: &str = "StreamIngestionNode"; + pub const STREAM_EGRESS: &str = 
"StreamEgressNode"; + pub const STREAM_PROJECTION: &str = "StreamProjectionNode"; + pub const REMOTE_TABLE_BOUNDARY: &str = "RemoteTableBoundaryNode"; + pub const REFERENCE_TABLE_SOURCE: &str = "ReferenceTableSource"; + pub const STREAM_REFERENCE_JOIN: &str = "StreamReferenceJoin"; + pub const KEY_EXTRACTION: &str = "KeyExtractionNode"; + pub const STREAMING_JOIN: &str = "StreamingJoinNode"; + pub const ASYNC_FUNCTION_EXECUTION: &str = "AsyncFunctionExecutionNode"; + pub const UNROLL_DEBEZIUM_PAYLOAD: &str = "UnrollDebeziumPayloadNode"; + pub const PACK_DEBEZIUM_ENVELOPE: &str = "PackDebeziumEnvelopeNode"; +} + + +pub mod proto_operator_name { + pub const TUMBLING_WINDOW: &str = "TumblingWindow"; + pub const UPDATING_AGGREGATE: &str = "UpdatingAggregate"; + pub const WINDOW_FUNCTION: &str = "WindowFunction"; + pub const SLIDING_WINDOW_LABEL: &str = "sliding window"; + pub const INSTANT_WINDOW: &str = "InstantWindow"; + pub const INSTANT_WINDOW_LABEL: &str = "instant window"; +} + + +pub mod runtime_operator_kind { + pub const STREAMING_JOIN: &str = "streaming_join"; + pub const WATERMARK_GENERATOR: &str = "watermark_generator"; + pub const STREAMING_WINDOW_EVALUATOR: &str = "streaming_window_evaluator"; +} + + +pub mod factory_operator_name { + pub const CONNECTOR_SOURCE: &str = "ConnectorSource"; + pub const CONNECTOR_SINK: &str = "ConnectorSink"; + pub const KAFKA_SOURCE: &str = "KafkaSource"; + pub const KAFKA_SINK: &str = "KafkaSink"; +} + + +pub mod cdc { + pub const BEFORE: &str = "before"; + pub const AFTER: &str = "after"; + pub const OP: &str = "op"; +} + + +pub mod updating_state_field { + pub const IS_RETRACT: &str = "is_retract"; + pub const ID: &str = "id"; +} + + +pub mod sql_field { + pub const ASYNC_RESULT: &str = "__async_result"; + pub const DEFAULT_KEY_LABEL: &str = "key"; + pub const DEFAULT_PROJECTION_LABEL: &str = "projection"; + pub const COMPUTED_WATERMARK: &str = "__watermark"; + pub const TIMESTAMP_FIELD: &str = "_timestamp"; + pub const 
UPDATING_META_FIELD: &str = "_updating_meta"; +} + + +pub mod sql_planning_default { + pub const DEFAULT_PARALLELISM: usize = 4; + pub const PLANNING_TTL_SECS: u64 = 24 * 60 * 60; +} + + +pub mod with_opt_bool_str { + pub const TRUE: &str = "true"; + pub const YES: &str = "yes"; + pub const FALSE: &str = "false"; + pub const NO: &str = "no"; +} + +pub mod interval_duration_unit { + pub const SECOND: &str = "second"; + pub const SECONDS: &str = "seconds"; + pub const S: &str = "s"; + pub const MINUTE: &str = "minute"; + pub const MINUTES: &str = "minutes"; + pub const MIN: &str = "min"; + pub const HOUR: &str = "hour"; + pub const HOURS: &str = "hours"; + pub const H: &str = "h"; + pub const DAY: &str = "day"; + pub const DAYS: &str = "days"; + pub const D: &str = "d"; +} + + +pub mod connection_format_value { + pub const JSON: &str = "json"; + pub const DEBEZIUM_JSON: &str = "debezium_json"; + pub const AVRO: &str = "avro"; + pub const PARQUET: &str = "parquet"; + pub const PROTOBUF: &str = "protobuf"; + pub const RAW_STRING: &str = "raw_string"; + pub const RAW_BYTES: &str = "raw_bytes"; +} + +pub mod framing_method_value { + pub const NEWLINE: &str = "newline"; + pub const NEWLINE_DELIMITED: &str = "newline_delimited"; +} + +pub mod bad_data_value { + pub const FAIL: &str = "fail"; + pub const DROP: &str = "drop"; +} + + +pub mod timestamp_format_value { + pub const RFC3339_SNAKE: &str = "rfc3339"; + pub const RFC3339_UPPER: &str = "RFC3339"; + pub const UNIX_MILLIS_SNAKE: &str = "unix_millis"; + pub const UNIX_MILLIS_PASCAL: &str = "UnixMillis"; +} + +pub mod decimal_encoding_value { + pub const NUMBER: &str = "number"; + pub const STRING: &str = "string"; + pub const BYTES: &str = "bytes"; +} + +pub mod json_compression_value { + pub const UNCOMPRESSED: &str = "uncompressed"; + pub const GZIP: &str = "gzip"; +} + +pub mod parquet_compression_value { + pub const UNCOMPRESSED: &str = "uncompressed"; + pub const SNAPPY: &str = "snappy"; + pub const GZIP: &str = 
"gzip"; + pub const ZSTD: &str = "zstd"; + pub const LZ4: &str = "lz4"; + pub const LZ4_RAW: &str = "lz4_raw"; +} + + +pub mod date_part_keyword { + pub const YEAR: &str = "year"; + pub const MONTH: &str = "month"; + pub const WEEK: &str = "week"; + pub const DAY: &str = "day"; + pub const HOUR: &str = "hour"; + pub const MINUTE: &str = "minute"; + pub const SECOND: &str = "second"; + pub const MILLISECOND: &str = "millisecond"; + pub const MICROSECOND: &str = "microsecond"; + pub const NANOSECOND: &str = "nanosecond"; + pub const DOW: &str = "dow"; + pub const DOY: &str = "doy"; +} + +pub mod date_trunc_keyword { + pub const YEAR: &str = "year"; + pub const QUARTER: &str = "quarter"; + pub const MONTH: &str = "month"; + pub const WEEK: &str = "week"; + pub const DAY: &str = "day"; + pub const HOUR: &str = "hour"; + pub const MINUTE: &str = "minute"; + pub const SECOND: &str = "second"; +} + + +pub mod mem_exec_join_side { + pub const LEFT: &str = "left"; + pub const RIGHT: &str = "right"; +} + +pub mod physical_plan_node_name { + pub const RW_LOCK_READER: &str = "rw_lock_reader"; + pub const UNBOUNDED_READER: &str = "unbounded_reader"; + pub const VEC_READER: &str = "vec_reader"; + pub const MEM_EXEC: &str = "mem_exec"; + pub const DEBEZIUM_UNROLLING_EXEC: &str = "debezium_unrolling_exec"; + pub const TO_DEBEZIUM_EXEC: &str = "to_debezium_exec"; +} + +pub mod window_function_udf { + pub const NAME: &str = "window"; +} + +pub mod window_interval_field { + pub const START: &str = "start"; + pub const END: &str = "end"; +} + +pub mod debezium_op_short { + pub const CREATE: &str = "c"; + pub const READ: &str = "r"; + pub const UPDATE: &str = "u"; + pub const DELETE: &str = "d"; +} + + +pub mod connector_type { + pub const KAFKA: &str = "kafka"; + pub const KINESIS: &str = "kinesis"; + pub const FILESYSTEM: &str = "filesystem"; + pub const DELTA: &str = "delta"; + pub const ICEBERG: &str = "iceberg"; + pub const PULSAR: &str = "pulsar"; + pub const NATS: &str = "nats"; 
+ pub const REDIS: &str = "redis"; + pub const MQTT: &str = "mqtt"; + pub const WEBSOCKET: &str = "websocket"; + pub const SSE: &str = "sse"; + pub const NEXMARK: &str = "nexmark"; + pub const BLACKHOLE: &str = "blackhole"; + pub const MEMORY: &str = "memory"; + pub const POSTGRES: &str = "postgres"; +} + + +pub mod connection_table_role { + pub const SOURCE: &str = "source"; + pub const SINK: &str = "sink"; + pub const LOOKUP: &str = "lookup"; +} + +pub const SUPPORTED_CONNECTOR_ADAPTERS: &[&str] = &[ + connector_type::KAFKA, +]; + + +pub mod kafka_with_value { + pub const SCAN_LATEST: &str = "latest"; + pub const SCAN_EARLIEST: &str = "earliest"; + pub const SCAN_GROUP_OFFSETS: &str = "group-offsets"; + pub const SCAN_GROUP: &str = "group"; + pub const ISOLATION_READ_COMMITTED: &str = "read_committed"; + pub const ISOLATION_READ_UNCOMMITTED: &str = "read_uncommitted"; + pub const SINK_COMMIT_EXACTLY_ONCE_HYPHEN: &str = "exactly-once"; + pub const SINK_COMMIT_EXACTLY_ONCE_UNDERSCORE: &str = "exactly_once"; + pub const SINK_COMMIT_AT_LEAST_ONCE_HYPHEN: &str = "at-least-once"; + pub const SINK_COMMIT_AT_LEAST_ONCE_UNDERSCORE: &str = "at_least_once"; +} diff --git a/src/sql/common/control.rs b/src/sql/common/control.rs new file mode 100644 index 00000000..4ea9a12f --- /dev/null +++ b/src/sql/common/control.rs @@ -0,0 +1,164 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::collections::HashMap; +use std::time::SystemTime; + +use super::message::CheckpointBarrier; + +/// Control messages sent from the controller to worker tasks. +#[derive(Debug, Clone)] +pub enum ControlMessage { + Checkpoint(CheckpointBarrier), + Stop { + mode: StopMode, + }, + Commit { + epoch: u32, + commit_data: HashMap>>, + }, + LoadCompacted { + compacted: CompactionResult, + }, + NoOp, +} + +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub enum StopMode { + Graceful, + Immediate, +} + +#[derive(Debug, Clone)] +pub struct CompactionResult { + pub operator_id: String, + pub compacted_tables: HashMap, +} + +#[derive(Debug, Clone)] +pub struct TableCheckpointMetadata { + pub table_type: TableType, + pub data: Vec, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum TableType { + GlobalKeyValue, + ExpiringKeyedTimeTable, +} + +/// Responses sent from worker tasks back to the controller. +#[derive(Debug, Clone)] +pub enum ControlResp { + CheckpointEvent(CheckpointEvent), + CheckpointCompleted(CheckpointCompleted), + TaskStarted { + node_id: u32, + task_index: usize, + start_time: SystemTime, + }, + TaskFinished { + node_id: u32, + task_index: usize, + }, + TaskFailed { + node_id: u32, + task_index: usize, + error: TaskError, + }, + Error { + node_id: u32, + operator_id: String, + task_index: usize, + message: String, + details: String, + }, +} + +#[derive(Debug, Clone)] +pub struct CheckpointCompleted { + pub checkpoint_epoch: u32, + pub node_id: u32, + pub operator_id: String, + pub subtask_metadata: SubtaskCheckpointMetadata, +} + +#[derive(Debug, Clone)] +pub struct SubtaskCheckpointMetadata { + pub subtask_index: u32, + pub start_time: u64, + pub finish_time: u64, + pub watermark: Option, + pub bytes: u64, + pub table_metadata: HashMap, + pub table_configs: HashMap, +} + +#[derive(Debug, Clone)] +pub struct TableSubtaskCheckpointMetadata { + pub subtask_index: u32, + pub table_type: TableType, + pub data: Vec, +} + +#[derive(Debug, Clone)] 
+pub struct TableConfig { + pub table_type: TableType, + pub config: Vec, + pub state_version: u32, +} + +#[derive(Debug, Clone)] +pub struct CheckpointEvent { + pub checkpoint_epoch: u32, + pub node_id: u32, + pub operator_id: String, + pub subtask_index: u32, + pub time: SystemTime, + pub event_type: TaskCheckpointEventType, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum TaskCheckpointEventType { + StartedAlignment, + StartedCheckpointing, + FinishedOperatorSetup, + FinishedSync, + FinishedCommit, +} + +#[derive(Debug, Clone)] +pub struct TaskError { + pub job_id: String, + pub node_id: u32, + pub operator_id: String, + pub operator_subtask: u64, + pub error: String, + pub error_domain: ErrorDomain, + pub retry_hint: RetryHint, + pub details: String, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum ErrorDomain { + User, + Internal, + External, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum RetryHint { + NoRetry, + WithBackoff, +} diff --git a/src/sql/common/converter.rs b/src/sql/common/converter.rs new file mode 100644 index 00000000..ec4687f8 --- /dev/null +++ b/src/sql/common/converter.rs @@ -0,0 +1,95 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::sync::Arc; +use arrow::row::{OwnedRow, RowConverter, RowParser, Rows, SortField}; +use arrow_array::{Array, ArrayRef, BooleanArray}; +use arrow_schema::{ArrowError, DataType}; + +// need to handle the empty case as a row converter without sort fields emits empty Rows. +#[derive(Debug)] +pub enum Converter { + RowConverter(RowConverter), + Empty(RowConverter, Arc), +} + +impl Converter { + pub fn new(sort_fields: Vec) -> Result { + if sort_fields.is_empty() { + let array = Arc::new(BooleanArray::from(vec![false])); + Ok(Self::Empty( + RowConverter::new(vec![SortField::new(DataType::Boolean)])?, + array, + )) + } else { + Ok(Self::RowConverter(RowConverter::new(sort_fields)?)) + } + } + + pub fn convert_columns(&self, columns: &[Arc]) -> Result { + match self { + Converter::RowConverter(row_converter) => { + Ok(row_converter.convert_columns(columns)?.row(0).owned()) + } + Converter::Empty(row_converter, array) => Ok(row_converter + .convert_columns(std::slice::from_ref(array))? + .row(0) + .owned()), + } + } + + pub fn convert_all_columns( + &self, + columns: &[Arc], + num_rows: usize, + ) -> Result { + match self { + Converter::RowConverter(row_converter) => Ok(row_converter.convert_columns(columns)?), + Converter::Empty(row_converter, _array) => { + let array = Arc::new(BooleanArray::from(vec![false; num_rows])); + Ok(row_converter.convert_columns(&[array])?) + } + } + } + + pub fn convert_rows( + &self, + rows: Vec>, + ) -> Result, ArrowError> { + match self { + Converter::RowConverter(row_converter) => Ok(row_converter.convert_rows(rows)?), + Converter::Empty(_row_converter, _array) => Ok(vec![]), + } + } + + pub fn convert_raw_rows(&self, row_bytes: Vec<&[u8]>) -> Result, ArrowError> { + match self { + Converter::RowConverter(row_converter) => { + let parser = row_converter.parser(); + let mut row_list = vec![]; + for bytes in row_bytes { + let row = parser.parse(bytes); + row_list.push(row); + } + Ok(row_converter.convert_rows(row_list)?) 
+ } + Converter::Empty(_row_converter, _array) => Ok(vec![]), + } + } + + pub fn parser(&self) -> Option { + match self { + Converter::RowConverter(r) => Some(r.parser()), + Converter::Empty(_, _) => None, + } + } +} \ No newline at end of file diff --git a/src/sql/common/date.rs b/src/sql/common/date.rs new file mode 100644 index 00000000..ec310326 --- /dev/null +++ b/src/sql/common/date.rs @@ -0,0 +1,86 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use serde::Serialize; +use std::convert::TryFrom; + +use super::constants::{date_part_keyword, date_trunc_keyword}; + +#[derive(Debug, Clone, Copy, Eq, PartialEq, PartialOrd, Hash, Serialize)] +pub enum DatePart { + Year, + Month, + Week, + Day, + Hour, + Minute, + Second, + Millisecond, + Microsecond, + Nanosecond, + DayOfWeek, + DayOfYear, +} + +impl TryFrom<&str> for DatePart { + type Error = String; + + fn try_from(value: &str) -> Result { + let v = value.to_lowercase(); + match v.as_str() { + date_part_keyword::YEAR => Ok(DatePart::Year), + date_part_keyword::MONTH => Ok(DatePart::Month), + date_part_keyword::WEEK => Ok(DatePart::Week), + date_part_keyword::DAY => Ok(DatePart::Day), + date_part_keyword::HOUR => Ok(DatePart::Hour), + date_part_keyword::MINUTE => Ok(DatePart::Minute), + date_part_keyword::SECOND => Ok(DatePart::Second), + date_part_keyword::MILLISECOND => Ok(DatePart::Millisecond), + date_part_keyword::MICROSECOND => Ok(DatePart::Microsecond), + date_part_keyword::NANOSECOND => 
Ok(DatePart::Nanosecond), + date_part_keyword::DOW => Ok(DatePart::DayOfWeek), + date_part_keyword::DOY => Ok(DatePart::DayOfYear), + _ => Err(format!("'{value}' is not a valid DatePart")), + } + } +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash, PartialOrd, Serialize)] +pub enum DateTruncPrecision { + Year, + Quarter, + Month, + Week, + Day, + Hour, + Minute, + Second, +} + +impl TryFrom<&str> for DateTruncPrecision { + type Error = String; + + fn try_from(value: &str) -> Result { + let v = value.to_lowercase(); + match v.as_str() { + date_trunc_keyword::YEAR => Ok(DateTruncPrecision::Year), + date_trunc_keyword::QUARTER => Ok(DateTruncPrecision::Quarter), + date_trunc_keyword::MONTH => Ok(DateTruncPrecision::Month), + date_trunc_keyword::WEEK => Ok(DateTruncPrecision::Week), + date_trunc_keyword::DAY => Ok(DateTruncPrecision::Day), + date_trunc_keyword::HOUR => Ok(DateTruncPrecision::Hour), + date_trunc_keyword::MINUTE => Ok(DateTruncPrecision::Minute), + date_trunc_keyword::SECOND => Ok(DateTruncPrecision::Second), + _ => Err(format!("'{value}' is not a valid DateTruncPrecision")), + } + } +} diff --git a/src/sql/common/debezium.rs b/src/sql/common/debezium.rs new file mode 100644 index 00000000..9dbc401f --- /dev/null +++ b/src/sql/common/debezium.rs @@ -0,0 +1,148 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use bincode::{Decode, Encode}; +use serde::{Deserialize, Serialize}; +use std::convert::TryFrom; +use std::fmt::Debug; + +pub trait Key: + Debug + Clone + Encode + Decode<()> + std::hash::Hash + PartialEq + Eq + Send + 'static +{ +} +impl + std::hash::Hash + PartialEq + Eq + Send + 'static> Key + for T +{ +} + +pub trait Data: Debug + Clone + Encode + Decode<()> + Send + 'static {} +impl + Send + 'static> Data for T {} + +#[derive(Debug, Clone, PartialEq, Encode, Decode, Serialize, Deserialize)] +pub enum UpdatingData { + Retract(T), + Update { old: T, new: T }, + Append(T), +} + +impl UpdatingData { + pub fn lower(&self) -> T { + match self { + UpdatingData::Retract(_) => panic!("cannot lower retractions"), + UpdatingData::Update { new, .. } => new.clone(), + UpdatingData::Append(t) => t.clone(), + } + } + + pub fn unwrap_append(&self) -> &T { + match self { + UpdatingData::Append(t) => t, + _ => panic!("UpdatingData is not an append"), + } + } +} + +#[derive(Clone, Encode, Decode, Debug, Serialize, Deserialize, PartialEq)] +#[serde(try_from = "DebeziumShadow")] +pub struct Debezium { + pub before: Option, + pub after: Option, + pub op: DebeziumOp, +} + +#[derive(Clone, Encode, Decode, Debug, Serialize, Deserialize, PartialEq)] +struct DebeziumShadow { + before: Option, + after: Option, + op: DebeziumOp, +} + +impl TryFrom> for Debezium { + type Error = &'static str; + + fn try_from(value: DebeziumShadow) -> Result { + match (value.op, &value.before, &value.after) { + (DebeziumOp::Create, _, None) => { + Err("`after` must be set for Debezium create messages") + } + (DebeziumOp::Update, None, _) => { + Err("`before` must be set for Debezium update messages") + } + (DebeziumOp::Update, _, None) => { + Err("`after` must be set for Debezium update messages") + } + (DebeziumOp::Delete, None, _) => { + Err("`before` must be set for Debezium delete messages") + } + _ => Ok(Debezium { + before: value.before, + after: value.after, + op: value.op, + }), + } + } +} + 
+#[derive(Copy, Clone, Encode, Decode, Debug, PartialEq)] +pub enum DebeziumOp { + Create, + Update, + Delete, +} + +#[allow(clippy::to_string_trait_impl)] +impl ToString for DebeziumOp { + fn to_string(&self) -> String { + match self { + DebeziumOp::Create => "c", + DebeziumOp::Update => "u", + DebeziumOp::Delete => "d", + } + .to_string() + } +} + +impl<'de> Deserialize<'de> for DebeziumOp { + fn deserialize(deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + let s = String::deserialize(deserializer)?; + match s.as_str() { + "c" | "r" => Ok(DebeziumOp::Create), + "u" => Ok(DebeziumOp::Update), + "d" => Ok(DebeziumOp::Delete), + _ => Err(serde::de::Error::custom(format!("Invalid DebeziumOp {s}"))), + } + } +} + +impl Serialize for DebeziumOp { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + match self { + DebeziumOp::Create => serializer.serialize_str("c"), + DebeziumOp::Update => serializer.serialize_str("u"), + DebeziumOp::Delete => serializer.serialize_str("d"), + } + } +} + +#[derive(Copy, Clone, Encode, Decode, Debug, PartialEq, Serialize, Deserialize)] +pub enum JoinType { + Inner, + Left, + Right, + Full, +} diff --git a/src/sql/common/errors.rs b/src/sql/common/errors.rs new file mode 100644 index 00000000..fa4a722e --- /dev/null +++ b/src/sql/common/errors.rs @@ -0,0 +1,92 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::fmt; + +/// Result type for streaming operators and collectors. +pub type DataflowResult = std::result::Result; + +/// Unified error type for streaming dataflow operations. +#[derive(Debug)] +pub enum DataflowError { + Arrow(arrow_schema::ArrowError), + DataFusion(datafusion::error::DataFusionError), + Operator(String), + State(String), + Connector(String), + Internal(String), +} + +impl fmt::Display for DataflowError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + DataflowError::Arrow(e) => write!(f, "Arrow error: {e}"), + DataflowError::DataFusion(e) => write!(f, "DataFusion error: {e}"), + DataflowError::Operator(msg) => write!(f, "Operator error: {msg}"), + DataflowError::State(msg) => write!(f, "State error: {msg}"), + DataflowError::Connector(msg) => write!(f, "Connector error: {msg}"), + DataflowError::Internal(msg) => write!(f, "Internal error: {msg}"), + } + } +} + +impl std::error::Error for DataflowError {} + +impl DataflowError { + pub fn with_operator(self, operator_id: impl Into) -> Self { + let id = operator_id.into(); + match self { + DataflowError::Operator(m) => DataflowError::Operator(format!("{id}: {m}")), + other => DataflowError::Operator(format!("{id}: {other}")), + } + } +} + +impl From for DataflowError { + fn from(e: arrow_schema::ArrowError) -> Self { + DataflowError::Arrow(e) + } +} + +impl From for DataflowError { + fn from(e: datafusion::error::DataFusionError) -> Self { + DataflowError::DataFusion(e) + } +} + +/// Macro for creating connector errors. +#[macro_export] +macro_rules! connector_err { + ($($arg:tt)*) => { + $crate::sql::common::errors::DataflowError::Connector(format!($($arg)*)) + }; +} + +/// State-related errors. 
+#[derive(Debug)] +pub enum StateError { + KeyNotFound(String), + SerializationError(String), + BackendError(String), +} + +impl fmt::Display for StateError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + StateError::KeyNotFound(key) => write!(f, "Key not found: {key}"), + StateError::SerializationError(msg) => write!(f, "Serialization error: {msg}"), + StateError::BackendError(msg) => write!(f, "State backend error: {msg}"), + } + } +} + +impl std::error::Error for StateError {} diff --git a/src/sql/common/format_from_opts.rs b/src/sql/common/format_from_opts.rs new file mode 100644 index 00000000..34b6a586 --- /dev/null +++ b/src/sql/common/format_from_opts.rs @@ -0,0 +1,184 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Parse `WITH` clause format / framing / bad-data options (Arroyo-compatible keys). 
+ +use std::str::FromStr; + +use datafusion::common::{Result as DFResult, plan_datafusion_err, plan_err}; + +use super::connector_options::ConnectorOptions; +use super::constants::{bad_data_value, connection_format_value, framing_method_value}; +use super::with_option_keys as opt; +use super::formats::{ + AvroFormat, BadData, DecimalEncoding, Format, Framing, JsonCompression, JsonFormat, + NewlineDelimitedFraming, ParquetCompression, ParquetFormat, ProtobufFormat, RawBytesFormat, + RawStringFormat, TimestampFormat, +}; + +impl JsonFormat { + pub fn from_opts(opts: &mut ConnectorOptions) -> DFResult { + let mut j = JsonFormat::default(); + if let Some(v) = opts.pull_opt_bool(opt::JSON_CONFLUENT_SCHEMA_REGISTRY)? { + j.confluent_schema_registry = v; + } + if let Some(v) = opts.pull_opt_u64(opt::JSON_CONFLUENT_SCHEMA_VERSION)? { + j.schema_id = Some(v as u32); + } + if let Some(v) = opts.pull_opt_bool(opt::JSON_INCLUDE_SCHEMA)? { + j.include_schema = v; + } + if let Some(v) = opts.pull_opt_bool(opt::JSON_DEBEZIUM)? { + j.debezium = v; + } + if let Some(v) = opts.pull_opt_bool(opt::JSON_UNSTRUCTURED)? { + j.unstructured = v; + } + if let Some(s) = opts.pull_opt_str(opt::JSON_TIMESTAMP_FORMAT)? { + j.timestamp_format = TimestampFormat::try_from(s.as_str()).map_err(|_| { + plan_datafusion_err!("invalid json.timestamp_format '{}'", s) + })?; + } + if let Some(s) = opts.pull_opt_str(opt::JSON_DECIMAL_ENCODING)? { + j.decimal_encoding = DecimalEncoding::try_from(s.as_str()).map_err(|_| { + plan_datafusion_err!("invalid json.decimal_encoding '{s}'") + })?; + } + if let Some(s) = opts.pull_opt_str(opt::JSON_COMPRESSION)? { + j.compression = JsonCompression::from_str(&s) + .map_err(|e| plan_datafusion_err!("invalid json.compression: {e}"))?; + } + Ok(j) + } +} + +impl Format { + pub fn from_opts(opts: &mut ConnectorOptions) -> DFResult> { + let Some(name) = opts.pull_opt_str(opt::FORMAT)? 
else { + return Ok(None); + }; + let n = name.to_lowercase(); + match n.as_str() { + connection_format_value::JSON => Ok(Some(Format::Json(JsonFormat::from_opts(opts)?))), + connection_format_value::DEBEZIUM_JSON => { + let mut j = JsonFormat::from_opts(opts)?; + j.debezium = true; + Ok(Some(Format::Json(j))) + } + connection_format_value::AVRO => Ok(Some(Format::Avro(AvroFormat::from_opts(opts)?))), + connection_format_value::PARQUET => { + Ok(Some(Format::Parquet(ParquetFormat::from_opts(opts)?))) + } + connection_format_value::PROTOBUF => { + Ok(Some(Format::Protobuf(ProtobufFormat::from_opts(opts)?))) + } + connection_format_value::RAW_STRING => { + Ok(Some(Format::RawString(RawStringFormat {}))) + } + connection_format_value::RAW_BYTES => Ok(Some(Format::RawBytes(RawBytesFormat {}))), + _ => plan_err!("unknown format '{name}'"), + } + } +} + +impl AvroFormat { + fn from_opts(opts: &mut ConnectorOptions) -> DFResult { + let mut a = AvroFormat { + confluent_schema_registry: false, + raw_datums: false, + into_unstructured_json: false, + schema_id: None, + }; + if let Some(v) = opts.pull_opt_bool(opt::AVRO_CONFLUENT_SCHEMA_REGISTRY)? { + a.confluent_schema_registry = v; + } + if let Some(v) = opts.pull_opt_bool(opt::AVRO_RAW_DATUMS)? { + a.raw_datums = v; + } + if let Some(v) = opts.pull_opt_bool(opt::AVRO_INTO_UNSTRUCTURED_JSON)? { + a.into_unstructured_json = v; + } + if let Some(v) = opts.pull_opt_u64(opt::AVRO_SCHEMA_ID)? { + a.schema_id = Some(v as u32); + } + Ok(a) + } +} + +impl ParquetFormat { + fn from_opts(opts: &mut ConnectorOptions) -> DFResult { + let mut p = ParquetFormat::default(); + if let Some(s) = opts.pull_opt_str(opt::PARQUET_COMPRESSION)? { + p.compression = ParquetCompression::from_str(&s) + .map_err(|e| plan_datafusion_err!("invalid parquet.compression: {e}"))?; + } + if let Some(v) = opts.pull_opt_u64(opt::PARQUET_ROW_GROUP_BYTES)? 
{ + p.row_group_bytes = Some(v); + } + Ok(p) + } +} + +impl ProtobufFormat { + fn from_opts(opts: &mut ConnectorOptions) -> DFResult { + let mut p = ProtobufFormat { + into_unstructured_json: false, + message_name: None, + compiled_schema: None, + confluent_schema_registry: false, + length_delimited: false, + }; + if let Some(v) = opts.pull_opt_bool(opt::PROTOBUF_INTO_UNSTRUCTURED_JSON)? { + p.into_unstructured_json = v; + } + if let Some(s) = opts.pull_opt_str(opt::PROTOBUF_MESSAGE_NAME)? { + p.message_name = Some(s); + } + if let Some(v) = opts.pull_opt_bool(opt::PROTOBUF_CONFLUENT_SCHEMA_REGISTRY)? { + p.confluent_schema_registry = v; + } + if let Some(v) = opts.pull_opt_bool(opt::PROTOBUF_LENGTH_DELIMITED)? { + p.length_delimited = v; + } + Ok(p) + } +} + +impl Framing { + pub fn from_opts(opts: &mut ConnectorOptions) -> DFResult> { + let method = opts.pull_opt_str(opt::FRAMING_METHOD)?; + match method.as_deref() { + None => Ok(None), + Some(framing_method_value::NEWLINE) | Some(framing_method_value::NEWLINE_DELIMITED) => { + let max = opts.pull_opt_u64(opt::FRAMING_MAX_LINE_LENGTH)?; + Ok(Some(Framing::Newline(NewlineDelimitedFraming { + max_line_length: max, + }))) + } + Some(other) => plan_err!("unknown framing.method '{other}'"), + } + } +} + +impl BadData { + pub fn from_opts(opts: &mut ConnectorOptions) -> DFResult { + let Some(s) = opts.pull_opt_str(opt::BAD_DATA)? else { + return Ok(BadData::Fail {}); + }; + let v = s.to_lowercase(); + match v.as_str() { + bad_data_value::FAIL => Ok(BadData::Fail {}), + bad_data_value::DROP => Ok(BadData::Drop {}), + _ => plan_err!("invalid bad_data '{s}'"), + } + } +} diff --git a/src/sql/common/formats.rs b/src/sql/common/formats.rs new file mode 100644 index 00000000..b2885797 --- /dev/null +++ b/src/sql/common/formats.rs @@ -0,0 +1,256 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use serde::{Deserialize, Serialize}; +use std::convert::TryFrom; +use std::fmt::{Display, Formatter}; +use std::str::FromStr; + +use super::constants::{ + connection_format_value, decimal_encoding_value, json_compression_value, + parquet_compression_value, timestamp_format_value, +}; + +#[derive(Copy, Clone, Debug, Serialize, Deserialize, PartialEq, Eq, Default, Hash, PartialOrd)] +#[serde(rename_all = "snake_case")] +pub enum TimestampFormat { + #[default] + #[serde(rename = "rfc3339")] + RFC3339, + UnixMillis, +} + +impl TryFrom<&str> for TimestampFormat { + type Error = (); + + fn try_from(value: &str) -> Result { + match value { + timestamp_format_value::RFC3339_UPPER | timestamp_format_value::RFC3339_SNAKE => { + Ok(TimestampFormat::RFC3339) + } + timestamp_format_value::UNIX_MILLIS_PASCAL | timestamp_format_value::UNIX_MILLIS_SNAKE => { + Ok(TimestampFormat::UnixMillis) + } + _ => Err(()), + } + } +} + +#[derive(Copy, Clone, Debug, Serialize, Deserialize, PartialEq, Eq, Default, Hash, PartialOrd)] +#[serde(rename_all = "snake_case")] +pub enum DecimalEncoding { + #[default] + Number, + String, + Bytes, +} + +impl TryFrom<&str> for DecimalEncoding { + type Error = (); + + fn try_from(s: &str) -> Result { + match s { + decimal_encoding_value::NUMBER => Ok(Self::Number), + decimal_encoding_value::STRING => Ok(Self::String), + decimal_encoding_value::BYTES => Ok(Self::Bytes), + _ => Err(()), + } + } +} + +#[derive(Serialize, Deserialize, Default, Copy, Clone, Debug, PartialEq, Eq, Hash, PartialOrd)] +#[serde(rename_all = "snake_case")] +pub 
enum JsonCompression { + #[default] + Uncompressed, + Gzip, +} + +impl FromStr for JsonCompression { + type Err = String; + + fn from_str(s: &str) -> Result { + match s { + json_compression_value::UNCOMPRESSED => Ok(JsonCompression::Uncompressed), + json_compression_value::GZIP => Ok(JsonCompression::Gzip), + _ => Err(format!("invalid json compression '{s}'")), + } + } +} + +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq, Default, Hash, PartialOrd)] +#[serde(rename_all = "snake_case")] +pub struct JsonFormat { + #[serde(default)] + pub confluent_schema_registry: bool, + #[serde(default, alias = "confluent_schema_version")] + pub schema_id: Option, + #[serde(default)] + pub include_schema: bool, + #[serde(default)] + pub debezium: bool, + #[serde(default)] + pub unstructured: bool, + #[serde(default)] + pub timestamp_format: TimestampFormat, + #[serde(default)] + pub decimal_encoding: DecimalEncoding, + #[serde(default)] + pub compression: JsonCompression, +} + +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq, Hash, PartialOrd)] +#[serde(rename_all = "snake_case")] +pub struct RawStringFormat {} + +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq, Hash, PartialOrd)] +#[serde(rename_all = "snake_case")] +pub struct RawBytesFormat {} + +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq, Hash, PartialOrd)] +#[serde(rename_all = "snake_case")] +pub struct AvroFormat { + #[serde(default)] + pub confluent_schema_registry: bool, + #[serde(default)] + pub raw_datums: bool, + #[serde(default)] + pub into_unstructured_json: bool, + #[serde(default)] + pub schema_id: Option, +} + +impl AvroFormat { + pub fn new( + confluent_schema_registry: bool, + raw_datums: bool, + into_unstructured_json: bool, + ) -> Self { + Self { + confluent_schema_registry, + raw_datums, + into_unstructured_json, + schema_id: None, + } + } +} + +#[derive(Serialize, Deserialize, Copy, Clone, Debug, PartialEq, Eq, Hash, PartialOrd, Default)] 
+#[serde(rename_all = "snake_case")] +pub enum ParquetCompression { + Uncompressed, + Snappy, + Gzip, + #[default] + Zstd, + Lz4, + Lz4Raw, +} + +impl FromStr for ParquetCompression { + type Err = String; + + fn from_str(s: &str) -> Result { + match s { + parquet_compression_value::UNCOMPRESSED => Ok(ParquetCompression::Uncompressed), + parquet_compression_value::SNAPPY => Ok(ParquetCompression::Snappy), + parquet_compression_value::GZIP => Ok(ParquetCompression::Gzip), + parquet_compression_value::ZSTD => Ok(ParquetCompression::Zstd), + parquet_compression_value::LZ4 => Ok(ParquetCompression::Lz4), + parquet_compression_value::LZ4_RAW => Ok(ParquetCompression::Lz4Raw), + _ => Err(format!("invalid parquet compression '{s}'")), + } + } +} + +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq, Hash, PartialOrd, Default)] +#[serde(rename_all = "snake_case")] +pub struct ParquetFormat { + #[serde(default)] + pub compression: ParquetCompression, + #[serde(default)] + pub row_group_bytes: Option, +} + +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq, Hash, PartialOrd)] +#[serde(rename_all = "snake_case")] +pub struct ProtobufFormat { + #[serde(default)] + pub into_unstructured_json: bool, + #[serde(default)] + pub message_name: Option, + #[serde(default)] + pub compiled_schema: Option>, + #[serde(default)] + pub confluent_schema_registry: bool, + #[serde(default)] + pub length_delimited: bool, +} + +#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq, Hash, PartialOrd)] +#[serde(rename_all = "snake_case", tag = "type")] +pub enum Format { + Json(JsonFormat), + Avro(AvroFormat), + Protobuf(ProtobufFormat), + Parquet(ParquetFormat), + RawString(RawStringFormat), + RawBytes(RawBytesFormat), +} + +impl Display for Format { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.write_str(self.name()) + } +} + +impl Format { + pub fn name(&self) -> &'static str { + match self { + Format::Json(_) => connection_format_value::JSON, + 
Format::Avro(_) => connection_format_value::AVRO, + Format::Protobuf(_) => connection_format_value::PROTOBUF, + Format::Parquet(_) => connection_format_value::PARQUET, + Format::RawString(_) => connection_format_value::RAW_STRING, + Format::RawBytes(_) => connection_format_value::RAW_BYTES, + } + } + + pub fn is_updating(&self) -> bool { + matches!(self, Format::Json(JsonFormat { debezium: true, .. })) + } +} + +#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq, Hash, PartialOrd)] +#[serde(rename_all = "snake_case", tag = "behavior")] +pub enum BadData { + Fail {}, + Drop {}, +} + +impl Default for BadData { + fn default() -> Self { + BadData::Fail {} + } +} + +#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq, Hash, PartialOrd)] +#[serde(rename_all = "snake_case", tag = "method")] +pub enum Framing { + Newline(NewlineDelimitedFraming), +} + +#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq, Hash, PartialOrd)] +#[serde(rename_all = "snake_case")] +pub struct NewlineDelimitedFraming { + pub max_line_length: Option, +} diff --git a/src/sql/common/fs_schema.rs b/src/sql/common/fs_schema.rs new file mode 100644 index 00000000..eb92d4ac --- /dev/null +++ b/src/sql/common/fs_schema.rs @@ -0,0 +1,474 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! FunctionStream table/stream schema: Arrow [`Schema`] plus timestamp index and optional key columns. +//! +//! 
[`Schema`]: datafusion::arrow::datatypes::Schema + +use datafusion::arrow::array::builder::{ArrayBuilder, make_builder}; +use datafusion::arrow::array::{RecordBatch, TimestampNanosecondArray}; +use datafusion::arrow::datatypes::{DataType, Field, FieldRef, Schema, SchemaBuilder, TimeUnit}; +use datafusion::arrow::error::ArrowError; +use datafusion::common::{DataFusionError, Result as DFResult}; +use serde::{Deserialize, Serialize}; +use std::sync::Arc; +use std::time::SystemTime; +use arrow::compute::{filter_record_batch, lexsort_to_indices, partition, take, SortColumn}; +use arrow::compute::kernels::cmp::gt_eq; +use arrow::compute::kernels::numeric::div; +use arrow::row::SortField; +use arrow_array::{PrimitiveArray, UInt64Array}; +use arrow_array::types::UInt64Type; +use protocol::grpc::api; +use super::{to_nanos, TIMESTAMP_FIELD}; +use std::ops::Range; +use crate::sql::common::converter::Converter; + +#[derive(Debug, Copy, Clone)] +pub enum FieldValueType<'a> { + Int64(Option), + UInt64(Option), + Int32(Option), + String(Option<&'a str>), + Bytes(Option<&'a [u8]>), +} + +pub type FsSchemaRef = Arc; + +#[derive(Debug, Clone, Eq, PartialEq, Hash, Serialize, Deserialize)] +pub struct FsSchema { + pub schema: Arc, + pub timestamp_index: usize, + key_indices: Option>, + /// If defined, these indices are used for routing (i.e., which subtask gets which piece of data) + routing_key_indices: Option>, +} + +impl TryFrom for FsSchema { + type Error = DataFusionError; + fn try_from(schema_proto: api::FsSchema) -> Result { + let schema: Schema = serde_json::from_str(&schema_proto.arrow_schema) + .map_err(|e| DataFusionError::Plan(format!("Invalid arrow schema: {e}")))?; + let timestamp_index = schema_proto.timestamp_index as usize; + + let key_indices = schema_proto.has_keys.then(|| { + schema_proto + .key_indices + .into_iter() + .map(|index| index as usize) + .collect() + }); + + let routing_key_indices = schema_proto.has_routing_keys.then(|| { + schema_proto + 
.routing_key_indices + .into_iter() + .map(|index| index as usize) + .collect() + }); + + Ok(Self { + schema: Arc::new(schema), + timestamp_index, + key_indices, + routing_key_indices, + }) + } +} + +impl From for api::FsSchema { + fn from(schema: FsSchema) -> Self { + let arrow_schema = serde_json::to_string(schema.schema.as_ref()).unwrap(); + let timestamp_index = schema.timestamp_index as u32; + + let has_keys = schema.key_indices.is_some(); + let key_indices = schema + .key_indices + .map(|ks| ks.into_iter().map(|index| index as u32).collect()) + .unwrap_or_default(); + + let has_routing_keys = schema.routing_key_indices.is_some(); + let routing_key_indices = schema + .routing_key_indices + .map(|ks| ks.into_iter().map(|index| index as u32).collect()) + .unwrap_or_default(); + + Self { + arrow_schema, + timestamp_index, + key_indices, + has_keys, + routing_key_indices, + has_routing_keys, + } + } +} + +impl FsSchema { + pub fn new( + schema: Arc, + timestamp_index: usize, + key_indices: Option>, + routing_key_indices: Option>, + ) -> Self { + Self { + schema, + timestamp_index, + key_indices, + routing_key_indices, + } + } + pub fn new_unkeyed(schema: Arc, timestamp_index: usize) -> Self { + Self { + schema, + timestamp_index, + key_indices: None, + routing_key_indices: None, + } + } + pub fn new_keyed(schema: Arc, timestamp_index: usize, key_indices: Vec) -> Self { + Self { + schema, + timestamp_index, + key_indices: Some(key_indices), + routing_key_indices: None, + } + } + + pub fn from_fields(mut fields: Vec) -> Self { + if !fields.iter().any(|f| f.name() == TIMESTAMP_FIELD) { + fields.push(Field::new( + TIMESTAMP_FIELD, + DataType::Timestamp(TimeUnit::Nanosecond, None), + false, + )); + } + + Self::from_schema_keys(Arc::new(Schema::new(fields)), vec![]).unwrap() + } + + pub fn from_schema_unkeyed(schema: Arc) -> DFResult { + let timestamp_index = schema + .column_with_name(TIMESTAMP_FIELD) + .ok_or_else(|| { + DataFusionError::Plan(format!( + "no 
{TIMESTAMP_FIELD} field in schema, schema is {schema:?}" + )) + })? + .0; + + Ok(Self { + schema, + timestamp_index, + key_indices: None, + routing_key_indices: None, + }) + } + + pub fn from_schema_keys(schema: Arc, key_indices: Vec) -> DFResult { + let timestamp_index = schema + .column_with_name(TIMESTAMP_FIELD) + .ok_or_else(|| { + DataFusionError::Plan(format!( + "no {TIMESTAMP_FIELD} field in schema, schema is {schema:?}" + )) + })? + .0; + + Ok(Self { + schema, + timestamp_index, + key_indices: Some(key_indices), + routing_key_indices: None, + }) + } + + pub fn schema_without_timestamp(&self) -> Schema { + let mut builder = SchemaBuilder::from(self.schema.fields()); + builder.remove(self.timestamp_index); + builder.finish() + } + + pub fn remove_timestamp_column(&self, batch: &mut RecordBatch) { + batch.remove_column(self.timestamp_index); + } + + pub fn builders(&self) -> Vec> { + self.schema + .fields + .iter() + .map(|f| make_builder(f.data_type(), 8)) + .collect() + } + + pub fn timestamp_column<'a>(&self, batch: &'a RecordBatch) -> &'a TimestampNanosecondArray { + batch + .column(self.timestamp_index) + .as_any() + .downcast_ref::() + .unwrap() + } + + pub fn has_routing_keys(&self) -> bool { + self.routing_keys().map(|k| !k.is_empty()).unwrap_or(false) + } + + pub fn routing_keys(&self) -> Option<&Vec> { + self.routing_key_indices + .as_ref() + .or(self.key_indices.as_ref()) + } + + pub fn storage_keys(&self) -> Option<&Vec> { + self.key_indices.as_ref() + } + + pub fn clone_storage_key_indices(&self) -> Option> { + self.key_indices.clone() + } + + pub fn clone_routing_key_indices(&self) -> Option> { + self.routing_key_indices.clone() + } + + pub fn filter_by_time( + &self, + batch: RecordBatch, + cutoff: Option, + ) -> Result { + let Some(cutoff) = cutoff else { + // no watermark, so we just return the same batch. 
+ return Ok(batch); + }; + // filter out late data + let timestamp_column = batch + .column(self.timestamp_index) + .as_any() + .downcast_ref::() + .ok_or_else(|| ArrowError::CastError( + format!("failed to downcast column {} of {:?} to timestamp. Schema is supposed to be {:?}", + self.timestamp_index, batch, self.schema)))?; + let cutoff_scalar = TimestampNanosecondArray::new_scalar(to_nanos(cutoff) as i64); + let on_time = gt_eq(timestamp_column, &cutoff_scalar)?; + filter_record_batch(&batch, &on_time) + } + + pub fn sort_columns(&self, batch: &RecordBatch, with_timestamp: bool) -> Vec { + let mut columns = vec![]; + if let Some(keys) = &self.key_indices { + columns.extend(keys.iter().map(|index| SortColumn { + values: batch.column(*index).clone(), + options: None, + })); + } + if with_timestamp { + columns.push(SortColumn { + values: batch.column(self.timestamp_index).clone(), + options: None, + }); + } + columns + } + + pub fn sort_fields(&self, with_timestamp: bool) -> Vec { + let mut sort_fields = vec![]; + if let Some(keys) = &self.key_indices { + sort_fields.extend(keys.iter()); + } + if with_timestamp { + sort_fields.push(self.timestamp_index); + } + self.sort_fields_by_indices(&sort_fields) + } + + fn sort_fields_by_indices(&self, indices: &[usize]) -> Vec { + indices + .iter() + .map(|index| SortField::new(self.schema.field(*index).data_type().clone())) + .collect() + } + + pub fn converter(&self, with_timestamp: bool) -> Result { + Converter::new(self.sort_fields(with_timestamp)) + } + + pub fn value_converter( + &self, + with_timestamp: bool, + generation_index: usize, + ) -> Result { + match &self.key_indices { + None => { + let mut indices = (0..self.schema.fields().len()).collect::>(); + indices.remove(generation_index); + if !with_timestamp { + indices.remove(self.timestamp_index); + } + Converter::new(self.sort_fields_by_indices(&indices)) + } + Some(keys) => { + let indices = (0..self.schema.fields().len()) + .filter(|index| { + 
!keys.contains(index) + && (with_timestamp || *index != self.timestamp_index) + && *index != generation_index + }) + .collect::>(); + Converter::new(self.sort_fields_by_indices(&indices)) + } + } + } + + pub fn value_indices(&self, with_timestamp: bool) -> Vec { + let field_count = self.schema.fields().len(); + match &self.key_indices { + None => { + let mut indices = (0..field_count).collect::>(); + + if !with_timestamp { + indices.remove(self.timestamp_index); + } + indices + } + Some(keys) => (0..field_count) + .filter(|index| { + !keys.contains(index) && (with_timestamp || *index != self.timestamp_index) + }) + .collect::>(), + } + } + + pub fn sort( + &self, + batch: RecordBatch, + with_timestamp: bool, + ) -> Result { + if self.key_indices.is_none() && !with_timestamp { + return Ok(batch); + } + let sort_columns = self.sort_columns(&batch, with_timestamp); + let sort_indices = lexsort_to_indices(&sort_columns, None).expect("should be able to sort"); + let columns = batch + .columns() + .iter() + .map(|c| take(c, &sort_indices, None).unwrap()) + .collect(); + + RecordBatch::try_new(batch.schema(), columns) + } + + pub fn partition( + &self, + batch: &RecordBatch, + with_timestamp: bool, + ) -> Result>, ArrowError> { + if self.key_indices.is_none() && !with_timestamp { + #[allow(clippy::single_range_in_vec_init)] + return Ok(vec![0..batch.num_rows()]); + } + + let mut partition_columns = vec![]; + + if let Some(keys) = &self.routing_keys() { + partition_columns.extend(keys.iter().map(|index| batch.column(*index).clone())); + } + if with_timestamp { + partition_columns.push(batch.column(self.timestamp_index).clone()); + } + + Ok(partition(&partition_columns)?.ranges()) + } + + pub fn unkeyed_batch(&self, batch: &RecordBatch) -> Result { + if self.key_indices.is_none() { + return Ok(batch.clone()); + } + let columns: Vec<_> = (0..batch.num_columns()) + .filter(|index| !self.key_indices.as_ref().unwrap().contains(index)) + .collect(); + batch.project(&columns) + } 
+ + pub fn schema_without_keys(&self) -> Result { + if self.key_indices.is_none() { + return Ok(self.clone()); + } + let key_indices = self.key_indices.as_ref().unwrap(); + let unkeyed_schema = Schema::new( + self.schema + .fields() + .iter() + .enumerate() + .filter(|(index, _field)| !key_indices.contains(index)) + .map(|(_, field)| field.as_ref().clone()) + .collect::>(), + ); + let timestamp_index = unkeyed_schema.index_of(TIMESTAMP_FIELD)?; + Ok(Self { + schema: Arc::new(unkeyed_schema), + timestamp_index, + key_indices: None, + routing_key_indices: None, + }) + } + + pub fn with_fields(&self, fields: Vec) -> Result { + let schema = Arc::new(Schema::new_with_metadata( + fields, + self.schema.metadata.clone(), + )); + + let timestamp_index = schema.index_of(TIMESTAMP_FIELD)?; + let max_index = *[&self.key_indices, &self.routing_key_indices] + .iter() + .map(|indices| indices.as_ref().and_then(|k| k.iter().max())) + .max() + .flatten() + .unwrap_or(&0); + + if schema.fields.len() - 1 < max_index { + return Err(ArrowError::InvalidArgumentError(format!( + "expected at least {} fields, but were only {}", + max_index + 1, + schema.fields.len() + ))); + } + + Ok(Self { + schema, + timestamp_index, + key_indices: self.key_indices.clone(), + routing_key_indices: self.routing_key_indices.clone(), + }) + } + + pub fn with_additional_fields( + &self, + new_fields: impl Iterator, + ) -> Result { + let mut fields = self.schema.fields.to_vec(); + fields.extend(new_fields.map(Arc::new)); + + self.with_fields(fields) + } +} + +pub fn server_for_hash_array( + hash: &PrimitiveArray, + n: usize, +) -> Result, ArrowError> { + let range_size = u64::MAX / (n as u64) + 1; + let range_scalar = UInt64Array::new_scalar(range_size); + let division = div(hash, &range_scalar)?; + let result: &PrimitiveArray = division.as_any().downcast_ref().unwrap(); + Ok(result.clone()) +} diff --git a/src/sql/common/kafka_catalog.rs b/src/sql/common/kafka_catalog.rs new file mode 100644 index 
00000000..5d54b1b2 --- /dev/null +++ b/src/sql/common/kafka_catalog.rs @@ -0,0 +1,126 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! +//! + +use serde::{Deserialize, Serialize}; +use std::collections::HashMap; + + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct KafkaTable { + pub topic: String, + #[serde(flatten)] + pub kind: TableType, + #[serde(default)] + pub client_configs: HashMap, + pub value_subject: Option, +} + +impl KafkaTable { + pub fn subject(&self) -> String { + self.value_subject + .clone() + .unwrap_or_else(|| format!("{}-value", self.topic)) + } +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +#[serde(tag = "type", rename_all = "snake_case")] +pub enum TableType { + Source { + offset: KafkaTableSourceOffset, + read_mode: Option, + group_id: Option, + group_id_prefix: Option, + }, + Sink { + commit_mode: SinkCommitMode, + key_field: Option, + timestamp_field: Option, + }, +} + +#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default)] +#[serde(rename_all = "snake_case")] +pub enum KafkaTableSourceOffset { + Latest, + Earliest, + #[default] + Group, +} + +#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)] +#[serde(rename_all = "snake_case")] +pub enum ReadMode { + ReadUncommitted, + ReadCommitted, +} + +#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default)] +#[serde(rename_all = "snake_case")] +pub enum SinkCommitMode { + #[default] + 
AtLeastOnce, + ExactlyOnce, +} + + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +#[serde(rename_all = "camelCase")] +pub struct KafkaConfig { + pub bootstrap_servers: String, + #[serde(default)] + pub authentication: KafkaConfigAuthentication, + #[serde(default)] + pub schema_registry_enum: Option, + #[serde(default)] + pub connection_properties: HashMap, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +#[serde(tag = "type")] +pub enum KafkaConfigAuthentication { + #[serde(rename = "None")] + None, + #[serde(rename = "AWS_MSK_IAM")] + AwsMskIam { region: String }, + #[serde(rename = "SASL")] + Sasl { + protocol: String, + mechanism: String, + username: String, + password: String, + }, +} + +impl Default for KafkaConfigAuthentication { + fn default() -> Self { + Self::None + } +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +#[serde(tag = "type")] +pub enum SchemaRegistryConfig { + #[serde(rename = "None")] + None, + #[serde(rename = "Confluent Schema Registry")] + ConfluentSchemaRegistry { + endpoint: String, + #[serde(rename = "apiKey")] + api_key: Option, + #[serde(rename = "apiSecret")] + api_secret: Option, + }, +} diff --git a/src/sql/common/message.rs b/src/sql/common/message.rs new file mode 100644 index 00000000..4dcde95b --- /dev/null +++ b/src/sql/common/message.rs @@ -0,0 +1,54 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use bincode::{Decode, Encode}; +use datafusion::arrow::array::RecordBatch; +use serde::{Deserialize, Serialize}; +use std::time::SystemTime; + +#[derive(Debug, Copy, Clone, PartialEq, Eq, Encode, Decode, Serialize, Deserialize)] +pub enum Watermark { + EventTime(SystemTime), + Idle, +} + +#[derive(Debug, Clone, PartialEq)] +pub enum ArrowMessage { + Data(RecordBatch), + Signal(SignalMessage), +} + +impl ArrowMessage { + pub fn is_end(&self) -> bool { + matches!( + self, + ArrowMessage::Signal(SignalMessage::Stop) + | ArrowMessage::Signal(SignalMessage::EndOfData) + ) + } +} + +#[derive(Debug, Clone, PartialEq, Encode, Decode)] +pub enum SignalMessage { + Barrier(CheckpointBarrier), + Watermark(Watermark), + Stop, + EndOfData, +} + +#[derive(Debug, Copy, Clone, PartialEq, Eq, Encode, Decode, Serialize, Deserialize)] +pub struct CheckpointBarrier { + pub epoch: u32, + pub min_epoch: u32, + pub timestamp: SystemTime, + pub then_stop: bool, +} diff --git a/src/sql/common/mod.rs b/src/sql/common/mod.rs new file mode 100644 index 00000000..e042aea6 --- /dev/null +++ b/src/sql/common/mod.rs @@ -0,0 +1,66 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Shared core types and constants for FunctionStream (`crate::sql::common`). +//! +//! Used by the runtime, SQL planner, coordinator, and other subsystems — +//! analogous to `arroyo-types` + `arroyo-rpc` in Arroyo. 
+ +pub mod arrow_ext; +pub mod connector_options; +pub mod with_option_keys; +pub mod constants; +pub mod control; +pub mod date; +pub mod debezium; +pub mod fs_schema; +pub mod errors; +pub mod format_from_opts; +pub mod formats; +pub mod kafka_catalog; +pub mod message; +pub mod operator_config; +pub mod time_utils; +pub mod converter; +pub mod topology; + +// ── Re-exports from existing modules ── +pub use arrow_ext::FsExtensionType; +pub use message::{CheckpointBarrier, Watermark}; +pub use time_utils::{from_nanos, to_micros, to_millis, to_nanos}; + +// ── Re-exports from new modules ── +pub use fs_schema::{FsSchema, FsSchemaRef}; +pub use connector_options::ConnectorOptions; +pub use formats::{BadData, Format, Framing, JsonCompression, JsonFormat}; +pub use operator_config::MetadataField; + +// ── Well-known column names ── +pub use constants::sql_field::{TIMESTAMP_FIELD, UPDATING_META_FIELD}; +pub use topology::render_program_topology; + +// ── Environment variables ── +pub const JOB_ID_ENV: &str = "JOB_ID"; +pub const RUN_ID_ENV: &str = "RUN_ID"; + +// ── Metric names ── +pub const MESSAGES_RECV: &str = "fs_worker_messages_recv"; +pub const MESSAGES_SENT: &str = "fs_worker_messages_sent"; +pub const BYTES_RECV: &str = "fs_worker_bytes_recv"; +pub const BYTES_SENT: &str = "fs_worker_bytes_sent"; +pub const BATCHES_RECV: &str = "fs_worker_batches_recv"; +pub const BATCHES_SENT: &str = "fs_worker_batches_sent"; +pub const TX_QUEUE_SIZE: &str = "fs_worker_tx_queue_size"; +pub const TX_QUEUE_REM: &str = "fs_worker_tx_queue_rem"; +pub const DESERIALIZATION_ERRORS: &str = "fs_worker_deserialization_errors"; + +pub const LOOKUP_KEY_INDEX_FIELD: &str = "__lookup_key_index"; diff --git a/src/sql/common/operator_config.rs b/src/sql/common/operator_config.rs new file mode 100644 index 00000000..b5360cd7 --- /dev/null +++ b/src/sql/common/operator_config.rs @@ -0,0 +1,12 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file 
except in compliance with the License. + +use serde::{Deserialize, Serialize}; + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct MetadataField { + pub field_name: String, + pub key: String, + #[serde(default)] + pub data_type: Option, +} diff --git a/src/sql/common/time_utils.rs b/src/sql/common/time_utils.rs new file mode 100644 index 00000000..323445cd --- /dev/null +++ b/src/sql/common/time_utils.rs @@ -0,0 +1,74 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::HashMap; +use std::hash::Hash; +use std::time::{Duration, SystemTime, UNIX_EPOCH}; + +pub fn to_millis(time: SystemTime) -> u64 { + time.duration_since(UNIX_EPOCH).unwrap().as_millis() as u64 +} + +pub fn to_micros(time: SystemTime) -> u64 { + time.duration_since(UNIX_EPOCH).unwrap().as_micros() as u64 +} + +pub fn from_millis(ts: u64) -> SystemTime { + UNIX_EPOCH + Duration::from_millis(ts) +} + +pub fn from_micros(ts: u64) -> SystemTime { + UNIX_EPOCH + Duration::from_micros(ts) +} + +pub fn to_nanos(time: SystemTime) -> u128 { + time.duration_since(UNIX_EPOCH).unwrap().as_nanos() +} + +pub fn from_nanos(ts: u128) -> SystemTime { + UNIX_EPOCH + + Duration::from_secs((ts / 1_000_000_000) as u64) + + Duration::from_nanos((ts % 1_000_000_000) as u64) +} + +pub fn print_time(time: SystemTime) -> String { + chrono::DateTime::::from(time) + .format("%Y-%m-%d %H:%M:%S%.3f") + .to_string() +} + +/// Returns the number of days since the UNIX epoch (for Avro 
serialization). +pub fn days_since_epoch(time: SystemTime) -> i32 { + time.duration_since(UNIX_EPOCH) + .unwrap() + .as_secs() + .div_euclid(86400) as i32 +} + +pub fn single_item_hash_map, K: Hash + Eq, V>(key: I, value: V) -> HashMap { + let mut map = HashMap::new(); + map.insert(key.into(), value); + map +} + +pub fn string_to_map(s: &str, pair_delimiter: char) -> Option> { + if s.trim().is_empty() { + return Some(HashMap::new()); + } + + s.split(',') + .map(|s| { + let mut kv = s.trim().split(pair_delimiter); + Some((kv.next()?.trim().to_string(), kv.next()?.trim().to_string())) + }) + .collect() +} diff --git a/src/sql/common/topology.rs b/src/sql/common/topology.rs new file mode 100644 index 00000000..bc71d57f --- /dev/null +++ b/src/sql/common/topology.rs @@ -0,0 +1,280 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! EXPLAIN-like DAG text renderer for [`FsProgram`]. +//! +//! Renders a streaming pipeline topology as a human-readable ASCII graph using +//! Kahn's topological sort. Handles linear chains, fan-out, and fan-in (JOIN). + +use std::collections::{BTreeMap, VecDeque}; +use std::fmt::Write; + +use protocol::grpc::api::FsProgram; + +fn edge_type_label(edge_type: i32) -> &'static str { + match edge_type { + 1 => "Forward", + 2 => "Shuffle", + 3 => "LeftJoin", + 4 => "RightJoin", + _ => "Unknown", + } +} + +/// Render an [`FsProgram`] as an EXPLAIN-style topology string. 
+pub fn render_program_topology(program: &FsProgram) -> String { + if program.nodes.is_empty() { + return "(empty topology)".to_string(); + } + + struct EdgeInfo { target: i32, edge_type: i32 } + struct InputInfo { source: i32, edge_type: i32 } + + let node_map: BTreeMap = + program.nodes.iter().map(|n| (n.node_index, n)).collect(); + + let mut downstream: BTreeMap> = BTreeMap::new(); + let mut upstream: BTreeMap> = BTreeMap::new(); + let mut in_degree: BTreeMap = BTreeMap::new(); + + for idx in node_map.keys() { + in_degree.entry(*idx).or_insert(0); + } + for edge in &program.edges { + downstream.entry(edge.source).or_default().push(EdgeInfo { + target: edge.target, + edge_type: edge.edge_type, + }); + upstream.entry(edge.target).or_default().push(InputInfo { + source: edge.source, + edge_type: edge.edge_type, + }); + *in_degree.entry(edge.target).or_insert(0) += 1; + } + + // Kahn's topological sort + let mut queue: VecDeque = in_degree + .iter() + .filter(|(_, deg)| **deg == 0) + .map(|(idx, _)| *idx) + .collect(); + let mut topo_order: Vec = Vec::with_capacity(node_map.len()); + let mut remaining = in_degree.clone(); + while let Some(idx) = queue.pop_front() { + topo_order.push(idx); + if let Some(edges) = downstream.get(&idx) { + for e in edges { + if let Some(deg) = remaining.get_mut(&e.target) { + *deg -= 1; + if *deg == 0 { + queue.push_back(e.target); + } + } + } + } + } + for idx in node_map.keys() { + if !topo_order.contains(idx) { + topo_order.push(*idx); + } + } + + let is_source = |idx: &i32| upstream.get(idx).map_or(true, |v| v.is_empty()); + let is_sink = |idx: &i32| downstream.get(idx).map_or(true, |v| v.is_empty()); + + let mut out = String::new(); + let _ = writeln!( + out, + "Pipeline Topology ({} nodes, {} edges)", + program.nodes.len(), + program.edges.len(), + ); + let _ = writeln!(out, "{}", "=".repeat(50)); + + for (pos, &node_idx) in topo_order.iter().enumerate() { + let Some(node) = node_map.get(&node_idx) else { + continue; + }; + + let 
op_chain: String = node + .operators + .iter() + .map(|op| op.operator_name.as_str()) + .collect::>() + .join(" -> "); + + let role = if is_source(&node_idx) { + "Source" + } else if is_sink(&node_idx) { + "Sink" + } else { + "Operator" + }; + + let _ = writeln!(out); + let _ = writeln!( + out, + "[{role}] Node {node_idx} parallelism = {}", + node.parallelism, + ); + let _ = writeln!(out, " operators: {op_chain}"); + + if !node.description.is_empty() { + let _ = writeln!(out, " desc: {}", node.description); + } + + if let Some(inputs) = upstream.get(&node_idx) { + if inputs.len() == 1 { + let i = &inputs[0]; + let _ = writeln!( + out, + " input: <-- [{}] Node {}", + edge_type_label(i.edge_type), + i.source, + ); + } else if inputs.len() > 1 { + let _ = writeln!(out, " inputs:"); + for i in inputs { + let _ = writeln!( + out, + " <-- [{}] Node {}", + edge_type_label(i.edge_type), + i.source, + ); + } + } + } + + if let Some(outputs) = downstream.get(&node_idx) { + if outputs.len() == 1 { + let e = &outputs[0]; + let _ = writeln!( + out, + " output: --> [{}] Node {}", + edge_type_label(e.edge_type), + e.target, + ); + } else if outputs.len() > 1 { + let _ = writeln!(out, " outputs:"); + for e in outputs { + let _ = writeln!( + out, + " --> [{}] Node {}", + edge_type_label(e.edge_type), + e.target, + ); + } + } + } + + if pos < topo_order.len() - 1 { + let single_out = downstream.get(&node_idx).map_or(false, |v| v.len() == 1); + let next_idx = topo_order.get(pos + 1).copied(); + let is_direct = single_out + && next_idx.map_or(false, |n| { + downstream.get(&node_idx).map_or(false, |v| v[0].target == n) + }); + let next_single_in = next_idx + .and_then(|n| upstream.get(&n)) + .map_or(false, |v| v.len() == 1); + + if is_direct && next_single_in { + let etype = downstream.get(&node_idx).unwrap()[0].edge_type; + let _ = writeln!(out, " |"); + let _ = writeln!(out, " | {}", edge_type_label(etype)); + let _ = writeln!(out, " v"); + } + } + } + + out.trim_end().to_string() +} 
+ +#[cfg(test)] +mod tests { + use super::*; + use protocol::grpc::api::{ChainedOperator, FsEdge, FsNode, FsProgram}; + + fn make_node(node_index: i32, operators: Vec<(&str, &str)>, desc: &str, parallelism: u32) -> FsNode { + FsNode { + node_index, + node_id: node_index as u32, + parallelism, + description: desc.to_string(), + operators: operators + .into_iter() + .map(|(id, name)| ChainedOperator { + operator_id: id.to_string(), + operator_name: name.to_string(), + operator_config: Vec::new(), + }) + .collect(), + edges: Vec::new(), + } + } + + fn make_edge(source: i32, target: i32, edge_type: i32) -> FsEdge { + FsEdge { source, target, schema: None, edge_type } + } + + #[test] + fn empty_program_renders_placeholder() { + let program = FsProgram { nodes: vec![], edges: vec![], program_config: None }; + assert_eq!(render_program_topology(&program), "(empty topology)"); + } + + #[test] + fn linear_pipeline_renders_correctly() { + let program = FsProgram { + nodes: vec![ + make_node(0, vec![("src_0", "ConnectorSource")], "", 1), + make_node(1, vec![("val_1", "Value"), ("wm_2", "ExpressionWatermark")], "source -> watermark", 1), + make_node(2, vec![("sink_3", "ConnectorSink")], "sink (kafka)", 1), + ], + edges: vec![ + make_edge(0, 1, 1), + make_edge(1, 2, 1), + ], + program_config: None, + }; + let result = render_program_topology(&program); + assert!(result.contains("[Source] Node 0")); + assert!(result.contains("[Operator] Node 1")); + assert!(result.contains("[Sink] Node 2")); + assert!(result.contains("ConnectorSource")); + assert!(result.contains("Value -> ExpressionWatermark")); + assert!(result.contains("Forward")); + } + + #[test] + fn join_topology_shows_multiple_inputs() { + let program = FsProgram { + nodes: vec![ + make_node(0, vec![("src_a", "ConnectorSource")], "source A", 1), + make_node(1, vec![("src_b", "ConnectorSource")], "source B", 1), + make_node(2, vec![("join_0", "WindowJoin")], "join node", 2), + make_node(3, vec![("sink_0", 
"ConnectorSink")], "sink", 1), + ], + edges: vec![ + make_edge(0, 2, 3), // LeftJoin + make_edge(1, 2, 4), // RightJoin + make_edge(2, 3, 1), // Forward + ], + program_config: None, + }; + let result = render_program_topology(&program); + assert!(result.contains("inputs:")); + assert!(result.contains("LeftJoin")); + assert!(result.contains("RightJoin")); + assert!(result.contains("[Operator] Node 2")); + } +} diff --git a/src/sql/common/with_option_keys.rs b/src/sql/common/with_option_keys.rs new file mode 100644 index 00000000..a42f7405 --- /dev/null +++ b/src/sql/common/with_option_keys.rs @@ -0,0 +1,91 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ + + +pub const CONNECTOR: &str = "connector"; +pub const TYPE: &str = "type"; +pub const FORMAT: &str = "format"; +pub const DEFAULT_FORMAT_VALUE: &str = "json"; +pub const BAD_DATA: &str = "bad_data"; +pub const PARTITION_BY: &str = "partition_by"; + +pub const EVENT_TIME_FIELD: &str = "event_time_field"; +pub const WATERMARK_FIELD: &str = "watermark_field"; + +pub const IDLE_MICROS: &str = "idle_micros"; +pub const IDLE_TIME: &str = "idle_time"; + +pub const LOOKUP_CACHE_MAX_BYTES: &str = "lookup.cache.max_bytes"; +pub const LOOKUP_CACHE_TTL: &str = "lookup.cache.ttl"; + + +pub const CONNECTION_SCHEMA: &str = "connection_schema"; + + +pub const ADAPTER: &str = "adapter"; + +// ── Kafka ───────────────────────────────────────────────────────────────── + +pub const KAFKA_BOOTSTRAP_SERVERS: &str = "bootstrap.servers"; +pub const KAFKA_BOOTSTRAP_SERVERS_LEGACY: &str = "bootstrap_servers"; +pub const KAFKA_TOPIC: &str = "topic"; +pub const KAFKA_RATE_LIMIT_MESSAGES_PER_SECOND: &str = "rate_limit.messages_per_second"; +pub const KAFKA_VALUE_SUBJECT: &str = "value.subject"; +pub const KAFKA_SCAN_STARTUP_MODE: &str = "scan.startup.mode"; +pub const KAFKA_ISOLATION_LEVEL: &str = "isolation.level"; +pub const KAFKA_GROUP_ID: &str = "group.id"; +pub const KAFKA_GROUP_ID_LEGACY: &str = "group_id"; +pub const KAFKA_GROUP_ID_PREFIX: &str = "group.id.prefix"; +pub const KAFKA_SINK_COMMIT_MODE: &str = "sink.commit.mode"; +pub const KAFKA_SINK_KEY_FIELD: &str = "sink.key.field"; +pub const KAFKA_KEY_FIELD_LEGACY: &str = "key.field"; +pub const KAFKA_SINK_TIMESTAMP_FIELD: &str = "sink.timestamp.field"; +pub const KAFKA_TIMESTAMP_FIELD_LEGACY: &str = "timestamp.field"; + +// ── JSON format ─────────────────────────────────────────────────────────── + +pub const JSON_CONFLUENT_SCHEMA_REGISTRY: &str = "json.confluent_schema_registry"; +pub const JSON_CONFLUENT_SCHEMA_VERSION: &str = "json.confluent_schema_version"; +pub const JSON_INCLUDE_SCHEMA: &str = "json.include_schema"; +pub 
const JSON_DEBEZIUM: &str = "json.debezium"; +pub const JSON_UNSTRUCTURED: &str = "json.unstructured"; +pub const JSON_TIMESTAMP_FORMAT: &str = "json.timestamp_format"; +pub const JSON_DECIMAL_ENCODING: &str = "json.decimal_encoding"; +pub const JSON_COMPRESSION: &str = "json.compression"; + +// ── Avro ────────────────────────────────────────────────────────────────── + +pub const AVRO_CONFLUENT_SCHEMA_REGISTRY: &str = "avro.confluent_schema_registry"; +pub const AVRO_RAW_DATUMS: &str = "avro.raw_datums"; +pub const AVRO_INTO_UNSTRUCTURED_JSON: &str = "avro.into_unstructured_json"; +pub const AVRO_SCHEMA_ID: &str = "avro.schema_id"; + +// ── Parquet ─────────────────────────────────────────────────────────────── + +pub const PARQUET_COMPRESSION: &str = "parquet.compression"; +pub const PARQUET_ROW_GROUP_BYTES: &str = "parquet.row_group_bytes"; + +// ── Protobuf ──────────────────────────────────────────────────────────────── + +pub const PROTOBUF_INTO_UNSTRUCTURED_JSON: &str = "protobuf.into_unstructured_json"; +pub const PROTOBUF_MESSAGE_NAME: &str = "protobuf.message_name"; +pub const PROTOBUF_CONFLUENT_SCHEMA_REGISTRY: &str = "protobuf.confluent_schema_registry"; +pub const PROTOBUF_LENGTH_DELIMITED: &str = "protobuf.length_delimited"; + +// ── Framing ───────────────────────────────────────────────────────────────── + +pub const FRAMING_METHOD: &str = "framing.method"; +pub const FRAMING_MAX_LINE_LENGTH: &str = "framing.max_line_length"; + + +pub const FORMAT_DEBEZIUM_FLAG: &str = "format.debezium"; diff --git a/src/sql/datastream/logical.rs b/src/sql/datastream/logical.rs new file mode 100644 index 00000000..e26be9f3 --- /dev/null +++ b/src/sql/datastream/logical.rs @@ -0,0 +1,371 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use itertools::Itertools; + +use datafusion::arrow::datatypes::DataType; +use petgraph::Direction; +use petgraph::dot::Dot; +use petgraph::graph::DiGraph; +use std::collections::{HashMap, HashSet}; +use std::fmt::{Debug, Display, Formatter}; +use std::sync::Arc; +use datafusion_proto::protobuf::ArrowType; +use prost::Message; +use strum::{Display, EnumString}; +use protocol::grpc::api; +use crate::types::FsSchema; + +#[derive(Clone, Copy, Debug, Eq, PartialEq, EnumString, Display)] +pub enum OperatorName { + ExpressionWatermark, + ArrowValue, + ArrowKey, + Projection, + AsyncUdf, + Join, + InstantJoin, + LookupJoin, + WindowFunction, + TumblingWindowAggregate, + SlidingWindowAggregate, + SessionWindowAggregate, + UpdatingAggregate, + KeyBy, + ConnectorSource, + ConnectorSink, +} + +#[derive(Copy, Clone, Debug, Eq, PartialEq, PartialOrd, Ord)] +pub enum LogicalEdgeType { + Forward, + Shuffle, + LeftJoin, + RightJoin, +} + +impl Display for LogicalEdgeType { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + LogicalEdgeType::Forward => write!(f, "→"), + LogicalEdgeType::Shuffle => write!(f, "⤨"), + LogicalEdgeType::LeftJoin => write!(f, "-[left]⤨"), + LogicalEdgeType::RightJoin => write!(f, "-[right]⤨"), + } + } +} + +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct LogicalEdge { + pub edge_type: LogicalEdgeType, + pub schema: Arc, +} + +impl LogicalEdge { + pub fn new(edge_type: LogicalEdgeType, schema: FsSchema) -> Self { + LogicalEdge { + edge_type, + schema: Arc::new(schema), + } + } + + pub fn 
project_all(edge_type: LogicalEdgeType, schema: FsSchema) -> Self { + LogicalEdge { + edge_type, + schema: Arc::new(schema), + } + } +} + +#[derive(Clone, Debug)] +pub struct ChainedLogicalOperator { + pub operator_id: String, + pub operator_name: OperatorName, + pub operator_config: Vec, +} + +#[derive(Clone, Debug)] +pub struct OperatorChain { + pub(crate) operators: Vec, + pub(crate) edges: Vec>, +} + +impl OperatorChain { + pub fn new(operator: ChainedLogicalOperator) -> Self { + Self { + operators: vec![operator], + edges: vec![], + } + } + + pub fn iter( + &self, + ) -> impl Iterator>)> { + self.operators + .iter() + .zip_longest(self.edges.iter()) + .map(|e| e.left_and_right()) + .map(|(l, r)| (l.unwrap(), r)) + } + + pub fn iter_mut( + &mut self, + ) -> impl Iterator>)> { + self.operators + .iter_mut() + .zip_longest(self.edges.iter()) + .map(|e| e.left_and_right()) + .map(|(l, r)| (l.unwrap(), r)) + } + + pub fn first(&self) -> &ChainedLogicalOperator { + &self.operators[0] + } + + pub fn len(&self) -> usize { + self.operators.len() + } + + pub fn is_empty(&self) -> bool { + self.operators.is_empty() + } + + pub fn is_source(&self) -> bool { + self.operators[0].operator_name == OperatorName::ConnectorSource + } + + pub fn is_sink(&self) -> bool { + self.operators[0].operator_name == OperatorName::ConnectorSink + } +} + +#[derive(Clone)] +pub struct LogicalNode { + pub node_id: u32, + pub description: String, + pub operator_chain: OperatorChain, + pub parallelism: usize, +} + +impl LogicalNode { + pub fn single( + id: u32, + operator_id: String, + name: OperatorName, + config: Vec, + description: String, + parallelism: usize, + ) -> Self { + Self { + node_id: id, + description, + operator_chain: OperatorChain { + operators: vec![ChainedLogicalOperator { + operator_id, + operator_name: name, + operator_config: config, + }], + edges: vec![], + }, + parallelism, + } + } +} + +impl Display for LogicalNode { + fn fmt(&self, f: &mut Formatter<'_>) -> 
std::fmt::Result { + write!(f, "{}", self.description) + } +} + +impl Debug for LogicalNode { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!( + f, + "{}[{}]", + self.operator_chain + .operators + .iter() + .map(|op| op.operator_id.clone()) + .collect::>() + .join(" -> "), + self.parallelism + ) + } +} + +pub type LogicalGraph = DiGraph; + +pub trait Optimizer { + fn optimize_once(&self, plan: &mut LogicalGraph) -> bool; + + fn optimize(&self, plan: &mut LogicalGraph) { + loop { + if !self.optimize_once(plan) { + break; + } + } + } +} + +#[derive(Clone, Debug, Eq, PartialEq, Hash, PartialOrd)] +pub struct DylibUdfConfig { + pub dylib_path: String, + pub arg_types: Vec, + pub return_type: DataType, + pub aggregate: bool, + pub is_async: bool, +} + +#[derive(Clone, Debug, Eq, PartialEq, Hash)] +pub struct PythonUdfConfig { + pub arg_types: Vec, + pub return_type: DataType, + pub name: Arc, + pub definition: Arc, +} + +#[derive(Clone, Debug, Default)] +pub struct ProgramConfig { + pub udf_dylibs: HashMap, + pub python_udfs: HashMap, +} + +#[derive(Clone, Debug, Default)] +pub struct LogicalProgram { + pub graph: LogicalGraph, + pub program_config: ProgramConfig, +} + +impl LogicalProgram { + pub fn new(graph: LogicalGraph, program_config: ProgramConfig) -> Self { + Self { + graph, + program_config, + } + } + + pub fn optimize(&mut self, optimizer: &dyn Optimizer) { + optimizer.optimize(&mut self.graph); + } + + pub fn update_parallelism(&mut self, overrides: &HashMap) { + for node in self.graph.node_weights_mut() { + if let Some(p) = overrides.get(&node.node_id) { + node.parallelism = *p; + } + } + } + + pub fn dot(&self) -> String { + format!("{:?}", Dot::with_config(&self.graph, &[])) + } + + pub fn task_count(&self) -> usize { + self.graph.node_weights().map(|nw| nw.parallelism).sum() + } + + pub fn sources(&self) -> HashSet { + self.graph + .externals(Direction::Incoming) + .map(|t| self.graph.node_weight(t).unwrap().node_id) + .collect() + } 
+ + pub fn tasks_per_operator(&self) -> HashMap { + let mut tasks_per_operator = HashMap::new(); + for node in self.graph.node_weights() { + for op in &node.operator_chain.operators { + tasks_per_operator.insert(op.operator_id.clone(), node.parallelism); + } + } + tasks_per_operator + } + + pub fn operator_names_by_id(&self) -> HashMap { + let mut m = HashMap::new(); + for node in self.graph.node_weights() { + for op in &node.operator_chain.operators { + m.insert(op.operator_id.clone(), op.operator_name.to_string()); + } + } + m + } + + pub fn tasks_per_node(&self) -> HashMap { + let mut tasks_per_node = HashMap::new(); + for node in self.graph.node_weights() { + tasks_per_node.insert(node.node_id, node.parallelism); + } + tasks_per_node + } + + pub fn features(&self) -> HashSet { + let mut s = HashSet::new(); + for n in self.graph.node_weights() { + for t in &n.operator_chain.operators { + let Some(tag) = t.operator_name.feature_tag() else { + continue; + }; + s.insert(tag.to_string()); + } + } + s + } +} + + +impl From for api::DylibUdfConfig { + fn from(from: DylibUdfConfig) -> Self { + api::DylibUdfConfig { + dylib_path: from.dylib_path, + arg_types: from + .arg_types + .iter() + .map(|t| { + ArrowType::try_from(t) + .expect("unsupported data type") + .encode_to_vec() + }) + .collect(), + return_type: ArrowType::try_from(&from.return_type) + .expect("unsupported data type") + .encode_to_vec(), + aggregate: from.aggregate, + is_async: from.is_async, + } + } +} + +impl From for DylibUdfConfig { + fn from(from: api::DylibUdfConfig) -> Self { + DylibUdfConfig { + dylib_path: from.dylib_path, + arg_types: from + .arg_types + .iter() + .map(|t| { + DataType::try_from( + &ArrowType::decode(&mut t.as_slice()).expect("invalid arrow type"), + ) + .expect("invalid arrow type") + }) + .collect(), + return_type: DataType::try_from( + &ArrowType::decode(&mut from.return_type.as_slice()).unwrap(), + ) + .expect("invalid arrow type"), + aggregate: from.aggregate, + is_async: 
from.is_async, + } + } +} \ No newline at end of file diff --git a/src/sql/datastream/mod.rs b/src/sql/datastream/mod.rs new file mode 100644 index 00000000..922801f6 --- /dev/null +++ b/src/sql/datastream/mod.rs @@ -0,0 +1,13 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +pub mod logical; diff --git a/src/sql/extensions/aggregate.rs b/src/sql/extensions/aggregate.rs new file mode 100644 index 00000000..645315af --- /dev/null +++ b/src/sql/extensions/aggregate.rs @@ -0,0 +1,633 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::fmt::Formatter; +use std::sync::Arc; +use std::time::Duration; + +use arrow_array::types::IntervalMonthDayNanoType; +use datafusion::common::{Column, DFSchemaRef, Result, ScalarValue, internal_err}; +use datafusion::logical_expr::{ + self, expr::ScalarFunction, BinaryExpr, Expr, Extension, LogicalPlan, + UserDefinedLogicalNodeCore, +}; +use datafusion_common::{plan_err, DFSchema, DataFusionError}; +use datafusion_expr::Aggregate; +use datafusion_proto::physical_plan::{AsExecutionPlan, DefaultPhysicalExtensionCodec}; +use datafusion_proto::physical_plan::to_proto::serialize_physical_expr; +use datafusion_proto::protobuf::PhysicalPlanNode; +use prost::Message; +use protocol::grpc::api::{ + SessionWindowAggregateOperator, SlidingWindowAggregateOperator, TumblingWindowAggregateOperator, +}; + +use crate::multifield_partial_ord; +use crate::sql::common::constants::{extension_node, proto_operator_name}; +use crate::sql::common::{FsSchema, FsSchemaRef}; +use crate::sql::extensions::{ + CompiledTopologyNode, StreamingOperatorBlueprint, SystemTimestampInjectorNode, +}; +use crate::sql::logical_node::logical::{LogicalEdge, LogicalEdgeType, LogicalNode, OperatorName}; +use crate::sql::logical_planner::planner::{NamedNode, Planner, SplitPlanOutput}; +use crate::sql::physical::{window, FsPhysicalExtensionCodec}; +use crate::sql::types::{ + DFField, TIMESTAMP_FIELD, WindowBehavior, WindowType, fields_with_qualifiers, + schema_from_df_fields, schema_from_df_fields_with_metadata, +}; + +pub(crate) const STREAM_AGG_EXTENSION_NAME: &str = extension_node::STREAM_WINDOW_AGGREGATE; + +/// Represents a streaming windowed aggregation node in the logical plan. 
+#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub(crate) struct StreamWindowAggregateNode { + pub(crate) window_spec: WindowBehavior, + pub(crate) base_agg_plan: LogicalPlan, + pub(crate) output_schema: DFSchemaRef, + pub(crate) partition_keys: Vec, + pub(crate) post_aggregation_plan: LogicalPlan, +} + +multifield_partial_ord!( + StreamWindowAggregateNode, + base_agg_plan, + partition_keys, + post_aggregation_plan +); + +impl StreamWindowAggregateNode { + /// Safely constructs a new node, computing the final projection without panicking. + pub fn try_new( + window_spec: WindowBehavior, + base_agg_plan: LogicalPlan, + partition_keys: Vec, + ) -> Result { + let post_aggregation_plan = + WindowBoundaryMath::build_post_aggregation(&base_agg_plan, window_spec.clone())?; + + Ok(Self { + window_spec, + base_agg_plan, + output_schema: post_aggregation_plan.schema().clone(), + partition_keys, + post_aggregation_plan, + }) + } + + fn build_tumbling_operator( + &self, + planner: &Planner, + node_id: usize, + input_schema: DFSchemaRef, + duration: Duration, + ) -> Result { + let binning_expr = planner.binning_function_proto(duration, input_schema.clone())?; + + let SplitPlanOutput { + partial_aggregation_plan, + partial_schema, + finish_plan, + } = planner.split_physical_plan(self.partition_keys.clone(), &self.base_agg_plan, true)?; + + let final_physical = planner.sync_plan(&self.post_aggregation_plan)?; + let final_physical_proto = PhysicalPlanNode::try_from_physical_plan( + final_physical, + &FsPhysicalExtensionCodec::default(), + )?; + + let operator_config = TumblingWindowAggregateOperator { + name: proto_operator_name::TUMBLING_WINDOW.to_string(), + width_micros: duration.as_micros() as u64, + binning_function: binning_expr.encode_to_vec(), + input_schema: Some( + FsSchema::from_schema_keys( + Arc::new(input_schema.as_ref().into()), + self.partition_keys.clone(), + )? 
+ .into(), + ), + partial_schema: Some(partial_schema.into()), + partial_aggregation_plan: partial_aggregation_plan.encode_to_vec(), + final_aggregation_plan: finish_plan.encode_to_vec(), + final_projection: Some(final_physical_proto.encode_to_vec()), + }; + + Ok(LogicalNode::single( + node_id as u32, + format!("tumbling_{node_id}"), + OperatorName::TumblingWindowAggregate, + operator_config.encode_to_vec(), + format!("TumblingWindow<{}>", operator_config.name), + 1, + )) + } + + fn build_sliding_operator( + &self, + planner: &Planner, + node_id: usize, + input_schema: DFSchemaRef, + duration: Duration, + slide_interval: Duration, + ) -> Result { + let binning_expr = planner.binning_function_proto(slide_interval, input_schema.clone())?; + + let SplitPlanOutput { + partial_aggregation_plan, + partial_schema, + finish_plan, + } = planner.split_physical_plan(self.partition_keys.clone(), &self.base_agg_plan, true)?; + + let final_physical = planner.sync_plan(&self.post_aggregation_plan)?; + let final_physical_proto = PhysicalPlanNode::try_from_physical_plan( + final_physical, + &FsPhysicalExtensionCodec::default(), + )?; + + let operator_config = SlidingWindowAggregateOperator { + name: format!("SlidingWindow<{duration:?}>"), + width_micros: duration.as_micros() as u64, + slide_micros: slide_interval.as_micros() as u64, + binning_function: binning_expr.encode_to_vec(), + input_schema: Some( + FsSchema::from_schema_keys( + Arc::new(input_schema.as_ref().into()), + self.partition_keys.clone(), + )? 
+ .into(), + ), + partial_schema: Some(partial_schema.into()), + partial_aggregation_plan: partial_aggregation_plan.encode_to_vec(), + final_aggregation_plan: finish_plan.encode_to_vec(), + final_projection: final_physical_proto.encode_to_vec(), + }; + + Ok(LogicalNode::single( + node_id as u32, + format!("sliding_window_{node_id}"), + OperatorName::SlidingWindowAggregate, + operator_config.encode_to_vec(), + proto_operator_name::SLIDING_WINDOW_LABEL.to_string(), + 1, + )) + } + + fn build_session_operator( + &self, + planner: &Planner, + node_id: usize, + input_schema: DFSchemaRef, + ) -> Result { + let WindowBehavior::FromOperator { + window: WindowType::Session { gap }, + window_index, + window_field, + is_nested: false, + } = &self.window_spec + else { + return plan_err!("Expected standard session window configuration"); + }; + + let output_fields = fields_with_qualifiers(self.base_agg_plan.schema()); + let LogicalPlan::Aggregate(base_agg) = self.base_agg_plan.clone() else { + return plan_err!("Base plan must be an Aggregate node"); + }; + + let key_count = self.partition_keys.len(); + let unkeyed_schema = Arc::new(schema_from_df_fields_with_metadata( + &output_fields[key_count..], + self.base_agg_plan.schema().metadata().clone(), + )?); + + let unkeyed_agg_node = Aggregate::try_new_with_schema( + base_agg.input.clone(), + vec![], + base_agg.aggr_expr.clone(), + unkeyed_schema, + )?; + + let physical_agg = planner.sync_plan(&LogicalPlan::Aggregate(unkeyed_agg_node))?; + let physical_agg_proto = PhysicalPlanNode::try_from_physical_plan( + physical_agg, + &FsPhysicalExtensionCodec::default(), + )?; + + let operator_config = SessionWindowAggregateOperator { + name: format!("session_window_{node_id}"), + gap_micros: gap.as_micros() as u64, + window_field_name: window_field.name().to_string(), + window_index: *window_index as u64, + input_schema: Some( + FsSchema::from_schema_keys( + Arc::new(input_schema.as_ref().into()), + self.partition_keys.clone(), + )? 
+ .into(), + ), + unkeyed_aggregate_schema: None, + partial_aggregation_plan: vec![], + final_aggregation_plan: physical_agg_proto.encode_to_vec(), + }; + + Ok(LogicalNode::single( + node_id as u32, + format!("SessionWindow<{gap:?}>"), + OperatorName::SessionWindowAggregate, + operator_config.encode_to_vec(), + operator_config.name.clone(), + 1, + )) + } + + fn build_instant_operator( + &self, + planner: &Planner, + node_id: usize, + input_schema: DFSchemaRef, + apply_final_projection: bool, + ) -> Result { + let ts_column_expr = + Expr::Column(Column::new_unqualified(TIMESTAMP_FIELD.to_string())); + let binning_expr = planner.create_physical_expr(&ts_column_expr, &input_schema)?; + let binning_proto = serialize_physical_expr(&binning_expr, &DefaultPhysicalExtensionCodec {})?; + + let final_projection_payload = if apply_final_projection { + let physical_plan = planner.sync_plan(&self.post_aggregation_plan)?; + let proto_node = PhysicalPlanNode::try_from_physical_plan( + physical_plan, + &FsPhysicalExtensionCodec::default(), + )?; + Some(proto_node.encode_to_vec()) + } else { + None + }; + + let SplitPlanOutput { + partial_aggregation_plan, + partial_schema, + finish_plan, + } = planner.split_physical_plan(self.partition_keys.clone(), &self.base_agg_plan, true)?; + + let operator_config = TumblingWindowAggregateOperator { + name: proto_operator_name::INSTANT_WINDOW.to_string(), + width_micros: 0, + binning_function: binning_proto.encode_to_vec(), + input_schema: Some( + FsSchema::from_schema_keys( + Arc::new(input_schema.as_ref().into()), + self.partition_keys.clone(), + )? 
+ .into(), + ), + partial_schema: Some(partial_schema.into()), + partial_aggregation_plan: partial_aggregation_plan.encode_to_vec(), + final_aggregation_plan: finish_plan.encode_to_vec(), + final_projection: final_projection_payload, + }; + + Ok(LogicalNode::single( + node_id as u32, + format!("instant_window_{node_id}"), + OperatorName::TumblingWindowAggregate, + operator_config.encode_to_vec(), + proto_operator_name::INSTANT_WINDOW_LABEL.to_string(), + 1, + )) + } +} + +impl StreamingOperatorBlueprint for StreamWindowAggregateNode { + fn operator_identity(&self) -> Option { + None + } + + fn compile_to_graph_node( + &self, + planner: &Planner, + node_id: usize, + mut input_schemas: Vec, + ) -> Result { + if input_schemas.len() != 1 { + return plan_err!("StreamWindowAggregateNode requires exactly one input schema"); + } + + let raw_schema = input_schemas.remove(0); + let df_schema = Arc::new(DFSchema::try_from(raw_schema.schema.as_ref().clone())?); + + let logical_operator = match &self.window_spec { + WindowBehavior::FromOperator { window, is_nested, .. } => { + if *is_nested { + self.build_instant_operator(planner, node_id, df_schema, true)? + } else { + match window { + WindowType::Tumbling { width } => { + self.build_tumbling_operator(planner, node_id, df_schema, *width)? + } + WindowType::Sliding { width, slide } => { + self.build_sliding_operator(planner, node_id, df_schema, *width, *slide)? + } + WindowType::Session { .. } => { + self.build_session_operator(planner, node_id, df_schema)? 
+ } + WindowType::Instant => { + return plan_err!( + "Instant window is invalid within standard operator context" + ); + } + } + } + } + WindowBehavior::InData => self + .build_instant_operator(planner, node_id, df_schema, false) + .map_err(|e| e.context("Failed compiling instant window"))?, + }; + + let link = LogicalEdge::project_all(LogicalEdgeType::Shuffle, (*raw_schema).clone()); + Ok(CompiledTopologyNode { + execution_unit: logical_operator, + routing_edges: vec![link], + }) + } + + fn yielded_schema(&self) -> FsSchema { + let schema_ref = (*self.output_schema).clone().into(); + FsSchema::from_schema_unkeyed(Arc::new(schema_ref)).expect( + "StreamWindowAggregateNode output schema must contain timestamp column", + ) + } +} + +impl UserDefinedLogicalNodeCore for StreamWindowAggregateNode { + fn name(&self) -> &str { + STREAM_AGG_EXTENSION_NAME + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + vec![&self.base_agg_plan] + } + + fn schema(&self) -> &DFSchemaRef { + &self.output_schema + } + + fn expressions(&self) -> Vec { + vec![] + } + + fn fmt_for_explain(&self, f: &mut Formatter) -> std::fmt::Result { + let spec_desc = match &self.window_spec { + WindowBehavior::InData => "InData".to_string(), + WindowBehavior::FromOperator { window, .. 
} => format!("FromOperator({window:?})"), + }; + write!( + f, + "StreamWindowAggregate: {} | spec: {}", + self.schema(), + spec_desc + ) + } + + fn with_exprs_and_inputs(&self, _exprs: Vec, inputs: Vec) -> Result { + if inputs.len() != 1 { + return internal_err!("StreamWindowAggregateNode expects exactly 1 input"); + } + Self::try_new( + self.window_spec.clone(), + inputs[0].clone(), + self.partition_keys.clone(), + ) + } +} + +// ----------------------------------------------------------------------------- +// Dedicated boundary math for window bin / post-aggregation projection +// ----------------------------------------------------------------------------- + +struct WindowBoundaryMath; + +impl WindowBoundaryMath { + fn interval_nanos(nanos: i64) -> Expr { + Expr::Literal( + ScalarValue::IntervalMonthDayNano(Some( + IntervalMonthDayNanoType::make_value(0, 0, nanos), + )), + None, + ) + } + + fn build_post_aggregation( + agg_plan: &LogicalPlan, + window_spec: WindowBehavior, + ) -> Result { + let ts_field: DFField = agg_plan + .inputs() + .first() + .ok_or_else(|| DataFusionError::Plan("Aggregate has no inputs".into()))? + .schema() + .qualified_field_with_unqualified_name(TIMESTAMP_FIELD)? + .into(); + + let plan_with_ts = LogicalPlan::Extension(Extension { + node: Arc::new(SystemTimestampInjectorNode::try_new( + agg_plan.clone(), + ts_field.qualifier().cloned(), + )?), + }); + + let (win_field, win_index, duration, is_nested) = match window_spec { + WindowBehavior::InData => return Ok(plan_with_ts), + WindowBehavior::FromOperator { + window, + window_field, + window_index, + is_nested, + } => match window { + WindowType::Tumbling { width } | WindowType::Sliding { width, .. } => { + (window_field, window_index, width, is_nested) + } + WindowType::Session { .. 
} => { + return Ok(LogicalPlan::Extension(Extension { + node: Arc::new(InjectWindowFieldNode::try_new( + plan_with_ts, + window_field, + window_index, + )?), + })); + } + WindowType::Instant => return Ok(plan_with_ts), + }, + }; + + if is_nested { + return Self::build_nested_projection(plan_with_ts, win_field, win_index, duration); + } + + let mut output_fields = fields_with_qualifiers(agg_plan.schema()); + let mut projections: Vec<_> = output_fields + .iter() + .map(|f| Expr::Column(f.qualified_column())) + .collect(); + + let ts_col_expr = Expr::Column(Column::new(ts_field.qualifier().cloned(), ts_field.name())); + + output_fields.insert(win_index, win_field.clone()); + + let win_func_expr = Expr::ScalarFunction(ScalarFunction { + func: window(), + args: vec![ + ts_col_expr.clone(), + Expr::BinaryExpr(BinaryExpr { + left: Box::new(ts_col_expr.clone()), + op: logical_expr::Operator::Plus, + right: Box::new(Self::interval_nanos(duration.as_nanos() as i64)), + }), + ], + }); + + projections.insert( + win_index, + win_func_expr.alias_qualified(win_field.qualifier().cloned(), win_field.name()), + ); + + output_fields.push(ts_field); + + let bin_end_expr = Expr::BinaryExpr(BinaryExpr { + left: Box::new(ts_col_expr), + op: logical_expr::Operator::Plus, + right: Box::new(Self::interval_nanos((duration.as_nanos() - 1) as i64)), + }); + projections.push(bin_end_expr); + + Ok(LogicalPlan::Projection(logical_expr::Projection::try_new_with_schema( + projections, + Arc::new(plan_with_ts), + Arc::new(schema_from_df_fields(&output_fields)?), + )?)) + } + + fn build_nested_projection( + plan: LogicalPlan, + win_field: DFField, + win_index: usize, + duration: Duration, + ) -> Result { + let ts_field: DFField = plan + .schema() + .qualified_field_with_unqualified_name(TIMESTAMP_FIELD)? 
+ .into(); + let ts_col_expr = Expr::Column(Column::new(ts_field.qualifier().cloned(), ts_field.name())); + + let mut output_fields = fields_with_qualifiers(plan.schema()); + let mut projections: Vec<_> = output_fields + .iter() + .map(|f| Expr::Column(f.qualified_column())) + .collect(); + + output_fields.insert(win_index, win_field.clone()); + + let win_func_expr = Expr::ScalarFunction(ScalarFunction { + func: window(), + args: vec![ + Expr::BinaryExpr(BinaryExpr { + left: Box::new(ts_col_expr.clone()), + op: logical_expr::Operator::Minus, + right: Box::new(Self::interval_nanos(duration.as_nanos() as i64 - 1)), + }), + Expr::BinaryExpr(BinaryExpr { + left: Box::new(ts_col_expr), + op: logical_expr::Operator::Plus, + right: Box::new(Self::interval_nanos(1)), + }), + ], + }); + + projections.insert( + win_index, + win_func_expr.alias_qualified(win_field.qualifier().cloned(), win_field.name()), + ); + + Ok(LogicalPlan::Projection(logical_expr::Projection::try_new_with_schema( + projections, + Arc::new(plan), + Arc::new(schema_from_df_fields(&output_fields)?), + )?)) + } +} + +// ----------------------------------------------------------------------------- +// Field injection node (session window column placement) +// ----------------------------------------------------------------------------- + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +struct InjectWindowFieldNode { + pub(crate) upstream_plan: LogicalPlan, + pub(crate) target_field: DFField, + pub(crate) insertion_index: usize, + pub(crate) new_schema: DFSchemaRef, +} + +multifield_partial_ord!(InjectWindowFieldNode, upstream_plan, insertion_index); + +impl InjectWindowFieldNode { + fn try_new( + upstream_plan: LogicalPlan, + target_field: DFField, + insertion_index: usize, + ) -> Result { + let mut fields = fields_with_qualifiers(upstream_plan.schema()); + fields.insert(insertion_index, target_field.clone()); + let meta = upstream_plan.schema().metadata().clone(); + + Ok(Self { + upstream_plan, + 
target_field, + insertion_index, + new_schema: Arc::new(schema_from_df_fields_with_metadata(&fields, meta)?), + }) + } +} + +impl UserDefinedLogicalNodeCore for InjectWindowFieldNode { + fn name(&self) -> &str { + "InjectWindowFieldNode" + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + vec![&self.upstream_plan] + } + + fn schema(&self) -> &DFSchemaRef { + &self.new_schema + } + + fn expressions(&self) -> Vec { + vec![] + } + + fn fmt_for_explain(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!( + f, + "InjectWindowField: insert {:?} at offset {}", + self.target_field, self.insertion_index + ) + } + + fn with_exprs_and_inputs(&self, _exprs: Vec, inputs: Vec) -> Result { + if inputs.len() != 1 { + return internal_err!("InjectWindowFieldNode expects exactly 1 input"); + } + Self::try_new( + inputs[0].clone(), + self.target_field.clone(), + self.insertion_index, + ) + } +} diff --git a/src/sql/extensions/async_udf.rs b/src/sql/extensions/async_udf.rs new file mode 100644 index 00000000..ee2ce60a --- /dev/null +++ b/src/sql/extensions/async_udf.rs @@ -0,0 +1,243 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::fmt::Formatter; +use std::sync::Arc; +use std::time::Duration; + +use datafusion::common::{DFSchemaRef, Result}; +use datafusion::logical_expr::{ + Expr, LogicalPlan, UserDefinedLogicalNode, UserDefinedLogicalNodeCore, +}; +use datafusion_common::{internal_err, plan_err}; +use datafusion_proto::physical_plan::DefaultPhysicalExtensionCodec; +use datafusion_proto::physical_plan::to_proto::serialize_physical_expr; +use prost::Message; +use protocol::grpc::api::{AsyncUdfOperator, AsyncUdfOrdering}; + +use crate::multifield_partial_ord; +use crate::sql::common::constants::extension_node; +use crate::sql::common::{FsSchema, FsSchemaRef}; +use crate::sql::extensions::streaming_operator_blueprint::{CompiledTopologyNode, StreamingOperatorBlueprint}; +use crate::sql::logical_node::logical::{ + DylibUdfConfig, LogicalEdge, LogicalEdgeType, LogicalNode, OperatorName, +}; +use crate::sql::common::constants::sql_field; +use crate::sql::logical_planner::planner::{NamedNode, Planner}; +use crate::sql::types::{DFField, fields_with_qualifiers, schema_from_df_fields}; + +pub(crate) const NODE_TYPE_NAME: &str = extension_node::ASYNC_FUNCTION_EXECUTION; + +/// Represents a logical node that executes an external asynchronous function (UDF) +/// and projects the final results into the streaming pipeline. 
+#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub(crate) struct AsyncFunctionExecutionNode { + pub(crate) upstream_plan: Arc, + pub(crate) operator_name: String, + pub(crate) function_config: DylibUdfConfig, + pub(crate) invocation_args: Vec, + pub(crate) result_projections: Vec, + pub(crate) preserve_ordering: bool, + pub(crate) concurrency_limit: usize, + pub(crate) execution_timeout: Duration, + pub(crate) resolved_schema: DFSchemaRef, +} + +multifield_partial_ord!( + AsyncFunctionExecutionNode, + upstream_plan, + operator_name, + function_config, + invocation_args, + result_projections, + preserve_ordering, + concurrency_limit, + execution_timeout +); + +impl AsyncFunctionExecutionNode { + /// Compiles logical expressions into serialized physical protobuf bytes. + fn compile_physical_expressions( + &self, + planner: &Planner, + expressions: &[Expr], + schema_context: &DFSchemaRef, + ) -> Result>> { + expressions + .iter() + .map(|logical_expr| { + let physical_expr = planner.create_physical_expr(logical_expr, schema_context)?; + let serialized = + serialize_physical_expr(&physical_expr, &DefaultPhysicalExtensionCodec {})?; + Ok(serialized.encode_to_vec()) + }) + .collect() + } + + /// Computes the intermediate schema which bridges the upstream output + /// and the raw asynchronous result injected by the UDF execution. 
+ fn compute_intermediate_schema(&self) -> Result { + let mut fields = fields_with_qualifiers(self.upstream_plan.schema()); + + let raw_result_field = DFField::new( + None, + sql_field::ASYNC_RESULT, + self.function_config.return_type.clone(), + true, + ); + fields.push(raw_result_field); + + Ok(Arc::new(schema_from_df_fields(&fields)?)) + } + + fn to_protobuf_config( + &self, + compiled_args: Vec>, + compiled_projections: Vec>, + ) -> AsyncUdfOperator { + let ordering_strategy = if self.preserve_ordering { + AsyncUdfOrdering::Ordered + } else { + AsyncUdfOrdering::Unordered + }; + + AsyncUdfOperator { + name: self.operator_name.clone(), + udf: Some(self.function_config.clone().into()), + arg_exprs: compiled_args, + final_exprs: compiled_projections, + ordering: ordering_strategy as i32, + max_concurrency: self.concurrency_limit as u32, + timeout_micros: self.execution_timeout.as_micros() as u64, + } + } +} + +impl StreamingOperatorBlueprint for AsyncFunctionExecutionNode { + fn operator_identity(&self) -> Option { + None + } + + fn compile_to_graph_node( + &self, + planner: &Planner, + node_index: usize, + mut input_schemas: Vec, + ) -> Result { + if input_schemas.len() != 1 { + return plan_err!("AsyncFunctionExecutionNode requires exactly one input schema"); + } + + let compiled_args = self.compile_physical_expressions( + planner, + &self.invocation_args, + self.upstream_plan.schema(), + )?; + + let intermediate_schema = self.compute_intermediate_schema()?; + let compiled_projections = self.compile_physical_expressions( + planner, + &self.result_projections, + &intermediate_schema, + )?; + + let operator_config = self.to_protobuf_config(compiled_args, compiled_projections); + + let logical_node = LogicalNode::single( + node_index as u32, + format!("async_udf_{node_index}"), + OperatorName::AsyncUdf, + operator_config.encode_to_vec(), + format!("AsyncUdf<{}>", self.operator_name), + 1, + ); + + let upstream_schema = input_schemas.remove(0); + let data_edge = + 
LogicalEdge::project_all(LogicalEdgeType::Forward, (*upstream_schema).clone()); + + Ok(CompiledTopologyNode { + execution_unit: logical_node, + routing_edges: vec![data_edge], + }) + } + + fn yielded_schema(&self) -> FsSchema { + let arrow_fields: Vec<_> = self + .resolved_schema + .fields() + .iter() + .map(|f| (**f).clone()) + .collect(); + + FsSchema::from_fields(arrow_fields) + } +} + +impl UserDefinedLogicalNodeCore for AsyncFunctionExecutionNode { + fn name(&self) -> &str { + NODE_TYPE_NAME + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + vec![&self.upstream_plan] + } + + fn schema(&self) -> &DFSchemaRef { + &self.resolved_schema + } + + fn expressions(&self) -> Vec { + self.invocation_args + .iter() + .chain(self.result_projections.iter()) + .cloned() + .collect() + } + + fn fmt_for_explain(&self, f: &mut Formatter) -> std::fmt::Result { + write!( + f, + "AsyncFunctionExecution<{}>: Concurrency={}, Ordered={}", + self.operator_name, + self.concurrency_limit, + self.preserve_ordering + ) + } + + fn with_exprs_and_inputs(&self, exprs: Vec, mut inputs: Vec) -> Result { + if inputs.len() != 1 { + return internal_err!( + "AsyncFunctionExecutionNode expects exactly 1 input, but received {}", + inputs.len() + ); + } + + if UserDefinedLogicalNode::expressions(self) != exprs { + return internal_err!( + "Attempted to mutate async UDF expressions during logical planning, which is not supported." 
+ ); + } + + Ok(Self { + upstream_plan: Arc::new(inputs.remove(0)), + operator_name: self.operator_name.clone(), + function_config: self.function_config.clone(), + invocation_args: self.invocation_args.clone(), + result_projections: self.result_projections.clone(), + preserve_ordering: self.preserve_ordering, + concurrency_limit: self.concurrency_limit, + execution_timeout: self.execution_timeout, + resolved_schema: self.resolved_schema.clone(), + }) + } +} diff --git a/src/sql/extensions/debezium.rs b/src/sql/extensions/debezium.rs new file mode 100644 index 00000000..2afda2b4 --- /dev/null +++ b/src/sql/extensions/debezium.rs @@ -0,0 +1,384 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::sync::Arc; + +use arrow_schema::{DataType, Field, Schema}; +use datafusion::common::{ + internal_err, plan_err, DFSchema, DFSchemaRef, DataFusionError, Result, TableReference, +}; +use datafusion::logical_expr::{Expr, LogicalPlan, UserDefinedLogicalNodeCore}; +use datafusion::physical_plan::DisplayAs; + +use crate::multifield_partial_ord; +use crate::sql::common::constants::{cdc, extension_node}; +use crate::sql::common::{FsSchema, FsSchemaRef, UPDATING_META_FIELD}; +use crate::sql::logical_planner::planner::{NamedNode, Planner}; +use crate::sql::physical::updating_meta_field; +use crate::sql::types::TIMESTAMP_FIELD; + +use super::{CompiledTopologyNode, StreamingOperatorBlueprint}; + +// ----------------------------------------------------------------------------- +// Constants & Identifiers +// ----------------------------------------------------------------------------- + +pub(crate) const UNROLL_NODE_NAME: &str = extension_node::UNROLL_DEBEZIUM_PAYLOAD; +pub(crate) const PACK_NODE_NAME: &str = extension_node::PACK_DEBEZIUM_ENVELOPE; + +// ----------------------------------------------------------------------------- +// Core Schema Codec +// ----------------------------------------------------------------------------- + +/// Transforms between flat schemas and Debezium CDC envelopes. +pub(crate) struct DebeziumSchemaCodec; + +impl DebeziumSchemaCodec { + /// Wraps a flat physical schema into a Debezium CDC envelope structure. 
+ pub(crate) fn wrap_into_envelope( + flat_schema: &DFSchemaRef, + qualifier_override: Option, + ) -> Result { + let ts_field = if flat_schema.has_column_with_unqualified_name(TIMESTAMP_FIELD) { + Some(flat_schema.field_with_unqualified_name(TIMESTAMP_FIELD)?.clone()) + } else { + None + }; + + let payload_fields: Vec<_> = flat_schema + .fields() + .iter() + .filter(|f| f.name() != TIMESTAMP_FIELD && f.name() != UPDATING_META_FIELD) + .cloned() + .collect(); + + let payload_struct_type = DataType::Struct(payload_fields.into()); + + let mut envelope_fields = vec![ + Arc::new(Field::new( + cdc::BEFORE, + payload_struct_type.clone(), + true, + )), + Arc::new(Field::new(cdc::AFTER, payload_struct_type, true)), + Arc::new(Field::new(cdc::OP, DataType::Utf8, true)), + ]; + + if let Some(ts) = ts_field { + envelope_fields.push(Arc::new(ts)); + } + + let arrow_schema = Schema::new(envelope_fields); + let final_schema = match qualifier_override { + Some(qualifier) => DFSchema::try_from_qualified_schema(qualifier, &arrow_schema)?, + None => DFSchema::try_from(arrow_schema)?, + }; + + Ok(Arc::new(final_schema)) + } +} + +// ----------------------------------------------------------------------------- +// Logical Node: Unroll Debezium Payload +// ----------------------------------------------------------------------------- + +/// Decodes an incoming Debezium envelope into a flat, updating stream representation. 
+#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct UnrollDebeziumPayloadNode { + upstream_plan: LogicalPlan, + resolved_schema: DFSchemaRef, + pub pk_indices: Vec, + pk_names: Arc>, +} + +multifield_partial_ord!( + UnrollDebeziumPayloadNode, + upstream_plan, + pk_indices, + pk_names +); + +impl UnrollDebeziumPayloadNode { + pub fn try_new(upstream_plan: LogicalPlan, pk_names: Arc>) -> Result { + let input_schema = upstream_plan.schema(); + + let (before_idx, after_idx) = Self::validate_envelope_structure(input_schema)?; + + let payload_fields = Self::extract_payload_fields(input_schema, before_idx)?; + + let pk_indices = Self::map_primary_keys(payload_fields, &pk_names)?; + + let qualifier = Self::resolve_schema_qualifier(input_schema, before_idx, after_idx)?; + + let resolved_schema = + Self::compile_unrolled_schema(input_schema, payload_fields, qualifier)?; + + Ok(Self { + upstream_plan, + resolved_schema, + pk_indices, + pk_names, + }) + } + + fn validate_envelope_structure(schema: &DFSchemaRef) -> Result<(usize, usize)> { + let before_idx = schema.index_of_column_by_name(None, cdc::BEFORE).ok_or_else( + || DataFusionError::Plan("Missing 'before' state column in CDC stream".into()), + )?; + + let after_idx = schema.index_of_column_by_name(None, cdc::AFTER).ok_or_else( + || DataFusionError::Plan("Missing 'after' state column in CDC stream".into()), + )?; + + let op_idx = schema.index_of_column_by_name(None, cdc::OP).ok_or_else(|| { + DataFusionError::Plan("Missing 'op' operation column in CDC stream".into()) + })?; + + let before_type = schema.field(before_idx).data_type(); + let after_type = schema.field(after_idx).data_type(); + + if before_type != after_type { + return plan_err!( + "State column type mismatch: 'before' is {before_type}, but 'after' is {after_type}" + ); + } + + if *schema.field(op_idx).data_type() != DataType::Utf8 { + return plan_err!( + "The '{}' column must be of type Utf8", + cdc::OP + ); + } + + Ok((before_idx, after_idx)) + } + + 
fn extract_payload_fields<'a>( + schema: &'a DFSchemaRef, + state_idx: usize, + ) -> Result<&'a arrow_schema::Fields> { + match schema.field(state_idx).data_type() { + DataType::Struct(fields) => Ok(fields), + other => plan_err!("State columns must be of type Struct, found {other}"), + } + } + + fn map_primary_keys( + fields: &arrow_schema::Fields, + pk_names: &[String], + ) -> Result> { + pk_names + .iter() + .map(|pk| fields.find(pk).map(|(idx, _)| idx)) + .collect::>>() + .ok_or_else(|| { + DataFusionError::Plan("Specified primary key not found in payload schema".into()) + }) + } + + fn resolve_schema_qualifier( + schema: &DFSchemaRef, + before_idx: usize, + after_idx: usize, + ) -> Result> { + let before_qualifier = schema.qualified_field(before_idx).0; + let after_qualifier = schema.qualified_field(after_idx).0; + + match (before_qualifier, after_qualifier) { + (Some(bq), Some(aq)) if bq == aq => Ok(Some(bq.clone())), + (None, None) => Ok(None), + _ => plan_err!( + "'before' and 'after' columns must share the same namespace/qualifier" + ), + } + } + + fn compile_unrolled_schema( + original_schema: &DFSchemaRef, + payload_fields: &arrow_schema::Fields, + qualifier: Option, + ) -> Result { + let mut flat_fields = payload_fields.to_vec(); + + flat_fields.push(updating_meta_field()); + + let ts_idx = original_schema + .index_of_column_by_name(None, TIMESTAMP_FIELD) + .ok_or_else(|| { + DataFusionError::Plan(format!( + "Required event time field '{TIMESTAMP_FIELD}' is missing" + )) + })?; + + flat_fields.push(Arc::new(original_schema.field(ts_idx).clone())); + + let arrow_schema = Schema::new(flat_fields); + let compiled_schema = match qualifier { + Some(q) => DFSchema::try_from_qualified_schema(q, &arrow_schema)?, + None => DFSchema::try_from(arrow_schema)?, + }; + + Ok(Arc::new(compiled_schema)) + } +} + +impl UserDefinedLogicalNodeCore for UnrollDebeziumPayloadNode { + fn name(&self) -> &str { + UNROLL_NODE_NAME + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + 
vec![&self.upstream_plan] + } + + fn schema(&self) -> &DFSchemaRef { + &self.resolved_schema + } + + fn expressions(&self) -> Vec { + vec![] + } + + fn fmt_for_explain(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(f, "UnrollDebeziumPayload") + } + + fn with_exprs_and_inputs(&self, _exprs: Vec, mut inputs: Vec) -> Result { + if inputs.len() != 1 { + return internal_err!( + "UnrollDebeziumPayloadNode expects exactly 1 input, got {}", + inputs.len() + ); + } + Self::try_new(inputs.remove(0), self.pk_names.clone()) + } +} + +impl StreamingOperatorBlueprint for UnrollDebeziumPayloadNode { + fn operator_identity(&self) -> Option { + None + } + + fn is_passthrough_boundary(&self) -> bool { + true + } + + fn compile_to_graph_node( + &self, + _: &Planner, + _: usize, + _: Vec, + ) -> Result { + plan_err!("UnrollDebeziumPayloadNode is a logical boundary and should not be physically planned") + } + + fn yielded_schema(&self) -> FsSchema { + FsSchema::from_schema_unkeyed(Arc::new(self.resolved_schema.as_ref().into())).unwrap_or_else( + |_| panic!("Failed to extract physical schema for {}", UNROLL_NODE_NAME), + ) + } +} + +// ----------------------------------------------------------------------------- +// Logical Node: Pack Debezium Envelope +// ----------------------------------------------------------------------------- + +/// Encodes a flat updating stream back into a Debezium CDC envelope representation. 
+#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub(crate) struct PackDebeziumEnvelopeNode { + upstream_plan: Arc, + envelope_schema: DFSchemaRef, +} + +multifield_partial_ord!(PackDebeziumEnvelopeNode, upstream_plan); + +impl PackDebeziumEnvelopeNode { + pub(crate) fn try_new(upstream_plan: LogicalPlan) -> Result { + let envelope_schema = DebeziumSchemaCodec::wrap_into_envelope(upstream_plan.schema(), None) + .map_err(|e| { + DataFusionError::Plan(format!("Failed to compile Debezium envelope schema: {e}")) + })?; + + Ok(Self { + upstream_plan: Arc::new(upstream_plan), + envelope_schema, + }) + } +} + +impl DisplayAs for PackDebeziumEnvelopeNode { + fn fmt_as( + &self, + _t: datafusion::physical_plan::DisplayFormatType, + f: &mut std::fmt::Formatter, + ) -> std::fmt::Result { + write!(f, "PackDebeziumEnvelope") + } +} + +impl UserDefinedLogicalNodeCore for PackDebeziumEnvelopeNode { + fn name(&self) -> &str { + PACK_NODE_NAME + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + vec![&self.upstream_plan] + } + + fn schema(&self) -> &DFSchemaRef { + &self.envelope_schema + } + + fn expressions(&self) -> Vec { + vec![] + } + + fn fmt_for_explain(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(f, "PackDebeziumEnvelope") + } + + fn with_exprs_and_inputs(&self, _exprs: Vec, mut inputs: Vec) -> Result { + if inputs.len() != 1 { + return internal_err!( + "PackDebeziumEnvelopeNode expects exactly 1 input, got {}", + inputs.len() + ); + } + Self::try_new(inputs.remove(0)) + } +} + +impl StreamingOperatorBlueprint for PackDebeziumEnvelopeNode { + fn operator_identity(&self) -> Option { + None + } + + fn is_passthrough_boundary(&self) -> bool { + true + } + + fn compile_to_graph_node( + &self, + _: &Planner, + _: usize, + _: Vec, + ) -> Result { + internal_err!("PackDebeziumEnvelopeNode is a logical boundary and should not be physically planned") + } + + fn yielded_schema(&self) -> FsSchema { + 
FsSchema::from_schema_unkeyed(Arc::new(self.envelope_schema.as_ref().into())) + .unwrap_or_else(|_| { + panic!("Failed to extract physical schema for {}", PACK_NODE_NAME) + }) + } +} diff --git a/src/sql/extensions/extension_try_from.rs b/src/sql/extensions/extension_try_from.rs new file mode 100644 index 00000000..a64ac9cf --- /dev/null +++ b/src/sql/extensions/extension_try_from.rs @@ -0,0 +1,70 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::Arc; + +use datafusion::common::{DataFusionError, Result}; +use datafusion::logical_expr::UserDefinedLogicalNode; + +use crate::sql::extensions::aggregate::StreamWindowAggregateNode; +use crate::sql::extensions::async_udf::AsyncFunctionExecutionNode; +use crate::sql::extensions::debezium::{PackDebeziumEnvelopeNode, UnrollDebeziumPayloadNode}; +use crate::sql::extensions::join::StreamingJoinNode; +use crate::sql::extensions::key_calculation::KeyExtractionNode; +use crate::sql::extensions::lookup::StreamReferenceJoinNode; +use crate::sql::extensions::projection::StreamProjectionNode; +use crate::sql::extensions::remote_table::RemoteTableBoundaryNode; +use crate::sql::extensions::sink::StreamEgressNode; +use crate::sql::extensions::streaming_operator_blueprint::StreamingOperatorBlueprint; +use crate::sql::extensions::table_source::StreamIngestionNode; +use crate::sql::extensions::updating_aggregate::ContinuousAggregateNode; +use crate::sql::extensions::watermark_node::EventTimeWatermarkNode; +use 
crate::sql::extensions::windows_function::StreamingWindowFunctionNode; + +fn try_from_t( + node: &dyn UserDefinedLogicalNode, +) -> std::result::Result<&dyn StreamingOperatorBlueprint, ()> { + node.as_any() + .downcast_ref::() + .map(|t| t as &dyn StreamingOperatorBlueprint) + .ok_or(()) +} + +impl<'a> TryFrom<&'a dyn UserDefinedLogicalNode> for &'a dyn StreamingOperatorBlueprint { + type Error = DataFusionError; + + fn try_from(node: &'a dyn UserDefinedLogicalNode) -> Result { + try_from_t::(node) + .or_else(|_| try_from_t::(node)) + .or_else(|_| try_from_t::(node)) + .or_else(|_| try_from_t::(node)) + .or_else(|_| try_from_t::(node)) + .or_else(|_| try_from_t::(node)) + .or_else(|_| try_from_t::(node)) + .or_else(|_| try_from_t::(node)) + .or_else(|_| try_from_t::(node)) + .or_else(|_| try_from_t::(node)) + .or_else(|_| try_from_t::(node)) + .or_else(|_| try_from_t::(node)) + .or_else(|_| try_from_t::(node)) + .or_else(|_| try_from_t::(node)) + .map_err(|_| DataFusionError::Plan(format!("unexpected node: {}", node.name()))) + } +} + +impl<'a> TryFrom<&'a Arc> for &'a dyn StreamingOperatorBlueprint { + type Error = DataFusionError; + + fn try_from(node: &'a Arc) -> Result { + TryFrom::try_from(node.as_ref()) + } +} diff --git a/src/sql/extensions/is_retract.rs b/src/sql/extensions/is_retract.rs new file mode 100644 index 00000000..96493781 --- /dev/null +++ b/src/sql/extensions/is_retract.rs @@ -0,0 +1,80 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::sync::Arc; + +use datafusion::arrow::datatypes::{DataType, TimeUnit}; +use datafusion::common::{DFSchemaRef, Result, TableReference}; +use datafusion::logical_expr::{Expr, LogicalPlan, UserDefinedLogicalNodeCore}; + +use crate::multifield_partial_ord; +use crate::sql::physical::updating_meta_field; +use crate::sql::types::{DFField, TIMESTAMP_FIELD, fields_with_qualifiers, schema_from_df_fields}; + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub(crate) struct IsRetractExtension { + pub(crate) input: LogicalPlan, + pub(crate) schema: DFSchemaRef, + pub(crate) timestamp_qualifier: Option, +} + +multifield_partial_ord!(IsRetractExtension, input, timestamp_qualifier); + +impl IsRetractExtension { + pub(crate) fn new(input: LogicalPlan, timestamp_qualifier: Option) -> Self { + let mut output_fields = fields_with_qualifiers(input.schema()); + + let timestamp_index = output_fields.len() - 1; + output_fields[timestamp_index] = DFField::new( + timestamp_qualifier.clone(), + TIMESTAMP_FIELD, + DataType::Timestamp(TimeUnit::Nanosecond, None), + false, + ); + output_fields.push((timestamp_qualifier.clone(), updating_meta_field()).into()); + let schema = Arc::new(schema_from_df_fields(&output_fields).unwrap()); + Self { + input, + schema, + timestamp_qualifier, + } + } +} + +impl UserDefinedLogicalNodeCore for IsRetractExtension { + fn name(&self) -> &str { + "IsRetractExtension" + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + vec![&self.input] + } + + fn schema(&self) -> &DFSchemaRef { + &self.schema + } + + fn expressions(&self) -> Vec { + vec![] + } + + fn fmt_for_explain(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(f, "IsRetractExtension") + } + + fn with_exprs_and_inputs(&self, _exprs: Vec, inputs: Vec) -> Result { + Ok(Self::new( + inputs[0].clone(), + self.timestamp_qualifier.clone(), + )) + } +} diff --git a/src/sql/extensions/join.rs b/src/sql/extensions/join.rs new file mode 100644 index 00000000..829247ae --- /dev/null +++ 
b/src/sql/extensions/join.rs @@ -0,0 +1,209 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::fmt::Formatter; +use std::time::Duration; + +use datafusion::common::{DFSchemaRef, Result}; +use datafusion::logical_expr::expr::Expr; +use datafusion::logical_expr::{LogicalPlan, UserDefinedLogicalNodeCore}; +use datafusion_common::plan_err; +use datafusion_proto::physical_plan::AsExecutionPlan; +use datafusion_proto::protobuf::PhysicalPlanNode; +use prost::Message; +use protocol::grpc::api::JoinOperator; + +use crate::sql::common::constants::{extension_node, runtime_operator_kind}; +use crate::sql::common::{FsSchema, FsSchemaRef}; +use crate::sql::extensions::{CompiledTopologyNode, StreamingOperatorBlueprint}; +use crate::sql::logical_node::logical::{ + LogicalEdge, LogicalEdgeType, LogicalNode, OperatorName, +}; +use crate::sql::logical_planner::planner::{NamedNode, Planner}; +use crate::sql::physical::FsPhysicalExtensionCodec; + +// ----------------------------------------------------------------------------- +// Constants +// ----------------------------------------------------------------------------- + +pub(crate) const STREAM_JOIN_NODE_TYPE: &str = extension_node::STREAMING_JOIN; + +// ----------------------------------------------------------------------------- +// Logical Node Definition +// ----------------------------------------------------------------------------- + +/// A logical plan node representing a streaming join operation. 
+/// It bridges the DataFusion logical plan with the physical streaming execution engine. +#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd)] +pub struct StreamingJoinNode { + pub(crate) underlying_plan: LogicalPlan, + pub(crate) instant_execution_mode: bool, + pub(crate) state_retention_ttl: Option, +} + +impl StreamingJoinNode { + /// Creates a new instance of the streaming join node. + pub fn new( + underlying_plan: LogicalPlan, + instant_execution_mode: bool, + state_retention_ttl: Option, + ) -> Self { + Self { + underlying_plan, + instant_execution_mode, + state_retention_ttl, + } + } + + /// Compiles the physical execution plan and serializes it into a Protobuf configuration payload. + fn compile_operator_config( + &self, + planner: &Planner, + node_identifier: &str, + left_schema: FsSchemaRef, + right_schema: FsSchemaRef, + ) -> Result { + let physical_plan = planner.sync_plan(&self.underlying_plan)?; + + let proto_node = PhysicalPlanNode::try_from_physical_plan( + physical_plan, + &FsPhysicalExtensionCodec::default(), + )?; + + Ok(JoinOperator { + name: node_identifier.to_string(), + left_schema: Some(left_schema.as_ref().clone().into()), + right_schema: Some(right_schema.as_ref().clone().into()), + output_schema: Some(self.extract_fs_schema().into()), + join_plan: proto_node.encode_to_vec(), + ttl_micros: self.state_retention_ttl.map(|ttl| ttl.as_micros() as u64), + }) + } + + fn determine_operator_type(&self) -> OperatorName { + if self.instant_execution_mode { + OperatorName::InstantJoin + } else { + OperatorName::Join + } + } + + fn extract_fs_schema(&self) -> FsSchema { + FsSchema::from_schema_unkeyed(self.underlying_plan.schema().inner().clone()) + .expect("Fatal: Failed to convert internal join schema to FsSchema without keys") + } +} + +// ----------------------------------------------------------------------------- +// DataFusion Logical Node Core Implementation +// 
----------------------------------------------------------------------------- + +impl UserDefinedLogicalNodeCore for StreamingJoinNode { + fn name(&self) -> &str { + STREAM_JOIN_NODE_TYPE + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + vec![&self.underlying_plan] + } + + fn schema(&self) -> &DFSchemaRef { + self.underlying_plan.schema() + } + + fn expressions(&self) -> Vec { + vec![] + } + + fn fmt_for_explain(&self, f: &mut Formatter) -> std::fmt::Result { + write!( + f, + "StreamingJoinNode: Schema={}, InstantMode={}, TTL={:?}", + self.schema(), + self.instant_execution_mode, + self.state_retention_ttl + ) + } + + fn with_exprs_and_inputs(&self, _exprs: Vec, mut inputs: Vec) -> Result { + if inputs.len() != 1 { + return plan_err!( + "StreamingJoinNode expects exactly 1 underlying logical plan during recreation" + ); + } + + Ok(Self::new( + inputs.remove(0), + self.instant_execution_mode, + self.state_retention_ttl, + )) + } +} + +// ----------------------------------------------------------------------------- +// Streaming Graph Extension Implementation +// ----------------------------------------------------------------------------- + +impl StreamingOperatorBlueprint for StreamingJoinNode { + fn operator_identity(&self) -> Option { + None + } + + fn compile_to_graph_node( + &self, + planner: &Planner, + node_index: usize, + mut input_schemas: Vec, + ) -> Result { + if input_schemas.len() != 2 { + return plan_err!( + "Invalid topology: StreamingJoinNode requires exactly two upstream inputs, received {}", + input_schemas.len() + ); + } + + let right_schema = input_schemas.pop().unwrap(); + let left_schema = input_schemas.pop().unwrap(); + + let node_identifier = format!("stream_join_{node_index}"); + + let operator_config = self.compile_operator_config( + planner, + &node_identifier, + left_schema.clone(), + right_schema.clone(), + )?; + + let logical_node = LogicalNode::single( + node_index as u32, + node_identifier.clone(), + self.determine_operator_type(), + 
operator_config.encode_to_vec(), + runtime_operator_kind::STREAMING_JOIN.to_string(), + 1, + ); + + let left_edge = + LogicalEdge::project_all(LogicalEdgeType::LeftJoin, left_schema.as_ref().clone()); + let right_edge = + LogicalEdge::project_all(LogicalEdgeType::RightJoin, right_schema.as_ref().clone()); + + Ok(CompiledTopologyNode { + execution_unit: logical_node, + routing_edges: vec![left_edge, right_edge], + }) + } + + fn yielded_schema(&self) -> FsSchema { + self.extract_fs_schema() + } +} diff --git a/src/sql/extensions/key_calculation.rs b/src/sql/extensions/key_calculation.rs new file mode 100644 index 00000000..25206429 --- /dev/null +++ b/src/sql/extensions/key_calculation.rs @@ -0,0 +1,302 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::fmt::Formatter; +use std::sync::Arc; + +use datafusion::arrow::datatypes::{Field, Schema}; +use datafusion::common::{DFSchemaRef, Result, internal_err, plan_err}; +use datafusion::logical_expr::{Expr, ExprSchemable, LogicalPlan, UserDefinedLogicalNodeCore}; +use datafusion_common::DFSchema; +use datafusion_expr::col; +use datafusion_proto::physical_plan::{AsExecutionPlan, DefaultPhysicalExtensionCodec}; +use datafusion_proto::physical_plan::to_proto::serialize_physical_expr; +use datafusion_proto::protobuf::PhysicalPlanNode; +use itertools::Itertools; +use prost::Message; + +use protocol::grpc::api::{KeyPlanOperator, ProjectionOperator}; + +use crate::multifield_partial_ord; +use crate::sql::common::constants::{extension_node, sql_field}; +use crate::sql::common::{FsSchema, FsSchemaRef}; +use crate::sql::extensions::{CompiledTopologyNode, StreamingOperatorBlueprint}; +use crate::sql::logical_node::logical::{LogicalEdge, LogicalEdgeType, LogicalNode, OperatorName}; +use crate::sql::physical::FsPhysicalExtensionCodec; +use crate::sql::logical_planner::planner::{NamedNode, Planner}; +use crate::sql::types::{fields_with_qualifiers, schema_from_df_fields_with_metadata}; + +pub(crate) const EXTENSION_NODE_IDENTIFIER: &str = extension_node::KEY_EXTRACTION; + +/// Routing strategy for shuffling data across the stream topology. +#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd)] +pub enum KeyExtractionStrategy { + ColumnIndices(Vec), + CalculatedExpressions(Vec), +} + +/// Logical node that computes or extracts routing keys. 
+#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub(crate) struct KeyExtractionNode { + pub(crate) operator_label: Option, + pub(crate) upstream_plan: LogicalPlan, + pub(crate) extraction_strategy: KeyExtractionStrategy, + pub(crate) resolved_schema: DFSchemaRef, +} + +multifield_partial_ord!( + KeyExtractionNode, + operator_label, + upstream_plan, + extraction_strategy +); + +impl KeyExtractionNode { + /// Extracts keys and hides them from the downstream projection. + pub fn try_new_with_projection( + upstream_plan: LogicalPlan, + target_indices: Vec, + label: String, + ) -> Result { + let projected_fields: Vec<_> = fields_with_qualifiers(upstream_plan.schema()) + .into_iter() + .enumerate() + .filter(|(idx, _)| !target_indices.contains(idx)) + .map(|(_, field)| field) + .collect(); + + let metadata = upstream_plan.schema().metadata().clone(); + let resolved_schema = schema_from_df_fields_with_metadata(&projected_fields, metadata)?; + + Ok(Self { + operator_label: Some(label), + upstream_plan, + extraction_strategy: KeyExtractionStrategy::ColumnIndices(target_indices), + resolved_schema: Arc::new(resolved_schema), + }) + } + + /// Creates a node using an explicit strategy without changing the visible schema. 
+ pub fn new(upstream_plan: LogicalPlan, strategy: KeyExtractionStrategy) -> Self { + let resolved_schema = upstream_plan.schema().clone(); + Self { + operator_label: None, + upstream_plan, + extraction_strategy: strategy, + resolved_schema, + } + } + + fn compile_index_router( + &self, + physical_plan_proto: PhysicalPlanNode, + indices: &[usize], + ) -> (Vec, OperatorName) { + let operator_config = KeyPlanOperator { + name: sql_field::DEFAULT_KEY_LABEL.into(), + physical_plan: physical_plan_proto.encode_to_vec(), + key_fields: indices.iter().map(|&idx| idx as u64).collect(), + }; + + (operator_config.encode_to_vec(), OperatorName::KeyBy) + } + + fn compile_expression_router( + &self, + planner: &Planner, + expressions: &[Expr], + input_schema_ref: &FsSchemaRef, + input_df_schema: &DFSchemaRef, + ) -> Result<(Vec, OperatorName)> { + let mut target_exprs = expressions.to_vec(); + + for field in input_schema_ref.schema.fields.iter() { + target_exprs.push(col(field.name())); + } + + let output_fs_schema = self.generate_fs_schema()?; + + for (compiled_expr, expected_field) in target_exprs + .iter() + .zip(output_fs_schema.schema.fields()) + { + let (expr_type, expr_nullable) = compiled_expr.data_type_and_nullable(input_df_schema)?; + if expr_type != *expected_field.data_type() || expr_nullable != expected_field.is_nullable() + { + return plan_err!( + "Type mismatch in key calculation: Expected {} (nullable: {}), got {} (nullable: {})", + expected_field.data_type(), + expected_field.is_nullable(), + expr_type, + expr_nullable + ); + } + } + + let mut physical_expr_payloads = Vec::with_capacity(target_exprs.len()); + for logical_expr in target_exprs { + let physical_expr = planner + .create_physical_expr(&logical_expr, input_df_schema) + .map_err(|e| e.context("Failed to physicalize PARTITION BY expression"))?; + + let serialized_expr = + serialize_physical_expr(&physical_expr, &DefaultPhysicalExtensionCodec {})?; + 
physical_expr_payloads.push(serialized_expr.encode_to_vec()); + } + + let operator_config = ProjectionOperator { + name: self + .operator_label + .as_deref() + .unwrap_or(sql_field::DEFAULT_KEY_LABEL) + .to_string(), + input_schema: Some(input_schema_ref.as_ref().clone().into()), + output_schema: Some(output_fs_schema.into()), + exprs: physical_expr_payloads, + }; + + Ok((operator_config.encode_to_vec(), OperatorName::Projection)) + } + + fn generate_fs_schema(&self) -> Result { + let base_arrow_schema = self.upstream_plan.schema().as_ref(); + + match &self.extraction_strategy { + KeyExtractionStrategy::ColumnIndices(indices) => { + FsSchema::from_schema_keys(Arc::new(base_arrow_schema.into()), indices.clone()) + } + KeyExtractionStrategy::CalculatedExpressions(expressions) => { + let mut composite_fields = + Vec::with_capacity(expressions.len() + base_arrow_schema.fields().len()); + + for (idx, expr) in expressions.iter().enumerate() { + let (data_type, nullable) = expr.data_type_and_nullable(base_arrow_schema)?; + composite_fields.push(Field::new(format!("__key_{idx}"), data_type, nullable).into()); + } + + for field in base_arrow_schema.fields().iter() { + composite_fields.push(field.clone()); + } + + let final_schema = Arc::new(Schema::new(composite_fields)); + let key_mapping = (1..=expressions.len()).collect_vec(); + FsSchema::from_schema_keys(final_schema, key_mapping) + } + } + } +} + +impl StreamingOperatorBlueprint for KeyExtractionNode { + fn operator_identity(&self) -> Option { + None + } + + fn compile_to_graph_node( + &self, + planner: &Planner, + node_index: usize, + mut input_schemas: Vec, + ) -> Result { + if input_schemas.len() != 1 { + return plan_err!("KeyExtractionNode requires exactly one upstream input schema"); + } + + let input_schema_ref = input_schemas.remove(0); + let input_df_schema = Arc::new(DFSchema::try_from(input_schema_ref.schema.as_ref().clone())?); + + let physical_plan = planner.sync_plan(&self.upstream_plan)?; + let 
physical_plan_proto = PhysicalPlanNode::try_from_physical_plan( + physical_plan, + &FsPhysicalExtensionCodec::default(), + )?; + + let (protobuf_payload, engine_operator_name) = match &self.extraction_strategy { + KeyExtractionStrategy::ColumnIndices(indices) => { + self.compile_index_router(physical_plan_proto, indices) + } + KeyExtractionStrategy::CalculatedExpressions(exprs) => { + self.compile_expression_router(planner, exprs, &input_schema_ref, &input_df_schema)? + } + }; + + let logical_node = LogicalNode::single( + node_index as u32, + format!("key_{node_index}"), + engine_operator_name, + protobuf_payload, + format!("ArrowKey<{}>", self.operator_label.as_deref().unwrap_or("_")), + 1, + ); + + let data_edge = + LogicalEdge::project_all(LogicalEdgeType::Forward, (*input_schema_ref).clone()); + + Ok(CompiledTopologyNode { + execution_unit: logical_node, + routing_edges: vec![data_edge], + }) + } + + fn yielded_schema(&self) -> FsSchema { + self.generate_fs_schema() + .expect("Fatal: Failed to generate output schema for KeyExtractionNode") + } +} + +impl UserDefinedLogicalNodeCore for KeyExtractionNode { + fn name(&self) -> &str { + EXTENSION_NODE_IDENTIFIER + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + vec![&self.upstream_plan] + } + + fn schema(&self) -> &DFSchemaRef { + &self.resolved_schema + } + + fn expressions(&self) -> Vec { + vec![] + } + + fn fmt_for_explain(&self, f: &mut Formatter) -> std::fmt::Result { + write!( + f, + "KeyExtractionNode: Strategy={:?} | Schema={}", + self.extraction_strategy, + self.resolved_schema + ) + } + + fn with_exprs_and_inputs(&self, exprs: Vec, mut inputs: Vec) -> Result { + if inputs.len() != 1 { + return internal_err!("KeyExtractionNode requires exactly 1 input logical plan"); + } + + let strategy = match &self.extraction_strategy { + KeyExtractionStrategy::ColumnIndices(indices) => { + KeyExtractionStrategy::ColumnIndices(indices.clone()) + } + KeyExtractionStrategy::CalculatedExpressions(_) => { + 
KeyExtractionStrategy::CalculatedExpressions(exprs) + } + }; + + Ok(Self { + operator_label: self.operator_label.clone(), + upstream_plan: inputs.remove(0), + extraction_strategy: strategy, + resolved_schema: self.resolved_schema.clone(), + }) + } +} diff --git a/src/sql/extensions/lookup.rs b/src/sql/extensions/lookup.rs new file mode 100644 index 00000000..8371efce --- /dev/null +++ b/src/sql/extensions/lookup.rs @@ -0,0 +1,287 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::HashMap; +use std::fmt::Formatter; +use std::sync::Arc; + +use datafusion::common::{Column, DFSchemaRef, JoinType, Result, internal_err, plan_err}; +use datafusion::logical_expr::{Expr, LogicalPlan, UserDefinedLogicalNodeCore}; +use datafusion::sql::TableReference; +use datafusion_proto::physical_plan::DefaultPhysicalExtensionCodec; +use datafusion_proto::physical_plan::to_proto::serialize_physical_expr; +use prost::Message; + +use protocol::grpc::api; +use protocol::grpc::api::{ConnectorOp, GenericConnectorConfig, LookupJoinCondition, LookupJoinOperator}; + +use crate::multifield_partial_ord; +use crate::sql::common::constants::extension_node; +use crate::sql::common::{FsSchema, FsSchemaRef}; +use crate::sql::extensions::{CompiledTopologyNode, StreamingOperatorBlueprint}; +use crate::sql::logical_node::logical::{LogicalEdge, LogicalEdgeType, LogicalNode, OperatorName}; +use crate::sql::logical_planner::planner::{NamedNode, Planner}; +use 
crate::sql::schema::SourceTable; +use crate::sql::schema::utils::add_timestamp_field_arrow; + +// ----------------------------------------------------------------------------- +// Constants & Identifiers +// ----------------------------------------------------------------------------- + +pub const DICTIONARY_SOURCE_NODE_NAME: &str = extension_node::REFERENCE_TABLE_SOURCE; +pub const STREAM_DICTIONARY_JOIN_NODE_NAME: &str = extension_node::STREAM_REFERENCE_JOIN; + +// ----------------------------------------------------------------------------- +// Logical Node: Reference Table Source +// ----------------------------------------------------------------------------- + +/// Static or periodically updated reference table used for lookups. +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct ReferenceTableSourceNode { + pub(crate) source_definition: SourceTable, + pub(crate) resolved_schema: DFSchemaRef, +} + +multifield_partial_ord!(ReferenceTableSourceNode, source_definition); + +impl UserDefinedLogicalNodeCore for ReferenceTableSourceNode { + fn name(&self) -> &str { + DICTIONARY_SOURCE_NODE_NAME + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + vec![] + } + + fn schema(&self) -> &DFSchemaRef { + &self.resolved_schema + } + + fn expressions(&self) -> Vec { + vec![] + } + + fn fmt_for_explain(&self, f: &mut Formatter) -> std::fmt::Result { + write!(f, "ReferenceTableSource: Schema={}", self.resolved_schema) + } + + fn with_exprs_and_inputs( + &self, + _exprs: Vec, + inputs: Vec, + ) -> Result { + if !inputs.is_empty() { + return internal_err!( + "ReferenceTableSource is a leaf node and cannot accept upstream inputs" + ); + } + + Ok(Self { + source_definition: self.source_definition.clone(), + resolved_schema: self.resolved_schema.clone(), + }) + } +} + +// ----------------------------------------------------------------------------- +// Logical Node: Stream to Reference Join +// ----------------------------------------------------------------------------- + +/// 
Join between an unbounded stream and a reference (lookup) table. +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct StreamReferenceJoinNode { + pub(crate) upstream_stream_plan: LogicalPlan, + pub(crate) output_schema: DFSchemaRef, + pub(crate) external_dictionary: SourceTable, + pub(crate) equijoin_conditions: Vec<(Expr, Column)>, + pub(crate) post_join_filter: Option, + pub(crate) namespace_alias: Option, + pub(crate) join_semantics: JoinType, +} + +multifield_partial_ord!( + StreamReferenceJoinNode, + upstream_stream_plan, + external_dictionary, + equijoin_conditions, + post_join_filter, + namespace_alias +); + +impl StreamReferenceJoinNode { + fn compile_join_conditions(&self, planner: &Planner) -> Result> { + self.equijoin_conditions + .iter() + .map(|(logical_left_expr, right_column)| { + let physical_expr = + planner.create_physical_expr(logical_left_expr, &self.output_schema)?; + let serialized_expr = + serialize_physical_expr(&physical_expr, &DefaultPhysicalExtensionCodec {})?; + + Ok(LookupJoinCondition { + left_expr: serialized_expr.encode_to_vec(), + right_key: right_column.name.clone(), + }) + }) + .collect() + } + + fn map_api_join_type(&self) -> Result { + match self.join_semantics { + JoinType::Inner => Ok(api::JoinType::Inner as i32), + JoinType::Left => Ok(api::JoinType::Left as i32), + unsupported => plan_err!( + "Unsupported join type '{unsupported}' for dictionary lookups. Only INNER and LEFT joins are permitted." 
+ ), + } + } + + fn build_engine_operator( + &self, + planner: &Planner, + _upstream_schema: &FsSchemaRef, + ) -> Result { + let internal_input_schema = FsSchema::from_schema_unkeyed(Arc::new( + self.output_schema.as_ref().into(), + ))?; + let dictionary_physical_schema = self.external_dictionary.produce_physical_schema(); + let lookup_fs_schema = + FsSchema::from_schema_unkeyed(add_timestamp_field_arrow(dictionary_physical_schema))?; + + let properties: HashMap = self + .external_dictionary + .catalog_with_options + .iter() + .map(|(k, v)| (k.clone(), v.clone())) + .collect(); + + Ok(LookupJoinOperator { + input_schema: Some(internal_input_schema.into()), + lookup_schema: Some(lookup_fs_schema.clone().into()), + connector: Some(ConnectorOp { + connector: self.external_dictionary.adapter_type.clone(), + fs_schema: Some(lookup_fs_schema.into()), + name: self.external_dictionary.table_identifier.clone(), + description: self.external_dictionary.description.clone(), + config: Some(protocol::grpc::api::connector_op::Config::Generic( + GenericConnectorConfig { properties }, + )), + }), + key_exprs: self.compile_join_conditions(planner)?, + join_type: self.map_api_join_type()?, + ttl_micros: self + .external_dictionary + .lookup_cache_ttl + .map(|t| t.as_micros() as u64), + max_capacity_bytes: self.external_dictionary.lookup_cache_max_bytes, + }) + } +} + +impl StreamingOperatorBlueprint for StreamReferenceJoinNode { + fn operator_identity(&self) -> Option { + None + } + + fn compile_to_graph_node( + &self, + planner: &Planner, + node_index: usize, + mut input_schemas: Vec, + ) -> Result { + if input_schemas.len() != 1 { + return plan_err!( + "StreamReferenceJoinNode requires exactly one upstream stream input" + ); + } + let upstream_schema = input_schemas.remove(0); + + let operator_config = self.build_engine_operator(planner, &upstream_schema)?; + + let logical_node = LogicalNode::single( + node_index as u32, + format!("lookup_join_{node_index}"), + 
OperatorName::LookupJoin, + operator_config.encode_to_vec(), + format!("DictionaryJoin<{}>", self.external_dictionary.table_identifier), + 1, + ); + + let incoming_edge = LogicalEdge::project_all( + LogicalEdgeType::Shuffle, + (*upstream_schema).clone(), + ); + + Ok(CompiledTopologyNode { + execution_unit: logical_node, + routing_edges: vec![incoming_edge], + }) + } + + fn yielded_schema(&self) -> FsSchema { + FsSchema::from_schema_unkeyed(self.output_schema.inner().clone()) + .expect("Failed to convert lookup join output schema to FsSchema") + } +} + +impl UserDefinedLogicalNodeCore for StreamReferenceJoinNode { + fn name(&self) -> &str { + STREAM_DICTIONARY_JOIN_NODE_NAME + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + vec![&self.upstream_stream_plan] + } + + fn schema(&self) -> &DFSchemaRef { + &self.output_schema + } + + fn expressions(&self) -> Vec { + let mut exprs: Vec<_> = self + .equijoin_conditions + .iter() + .map(|(l, _)| l.clone()) + .collect(); + if let Some(filter) = &self.post_join_filter { + exprs.push(filter.clone()); + } + exprs + } + + fn fmt_for_explain(&self, f: &mut Formatter) -> std::fmt::Result { + write!( + f, + "StreamReferenceJoin: join_type={:?} | {}", + self.join_semantics, + self.output_schema + ) + } + + fn with_exprs_and_inputs(&self, _: Vec, inputs: Vec) -> Result { + if inputs.len() != 1 { + return internal_err!( + "StreamReferenceJoinNode expects exactly 1 upstream plan, got {}", + inputs.len() + ); + } + Ok(Self { + upstream_stream_plan: inputs[0].clone(), + output_schema: self.output_schema.clone(), + external_dictionary: self.external_dictionary.clone(), + equijoin_conditions: self.equijoin_conditions.clone(), + post_join_filter: self.post_join_filter.clone(), + namespace_alias: self.namespace_alias.clone(), + join_semantics: self.join_semantics, + }) + } +} diff --git a/src/sql/extensions/macros.rs b/src/sql/extensions/macros.rs new file mode 100644 index 00000000..4ce649c2 --- /dev/null +++ b/src/sql/extensions/macros.rs 
@@ -0,0 +1,28 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#[macro_export] +macro_rules! multifield_partial_ord { + ($ty:ty, $($field:tt), *) => { + impl PartialOrd for $ty { + fn partial_cmp(&self, other: &Self) -> Option { + $( + let cmp = self.$field.partial_cmp(&other.$field)?; + if cmp != std::cmp::Ordering::Equal { + return Some(cmp); + } + )* + Some(std::cmp::Ordering::Equal) + } + } + }; +} diff --git a/src/sql/extensions/mod.rs b/src/sql/extensions/mod.rs new file mode 100644 index 00000000..6c0ca08a --- /dev/null +++ b/src/sql/extensions/mod.rs @@ -0,0 +1,40 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +mod macros; + +pub(crate) mod streaming_operator_blueprint; +pub(crate) use streaming_operator_blueprint::{CompiledTopologyNode, StreamingOperatorBlueprint}; + +pub(crate) mod aggregate; +pub(crate) mod debezium; +pub(crate) mod join; +pub(crate) mod key_calculation; +pub(crate) mod lookup; +pub(crate) mod projection; +pub(crate) mod remote_table; +pub(crate) mod sink; +pub(crate) mod table_source; +pub(crate) mod updating_aggregate; +pub(crate) mod watermark_node; +pub(crate) mod windows_function; + +pub(crate) mod timestamp_append; +pub(crate) use timestamp_append::SystemTimestampInjectorNode; + +pub(crate) mod async_udf; +pub(crate) use async_udf::AsyncFunctionExecutionNode; + +pub(crate) mod is_retract; +pub(crate) use is_retract::IsRetractExtension; + +mod extension_try_from; diff --git a/src/sql/extensions/projection.rs b/src/sql/extensions/projection.rs new file mode 100644 index 00000000..d1b9e755 --- /dev/null +++ b/src/sql/extensions/projection.rs @@ -0,0 +1,240 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::fmt::Formatter; +use std::sync::Arc; + +use datafusion::common::{DFSchema, DFSchemaRef, Result, internal_err}; +use datafusion::logical_expr::{Expr, ExprSchemable, LogicalPlan, UserDefinedLogicalNodeCore}; +use datafusion_proto::physical_plan::DefaultPhysicalExtensionCodec; +use datafusion_proto::physical_plan::to_proto::serialize_physical_expr; +use prost::Message; + +use protocol::grpc::api::ProjectionOperator; + +use crate::multifield_partial_ord; +use crate::sql::common::constants::{extension_node, sql_field}; +use crate::sql::common::{FsSchema, FsSchemaRef}; +use crate::sql::extensions::{CompiledTopologyNode, StreamingOperatorBlueprint}; +use crate::sql::logical_node::logical::{LogicalEdge, LogicalEdgeType, LogicalNode, OperatorName}; +use crate::sql::logical_planner::planner::{NamedNode, Planner}; +use crate::sql::types::{DFField, schema_from_df_fields}; + +// ----------------------------------------------------------------------------- +// Constants & Identifiers +// ----------------------------------------------------------------------------- + +pub(crate) const STREAM_PROJECTION_NODE_NAME: &str = extension_node::STREAM_PROJECTION; +const DEFAULT_PROJECTION_LABEL: &str = sql_field::DEFAULT_PROJECTION_LABEL; + +// ----------------------------------------------------------------------------- +// Logical Node Definition +// ----------------------------------------------------------------------------- + +/// Projection within a streaming execution topology. 
+#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub(crate) struct StreamProjectionNode { + pub(crate) upstream_plans: Vec, + pub(crate) operator_label: Option, + pub(crate) projection_exprs: Vec, + pub(crate) resolved_schema: DFSchemaRef, + pub(crate) requires_shuffle: bool, +} + +multifield_partial_ord!(StreamProjectionNode, operator_label, projection_exprs); + +impl StreamProjectionNode { + pub(crate) fn try_new( + upstream_plans: Vec, + operator_label: Option, + projection_exprs: Vec, + ) -> Result { + if upstream_plans.is_empty() { + return internal_err!("StreamProjectionNode requires at least one upstream plan"); + } + let primary_input = &upstream_plans[0]; + let upstream_schema = primary_input.schema(); + + let mut projected_fields = Vec::with_capacity(projection_exprs.len()); + for logical_expr in &projection_exprs { + let arrow_field = logical_expr.to_field(upstream_schema)?; + projected_fields.push(DFField::from(arrow_field)); + } + + let resolved_schema = Arc::new(schema_from_df_fields(&projected_fields)?); + + Ok(Self { + upstream_plans, + operator_label, + projection_exprs, + resolved_schema, + requires_shuffle: false, + }) + } + + pub(crate) fn with_shuffle_routing(mut self) -> Self { + self.requires_shuffle = true; + self + } + + fn validate_uniform_schemas(input_schemas: &[FsSchemaRef]) -> Result { + if input_schemas.is_empty() { + return internal_err!("No input schemas provided to projection planner"); + } + let primary_schema = input_schemas[0].clone(); + + for schema in input_schemas.iter().skip(1) { + if **schema != *primary_schema { + return internal_err!( + "Schema mismatch: All upstream inputs to a projection node must share the identical schema topology." 
+ ); + } + } + + Ok(primary_schema) + } + + fn compile_physical_expressions( + &self, + planner: &Planner, + input_df_schema: &DFSchemaRef, + ) -> Result>> { + self.projection_exprs + .iter() + .map(|logical_expr| { + let physical_expr = planner + .create_physical_expr(logical_expr, input_df_schema) + .map_err(|e| e.context("Failed to compile physical projection expression"))?; + + let serialized_expr = serialize_physical_expr( + &physical_expr, + &DefaultPhysicalExtensionCodec {}, + )?; + + Ok(serialized_expr.encode_to_vec()) + }) + .collect() + } +} + +// ----------------------------------------------------------------------------- +// Stream Extension Trait Implementation +// ----------------------------------------------------------------------------- + +impl StreamingOperatorBlueprint for StreamProjectionNode { + fn operator_identity(&self) -> Option { + None + } + + fn compile_to_graph_node( + &self, + planner: &Planner, + node_index: usize, + input_schemas: Vec, + ) -> Result { + let unified_input_schema = Self::validate_uniform_schemas(&input_schemas)?; + let input_df_schema = + Arc::new(DFSchema::try_from(unified_input_schema.schema.as_ref().clone())?); + + let compiled_expr_payloads = self.compile_physical_expressions(planner, &input_df_schema)?; + + let operator_config = ProjectionOperator { + name: self + .operator_label + .as_deref() + .unwrap_or(DEFAULT_PROJECTION_LABEL) + .to_string(), + input_schema: Some(unified_input_schema.as_ref().clone().into()), + output_schema: Some(self.yielded_schema().into()), + exprs: compiled_expr_payloads, + }; + + let node_identifier = format!("projection_{node_index}"); + let label = format!( + "ArrowProjection<{}>", + self.operator_label.as_deref().unwrap_or("_") + ); + + let logical_node = LogicalNode::single( + node_index as u32, + node_identifier, + OperatorName::Projection, + operator_config.encode_to_vec(), + label, + 1, + ); + + let routing_strategy = if self.requires_shuffle { + LogicalEdgeType::Shuffle + } 
else { + LogicalEdgeType::Forward + }; + + let outgoing_edge = + LogicalEdge::project_all(routing_strategy, (*unified_input_schema).clone()); + + Ok(CompiledTopologyNode { + execution_unit: logical_node, + routing_edges: vec![outgoing_edge], + }) + } + + fn yielded_schema(&self) -> FsSchema { + FsSchema::from_schema_unkeyed(Arc::new(self.resolved_schema.as_arrow().clone())) + .expect("Fatal: Failed to generate unkeyed output schema for projection") + } +} + +// ----------------------------------------------------------------------------- +// DataFusion Logical Node Hooks +// ----------------------------------------------------------------------------- + +impl UserDefinedLogicalNodeCore for StreamProjectionNode { + fn name(&self) -> &str { + STREAM_PROJECTION_NODE_NAME + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + self.upstream_plans.iter().collect() + } + + fn schema(&self) -> &DFSchemaRef { + &self.resolved_schema + } + + fn expressions(&self) -> Vec { + vec![] + } + + fn fmt_for_explain(&self, f: &mut Formatter) -> std::fmt::Result { + write!( + f, + "StreamProjectionNode: RequiresShuffle={}, Schema={}", + self.requires_shuffle, + self.resolved_schema + ) + } + + fn with_exprs_and_inputs(&self, _exprs: Vec, inputs: Vec) -> Result { + let mut new_node = Self::try_new( + inputs, + self.operator_label.clone(), + self.projection_exprs.clone(), + )?; + + if self.requires_shuffle { + new_node = new_node.with_shuffle_routing(); + } + + Ok(new_node) + } +} diff --git a/src/sql/extensions/remote_table.rs b/src/sql/extensions/remote_table.rs new file mode 100644 index 00000000..72b6150c --- /dev/null +++ b/src/sql/extensions/remote_table.rs @@ -0,0 +1,188 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::fmt::Formatter; +use std::sync::Arc; + +use datafusion::common::{DFSchemaRef, Result, TableReference, internal_err, plan_err}; +use datafusion::logical_expr::{Expr, LogicalPlan, UserDefinedLogicalNodeCore}; +use datafusion_proto::physical_plan::AsExecutionPlan; +use datafusion_proto::protobuf::PhysicalPlanNode; +use prost::Message; + +use protocol::grpc::api::ValuePlanOperator; + +use crate::multifield_partial_ord; +use crate::sql::common::constants::extension_node; +use crate::sql::common::{FsSchema, FsSchemaRef}; +use crate::sql::extensions::{CompiledTopologyNode, StreamingOperatorBlueprint}; +use crate::sql::logical_node::logical::{LogicalEdge, LogicalEdgeType, LogicalNode, OperatorName}; +use crate::sql::physical::FsPhysicalExtensionCodec; +use crate::sql::logical_planner::planner::{NamedNode, Planner}; + +// ----------------------------------------------------------------------------- +// Constants & Identifiers +// ----------------------------------------------------------------------------- + +pub(crate) const REMOTE_TABLE_NODE_NAME: &str = extension_node::REMOTE_TABLE_BOUNDARY; + +// ----------------------------------------------------------------------------- +// Logical Node Definition +// ----------------------------------------------------------------------------- + +/// Segments the execution graph and merges nodes sharing the same identifier; acts as a boundary. 
+#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub(crate) struct RemoteTableBoundaryNode { + pub(crate) upstream_plan: LogicalPlan, + pub(crate) table_identifier: TableReference, + pub(crate) resolved_schema: DFSchemaRef, + pub(crate) requires_materialization: bool, +} + +multifield_partial_ord!( + RemoteTableBoundaryNode, + upstream_plan, + table_identifier, + requires_materialization +); + +impl RemoteTableBoundaryNode { + fn compile_engine_operator(&self, planner: &Planner) -> Result> { + let physical_plan = planner.sync_plan(&self.upstream_plan)?; + + let physical_plan_proto = PhysicalPlanNode::try_from_physical_plan( + physical_plan, + &FsPhysicalExtensionCodec::default(), + )?; + + let operator_config = ValuePlanOperator { + name: format!("value_calculation({})", self.table_identifier), + physical_plan: physical_plan_proto.encode_to_vec(), + }; + + Ok(operator_config.encode_to_vec()) + } + + fn validate_uniform_schemas(input_schemas: &[FsSchemaRef]) -> Result<()> { + if input_schemas.len() <= 1 { + return Ok(()); + } + + let primary_schema = &input_schemas[0]; + for schema in input_schemas.iter().skip(1) { + if *schema != *primary_schema { + return plan_err!( + "Topology error: Multiple input streams routed to the same remote table must share an identical schema structure." 
+ ); + } + } + + Ok(()) + } +} + +// ----------------------------------------------------------------------------- +// Stream Extension Trait Implementation +// ----------------------------------------------------------------------------- + +impl StreamingOperatorBlueprint for RemoteTableBoundaryNode { + fn operator_identity(&self) -> Option { + if self.requires_materialization { + Some(NamedNode::RemoteTable(self.table_identifier.clone())) + } else { + None + } + } + + fn compile_to_graph_node( + &self, + planner: &Planner, + node_index: usize, + input_schemas: Vec, + ) -> Result { + Self::validate_uniform_schemas(&input_schemas)?; + + let operator_payload = self.compile_engine_operator(planner)?; + + let logical_node = LogicalNode::single( + node_index as u32, + format!("value_{node_index}"), + OperatorName::Value, + operator_payload, + self.table_identifier.to_string(), + 1, + ); + + let routing_edges: Vec = input_schemas + .into_iter() + .map(|schema| LogicalEdge::project_all(LogicalEdgeType::Forward, (*schema).clone())) + .collect(); + + Ok(CompiledTopologyNode { + execution_unit: logical_node, + routing_edges: routing_edges, + }) + } + + fn yielded_schema(&self) -> FsSchema { + FsSchema::from_schema_keys(Arc::new(self.resolved_schema.as_ref().into()), vec![]) + .expect("Fatal: Failed to generate output schema for remote table boundary") + } +} + +// ----------------------------------------------------------------------------- +// DataFusion Logical Node Hooks +// ----------------------------------------------------------------------------- + +impl UserDefinedLogicalNodeCore for RemoteTableBoundaryNode { + fn name(&self) -> &str { + REMOTE_TABLE_NODE_NAME + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + vec![&self.upstream_plan] + } + + fn schema(&self) -> &DFSchemaRef { + &self.resolved_schema + } + + fn expressions(&self) -> Vec { + vec![] + } + + fn fmt_for_explain(&self, f: &mut Formatter) -> std::fmt::Result { + write!( + f, + "RemoteTableBoundaryNode: 
Identifier={}, Materialized={}, Schema={}", + self.table_identifier, + self.requires_materialization, + self.resolved_schema + ) + } + + fn with_exprs_and_inputs(&self, _exprs: Vec, mut inputs: Vec) -> Result { + if inputs.len() != 1 { + return internal_err!( + "RemoteTableBoundaryNode expects exactly 1 upstream logical plan, but received {}", + inputs.len() + ); + } + + Ok(Self { + upstream_plan: inputs.remove(0), + table_identifier: self.table_identifier.clone(), + resolved_schema: self.resolved_schema.clone(), + requires_materialization: self.requires_materialization, + }) + } +} diff --git a/src/sql/extensions/sink.rs b/src/sql/extensions/sink.rs new file mode 100644 index 00000000..d2916486 --- /dev/null +++ b/src/sql/extensions/sink.rs @@ -0,0 +1,229 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::fmt::Formatter; +use std::sync::Arc; + +use datafusion::common::{DFSchemaRef, Result, TableReference, plan_err}; +use datafusion::logical_expr::{Expr, Extension, LogicalPlan, UserDefinedLogicalNodeCore}; +use prost::Message; + +use crate::multifield_partial_ord; +use crate::sql::common::constants::extension_node; +use crate::sql::common::{FsSchema, FsSchemaRef, UPDATING_META_FIELD}; +use crate::sql::extensions::{CompiledTopologyNode, StreamingOperatorBlueprint}; +use crate::sql::logical_node::logical::{LogicalEdge, LogicalEdgeType, LogicalNode, OperatorName}; +use crate::sql::logical_planner::planner::{NamedNode, Planner}; +use crate::sql::schema::Table; + +use super::debezium::PackDebeziumEnvelopeNode; +use super::remote_table::RemoteTableBoundaryNode; + +// ----------------------------------------------------------------------------- +// Constants & Identifiers +// ----------------------------------------------------------------------------- + +pub(crate) const STREAM_EGRESS_NODE_NAME: &str = extension_node::STREAM_EGRESS; + +// ----------------------------------------------------------------------------- +// Logical Node Definition +// ----------------------------------------------------------------------------- + +/// Terminal node routing processed data into an external sink (e.g. Kafka, PostgreSQL). 
+#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub(crate) struct StreamEgressNode { + pub(crate) target_identifier: TableReference, + pub(crate) destination_table: Table, + pub(crate) egress_schema: DFSchemaRef, + upstream_plans: Arc>, +} + +multifield_partial_ord!(StreamEgressNode, target_identifier, upstream_plans); + +impl StreamEgressNode { + pub fn try_new( + target_identifier: TableReference, + destination_table: Table, + initial_schema: DFSchemaRef, + upstream_plan: LogicalPlan, + ) -> Result { + let (mut processed_plan, mut resolved_schema) = Self::apply_cdc_transformations( + upstream_plan, + initial_schema, + &destination_table, + )?; + + Self::enforce_computational_boundary(&mut resolved_schema, &mut processed_plan); + + Ok(Self { + target_identifier, + destination_table, + egress_schema: resolved_schema, + upstream_plans: Arc::new(vec![processed_plan]), + }) + } + + fn apply_cdc_transformations( + plan: LogicalPlan, + schema: DFSchemaRef, + destination: &Table, + ) -> Result<(LogicalPlan, DFSchemaRef)> { + let is_upstream_updating = plan + .schema() + .has_column_with_unqualified_name(UPDATING_META_FIELD); + + match destination { + Table::ConnectorTable(connector) => { + let is_sink_updating = connector.is_updating(); + + match (is_upstream_updating, is_sink_updating) { + (_, true) => { + let debezium_encoder = PackDebeziumEnvelopeNode::try_new(plan)?; + let wrapped_plan = LogicalPlan::Extension(Extension { + node: Arc::new(debezium_encoder), + }); + let new_schema = wrapped_plan.schema().clone(); + + Ok((wrapped_plan, new_schema)) + } + (true, false) => { + plan_err!( + "Topology Mismatch: The upstream is producing an updating stream (CDC), \ + but the target sink '{}' is not configured to accept updates. \ + Hint: set `format = 'debezium_json'` in the WITH clause.", + connector.name() + ) + } + (false, false) => Ok((plan, schema)), + } + } + Table::LookupTable(..) 
=> { + plan_err!("Topology Violation: A Lookup Table cannot be used as a streaming data sink.") + } + Table::TableFromQuery { .. } => Ok((plan, schema)), + } + } + + fn enforce_computational_boundary(schema: &mut DFSchemaRef, plan: &mut LogicalPlan) { + let requires_boundary = if let LogicalPlan::Extension(extension) = plan { + let stream_ext: &dyn StreamingOperatorBlueprint = (&extension.node) + .try_into() + .expect("Fatal: Egress node encountered an extension that does not implement StreamingOperatorBlueprint"); + + stream_ext.is_passthrough_boundary() + } else { + true + }; + + if requires_boundary { + let boundary_node = RemoteTableBoundaryNode { + upstream_plan: plan.clone(), + table_identifier: TableReference::bare("sink projection"), + resolved_schema: schema.clone(), + requires_materialization: false, + }; + + *plan = LogicalPlan::Extension(Extension { + node: Arc::new(boundary_node), + }); + } + } +} + +// ----------------------------------------------------------------------------- +// Stream Extension Trait Implementation +// ----------------------------------------------------------------------------- + +impl StreamingOperatorBlueprint for StreamEgressNode { + fn operator_identity(&self) -> Option { + Some(NamedNode::Sink(self.target_identifier.clone())) + } + + fn compile_to_graph_node( + &self, + _planner: &Planner, + node_index: usize, + input_schemas: Vec, + ) -> Result { + let connector_operator = self + .destination_table + .connector_op() + .map_err(|e| e.context("Failed to generate connector operation payload"))?; + + let operator_description = connector_operator.description.clone(); + let operator_payload = connector_operator.encode_to_vec(); + + let logical_node = LogicalNode::single( + node_index as u32, + format!("sink_{}_{node_index}", self.target_identifier), + OperatorName::ConnectorSink, + operator_payload, + operator_description, + 1, + ); + + let routing_edges: Vec = input_schemas + .into_iter() + .map(|input_schema| { + 
LogicalEdge::project_all(LogicalEdgeType::Forward, (*input_schema).clone()) + }) + .collect(); + + Ok(CompiledTopologyNode { + execution_unit: logical_node, + routing_edges: routing_edges, + }) + } + + fn yielded_schema(&self) -> FsSchema { + FsSchema::from_fields(vec![]) + } +} + +// ----------------------------------------------------------------------------- +// DataFusion Logical Node Hooks +// ----------------------------------------------------------------------------- + +impl UserDefinedLogicalNodeCore for StreamEgressNode { + fn name(&self) -> &str { + STREAM_EGRESS_NODE_NAME + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + self.upstream_plans.iter().collect() + } + + fn schema(&self) -> &DFSchemaRef { + &self.egress_schema + } + + fn expressions(&self) -> Vec { + vec![] + } + + fn fmt_for_explain(&self, f: &mut Formatter) -> std::fmt::Result { + write!( + f, + "StreamEgressNode({:?}): Schema={}", + self.target_identifier, self.egress_schema + ) + } + + fn with_exprs_and_inputs(&self, _exprs: Vec, inputs: Vec) -> Result { + Ok(Self { + target_identifier: self.target_identifier.clone(), + destination_table: self.destination_table.clone(), + egress_schema: self.egress_schema.clone(), + upstream_plans: Arc::new(inputs), + }) + } +} diff --git a/src/sql/extensions/streaming_operator_blueprint.rs b/src/sql/extensions/streaming_operator_blueprint.rs new file mode 100644 index 00000000..d3f9d459 --- /dev/null +++ b/src/sql/extensions/streaming_operator_blueprint.rs @@ -0,0 +1,65 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +use std::fmt::Debug; + +use datafusion::common::Result; + +use crate::sql::common::{FsSchema, FsSchemaRef}; +use crate::sql::logical_node::logical::{LogicalEdge, LogicalNode}; +use crate::sql::logical_planner::planner::{NamedNode, Planner}; + +// ----------------------------------------------------------------------------- +// Core Execution Blueprint +// ----------------------------------------------------------------------------- + +/// Atomic unit within a streaming execution topology: translates streaming SQL into graph nodes. +pub(crate) trait StreamingOperatorBlueprint: Debug { + /// Canonical named identity for this operator, if any (sources, sinks, etc.). + fn operator_identity(&self) -> Option; + + /// Compiles this operator into a graph vertex and its incoming routing edges. + fn compile_to_graph_node( + &self, + compiler_context: &Planner, + node_id_sequence: usize, + upstream_schemas: Vec, + ) -> Result; + + /// Schema of records this operator yields downstream. + fn yielded_schema(&self) -> FsSchema; + + /// Logical passthrough boundary (no physical state change); default is stateful / materializing. + fn is_passthrough_boundary(&self) -> bool { + false + } +} + +// ----------------------------------------------------------------------------- +// Graph Topology Structures +// ----------------------------------------------------------------------------- + +/// Compiled vertex: execution unit plus upstream routing edges. 
+#[derive(Debug, Clone)] +pub(crate) struct CompiledTopologyNode { + pub execution_unit: LogicalNode, + pub routing_edges: Vec, +} + +impl CompiledTopologyNode { + pub fn new(execution_unit: LogicalNode, routing_edges: Vec) -> Self { + Self { + execution_unit, + routing_edges, + } + } +} diff --git a/src/sql/extensions/table_source.rs b/src/sql/extensions/table_source.rs new file mode 100644 index 00000000..3f998c5a --- /dev/null +++ b/src/sql/extensions/table_source.rs @@ -0,0 +1,176 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::fmt::Formatter; +use std::sync::Arc; + +use datafusion::common::{DFSchemaRef, Result, TableReference, plan_err}; +use datafusion::logical_expr::{Expr, LogicalPlan, UserDefinedLogicalNodeCore}; +use prost::Message; + +use crate::multifield_partial_ord; +use crate::sql::common::constants::extension_node; +use crate::sql::common::{FsSchema, FsSchemaRef}; +use crate::sql::extensions::debezium::DebeziumSchemaCodec; +use crate::sql::logical_node::logical::{LogicalNode, OperatorName}; +use crate::sql::logical_planner::planner::{NamedNode, Planner}; +use crate::sql::schema::SourceTable; +use crate::sql::schema::utils::add_timestamp_field; +use crate::sql::types::schema_from_df_fields; + +use super::{CompiledTopologyNode, StreamingOperatorBlueprint}; + +// ----------------------------------------------------------------------------- +// Constants & Identifiers +// ----------------------------------------------------------------------------- + +pub(crate) const STREAM_INGESTION_NODE_NAME: &str = extension_node::STREAM_INGESTION; + +// ----------------------------------------------------------------------------- +// Logical Node Definition +// ----------------------------------------------------------------------------- + +/// Foundational ingestion point: connects to external systems and injects raw or CDC data. 
+#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub(crate) struct StreamIngestionNode { + pub(crate) source_identifier: TableReference, + pub(crate) source_definition: SourceTable, + pub(crate) resolved_schema: DFSchemaRef, +} + +multifield_partial_ord!(StreamIngestionNode, source_identifier, source_definition); + +impl StreamIngestionNode { + pub fn try_new( + source_identifier: TableReference, + source_definition: SourceTable, + ) -> Result { + let resolved_schema = + Self::build_ingestion_schema(&source_identifier, &source_definition)?; + + Ok(Self { + source_identifier, + source_definition, + resolved_schema, + }) + } + + fn build_ingestion_schema( + identifier: &TableReference, + definition: &SourceTable, + ) -> Result { + let physical_fields: Vec<_> = definition + .schema_specs + .iter() + .filter(|col| !col.is_computed()) + .map(|col| (Some(identifier.clone()), Arc::new(col.arrow_field().clone())).into()) + .collect(); + + let base_schema = Arc::new(schema_from_df_fields(&physical_fields)?); + + let enveloped_schema = if definition.is_updating() { + DebeziumSchemaCodec::wrap_into_envelope(&base_schema, Some(identifier.clone()))? 
+ } else { + base_schema + }; + + add_timestamp_field(enveloped_schema, Some(identifier.clone())) + } +} + +// ----------------------------------------------------------------------------- +// DataFusion Logical Node Hooks +// ----------------------------------------------------------------------------- + +impl UserDefinedLogicalNodeCore for StreamIngestionNode { + fn name(&self) -> &str { + STREAM_INGESTION_NODE_NAME + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + vec![] + } + + fn schema(&self) -> &DFSchemaRef { + &self.resolved_schema + } + + fn expressions(&self) -> Vec { + vec![] + } + + fn fmt_for_explain(&self, f: &mut Formatter) -> std::fmt::Result { + write!( + f, + "StreamIngestionNode({}): Schema={}", + self.source_identifier, self.resolved_schema + ) + } + + fn with_exprs_and_inputs(&self, _exprs: Vec, inputs: Vec) -> Result { + if !inputs.is_empty() { + return plan_err!( + "StreamIngestionNode acts as a leaf boundary and cannot accept upstream inputs." + ); + } + + Ok(Self { + source_identifier: self.source_identifier.clone(), + source_definition: self.source_definition.clone(), + resolved_schema: self.resolved_schema.clone(), + }) + } +} + +// ----------------------------------------------------------------------------- +// Core Execution Blueprint Implementation +// ----------------------------------------------------------------------------- + +impl StreamingOperatorBlueprint for StreamIngestionNode { + fn operator_identity(&self) -> Option { + Some(NamedNode::Source(self.source_identifier.clone())) + } + + fn compile_to_graph_node( + &self, + _compiler_context: &Planner, + node_id_sequence: usize, + upstream_schemas: Vec, + ) -> Result { + if !upstream_schemas.is_empty() { + return plan_err!( + "Topology Violation: StreamIngestionNode is a source origin and cannot process upstream routing edges." 
+ ); + } + + let sql_source = self.source_definition.as_sql_source()?; + let connector_payload = sql_source.source.config.encode_to_vec(); + let operator_description = sql_source.source.config.description.clone(); + + let execution_unit = LogicalNode::single( + node_id_sequence as u32, + format!("source_{}_{node_id_sequence}", self.source_identifier), + OperatorName::ConnectorSource, + connector_payload, + operator_description, + 1, + ); + + Ok(CompiledTopologyNode::new(execution_unit, vec![])) + } + + fn yielded_schema(&self) -> FsSchema { + FsSchema::from_schema_keys(Arc::new(self.resolved_schema.as_ref().into()), vec![]).expect( + "Fatal: Failed to generate output schema for stream ingestion", + ) + } +} diff --git a/src/sql/extensions/timestamp_append.rs b/src/sql/extensions/timestamp_append.rs new file mode 100644 index 00000000..2d8b985b --- /dev/null +++ b/src/sql/extensions/timestamp_append.rs @@ -0,0 +1,117 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::fmt::Formatter; + +use datafusion::common::{DFSchemaRef, Result, TableReference, internal_err}; +use datafusion::logical_expr::{Expr, LogicalPlan, UserDefinedLogicalNodeCore}; + +use crate::multifield_partial_ord; +use crate::sql::common::constants::extension_node; +use crate::sql::schema::utils::{add_timestamp_field, has_timestamp_field}; + +// ----------------------------------------------------------------------------- +// Constants & Identifiers +// ----------------------------------------------------------------------------- + +pub(crate) const TIMESTAMP_INJECTOR_NODE_NAME: &str = extension_node::SYSTEM_TIMESTAMP_INJECTOR; + +// ----------------------------------------------------------------------------- +// Logical Node Definition +// ----------------------------------------------------------------------------- + +/// Injects the mandatory system `_timestamp` field into the upstream streaming schema. +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub(crate) struct SystemTimestampInjectorNode { + pub(crate) upstream_plan: LogicalPlan, + pub(crate) target_qualifier: Option, + pub(crate) resolved_schema: DFSchemaRef, +} + +multifield_partial_ord!(SystemTimestampInjectorNode, upstream_plan, target_qualifier); + +impl SystemTimestampInjectorNode { + pub(crate) fn try_new( + upstream_plan: LogicalPlan, + target_qualifier: Option, + ) -> Result { + let upstream_schema = upstream_plan.schema(); + + if has_timestamp_field(upstream_schema) { + return internal_err!( + "Topology Violation: Attempted to inject a system timestamp into an upstream plan \ + that already contains one. 
\ + \nPlan:\n {:?} \nSchema:\n {:?}", + upstream_plan, + upstream_schema + ); + } + + let resolved_schema = + add_timestamp_field(upstream_schema.clone(), target_qualifier.clone())?; + + Ok(Self { + upstream_plan, + target_qualifier, + resolved_schema, + }) + } +} + +// ----------------------------------------------------------------------------- +// DataFusion Logical Node Hooks +// ----------------------------------------------------------------------------- + +impl UserDefinedLogicalNodeCore for SystemTimestampInjectorNode { + fn name(&self) -> &str { + TIMESTAMP_INJECTOR_NODE_NAME + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + vec![&self.upstream_plan] + } + + fn schema(&self) -> &DFSchemaRef { + &self.resolved_schema + } + + fn expressions(&self) -> Vec { + vec![] + } + + fn fmt_for_explain(&self, f: &mut Formatter) -> std::fmt::Result { + let field_names = self + .resolved_schema + .fields() + .iter() + .map(|field| field.name().to_string()) + .collect::>() + .join(", "); + + write!( + f, + "SystemTimestampInjector(Qualifier={:?}): [{}]", + self.target_qualifier, field_names + ) + } + + fn with_exprs_and_inputs(&self, _exprs: Vec, mut inputs: Vec) -> Result { + if inputs.len() != 1 { + return internal_err!( + "SystemTimestampInjectorNode requires exactly 1 upstream logical plan, but received {}", + inputs.len() + ); + } + + Self::try_new(inputs.remove(0), self.target_qualifier.clone()) + } +} diff --git a/src/sql/extensions/updating_aggregate.rs b/src/sql/extensions/updating_aggregate.rs new file mode 100644 index 00000000..a76d15d4 --- /dev/null +++ b/src/sql/extensions/updating_aggregate.rs @@ -0,0 +1,242 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::Arc; +use std::time::Duration; + +use datafusion::common::{DFSchemaRef, Result, TableReference, ToDFSchema, internal_err, plan_err}; +use datafusion::logical_expr::expr::ScalarFunction; +use datafusion::logical_expr::{ + col, lit, Expr, Extension, LogicalPlan, UserDefinedLogicalNodeCore, +}; +use datafusion::prelude::named_struct; +use datafusion::scalar::ScalarValue; +use datafusion_proto::physical_plan::AsExecutionPlan; +use datafusion_proto::protobuf::PhysicalPlanNode; +use prost::Message; +use protocol::grpc::api::UpdatingAggregateOperator; + +use crate::sql::common::constants::{extension_node, proto_operator_name, updating_state_field}; +use crate::sql::common::{FsSchema, FsSchemaRef}; +use crate::sql::extensions::{CompiledTopologyNode, IsRetractExtension, StreamingOperatorBlueprint}; +use crate::sql::functions::multi_hash; +use crate::sql::logical_node::logical::{LogicalEdge, LogicalEdgeType, LogicalNode, OperatorName}; +use crate::sql::physical::FsPhysicalExtensionCodec; +use crate::sql::logical_planner::planner::{NamedNode, Planner}; + +// ----------------------------------------------------------------------------- +// Constants & Configuration +// ----------------------------------------------------------------------------- + +pub(crate) const CONTINUOUS_AGGREGATE_NODE_NAME: &str = extension_node::CONTINUOUS_AGGREGATE; + +const DEFAULT_FLUSH_INTERVAL_MICROS: u64 = 10_000_000; + +const STATIC_HASH_SIZE_BYTES: i32 = 16; + +// ----------------------------------------------------------------------------- +// Logical Node 
Definition +// ----------------------------------------------------------------------------- + +/// Stateful continuous aggregation: running aggregates with updating / retraction semantics. +#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd)] +pub(crate) struct ContinuousAggregateNode { + pub(crate) base_aggregate_plan: LogicalPlan, + pub(crate) partition_key_indices: Vec, + pub(crate) retract_injected_plan: LogicalPlan, + pub(crate) namespace_qualifier: Option, + pub(crate) state_retention_ttl: Duration, +} + +impl ContinuousAggregateNode { + pub fn try_new( + base_aggregate_plan: LogicalPlan, + partition_key_indices: Vec, + namespace_qualifier: Option, + state_retention_ttl: Duration, + ) -> Result { + let retract_injected_plan = LogicalPlan::Extension(Extension { + node: Arc::new(IsRetractExtension::new( + base_aggregate_plan.clone(), + namespace_qualifier.clone(), + )), + }); + + Ok(Self { + base_aggregate_plan, + partition_key_indices, + retract_injected_plan, + namespace_qualifier, + state_retention_ttl, + }) + } + + fn construct_state_metadata_expr(&self, upstream_schema: &FsSchemaRef) -> Expr { + let routing_keys: Vec = self + .partition_key_indices + .iter() + .map(|&idx| col(upstream_schema.schema.field(idx).name())) + .collect(); + + let state_id_hash = if routing_keys.is_empty() { + Expr::Literal( + ScalarValue::FixedSizeBinary( + STATIC_HASH_SIZE_BYTES, + Some(vec![0; STATIC_HASH_SIZE_BYTES as usize]), + ), + None, + ) + } else { + Expr::ScalarFunction(ScalarFunction { + func: multi_hash(), + args: routing_keys, + }) + }; + + named_struct(vec![ + lit(updating_state_field::IS_RETRACT), + lit(false), + lit(updating_state_field::ID), + state_id_hash, + ]) + } + + fn compile_operator_config( + &self, + planner: &Planner, + upstream_schema: &FsSchemaRef, + ) -> Result { + let upstream_df_schema = upstream_schema.schema.clone().to_dfschema()?; + + let physical_agg_plan = planner.sync_plan(&self.base_aggregate_plan)?; + let compiled_agg_payload = 
PhysicalPlanNode::try_from_physical_plan( + physical_agg_plan, + &FsPhysicalExtensionCodec::default(), + )? + .encode_to_vec(); + + let meta_expr = self.construct_state_metadata_expr(upstream_schema); + let compiled_meta_expr = + planner.serialize_as_physical_expr(&meta_expr, &upstream_df_schema)?; + + Ok(UpdatingAggregateOperator { + name: proto_operator_name::UPDATING_AGGREGATE.to_string(), + input_schema: Some((**upstream_schema).clone().into()), + final_schema: Some(self.yielded_schema().into()), + aggregate_exec: compiled_agg_payload, + metadata_expr: compiled_meta_expr, + flush_interval_micros: DEFAULT_FLUSH_INTERVAL_MICROS, + ttl_micros: self.state_retention_ttl.as_micros() as u64, + }) + } +} + +// ----------------------------------------------------------------------------- +// DataFusion Logical Node Hooks +// ----------------------------------------------------------------------------- + +impl UserDefinedLogicalNodeCore for ContinuousAggregateNode { + fn name(&self) -> &str { + CONTINUOUS_AGGREGATE_NODE_NAME + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + vec![&self.base_aggregate_plan] + } + + fn schema(&self) -> &DFSchemaRef { + self.retract_injected_plan.schema() + } + + fn expressions(&self) -> Vec { + vec![] + } + + fn fmt_for_explain(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!( + f, + "ContinuousAggregateNode(TTL={:?})", + self.state_retention_ttl + ) + } + + fn with_exprs_and_inputs( + &self, + _exprs: Vec, + mut inputs: Vec, + ) -> Result { + if inputs.len() != 1 { + return internal_err!( + "ContinuousAggregateNode requires exactly 1 upstream input, got {}", + inputs.len() + ); + } + + Self::try_new( + inputs.remove(0), + self.partition_key_indices.clone(), + self.namespace_qualifier.clone(), + self.state_retention_ttl, + ) + } +} + +// ----------------------------------------------------------------------------- +// Core Execution Blueprint Implementation +// 
----------------------------------------------------------------------------- + +impl StreamingOperatorBlueprint for ContinuousAggregateNode { + fn operator_identity(&self) -> Option { + None + } + + fn compile_to_graph_node( + &self, + planner: &Planner, + node_index: usize, + mut upstream_schemas: Vec, + ) -> Result { + if upstream_schemas.len() != 1 { + return plan_err!( + "Topology Violation: ContinuousAggregateNode requires exactly 1 upstream input, received {}", + upstream_schemas.len() + ); + } + + let upstream_schema = upstream_schemas.remove(0); + + let operator_config = self.compile_operator_config(planner, &upstream_schema)?; + + let logical_node = LogicalNode::single( + node_index as u32, + format!("updating_aggregate_{node_index}"), + OperatorName::UpdatingAggregate, + operator_config.encode_to_vec(), + proto_operator_name::UPDATING_AGGREGATE.to_string(), + 1, + ); + + let shuffle_edge = + LogicalEdge::project_all(LogicalEdgeType::Shuffle, (*upstream_schema).clone()); + + Ok(CompiledTopologyNode { + execution_unit: logical_node, + routing_edges: vec![shuffle_edge], + }) + } + + fn yielded_schema(&self) -> FsSchema { + FsSchema::from_schema_unkeyed(Arc::new(self.schema().as_ref().into())).expect( + "Fatal: Failed to generate unkeyed output schema for continuous aggregate", + ) + } +} diff --git a/src/sql/extensions/watermark_node.rs b/src/sql/extensions/watermark_node.rs new file mode 100644 index 00000000..231e1951 --- /dev/null +++ b/src/sql/extensions/watermark_node.rs @@ -0,0 +1,231 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +use std::fmt::Formatter; +use std::sync::Arc; + +use datafusion::common::{DFSchemaRef, Result, TableReference, internal_err, plan_err}; +use datafusion::error::DataFusionError; +use datafusion::logical_expr::{Expr, LogicalPlan, UserDefinedLogicalNodeCore}; +use datafusion_proto::physical_plan::DefaultPhysicalExtensionCodec; +use datafusion_proto::physical_plan::to_proto::serialize_physical_expr; +use prost::Message; +use protocol::grpc::api::ExpressionWatermarkConfig; + +use crate::multifield_partial_ord; +use crate::sql::common::constants::{extension_node, runtime_operator_kind}; +use crate::sql::common::{FsSchema, FsSchemaRef}; +use crate::sql::extensions::{CompiledTopologyNode, StreamingOperatorBlueprint}; +use crate::sql::logical_node::logical::{LogicalEdge, LogicalEdgeType, LogicalNode, OperatorName}; +use crate::sql::logical_planner::planner::{NamedNode, Planner}; +use crate::sql::schema::utils::add_timestamp_field; +use crate::sql::types::TIMESTAMP_FIELD; + +// ----------------------------------------------------------------------------- +// Constants & Identifiers +// ----------------------------------------------------------------------------- + +pub(crate) const EVENT_TIME_WATERMARK_NODE_NAME: &str = extension_node::EVENT_TIME_WATERMARK; + +const DEFAULT_WATERMARK_EMISSION_PERIOD_MICROS: u64 = 1_000_000; + +// ----------------------------------------------------------------------------- +// Logical Node Definition +// ----------------------------------------------------------------------------- + +/// Event-time watermark from a user strategy; drives time progress in stateful operators. 
+#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub(crate) struct EventTimeWatermarkNode { + pub(crate) upstream_plan: LogicalPlan, + pub(crate) namespace_qualifier: TableReference, + pub(crate) watermark_strategy_expr: Expr, + pub(crate) resolved_schema: DFSchemaRef, + pub(crate) internal_timestamp_offset: usize, +} + +multifield_partial_ord!( + EventTimeWatermarkNode, + upstream_plan, + namespace_qualifier, + watermark_strategy_expr, + internal_timestamp_offset +); + +impl EventTimeWatermarkNode { + pub(crate) fn try_new( + upstream_plan: LogicalPlan, + namespace_qualifier: TableReference, + watermark_strategy_expr: Expr, + ) -> Result { + let resolved_schema = add_timestamp_field( + upstream_plan.schema().clone(), + Some(namespace_qualifier.clone()), + )?; + + let internal_timestamp_offset = resolved_schema + .index_of_column_by_name(None, TIMESTAMP_FIELD) + .ok_or_else(|| { + DataFusionError::Plan(format!( + "Fatal: Failed to resolve mandatory temporal column '{}'", + TIMESTAMP_FIELD + )) + })?; + + Ok(Self { + upstream_plan, + namespace_qualifier, + watermark_strategy_expr, + resolved_schema, + internal_timestamp_offset, + }) + } + + pub(crate) fn generate_fs_schema(&self) -> FsSchema { + FsSchema::new_unkeyed( + Arc::new(self.resolved_schema.as_ref().into()), + self.internal_timestamp_offset, + ) + } + + fn compile_operator_config(&self, planner: &Planner) -> Result { + let physical_expr = planner.create_physical_expr( + &self.watermark_strategy_expr, + &self.resolved_schema, + )?; + + let serialized_expr = + serialize_physical_expr(&physical_expr, &DefaultPhysicalExtensionCodec {})?; + + Ok(ExpressionWatermarkConfig { + period_micros: DEFAULT_WATERMARK_EMISSION_PERIOD_MICROS, + idle_time_micros: None, + expression: serialized_expr.encode_to_vec(), + input_schema: Some(self.generate_fs_schema().into()), + }) + } +} + +// ----------------------------------------------------------------------------- +// DataFusion Logical Node Hooks +// 
----------------------------------------------------------------------------- + +impl UserDefinedLogicalNodeCore for EventTimeWatermarkNode { + fn name(&self) -> &str { + EVENT_TIME_WATERMARK_NODE_NAME + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + vec![&self.upstream_plan] + } + + fn schema(&self) -> &DFSchemaRef { + &self.resolved_schema + } + + fn expressions(&self) -> Vec { + vec![self.watermark_strategy_expr.clone()] + } + + fn fmt_for_explain(&self, f: &mut Formatter) -> std::fmt::Result { + write!( + f, + "EventTimeWatermarkNode({}): Schema={}", + self.namespace_qualifier, self.resolved_schema + ) + } + + fn with_exprs_and_inputs( + &self, + mut exprs: Vec, + mut inputs: Vec, + ) -> Result { + if inputs.len() != 1 { + return internal_err!( + "EventTimeWatermarkNode requires exactly 1 upstream logical plan, but received {}", + inputs.len() + ); + } + if exprs.len() != 1 { + return internal_err!( + "EventTimeWatermarkNode requires exactly 1 watermark strategy expression, but received {}", + exprs.len() + ); + } + + let internal_timestamp_offset = self + .resolved_schema + .index_of_column_by_name(Some(&self.namespace_qualifier), TIMESTAMP_FIELD) + .ok_or_else(|| { + DataFusionError::Plan(format!( + "Optimizer Error: Lost tracking of temporal column '{}'", + TIMESTAMP_FIELD + )) + })?; + + Ok(Self { + upstream_plan: inputs.remove(0), + namespace_qualifier: self.namespace_qualifier.clone(), + watermark_strategy_expr: exprs.remove(0), + resolved_schema: self.resolved_schema.clone(), + internal_timestamp_offset, + }) + } +} + +// ----------------------------------------------------------------------------- +// Core Execution Blueprint Implementation +// ----------------------------------------------------------------------------- + +impl StreamingOperatorBlueprint for EventTimeWatermarkNode { + fn operator_identity(&self) -> Option { + Some(NamedNode::Watermark(self.namespace_qualifier.clone())) + } + + fn compile_to_graph_node( + &self, + planner: &Planner, + 
node_index: usize, + mut upstream_schemas: Vec, + ) -> Result { + if upstream_schemas.len() != 1 { + return plan_err!( + "Topology Violation: EventTimeWatermarkNode requires exactly 1 upstream input, received {}", + upstream_schemas.len() + ); + } + + let operator_config = self.compile_operator_config(planner)?; + + let execution_unit = LogicalNode::single( + node_index as u32, + format!("watermark_{node_index}"), + OperatorName::ExpressionWatermark, + operator_config.encode_to_vec(), + runtime_operator_kind::WATERMARK_GENERATOR.to_string(), + 1, + ); + + let incoming_edge = LogicalEdge::project_all( + LogicalEdgeType::Forward, + (*upstream_schemas.remove(0)).clone(), + ); + + Ok(CompiledTopologyNode { + execution_unit, + routing_edges: vec![incoming_edge], + }) + } + + fn yielded_schema(&self) -> FsSchema { + self.generate_fs_schema() + } +} diff --git a/src/sql/extensions/windows_function.rs b/src/sql/extensions/windows_function.rs new file mode 100644 index 00000000..ccb0ff89 --- /dev/null +++ b/src/sql/extensions/windows_function.rs @@ -0,0 +1,198 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::fmt::Formatter; +use std::sync::Arc; + +use datafusion::common::{Column, DFSchema, DFSchemaRef, Result, internal_err, plan_err}; +use datafusion::logical_expr::{Expr, LogicalPlan, UserDefinedLogicalNodeCore}; +use datafusion_proto::physical_plan::DefaultPhysicalExtensionCodec; +use datafusion_proto::physical_plan::to_proto::serialize_physical_expr; +use datafusion_proto::{physical_plan::AsExecutionPlan, protobuf::PhysicalPlanNode}; +use prost::Message; +use protocol::grpc::api::WindowFunctionOperator; + +use crate::sql::common::constants::{extension_node, proto_operator_name, runtime_operator_kind}; +use crate::sql::common::{FsSchema, FsSchemaRef}; +use crate::sql::logical_node::logical::{LogicalEdge, LogicalEdgeType, LogicalNode, OperatorName}; +use crate::sql::physical::FsPhysicalExtensionCodec; +use crate::sql::logical_planner::planner::{NamedNode, Planner}; +use crate::sql::types::TIMESTAMP_FIELD; + +use super::{CompiledTopologyNode, StreamingOperatorBlueprint}; + +// ----------------------------------------------------------------------------- +// Constants & Identifiers +// ----------------------------------------------------------------------------- + +pub(crate) const STREAMING_WINDOW_NODE_NAME: &str = extension_node::STREAMING_WINDOW_FUNCTION; + +// ----------------------------------------------------------------------------- +// Logical Node Definition +// ----------------------------------------------------------------------------- + +/// Stateful streaming window: temporal binning plus underlying window evaluation plan. 
+#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd)] +pub(crate) struct StreamingWindowFunctionNode { + pub(crate) underlying_evaluation_plan: LogicalPlan, + pub(crate) partition_key_indices: Vec, +} + +impl StreamingWindowFunctionNode { + pub fn new( + underlying_evaluation_plan: LogicalPlan, + partition_key_indices: Vec, + ) -> Self { + Self { + underlying_evaluation_plan, + partition_key_indices, + } + } + + fn compile_temporal_binning_function( + &self, + planner: &Planner, + input_df_schema: &DFSchema, + ) -> Result> { + let timestamp_column = Expr::Column(Column::new_unqualified(TIMESTAMP_FIELD.to_string())); + + let physical_binning_expr = + planner.create_physical_expr(×tamp_column, input_df_schema)?; + + let serialized_expr = + serialize_physical_expr(&physical_binning_expr, &DefaultPhysicalExtensionCodec {})?; + + Ok(serialized_expr.encode_to_vec()) + } + + fn compile_physical_evaluation_plan(&self, planner: &Planner) -> Result> { + let physical_window_plan = planner.sync_plan(&self.underlying_evaluation_plan)?; + + let proto_plan_node = PhysicalPlanNode::try_from_physical_plan( + physical_window_plan, + &FsPhysicalExtensionCodec::default(), + )?; + + Ok(proto_plan_node.encode_to_vec()) + } +} + +// ----------------------------------------------------------------------------- +// DataFusion Logical Node Hooks +// ----------------------------------------------------------------------------- + +impl UserDefinedLogicalNodeCore for StreamingWindowFunctionNode { + fn name(&self) -> &str { + STREAMING_WINDOW_NODE_NAME + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + vec![&self.underlying_evaluation_plan] + } + + fn schema(&self) -> &DFSchemaRef { + self.underlying_evaluation_plan.schema() + } + + fn expressions(&self) -> Vec { + vec![] + } + + fn fmt_for_explain(&self, f: &mut Formatter) -> std::fmt::Result { + write!( + f, + "StreamingWindowFunction: Schema={}", + self.schema() + ) + } + + fn with_exprs_and_inputs( + &self, + _exprs: Vec, + mut inputs: 
Vec, + ) -> Result { + if inputs.len() != 1 { + return internal_err!( + "StreamingWindowFunctionNode requires exactly 1 upstream input, got {}", + inputs.len() + ); + } + + Ok(Self::new( + inputs.remove(0), + self.partition_key_indices.clone(), + )) + } +} + +// ----------------------------------------------------------------------------- +// Core Execution Blueprint Implementation +// ----------------------------------------------------------------------------- + +impl StreamingOperatorBlueprint for StreamingWindowFunctionNode { + fn operator_identity(&self) -> Option { + None + } + + fn compile_to_graph_node( + &self, + planner: &Planner, + node_index: usize, + mut input_schemas: Vec, + ) -> Result { + if input_schemas.len() != 1 { + return plan_err!( + "Topology Violation: StreamingWindowFunctionNode requires exactly 1 upstream input schema, received {}", + input_schemas.len() + ); + } + + let input_schema = input_schemas.remove(0); + + let input_df_schema = DFSchema::try_from(input_schema.schema.as_ref().clone())?; + + let binning_payload = self.compile_temporal_binning_function(planner, &input_df_schema)?; + let evaluation_plan_payload = self.compile_physical_evaluation_plan(planner)?; + + let operator_config = WindowFunctionOperator { + name: proto_operator_name::WINDOW_FUNCTION.to_string(), + input_schema: Some(input_schema.as_ref().clone().into()), + binning_function: binning_payload, + window_function_plan: evaluation_plan_payload, + }; + + let logical_node = LogicalNode::single( + node_index as u32, + format!("window_function_{node_index}"), + OperatorName::WindowFunction, + operator_config.encode_to_vec(), + runtime_operator_kind::STREAMING_WINDOW_EVALUATOR.to_string(), + 1, + ); + + let routing_edge = LogicalEdge::project_all( + LogicalEdgeType::Shuffle, + (*input_schema).clone(), + ); + + Ok(CompiledTopologyNode { + execution_unit: logical_node, + routing_edges: vec![routing_edge], + }) + } + + fn yielded_schema(&self) -> FsSchema { + 
FsSchema::from_schema_unkeyed(Arc::new(self.schema().as_ref().clone().into())).expect( + "Fatal: Failed to generate unkeyed output schema for StreamingWindowFunctionNode", + ) + } +} diff --git a/src/sql/functions/mod.rs b/src/sql/functions/mod.rs new file mode 100644 index 00000000..b78f5d2a --- /dev/null +++ b/src/sql/functions/mod.rs @@ -0,0 +1,612 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use crate::sql::schema::StreamSchemaProvider; +use datafusion::arrow::array::{ + Array, ArrayRef, StringArray, UnionArray, + builder::{FixedSizeBinaryBuilder, ListBuilder, StringBuilder}, + cast::{AsArray, as_string_array}, + types::{Float64Type, Int64Type}, +}; +use datafusion::arrow::datatypes::{DataType, Field, UnionFields, UnionMode}; +use datafusion::arrow::row::{RowConverter, SortField}; +use datafusion::common::{DataFusionError, ScalarValue}; +use datafusion::common::{Result, TableReference}; +use datafusion::execution::FunctionRegistry; +use datafusion::logical_expr::expr::{Alias, ScalarFunction}; +use datafusion::logical_expr::{ + ColumnarValue, LogicalPlan, Projection, ScalarFunctionArgs, ScalarUDFImpl, Signature, + TypeSignature, Volatility, create_udf, +}; +use datafusion::prelude::{Expr, col}; +use serde_json_path::JsonPath; +use std::any::Any; +use std::collections::HashMap; +use std::fmt::{Debug, Write}; +use std::sync::{Arc, OnceLock}; + +use crate::sql::common::constants::scalar_fn; + +/// Borrowed from DataFusion +/// +/// Creates a singleton 
`ScalarUDF` of the `$UDF` function named `$GNAME` and a +/// function named `$NAME` which returns that function named $NAME. +/// +/// This is used to ensure creating the list of `ScalarUDF` only happens once. +#[macro_export] +macro_rules! make_udf_function { + ($UDF:ty, $GNAME:ident, $NAME:ident) => { + /// Singleton instance of the function + static $GNAME: std::sync::OnceLock> = + std::sync::OnceLock::new(); + + /// Return a [`ScalarUDF`] for [`$UDF`] + /// + /// [`ScalarUDF`]: datafusion_expr::ScalarUDF + pub fn $NAME() -> std::sync::Arc { + $GNAME + .get_or_init(|| { + std::sync::Arc::new(datafusion::logical_expr::ScalarUDF::new_from_impl( + <$UDF>::default(), + )) + }) + .clone() + } + }; +} + +make_udf_function!(MultiHashFunction, MULTI_HASH, multi_hash); + +pub fn register_all(registry: &mut dyn FunctionRegistry) { + registry + .register_udf(Arc::new(create_udf( + scalar_fn::GET_FIRST_JSON_OBJECT, + vec![DataType::Utf8, DataType::Utf8], + DataType::Utf8, + Volatility::Immutable, + Arc::new(get_first_json_object), + ))) + .unwrap(); + + registry + .register_udf(Arc::new(create_udf( + scalar_fn::EXTRACT_JSON, + vec![DataType::Utf8, DataType::Utf8], + DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))), + Volatility::Immutable, + Arc::new(extract_json), + ))) + .unwrap(); + + registry + .register_udf(Arc::new(create_udf( + scalar_fn::EXTRACT_JSON_STRING, + vec![DataType::Utf8, DataType::Utf8], + DataType::Utf8, + Volatility::Immutable, + Arc::new(extract_json_string), + ))) + .unwrap(); + + registry + .register_udf(Arc::new(create_udf( + scalar_fn::SERIALIZE_JSON_UNION, + vec![DataType::Union(union_fields(), UnionMode::Sparse)], + DataType::Utf8, + Volatility::Immutable, + Arc::new(serialize_json_union), + ))) + .unwrap(); + + registry.register_udf(multi_hash()).unwrap(); +} + +fn parse_path(name: &str, path: &ScalarValue) -> Result> { + let path = match path { + ScalarValue::Utf8(Some(s)) => JsonPath::parse(s) + .map_err(|e| 
DataFusionError::Execution(format!("Invalid json path '{s}': {e:?}")))?, + ScalarValue::Utf8(None) => { + return Err(DataFusionError::Execution(format!( + "The path argument to {name} cannot be null" + ))); + } + _ => { + return Err(DataFusionError::Execution(format!( + "The path argument to {name} must be of type TEXT" + ))); + } + }; + + Ok(Arc::new(path)) +} + +// Hash function that can take any number of arguments and produces a fast (non-cryptographic) +// 128-bit hash from their string representations +#[derive(Debug)] +pub struct MultiHashFunction { + signature: Signature, +} + +impl MultiHashFunction { + pub fn invoke(&self, args: &[ColumnarValue]) -> Result { + let mut hasher = xxhash_rust::xxh3::Xxh3::new(); + + let all_scalar = args.iter().all(|a| matches!(a, ColumnarValue::Scalar(_))); + + let length = args + .iter() + .map(|t| match t { + ColumnarValue::Scalar(_) => 1, + ColumnarValue::Array(a) => a.len(), + }) + .max() + .ok_or_else(|| { + DataFusionError::Plan("multi_hash must have at least one argument".to_string()) + })?; + + let row_builder = RowConverter::new( + args.iter() + .map(|t| SortField::new(t.data_type().clone())) + .collect(), + )?; + + let arrays = args + .iter() + .map(|c| c.clone().into_array(length)) + .collect::>>()?; + let rows = row_builder.convert_columns(&arrays)?; + + if all_scalar { + hasher.update(rows.row(0).as_ref()); + let result = hasher.digest128().to_be_bytes().to_vec(); + hasher.reset(); + Ok(ColumnarValue::Scalar(ScalarValue::FixedSizeBinary( + size_of::() as i32, + Some(result), + ))) + } else { + let mut builder = + FixedSizeBinaryBuilder::with_capacity(length, size_of::() as i32); + + for row in rows.iter() { + hasher.update(row.as_ref()); + builder.append_value(hasher.digest128().to_be_bytes())?; + hasher.reset(); + } + + Ok(ColumnarValue::Array(Arc::new(builder.finish()))) + } + } +} + +impl Default for MultiHashFunction { + fn default() -> Self { + Self { + signature: Signature::new(TypeSignature::VariadicAny, 
Volatility::Immutable), + } + } +} + +impl ScalarUDFImpl for MultiHashFunction { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + scalar_fn::MULTI_HASH + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(DataType::FixedSizeBinary(size_of::() as i32)) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + self.invoke(&args.args) + } +} + +fn json_function( + name: &str, + f: F, + to_scalar: ToS, + args: &[ColumnarValue], +) -> Result +where + ArrayT: Array + FromIterator> + 'static, + F: Fn(serde_json::Value, &JsonPath) -> Option, + ToS: Fn(Option) -> ScalarValue, +{ + assert_eq!(args.len(), 2); + Ok(match (&args[0], &args[1]) { + (ColumnarValue::Array(values), ColumnarValue::Scalar(path)) => { + let path = parse_path(name, path)?; + let vs = as_string_array(values); + ColumnarValue::Array(Arc::new( + vs.iter() + .map(|s| s.and_then(|s| f(serde_json::from_str(s).ok()?, &path))) + .collect::(), + ) as ArrayRef) + } + (ColumnarValue::Scalar(value), ColumnarValue::Scalar(path)) => { + let path = parse_path(name, path)?; + let ScalarValue::Utf8(value) = value else { + return Err(DataFusionError::Execution(format!( + "The value argument to {name} must be of type TEXT" + ))); + }; + + let result = value + .as_ref() + .and_then(|v| f(serde_json::from_str(v).ok()?, &path)); + ColumnarValue::Scalar(to_scalar(result)) + } + _ => { + return Err(DataFusionError::Execution( + "The path argument to {name} must be a literal".to_string(), + )); + } + }) +} + +pub fn extract_json(args: &[ColumnarValue]) -> Result { + assert_eq!(args.len(), 2); + + let inner = |s, path: &JsonPath| { + Some( + path.query(&serde_json::from_str(s).ok()?) 
+ .iter() + .map(|v| Some(v.to_string())) + .collect::>>(), + ) + }; + + Ok(match (&args[0], &args[1]) { + (ColumnarValue::Array(values), ColumnarValue::Scalar(path)) => { + let path = parse_path("extract_json", path)?; + let values = as_string_array(values); + + let mut builder = ListBuilder::with_capacity(StringBuilder::new(), values.len()); + + let queried = values.iter().map(|s| s.and_then(|s| inner(s, &path))); + + for v in queried { + builder.append_option(v); + } + + ColumnarValue::Array(Arc::new(builder.finish())) + } + (ColumnarValue::Scalar(value), ColumnarValue::Scalar(path)) => { + let path = parse_path("extract_json", path)?; + let ScalarValue::Utf8(v) = value else { + return Err(DataFusionError::Execution( + "The value argument to extract_json must be of type TEXT".to_string(), + )); + }; + + let mut builder = ListBuilder::with_capacity(StringBuilder::new(), 1); + let result = v.as_ref().and_then(|s| inner(s, &path)); + builder.append_option(result); + + ColumnarValue::Scalar(ScalarValue::List(Arc::new(builder.finish()))) + } + _ => { + return Err(DataFusionError::Execution( + "The path argument to extract_json must be a literal".to_string(), + )); + } + }) +} + +pub fn get_first_json_object(args: &[ColumnarValue]) -> Result { + json_function::( + "get_first_json_object", + |s, path| path.query(&s).first().map(|v| v.to_string()), + |s| s.as_deref().into(), + args, + ) +} + +pub fn extract_json_string(args: &[ColumnarValue]) -> Result { + json_function::( + "extract_json_string", + |s, path| { + path.query(&s) + .first() + .and_then(|v| v.as_str().map(|s| s.to_string())) + }, + |s| s.as_deref().into(), + args, + ) +} + +// This code is vendored from +// https://github.com/datafusion-contrib/datafusion-functions-json/blob/main/src/common_union.rs +// as the `is_json_union` function is not public. It should be kept in sync with that code so +// that we are able to detect JSON unions and rewrite them to serialized JSON for sinks. 
+pub(crate) fn is_json_union(data_type: &DataType) -> bool { + match data_type { + DataType::Union(fields, UnionMode::Sparse) => fields == &union_fields(), + _ => false, + } +} + +pub(crate) const TYPE_ID_NULL: i8 = 0; +const TYPE_ID_BOOL: i8 = 1; +const TYPE_ID_INT: i8 = 2; +const TYPE_ID_FLOAT: i8 = 3; +const TYPE_ID_STR: i8 = 4; +const TYPE_ID_ARRAY: i8 = 5; +const TYPE_ID_OBJECT: i8 = 6; + +fn union_fields() -> UnionFields { + static FIELDS: OnceLock = OnceLock::new(); + FIELDS + .get_or_init(|| { + let json_metadata: HashMap = + HashMap::from_iter(vec![("is_json".to_string(), "true".to_string())]); + UnionFields::from_iter([ + ( + TYPE_ID_NULL, + Arc::new(Field::new("null", DataType::Null, true)), + ), + ( + TYPE_ID_BOOL, + Arc::new(Field::new("bool", DataType::Boolean, false)), + ), + ( + TYPE_ID_INT, + Arc::new(Field::new("int", DataType::Int64, false)), + ), + ( + TYPE_ID_FLOAT, + Arc::new(Field::new("float", DataType::Float64, false)), + ), + ( + TYPE_ID_STR, + Arc::new(Field::new("str", DataType::Utf8, false)), + ), + ( + TYPE_ID_ARRAY, + Arc::new( + Field::new("array", DataType::Utf8, false) + .with_metadata(json_metadata.clone()), + ), + ), + ( + TYPE_ID_OBJECT, + Arc::new( + Field::new("object", DataType::Utf8, false) + .with_metadata(json_metadata.clone()), + ), + ), + ]) + }) + .clone() +} +// End vendored code + +pub fn serialize_json_union(args: &[ColumnarValue]) -> Result { + assert_eq!(args.len(), 1); + let array = match args.first().unwrap() { + ColumnarValue::Array(a) => a.clone(), + ColumnarValue::Scalar(s) => s.to_array_of_size(1)?, + }; + + let mut b = StringBuilder::with_capacity(array.len(), array.get_array_memory_size()); + + write_union(&mut b, &array)?; + + Ok(ColumnarValue::Array(Arc::new(b.finish()))) +} + +fn write_union(b: &mut StringBuilder, array: &ArrayRef) -> Result<(), std::fmt::Error> { + assert!( + is_json_union(array.data_type()), + "array item is not a valid JSON union" + ); + let json_union = 
array.as_any().downcast_ref::().unwrap(); + + for i in 0..json_union.len() { + if json_union.is_null(i) { + b.append_null(); + } else { + write_value(b, json_union.type_id(i), &json_union.value(i))?; + b.append_value(""); + } + } + + Ok(()) +} + +fn write_value(b: &mut StringBuilder, id: i8, a: &ArrayRef) -> Result<(), std::fmt::Error> { + match id { + TYPE_ID_NULL => write!(b, "null")?, + TYPE_ID_BOOL => write!(b, "{}", a.as_boolean().value(0))?, + TYPE_ID_INT => write!(b, "{}", a.as_primitive::().value(0))?, + TYPE_ID_FLOAT => write!(b, "{}", a.as_primitive::().value(0))?, + TYPE_ID_STR => { + // assumes that this is already a valid (escaped) json string as the only way to + // construct these values are by parsing (valid) JSON + b.write_char('"')?; + b.write_str(a.as_string::().value(0))?; + b.write_char('"')?; + } + TYPE_ID_ARRAY => { + b.write_str(a.as_string::().value(0))?; + } + TYPE_ID_OBJECT => { + b.write_str(a.as_string::().value(0))?; + } + _ => unreachable!("invalid union type in JSON union: {}", id), + } + + Ok(()) +} + +pub(crate) fn serialize_outgoing_json( + registry: &StreamSchemaProvider, + node: Arc, +) -> LogicalPlan { + let exprs = node + .schema() + .fields() + .iter() + .map(|f| { + if is_json_union(f.data_type()) { + Expr::Alias(Alias::new( + Expr::ScalarFunction(ScalarFunction::new_udf( + registry.udf(scalar_fn::SERIALIZE_JSON_UNION).unwrap(), + vec![col(f.name())], + )), + Option::::None, + f.name(), + )) + } else { + col(f.name()) + } + }) + .collect(); + + LogicalPlan::Projection(Projection::try_new(exprs, node).unwrap()) +} + +#[cfg(test)] +mod test { + use datafusion::arrow::array::StringArray; + use datafusion::arrow::array::builder::{ListBuilder, StringBuilder}; + use datafusion::common::ScalarValue; + use std::sync::Arc; + + #[test] + fn test_extract_json() { + let input = Arc::new(StringArray::from(vec![ + r#"{"a": 1, "b": 2, "c": { "d": "hello" }}"#, + r#"{"a": 3, "b": 4}"#, + r#"{"a": 5, "b": 6}"#, + ])); + + let path = "$.c.d"; 
+ + let result = super::extract_json(&[ + super::ColumnarValue::Array(input), + super::ColumnarValue::Scalar(path.into()), + ]) + .unwrap(); + + let mut expected = ListBuilder::new(StringBuilder::new()); + expected.append_value(vec![Some("\"hello\"".to_string())]); + expected.append_value(Vec::>::new()); + expected.append_value(Vec::>::new()); + if let super::ColumnarValue::Array(result) = result { + assert_eq!(*result, expected.finish()); + } else { + panic!("Expected array, got scalar"); + } + + let result = super::extract_json(&[ + super::ColumnarValue::Scalar(r#"{"a": 1, "b": 2, "c": { "d": "hello" }}"#.into()), + super::ColumnarValue::Scalar(path.into()), + ]) + .unwrap(); + + let mut expected = ListBuilder::with_capacity(StringBuilder::new(), 1); + expected.append_value(vec![Some("\"hello\"".to_string())]); + + if let super::ColumnarValue::Scalar(ScalarValue::List(result)) = result { + assert_eq!(*result, expected.finish()); + } else { + panic!("Expected scalar list"); + } + } + + #[test] + fn test_get_first_json_object() { + let input = Arc::new(StringArray::from(vec![ + r#"{"a": 1, "b": 2}"#, + r#"{"a": 3}"#, + r#"{"a": 5, "b": 6}"#, + ])); + + let path = "$.b"; + + let result = super::get_first_json_object(&[ + super::ColumnarValue::Array(input), + super::ColumnarValue::Scalar(path.into()), + ]) + .unwrap(); + + let expected = StringArray::from(vec![Some("2"), None, Some("6")]); + + if let super::ColumnarValue::Array(result) = result { + assert_eq!(*result, expected); + } else { + panic!("Expected array, got scalar"); + } + + let result = super::get_first_json_object(&[ + super::ColumnarValue::Scalar(r#"{"a": 1, "b": 2, "c": { "d": "hello" }}"#.into()), + super::ColumnarValue::Scalar("$.c.d".into()), + ]) + .unwrap(); + + let expected = ScalarValue::Utf8(Some("\"hello\"".to_string())); + + if let super::ColumnarValue::Scalar(result) = result { + assert_eq!(result, expected); + } else { + panic!("Expected scalar"); + } + } + + #[test] + fn 
test_extract_json_string() { + let input = Arc::new(StringArray::from(vec![ + r#"{"a": 1, "b": 2, "c": { "d": "hello" }}"#, + r#"{"a": 3, "b": 4}"#, + r#"{"a": 5, "b": 6}"#, + ])); + + let path = "$.c.d"; + + let result = super::extract_json_string(&[ + super::ColumnarValue::Array(input), + super::ColumnarValue::Scalar(path.into()), + ]) + .unwrap(); + + let expected = StringArray::from(vec![Some("hello"), None, None]); + + if let super::ColumnarValue::Array(result) = result { + assert_eq!(*result, expected); + } else { + panic!("Expected array, got scalar"); + } + + let result = super::extract_json_string(&[ + super::ColumnarValue::Scalar(r#"{"a": 1, "b": 2, "c": { "d": "hello" }}"#.into()), + super::ColumnarValue::Scalar(path.into()), + ]) + .unwrap(); + + let expected = ScalarValue::Utf8(Some("hello".to_string())); + + if let super::ColumnarValue::Scalar(result) = result { + assert_eq!(result, expected); + } else { + panic!("Expected scalar"); + } + } +} diff --git a/src/sql/grammar.pest b/src/sql/grammar.pest deleted file mode 100644 index 15f70dd7..00000000 --- a/src/sql/grammar.pest +++ /dev/null @@ -1,134 +0,0 @@ -// ============================================================================= -// FUNCTION SQL Grammar -// -// Using pest PEG syntax, referencing ANTLR style -// ============================================================================= - -// ============================================================================= -// 1. Whitespace (automatically skipped) -// ============================================================================= - -WHITESPACE = _{ " " | "\t" | "\r" | "\n" } - -// ============================================================================= -// 2. 
Keywords (case-insensitive) -// ============================================================================= - -kw_create = _{ C ~ R ~ E ~ A ~ T ~ E } -kw_drop = _{ D ~ R ~ O ~ P } -kw_start = _{ S ~ T ~ A ~ R ~ T } -kw_stop = _{ S ~ T ~ O ~ P } -kw_show = _{ S ~ H ~ O ~ W } -kw_with = _{ W ~ I ~ T ~ H } -kw_function = _{ F ~ U ~ N ~ C ~ T ~ I ~ O ~ N } -kw_functions = _{ F ~ U ~ N ~ C ~ T ~ I ~ O ~ N ~ S } - -// ============================================================================= -// 3. Operators & Symbols -// ============================================================================= - -LPAREN = _{ "(" } -RPAREN = _{ ")" } -COMMA = _{ "," } -EQ = _{ "=" } -SQUOTE = _{ "'" } -DQUOTE = _{ "\"" } - -// ============================================================================= -// 4. Literals -// ============================================================================= - -// String literal (single or double quotes) -string_literal = @{ - SQUOTE ~ string_inner_single ~ SQUOTE | - DQUOTE ~ string_inner_double ~ DQUOTE -} - -string_inner_single = @{ (!(SQUOTE | "\\") ~ ANY | escape_seq)* } -string_inner_double = @{ (!(DQUOTE | "\\") ~ ANY | escape_seq)* } -escape_seq = @{ "\\" ~ ANY } - -// ============================================================================= -// 5. Identifiers -// ============================================================================= - -// Task name identifier -identifier = @{ (ASCII_ALPHA | "_") ~ (ASCII_ALPHANUMERIC | "_" | "-")* } - -// ============================================================================= -// 6. Statements -// ============================================================================= - -// Entry rule -statement = _{ - SOI ~ ( - create_stmt | - drop_stmt | - start_stmt | - stop_stmt | - show_stmt - ) ~ EOI -} - -// CREATE FUNCTION WITH (...) 
-// Note: name is read from config file, not from SQL statement -create_stmt = { kw_create ~ kw_function ~ kw_with ~ properties } - -// DROP FUNCTION name -drop_stmt = { kw_drop ~ kw_function ~ identifier } - -// START FUNCTION name -start_stmt = { kw_start ~ kw_function ~ identifier } - -// STOP FUNCTION name -stop_stmt = { kw_stop ~ kw_function ~ identifier } - -// SHOW FUNCTIONS -show_stmt = { kw_show ~ kw_functions } - -// ============================================================================= -// 7. Properties -// ============================================================================= - -// Property list ('key'='value', ...) -properties = { LPAREN ~ property ~ (COMMA ~ property)* ~ RPAREN } - -// Single property 'key'='value' -property = { property_key ~ EQ ~ property_value } - -// Property key (string) -property_key = { string_literal } - -// Property value (string) -property_value = { string_literal } - -// ============================================================================= -// 8. 
Character Fragments (for case-insensitive matching) -// ============================================================================= - -A = _{ "A" | "a" } -B = _{ "B" | "b" } -C = _{ "C" | "c" } -D = _{ "D" | "d" } -E = _{ "E" | "e" } -F = _{ "F" | "f" } -G = _{ "G" | "g" } -H = _{ "H" | "h" } -I = _{ "I" | "i" } -J = _{ "J" | "j" } -K = _{ "K" | "k" } -L = _{ "L" | "l" } -M = _{ "M" | "m" } -N = _{ "N" | "n" } -O = _{ "O" | "o" } -P = _{ "P" | "p" } -Q = _{ "Q" | "q" } -R = _{ "R" | "r" } -S = _{ "S" | "s" } -T = _{ "T" | "t" } -U = _{ "U" | "u" } -V = _{ "V" | "v" } -W = _{ "W" | "w" } -X = _{ "X" | "x" } -Y = _{ "Y" | "y" } -Z = _{ "Z" | "z" } diff --git a/src/sql/logical_node/logical/dylib_udf_config.rs b/src/sql/logical_node/logical/dylib_udf_config.rs new file mode 100644 index 00000000..6c88054f --- /dev/null +++ b/src/sql/logical_node/logical/dylib_udf_config.rs @@ -0,0 +1,71 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use datafusion::arrow::datatypes::DataType; +use datafusion_proto::protobuf::ArrowType; +use prost::Message; +use protocol::grpc::api; + +#[derive(Clone, Debug, Eq, PartialEq, Hash, PartialOrd)] +pub struct DylibUdfConfig { + pub dylib_path: String, + pub arg_types: Vec, + pub return_type: DataType, + pub aggregate: bool, + pub is_async: bool, +} + +impl From for api::DylibUdfConfig { + fn from(from: DylibUdfConfig) -> Self { + api::DylibUdfConfig { + dylib_path: from.dylib_path, + arg_types: from + .arg_types + .iter() + .map(|t| { + ArrowType::try_from(t) + .expect("unsupported data type") + .encode_to_vec() + }) + .collect(), + return_type: ArrowType::try_from(&from.return_type) + .expect("unsupported data type") + .encode_to_vec(), + aggregate: from.aggregate, + is_async: from.is_async, + } + } +} + +impl From for DylibUdfConfig { + fn from(from: api::DylibUdfConfig) -> Self { + DylibUdfConfig { + dylib_path: from.dylib_path, + arg_types: from + .arg_types + .iter() + .map(|t| { + DataType::try_from( + &ArrowType::decode(&mut t.as_slice()).expect("invalid arrow type"), + ) + .expect("invalid arrow type") + }) + .collect(), + return_type: DataType::try_from( + &ArrowType::decode(&mut from.return_type.as_slice()).unwrap(), + ) + .expect("invalid arrow type"), + aggregate: from.aggregate, + is_async: from.is_async, + } + } +} diff --git a/src/sql/logical_node/logical/fs_program_convert.rs b/src/sql/logical_node/logical/fs_program_convert.rs new file mode 100644 index 00000000..a8ac20b1 --- /dev/null +++ b/src/sql/logical_node/logical/fs_program_convert.rs @@ -0,0 +1,201 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Conversions between [`LogicalProgram`] and `protocol::grpc::api::FsProgram` / pipeline API types. + +use std::collections::HashMap; +use std::str::FromStr; +use std::sync::Arc; + +use datafusion::common::{DataFusionError, Result as DFResult}; +use petgraph::graph::DiGraph; +use petgraph::prelude::EdgeRef; +use protocol::grpc::api::{ + ChainedOperator, EdgeType as ProtoEdgeType, FsEdge, FsNode, FsProgram, FsSchema as ProtoFsSchema, +}; + +use crate::sql::api::pipelines::{PipelineEdge, PipelineGraph, PipelineNode}; +use crate::sql::common::FsSchema; + +use super::logical_edge::logical_edge_type_from_proto_i32; +use super::operator_chain::{ChainedLogicalOperator, OperatorChain}; +use super::operator_name::OperatorName; +use super::{LogicalEdge, LogicalNode, LogicalProgram, ProgramConfig}; + +impl TryFrom for LogicalProgram { + type Error = DataFusionError; + + fn try_from(value: FsProgram) -> DFResult { + let mut graph = DiGraph::new(); + let mut id_map = HashMap::with_capacity(value.nodes.len()); + + for node in value.nodes { + let operators = node + .operators + .into_iter() + .map(|op| { + let ChainedOperator { + operator_id, + operator_name: name_str, + operator_config, + } = op; + let operator_name = OperatorName::from_str(&name_str).map_err(|_| { + DataFusionError::Plan(format!("Invalid operator name: {name_str}")) + })?; + Ok(ChainedLogicalOperator { + operator_id, + operator_name, + operator_config, + }) + }) + .collect::>>()?; + + let edges = node + .edges + .into_iter() + .map(|e| { + let fs: FsSchema = e.try_into()?; + 
Ok(Arc::new(fs)) + }) + .collect::>>()?; + + let logical_node = LogicalNode { + node_id: node.node_id, + description: node.description, + operator_chain: OperatorChain { operators, edges }, + parallelism: node.parallelism as usize, + }; + + id_map.insert(node.node_index, graph.add_node(logical_node)); + } + + for edge in value.edges { + let source = *id_map.get(&edge.source).ok_or_else(|| { + DataFusionError::Plan("Graph integrity error: Missing source node".into()) + })?; + let target = *id_map.get(&edge.target).ok_or_else(|| { + DataFusionError::Plan("Graph integrity error: Missing target node".into()) + })?; + let schema = edge + .schema + .ok_or_else(|| DataFusionError::Plan("Graph integrity error: Missing edge schema".into()))?; + let edge_type = logical_edge_type_from_proto_i32(edge.edge_type)?; + + graph.add_edge( + source, + target, + LogicalEdge { + edge_type, + schema: Arc::new(FsSchema::try_from(schema)?), + }, + ); + } + + let program_config = value + .program_config + .map(ProgramConfig::from) + .unwrap_or_default(); + + Ok(LogicalProgram::new(graph, program_config)) + } +} + +impl From for FsProgram { + fn from(value: LogicalProgram) -> Self { + let nodes = value + .graph + .node_indices() + .filter_map(|idx| value.graph.node_weight(idx).map(|node| (idx, node))) + .map(|(idx, node)| FsNode { + node_index: idx.index() as i32, + node_id: node.node_id, + parallelism: node.parallelism as u32, + description: node.description.clone(), + operators: node + .operator_chain + .operators + .iter() + .map(|op| ChainedOperator { + operator_id: op.operator_id.clone(), + operator_name: op.operator_name.to_string(), + operator_config: op.operator_config.clone(), + }) + .collect(), + edges: node + .operator_chain + .edges + .iter() + .map(|edge| ProtoFsSchema::from((**edge).clone())) + .collect(), + }) + .collect(); + + let edges = value + .graph + .edge_indices() + .filter_map(|eidx| { + let edge = value.graph.edge_weight(eidx)?; + let (source, target) = 
value.graph.edge_endpoints(eidx)?; + Some(FsEdge { + source: source.index() as i32, + target: target.index() as i32, + schema: Some(ProtoFsSchema::from((*edge.schema).clone())), + edge_type: ProtoEdgeType::from(edge.edge_type) as i32, + }) + }) + .collect(); + + FsProgram { + nodes, + edges, + program_config: Some(value.program_config.into()), + } + } +} + +impl TryFrom for PipelineGraph { + type Error = DataFusionError; + + fn try_from(value: LogicalProgram) -> DFResult { + let nodes = value + .graph + .node_weights() + .map(|node| { + Ok(PipelineNode { + node_id: node.node_id, + operator: node.resolve_pipeline_operator_name()?, + description: node.description.clone(), + parallelism: node.parallelism as u32, + }) + }) + .collect::>>()?; + + let edges = value + .graph + .edge_references() + .filter_map(|edge| { + let src = value.graph.node_weight(edge.source())?; + let target = value.graph.node_weight(edge.target())?; + Some(PipelineEdge { + src_id: src.node_id, + dest_id: target.node_id, + key_type: "()".to_string(), + value_type: "()".to_string(), + edge_type: format!("{:?}", edge.weight().edge_type), + }) + }) + .collect(); + + Ok(PipelineGraph { nodes, edges }) + } +} diff --git a/src/sql/logical_node/logical/logical_edge.rs b/src/sql/logical_node/logical/logical_edge.rs new file mode 100644 index 00000000..1a169c1d --- /dev/null +++ b/src/sql/logical_node/logical/logical_edge.rs @@ -0,0 +1,102 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::fmt::{Display, Formatter}; +use std::sync::Arc; + +use datafusion::common::{DataFusionError, Result}; +use protocol::grpc::api::EdgeType as ProtoEdgeType; +use serde::{Deserialize, Serialize}; + +use crate::sql::common::FsSchema; + +#[derive(Copy, Clone, Debug, Eq, PartialEq, PartialOrd, Ord, Serialize, Deserialize)] +pub enum LogicalEdgeType { + Forward, + Shuffle, + LeftJoin, + RightJoin, +} + +impl Display for LogicalEdgeType { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + let symbol = match self { + LogicalEdgeType::Forward => "→", + LogicalEdgeType::Shuffle => "⤨", + LogicalEdgeType::LeftJoin => "-[left]⤨", + LogicalEdgeType::RightJoin => "-[right]⤨", + }; + write!(f, "{symbol}") + } +} + +impl From for LogicalEdgeType { + fn from(value: ProtoEdgeType) -> Self { + match value { + ProtoEdgeType::Unused => { + panic!("Critical: Invalid EdgeType 'Unused' encountered") + } + ProtoEdgeType::Forward => Self::Forward, + ProtoEdgeType::Shuffle => Self::Shuffle, + ProtoEdgeType::LeftJoin => Self::LeftJoin, + ProtoEdgeType::RightJoin => Self::RightJoin, + } + } +} + +impl From for ProtoEdgeType { + fn from(value: LogicalEdgeType) -> Self { + match value { + LogicalEdgeType::Forward => Self::Forward, + LogicalEdgeType::Shuffle => Self::Shuffle, + LogicalEdgeType::LeftJoin => Self::LeftJoin, + LogicalEdgeType::RightJoin => Self::RightJoin, + } + } +} + +pub(crate) fn logical_edge_type_from_proto_i32(i: i32) -> Result { + let e = ProtoEdgeType::try_from(i).map_err(|_| { + DataFusionError::Plan(format!("invalid protobuf EdgeType discriminant {i}")) + })?; + match e { + ProtoEdgeType::Unused => Err(DataFusionError::Plan( + "Critical: Invalid EdgeType 'Unused' encountered".into(), + )), + ProtoEdgeType::Forward => Ok(LogicalEdgeType::Forward), + ProtoEdgeType::Shuffle => Ok(LogicalEdgeType::Shuffle), + ProtoEdgeType::LeftJoin => Ok(LogicalEdgeType::LeftJoin), + ProtoEdgeType::RightJoin => Ok(LogicalEdgeType::RightJoin), + } +} + +#[derive(Clone, 
Debug, Eq, PartialEq, Serialize, Deserialize)] +pub struct LogicalEdge { + pub edge_type: LogicalEdgeType, + pub schema: Arc, +} + +impl LogicalEdge { + pub fn new(edge_type: LogicalEdgeType, schema: FsSchema) -> Self { + LogicalEdge { + edge_type, + schema: Arc::new(schema), + } + } + + pub fn project_all(edge_type: LogicalEdgeType, schema: FsSchema) -> Self { + LogicalEdge { + edge_type, + schema: Arc::new(schema), + } + } +} diff --git a/src/sql/logical_node/logical/logical_graph.rs b/src/sql/logical_node/logical/logical_graph.rs new file mode 100644 index 00000000..b877e2a0 --- /dev/null +++ b/src/sql/logical_node/logical/logical_graph.rs @@ -0,0 +1,30 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use petgraph::graph::DiGraph; + +use super::logical_edge::LogicalEdge; +use super::logical_node::LogicalNode; + +pub type LogicalGraph = DiGraph; + +pub trait Optimizer { + fn optimize_once(&self, plan: &mut LogicalGraph) -> bool; + + fn optimize(&self, plan: &mut LogicalGraph) { + loop { + if !self.optimize_once(plan) { + break; + } + } + } +} diff --git a/src/sql/logical_node/logical/logical_node.rs b/src/sql/logical_node/logical/logical_node.rs new file mode 100644 index 00000000..26129b26 --- /dev/null +++ b/src/sql/logical_node/logical/logical_node.rs @@ -0,0 +1,89 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::fmt::{Debug, Display, Formatter}; + +use datafusion::common::{DataFusionError, Result}; +use itertools::Itertools; +use serde::{Deserialize, Serialize}; + +use super::operator_chain::{ChainedLogicalOperator, OperatorChain}; +use super::operator_name::OperatorName; + +#[derive(Clone, Serialize, Deserialize)] +pub struct LogicalNode { + pub node_id: u32, + pub description: String, + pub operator_chain: OperatorChain, + pub parallelism: usize, +} + +impl LogicalNode { + pub fn single( + id: u32, + operator_id: String, + name: OperatorName, + config: Vec, + description: String, + parallelism: usize, + ) -> Self { + Self { + node_id: id, + description, + operator_chain: OperatorChain { + operators: vec![ChainedLogicalOperator { + operator_id, + operator_name: name, + operator_config: config, + }], + edges: vec![], + }, + parallelism, + } + } + + pub fn resolve_pipeline_operator_name(&self) -> Result { + let first_op = self + .operator_chain + .operators + .first() + .ok_or_else(|| DataFusionError::Plan("Invalid LogicalNode: Operator chain is empty".into()))?; + + if let Some(connector_name) = first_op.extract_connector_name() { + return Ok(connector_name); + } + + if self.operator_chain.len() == 1 { + return Ok(first_op.operator_id.clone()); + } + + Ok("chained_op".to_string()) + } +} + +impl Display for LogicalNode { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.description) + } +} + +impl Debug for LogicalNode { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + let chain_path = self + 
.operator_chain + .operators + .iter() + .map(|op| op.operator_id.as_str()) + .join(" -> "); + write!(f, "{chain_path}[{}]", self.parallelism) + } +} diff --git a/src/sql/logical_node/logical/logical_program.rs b/src/sql/logical_node/logical/logical_program.rs new file mode 100644 index 00000000..888f4292 --- /dev/null +++ b/src/sql/logical_node/logical/logical_program.rs @@ -0,0 +1,156 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::hash_map::DefaultHasher; +use std::collections::{HashMap, HashSet}; +use std::hash::Hasher; +use std::sync::Arc; + +use datafusion::arrow::datatypes::Schema; +use datafusion::common::{DataFusionError, Result as DFResult}; +use petgraph::Direction; +use petgraph::dot::Dot; +use prost::Message; +use protocol::grpc::api::FsProgram; +use rand::distributions::Alphanumeric; +use rand::rngs::SmallRng; +use rand::{Rng, SeedableRng}; + +use super::logical_graph::{LogicalGraph, Optimizer}; +use super::operator_name::OperatorName; +use super::program_config::ProgramConfig; + +#[derive(Clone, Debug, Default)] +pub struct LogicalProgram { + pub graph: LogicalGraph, + pub program_config: ProgramConfig, +} + +impl LogicalProgram { + pub fn new(graph: LogicalGraph, program_config: ProgramConfig) -> Self { + Self { + graph, + program_config, + } + } + + pub fn optimize(&mut self, optimizer: &dyn Optimizer) { + optimizer.optimize(&mut self.graph); + } + + pub fn update_parallelism(&mut self, overrides: &HashMap) { + 
for node in self.graph.node_weights_mut() { + if let Some(&p) = overrides.get(&node.node_id) { + node.parallelism = p; + } + } + } + + pub fn dot(&self) -> String { + format!("{:?}", Dot::with_config(&self.graph, &[])) + } + + pub fn task_count(&self) -> usize { + self.graph.node_weights().map(|nw| nw.parallelism).sum() + } + + pub fn sources(&self) -> HashSet { + self.graph + .externals(Direction::Incoming) + .filter_map(|idx| self.graph.node_weight(idx)) + .map(|node| node.node_id) + .collect() + } + + pub fn get_hash(&self) -> String { + let mut hasher = DefaultHasher::new(); + let program_bytes = FsProgram::from(self.clone()).encode_to_vec(); + hasher.write(&program_bytes); + let rng = SmallRng::seed_from_u64(hasher.finish()); + rng.sample_iter(&Alphanumeric) + .take(16) + .map(|c| (c as char).to_ascii_lowercase()) + .collect() + } + + pub fn tasks_per_operator(&self) -> HashMap { + self.graph + .node_weights() + .flat_map(|node| { + node.operator_chain + .operators + .iter() + .map(move |op| (op.operator_id.clone(), node.parallelism)) + }) + .collect() + } + + pub fn operator_names_by_id(&self) -> HashMap { + self.graph + .node_weights() + .flat_map(|node| &node.operator_chain.operators) + .map(|op| { + let resolved_name = op + .extract_connector_name() + .unwrap_or_else(|| op.operator_name.to_string()); + (op.operator_id.clone(), resolved_name) + }) + .collect() + } + + pub fn tasks_per_node(&self) -> HashMap { + self.graph + .node_weights() + .map(|node| (node.node_id, node.parallelism)) + .collect() + } + + pub fn features(&self) -> HashSet { + self.graph + .node_weights() + .flat_map(|node| &node.operator_chain.operators) + .filter_map(|op| op.extract_feature()) + .collect() + } + + /// Arrow schema carried on edges into the connector-sink node, if present. 
+ pub fn egress_arrow_schema(&self) -> Option> { + for idx in self.graph.node_indices() { + let node = self.graph.node_weight(idx)?; + if node + .operator_chain + .operators + .iter() + .any(|op| op.operator_name == OperatorName::ConnectorSink) + { + let e = self + .graph + .edges_directed(idx, Direction::Incoming) + .next()?; + return Some(Arc::clone(&e.weight().schema.schema)); + } + } + None + } + + pub fn encode_for_catalog(&self) -> DFResult> { + Ok(FsProgram::from(self.clone()).encode_to_vec()) + } + + pub fn decode_for_catalog(bytes: &[u8]) -> DFResult { + let proto = FsProgram::decode(bytes).map_err(|e| { + DataFusionError::Execution(format!("FsProgram catalog decode failed: {e}")) + })?; + LogicalProgram::try_from(proto) + } +} diff --git a/src/sql/logical_node/logical/mod.rs b/src/sql/logical_node/logical/mod.rs new file mode 100644 index 00000000..d2e9a327 --- /dev/null +++ b/src/sql/logical_node/logical/mod.rs @@ -0,0 +1,30 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +mod dylib_udf_config; +mod fs_program_convert; +mod logical_edge; +mod logical_graph; +mod logical_node; +mod logical_program; +mod operator_chain; +mod operator_name; +mod program_config; +mod python_udf_config; + +pub use dylib_udf_config::DylibUdfConfig; +pub use logical_edge::{LogicalEdge, LogicalEdgeType}; +pub use logical_graph::{LogicalGraph, Optimizer}; +pub use logical_node::LogicalNode; +pub use logical_program::LogicalProgram; +pub use operator_name::OperatorName; +pub use program_config::ProgramConfig; diff --git a/src/sql/logical_node/logical/operator_chain.rs b/src/sql/logical_node/logical/operator_chain.rs new file mode 100644 index 00000000..e74684ba --- /dev/null +++ b/src/sql/logical_node/logical/operator_chain.rs @@ -0,0 +1,128 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::sync::Arc; + +use itertools::{EitherOrBoth, Itertools}; +use prost::Message; +use protocol::grpc::api::ConnectorOp; +use serde::{Deserialize, Serialize}; + +use super::operator_name::OperatorName; +use crate::sql::common::FsSchema; + +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct ChainedLogicalOperator { + pub operator_id: String, + pub operator_name: OperatorName, + pub operator_config: Vec, +} + +impl ChainedLogicalOperator { + pub fn extract_connector_name(&self) -> Option { + if matches!( + self.operator_name, + OperatorName::ConnectorSource | OperatorName::ConnectorSink + ) { + ConnectorOp::decode(self.operator_config.as_slice()) + .ok() + .map(|op| op.connector) + } else { + None + } + } + + pub fn extract_feature(&self) -> Option { + match self.operator_name { + OperatorName::AsyncUdf => Some("async-udf".to_string()), + OperatorName::Join => Some("join-with-expiration".to_string()), + OperatorName::InstantJoin => Some("windowed-join".to_string()), + OperatorName::WindowFunction => Some("sql-window-function".to_string()), + OperatorName::LookupJoin => Some("lookup-join".to_string()), + OperatorName::TumblingWindowAggregate => { + Some("sql-tumbling-window-aggregate".to_string()) + } + OperatorName::SlidingWindowAggregate => { + Some("sql-sliding-window-aggregate".to_string()) + } + OperatorName::SessionWindowAggregate => { + Some("sql-session-window-aggregate".to_string()) + } + OperatorName::UpdatingAggregate => Some("sql-updating-aggregate".to_string()), + OperatorName::ConnectorSource => self + .extract_connector_name() + .map(|c| format!("{c}-source")), + OperatorName::ConnectorSink => self.extract_connector_name().map(|c| format!("{c}-sink")), + _ => None, + } + } +} + +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct OperatorChain { + pub(crate) operators: Vec, + pub(crate) edges: Vec>, +} + +impl OperatorChain { + pub fn new(operator: ChainedLogicalOperator) -> Self { + Self { + operators: vec![operator], + edges: 
vec![], + } + } + + pub fn iter( + &self, + ) -> impl Iterator>)> { + self.operators.iter().zip_longest(&self.edges).filter_map(|e| match e { + EitherOrBoth::Both(op, edge) => Some((op, Some(edge))), + EitherOrBoth::Left(op) => Some((op, None)), + EitherOrBoth::Right(_) => None, + }) + } + + pub fn iter_mut( + &mut self, + ) -> impl Iterator>)> { + self.operators + .iter_mut() + .zip_longest(&self.edges) + .filter_map(|e| match e { + EitherOrBoth::Both(op, edge) => Some((op, Some(edge))), + EitherOrBoth::Left(op) => Some((op, None)), + EitherOrBoth::Right(_) => None, + }) + } + + pub fn first(&self) -> &ChainedLogicalOperator { + self.operators + .first() + .expect("OperatorChain must contain at least one operator") + } + + pub fn len(&self) -> usize { + self.operators.len() + } + + pub fn is_empty(&self) -> bool { + self.operators.is_empty() + } + + pub fn is_source(&self) -> bool { + self.operators[0].operator_name == OperatorName::ConnectorSource + } + + pub fn is_sink(&self) -> bool { + self.operators[0].operator_name == OperatorName::ConnectorSink + } +} diff --git a/src/sql/logical_node/logical/operator_name.rs b/src/sql/logical_node/logical/operator_name.rs new file mode 100644 index 00000000..57f53f90 --- /dev/null +++ b/src/sql/logical_node/logical/operator_name.rs @@ -0,0 +1,82 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::str::FromStr; + +use serde::{Deserialize, Deserializer, Serialize, Serializer}; +use strum::{Display, EnumString, IntoStaticStr}; + +use crate::sql::common::constants::operator_feature; + +#[derive(Clone, Copy, Debug, Eq, PartialEq, EnumString, Display, IntoStaticStr)] +pub enum OperatorName { + ExpressionWatermark, + Value, + KeyBy, + Projection, + AsyncUdf, + Join, + InstantJoin, + LookupJoin, + WindowFunction, + TumblingWindowAggregate, + SlidingWindowAggregate, + SessionWindowAggregate, + UpdatingAggregate, + ConnectorSource, + ConnectorSink, +} + +impl OperatorName { + /// Registry / worker lookup key; matches [`Display`] and protobuf operator names. + #[inline] + pub fn as_registry_key(self) -> &'static str { + self.into() + } + + pub fn feature_tag(self) -> Option<&'static str> { + match self { + Self::ExpressionWatermark | Self::Value | Self::Projection => None, + Self::AsyncUdf => Some(operator_feature::ASYNC_UDF), + Self::Join => Some(operator_feature::JOIN_WITH_EXPIRATION), + Self::InstantJoin => Some(operator_feature::WINDOWED_JOIN), + Self::WindowFunction => Some(operator_feature::SQL_WINDOW_FUNCTION), + Self::LookupJoin => Some(operator_feature::LOOKUP_JOIN), + Self::TumblingWindowAggregate => Some(operator_feature::SQL_TUMBLING_WINDOW_AGGREGATE), + Self::SlidingWindowAggregate => Some(operator_feature::SQL_SLIDING_WINDOW_AGGREGATE), + Self::SessionWindowAggregate => Some(operator_feature::SQL_SESSION_WINDOW_AGGREGATE), + Self::UpdatingAggregate => Some(operator_feature::SQL_UPDATING_AGGREGATE), + Self::KeyBy => Some(operator_feature::KEY_BY_ROUTING), + Self::ConnectorSource => Some(operator_feature::CONNECTOR_SOURCE), + Self::ConnectorSink => Some(operator_feature::CONNECTOR_SINK), + } + } +} + +impl Serialize for OperatorName { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + serializer.serialize_str(&self.to_string()) + } +} + +impl<'de> Deserialize<'de> for OperatorName { + fn deserialize(deserializer: D) 
-> Result + where + D: Deserializer<'de>, + { + let s = String::deserialize(deserializer)?; + Self::from_str(&s).map_err(serde::de::Error::custom) + } +} diff --git a/src/sql/logical_node/logical/program_config.rs b/src/sql/logical_node/logical/program_config.rs new file mode 100644 index 00000000..931a5424 --- /dev/null +++ b/src/sql/logical_node/logical/program_config.rs @@ -0,0 +1,33 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use protocol::grpc::api::FsProgramConfig; + +/// Placeholder program-level config (UDF tables live elsewhere; wire maps stay empty). +#[derive(Clone, Debug, Default)] +pub struct ProgramConfig {} + +impl From for FsProgramConfig { + fn from(_: ProgramConfig) -> Self { + Self { + udf_dylibs: Default::default(), + python_udfs: Default::default(), + } + } +} + +impl From for ProgramConfig { + fn from(_: FsProgramConfig) -> Self { + Self::default() + } +} diff --git a/src/sql/logical_node/logical/python_udf_config.rs b/src/sql/logical_node/logical/python_udf_config.rs new file mode 100644 index 00000000..6e7d5c66 --- /dev/null +++ b/src/sql/logical_node/logical/python_udf_config.rs @@ -0,0 +1,23 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::Arc; + +use datafusion::arrow::datatypes::DataType; + +#[derive(Clone, Debug, Eq, PartialEq, Hash)] +pub struct PythonUdfConfig { + pub arg_types: Vec, + pub return_type: DataType, + pub name: Arc, + pub definition: Arc, +} diff --git a/src/sql/logical_node/mod.rs b/src/sql/logical_node/mod.rs new file mode 100644 index 00000000..922801f6 --- /dev/null +++ b/src/sql/logical_node/mod.rs @@ -0,0 +1,13 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +pub mod logical; diff --git a/src/sql/logical_planner/mod.rs b/src/sql/logical_planner/mod.rs new file mode 100644 index 00000000..f29cba18 --- /dev/null +++ b/src/sql/logical_planner/mod.rs @@ -0,0 +1,14 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +pub(crate) mod planner; +pub mod optimizers; diff --git a/src/sql/logical_planner/optimizers/chaining.rs b/src/sql/logical_planner/optimizers/chaining.rs new file mode 100644 index 00000000..8c1534a6 --- /dev/null +++ b/src/sql/logical_planner/optimizers/chaining.rs @@ -0,0 +1,173 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::mem; + +use petgraph::graph::{EdgeIndex, NodeIndex}; +use petgraph::prelude::*; +use petgraph::visit::NodeRef; + + +use crate::sql::logical_node::logical::{LogicalEdgeType, LogicalGraph, Optimizer}; + +pub type NodeId = NodeIndex; +pub type EdgeId = EdgeIndex; + +pub struct ChainingOptimizer {} + +fn remove_in_place(graph: &mut DiGraph, node: NodeIndex) { + let incoming = graph.edges_directed(node, Incoming).next().unwrap(); + + let parent = incoming.source().id(); + let incoming = incoming.id(); + graph.remove_edge(incoming); + + let outgoing: Vec<_> = graph + .edges_directed(node, Outgoing) + .map(|e| (e.id(), e.target().id())) + .collect(); + + for (edge, target) in outgoing { + let weight = graph.remove_edge(edge).unwrap(); + graph.add_edge(parent, target, weight); + } + + graph.remove_node(node); +} + +impl Optimizer for ChainingOptimizer { + fn optimize_once(&self, plan: &mut LogicalGraph) -> bool { + let node_indices: Vec = plan.node_indices().collect(); + + for &node_idx in &node_indices { + let cur = plan.node_weight(node_idx).unwrap(); + + // sources can't be chained + if cur.operator_chain.is_source() { + continue; + } + + let mut successors = plan.edges_directed(node_idx, Outgoing).collect::>(); + + if successors.len() != 1 { + continue; + } + + let edge = successors.remove(0); + let edge_type = edge.weight().edge_type; + + if edge_type != LogicalEdgeType::Forward { + continue; + } + + let successor_idx = edge.target(); + + let successor_node = plan.node_weight(successor_idx).unwrap(); + + // skip if parallelism doesn't match or successor is a sink + if cur.parallelism != successor_node.parallelism + || successor_node.operator_chain.is_sink() + { + continue; + } + + // skip successors with multiple predecessors + if plan.edges_directed(successor_idx, Incoming).count() > 1 { + continue; + } + + // construct the new node + let mut new_cur = cur.clone(); + + new_cur.description = format!("{} -> {}", cur.description, successor_node.description); 
+ + new_cur + .operator_chain + .operators + .extend(successor_node.operator_chain.operators.clone()); + + new_cur + .operator_chain + .edges + .push(edge.weight().schema.clone()); + + mem::swap(&mut new_cur, plan.node_weight_mut(node_idx).unwrap()); + + // remove the old successor + remove_in_place(plan, successor_idx); + return true; + } + + false + } +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use datafusion::arrow::datatypes::{DataType, Field, Schema, TimeUnit}; + + use crate::sql::common::FsSchema; + use crate::sql::logical_node::logical::{ + LogicalEdge, LogicalEdgeType, LogicalGraph, LogicalNode, Optimizer, OperatorName, + }; + + use super::ChainingOptimizer; + + fn forward_edge() -> LogicalEdge { + let s = Arc::new(Schema::new(vec![Field::new( + "_timestamp", + DataType::Timestamp(TimeUnit::Nanosecond, None), + false, + )])); + LogicalEdge::new(LogicalEdgeType::Forward, FsSchema::new_unkeyed(s, 0)) + } + + fn proj_node(id: u32, label: &str) -> LogicalNode { + LogicalNode::single( + id, + format!("op_{label}"), + OperatorName::Projection, + vec![], + label.to_string(), + 1, + ) + } + + fn source_node() -> LogicalNode { + LogicalNode::single( + 0, + "src".into(), + OperatorName::ConnectorSource, + vec![], + "source".into(), + 1, + ) + } + + /// Regression: upstream at last `NodeIndex` + remove non-last downstream swaps indices. 
+ #[test] + fn fusion_remaps_when_upstream_was_last_node_index() { + let mut g = LogicalGraph::new(); + let n0 = g.add_node(source_node()); + let n1 = g.add_node(proj_node(1, "downstream")); + let n2 = g.add_node(proj_node(2, "upstream_last_index")); + let e = forward_edge(); + g.add_edge(n0, n2, e.clone()); + g.add_edge(n2, n1, e); + + let changed = ChainingOptimizer {}.optimize_once(&mut g); + assert!(changed); + assert_eq!(g.node_count(), 2); + } +} diff --git a/src/sql/logical_planner/optimizers/datafusion_logical.rs b/src/sql/logical_planner/optimizers/datafusion_logical.rs new file mode 100644 index 00000000..fbb64845 --- /dev/null +++ b/src/sql/logical_planner/optimizers/datafusion_logical.rs @@ -0,0 +1,95 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::sync::Arc; + +use datafusion::common::Result; +use datafusion::common::config::ConfigOptions; +use datafusion::logical_expr::LogicalPlan; +use datafusion::optimizer::OptimizerContext; +use datafusion::optimizer::OptimizerRule; +use datafusion::optimizer::common_subexpr_eliminate::CommonSubexprEliminate; +use datafusion::optimizer::decorrelate_lateral_join::DecorrelateLateralJoin; +use datafusion::optimizer::decorrelate_predicate_subquery::DecorrelatePredicateSubquery; +use datafusion::optimizer::eliminate_cross_join::EliminateCrossJoin; +use datafusion::optimizer::eliminate_duplicated_expr::EliminateDuplicatedExpr; +use datafusion::optimizer::eliminate_filter::EliminateFilter; +use datafusion::optimizer::eliminate_group_by_constant::EliminateGroupByConstant; +use datafusion::optimizer::eliminate_join::EliminateJoin; +use datafusion::optimizer::eliminate_limit::EliminateLimit; +use datafusion::optimizer::eliminate_nested_union::EliminateNestedUnion; +use datafusion::optimizer::eliminate_one_union::EliminateOneUnion; +use datafusion::optimizer::eliminate_outer_join::EliminateOuterJoin; +use datafusion::optimizer::extract_equijoin_predicate::ExtractEquijoinPredicate; +use datafusion::optimizer::filter_null_join_keys::FilterNullJoinKeys; +use datafusion::optimizer::optimizer::Optimizer; +use datafusion::optimizer::propagate_empty_relation::PropagateEmptyRelation; +use datafusion::optimizer::push_down_filter::PushDownFilter; +use datafusion::optimizer::push_down_limit::PushDownLimit; +use datafusion::optimizer::replace_distinct_aggregate::ReplaceDistinctWithAggregate; +use datafusion::optimizer::scalar_subquery_to_join::ScalarSubqueryToJoin; +use datafusion::optimizer::simplify_expressions::SimplifyExpressions; +use datafusion::sql::planner::SqlToRel; +use datafusion::sql::sqlparser::ast::Statement; + +use crate::sql::schema::StreamSchemaProvider; + +/// Converts a SQL statement into an optimized DataFusion logical plan. 
+/// +/// Applies the DataFusion analyzer followed by a curated set of optimizer rules +/// suitable for streaming SQL (some rules like OptimizeProjections are excluded +/// because they can drop event-time calculation fields). +pub fn produce_optimized_plan( + statement: &Statement, + schema_provider: &StreamSchemaProvider, +) -> Result { + let sql_to_rel = SqlToRel::new(schema_provider); + let plan = sql_to_rel.sql_statement_to_plan(statement.clone())?; + + let analyzed_plan = schema_provider.analyzer.execute_and_check( + plan, + &ConfigOptions::default(), + |_plan, _rule| {}, + )?; + + let rules: Vec> = vec![ + Arc::new(EliminateNestedUnion::new()), + Arc::new(SimplifyExpressions::new()), + Arc::new(ReplaceDistinctWithAggregate::new()), + Arc::new(EliminateJoin::new()), + Arc::new(DecorrelatePredicateSubquery::new()), + Arc::new(ScalarSubqueryToJoin::new()), + Arc::new(DecorrelateLateralJoin::new()), + Arc::new(ExtractEquijoinPredicate::new()), + Arc::new(EliminateDuplicatedExpr::new()), + Arc::new(EliminateFilter::new()), + Arc::new(EliminateCrossJoin::new()), + Arc::new(EliminateLimit::new()), + Arc::new(PropagateEmptyRelation::new()), + Arc::new(EliminateOneUnion::new()), + Arc::new(FilterNullJoinKeys::default()), + Arc::new(EliminateOuterJoin::new()), + Arc::new(PushDownLimit::new()), + Arc::new(PushDownFilter::new()), + Arc::new(EliminateGroupByConstant::new()), + Arc::new(CommonSubexprEliminate::new()), + ]; + + let optimizer = Optimizer::with_rules(rules); + let optimized = optimizer.optimize( + analyzed_plan, + &OptimizerContext::default(), + |_plan, _rule| {}, + )?; + + Ok(optimized) +} diff --git a/src/sql/logical_planner/optimizers/mod.rs b/src/sql/logical_planner/optimizers/mod.rs new file mode 100644 index 00000000..0e0de6a2 --- /dev/null +++ b/src/sql/logical_planner/optimizers/mod.rs @@ -0,0 +1,20 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Logical planner optimizers: graph-level chaining ([`ChainingOptimizer`]) and +//! DataFusion SQL logical-plan rules ([`produce_optimized_plan`]). + +mod chaining; +mod datafusion_logical; + +pub use chaining::ChainingOptimizer; +pub use datafusion_logical::produce_optimized_plan; diff --git a/src/sql/logical_planner/planner.rs b/src/sql/logical_planner/planner.rs new file mode 100644 index 00000000..b0a712c7 --- /dev/null +++ b/src/sql/logical_planner/planner.rs @@ -0,0 +1,418 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::collections::HashMap; +use std::sync::Arc; +use std::thread; +use std::time::Duration; + +use datafusion::arrow::datatypes::IntervalMonthDayNanoType; +use datafusion::common::tree_node::{TreeNode, TreeNodeRecursion, TreeNodeVisitor}; +use datafusion::common::{ + DFSchema, DFSchemaRef, DataFusionError, Result, ScalarValue, Spans, plan_err, +}; +use datafusion::execution::context::SessionState; +use datafusion::execution::runtime_env::RuntimeEnvBuilder; +use datafusion::functions::datetime::date_bin; +use datafusion::logical_expr::{Expr, Extension, LogicalPlan, UserDefinedLogicalNode}; +use datafusion::physical_expr::PhysicalExpr; +use datafusion::physical_plan::ExecutionPlan; +use datafusion::physical_planner::{DefaultPhysicalPlanner, ExtensionPlanner, PhysicalPlanner}; +use datafusion_proto::protobuf::{PhysicalExprNode, PhysicalPlanNode}; +use datafusion_proto::{ + physical_plan::AsExecutionPlan, + protobuf::{AggregateMode, physical_plan_node::PhysicalPlanType}, +}; +use petgraph::graph::{DiGraph, NodeIndex}; +use prost::Message; +use tokio::runtime::Builder; +use tokio::sync::oneshot; + +use async_trait::async_trait; +use datafusion_common::TableReference; +use datafusion_proto::physical_plan::DefaultPhysicalExtensionCodec; +use datafusion_proto::physical_plan::to_proto::serialize_physical_expr; + +use crate::sql::logical_node::logical::{LogicalEdge, LogicalGraph, LogicalNode}; +use crate::sql::physical::{ + DebeziumUnrollingExec, DecodingContext, FsMemExec, FsPhysicalExtensionCodec, ToDebeziumExec, +}; +use crate::sql::extensions::debezium::{PACK_NODE_NAME, UNROLL_NODE_NAME, UnrollDebeziumPayloadNode}; +use crate::sql::extensions::key_calculation::KeyExtractionNode; +use crate::sql::extensions::{CompiledTopologyNode, StreamingOperatorBlueprint}; +use crate::sql::schema::utils::add_timestamp_field_arrow; +use crate::sql::schema::StreamSchemaProvider; +use crate::sql::common::{FsSchema, FsSchemaRef}; + +#[derive(Eq, Hash, PartialEq)] +#[derive(Debug)] 
+pub(crate) enum NamedNode { + Source(TableReference), + Watermark(TableReference), + RemoteTable(TableReference), + Sink(TableReference), +} + +pub(crate) struct PlanToGraphVisitor<'a> { + graph: DiGraph, + output_schemas: HashMap, + named_nodes: HashMap, + traversal: Vec>, + planner: Planner<'a>, +} + +impl<'a> PlanToGraphVisitor<'a> { + pub fn new(schema_provider: &'a StreamSchemaProvider, session_state: &'a SessionState) -> Self { + Self { + graph: Default::default(), + output_schemas: Default::default(), + named_nodes: Default::default(), + traversal: vec![], + planner: Planner::new(schema_provider, session_state), + } + } +} + +pub(crate) struct Planner<'a> { + schema_provider: &'a StreamSchemaProvider, + planner: DefaultPhysicalPlanner, + session_state: &'a SessionState, +} + +impl<'a> Planner<'a> { + pub(crate) fn new( + schema_provider: &'a StreamSchemaProvider, + session_state: &'a SessionState, + ) -> Self { + let planner = + DefaultPhysicalPlanner::with_extension_planners(vec![Arc::new(FsExtensionPlanner {})]); + Self { + schema_provider, + planner, + session_state, + } + } + + pub(crate) fn sync_plan(&self, plan: &LogicalPlan) -> Result> { + let fut = self.planner.create_physical_plan(plan, self.session_state); + let (tx, mut rx) = oneshot::channel(); + thread::scope(|s| { + let builder = thread::Builder::new(); + let builder = if cfg!(debug_assertions) { + builder.stack_size(10_000_000) + } else { + builder + }; + builder + .spawn_scoped(s, move || { + let rt = Builder::new_current_thread().enable_all().build().unwrap(); + rt.block_on(async { + let plan = fut.await; + tx.send(plan).unwrap(); + }); + }) + .unwrap(); + }); + + rx.try_recv().unwrap() + } + + pub(crate) fn create_physical_expr( + &self, + expr: &Expr, + input_dfschema: &DFSchema, + ) -> Result> { + self.planner + .create_physical_expr(expr, input_dfschema, self.session_state) + } + + pub(crate) fn serialize_as_physical_expr( + &self, + expr: &Expr, + schema: &DFSchema, + ) -> Result> { + 
let physical = self.create_physical_expr(expr, schema)?; + let proto = serialize_physical_expr(&physical, &DefaultPhysicalExtensionCodec {})?; + Ok(proto.encode_to_vec()) + } + + pub(crate) fn split_physical_plan( + &self, + key_indices: Vec, + aggregate: &LogicalPlan, + add_timestamp_field: bool, + ) -> Result { + let physical_plan = self.sync_plan(aggregate)?; + let codec = FsPhysicalExtensionCodec { + context: DecodingContext::Planning, + }; + let mut physical_plan_node = + PhysicalPlanNode::try_from_physical_plan(physical_plan.clone(), &codec)?; + let PhysicalPlanType::Aggregate(mut final_aggregate_proto) = physical_plan_node + .physical_plan_type + .take() + .ok_or_else(|| DataFusionError::Plan("missing physical plan type".to_string()))? + else { + return plan_err!("unexpected physical plan type"); + }; + let AggregateMode::Final = final_aggregate_proto.mode() else { + return plan_err!("unexpected physical plan type"); + }; + + let partial_aggregation_plan = *final_aggregate_proto + .input + .take() + .ok_or_else(|| DataFusionError::Plan("missing input".to_string()))?; + + let partial_aggregation_exec_plan = partial_aggregation_plan.try_into_physical_plan( + self.schema_provider, + &RuntimeEnvBuilder::new().build().unwrap(), + &codec, + )?; + + let partial_schema = partial_aggregation_exec_plan.schema(); + let final_input_table_provider = FsMemExec::new("partial".into(), partial_schema.clone()); + + final_aggregate_proto.input = Some(Box::new(PhysicalPlanNode::try_from_physical_plan( + Arc::new(final_input_table_provider), + &codec, + )?)); + + let finish_plan = PhysicalPlanNode { + physical_plan_type: Some(PhysicalPlanType::Aggregate(final_aggregate_proto)), + }; + + let (partial_schema, timestamp_index) = if add_timestamp_field { + ( + add_timestamp_field_arrow((*partial_schema).clone()), + partial_schema.fields().len(), + ) + } else { + (partial_schema.clone(), partial_schema.fields().len() - 1) + }; + + let partial_schema = 
FsSchema::new_keyed(partial_schema, timestamp_index, key_indices); + + Ok(SplitPlanOutput { + partial_aggregation_plan, + partial_schema, + finish_plan, + }) + } + + pub fn binning_function_proto( + &self, + width: Duration, + input_schema: DFSchemaRef, + ) -> Result { + let date_bin = date_bin().call(vec![ + Expr::Literal( + ScalarValue::IntervalMonthDayNano(Some(IntervalMonthDayNanoType::make_value( + 0, + 0, + width.as_nanos() as i64, + ))), + None, + ), + Expr::Column(datafusion::common::Column { + relation: None, + name: "_timestamp".into(), + spans: Spans::new(), + }), + ]); + + let binning_function = self.create_physical_expr(&date_bin, &input_schema)?; + serialize_physical_expr(&binning_function, &DefaultPhysicalExtensionCodec {}) + } +} + +struct FsExtensionPlanner {} + +#[async_trait] +impl ExtensionPlanner for FsExtensionPlanner { + async fn plan_extension( + &self, + _planner: &dyn PhysicalPlanner, + node: &dyn UserDefinedLogicalNode, + _logical_inputs: &[&LogicalPlan], + physical_inputs: &[Arc], + _session_state: &SessionState, + ) -> Result>> { + let schema = node.schema().as_ref().into(); + if let Ok::<&dyn StreamingOperatorBlueprint, _>(stream_extension) = node.try_into() { + if stream_extension.is_passthrough_boundary() { + match node.name() { + UNROLL_NODE_NAME => { + let node = node + .as_any() + .downcast_ref::() + .unwrap(); + let input = physical_inputs[0].clone(); + return Ok(Some(Arc::new(DebeziumUnrollingExec::try_new( + input, + node.pk_indices.clone(), + )?))); + } + PACK_NODE_NAME => { + let input = physical_inputs[0].clone(); + return Ok(Some(Arc::new(ToDebeziumExec::try_new(input)?))); + } + _ => return Ok(None), + } + } + }; + let name = + if let Some(key_extension) = node.as_any().downcast_ref::() { + key_extension.operator_label.clone() + } else { + None + }; + Ok(Some(Arc::new(FsMemExec::new( + name.unwrap_or("memory".to_string()), + Arc::new(schema), + )))) + } +} + +impl PlanToGraphVisitor<'_> { + fn add_index_to_traversal(&mut 
self, index: NodeIndex) { + if let Some(last) = self.traversal.last_mut() { + last.push(index); + } + } + + pub(crate) fn add_plan(&mut self, plan: LogicalPlan) -> Result<()> { + self.traversal.clear(); + plan.visit(self)?; + Ok(()) + } + + pub fn into_graph(self) -> LogicalGraph { + self.graph + } + + pub fn build_extension( + &mut self, + input_nodes: Vec, + extension: &dyn StreamingOperatorBlueprint, + ) -> Result<()> { + if let Some(node_name) = extension.operator_identity() { + if self.named_nodes.contains_key(&node_name) { + return plan_err!( + "extension {:?} has already been planned, shouldn't try again.", + node_name + ); + } + } + + let input_schemas = input_nodes + .iter() + .map(|index| { + Ok(self + .output_schemas + .get(index) + .ok_or_else(|| DataFusionError::Plan("missing input node".to_string()))? + .clone()) + }) + .collect::>>()?; + + let CompiledTopologyNode { + execution_unit, + routing_edges, + } = extension + .compile_to_graph_node(&self.planner, self.graph.node_count(), input_schemas) + .map_err(|e| e.context(format!("planning operator {extension:?}")))?; + + let node_index = self.graph.add_node(execution_unit); + self.add_index_to_traversal(node_index); + + for (source, edge) in input_nodes.into_iter().zip(routing_edges.into_iter()) { + self.graph.add_edge(source, node_index, edge); + } + + self.output_schemas + .insert(node_index, extension.yielded_schema().into()); + + if let Some(node_name) = extension.operator_identity() { + self.named_nodes.insert(node_name, node_index); + } + Ok(()) + } +} + +impl TreeNodeVisitor<'_> for PlanToGraphVisitor<'_> { + type Node = LogicalPlan; + + fn f_down(&mut self, node: &Self::Node) -> Result { + let LogicalPlan::Extension(Extension { node }) = node else { + return Ok(TreeNodeRecursion::Continue); + }; + + let stream_extension: &dyn StreamingOperatorBlueprint = node + .try_into() + .map_err(|e: DataFusionError| e.context("converting extension"))?; + if stream_extension.is_passthrough_boundary() { + 
return Ok(TreeNodeRecursion::Continue); + } + + if let Some(name) = stream_extension.operator_identity() { + if let Some(node_index) = self.named_nodes.get(&name) { + self.add_index_to_traversal(*node_index); + return Ok(TreeNodeRecursion::Jump); + } + } + + if !node.inputs().is_empty() { + self.traversal.push(vec![]); + } + + Ok(TreeNodeRecursion::Continue) + } + + fn f_up(&mut self, node: &Self::Node) -> Result { + let LogicalPlan::Extension(Extension { node }) = node else { + return Ok(TreeNodeRecursion::Continue); + }; + + let stream_extension: &dyn StreamingOperatorBlueprint = node + .try_into() + .map_err(|e: DataFusionError| e.context("planning extension"))?; + + if stream_extension.is_passthrough_boundary() { + return Ok(TreeNodeRecursion::Continue); + } + + if let Some(name) = stream_extension.operator_identity() { + if self.named_nodes.contains_key(&name) { + return Ok(TreeNodeRecursion::Continue); + } + } + + let input_nodes = if !node.inputs().is_empty() { + self.traversal.pop().unwrap_or_default() + } else { + vec![] + }; + let stream_extension: &dyn StreamingOperatorBlueprint = node + .try_into() + .map_err(|e: DataFusionError| e.context("converting extension"))?; + self.build_extension(input_nodes, stream_extension)?; + + Ok(TreeNodeRecursion::Continue) + } +} + +pub(crate) struct SplitPlanOutput { + pub(crate) partial_aggregation_plan: PhysicalPlanNode, + pub(crate) partial_schema: FsSchema, + pub(crate) finish_plan: PhysicalPlanNode, +} diff --git a/src/sql/mod.rs b/src/sql/mod.rs index ed3c2e30..dc98a4de 100644 --- a/src/sql/mod.rs +++ b/src/sql/mod.rs @@ -10,6 +10,18 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-pub mod parser; +pub mod common; +pub mod api; + +pub mod schema; +pub mod functions; +pub mod parse; +pub mod logical_node; +pub mod logical_planner; +pub mod physical; +pub mod analysis; +pub(crate) mod extensions; +pub mod types; + +pub use analysis::rewrite_plan; -pub use parser::SqlParser; diff --git a/src/sql/parse.rs b/src/sql/parse.rs new file mode 100644 index 00000000..5fd4a59f --- /dev/null +++ b/src/sql/parse.rs @@ -0,0 +1,404 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Coordinator-facing SQL parsing (`parse_sql`). +//! +//! **Data-definition / pipeline shape (this entry point)** +//! Only these table-related forms are supported: +//! - **`CREATE TABLE ... (cols [, WATERMARK FOR ...]) WITH ('connector' = '...', 'format' = '...', ...)`** +//! connector-backed **source** DDL (no `AS SELECT`; `connector` in `WITH` selects this path) +//! - **`CREATE TABLE ...`** other forms (including `CREATE TABLE ... AS SELECT` where DataFusion accepts it) +//! - **`CREATE STREAMING TABLE ... WITH (...) AS SELECT ...`** (streaming sink DDL) +//! - **`DROP TABLE`** / **`DROP TABLE IF EXISTS`** / **`DROP STREAMING TABLE`** (alias for `DROP TABLE` on the stream catalog) +//! - **`SHOW TABLES`** — list stream catalog tables (connector sources and streaming sinks) +//! - **`SHOW CREATE TABLE `** — best-effort DDL text (full `WITH` / `AS SELECT` may not be stored) +//! +//! **`INSERT` is not supported** here — use `CREATE TABLE ... AS SELECT` or +//! 
`CREATE STREAMING TABLE ... AS SELECT` to define the query shape instead. +//! +//! Other supported statements include function lifecycle (`CREATE FUNCTION WITH`, `START FUNCTION`, …). + +use std::collections::HashMap; + +use datafusion::common::{Result, plan_err}; +use datafusion::error::DataFusionError; +use datafusion::sql::sqlparser::ast::{ + ObjectType, ShowCreateObject, SqlOption, Statement as DFStatement, +}; +use datafusion::sql::sqlparser::dialect::FunctionStreamDialect; +use datafusion::sql::sqlparser::parser::Parser; + +use crate::coordinator::{ + CreateFunction, CreateTable, DropFunction, DropStreamingTableStatement, DropTableStatement, + ShowCatalogTables, ShowCreateStreamingTable, ShowCreateTable, ShowFunctions, + ShowStreamingTables, StartFunction, Statement as CoordinatorStatement, StopFunction, + StreamingTableStatement, +}; + +/// Streaming-specific SQL that the sqlparser dialect does not natively handle. +/// +/// Returns `Some(statement)` if the SQL was intercepted, `None` otherwise so +/// the caller falls through to the normal sqlparser pipeline. 
+fn try_parse_streaming_statement(sql: &str) -> Option> { + let tokens: Vec<&str> = sql.split_whitespace().collect(); + if tokens.is_empty() { + return None; + } + + // SHOW STREAMING TABLES + if tokens.len() == 3 + && tokens[0].eq_ignore_ascii_case("show") + && tokens[1].eq_ignore_ascii_case("streaming") + && tokens[2].eq_ignore_ascii_case("tables") + { + return Some(Box::new(ShowStreamingTables::new())); + } + + // SHOW CREATE STREAMING TABLE + if tokens.len() == 5 + && tokens[0].eq_ignore_ascii_case("show") + && tokens[1].eq_ignore_ascii_case("create") + && tokens[2].eq_ignore_ascii_case("streaming") + && tokens[3].eq_ignore_ascii_case("table") + { + let name = tokens[4].trim_end_matches(';').to_string(); + return Some(Box::new(ShowCreateStreamingTable::new(name))); + } + + // DROP STREAMING TABLE [IF EXISTS] + if tokens.len() >= 4 + && tokens[0].eq_ignore_ascii_case("drop") + && tokens[1].eq_ignore_ascii_case("streaming") + && tokens[2].eq_ignore_ascii_case("table") + { + let (if_exists, name_idx) = if tokens.len() >= 6 + && tokens[3].eq_ignore_ascii_case("if") + && tokens[4].eq_ignore_ascii_case("exists") + { + (true, 5) + } else { + (false, 3) + }; + + if name_idx >= tokens.len() { + return None; + } + let name = tokens[name_idx].trim_end_matches(';').to_string(); + return Some(Box::new(DropStreamingTableStatement::new(name, if_exists))); + } + + None +} + +pub fn parse_sql(query: &str) -> Result>> { + let trimmed = query.trim(); + if trimmed.is_empty() { + return plan_err!("Query is empty"); + } + + if let Some(stmt) = try_parse_streaming_statement(trimmed) { + return Ok(vec![stmt]); + } + + let dialect = FunctionStreamDialect {}; + let statements = Parser::parse_sql(&dialect, trimmed) + .map_err(|e| DataFusionError::Plan(format!("SQL parse error: {e}")))?; + + if statements.is_empty() { + return plan_err!("No SQL statements found"); + } + + statements.into_iter().map(classify_statement).collect() +} + +fn classify_statement(stmt: DFStatement) -> Result> { + 
match stmt { + DFStatement::CreateFunctionWith { options } => { + let properties = sql_options_to_map(&options); + let create_fn = CreateFunction::from_properties(properties) + .map_err(|e| DataFusionError::Plan(format!("CREATE FUNCTION: {e}")))?; + Ok(Box::new(create_fn)) + } + DFStatement::StartFunction { name } => Ok(Box::new(StartFunction::new(name.to_string()))), + DFStatement::StopFunction { name } => Ok(Box::new(StopFunction::new(name.to_string()))), + DFStatement::DropFunction { func_desc, .. } => { + let name = func_desc + .first() + .map(|d| d.name.to_string()) + .unwrap_or_default(); + Ok(Box::new(DropFunction::new(name))) + } + DFStatement::ShowFunctions { .. } => Ok(Box::new(ShowFunctions::new())), + DFStatement::ShowTables { .. } => Ok(Box::new(ShowCatalogTables::new())), + DFStatement::ShowCreate { obj_type, obj_name } => { + if obj_type != ShowCreateObject::Table { + return plan_err!( + "SHOW CREATE {obj_type} is not supported; use SHOW CREATE TABLE " + ); + } + Ok(Box::new(ShowCreateTable::new(obj_name.to_string()))) + }, + s @ DFStatement::CreateTable(_) => Ok(Box::new(CreateTable::new(s))), + s @ DFStatement::CreateStreamingTable { .. } => { + Ok(Box::new(StreamingTableStatement::new(s))) + } + stmt @ DFStatement::Drop { .. } => { + { + let DFStatement::Drop { + object_type, + names, + .. + } = &stmt + else { + unreachable!() + }; + if *object_type != ObjectType::Table { + return plan_err!("Only DROP TABLE is supported in this SQL frontend"); + } + if names.len() != 1 { + return plan_err!("DROP TABLE supports exactly one table name per statement"); + } + } + Ok(Box::new(DropTableStatement::new(stmt))) + } + DFStatement::Insert { .. } => plan_err!( + "INSERT is not supported; only CREATE TABLE and CREATE STREAMING TABLE (with AS SELECT) \ + are supported for defining table/query pipelines in this SQL frontend" + ), + other => plan_err!("Unsupported SQL statement: {other}"), + } +} + +/// Convert Vec (KeyValue pairs) into HashMap. 
+fn sql_options_to_map(options: &[SqlOption]) -> HashMap { + options + .iter() + .filter_map(|opt| match opt { + SqlOption::KeyValue { key, value } => Some(( + key.value.clone(), + value.to_string().trim_matches('\'').to_string(), + )), + _ => None, + }) + .collect() +} + +#[cfg(test)] +mod tests { + use super::*; + + fn first_stmt(sql: &str) -> Box { + let mut stmts = parse_sql(sql).unwrap(); + assert!(!stmts.is_empty()); + stmts.remove(0) + } + + fn is_type(stmt: &dyn CoordinatorStatement, prefix: &str) -> bool { + format!("{:?}", stmt).starts_with(prefix) + } + + #[test] + fn test_parse_create_function() { + let sql = + "CREATE FUNCTION WITH ('function_path'='./test.wasm', 'config_path'='./config.yml')"; + let stmt = first_stmt(sql); + assert!(is_type(stmt.as_ref(), "CreateFunction")); + } + + #[test] + fn test_parse_create_function_minimal() { + let sql = "CREATE FUNCTION WITH ('function_path'='./processor.wasm')"; + let stmt = first_stmt(sql); + assert!(is_type(stmt.as_ref(), "CreateFunction")); + } + + #[test] + fn test_parse_drop_function() { + let stmt = first_stmt("DROP FUNCTION my_task"); + assert!(is_type(stmt.as_ref(), "DropFunction")); + } + + #[test] + fn test_parse_start_function() { + let stmt = first_stmt("START FUNCTION my_task"); + assert!(is_type(stmt.as_ref(), "StartFunction")); + } + + #[test] + fn test_parse_stop_function() { + let stmt = first_stmt("STOP FUNCTION my_task"); + assert!(is_type(stmt.as_ref(), "StopFunction")); + } + + #[test] + fn test_parse_show_functions() { + let stmt = first_stmt("SHOW FUNCTIONS"); + assert!(is_type(stmt.as_ref(), "ShowFunctions")); + } + + #[test] + fn test_parse_show_tables() { + let stmt = first_stmt("SHOW TABLES"); + assert!(is_type(stmt.as_ref(), "ShowCatalogTables")); + } + + #[test] + fn test_parse_show_create_table() { + let stmt = first_stmt("SHOW CREATE TABLE my_src"); + assert!(is_type(stmt.as_ref(), "ShowCreateTable")); + } + + #[test] + fn test_parse_create_table() { + let stmt = 
first_stmt("CREATE TABLE foo (id INT, name VARCHAR)"); + assert!(is_type(stmt.as_ref(), "CreateTable")); + } + + #[test] + fn test_parse_create_table_connector_source_ddl() { + let sql = concat!( + "CREATE TABLE kafka_src (id BIGINT, ts TIMESTAMP NOT NULL, WATERMARK FOR ts) ", + "WITH ('connector' = 'kafka', 'format' = 'json', 'topic' = 'events')", + ); + let stmt = first_stmt(sql); + assert!(is_type(stmt.as_ref(), "CreateTable")); + } + + #[test] + fn test_parse_drop_table() { + let stmt = first_stmt("DROP TABLE foo"); + assert!(is_type(stmt.as_ref(), "DropTableStatement")); + } + + #[test] + fn test_parse_drop_table_if_exists() { + let stmt = first_stmt("DROP TABLE IF EXISTS foo"); + assert!(is_type(stmt.as_ref(), "DropTableStatement")); + } + + #[test] + fn test_parse_drop_streaming_table() { + let stmt = first_stmt("DROP STREAMING TABLE my_sink"); + assert!(is_type(stmt.as_ref(), "DropStreamingTableStatement")); + } + + #[test] + fn test_parse_drop_streaming_table_if_exists() { + let stmt = first_stmt("DROP STREAMING TABLE IF EXISTS my_sink"); + assert!(is_type(stmt.as_ref(), "DropStreamingTableStatement")); + } + + #[test] + fn test_parse_show_streaming_tables() { + let stmt = first_stmt("SHOW STREAMING TABLES"); + assert!(is_type(stmt.as_ref(), "ShowStreamingTables")); + } + + #[test] + fn test_parse_show_create_streaming_table() { + let stmt = first_stmt("SHOW CREATE STREAMING TABLE my_sink"); + assert!(is_type(stmt.as_ref(), "ShowCreateStreamingTable")); + } + + /// `CREATE STREAMING TABLE` is the sink DDL supported by FunctionStream (not `CREATE STREAM TABLE`). 
+ #[test] + fn test_parse_create_streaming_table() { + let sql = concat!( + "CREATE STREAMING TABLE my_sink ", + "WITH ('connector' = 'kafka') ", + "AS SELECT id FROM src", + ); + let stmt = first_stmt(sql); + assert!( + is_type(stmt.as_ref(), "StreamingTableStatement"), + "expected StreamingTableStatement, got {:?}", + stmt + ); + } + + #[test] + fn test_parse_create_streaming_table_case_insensitive() { + let sql = concat!( + "create streaming table out_q ", + "with ('connector' = 'memory') ", + "as select 1 as x", + ); + let stmt = first_stmt(sql); + assert!(is_type(stmt.as_ref(), "StreamingTableStatement")); + } + + #[test] + fn test_parse_case_insensitive() { + assert!(is_type( + first_stmt("create function with ('function_path'='./test.wasm')").as_ref(), + "CreateFunction" + )); + assert!(is_type( + first_stmt("show functions").as_ref(), + "ShowFunctions" + )); + assert!(is_type( + first_stmt("start function my_task").as_ref(), + "StartFunction" + )); + } + + #[test] + fn test_parse_multiple_statements() { + let sql = concat!( + "CREATE TABLE t1 (id INT); ", + "CREATE STREAMING TABLE sk WITH ('connector' = 'kafka') AS SELECT id FROM t1", + ); + let stmts = parse_sql(sql).unwrap(); + assert_eq!(stmts.len(), 2); + assert!(is_type(stmts[0].as_ref(), "CreateTable")); + assert!(is_type(stmts[1].as_ref(), "StreamingTableStatement")); + } + + #[test] + fn test_parse_empty() { + assert!(parse_sql("").is_err()); + assert!(parse_sql(" ").is_err()); + } + + #[test] + fn test_parse_unsupported_statement() { + let result = parse_sql("SELECT 1"); + assert!(result.is_err()); + } + + #[test] + fn test_insert_not_supported() { + let err = parse_sql("INSERT INTO sink SELECT * FROM src").unwrap_err(); + let msg = err.to_string(); + assert!( + msg.contains("INSERT") && msg.contains("not supported"), + "expected explicit INSERT rejection, got: {msg}" + ); + assert!( + msg.contains("CREATE TABLE") || msg.contains("CREATE STREAMING TABLE"), + "error should mention supported 
alternatives, got: {msg}" + ); + } + + #[test] + fn test_parse_with_extra_properties() { + let sql = r#"CREATE FUNCTION WITH ( + 'function_path'='./test.wasm', + 'config_path'='./config.yml', + 'parallelism'='4', + 'memory-limit'='256mb' + )"#; + let stmt = first_stmt(sql); + assert!(is_type(stmt.as_ref(), "CreateFunction")); + } +} diff --git a/src/sql/parser/sql_parser.rs b/src/sql/parser/sql_parser.rs deleted file mode 100644 index dc110745..00000000 --- a/src/sql/parser/sql_parser.rs +++ /dev/null @@ -1,249 +0,0 @@ -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -use pest::Parser; -use pest_derive::Parser; - -use super::ParseError; -use crate::coordinator::{ - CreateFunction, DropFunction, ShowFunctions, StartFunction, Statement, StopFunction, -}; -use std::collections::HashMap; - -#[derive(Parser)] -#[grammar = "src/sql/grammar.pest"] -struct Grammar; - -#[derive(Debug, Default)] -pub struct SqlParser; - -impl SqlParser { - pub fn parse(sql: &str) -> Result, ParseError> { - let pairs = Grammar::parse(Rule::statement, sql) - .map_err(|e| ParseError::new(format!("Parse error: {}", e)))?; - - for pair in pairs { - return match pair.as_rule() { - Rule::create_stmt => { - handle_create_stmt(pair).map(|stmt| stmt as Box) - } - Rule::drop_stmt => handle_drop_stmt(pair).map(|stmt| stmt as Box), - Rule::start_stmt => handle_start_stmt(pair).map(|stmt| stmt as Box), - Rule::stop_stmt => handle_stop_stmt(pair).map(|stmt| stmt as Box), - Rule::show_stmt => handle_show_stmt(pair).map(|stmt| stmt as Box), - _ => continue, - }; - } - - Err(ParseError::new("Unknown statement type")) - } -} - -fn handle_create_stmt( - pair: pest::iterators::Pair, -) -> Result, ParseError> { - let mut inner = pair.into_inner(); - // Note: name is read from config file, not from SQL statement - // Pass empty string here, name will be read from config file later - let properties = inner - .next() - .map(parse_properties) - .ok_or_else(|| ParseError::new("Missing WITH clause"))?; - - Ok(Box::new( - CreateFunction::from_properties(properties).map_err(ParseError::from)?, - )) -} - -fn handle_drop_stmt(pair: pest::iterators::Pair) -> Result, ParseError> { - let mut inner = pair.into_inner(); - let name = inner.next().map(extract_string).unwrap_or_default(); - Ok(Box::new(DropFunction::new(name))) -} - -fn handle_start_stmt(pair: pest::iterators::Pair) -> Result, ParseError> { - let mut inner = pair.into_inner(); - let name = inner.next().map(extract_string).unwrap_or_default(); - Ok(Box::new(StartFunction::new(name))) -} - -fn handle_stop_stmt(pair: 
pest::iterators::Pair) -> Result, ParseError> { - let mut inner = pair.into_inner(); - let name = inner.next().map(extract_string).unwrap_or_default(); - Ok(Box::new(StopFunction::new(name))) -} - -fn handle_show_stmt(_pair: pest::iterators::Pair) -> Result, ParseError> { - Ok(Box::new(ShowFunctions::new())) -} - -fn extract_string(pair: pest::iterators::Pair) -> String { - match pair.as_rule() { - Rule::string_literal => { - let s = pair.as_str(); - if (s.starts_with('\'') && s.ends_with('\'')) - || (s.starts_with('"') && s.ends_with('"')) - { - unescape_string(&s[1..s.len() - 1]) - } else { - unescape_string(s) - } - } - Rule::identifier => pair.as_str().to_string(), - _ => pair.as_str().to_string(), - } -} - -fn unescape_string(s: &str) -> String { - let mut result = String::with_capacity(s.len()); - let mut chars = s.chars().peekable(); - - while let Some(ch) = chars.next() { - if ch == '\\' { - if let Some(&next) = chars.peek() { - chars.next(); - match next { - 'n' => result.push('\n'), - 't' => result.push('\t'), - 'r' => result.push('\r'), - '\\' => result.push('\\'), - '\'' => result.push('\''), - '"' => result.push('"'), - _ => { - result.push('\\'); - result.push(next); - } - } - } else { - result.push(ch); - } - } else { - result.push(ch); - } - } - - result -} - -fn parse_properties(pair: pest::iterators::Pair) -> HashMap { - let mut properties = HashMap::new(); - - for prop in pair.into_inner() { - if prop.as_rule() == Rule::property { - let mut inner = prop.into_inner(); - if let (Some(key_pair), Some(val_pair)) = (inner.next(), inner.next()) { - let key = key_pair - .into_inner() - .next() - .map(extract_string) - .unwrap_or_default(); - let value = val_pair - .into_inner() - .next() - .map(extract_string) - .unwrap_or_default(); - properties.insert(key, value); - } - } - } - - properties -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_create_function() { - let sql = - "CREATE FUNCTION WITH ('function_path'='./test.wasm', 
'config_path'='./config.yml')"; - let _stmt = SqlParser::parse(sql).unwrap(); - } - - #[test] - fn test_create_function_minimal() { - let sql = "CREATE FUNCTION WITH ('function_path'='./processor.wasm')"; - let _stmt = SqlParser::parse(sql).unwrap(); - } - - // Note: SQL only supports Path mode, not Bytes mode - // Bytes mode is only for gRPC requests - - #[test] - fn test_drop_function() { - let sql = "DROP FUNCTION my_task"; - let _stmt = SqlParser::parse(sql).unwrap(); - } - - #[test] - fn test_start_function() { - let sql = "START FUNCTION my_task"; - let _stmt = SqlParser::parse(sql).unwrap(); - } - - #[test] - fn test_stop_function() { - let sql = "STOP FUNCTION my_task"; - let _stmt = SqlParser::parse(sql).unwrap(); - } - - #[test] - fn test_show_functions() { - let sql = "SHOW FUNCTIONS"; - let _stmt = SqlParser::parse(sql).unwrap(); - } - - #[test] - fn test_case_insensitive_keywords() { - let sql1 = "create function with ('function_path'='./test.wasm')"; - let _stmt1 = SqlParser::parse(sql1).unwrap(); - - let sql2 = "Create Function With ('Function_Path'='./test.wasm')"; - let _stmt2 = SqlParser::parse(sql2).unwrap(); - - let sql3 = "show functions"; - let _stmt3 = SqlParser::parse(sql3).unwrap(); - - let sql4 = "start function my_task"; - let _stmt4 = SqlParser::parse(sql4).unwrap(); - } - - #[test] - fn test_case_insensitive_property_keys() { - let sql1 = - "CREATE FUNCTION WITH ('function_path'='./test.wasm', 'config_path'='./config.yml')"; - let _stmt1 = SqlParser::parse(sql1).unwrap(); - - let sql2 = - "CREATE FUNCTION WITH ('Function_Path'='./test.wasm', 'Config_Path'='./config.yml')"; - let _stmt2 = SqlParser::parse(sql2).unwrap(); - - let sql3 = - "CREATE FUNCTION WITH ('FUNCTION_PATH'='./test.wasm', 'CONFIG_PATH'='./config.yml')"; - let _stmt3 = SqlParser::parse(sql3).unwrap(); - - // Note: SQL only supports Path mode (function_path, config_path) - // Bytes mode (function, config) is only for gRPC requests - } - - #[test] - fn 
test_with_extra_properties() { - let sql = r#"CREATE FUNCTION WITH ( - 'function_path'='./test.wasm', - 'config_path'='./config.yml', - 'parallelism'='4', - 'memory-limit'='256mb' - )"#; - let _stmt = SqlParser::parse(sql).unwrap(); - } -} diff --git a/src/sql/physical/cdc/encode.rs b/src/sql/physical/cdc/encode.rs new file mode 100644 index 00000000..07495a38 --- /dev/null +++ b/src/sql/physical/cdc/encode.rs @@ -0,0 +1,329 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + +use std::any::Any; +use std::collections::HashMap; +use std::pin::Pin; +use std::sync::Arc; +use std::task::{Context, Poll}; + +use datafusion::arrow::array::AsArray; +use datafusion::arrow::array::{ + Array, BooleanArray, FixedSizeBinaryArray, PrimitiveArray, RecordBatch, StringArray, + StructArray, TimestampNanosecondBuilder, +}; +use datafusion::arrow::buffer::NullBuffer; +use datafusion::arrow::compute::take; +use datafusion::arrow::datatypes::{DataType, Field, Schema, SchemaRef, UInt64Type}; +use datafusion::arrow::datatypes::TimestampNanosecondType; +use datafusion::common::{DataFusionError, Result}; +use datafusion::execution::{RecordBatchStream, SendableRecordBatchStream, TaskContext}; +use datafusion::physical_plan::{DisplayAs, ExecutionPlan, PlanProperties}; +use futures::{ready, stream::Stream, StreamExt}; + +use crate::sql::common::constants::{cdc, debezium_op_short, physical_plan_node_name}; +use crate::sql::common::{TIMESTAMP_FIELD, UPDATING_META_FIELD}; +use 
crate::sql::physical::readers::make_stream_properties; + +#[derive(Debug)] +pub struct ToDebeziumExec { + input: Arc, + schema: SchemaRef, + properties: PlanProperties, +} + +impl ToDebeziumExec { + pub fn try_new(input: Arc) -> Result { + let input_schema = input.schema(); + let timestamp_index = input_schema.index_of(TIMESTAMP_FIELD)?; + let struct_fields: Vec<_> = input_schema + .fields() + .into_iter() + .enumerate() + .filter_map(|(index, field)| { + if field.name() == UPDATING_META_FIELD || index == timestamp_index { + None + } else { + Some(field.clone()) + } + }) + .collect(); + let struct_data_type = DataType::Struct(struct_fields.into()); + let before_field = Arc::new(Field::new(cdc::BEFORE, struct_data_type.clone(), true)); + let after_field = Arc::new(Field::new(cdc::AFTER, struct_data_type, true)); + let op_field = Arc::new(Field::new(cdc::OP, DataType::Utf8, false)); + let timestamp_field = Arc::new(input_schema.field(timestamp_index).clone()); + + let output_schema = Arc::new(Schema::new(vec![ + before_field, + after_field, + op_field, + timestamp_field, + ])); + + Ok(Self { + input, + schema: output_schema.clone(), + properties: make_stream_properties(output_schema), + }) + } + + pub(crate) fn from_decoded_parts(input: Arc, schema: SchemaRef) -> Self { + Self { + properties: make_stream_properties(schema.clone()), + input, + schema, + } + } +} + +impl DisplayAs for ToDebeziumExec { + fn fmt_as( + &self, + _t: datafusion::physical_plan::DisplayFormatType, + f: &mut std::fmt::Formatter, + ) -> std::fmt::Result { + write!(f, "ToDebeziumExec") + } +} + +impl ExecutionPlan for ToDebeziumExec { + fn name(&self) -> &str { + physical_plan_node_name::TO_DEBEZIUM_EXEC + } + + fn as_any(&self) -> &dyn Any { + self as &dyn Any + } + + fn schema(&self) -> SchemaRef { + self.schema.clone() + } + + fn properties(&self) -> &PlanProperties { + &self.properties + } + + fn children(&self) -> Vec<&Arc> { + vec![&self.input] + } + + fn with_new_children( + self: Arc, + 
children: Vec>, + ) -> Result> { + if children.len() != 1 { + return Err(DataFusionError::Internal( + "ToDebeziumExec wrong number of children".to_string(), + )); + } + Ok(Arc::new(ToDebeziumExec::try_new(children[0].clone())?)) + } + + fn execute( + &self, + partition: usize, + context: Arc, + ) -> Result { + let updating_meta_index = self.input.schema().index_of(UPDATING_META_FIELD).ok(); + let timestamp_index = self.input.schema().index_of(TIMESTAMP_FIELD)?; + let struct_projection = (0..self.input.schema().fields().len()) + .filter(|index| { + updating_meta_index + .map(|is_retract_index| *index != is_retract_index) + .unwrap_or(true) + && *index != timestamp_index + }) + .collect(); + + Ok(Box::pin(ToDebeziumStream { + input: self.input.execute(partition, context)?, + schema: self.schema.clone(), + updating_meta_index, + timestamp_index, + struct_projection, + })) + } + + fn reset(&self) -> Result<()> { + self.input.reset() + } +} + +struct ToDebeziumStream { + input: SendableRecordBatchStream, + schema: SchemaRef, + updating_meta_index: Option, + timestamp_index: usize, + struct_projection: Vec, +} + +fn compact_changelog_by_id<'a>( + num_rows: usize, + is_retract: &'a BooleanArray, + id: &'a FixedSizeBinaryArray, + timestamps: &'a PrimitiveArray, +) -> ( + Vec<&'a [u8]>, + HashMap<&'a [u8], (usize, usize, bool, bool, i64)>, +) { + let mut id_map: HashMap<&[u8], (usize, usize, bool, bool, i64)> = HashMap::new(); + let mut order = vec![]; + for i in 0..num_rows { + let row_id = id.value(i); + let is_create = !is_retract.value(i); + let timestamp = timestamps.value(i); + + id_map + .entry(row_id) + .and_modify(|e| { + e.1 = i; + e.3 = is_create; + e.4 = e.4.max(timestamp); + }) + .or_insert_with(|| { + order.push(row_id); + (i, i, is_create, is_create, timestamp) + }); + } + (order, id_map) +} + +impl ToDebeziumStream { + fn as_debezium_batch(&mut self, batch: &RecordBatch) -> Result { + let value_struct = batch.project(&self.struct_projection)?; + let 
timestamps = batch + .column(self.timestamp_index) + .as_primitive::(); + + let columns: Vec> = if let Some(metadata_index) = self.updating_meta_index { + let metadata = batch + .column(metadata_index) + .as_any() + .downcast_ref::() + .ok_or_else(|| { + DataFusionError::Internal("Invalid type for updating_meta column".to_string()) + })?; + + let is_retract = metadata.column(0).as_boolean(); + let id = metadata.column(1).as_fixed_size_binary(); + + let (order, id_map) = + compact_changelog_by_id(batch.num_rows(), is_retract, id, timestamps); + + let mut before = Vec::with_capacity(id_map.len()); + let mut after = Vec::with_capacity(id_map.len()); + let mut op = Vec::with_capacity(id_map.len()); + let mut ts = TimestampNanosecondBuilder::with_capacity(id_map.len()); + + for row_id in order { + let (first_idx, last_idx, first_is_create, last_is_create, timestamp) = + id_map.get(row_id).unwrap(); + + if *first_is_create && *last_is_create { + before.push(None); + after.push(Some(*last_idx)); + op.push(debezium_op_short::CREATE); + } else if !(*first_is_create) && !(*last_is_create) { + before.push(Some(*first_idx)); + after.push(None); + op.push(debezium_op_short::DELETE); + } else if !(*first_is_create) && *last_is_create { + before.push(Some(*first_idx)); + after.push(Some(*last_idx)); + op.push(debezium_op_short::UPDATE); + } else { + continue; + } + + ts.append_value(*timestamp); + } + + let before_array = Self::create_output_array(&value_struct, &before)?; + let after_array = Self::create_output_array(&value_struct, &after)?; + let op_array = StringArray::from(op); + + vec![ + Arc::new(before_array), + Arc::new(after_array), + Arc::new(op_array), + Arc::new(ts.finish()), + ] + } else { + let after_array = StructArray::try_new( + value_struct.schema().fields().clone(), + value_struct.columns().to_vec(), + None, + )?; + + let before_array = StructArray::new_null( + value_struct.schema().fields().clone(), + value_struct.num_rows(), + ); + + vec![ + 
Arc::new(before_array), + Arc::new(after_array), + Arc::new(StringArray::from(vec![ + debezium_op_short::CREATE; + value_struct.num_rows() + ])), + batch.column(self.timestamp_index).clone(), + ] + }; + + Ok(RecordBatch::try_new(self.schema.clone(), columns)?) + } + + fn create_output_array( + value_struct: &RecordBatch, + indices: &[Option], + ) -> Result { + let mut arrays: Vec> = Vec::with_capacity(value_struct.num_columns()); + for col in value_struct.columns() { + let new_array = take( + col.as_ref(), + &indices + .iter() + .map(|&idx| idx.map(|i| i as u64)) + .collect::>(), + None, + )?; + arrays.push(new_array); + } + + Ok(StructArray::try_new( + value_struct.schema().fields().clone(), + arrays, + Some(NullBuffer::from( + indices.iter().map(|&idx| idx.is_some()).collect::>(), + )), + )?) + } +} + +impl Stream for ToDebeziumStream { + type Item = Result; + + fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context) -> Poll> { + let result = + ready!(self.input.poll_next_unpin(cx)).map(|result| self.as_debezium_batch(&result?)); + Poll::Ready(result) + } +} + +impl RecordBatchStream for ToDebeziumStream { + fn schema(&self) -> SchemaRef { + self.schema.clone() + } +} diff --git a/src/sql/physical/cdc/mod.rs b/src/sql/physical/cdc/mod.rs new file mode 100644 index 00000000..9e32e67a --- /dev/null +++ b/src/sql/physical/cdc/mod.rs @@ -0,0 +1,18 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ + +mod encode; +mod unroll; + +pub use encode::ToDebeziumExec; +pub use unroll::DebeziumUnrollingExec; diff --git a/src/sql/physical/cdc/unroll.rs b/src/sql/physical/cdc/unroll.rs new file mode 100644 index 00000000..f40beb06 --- /dev/null +++ b/src/sql/physical/cdc/unroll.rs @@ -0,0 +1,298 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + +use std::any::Any; +use std::pin::Pin; +use std::sync::Arc; +use std::task::{Context, Poll}; + +use datafusion::arrow::array::AsArray; +use datafusion::arrow::array::{ + Array, BooleanBuilder, RecordBatch, StringArray, StructArray, TimestampNanosecondArray, + TimestampNanosecondBuilder, UInt32Builder, +}; +use datafusion::arrow::compute::{concat, take}; +use datafusion::arrow::datatypes::{DataType, Field, Schema, SchemaRef, TimeUnit}; +use datafusion::common::{DataFusionError, Result, plan_err}; +use datafusion::execution::{RecordBatchStream, SendableRecordBatchStream, TaskContext}; +use datafusion::logical_expr::ColumnarValue; +use datafusion::physical_plan::{DisplayAs, ExecutionPlan, PlanProperties}; +use futures::{ready, stream::Stream, StreamExt}; + +use crate::sql::common::constants::{cdc, debezium_op_short, physical_plan_node_name}; +use crate::sql::common::TIMESTAMP_FIELD; +use crate::sql::functions::MultiHashFunction; +use crate::sql::physical::meta::{updating_meta_field, updating_meta_fields}; +use crate::sql::physical::readers::make_stream_properties; + +#[derive(Debug)] +pub struct DebeziumUnrollingExec { + 
input: Arc, + schema: SchemaRef, + properties: PlanProperties, + primary_keys: Vec, +} + +impl DebeziumUnrollingExec { + pub fn try_new(input: Arc, primary_keys: Vec) -> Result { + let input_schema = input.schema(); + let before_index = input_schema.index_of(cdc::BEFORE)?; + let after_index = input_schema.index_of(cdc::AFTER)?; + let op_index = input_schema.index_of(cdc::OP)?; + let _timestamp_index = input_schema.index_of(TIMESTAMP_FIELD)?; + let before_type = input_schema.field(before_index).data_type(); + let after_type = input_schema.field(after_index).data_type(); + if before_type != after_type { + return Err(DataFusionError::Internal( + "before and after columns must have the same type".to_string(), + )); + } + let op_type = input_schema.field(op_index).data_type(); + if *op_type != DataType::Utf8 { + return Err(DataFusionError::Internal( + "op column must be a string".to_string(), + )); + } + let DataType::Struct(fields) = before_type else { + return Err(DataFusionError::Internal( + "before and after columns must be structs".to_string(), + )); + }; + let mut fields = fields.to_vec(); + fields.push(updating_meta_field()); + fields.push(Arc::new(Field::new( + TIMESTAMP_FIELD, + DataType::Timestamp(TimeUnit::Nanosecond, None), + false, + ))); + + let schema = Arc::new(Schema::new(fields)); + Ok(Self { + input, + schema: schema.clone(), + properties: make_stream_properties(schema), + primary_keys, + }) + } + + pub(crate) fn from_decoded_parts( + input: Arc, + schema: SchemaRef, + primary_keys: Vec, + ) -> Self { + Self { + properties: make_stream_properties(schema.clone()), + input, + schema, + primary_keys, + } + } + + pub fn primary_key_indices(&self) -> &[usize] { + &self.primary_keys + } +} + +impl DisplayAs for DebeziumUnrollingExec { + fn fmt_as( + &self, + _t: datafusion::physical_plan::DisplayFormatType, + f: &mut std::fmt::Formatter, + ) -> std::fmt::Result { + write!(f, "DebeziumUnrollingExec") + } +} + +impl ExecutionPlan for DebeziumUnrollingExec { + 
fn name(&self) -> &str { + physical_plan_node_name::DEBEZIUM_UNROLLING_EXEC + } + + fn as_any(&self) -> &dyn Any { + self as &dyn Any + } + + fn schema(&self) -> SchemaRef { + self.schema.clone() + } + + fn properties(&self) -> &PlanProperties { + &self.properties + } + + fn children(&self) -> Vec<&Arc> { + vec![&self.input] + } + + fn with_new_children( + self: Arc, + children: Vec>, + ) -> Result> { + if children.len() != 1 { + return Err(DataFusionError::Internal( + "DebeziumUnrollingExec wrong number of children".to_string(), + )); + } + Ok(Arc::new(DebeziumUnrollingExec { + input: children[0].clone(), + schema: self.schema.clone(), + properties: self.properties.clone(), + primary_keys: self.primary_keys.clone(), + })) + } + + fn execute( + &self, + partition: usize, + context: Arc, + ) -> Result { + Ok(Box::pin(DebeziumUnrollingStream::try_new( + self.input.execute(partition, context)?, + self.schema.clone(), + self.primary_keys.clone(), + )?)) + } + + fn reset(&self) -> Result<()> { + self.input.reset() + } +} + +struct DebeziumUnrollingStream { + input: SendableRecordBatchStream, + schema: SchemaRef, + before_index: usize, + after_index: usize, + op_index: usize, + timestamp_index: usize, + primary_keys: Vec, +} + +impl DebeziumUnrollingStream { + fn try_new( + input: SendableRecordBatchStream, + schema: SchemaRef, + primary_keys: Vec, + ) -> Result { + if primary_keys.is_empty() { + return plan_err!("there must be at least one primary key for a Debezium source"); + } + let input_schema = input.schema(); + let before_index = input_schema.index_of(cdc::BEFORE)?; + let after_index = input_schema.index_of(cdc::AFTER)?; + let op_index = input_schema.index_of(cdc::OP)?; + let timestamp_index = input_schema.index_of(TIMESTAMP_FIELD)?; + + Ok(Self { + input, + schema, + before_index, + after_index, + op_index, + timestamp_index, + primary_keys, + }) + } + + fn unroll_batch(&self, batch: &RecordBatch) -> Result { + let before = 
batch.column(self.before_index).as_ref(); + let after = batch.column(self.after_index).as_ref(); + let op = batch + .column(self.op_index) + .as_any() + .downcast_ref::() + .ok_or_else(|| DataFusionError::Internal("op column is not a string".to_string()))?; + + let timestamp = batch + .column(self.timestamp_index) + .as_any() + .downcast_ref::() + .ok_or_else(|| { + DataFusionError::Internal("timestamp column is not a timestamp".to_string()) + })?; + + let num_rows = batch.num_rows(); + let combined_array = concat(&[before, after])?; + let mut take_indices = UInt32Builder::with_capacity(num_rows); + let mut is_retract_builder = BooleanBuilder::with_capacity(num_rows); + + let mut timestamp_builder = TimestampNanosecondBuilder::with_capacity(2 * num_rows); + for i in 0..num_rows { + let op = op.value(i); + match op { + debezium_op_short::CREATE | debezium_op_short::READ => { + take_indices.append_value((i + num_rows) as u32); + is_retract_builder.append_value(false); + timestamp_builder.append_value(timestamp.value(i)); + } + debezium_op_short::UPDATE => { + take_indices.append_value(i as u32); + is_retract_builder.append_value(true); + timestamp_builder.append_value(timestamp.value(i)); + take_indices.append_value((i + num_rows) as u32); + is_retract_builder.append_value(false); + timestamp_builder.append_value(timestamp.value(i)); + } + debezium_op_short::DELETE => { + take_indices.append_value(i as u32); + is_retract_builder.append_value(true); + timestamp_builder.append_value(timestamp.value(i)); + } + _ => { + return Err(DataFusionError::Internal(format!( + "unexpected op value: {op}" + ))); + } + } + } + let take_indices = take_indices.finish(); + let unrolled_array = take(&combined_array, &take_indices, None)?; + + let mut columns = unrolled_array.as_struct().columns().to_vec(); + + let hash = MultiHashFunction::default().invoke( + &self + .primary_keys + .iter() + .map(|i| ColumnarValue::Array(columns[*i].clone())) + .collect::>(), + )?; + + let ids = 
hash.into_array(num_rows)?; + + let meta = StructArray::try_new( + updating_meta_fields(), + vec![Arc::new(is_retract_builder.finish()), ids], + None, + )?; + columns.push(Arc::new(meta)); + columns.push(Arc::new(timestamp_builder.finish())); + Ok(RecordBatch::try_new(self.schema.clone(), columns)?) + } +} + +impl Stream for DebeziumUnrollingStream { + type Item = Result; + + fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context) -> Poll> { + let result = + ready!(self.input.poll_next_unpin(cx)).map(|result| self.unroll_batch(&result?)); + Poll::Ready(result) + } +} + +impl RecordBatchStream for DebeziumUnrollingStream { + fn schema(&self) -> SchemaRef { + self.schema.clone() + } +} diff --git a/src/sql/physical/codec.rs b/src/sql/physical/codec.rs new file mode 100644 index 00000000..c8349dc6 --- /dev/null +++ b/src/sql/physical/codec.rs @@ -0,0 +1,271 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ + +use std::sync::Arc; + +use datafusion::arrow::array::RecordBatch; +use datafusion::arrow::datatypes::Schema; +use datafusion::common::{DataFusionError, Result, UnnestOptions, not_impl_err}; +use datafusion::execution::FunctionRegistry; +use datafusion::logical_expr::ScalarUDF; +use datafusion::physical_plan::ExecutionPlan; +use datafusion::physical_plan::unnest::{ListUnnest, UnnestExec}; +use datafusion_proto::physical_plan::PhysicalExtensionCodec; +use prost::Message; +use protocol::grpc::api::{ + DebeziumDecodeNode, DebeziumEncodeNode, FsExecNode, MemExecNode, UnnestExecNode, + fs_exec_node::Node, +}; +use tokio::sync::mpsc::UnboundedReceiver; + +use crate::sql::analysis::UNNESTED_COL; +use crate::sql::common::constants::{mem_exec_join_side, window_function_udf}; +use crate::sql::physical::udfs::window; +use crate::sql::physical::cdc::{DebeziumUnrollingExec, ToDebeziumExec}; +use crate::sql::physical::readers::{ + FsMemExec, RecordBatchVecReader, RwLockRecordBatchReader, UnboundedRecordBatchReader, +}; + +#[derive(Debug)] +pub struct FsPhysicalExtensionCodec { + pub context: DecodingContext, +} + +impl Default for FsPhysicalExtensionCodec { + fn default() -> Self { + Self { + context: DecodingContext::None, + } + } +} + +#[derive(Debug)] +pub enum DecodingContext { + None, + Planning, + SingleLockedBatch(Arc>>), + UnboundedBatchStream(Arc>>>), + LockedBatchVec(Arc>>), + LockedJoinPair { + left: Arc>>, + right: Arc>>, + }, + LockedJoinStream { + left: Arc>>>, + right: Arc>>>, + }, +} + +impl PhysicalExtensionCodec for FsPhysicalExtensionCodec { + fn try_decode( + &self, + buf: &[u8], + inputs: &[Arc], + _registry: &dyn FunctionRegistry, + ) -> Result> { + let exec: FsExecNode = Message::decode(buf) + .map_err(|err| DataFusionError::Internal(format!("couldn't deserialize: {err}")))?; + + let node = exec + .node + .ok_or_else(|| DataFusionError::Internal("exec node is empty".to_string()))?; + + match node { + Node::MemExec(mem) => self.decode_mem_exec(mem), + 
Node::UnnestExec(unnest) => decode_unnest_exec(unnest, inputs), + Node::DebeziumDecode(debezium) => decode_debezium_decode(debezium, inputs), + Node::DebeziumEncode(debezium) => decode_debezium_encode(debezium, inputs), + } + } + + fn try_encode(&self, node: Arc, buf: &mut Vec) -> Result<()> { + let mut proto = None; + + if let Some(table) = node.as_any().downcast_ref::() { + proto = Some(FsExecNode { + node: Some(Node::MemExec(MemExecNode { + table_name: table.table_name.clone(), + schema: serde_json::to_string(&table.schema).unwrap(), + })), + }); + } + + if let Some(unnest) = node.as_any().downcast_ref::() { + proto = Some(FsExecNode { + node: Some(Node::UnnestExec(UnnestExecNode { + schema: serde_json::to_string(&unnest.schema()).unwrap(), + })), + }); + } + + if let Some(decode) = node.as_any().downcast_ref::() { + proto = Some(FsExecNode { + node: Some(Node::DebeziumDecode(DebeziumDecodeNode { + schema: serde_json::to_string(decode.schema().as_ref()).unwrap(), + primary_keys: decode + .primary_key_indices() + .iter() + .map(|c| *c as u64) + .collect(), + })), + }); + } + + if let Some(encode) = node.as_any().downcast_ref::() { + proto = Some(FsExecNode { + node: Some(Node::DebeziumEncode(DebeziumEncodeNode { + schema: serde_json::to_string(encode.schema().as_ref()).unwrap(), + })), + }); + } + + if let Some(node) = proto { + node.encode(buf).map_err(|err| { + DataFusionError::Internal(format!("couldn't serialize exec node {err}")) + })?; + Ok(()) + } else { + Err(DataFusionError::Internal(format!( + "cannot serialize {node:?}" + ))) + } + } + + fn try_decode_udf(&self, name: &str, _buf: &[u8]) -> Result> { + if name == window_function_udf::NAME { + return Ok(window()); + } + not_impl_err!("PhysicalExtensionCodec is not provided for scalar function {name}") + } +} + +impl FsPhysicalExtensionCodec { + fn decode_mem_exec(&self, mem_exec: MemExecNode) -> Result> { + let schema: Schema = serde_json::from_str(&mem_exec.schema).map_err(|e| { + 
DataFusionError::Internal(format!("invalid schema in exec codec: {e:?}")) + })?; + let schema = Arc::new(schema); + match &self.context { + DecodingContext::SingleLockedBatch(single_batch) => Ok(Arc::new( + RwLockRecordBatchReader::new(schema, single_batch.clone()), + )), + DecodingContext::UnboundedBatchStream(unbounded_stream) => Ok(Arc::new( + UnboundedRecordBatchReader::new(schema, unbounded_stream.clone()), + )), + DecodingContext::LockedBatchVec(locked_batches) => Ok(Arc::new( + RecordBatchVecReader::new(schema, locked_batches.clone()), + )), + DecodingContext::Planning => Ok(Arc::new(FsMemExec::new(mem_exec.table_name, schema))), + DecodingContext::None => Err(DataFusionError::Internal( + "Need an internal context to decode".into(), + )), + DecodingContext::LockedJoinPair { left, right } => { + match mem_exec.table_name.as_str() { + mem_exec_join_side::LEFT => { + Ok(Arc::new(RwLockRecordBatchReader::new(schema, left.clone()))) + } + mem_exec_join_side::RIGHT => Ok(Arc::new(RwLockRecordBatchReader::new( + schema, + right.clone(), + ))), + _ => Err(DataFusionError::Internal(format!( + "unknown table name {}", + mem_exec.table_name + ))), + } + } + DecodingContext::LockedJoinStream { left, right } => { + match mem_exec.table_name.as_str() { + mem_exec_join_side::LEFT => Ok(Arc::new(UnboundedRecordBatchReader::new( + schema, + left.clone(), + ))), + mem_exec_join_side::RIGHT => Ok(Arc::new(UnboundedRecordBatchReader::new( + schema, + right.clone(), + ))), + _ => Err(DataFusionError::Internal(format!( + "unknown table name {}", + mem_exec.table_name + ))), + } + } + } + } +} + +fn decode_unnest_exec( + unnest: UnnestExecNode, + inputs: &[Arc], +) -> Result> { + let schema: Schema = serde_json::from_str(&unnest.schema).map_err(|e| { + DataFusionError::Internal(format!("invalid schema in exec codec: {e:?}")) + })?; + + let column = schema.index_of(UNNESTED_COL).map_err(|_| { + DataFusionError::Internal(format!( + "unnest node schema does not contain {UNNESTED_COL} 
col" + )) + })?; + + Ok(Arc::new(UnnestExec::new( + inputs + .first() + .ok_or_else(|| DataFusionError::Internal("no input for unnest node".to_string()))? + .clone(), + vec![ListUnnest { + index_in_input_schema: column, + depth: 1, + }], + vec![], + Arc::new(schema), + UnnestOptions::default(), + ))) +} + +fn decode_debezium_decode( + debezium: DebeziumDecodeNode, + inputs: &[Arc], +) -> Result> { + let schema = Arc::new(serde_json::from_str::(&debezium.schema).map_err(|e| { + DataFusionError::Internal(format!("invalid schema in exec codec: {e:?}")) + })?); + let input = inputs + .first() + .ok_or_else(|| DataFusionError::Internal("no input for debezium node".to_string()))? + .clone(); + let primary_keys = debezium + .primary_keys + .into_iter() + .map(|c| c as usize) + .collect(); + Ok(Arc::new(DebeziumUnrollingExec::from_decoded_parts( + input, + schema.clone(), + primary_keys, + ))) +} + +fn decode_debezium_encode( + debezium: DebeziumEncodeNode, + inputs: &[Arc], +) -> Result> { + let schema = Arc::new(serde_json::from_str::(&debezium.schema).map_err(|e| { + DataFusionError::Internal(format!("invalid schema in exec codec: {e:?}")) + })?); + let input = inputs + .first() + .ok_or_else(|| DataFusionError::Internal("no input for debezium node".to_string()))? + .clone(); + Ok(Arc::new(ToDebeziumExec::from_decoded_parts(input, schema))) +} diff --git a/src/sql/physical/meta.rs b/src/sql/physical/meta.rs new file mode 100644 index 00000000..95dd8fd8 --- /dev/null +++ b/src/sql/physical/meta.rs @@ -0,0 +1,52 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + + +use std::sync::{Arc, OnceLock}; + +use datafusion::arrow::datatypes::{DataType, Field, Fields}; + +use crate::sql::common::constants::updating_state_field; +use crate::sql::common::UPDATING_META_FIELD; + +pub fn updating_meta_fields() -> Fields { + static FIELDS: OnceLock = OnceLock::new(); + FIELDS + .get_or_init(|| { + Fields::from(vec![ + Field::new( + updating_state_field::IS_RETRACT, + DataType::Boolean, + true, + ), + Field::new( + updating_state_field::ID, + DataType::FixedSizeBinary(16), + true, + ), + ]) + }) + .clone() +} + +pub fn updating_meta_field() -> Arc { + static FIELD: OnceLock> = OnceLock::new(); + FIELD + .get_or_init(|| { + Arc::new(Field::new( + UPDATING_META_FIELD, + DataType::Struct(updating_meta_fields()), + false, + )) + }) + .clone() +} diff --git a/src/sql/physical/mod.rs b/src/sql/physical/mod.rs new file mode 100644 index 00000000..7cbb3231 --- /dev/null +++ b/src/sql/physical/mod.rs @@ -0,0 +1,24 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ + +mod cdc; +mod codec; +mod meta; +mod readers; +mod udfs; + +pub use cdc::{DebeziumUnrollingExec, ToDebeziumExec}; +pub use codec::{DecodingContext, FsPhysicalExtensionCodec}; +pub use meta::{updating_meta_field, updating_meta_fields}; +pub use readers::FsMemExec; +pub use udfs::window; diff --git a/src/sql/physical/readers.rs b/src/sql/physical/readers.rs new file mode 100644 index 00000000..1c785464 --- /dev/null +++ b/src/sql/physical/readers.rs @@ -0,0 +1,371 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ + +use std::any::Any; +use std::mem; +use std::sync::Arc; + +use datafusion::arrow::array::RecordBatch; +use datafusion::arrow::datatypes::SchemaRef; +use datafusion::catalog::memory::MemorySourceConfig; +use datafusion::common::{DataFusionError, Result, Statistics, not_impl_err, plan_err}; +use datafusion::datasource::memory::DataSourceExec; +use datafusion::execution::{SendableRecordBatchStream, TaskContext}; +use datafusion::physical_expr::EquivalenceProperties; +use datafusion::physical_plan::execution_plan::{Boundedness, EmissionType}; +use datafusion::physical_plan::memory::MemoryStream; +use datafusion::physical_plan::stream::RecordBatchStreamAdapter; +use datafusion::physical_plan::{ + DisplayAs, ExecutionPlan, Partitioning, PlanProperties, +}; +use futures::StreamExt; +use tokio::sync::mpsc::UnboundedReceiver; +use tokio_stream::wrappers::UnboundedReceiverStream; + +use crate::sql::common::constants::physical_plan_node_name; + +pub(crate) fn make_stream_properties(schema: SchemaRef) -> PlanProperties { + PlanProperties::new( + EquivalenceProperties::new(schema), + Partitioning::UnknownPartitioning(1), + EmissionType::Incremental, + Boundedness::Unbounded { + requires_infinite_memory: false, + }, + ) +} + +#[derive(Debug)] +pub(crate) struct RwLockRecordBatchReader { + schema: SchemaRef, + locked_batch: Arc>>, + properties: PlanProperties, +} + +impl RwLockRecordBatchReader { + pub(crate) fn new( + schema: SchemaRef, + locked_batch: Arc>>, + ) -> Self { + Self { + schema: schema.clone(), + locked_batch, + properties: make_stream_properties(schema), + } + } +} + +impl DisplayAs for RwLockRecordBatchReader { + fn fmt_as( + &self, + _t: datafusion::physical_plan::DisplayFormatType, + f: &mut std::fmt::Formatter, + ) -> std::fmt::Result { + write!(f, "RW Lock RecordBatchReader") + } +} + +impl ExecutionPlan for RwLockRecordBatchReader { + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> SchemaRef { + self.schema.clone() + } + + fn 
children(&self) -> Vec<&Arc> { + vec![] + } + + fn with_new_children( + self: Arc, + _children: Vec>, + ) -> Result> { + Err(DataFusionError::Internal("not supported".into())) + } + + fn execute( + &self, + _partition: usize, + _context: Arc, + ) -> Result { + let result = self + .locked_batch + .write() + .unwrap() + .take() + .expect("should have set a record batch before calling execute()"); + Ok(Box::pin(MemoryStream::try_new( + vec![result], + self.schema.clone(), + None, + )?)) + } + + fn statistics(&self) -> Result { + Ok(Statistics::new_unknown(&self.schema)) + } + + fn reset(&self) -> Result<()> { + Ok(()) + } + + fn properties(&self) -> &PlanProperties { + &self.properties + } + + fn name(&self) -> &str { + physical_plan_node_name::RW_LOCK_READER + } +} + +#[derive(Debug)] +pub(crate) struct UnboundedRecordBatchReader { + schema: SchemaRef, + receiver: Arc>>>, + properties: PlanProperties, +} + +impl UnboundedRecordBatchReader { + pub(crate) fn new( + schema: SchemaRef, + receiver: Arc>>>, + ) -> Self { + Self { + schema: schema.clone(), + receiver, + properties: make_stream_properties(schema), + } + } +} + +impl DisplayAs for UnboundedRecordBatchReader { + fn fmt_as( + &self, + _t: datafusion::physical_plan::DisplayFormatType, + f: &mut std::fmt::Formatter, + ) -> std::fmt::Result { + write!(f, "unbounded record batch reader") + } +} + +impl ExecutionPlan for UnboundedRecordBatchReader { + fn name(&self) -> &str { + physical_plan_node_name::UNBOUNDED_READER + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> SchemaRef { + self.schema.clone() + } + + fn properties(&self) -> &PlanProperties { + &self.properties + } + + fn children(&self) -> Vec<&Arc> { + vec![] + } + + fn with_new_children( + self: Arc, + _children: Vec>, + ) -> Result> { + Err(DataFusionError::Internal("not supported".into())) + } + + fn execute( + &self, + _partition: usize, + _context: Arc, + ) -> Result { + Ok(Box::pin(RecordBatchStreamAdapter::new( + 
self.schema.clone(), + UnboundedReceiverStream::new( + self.receiver + .write() + .unwrap() + .take() + .expect("unbounded receiver should be present before calling exec"), + ) + .map(Ok), + ))) + } + + fn statistics(&self) -> Result { + Ok(Statistics::new_unknown(&self.schema)) + } + + fn reset(&self) -> Result<()> { + Ok(()) + } +} + +#[derive(Debug)] +pub(crate) struct RecordBatchVecReader { + schema: SchemaRef, + receiver: Arc>>, + properties: PlanProperties, +} + +impl RecordBatchVecReader { + pub(crate) fn new( + schema: SchemaRef, + receiver: Arc>>, + ) -> Self { + Self { + schema: schema.clone(), + receiver, + properties: make_stream_properties(schema), + } + } +} + +impl DisplayAs for RecordBatchVecReader { + fn fmt_as( + &self, + _t: datafusion::physical_plan::DisplayFormatType, + f: &mut std::fmt::Formatter, + ) -> std::fmt::Result { + write!(f, "record batch vec reader") + } +} + +impl ExecutionPlan for RecordBatchVecReader { + fn name(&self) -> &str { + physical_plan_node_name::VEC_READER + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> SchemaRef { + self.schema.clone() + } + + fn properties(&self) -> &PlanProperties { + &self.properties + } + + fn children(&self) -> Vec<&Arc> { + vec![] + } + + fn with_new_children( + self: Arc, + _children: Vec>, + ) -> Result> { + Err(DataFusionError::Internal("not supported".into())) + } + + fn execute( + &self, + partition: usize, + context: Arc, + ) -> Result { + let memory = MemorySourceConfig::try_new( + &[mem::take(self.receiver.write().unwrap().as_mut())], + self.schema.clone(), + None, + )?; + + DataSourceExec::new(Arc::new(memory)).execute(partition, context) + } + + fn statistics(&self) -> Result { + Ok(Statistics::new_unknown(&self.schema)) + } + + fn reset(&self) -> Result<()> { + Ok(()) + } +} + +#[derive(Debug, Clone)] +pub struct FsMemExec { + pub table_name: String, + pub schema: SchemaRef, + properties: PlanProperties, +} + +impl DisplayAs for FsMemExec { + fn fmt_as( + 
&self, + _t: datafusion::physical_plan::DisplayFormatType, + f: &mut std::fmt::Formatter, + ) -> std::fmt::Result { + write!(f, "EmptyPartitionStream: schema={}", self.schema) + } +} + +impl FsMemExec { + pub fn new(table_name: String, schema: SchemaRef) -> Self { + Self { + schema: schema.clone(), + table_name, + properties: make_stream_properties(schema), + } + } +} + +impl ExecutionPlan for FsMemExec { + fn name(&self) -> &str { + physical_plan_node_name::MEM_EXEC + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> SchemaRef { + self.schema.clone() + } + + fn properties(&self) -> &PlanProperties { + &self.properties + } + + fn children(&self) -> Vec<&Arc> { + vec![] + } + + fn with_new_children( + self: Arc, + _children: Vec>, + ) -> Result> { + not_impl_err!("with_new_children is not implemented for mem_exec; should not be called") + } + + fn execute( + &self, + _partition: usize, + _context: Arc, + ) -> Result { + plan_err!( + "EmptyPartitionStream cannot be executed, this is only used for physical planning before serialization" + ) + } + + fn statistics(&self) -> Result { + Ok(Statistics::new_unknown(&self.schema)) + } + + fn reset(&self) -> Result<()> { + Ok(()) + } +} diff --git a/src/sql/physical/udfs.rs b/src/sql/physical/udfs.rs new file mode 100644 index 00000000..03895fda --- /dev/null +++ b/src/sql/physical/udfs.rs @@ -0,0 +1,131 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ + +use std::any::Any; +use std::sync::Arc; + +use datafusion::arrow::array::StructArray; +use datafusion::arrow::datatypes::{DataType, Field, TimeUnit}; +use datafusion::common::{Result, ScalarValue, plan_err}; +use datafusion::logical_expr::{ + ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, TypeSignature, Volatility, +}; + +use crate::make_udf_function; +use crate::sql::common::constants::{window_function_udf, window_interval_field}; +use crate::sql::schema::utils::window_arrow_struct; + +#[derive(Debug)] +pub struct WindowFunctionUdf { + signature: Signature, +} + +impl Default for WindowFunctionUdf { + fn default() -> Self { + Self { + signature: Signature::new( + TypeSignature::Exact(vec![ + DataType::Timestamp(TimeUnit::Nanosecond, None), + DataType::Timestamp(TimeUnit::Nanosecond, None), + ]), + Volatility::Immutable, + ), + } + } +} + +impl ScalarUDFImpl for WindowFunctionUdf { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + window_function_udf::NAME + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _: &[DataType]) -> Result { + Ok(window_arrow_struct()) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + let columns = args.args; + if columns.len() != 2 { + return plan_err!( + "window function expected 2 arguments, got {}", + columns.len() + ); + } + if columns[0].data_type() != DataType::Timestamp(TimeUnit::Nanosecond, None) { + return plan_err!( + "window function expected first argument to be a timestamp, got {:?}", + columns[0].data_type() + ); + } + if columns[1].data_type() != DataType::Timestamp(TimeUnit::Nanosecond, None) { + return plan_err!( + "window function expected second argument to be a timestamp, got {:?}", + columns[1].data_type() + ); + } + let fields = vec![ + Arc::new(Field::new( + window_interval_field::START, + DataType::Timestamp(TimeUnit::Nanosecond, None), + false, + )), + Arc::new(Field::new( + window_interval_field::END, + 
DataType::Timestamp(TimeUnit::Nanosecond, None), + false, + )), + ] + .into(); + + match (&columns[0], &columns[1]) { + (ColumnarValue::Array(start), ColumnarValue::Array(end)) => { + Ok(ColumnarValue::Array(Arc::new(StructArray::new( + fields, + vec![start.clone(), end.clone()], + None, + )))) + } + (ColumnarValue::Array(start), ColumnarValue::Scalar(end)) => { + let end = end.to_array_of_size(start.len())?; + Ok(ColumnarValue::Array(Arc::new(StructArray::new( + fields, + vec![start.clone(), end], + None, + )))) + } + (ColumnarValue::Scalar(start), ColumnarValue::Array(end)) => { + let start = start.to_array_of_size(end.len())?; + Ok(ColumnarValue::Array(Arc::new(StructArray::new( + fields, + vec![start, end.clone()], + None, + )))) + } + (ColumnarValue::Scalar(start), ColumnarValue::Scalar(end)) => { + Ok(ColumnarValue::Scalar(ScalarValue::Struct( + StructArray::new(fields, vec![start.to_array()?, end.to_array()?], None).into(), + ))) + } + } + } +} + +make_udf_function!(WindowFunctionUdf, WINDOW_FUNCTION, window); diff --git a/src/sql/schema/catalog_ddl.rs b/src/sql/schema/catalog_ddl.rs new file mode 100644 index 00000000..3729c99c --- /dev/null +++ b/src/sql/schema/catalog_ddl.rs @@ -0,0 +1,253 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Best-effort SQL text for catalog introspection (`SHOW CREATE TABLE`). 
+ +use std::collections::BTreeMap; + +use datafusion::arrow::datatypes::{DataType, TimeUnit}; + +use super::schema_provider::StreamTable; +use super::table::Table as CatalogTable; +use crate::sql::logical_node::logical::LogicalProgram; + +fn data_type_sql(dt: &DataType) -> String { + match dt { + DataType::Null => "NULL".to_string(), + DataType::Boolean => "BOOLEAN".to_string(), + DataType::Int8 => "TINYINT".to_string(), + DataType::Int16 => "SMALLINT".to_string(), + DataType::Int32 => "INT".to_string(), + DataType::Int64 => "BIGINT".to_string(), + DataType::UInt8 => "TINYINT UNSIGNED".to_string(), + DataType::UInt16 => "SMALLINT UNSIGNED".to_string(), + DataType::UInt32 => "INT UNSIGNED".to_string(), + DataType::UInt64 => "BIGINT UNSIGNED".to_string(), + DataType::Float16 => "FLOAT".to_string(), + DataType::Float32 => "REAL".to_string(), + DataType::Float64 => "DOUBLE".to_string(), + DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View => "VARCHAR".to_string(), + DataType::Binary | DataType::LargeBinary => "VARBINARY".to_string(), + DataType::Date32 => "DATE".to_string(), + DataType::Date64 => "DATE".to_string(), + DataType::Timestamp(unit, tz) => match (unit, tz) { + (TimeUnit::Second, None) => "TIMESTAMP(0)".to_string(), + (TimeUnit::Millisecond, None) => "TIMESTAMP(3)".to_string(), + (TimeUnit::Microsecond, None) => "TIMESTAMP(6)".to_string(), + (TimeUnit::Nanosecond, None) => "TIMESTAMP(9)".to_string(), + (_, Some(_)) => "TIMESTAMP WITH TIME ZONE".to_string(), + }, + DataType::Decimal128(p, s) => format!("DECIMAL({p},{s})"), + DataType::Decimal256(p, s) => format!("DECIMAL({p},{s})"), + _ => dt.to_string(), + } +} + +fn format_columns(schema: &datafusion::arrow::datatypes::Schema) -> Vec { + schema + .fields() + .iter() + .map(|f| { + let null = if f.is_nullable() { + "" + } else { + " NOT NULL" + }; + format!(" {} {}{}", f.name(), data_type_sql(f.data_type()), null) + }) + .collect() +} + +fn format_with_clause(opts: &BTreeMap) -> String { + if 
opts.is_empty() { + return "WITH ('connector' = '...', 'format' = '...');\n/* Original WITH options are not persisted in the stream catalog. */\n" + .to_string(); + } + let pairs: Vec = opts + .iter() + .map(|(k, v)| { + let k_esc = k.replace('\'', "''"); + let v_esc = v.replace('\'', "''"); + format!(" '{k_esc}' = '{v_esc}'") + }) + .collect(); + format!("WITH (\n{}\n);\n", pairs.join(",\n")) +} + +/// Single-line `col:TYPE` list for result grids. +pub fn schema_columns_one_line(schema: &datafusion::arrow::datatypes::Schema) -> String { + schema + .fields() + .iter() + .map(|f| format!("{}:{}", f.name(), data_type_sql(f.data_type()))) + .collect::>() + .join(", ") +} + +fn pipeline_summary_short(program: &LogicalProgram) -> String { + let mut parts: Vec = Vec::new(); + parts.push(format!("tasks={}", program.task_count())); + parts.push(format!("hash={}", program.get_hash())); + for nw in program.graph.node_weights() { + let chain = nw + .operator_chain + .operators + .iter() + .map(|o| format!("{}", o.operator_name)) + .collect::>() + .join("->"); + parts.push(format!("n{}:{}", nw.node_id, chain)); + } + parts.join(" | ") +} + +/// Extra fields for `SHOW TABLES` result grid (pipeline summary; no full Graphviz). +pub fn stream_table_row_detail(table: &StreamTable) -> String { + match table { + StreamTable::Source { + connector, + event_time_field, + watermark_field, + with_options, + .. + } => { + format!( + "connector={}, event_time={:?}, watermark={:?}, with_options={}", + connector, + event_time_field, + watermark_field, + with_options.len() + ) + } + StreamTable::Sink { program, .. 
} => pipeline_summary_short(program), + } +} + +fn pipeline_text(program: &LogicalProgram) -> String { + let mut lines: Vec = Vec::new(); + lines.push(format!("tasks_total: {}", program.task_count())); + lines.push(format!("program_hash: {}", program.get_hash())); + for nw in program.graph.node_weights() { + let chain = nw + .operator_chain + .operators + .iter() + .map(|o| format!("{}[{}]", o.operator_name, o.operator_id)) + .collect::>() + .join(" -> "); + lines.push(format!( + "node {} (parallelism={}): {chain}", + nw.node_id, nw.parallelism + )); + } + let dot = program.dot(); + const MAX_DOT: usize = 12_000; + if dot.len() > MAX_DOT { + lines.push(format!( + "graphviz_dot_truncated:\n{}... [{} more bytes]", + &dot[..MAX_DOT], + dot.len() - MAX_DOT + )); + } else { + lines.push(format!("graphviz_dot:\n{dot}")); + } + lines.join("\n") +} + +/// Human-readable `SHOW CREATE TABLE` text (sink `AS SELECT` is not stored). +pub fn show_create_stream_table(table: &StreamTable) -> String { + match table { + StreamTable::Source { + name, + connector, + schema, + event_time_field, + watermark_field, + with_options, + } => { + let cols = format_columns(schema); + let mut ddl = format!("CREATE TABLE {name} (\n{}\n)", cols.join(",\n")); + if let Some(e) = event_time_field { + ddl.push_str(&format!("\n/* EVENT TIME COLUMN: {e} */\n")); + } + if let Some(w) = watermark_field { + ddl.push_str(&format!("/* WATERMARK: {w} */\n")); + } + let mut merged_opts = with_options.clone(); + merged_opts + .entry("connector".to_string()) + .or_insert_with(|| connector.clone()); + ddl.push_str(&format_with_clause(&merged_opts)); + ddl + } + StreamTable::Sink { name, program } => { + let schema = program + .egress_arrow_schema() + .unwrap_or_else(|| std::sync::Arc::new(datafusion::arrow::datatypes::Schema::empty())); + let cols = format_columns(&schema); + let mut ddl = format!( + "CREATE STREAMING TABLE {name}\nWITH ('connector' = '...') AS SELECT ...\n/* Sink WITH / AS SELECT text is not 
stored. Output schema:\n{}\n*/\n\n", + cols.join(",\n") + ); + ddl.push_str("-- Resolved logical pipeline:\n"); + ddl.push_str(&pipeline_text(program)); + ddl.push('\n'); + ddl + } + } +} + +/// Extra fields for `SHOW TABLES` result grid for persisted catalog rows. +pub fn catalog_table_row_detail(table: &CatalogTable) -> String { + match table { + CatalogTable::ConnectorTable(source) => format!( + "kind=connector, connector={}, event_time={:?}, watermark={:?}, with_options={}", + source.connector(), + source.event_time_field(), + source.temporal_config.watermark_strategy_column, + source.catalog_with_options().len() + ), + CatalogTable::LookupTable(source) => format!( + "kind=lookup, connector={}, event_time={:?}, watermark={:?}, with_options={}", + source.connector(), + source.event_time_field(), + source.temporal_config.watermark_strategy_column, + source.catalog_with_options().len() + ), + CatalogTable::TableFromQuery { .. } => "kind=query".to_string(), + } +} + +/// Human-readable `SHOW CREATE TABLE` text for persisted catalog rows. +pub fn show_create_catalog_table(table: &CatalogTable) -> String { + match table { + CatalogTable::ConnectorTable(source) | CatalogTable::LookupTable(source) => { + let schema = source.produce_physical_schema(); + let cols = format_columns(&schema); + let mut ddl = format!("CREATE TABLE {} (\n{}\n)", source.name(), cols.join(",\n")); + if let Some(e) = source.event_time_field() { + ddl.push_str(&format!("\n/* EVENT TIME COLUMN: {e} */\n")); + } + if let Some(w) = source.temporal_config.watermark_strategy_column.as_deref() { + ddl.push_str(&format!("/* WATERMARK: {w} */\n")); + } + let mut opts = source.catalog_with_options().clone(); + opts.entry("connector".to_string()) + .or_insert_with(|| source.connector().to_string()); + ddl.push_str(&format_with_clause(&opts)); + ddl + } + CatalogTable::TableFromQuery { name, .. 
} => { + format!("CREATE TABLE {name} AS SELECT ...;\n/* logical query text is not persisted */\n") + } + } +} diff --git a/src/sql/schema/column_descriptor.rs b/src/sql/schema/column_descriptor.rs new file mode 100644 index 00000000..533708cc --- /dev/null +++ b/src/sql/schema/column_descriptor.rs @@ -0,0 +1,146 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use datafusion::arrow::datatypes::{DataType, Field, TimeUnit}; +use datafusion::logical_expr::Expr; + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub enum ColumnDescriptor { + Physical(Field), + SystemMeta { + field: Field, + meta_key: String, + }, + Computed { + field: Field, + logic: Box, + }, +} + +impl ColumnDescriptor { + #[inline] + pub fn new_physical(field: Field) -> Self { + Self::Physical(field) + } + + #[inline] + pub fn new_system_meta(field: Field, meta_key: impl Into) -> Self { + Self::SystemMeta { + field, + meta_key: meta_key.into(), + } + } + + #[inline] + pub fn new_computed(field: Field, logic: Expr) -> Self { + Self::Computed { + field, + logic: Box::new(logic), + } + } + + #[inline] + pub fn arrow_field(&self) -> &Field { + match self { + Self::Physical(f) => f, + Self::SystemMeta { field: f, .. } => f, + Self::Computed { field: f, .. } => f, + } + } + + #[inline] + pub fn into_arrow_field(self) -> Field { + match self { + Self::Physical(f) => f, + Self::SystemMeta { field: f, .. } => f, + Self::Computed { field: f, .. 
} => f, + } + } + + #[inline] + pub fn is_computed(&self) -> bool { + matches!(self, Self::Computed { .. }) + } + + #[inline] + pub fn is_physical(&self) -> bool { + matches!(self, Self::Physical(_)) + } + + #[inline] + pub fn system_meta_key(&self) -> Option<&str> { + if let Self::SystemMeta { meta_key, .. } = self { + Some(meta_key.as_str()) + } else { + None + } + } + + #[inline] + pub fn computation_logic(&self) -> Option<&Expr> { + if let Self::Computed { logic, .. } = self { + Some(logic) + } else { + None + } + } + + #[inline] + pub fn data_type(&self) -> &DataType { + self.arrow_field().data_type() + } + + pub fn set_nullable(&mut self, nullable: bool) { + let f = match self { + Self::Physical(f) => f, + Self::SystemMeta { field, .. } => field, + Self::Computed { field, .. } => field, + }; + *f = Field::new(f.name(), f.data_type().clone(), nullable) + .with_metadata(f.metadata().clone()); + } + + pub fn force_precision(&mut self, unit: TimeUnit) { + match self { + Self::Physical(f) => { + if let DataType::Timestamp(_, tz) = f.data_type() { + *f = Field::new(f.name(), DataType::Timestamp(unit, tz.clone()), f.is_nullable()); + } + } + Self::SystemMeta { field, .. } => { + if let DataType::Timestamp(_, tz) = field.data_type() { + *field = Field::new( + field.name(), + DataType::Timestamp(unit, tz.clone()), + field.is_nullable(), + ); + } + } + Self::Computed { field, .. 
} => { + if let DataType::Timestamp(_, tz) = field.data_type() { + *field = Field::new( + field.name(), + DataType::Timestamp(unit, tz.clone()), + field.is_nullable(), + ); + } + } + } + } +} + +impl From for ColumnDescriptor { + #[inline] + fn from(field: Field) -> Self { + Self::Physical(field) + } +} diff --git a/src/sql/schema/connection_type.rs b/src/sql/schema/connection_type.rs new file mode 100644 index 00000000..06a3df92 --- /dev/null +++ b/src/sql/schema/connection_type.rs @@ -0,0 +1,31 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::fmt; + +/// Describes the role of a connection in the streaming pipeline. +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub enum ConnectionType { + Source, + Sink, + Lookup, +} + +impl fmt::Display for ConnectionType { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + ConnectionType::Source => write!(f, "source"), + ConnectionType::Sink => write!(f, "sink"), + ConnectionType::Lookup => write!(f, "lookup"), + } + } +} diff --git a/src/sql/schema/connector_config.rs b/src/sql/schema/connector_config.rs new file mode 100644 index 00000000..f47e05d9 --- /dev/null +++ b/src/sql/schema/connector_config.rs @@ -0,0 +1,82 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// +// Strongly-typed in-memory connector configuration for the SQL catalog layer. 
+// Maps 1:1 to the `ConnectorOp.oneof config` proto variants. + +use std::collections::HashMap; + +use protocol::grpc::api::{ + connector_op, GenericConnectorConfig, KafkaSinkConfig, KafkaSourceConfig, +}; + +/// Strongly-typed connector configuration stored in [`super::SourceTable`]. +/// +/// Each variant corresponds directly to a proto `ConnectorOp.oneof config` branch. +/// Adding a new connector (e.g. MySQL CDC) means adding a variant here and a proto message — +/// the Rust compiler will then guide you to every call-site that needs updating. +#[derive(Debug, Clone)] +pub enum ConnectorConfig { + KafkaSource(KafkaSourceConfig), + KafkaSink(KafkaSinkConfig), + /// Fallback for connectors not yet strongly typed (e.g. future Redis, JDBC). + Generic(HashMap), +} + +impl ConnectorConfig { + /// Convert to the proto `ConnectorOp.oneof config` representation — zero JSON involved. + pub fn to_proto_config(&self) -> connector_op::Config { + match self { + ConnectorConfig::KafkaSource(cfg) => { + connector_op::Config::KafkaSource(cfg.clone()) + } + ConnectorConfig::KafkaSink(cfg) => { + connector_op::Config::KafkaSink(cfg.clone()) + } + ConnectorConfig::Generic(props) => { + connector_op::Config::Generic(GenericConnectorConfig { + properties: props.clone(), + }) + } + } + } +} + +// Proto-generated types do not derive Eq/Hash/PartialEq since they contain f32/f64 +// in the general case. For our subset (Kafka configs) all fields are integers, strings, +// and maps — logically hashable. We impl the traits via serialized proto bytes so the +// SourceTable derive chain stays intact. 
+ +impl PartialEq for ConnectorConfig { + fn eq(&self, other: &Self) -> bool { + use prost::Message; + match (self, other) { + (ConnectorConfig::KafkaSource(a), ConnectorConfig::KafkaSource(b)) => { + a.encode_to_vec() == b.encode_to_vec() + } + (ConnectorConfig::KafkaSink(a), ConnectorConfig::KafkaSink(b)) => { + a.encode_to_vec() == b.encode_to_vec() + } + (ConnectorConfig::Generic(a), ConnectorConfig::Generic(b)) => a == b, + _ => false, + } + } +} + +impl Eq for ConnectorConfig {} + +impl std::hash::Hash for ConnectorConfig { + fn hash(&self, state: &mut H) { + use prost::Message; + std::mem::discriminant(self).hash(state); + match self { + ConnectorConfig::KafkaSource(cfg) => cfg.encode_to_vec().hash(state), + ConnectorConfig::KafkaSink(cfg) => cfg.encode_to_vec().hash(state), + ConnectorConfig::Generic(m) => { + let mut pairs: Vec<_> = m.iter().collect(); + pairs.sort_by_key(|(k, _)| (*k).clone()); + pairs.hash(state); + } + } + } +} diff --git a/src/sql/schema/data_encoding_format.rs b/src/sql/schema/data_encoding_format.rs new file mode 100644 index 00000000..29828c86 --- /dev/null +++ b/src/sql/schema/data_encoding_format.rs @@ -0,0 +1,88 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::collections::HashMap; + +use datafusion::arrow::datatypes::{DataType, Field}; +use datafusion::common::{Result, plan_err}; + +use super::column_descriptor::ColumnDescriptor; +use crate::sql::common::constants::{cdc, connection_format_value, with_opt_bool_str}; +use crate::sql::common::with_option_keys as opt; +use crate::sql::common::Format; + +/// High-level payload encoding (orthogonal to `Format` wire details in `ConnectionSchema`). +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum DataEncodingFormat { + StandardJson, + DebeziumJson, + Avro, + Parquet, + Raw, +} + +impl DataEncodingFormat { + pub fn extract_from_map(opts: &HashMap) -> Result { + let format_str = opts + .get(opt::FORMAT) + .map(|s| s.as_str()) + .unwrap_or(opt::DEFAULT_FORMAT_VALUE); + let is_debezium = opts + .get(opt::FORMAT_DEBEZIUM_FLAG) + .or_else(|| opts.get(opt::JSON_DEBEZIUM)) + .map(|s| s == with_opt_bool_str::TRUE) + .unwrap_or(false); + + match (format_str, is_debezium) { + (f, true) if f == connection_format_value::JSON => Ok(Self::DebeziumJson), + (f, _) if f == connection_format_value::DEBEZIUM_JSON => Ok(Self::DebeziumJson), + (f, false) if f == connection_format_value::JSON => Ok(Self::StandardJson), + (f, _) if f == connection_format_value::AVRO => Ok(Self::Avro), + (f, _) if f == connection_format_value::PARQUET => Ok(Self::Parquet), + _ => Ok(Self::Raw), + } + } + + pub fn from_connection_format(format: &Format) -> Self { + match format { + Format::Json(j) if j.debezium => Self::DebeziumJson, + Format::Json(_) => Self::StandardJson, + Format::Avro(_) => Self::Avro, + Format::Parquet(_) => Self::Parquet, + Format::Protobuf(_) | Format::RawString(_) | Format::RawBytes(_) => Self::Raw, + } + } + + pub fn supports_delta_updates(&self) -> bool { + matches!(self, Self::DebeziumJson) + } + + pub fn apply_envelope(self, columns: Vec) -> Result> { + if !self.supports_delta_updates() { + return Ok(columns); + } + if columns.iter().any(|c| c.is_computed()) { + 
return plan_err!("Virtual fields are not supported with CDC envelope"); + } + if columns.is_empty() { + return Ok(columns); + } + let fields: Vec = columns.into_iter().map(|c| c.into_arrow_field()).collect(); + let struct_type = DataType::Struct(fields.into()); + + Ok(vec![ + ColumnDescriptor::new_physical(Field::new(cdc::BEFORE, struct_type.clone(), true)), + ColumnDescriptor::new_physical(Field::new(cdc::AFTER, struct_type.clone(), true)), + ColumnDescriptor::new_physical(Field::new(cdc::OP, DataType::Utf8, true)), + ]) + } +} diff --git a/src/sql/schema/kafka_operator_config.rs b/src/sql/schema/kafka_operator_config.rs new file mode 100644 index 00000000..4dd70906 --- /dev/null +++ b/src/sql/schema/kafka_operator_config.rs @@ -0,0 +1,250 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// +// Builds strongly-typed proto Kafka configs from SQL DDL WITH options. + +use std::collections::HashMap; + +use datafusion::arrow::datatypes::Schema; +use datafusion::common::{Result as DFResult, plan_datafusion_err, plan_err}; + +use protocol::grpc::api::connector_op::Config as ProtoConfig; +use protocol::grpc::api::{ + BadDataPolicy, DecimalEncodingProto, FormatConfig, JsonFormatConfig, KafkaAuthConfig, + KafkaAuthNone, KafkaOffsetMode, KafkaReadMode, KafkaSinkCommitMode, KafkaSinkConfig, + KafkaSourceConfig, RawBytesFormatConfig, RawStringFormatConfig, TimestampFormatProto, +}; + +use crate::sql::common::constants::{connection_table_role, kafka_with_value}; +use crate::sql::common::connector_options::ConnectorOptions; +use crate::sql::common::formats::{ + BadData, DecimalEncoding as SqlDecimalEncoding, Format as SqlFormat, + TimestampFormat as SqlTimestampFormat, +}; +use crate::sql::common::with_option_keys as opt; +use crate::sql::schema::table_role::TableRole; + +fn sql_format_to_proto(fmt: &SqlFormat) -> DFResult { + match fmt { + SqlFormat::Json(j) => Ok(FormatConfig { + 
format: Some(protocol::grpc::api::format_config::Format::Json( + JsonFormatConfig { + timestamp_format: match j.timestamp_format { + SqlTimestampFormat::RFC3339 => TimestampFormatProto::TimestampRfc3339 as i32, + SqlTimestampFormat::UnixMillis => { + TimestampFormatProto::TimestampUnixMillis as i32 + } + }, + decimal_encoding: match j.decimal_encoding { + SqlDecimalEncoding::Number => DecimalEncodingProto::DecimalNumber as i32, + SqlDecimalEncoding::String => DecimalEncodingProto::DecimalString as i32, + SqlDecimalEncoding::Bytes => DecimalEncodingProto::DecimalBytes as i32, + }, + include_schema: j.include_schema, + confluent_schema_registry: j.confluent_schema_registry, + schema_id: j.schema_id, + debezium: j.debezium, + unstructured: j.unstructured, + }, + )), + }), + SqlFormat::RawString(_) => Ok(FormatConfig { + format: Some(protocol::grpc::api::format_config::Format::RawString( + RawStringFormatConfig {}, + )), + }), + SqlFormat::RawBytes(_) => Ok(FormatConfig { + format: Some(protocol::grpc::api::format_config::Format::RawBytes( + RawBytesFormatConfig {}, + )), + }), + other => plan_err!( + "Kafka connector: format '{}' is not supported yet", + other.name() + ), + } +} + +fn sql_bad_data_to_proto(bad: &BadData) -> i32 { + match bad { + BadData::Fail {} => BadDataPolicy::BadDataFail as i32, + BadData::Drop {} => BadDataPolicy::BadDataDrop as i32, + } +} + +/// Build Kafka proto config from a flat string map (catalog rebuild path). 
+pub fn build_kafka_proto_config_from_string_map( + map: HashMap, + _physical_schema: &Schema, +) -> DFResult { + let mut options = ConnectorOptions::from_flat_string_map(map)?; + let format = crate::sql::common::formats::Format::from_opts(&mut options) + .map_err(|e| datafusion::error::DataFusionError::Plan(format!("invalid format: {e}")))?; + let bad_data = BadData::from_opts(&mut options) + .map_err(|e| datafusion::error::DataFusionError::Plan(format!("Invalid bad_data: '{e}'")))?; + let _framing = crate::sql::common::formats::Framing::from_opts(&mut options) + .map_err(|e| datafusion::error::DataFusionError::Plan(format!("invalid framing: '{e}'")))?; + + let role = match options.pull_opt_str(opt::TYPE)?.as_deref() { + None | Some(connection_table_role::SOURCE) => TableRole::Ingestion, + Some(connection_table_role::SINK) => TableRole::Egress, + Some(connection_table_role::LOOKUP) => TableRole::Reference, + Some(other) => { + return plan_err!("invalid connection type '{other}' in WITH options"); + } + }; + + build_kafka_proto_config(&mut options, role, &format, bad_data) +} + +/// Core builder shared by SQL DDL and catalog reload paths. +pub fn build_kafka_proto_config( + options: &mut ConnectorOptions, + role: TableRole, + format: &Option, + bad_data: BadData, +) -> DFResult { + let bootstrap_servers = match options.pull_opt_str(opt::KAFKA_BOOTSTRAP_SERVERS)? { + Some(s) => s, + None => options + .pull_opt_str(opt::KAFKA_BOOTSTRAP_SERVERS_LEGACY)? + .ok_or_else(|| { + plan_datafusion_err!( + "Kafka connector requires 'bootstrap.servers' in the WITH clause" + ) + })?, + }; + + let topic = options + .pull_opt_str(opt::KAFKA_TOPIC)? + .ok_or_else(|| plan_datafusion_err!("Kafka connector requires 'topic' in the WITH clause"))?; + + let sql_format = format.clone().ok_or_else(|| { + plan_datafusion_err!( + "Kafka connector requires 'format' in the WITH clause (e.g. 
format = 'json')" + ) + })?; + let proto_format = sql_format_to_proto(&sql_format)?; + + let rate_limit = options + .pull_opt_u64(opt::KAFKA_RATE_LIMIT_MESSAGES_PER_SECOND)? + .map(|v| v.clamp(1, u32::MAX as u64) as u32) + .unwrap_or(0); + + let value_subject = options.pull_opt_str(opt::KAFKA_VALUE_SUBJECT)?; + + let auth = Some(KafkaAuthConfig { + auth: Some(protocol::grpc::api::kafka_auth_config::Auth::None( + KafkaAuthNone {}, + )), + }); + + let _ = options.pull_opt_str(opt::TYPE)?; + let _ = options.pull_opt_str(opt::CONNECTOR)?; + + match role { + TableRole::Ingestion => { + let offset_mode = match options.pull_opt_str(opt::KAFKA_SCAN_STARTUP_MODE)?.as_deref() { + Some(s) if s == kafka_with_value::SCAN_LATEST => { + KafkaOffsetMode::KafkaOffsetLatest as i32 + } + Some(s) if s == kafka_with_value::SCAN_EARLIEST => { + KafkaOffsetMode::KafkaOffsetEarliest as i32 + } + Some(s) + if s == kafka_with_value::SCAN_GROUP_OFFSETS + || s == kafka_with_value::SCAN_GROUP => + { + KafkaOffsetMode::KafkaOffsetGroup as i32 + } + None => KafkaOffsetMode::KafkaOffsetGroup as i32, + Some(other) => { + return plan_err!( + "invalid scan.startup.mode '{other}'; expected latest, earliest, or group-offsets" + ); + } + }; + + let read_mode = match options.pull_opt_str(opt::KAFKA_ISOLATION_LEVEL)?.as_deref() { + Some(s) if s == kafka_with_value::ISOLATION_READ_COMMITTED => { + KafkaReadMode::KafkaReadCommitted as i32 + } + Some(s) if s == kafka_with_value::ISOLATION_READ_UNCOMMITTED => { + KafkaReadMode::KafkaReadUncommitted as i32 + } + None => KafkaReadMode::KafkaReadDefault as i32, + Some(other) => { + return plan_err!("invalid isolation.level '{other}'"); + } + }; + + let group_id = match options.pull_opt_str(opt::KAFKA_GROUP_ID)? 
{ + Some(s) => Some(s), + None => options.pull_opt_str(opt::KAFKA_GROUP_ID_LEGACY)?, + }; + let group_id_prefix = options.pull_opt_str(opt::KAFKA_GROUP_ID_PREFIX)?; + + let client_configs = options.drain_remaining_string_values()?; + + Ok(ProtoConfig::KafkaSource(KafkaSourceConfig { + topic, + bootstrap_servers, + group_id, + group_id_prefix, + offset_mode, + read_mode, + auth, + client_configs, + format: Some(proto_format), + bad_data_policy: sql_bad_data_to_proto(&bad_data), + rate_limit_msgs_per_sec: rate_limit, + value_subject, + })) + } + TableRole::Egress => { + let commit_mode = match options.pull_opt_str(opt::KAFKA_SINK_COMMIT_MODE)?.as_deref() { + Some(s) + if s == kafka_with_value::SINK_COMMIT_EXACTLY_ONCE_HYPHEN + || s == kafka_with_value::SINK_COMMIT_EXACTLY_ONCE_UNDERSCORE => + { + KafkaSinkCommitMode::KafkaSinkExactlyOnce as i32 + } + None => KafkaSinkCommitMode::KafkaSinkAtLeastOnce as i32, + Some(s) + if s == kafka_with_value::SINK_COMMIT_AT_LEAST_ONCE_HYPHEN + || s == kafka_with_value::SINK_COMMIT_AT_LEAST_ONCE_UNDERSCORE => + { + KafkaSinkCommitMode::KafkaSinkAtLeastOnce as i32 + } + Some(other) => { + return plan_err!("invalid sink.commit.mode '{other}'"); + } + }; + let key_field = match options.pull_opt_str(opt::KAFKA_SINK_KEY_FIELD)? { + Some(s) => Some(s), + None => options.pull_opt_str(opt::KAFKA_KEY_FIELD_LEGACY)?, + }; + let timestamp_field = match options.pull_opt_str(opt::KAFKA_SINK_TIMESTAMP_FIELD)? 
{ + Some(s) => Some(s), + None => options.pull_opt_str(opt::KAFKA_TIMESTAMP_FIELD_LEGACY)?, + }; + + let client_configs = options.drain_remaining_string_values()?; + + Ok(ProtoConfig::KafkaSink(KafkaSinkConfig { + topic, + bootstrap_servers, + commit_mode, + key_field, + timestamp_field, + auth, + client_configs, + format: Some(proto_format), + value_subject, + })) + } + TableRole::Reference => { + plan_err!("Kafka connector cannot be used as a lookup table in this path") + } + } +} diff --git a/src/sql/schema/mod.rs b/src/sql/schema/mod.rs new file mode 100644 index 00000000..f3bf1946 --- /dev/null +++ b/src/sql/schema/mod.rs @@ -0,0 +1,41 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +pub mod catalog_ddl; +pub mod column_descriptor; +pub mod connection_type; +pub mod connector_config; +pub mod kafka_operator_config; +pub mod source_table; +pub mod data_encoding_format; +pub mod schema_context; +pub mod schema_provider; +pub mod table; +pub mod table_execution_unit; +pub mod table_role; +pub mod temporal_pipeline_config; +pub mod utils; + +pub use catalog_ddl::{ + catalog_table_row_detail, schema_columns_one_line, show_create_catalog_table, +}; +pub use column_descriptor::ColumnDescriptor; +pub use connection_type::ConnectionType; +pub use connector_config::ConnectorConfig; +pub use source_table::SourceTable; + +/// Back-compat alias for [`SourceTable`]. 
+pub type ConnectorTable = SourceTable; +pub use schema_provider::{ + ObjectName, StreamPlanningContext, StreamSchemaProvider, StreamTable, +}; +pub use table::Table; diff --git a/src/sql/schema/schema_context.rs b/src/sql/schema/schema_context.rs new file mode 100644 index 00000000..232fd9e7 --- /dev/null +++ b/src/sql/schema/schema_context.rs @@ -0,0 +1,37 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use datafusion::arrow::datatypes::{DataType, Schema}; +use datafusion::common::{Result, DFSchema}; +use datafusion::logical_expr::Expr; +use datafusion_expr::ExprSchemable; + +pub trait SchemaContext { + fn resolve_expression(&self, expr: &Expr, schema: &Schema) -> Result; + fn extract_datatype(&self, expr: &Expr, schema: &Schema) -> Result; +} + +/// [`SchemaContext`] backed by a [`DFSchema`] built from the physical Arrow schema. 
+pub struct DfSchemaContext; + +impl SchemaContext for DfSchemaContext { + fn resolve_expression(&self, expr: &Expr, schema: &Schema) -> Result { + let df = DFSchema::try_from(schema.clone())?; + let _ = expr.get_type(&df)?; + Ok(expr.clone()) + } + + fn extract_datatype(&self, expr: &Expr, schema: &Schema) -> Result { + let df = DFSchema::try_from(schema.clone())?; + expr.get_type(&df) + } +} diff --git a/src/sql/schema/schema_provider.rs b/src/sql/schema/schema_provider.rs new file mode 100644 index 00000000..bbe03079 --- /dev/null +++ b/src/sql/schema/schema_provider.rs @@ -0,0 +1,430 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::collections::{BTreeMap, HashMap, HashSet}; +use std::sync::Arc; + +use datafusion::arrow::datatypes::{self as datatypes, DataType, Field, Schema}; +use datafusion::common::{DataFusionError, Result}; +use datafusion::datasource::{DefaultTableSource, TableProvider, TableType}; +use datafusion::execution::{FunctionRegistry, SessionStateDefaults}; +use datafusion::logical_expr::expr_rewriter::FunctionRewrite; +use datafusion::logical_expr::planner::ExprPlanner; +use datafusion::logical_expr::{AggregateUDF, Expr, ScalarUDF, TableSource, WindowUDF}; +use datafusion::optimizer::Analyzer; +use datafusion::sql::planner::ContextProvider; +use datafusion::sql::TableReference; +use unicase::UniCase; + +use crate::sql::logical_node::logical::{DylibUdfConfig, LogicalProgram}; +use crate::sql::common::constants::{planning_placeholder_udf, window_fn}; +use crate::sql::schema::table::Table as CatalogTable; +use crate::sql::schema::utils::window_arrow_struct; +use crate::sql::types::{PlaceholderUdf, PlanningOptions}; + +pub type ObjectName = UniCase; + +#[inline] +fn object_name(s: impl Into) -> ObjectName { + UniCase::new(s.into()) +} + +#[derive(Clone, Debug)] +pub enum StreamTable { + Source { + name: String, + connector: String, + schema: Arc, + event_time_field: Option, + watermark_field: Option, + /// Persisted `WITH` options for `SHOW CREATE TABLE`. + with_options: BTreeMap, + }, + Sink { + name: String, + program: LogicalProgram, + }, +} + +impl StreamTable { + pub fn name(&self) -> &str { + match self { + Self::Source { name, .. } | Self::Sink { name, .. } => name, + } + } + + pub fn schema(&self) -> Arc { + match self { + Self::Source { schema, .. } => Arc::clone(schema), + Self::Sink { program, .. 
} => program + .egress_arrow_schema() + .unwrap_or_else(|| Arc::new(Schema::empty())), + } + } +} + +#[derive(Debug, Clone)] +pub struct LogicalBatchInput { + pub table_name: String, + pub schema: Arc, +} + +#[async_trait::async_trait] +impl TableProvider for LogicalBatchInput { + fn as_any(&self) -> &dyn std::any::Any { + self + } + + fn schema(&self) -> Arc { + Arc::clone(&self.schema) + } + + fn table_type(&self) -> TableType { + TableType::Temporary + } + + async fn scan( + &self, + _state: &dyn datafusion::catalog::Session, + _projection: Option<&Vec>, + _filters: &[Expr], + _limit: Option, + ) -> Result> { + Ok(Arc::new(crate::sql::physical::FsMemExec::new( + self.table_name.clone(), + Arc::clone(&self.schema), + ))) + } +} + +#[derive(Clone, Default)] +pub struct FunctionCatalog { + pub scalars: HashMap>, + pub aggregates: HashMap>, + pub windows: HashMap>, + pub planners: Vec>, +} + +#[derive(Clone, Default)] +pub struct TableCatalog { + pub streams: HashMap>, + pub catalogs: HashMap>, + pub source_defs: HashMap, +} + +#[derive(Clone)] +pub struct StreamPlanningContext { + pub tables: TableCatalog, + pub functions: FunctionCatalog, + pub dylib_udfs: HashMap, + pub config_options: datafusion::config::ConfigOptions, + pub planning_options: PlanningOptions, + pub analyzer: Analyzer, +} + +impl Default for StreamPlanningContext { + fn default() -> Self { + Self { + tables: TableCatalog::default(), + functions: FunctionCatalog::default(), + dylib_udfs: HashMap::new(), + config_options: datafusion::config::ConfigOptions::default(), + planning_options: PlanningOptions::default(), + analyzer: Analyzer::default(), + } + } +} + +/// Back-compat name for [`StreamPlanningContext`]. 
+pub type StreamSchemaProvider = StreamPlanningContext; + +impl StreamPlanningContext { + pub fn builder() -> StreamPlanningContextBuilder { + StreamPlanningContextBuilder::default() + } + + /// Same registration order as the historical `StreamSchemaProvider::new` (placeholders, then DataFusion defaults). + pub fn new() -> Self { + Self::builder() + .with_streaming_extensions() + .expect("streaming extensions") + .with_default_functions() + .expect("default functions") + .build() + } + + pub fn register_stream_table(&mut self, table: StreamTable) { + let key = object_name(table.name().to_string()); + self.tables.streams.insert(key, Arc::new(table)); + } + + pub fn get_stream_table(&self, name: &str) -> Option> { + self.tables.streams.get(&object_name(name.to_string())).cloned() + } + + pub fn register_catalog_table(&mut self, table: CatalogTable) { + let key = object_name(table.name().to_string()); + self.tables.catalogs.insert(key, Arc::new(table)); + } + + pub fn get_catalog_table(&self, table_name: impl AsRef) -> Option<&CatalogTable> { + self.tables + .catalogs + .get(&object_name(table_name.as_ref().to_string())) + .map(|t| t.as_ref()) + } + + pub fn get_catalog_table_mut( + &mut self, + table_name: impl AsRef, + ) -> Option<&mut CatalogTable> { + self.tables + .catalogs + .get_mut(&object_name(table_name.as_ref().to_string())) + .map(|t| Arc::make_mut(t)) + } + + pub fn add_source_table( + &mut self, + name: String, + schema: Arc, + event_time_field: Option, + watermark_field: Option, + ) { + self.register_stream_table(StreamTable::Source { + name, + connector: "stream_catalog".to_string(), + schema, + event_time_field, + watermark_field, + with_options: BTreeMap::new(), + }); + } + + pub fn add_sink_table(&mut self, name: String, program: LogicalProgram) { + self.register_stream_table(StreamTable::Sink { name, program }); + } + + pub fn insert_table(&mut self, table: StreamTable) { + self.register_stream_table(table); + } + + /// Alias for 
[`Self::register_catalog_table`]. + pub fn insert_catalog_table(&mut self, table: CatalogTable) { + self.register_catalog_table(table); + } + + pub fn get_table(&self, table_name: impl AsRef) -> Option<&StreamTable> { + self.tables + .streams + .get(&object_name(table_name.as_ref().to_string())) + .map(|a| a.as_ref()) + } + + pub fn get_table_mut(&mut self, table_name: impl AsRef) -> Option<&mut StreamTable> { + self.tables + .streams + .get_mut(&object_name(table_name.as_ref().to_string())) + .map(|a| Arc::make_mut(a)) + } + + pub fn get_async_udf_options(&self, _name: &str) -> Option { + None + } + + fn create_table_source(name: String, schema: Arc) -> Arc { + let provider = LogicalBatchInput { table_name: name, schema }; + Arc::new(DefaultTableSource::new(Arc::new(provider))) + } +} + +impl ContextProvider for StreamPlanningContext { + fn get_table_source(&self, name: TableReference) -> Result> { + let table = self + .get_stream_table(name.table()) + .ok_or_else(|| DataFusionError::Plan(format!("Table {} not found", name)))?; + + Ok(Self::create_table_source(name.to_string(), table.schema())) + } + + fn get_function_meta(&self, name: &str) -> Option> { + self.functions.scalars.get(name).cloned() + } + + fn get_aggregate_meta(&self, name: &str) -> Option> { + self.functions.aggregates.get(name).cloned() + } + + fn get_window_meta(&self, name: &str) -> Option> { + self.functions.windows.get(name).cloned() + } + + fn get_variable_type(&self, _variable_names: &[String]) -> Option { + None + } + + fn options(&self) -> &datafusion::config::ConfigOptions { + &self.config_options + } + + fn udf_names(&self) -> Vec { + self.functions.scalars.keys().cloned().collect() + } + + fn udaf_names(&self) -> Vec { + self.functions.aggregates.keys().cloned().collect() + } + + fn udwf_names(&self) -> Vec { + self.functions.windows.keys().cloned().collect() + } + + fn get_expr_planners(&self) -> &[Arc] { + &self.functions.planners + } +} + +impl FunctionRegistry for 
StreamPlanningContext { + fn udfs(&self) -> HashSet { + self.functions.scalars.keys().cloned().collect() + } + + fn udf(&self, name: &str) -> Result> { + self.functions + .scalars + .get(name) + .cloned() + .ok_or_else(|| DataFusionError::Plan(format!("No UDF with name {name}"))) + } + + fn udaf(&self, name: &str) -> Result> { + self.functions + .aggregates + .get(name) + .cloned() + .ok_or_else(|| DataFusionError::Plan(format!("No UDAF with name {name}"))) + } + + fn udwf(&self, name: &str) -> Result> { + self.functions + .windows + .get(name) + .cloned() + .ok_or_else(|| DataFusionError::Plan(format!("No UDWF with name {name}"))) + } + + fn register_function_rewrite( + &mut self, + rewrite: Arc, + ) -> Result<()> { + self.analyzer.add_function_rewrite(rewrite); + Ok(()) + } + + fn register_udf(&mut self, udf: Arc) -> Result>> { + Ok(self.functions.scalars.insert(udf.name().to_string(), udf)) + } + + fn register_udaf(&mut self, udaf: Arc) -> Result>> { + Ok(self + .functions + .aggregates + .insert(udaf.name().to_string(), udaf)) + } + + fn register_udwf(&mut self, udwf: Arc) -> Result>> { + Ok(self.functions.windows.insert(udwf.name().to_string(), udwf)) + } + + fn register_expr_planner(&mut self, expr_planner: Arc) -> Result<()> { + self.functions.planners.push(expr_planner); + Ok(()) + } + + fn expr_planners(&self) -> Vec> { + self.functions.planners.clone() + } +} + +#[derive(Default)] +pub struct StreamPlanningContextBuilder { + context: StreamPlanningContext, +} + +impl StreamPlanningContextBuilder { + pub fn new() -> Self { + Self::default() + } + + pub fn with_default_functions(mut self) -> Result { + for p in SessionStateDefaults::default_scalar_functions() { + self.context.register_udf(p)?; + } + for p in SessionStateDefaults::default_aggregate_functions() { + self.context.register_udaf(p)?; + } + for p in SessionStateDefaults::default_window_functions() { + self.context.register_udwf(p)?; + } + for p in SessionStateDefaults::default_expr_planners() { + 
self.context.register_expr_planner(p)?; + } + Ok(self) + } + + pub fn with_streaming_extensions(mut self) -> Result { + let extensions = vec![ + PlaceholderUdf::with_return( + window_fn::HOP, + vec![ + DataType::Interval(datatypes::IntervalUnit::MonthDayNano), + DataType::Interval(datatypes::IntervalUnit::MonthDayNano), + ], + window_arrow_struct(), + ), + PlaceholderUdf::with_return( + window_fn::TUMBLE, + vec![DataType::Interval(datatypes::IntervalUnit::MonthDayNano)], + window_arrow_struct(), + ), + PlaceholderUdf::with_return( + window_fn::SESSION, + vec![DataType::Interval(datatypes::IntervalUnit::MonthDayNano)], + window_arrow_struct(), + ), + PlaceholderUdf::with_return( + planning_placeholder_udf::UNNEST, + vec![DataType::List(Arc::new(Field::new( + planning_placeholder_udf::LIST_ELEMENT_FIELD, + DataType::Utf8, + true, + )))], + DataType::Utf8, + ), + PlaceholderUdf::with_return( + planning_placeholder_udf::ROW_TIME, + vec![], + DataType::Timestamp(datatypes::TimeUnit::Nanosecond, None), + ), + ]; + + for ext in extensions { + self.context.register_udf(ext)?; + } + + Ok(self) + } + + pub fn build(self) -> StreamPlanningContext { + self.context + } +} diff --git a/src/sql/schema/source_table.rs b/src/sql/schema/source_table.rs new file mode 100644 index 00000000..fe4411dd --- /dev/null +++ b/src/sql/schema/source_table.rs @@ -0,0 +1,593 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::collections::{BTreeMap, HashMap}; +use std::sync::Arc; +use std::time::Duration; + +use datafusion::arrow::datatypes::{DataType, Field, FieldRef, Schema}; +use datafusion::common::{Column, DFSchema, Result, plan_datafusion_err, plan_err}; +use datafusion::error::DataFusionError; +use datafusion::logical_expr::Expr; +use datafusion_expr::ExprSchemable; +use datafusion::sql::planner::{PlannerContext, SqlToRel}; +use datafusion::sql::sqlparser::ast; +use datafusion::sql::TableReference; +use protocol::grpc::api::ConnectorOp; +use tracing::warn; + +use super::column_descriptor::ColumnDescriptor; +use super::connector_config::ConnectorConfig; +use super::data_encoding_format::DataEncodingFormat; +use super::schema_context::SchemaContext; +use super::table_execution_unit::{EngineDescriptor, SyncMode, TableExecutionUnit}; +use super::table_role::{ + apply_adapter_specific_rules, deduce_role, serialize_backend_params, + validate_adapter_availability, TableRole, +}; +use super::temporal_pipeline_config::{resolve_temporal_logic, TemporalPipelineConfig, TemporalSpec}; +use super::StreamSchemaProvider; +use crate::multifield_partial_ord; +use crate::sql::api::ConnectionProfile; +use crate::sql::common::constants::{ + connection_table_role, connector_type, sql_field, +}; +use crate::sql::common::connector_options::ConnectorOptions; +use crate::sql::common::with_option_keys as opt; +use crate::sql::common::{ + BadData, Format, Framing, FsSchema, JsonCompression, JsonFormat, +}; +use crate::sql::schema::kafka_operator_config::build_kafka_proto_config; +use crate::sql::schema::ConnectionType; +use crate::sql::schema::table::SqlSource; +use crate::sql::types::ProcessingMode; + +/// Connector-backed catalog table (adapter / source-sink model). 
+#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct SourceTable { + pub registry_id: Option, + pub adapter_type: String, + pub table_identifier: String, + pub role: TableRole, + pub schema_specs: Vec, + /// Strongly-typed connector runtime configuration — replaces the legacy `opaque_config: String`. + pub connector_config: ConnectorConfig, + pub temporal_config: TemporalPipelineConfig, + pub key_constraints: Vec, + pub payload_format: Option, + /// Wire [`Format`] when built from SQL `WITH` (updating mode, `ConnectionSchema`). + pub connection_format: Option, + pub description: String, + pub partition_exprs: Arc>>, + pub lookup_cache_max_bytes: Option, + pub lookup_cache_ttl: Option, + pub inferred_fields: Option>, + /// Original `WITH` options for catalog persistence / `SHOW CREATE TABLE`. + pub catalog_with_options: BTreeMap, +} + +multifield_partial_ord!( + SourceTable, + registry_id, + adapter_type, + table_identifier, + role, + description, + key_constraints, + connection_format, + catalog_with_options +); + +impl SourceTable { + #[inline] + pub fn name(&self) -> &str { + self.table_identifier.as_str() + } + + pub fn new( + table_identifier: impl Into, + connector: impl Into, + connection_type: ConnectionType, + ) -> Self { + Self { + registry_id: None, + adapter_type: connector.into(), + table_identifier: table_identifier.into(), + role: connection_type.into(), + schema_specs: Vec::new(), + connector_config: ConnectorConfig::Generic(HashMap::new()), + temporal_config: TemporalPipelineConfig::default(), + key_constraints: Vec::new(), + payload_format: None, + connection_format: None, + description: String::new(), + partition_exprs: Arc::new(None), + lookup_cache_max_bytes: None, + lookup_cache_ttl: None, + inferred_fields: None, + catalog_with_options: BTreeMap::new(), + } + } + + #[inline] + pub fn connector(&self) -> &str { + self.adapter_type.as_str() + } + + #[inline] + pub fn connection_type(&self) -> ConnectionType { + self.role.into() + } + + pub 
fn event_time_field(&self) -> Option<&str> { + self.temporal_config.event_column.as_deref() + } + + pub fn watermark_field(&self) -> Option<&str> { + self.temporal_config.watermark_strategy_column.as_deref() + } + + /// Watermark column name safe to persist for [`StreamTable::Source`]. Omits the computed + /// [`sql_field::COMPUTED_WATERMARK`] column: stream catalog only stores Arrow physical fields, + /// so `__watermark` cannot be resolved when the table is planned from the catalog. + pub fn stream_catalog_watermark_field(&self) -> Option { + self.temporal_config + .watermark_strategy_column + .as_deref() + .filter(|w| *w != sql_field::COMPUTED_WATERMARK) + .map(str::to_string) + } + + #[inline] + pub fn catalog_with_options(&self) -> &BTreeMap { + &self.catalog_with_options + } + + pub fn idle_time(&self) -> Option { + self.temporal_config.liveness_timeout + } + + pub fn initialize_from_params( + id: &str, + adapter: &str, + raw_columns: Vec, + pk_list: Vec, + time_meta: Option, + options: &mut HashMap, + _schema_ctx: &dyn SchemaContext, + ) -> Result { + validate_adapter_availability(adapter)?; + + let catalog_with_options: BTreeMap = options + .iter() + .map(|(k, v)| (k.clone(), v.clone())) + .collect(); + + let encoding = DataEncodingFormat::extract_from_map(options)?; + + let mut refined_columns = apply_adapter_specific_rules(adapter, raw_columns); + refined_columns = encoding.apply_envelope(refined_columns)?; + + let temporal_settings = resolve_temporal_logic(&refined_columns, time_meta)?; + let _finalized_config = serialize_backend_params(adapter, options)?; + let role = deduce_role(options)?; + + if role == TableRole::Ingestion && encoding.supports_delta_updates() && pk_list.is_empty() { + return plan_err!("CDC source requires at least one primary key"); + } + + Ok(Self { + registry_id: None, + adapter_type: adapter.to_string(), + table_identifier: id.to_string(), + role, + schema_specs: refined_columns, + connector_config: 
ConnectorConfig::Generic(catalog_with_options.clone().into_iter().collect()), + temporal_config: temporal_settings, + key_constraints: pk_list, + payload_format: Some(encoding), + connection_format: None, + description: String::new(), + partition_exprs: Arc::new(None), + lookup_cache_max_bytes: None, + lookup_cache_ttl: None, + inferred_fields: None, + catalog_with_options, + }) + } + + pub fn produce_physical_schema(&self) -> Schema { + Schema::new( + self.schema_specs + .iter() + .filter(|c| !c.is_computed()) + .map(|c| c.arrow_field().clone()) + .collect::>(), + ) + } + + #[inline] + pub fn physical_schema(&self) -> Schema { + self.produce_physical_schema() + } + + pub fn convert_to_execution_unit(&self) -> Result { + if self.role == TableRole::Egress { + return plan_err!("Target [{}] is write-only", self.table_identifier); + } + + if self.is_cdc_enabled() && self.schema_specs.iter().any(|c| c.is_computed()) { + return plan_err!("CDC cannot be mixed with computed columns natively"); + } + + let mode = if self.is_cdc_enabled() { + SyncMode::Incremental + } else { + SyncMode::AppendOnly + }; + + Ok(TableExecutionUnit { + label: self.table_identifier.clone(), + engine_meta: EngineDescriptor { + engine_type: self.adapter_type.clone(), + raw_payload: String::new(), + }, + sync_mode: mode, + temporal_offset: self.temporal_config.clone(), + }) + } + + #[inline] + pub fn to_execution_unit(&self) -> Result { + self.convert_to_execution_unit() + } + + fn is_cdc_enabled(&self) -> bool { + self.payload_format + .as_ref() + .is_some_and(|f| f.supports_delta_updates()) + } + + #[allow(clippy::too_many_arguments)] + pub fn from_options( + table_identifier: &str, + connector_name: &str, + temporary: bool, + fields: Vec, + primary_keys: Vec, + watermark: Option<(String, Option)>, + options: &mut ConnectorOptions, + connection_profile: Option<&ConnectionProfile>, + schema_provider: &StreamSchemaProvider, + connection_type_override: Option, + description: String, + ) -> Result { + 
let _ = connection_profile; + + let catalog_with_options = options.snapshot_for_catalog(); + + if let Some(c) = options.pull_opt_str(opt::CONNECTOR)? { + if c != connector_name { + return plan_err!( + "WITH option `connector` is '{c}' but table uses connector '{connector_name}'" + ); + } + } + + validate_adapter_availability(connector_name)?; + + let mut columns = fields; + columns = apply_adapter_specific_rules(connector_name, columns); + + let format = Format::from_opts(options) + .map_err(|e| DataFusionError::Plan(format!("invalid format: '{e}'")))?; + + if let Some(Format::Json(JsonFormat { compression, .. })) = &format + && !matches!(compression, JsonCompression::Uncompressed) + && connector_name != connector_type::FILESYSTEM + { + return plan_err!("'json.compression' is only supported for the filesystem connector"); + } + + let _framing = Framing::from_opts(options) + .map_err(|e| DataFusionError::Plan(format!("invalid framing: '{e}'")))?; + + if temporary + && let Some(t) = options.insert_str(opt::TYPE, connection_table_role::LOOKUP)? 
+ && t != connection_table_role::LOOKUP + { + return plan_err!( + "Cannot have a temporary table with type '{t}'; temporary tables must be type 'lookup'" + ); + } + + let payload_format = format.as_ref().map(DataEncodingFormat::from_connection_format); + let encoding = payload_format.unwrap_or(DataEncodingFormat::Raw); + columns = encoding.apply_envelope(columns)?; + + let bad_data = BadData::from_opts(options) + .map_err(|e| DataFusionError::Plan(format!("Invalid bad_data: '{e}'")))?; + + let role = if let Some(t) = connection_type_override { + t.into() + } else { + match options.pull_opt_str(opt::TYPE)?.as_deref() { + None | Some(connection_table_role::SOURCE) => TableRole::Ingestion, + Some(connection_table_role::SINK) => TableRole::Egress, + Some(connection_table_role::LOOKUP) => TableRole::Reference, + Some(other) => { + return plan_err!("invalid connection type '{other}' in WITH options"); + } + } + }; + + let mut table = SourceTable { + registry_id: None, + adapter_type: connector_name.to_string(), + table_identifier: table_identifier.to_string(), + role, + schema_specs: columns, + connector_config: ConnectorConfig::Generic(HashMap::new()), + temporal_config: TemporalPipelineConfig::default(), + key_constraints: Vec::new(), + payload_format, + connection_format: format.clone(), + description, + partition_exprs: Arc::new(None), + lookup_cache_max_bytes: None, + lookup_cache_ttl: None, + inferred_fields: None, + catalog_with_options, + }; + + if let Some(event_time_field) = options.pull_opt_field(opt::EVENT_TIME_FIELD)? { + warn!("`event_time_field` WITH option is deprecated; use WATERMARK FOR syntax"); + table.temporal_config.event_column = Some(event_time_field); + } + + if let Some(watermark_field) = options.pull_opt_field(opt::WATERMARK_FIELD)? 
{ +            warn!("`watermark_field` WITH option is deprecated; use WATERMARK FOR syntax"); +            table.temporal_config.watermark_strategy_column = Some(watermark_field); +        } + +        if let Some((time_field, watermark_expr)) = watermark { +            let field = table +                .schema_specs +                .iter() +                .find(|c| c.arrow_field().name().as_str() == time_field.as_str()) +                .ok_or_else(|| { +                    plan_datafusion_err!( +                        "WATERMARK FOR field `{}` does not exist in table", +                        time_field +                    ) +                })?; + +            if !matches!(field.arrow_field().data_type(), DataType::Timestamp(_, None)) { +                return plan_err!( +                    "WATERMARK FOR field `{time_field}` has type {}, but expected TIMESTAMP", +                    field.arrow_field().data_type() +                ); +            } + +            // The time column referenced by the watermark is semantically non-null, so force it to NOT NULL +            // here; otherwise omitting NOT NULL in CREATE TABLE would fail later expression nullability checks. +            for col in table.schema_specs.iter_mut() { +                if col.arrow_field().name().as_str() == time_field.as_str() { +                    col.set_nullable(false); +                    break; +                } +            } + +            let table_ref = TableReference::bare(table.table_identifier.as_str()); +            let df_schema = +                DFSchema::try_from_qualified_schema(table_ref, &table.produce_physical_schema())?; + +            table.temporal_config.event_column = Some(time_field.clone()); + +            if let Some(expr) = watermark_expr { +                let logical_expr = plan_generating_expr(&expr, &df_schema, schema_provider) +                    .map_err(|e| { +                        DataFusionError::Plan(format!("could not plan watermark expression: {e}")) +                    })?; + +                let (data_type, _nullable) = logical_expr.data_type_and_nullable(&df_schema)?; +                if !matches!(data_type, DataType::Timestamp(_, _)) { +                    return plan_err!( +                        "the type of the WATERMARK FOR expression must be TIMESTAMP, but was {data_type}" +                    ); +                } + +                table.schema_specs.push(ColumnDescriptor::new_computed( +                    Field::new( +                        sql_field::COMPUTED_WATERMARK, +                        logical_expr.get_type(&df_schema)?, +                        false, +                    ), +                    logical_expr, +                )); +                table.temporal_config.watermark_strategy_column = +                    Some(sql_field::COMPUTED_WATERMARK.to_string()); +            } else { +                table.temporal_config.watermark_strategy_column = 
Some(time_field); + } + } + + let idle_from_micros = options + .pull_opt_i64(opt::IDLE_MICROS)? + .filter(|t| *t > 0) + .map(|t| Duration::from_micros(t as u64)); + let idle_from_duration = options.pull_opt_duration(opt::IDLE_TIME)?; + table.temporal_config.liveness_timeout = idle_from_micros.or(idle_from_duration); + + table.lookup_cache_max_bytes = options.pull_opt_u64(opt::LOOKUP_CACHE_MAX_BYTES)?; + + table.lookup_cache_ttl = options.pull_opt_duration(opt::LOOKUP_CACHE_TTL)?; + + if connector_name.eq_ignore_ascii_case(connector_type::KAFKA) { + let proto_cfg = build_kafka_proto_config(options, role, &format, bad_data)?; + table.connector_config = match proto_cfg { + protocol::grpc::api::connector_op::Config::KafkaSource(cfg) => { + ConnectorConfig::KafkaSource(cfg) + } + protocol::grpc::api::connector_op::Config::KafkaSink(cfg) => { + ConnectorConfig::KafkaSink(cfg) + } + protocol::grpc::api::connector_op::Config::Generic(g) => { + ConnectorConfig::Generic(g.properties) + } + }; + } else { + let extra_opts = options.drain_remaining_string_values()?; + table.connector_config = ConnectorConfig::Generic(extra_opts); + } + + if role == TableRole::Ingestion && encoding.supports_delta_updates() && primary_keys.is_empty() + { + return plan_err!("Debezium source must have at least one PRIMARY KEY field"); + } + + table.key_constraints = primary_keys; + + Ok(table) + } + + pub fn has_virtual_fields(&self) -> bool { + self.schema_specs.iter().any(|c| c.is_computed()) + } + + pub fn is_updating(&self) -> bool { + self.connection_format + .as_ref() + .is_some_and(|f| f.is_updating()) + || self.payload_format == Some(DataEncodingFormat::DebeziumJson) + } + + /// Build strongly-typed `ConnectorOp` protobuf for runtime operator construction. + /// + /// Directly maps the in-memory [`ConnectorConfig`] to the proto `oneof config` — zero JSON, + /// zero re-parsing. 
+ pub fn connector_op(&self) -> ConnectorOp { + let physical = self.produce_physical_schema(); + let fields: Vec = physical + .fields() + .iter() + .map(|f| f.as_ref().clone()) + .collect(); + let fs_schema = FsSchema::from_fields(fields); + + ConnectorOp { + connector: self.adapter_type.clone(), + fs_schema: Some(fs_schema.into()), + name: self.table_identifier.clone(), + description: self.description.clone(), + config: Some(self.connector_config.to_proto_config()), + } + } + + pub fn processing_mode(&self) -> ProcessingMode { + if self.is_updating() { + ProcessingMode::Update + } else { + ProcessingMode::Append + } + } + + pub fn timestamp_override(&self) -> Result> { + if let Some(field_name) = self.temporal_config.event_column.clone() { + if self.is_updating() { + return plan_err!("can't use event_time_field with update mode"); + } + let _field = self.get_time_column(&field_name)?; + Ok(Some(Expr::Column(Column::from_name(field_name.as_str())))) + } else { + Ok(None) + } + } + + fn get_time_column(&self, field_name: &str) -> Result<&ColumnDescriptor> { + self.schema_specs + .iter() + .find(|c| { + c.arrow_field().name() == field_name + && matches!(c.arrow_field().data_type(), DataType::Timestamp(..)) + }) + .ok_or_else(|| { + DataFusionError::Plan(format!( + "field {field_name} not found or not a timestamp" + )) + }) + } + + pub fn watermark_column(&self) -> Result> { + if let Some(field_name) = self.temporal_config.watermark_strategy_column.clone() { + let _field = self.get_time_column(&field_name)?; + Ok(Some(Expr::Column(Column::from_name(field_name.as_str())))) + } else { + Ok(None) + } + } + + pub fn as_sql_source(&self) -> Result { + match self.role { + TableRole::Ingestion => {} + TableRole::Egress | TableRole::Reference => { + return plan_err!("cannot read from sink"); + } + }; + + if self.is_updating() && self.has_virtual_fields() { + return plan_err!("can't read from a source with virtual fields and update mode."); + } + + let timestamp_override = 
self.timestamp_override()?; + let watermark_column = self.watermark_column()?; + + let source = SqlSource { + id: self.registry_id, + struct_def: self + .schema_specs + .iter() + .filter(|c| !c.is_computed()) + .map(|c| Arc::new(c.arrow_field().clone())) + .collect(), + config: self.connector_op(), + processing_mode: self.processing_mode(), + idle_time: self.temporal_config.liveness_timeout, + }; + + Ok(SourceOperator { + name: self.table_identifier.clone(), + source, + timestamp_override, + watermark_column, + }) + } +} + +/// Plan a SQL scalar expression against a table-qualified schema (e.g. watermark `AS` clause). +fn plan_generating_expr( + ast: &ast::Expr, + df_schema: &DFSchema, + schema_provider: &StreamSchemaProvider, +) -> Result { + let planner = SqlToRel::new(schema_provider); + let mut ctx = PlannerContext::new(); + planner.sql_to_expr(ast.clone(), df_schema, &mut ctx) +} + +#[derive(Debug, Clone)] +pub struct SourceOperator { + pub name: String, + pub source: SqlSource, + pub timestamp_override: Option, + pub watermark_column: Option, +} diff --git a/src/sql/schema/table.rs b/src/sql/schema/table.rs new file mode 100644 index 00000000..156e8ffd --- /dev/null +++ b/src/sql/schema/table.rs @@ -0,0 +1,162 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::sync::Arc; +use std::time::Duration; +use datafusion::arrow::datatypes::FieldRef; +use datafusion::common::{Result, plan_err}; +use datafusion::logical_expr::{Extension, LogicalPlan}; +use datafusion::sql::sqlparser::ast::Statement; +use protocol::grpc::api::ConnectorOp; +use super::source_table::SourceTable; +use crate::sql::logical_planner::optimizers::produce_optimized_plan; +use crate::sql::schema::StreamSchemaProvider; +use crate::sql::extensions::remote_table::RemoteTableBoundaryNode; +use crate::sql::analysis::rewrite_plan; +use crate::sql::types::{DFField, ProcessingMode}; + +/// Represents all table types in the FunctionStream SQL catalog. +#[allow(clippy::enum_variant_names)] +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub enum Table { + /// A lookup table backed by an external connector. + LookupTable(SourceTable), + /// A source/sink table backed by an external connector. + ConnectorTable(SourceTable), + /// A table defined by a query (CREATE VIEW / CREATE TABLE AS SELECT). + TableFromQuery { + name: String, + logical_plan: LogicalPlan, + }, +} + +impl Table { + /// Try to construct a Table from a CREATE TABLE or CREATE VIEW statement. + pub fn try_from_statement( + statement: &Statement, + schema_provider: &StreamSchemaProvider, + ) -> Result> { + use datafusion::logical_expr::{CreateMemoryTable, CreateView, DdlStatement}; + use datafusion::sql::sqlparser::ast::CreateTable; + + if let Statement::CreateTable(CreateTable { query: None, .. }) = statement { + return plan_err!( + "CREATE TABLE without AS SELECT is not supported; use CREATE TABLE ... AS SELECT or a connector table" + ); + } + + match produce_optimized_plan(statement, schema_provider) { + Ok(LogicalPlan::Ddl(DdlStatement::CreateView(CreateView { name, input, .. }))) + | Ok(LogicalPlan::Ddl(DdlStatement::CreateMemoryTable(CreateMemoryTable { + name, + input, + .. 
+ }))) => { + let rewritten = rewrite_plan(input.as_ref().clone(), schema_provider)?; + let schema = rewritten.schema().clone(); + let remote = RemoteTableBoundaryNode { + upstream_plan: rewritten, + table_identifier: name.to_owned(), + resolved_schema: schema, + requires_materialization: true, + }; + Ok(Some(Table::TableFromQuery { + name: name.to_string(), + logical_plan: LogicalPlan::Extension(Extension { + node: Arc::new(remote), + }), + })) + } + _ => Ok(None), + } + } + + pub fn name(&self) -> &str { + match self { + Table::TableFromQuery { name, .. } => name.as_str(), + Table::ConnectorTable(c) | Table::LookupTable(c) => c.name(), + } + } + + pub fn get_fields(&self) -> Vec { + match self { + Table::ConnectorTable(SourceTable { + schema_specs, + inferred_fields, + .. + }) + | Table::LookupTable(SourceTable { + schema_specs, + inferred_fields, + .. + }) => inferred_fields.clone().unwrap_or_else(|| { + schema_specs + .iter() + .map(|c| Arc::new(c.arrow_field().clone())) + .collect() + }), + Table::TableFromQuery { logical_plan, .. } => { + logical_plan.schema().fields().iter().cloned().collect() + } + } + } + + pub fn set_inferred_fields(&mut self, fields: Vec) -> Result<()> { + let Table::ConnectorTable(t) = self else { + return Ok(()); + }; + + if !t.schema_specs.is_empty() { + return Ok(()); + } + + if let Some(existing) = &t.inferred_fields { + let matches = existing.len() == fields.len() + && existing + .iter() + .zip(&fields) + .all(|(a, b)| a.name() == b.name() && a.data_type() == b.data_type()); + + if !matches { + return plan_err!("all inserts into a table must share the same schema"); + } + } + + let fields: Vec<_> = fields.into_iter().map(|f| f.field().clone()).collect(); + t.inferred_fields.replace(fields); + + Ok(()) + } + + pub fn connector_op(&self) -> Result { + match self { + Table::ConnectorTable(c) | Table::LookupTable(c) => Ok(c.connector_op()), + Table::TableFromQuery { .. 
} => plan_err!("can't write to a query-defined table"), + } + } + + pub fn partition_exprs(&self) -> Option<&Vec> { + match self { + Table::ConnectorTable(c) => (*c.partition_exprs).as_ref(), + _ => None, + } + } +} + +#[derive(Clone, Debug)] +pub struct SqlSource { + pub id: Option, + pub struct_def: Vec, + pub config: ConnectorOp, + pub processing_mode: ProcessingMode, + pub idle_time: Option, +} diff --git a/src/sql/schema/table_execution_unit.rs b/src/sql/schema/table_execution_unit.rs new file mode 100644 index 00000000..c23dda7a --- /dev/null +++ b/src/sql/schema/table_execution_unit.rs @@ -0,0 +1,33 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use super::temporal_pipeline_config::TemporalPipelineConfig; + +#[derive(Debug, Clone)] +pub struct EngineDescriptor { + pub engine_type: String, + pub raw_payload: String, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum SyncMode { + AppendOnly, + Incremental, +} + +#[derive(Debug, Clone)] +pub struct TableExecutionUnit { + pub label: String, + pub engine_meta: EngineDescriptor, + pub sync_mode: SyncMode, + pub temporal_offset: TemporalPipelineConfig, +} diff --git a/src/sql/schema/table_role.rs b/src/sql/schema/table_role.rs new file mode 100644 index 00000000..bf3fed74 --- /dev/null +++ b/src/sql/schema/table_role.rs @@ -0,0 +1,96 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::collections::HashMap;
+
+use datafusion::arrow::datatypes::{DataType, TimeUnit};
+use datafusion::common::{Result, plan_err};
+use datafusion::error::DataFusionError;
+
+use super::column_descriptor::ColumnDescriptor;
+use super::connection_type::ConnectionType;
+use crate::sql::common::constants::{
+    connection_table_role, connector_type, SUPPORTED_CONNECTOR_ADAPTERS,
+};
+use crate::sql::common::with_option_keys as opt;
+
+/// Role of a connector-backed table in the pipeline (ingest / egress / lookup).
+#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
+pub enum TableRole {
+    Ingestion,
+    Egress,
+    Reference,
+}
+
+impl From<TableRole> for ConnectionType {
+    fn from(r: TableRole) -> Self {
+        match r {
+            TableRole::Ingestion => ConnectionType::Source,
+            TableRole::Egress => ConnectionType::Sink,
+            TableRole::Reference => ConnectionType::Lookup,
+        }
+    }
+}
+
+impl From<ConnectionType> for TableRole {
+    fn from(c: ConnectionType) -> Self {
+        match c {
+            ConnectionType::Source => TableRole::Ingestion,
+            ConnectionType::Sink => TableRole::Egress,
+            ConnectionType::Lookup => TableRole::Reference,
+        }
+    }
+}
+
+pub fn validate_adapter_availability(adapter: &str) -> Result<()> {
+    if !SUPPORTED_CONNECTOR_ADAPTERS.contains(&adapter) {
+        return Err(DataFusionError::Plan(format!("Unknown adapter '{adapter}'")));
+    }
+    Ok(())
+}
+
+pub fn apply_adapter_specific_rules(adapter: &str, mut cols: Vec<ColumnDescriptor>) -> Vec<ColumnDescriptor> {
+    match adapter {
+        a if a == connector_type::DELTA || a == connector_type::ICEBERG => {
+            for c in &mut cols {
+                if matches!(c.data_type(), DataType::Timestamp(_, _)) {
+                    c.force_precision(TimeUnit::Microsecond);
+                }
+            }
+            cols
+        }
+        _ => cols,
+    }
+}
+
+pub fn deduce_role(options: &HashMap<String, String>) -> Result<TableRole> {
+    match options.get(opt::TYPE).map(|s| s.as_str()) {
+        None | Some(connection_table_role::SOURCE) => Ok(TableRole::Ingestion),
+        Some(connection_table_role::SINK) => Ok(TableRole::Egress),
+        Some(connection_table_role::LOOKUP) => Ok(TableRole::Reference),
+        Some(other) => plan_err!("Invalid role '{other}'"),
+    }
+}
+
+pub fn serialize_backend_params(adapter: &str, options: &HashMap<String, String>) -> Result<String> {
+    let mut payload = serde_json::Map::new();
+    payload.insert(
+        opt::ADAPTER.to_string(),
+        serde_json::Value::String(adapter.to_string()),
+    );
+
+    for (k, v) in options {
+        payload.insert(k.clone(), serde_json::Value::String(v.clone()));
+    }
+
+    serde_json::to_string(&payload).map_err(|e| DataFusionError::Plan(e.to_string()))
+}
diff --git a/src/sql/schema/temporal_pipeline_config.rs b/src/sql/schema/temporal_pipeline_config.rs
new file mode 100644
index 00000000..f672e552
--- /dev/null
+++ b/src/sql/schema/temporal_pipeline_config.rs
@@ -0,0 +1,58 @@
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::time::Duration;
+
+use datafusion::common::{Result, plan_err};
+use datafusion::logical_expr::Expr;
+
+use super::column_descriptor::ColumnDescriptor;
+use crate::sql::common::constants::sql_field;
+
+/// Event-time and watermark configuration for streaming tables.
+#[derive(Debug, Clone, Default, PartialEq, Eq, Hash)]
+pub struct TemporalPipelineConfig {
+    pub event_column: Option<String>,
+    pub watermark_strategy_column: Option<String>,
+    pub liveness_timeout: Option<Duration>,
+}
+
+#[derive(Debug, Clone)]
+pub struct TemporalSpec {
+    pub time_field: String,
+    pub watermark_expr: Option<Expr>,
+}
+
+pub fn resolve_temporal_logic(
+    columns: &[ColumnDescriptor],
+    time_meta: Option<TemporalSpec>,
+) -> Result<TemporalPipelineConfig> {
+    let mut config = TemporalPipelineConfig::default();
+
+    if let Some(meta) = time_meta {
+        let field_exists = columns
+            .iter()
+            .any(|c| c.arrow_field().name() == meta.time_field.as_str());
+        if !field_exists {
+            return plan_err!("Temporal field {} does not exist", meta.time_field);
+        }
+        config.event_column = Some(meta.time_field.clone());
+
+        if meta.watermark_expr.is_some() {
+            config.watermark_strategy_column = Some(sql_field::COMPUTED_WATERMARK.to_string());
+        } else {
+            config.watermark_strategy_column = Some(meta.time_field);
+        }
+    }
+
+    Ok(config)
+}
diff --git a/src/sql/schema/utils.rs b/src/sql/schema/utils.rs
new file mode 100644
index 00000000..ba408f22
--- /dev/null
+++ b/src/sql/schema/utils.rs
@@ -0,0 +1,79 @@
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +use std::collections::HashMap; +use std::sync::Arc; + +use datafusion::arrow::datatypes::{DataType, Field, Schema, SchemaRef, TimeUnit}; +use datafusion::common::{DFSchema, DFSchemaRef, Result as DFResult, TableReference}; + +use crate::sql::common::constants::window_interval_field; +use crate::sql::types::{DFField, TIMESTAMP_FIELD}; + +/// Returns the Arrow struct type for a window (start, end) pair. +pub fn window_arrow_struct() -> DataType { + DataType::Struct( + vec![ + Arc::new(Field::new( + window_interval_field::START, + DataType::Timestamp(TimeUnit::Nanosecond, None), + false, + )), + Arc::new(Field::new( + window_interval_field::END, + DataType::Timestamp(TimeUnit::Nanosecond, None), + false, + )), + ] + .into(), + ) +} + +/// Adds a `_timestamp` field to a DFSchema if it doesn't already have one. +pub fn add_timestamp_field( + schema: DFSchemaRef, + qualifier: Option, +) -> DFResult { + if has_timestamp_field(&schema) { + return Ok(schema); + } + + let timestamp_field = DFField::new( + qualifier, + TIMESTAMP_FIELD, + DataType::Timestamp(TimeUnit::Nanosecond, None), + false, + ); + Ok(Arc::new(schema.join(&DFSchema::new_with_metadata( + vec![timestamp_field.into()], + HashMap::new(), + )?)?)) +} + +/// Checks whether a DFSchema contains a `_timestamp` field. +pub fn has_timestamp_field(schema: &DFSchemaRef) -> bool { + schema + .fields() + .iter() + .any(|field| field.name() == TIMESTAMP_FIELD) +} + +/// Adds a `_timestamp` field to an Arrow Schema, returning a new SchemaRef. 
+pub fn add_timestamp_field_arrow(schema: Schema) -> SchemaRef { + let mut fields = schema.fields().to_vec(); + fields.push(Arc::new(Field::new( + TIMESTAMP_FIELD, + DataType::Timestamp(TimeUnit::Nanosecond, None), + false, + ))); + Arc::new(Schema::new(fields)) +} diff --git a/src/sql/types/data_type.rs b/src/sql/types/data_type.rs new file mode 100644 index 00000000..4736f812 --- /dev/null +++ b/src/sql/types/data_type.rs @@ -0,0 +1,157 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::sync::Arc; + +use datafusion::arrow::datatypes::{ + DECIMAL_DEFAULT_SCALE, DECIMAL128_MAX_PRECISION, DataType, Field, IntervalUnit, TimeUnit, +}; +use datafusion::common::{Result, plan_datafusion_err, plan_err}; + +use crate::sql::common::constants::planning_placeholder_udf; +use crate::sql::common::FsExtensionType; + +pub fn convert_data_type( + sql_type: &datafusion::sql::sqlparser::ast::DataType, +) -> Result<(DataType, Option)> { + use datafusion::sql::sqlparser::ast::ArrayElemTypeDef; + use datafusion::sql::sqlparser::ast::DataType as SQLDataType; + + match sql_type { + SQLDataType::Array(ArrayElemTypeDef::AngleBracket(inner_sql_type)) + | SQLDataType::Array(ArrayElemTypeDef::SquareBracket(inner_sql_type, _)) => { + let (data_type, extension) = convert_simple_data_type(inner_sql_type)?; + + Ok(( + DataType::List(Arc::new(FsExtensionType::add_metadata( + extension, + Field::new(planning_placeholder_udf::LIST_ELEMENT_FIELD, data_type, true), + ))), + None, + )) + } + SQLDataType::Array(ArrayElemTypeDef::None) => { + plan_err!("Arrays with unspecified type is not supported") + } + other => convert_simple_data_type(other), + } +} + +fn convert_simple_data_type( + sql_type: &datafusion::sql::sqlparser::ast::DataType, +) -> Result<(DataType, Option)> { + use datafusion::sql::sqlparser::ast::DataType as SQLDataType; + use datafusion::sql::sqlparser::ast::{ExactNumberInfo, TimezoneInfo}; + + if matches!(sql_type, SQLDataType::JSON) { + return Ok((DataType::Utf8, Some(FsExtensionType::JSON))); + } + + let dt = match sql_type { + SQLDataType::Boolean | SQLDataType::Bool => Ok(DataType::Boolean), + SQLDataType::TinyInt(_) => Ok(DataType::Int8), + SQLDataType::SmallInt(_) | SQLDataType::Int2(_) => Ok(DataType::Int16), + SQLDataType::Int(_) | SQLDataType::Integer(_) | SQLDataType::Int4(_) => Ok(DataType::Int32), + SQLDataType::BigInt(_) | SQLDataType::Int8(_) => Ok(DataType::Int64), + SQLDataType::TinyIntUnsigned(_) => Ok(DataType::UInt8), + 
SQLDataType::SmallIntUnsigned(_) | SQLDataType::Int2Unsigned(_) => Ok(DataType::UInt16), + SQLDataType::IntUnsigned(_) + | SQLDataType::UnsignedInteger + | SQLDataType::Int4Unsigned(_) => Ok(DataType::UInt32), + SQLDataType::BigIntUnsigned(_) | SQLDataType::Int8Unsigned(_) => Ok(DataType::UInt64), + SQLDataType::Float(_) => Ok(DataType::Float32), + SQLDataType::Real | SQLDataType::Float4 => Ok(DataType::Float32), + SQLDataType::Double(_) | SQLDataType::DoublePrecision | SQLDataType::Float8 => { + Ok(DataType::Float64) + } + SQLDataType::Char(_) + | SQLDataType::Varchar(_) + | SQLDataType::Text + | SQLDataType::String(_) => Ok(DataType::Utf8), + SQLDataType::Timestamp(None, TimezoneInfo::None) | SQLDataType::Datetime(_) => { + Ok(DataType::Timestamp(TimeUnit::Nanosecond, None)) + } + SQLDataType::Timestamp(Some(precision), TimezoneInfo::None) => match *precision { + 0 => Ok(DataType::Timestamp(TimeUnit::Second, None)), + 3 => Ok(DataType::Timestamp(TimeUnit::Millisecond, None)), + 6 => Ok(DataType::Timestamp(TimeUnit::Microsecond, None)), + 9 => Ok(DataType::Timestamp(TimeUnit::Nanosecond, None)), + _ => { + return plan_err!( + "unsupported precision {} -- supported precisions are 0 (seconds), \ + 3 (milliseconds), 6 (microseconds), and 9 (nanoseconds)", + precision + ); + } + }, + SQLDataType::Date => Ok(DataType::Date32), + SQLDataType::Time(None, tz_info) => { + if matches!(tz_info, TimezoneInfo::None) + || matches!(tz_info, TimezoneInfo::WithoutTimeZone) + { + Ok(DataType::Time64(TimeUnit::Nanosecond)) + } else { + return plan_err!("Unsupported SQL type {sql_type:?}"); + } + } + SQLDataType::Numeric(exact_number_info) | SQLDataType::Decimal(exact_number_info) => { + let (precision, scale) = match *exact_number_info { + ExactNumberInfo::None => (None, None), + ExactNumberInfo::Precision(precision) => (Some(precision), None), + ExactNumberInfo::PrecisionAndScale(precision, scale) => { + (Some(precision), Some(scale)) + } + }; + make_decimal_type(precision, scale) 
+ } + SQLDataType::Bytea => Ok(DataType::Binary), + SQLDataType::Interval => Ok(DataType::Interval(IntervalUnit::MonthDayNano)), + SQLDataType::Struct(fields, _) => { + let fields: Vec<_> = fields + .iter() + .map(|f| { + Ok::<_, datafusion::error::DataFusionError>(Arc::new(Field::new( + f.field_name + .as_ref() + .ok_or_else(|| { + plan_datafusion_err!("anonymous struct fields are not allowed") + })? + .to_string(), + convert_data_type(&f.field_type)?.0, + true, + ))) + }) + .collect::>()?; + Ok(DataType::Struct(fields.into())) + } + _ => return plan_err!("Unsupported SQL type {sql_type:?}"), + }; + + Ok((dt?, None)) +} + +fn make_decimal_type(precision: Option, scale: Option) -> Result { + let (precision, scale) = match (precision, scale) { + (Some(p), Some(s)) => (p as u8, s as i8), + (Some(p), None) => (p as u8, 0), + (None, Some(_)) => return plan_err!("Cannot specify only scale for decimal data type"), + (None, None) => (DECIMAL128_MAX_PRECISION, DECIMAL_DEFAULT_SCALE), + }; + + if precision == 0 || precision > DECIMAL128_MAX_PRECISION || scale.unsigned_abs() > precision { + plan_err!( + "Decimal(precision = {precision}, scale = {scale}) should satisfy `0 < precision <= 38`, and `scale <= precision`." + ) + } else { + Ok(DataType::Decimal128(precision, scale)) + } +} diff --git a/src/sql/types/df_field.rs b/src/sql/types/df_field.rs new file mode 100644 index 00000000..435ae30a --- /dev/null +++ b/src/sql/types/df_field.rs @@ -0,0 +1,153 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::HashMap; +use std::sync::Arc; + +use datafusion::arrow::datatypes::{DataType, Field, FieldRef}; +use datafusion::common::{Column, DFSchema, Result, TableReference}; + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct DFField { + qualifier: Option, + field: FieldRef, +} + +impl From<(Option, FieldRef)> for DFField { + fn from(value: (Option, FieldRef)) -> Self { + Self { + qualifier: value.0, + field: value.1, + } + } +} + +impl From<(Option<&TableReference>, &Field)> for DFField { + fn from(value: (Option<&TableReference>, &Field)) -> Self { + Self { + qualifier: value.0.cloned(), + field: Arc::new(value.1.clone()), + } + } +} + +impl From for (Option, FieldRef) { + fn from(value: DFField) -> Self { + (value.qualifier, value.field) + } +} + +impl DFField { + pub fn new( + qualifier: Option, + name: impl Into, + data_type: DataType, + nullable: bool, + ) -> Self { + Self { + qualifier, + field: Arc::new(Field::new(name, data_type, nullable)), + } + } + + pub fn new_unqualified(name: &str, data_type: DataType, nullable: bool) -> Self { + DFField { + qualifier: None, + field: Arc::new(Field::new(name, data_type, nullable)), + } + } + + pub fn name(&self) -> &String { + self.field.name() + } + + pub fn data_type(&self) -> &DataType { + self.field.data_type() + } + + pub fn is_nullable(&self) -> bool { + self.field.is_nullable() + } + + pub fn metadata(&self) -> &HashMap { + self.field.metadata() + } + + pub fn qualified_name(&self) -> String { + if let Some(qualifier) = &self.qualifier { + format!("{}.{}", qualifier, self.field.name()) + } else { + self.field.name().to_owned() + } + } + + pub fn qualified_column(&self) -> Column { + Column { + relation: self.qualifier.clone(), + name: self.field.name().to_string(), + spans: Default::default(), + } + } + + pub fn unqualified_column(&self) -> Column { + Column { + relation: None, + 
name: self.field.name().to_string(), + spans: Default::default(), + } + } + + pub fn qualifier(&self) -> Option<&TableReference> { + self.qualifier.as_ref() + } + + pub fn field(&self) -> &FieldRef { + &self.field + } + + pub fn strip_qualifier(mut self) -> Self { + self.qualifier = None; + self + } + + pub fn with_nullable(mut self, nullable: bool) -> Self { + let f = self.field().as_ref().clone().with_nullable(nullable); + self.field = f.into(); + self + } + + pub fn with_metadata(mut self, metadata: HashMap) -> Self { + let f = self.field().as_ref().clone().with_metadata(metadata); + self.field = f.into(); + self + } +} + +pub fn fields_with_qualifiers(schema: &DFSchema) -> Vec { + schema + .fields() + .iter() + .enumerate() + .map(|(i, f)| (schema.qualified_field(i).0.cloned(), f.clone()).into()) + .collect() +} + +pub fn schema_from_df_fields(fields: &[DFField]) -> Result { + schema_from_df_fields_with_metadata(fields, HashMap::new()) +} + +pub fn schema_from_df_fields_with_metadata( + fields: &[DFField], + metadata: HashMap, +) -> Result { + DFSchema::new_with_metadata(fields.iter().map(|t| t.clone().into()).collect(), metadata) +} diff --git a/src/sql/types/mod.rs b/src/sql/types/mod.rs new file mode 100644 index 00000000..4c99d08f --- /dev/null +++ b/src/sql/types/mod.rs @@ -0,0 +1,62 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +mod data_type; +mod df_field; +pub(crate) mod placeholder_udf; +mod stream_schema; +mod window; + +use std::time::Duration; + +use crate::sql::common::constants::sql_planning_default; + +pub use df_field::{ + DFField, fields_with_qualifiers, schema_from_df_fields, schema_from_df_fields_with_metadata, +}; +pub(crate) use placeholder_udf::PlaceholderUdf; +pub(crate) use window::WindowBehavior; +pub use window::{WindowType, find_window}; + +pub use crate::sql::common::constants::sql_field::TIMESTAMP_FIELD; + +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum ProcessingMode { + Append, + Update, +} + +#[derive(Clone, Debug)] +pub struct SqlConfig { + pub default_parallelism: usize, +} + +impl Default for SqlConfig { + fn default() -> Self { + Self { + default_parallelism: sql_planning_default::DEFAULT_PARALLELISM, + } + } +} + +#[derive(Clone)] +pub struct PlanningOptions { + pub ttl: Duration, +} + +impl Default for PlanningOptions { + fn default() -> Self { + Self { + ttl: Duration::from_secs(sql_planning_default::PLANNING_TTL_SECS), + } + } +} diff --git a/src/sql/types/placeholder_udf.rs b/src/sql/types/placeholder_udf.rs new file mode 100644 index 00000000..0bdf17e6 --- /dev/null +++ b/src/sql/types/placeholder_udf.rs @@ -0,0 +1,70 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+use std::any::Any;
+use std::fmt::{Debug, Formatter};
+use std::sync::Arc;
+
+use datafusion::arrow::datatypes::DataType;
+use datafusion::common::Result;
+use datafusion::logical_expr::{
+    ColumnarValue, ScalarFunctionArgs, ScalarUDF, ScalarUDFImpl, Signature, Volatility,
+};
+
+#[allow(clippy::type_complexity)]
+pub(crate) struct PlaceholderUdf {
+    name: String,
+    signature: Signature,
+    return_type: Arc<dyn Fn(&[DataType]) -> Result<DataType> + Send + Sync + 'static>,
+}
+
+impl Debug for PlaceholderUdf {
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        write!(f, "PlaceholderUDF<{}>", self.name)
+    }
+}
+
+impl ScalarUDFImpl for PlaceholderUdf {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        &self.name
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, args: &[DataType]) -> Result<DataType> {
+        (self.return_type)(args)
+    }
+
+    fn invoke_with_args(&self, _args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        unimplemented!("PlaceholderUdf should never be called at execution time");
+    }
+}
+
+impl PlaceholderUdf {
+    pub fn with_return(
+        name: impl Into<String>,
+        args: Vec<DataType>,
+        ret: DataType,
+    ) -> Arc<ScalarUDF> {
+        Arc::new(ScalarUDF::new_from_impl(PlaceholderUdf {
+            name: name.into(),
+            signature: Signature::exact(args, Volatility::Volatile),
+            return_type: Arc::new(move |_| Ok(ret.clone())),
+        }))
+    }
+}
diff --git a/src/sql/types/stream_schema.rs b/src/sql/types/stream_schema.rs
new file mode 100644
index 00000000..4b63182d
--- /dev/null
+++ b/src/sql/types/stream_schema.rs
@@ -0,0 +1,88 @@
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::Arc; + +use datafusion::arrow::datatypes::{Field, Schema, SchemaRef}; +use datafusion::common::Result; + +use super::TIMESTAMP_FIELD; + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct StreamSchema { + pub schema: SchemaRef, + pub timestamp_index: usize, + pub key_indices: Option>, +} + +impl StreamSchema { + pub fn new(schema: SchemaRef, timestamp_index: usize, key_indices: Option>) -> Self { + Self { + schema, + timestamp_index, + key_indices, + } + } + + pub fn new_unkeyed(schema: SchemaRef, timestamp_index: usize) -> Self { + Self { + schema, + timestamp_index, + key_indices: None, + } + } + + pub fn from_fields(fields: Vec) -> Self { + let schema = Arc::new(Schema::new(fields)); + let timestamp_index = schema + .column_with_name(TIMESTAMP_FIELD) + .map(|(i, _)| i) + .unwrap_or(0); + Self { + schema, + timestamp_index, + key_indices: None, + } + } + + pub fn from_schema_keys(schema: SchemaRef, key_indices: Vec) -> Result { + let timestamp_index = schema + .column_with_name(TIMESTAMP_FIELD) + .ok_or_else(|| { + datafusion::error::DataFusionError::Plan(format!( + "no {TIMESTAMP_FIELD} field in schema, schema is {schema:?}" + )) + })? + .0; + Ok(Self { + schema, + timestamp_index, + key_indices: Some(key_indices), + }) + } + + pub fn from_schema_unkeyed(schema: SchemaRef) -> Result { + let timestamp_index = schema + .column_with_name(TIMESTAMP_FIELD) + .ok_or_else(|| { + datafusion::error::DataFusionError::Plan(format!( + "no {TIMESTAMP_FIELD} field in schema" + )) + })? 
+ .0; + Ok(Self { + schema, + timestamp_index, + key_indices: None, + }) + } +} diff --git a/src/sql/types/window.rs b/src/sql/types/window.rs new file mode 100644 index 00000000..7934bc1d --- /dev/null +++ b/src/sql/types/window.rs @@ -0,0 +1,109 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::time::Duration; + +use datafusion::common::{Result, plan_err}; +use datafusion::logical_expr::Expr; + +use crate::sql::common::constants::window_fn; + +use super::DFField; + +#[derive(Clone, Debug, PartialEq, Eq, Hash)] +pub enum WindowType { + Tumbling { width: Duration }, + Sliding { width: Duration, slide: Duration }, + Session { gap: Duration }, + Instant, +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub(crate) enum WindowBehavior { + FromOperator { + window: WindowType, + window_field: DFField, + window_index: usize, + is_nested: bool, + }, + InData, +} + +pub fn get_duration(expression: &Expr) -> Result { + use datafusion::common::ScalarValue; + + match expression { + Expr::Literal(ScalarValue::IntervalDayTime(Some(val)), _) => { + Ok(Duration::from_secs((val.days as u64) * 24 * 60 * 60) + + Duration::from_millis(val.milliseconds as u64)) + } + Expr::Literal(ScalarValue::IntervalMonthDayNano(Some(val)), _) => { + if val.months != 0 { + return datafusion::common::not_impl_err!( + "Windows do not support durations specified as months" + ); + } + Ok(Duration::from_secs((val.days as u64) * 24 * 60 * 60) + + Duration::from_nanos(val.nanoseconds as 
u64)) + } + _ => plan_err!( + "unsupported Duration expression, expect duration literal, not {}", + expression + ), + } +} + +pub fn find_window(expression: &Expr) -> Result> { + use datafusion::logical_expr::expr::Alias; + use datafusion::logical_expr::expr::ScalarFunction; + + match expression { + Expr::ScalarFunction(ScalarFunction { func: fun, args }) => match fun.name() { + name if name == window_fn::HOP => { + if args.len() != 2 { + unreachable!(); + } + let slide = get_duration(&args[0])?; + let width = get_duration(&args[1])?; + if width.as_nanos() % slide.as_nanos() != 0 { + return plan_err!( + "hop() width {:?} must be a multiple of slide {:?}", + width, + slide + ); + } + if slide == width { + Ok(Some(WindowType::Tumbling { width })) + } else { + Ok(Some(WindowType::Sliding { width, slide })) + } + } + name if name == window_fn::TUMBLE => { + if args.len() != 1 { + unreachable!("wrong number of arguments for tumble(), expect one"); + } + let width = get_duration(&args[0])?; + Ok(Some(WindowType::Tumbling { width })) + } + name if name == window_fn::SESSION => { + if args.len() != 1 { + unreachable!("wrong number of arguments for session(), expected one"); + } + let gap = get_duration(&args[0])?; + Ok(Some(WindowType::Session { gap })) + } + _ => Ok(None), + }, + Expr::Alias(Alias { expr, .. }) => find_window(expr), + _ => Ok(None), + } +} diff --git a/src/storage/mod.rs b/src/storage/mod.rs index a4898619..823425d2 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -11,4 +11,5 @@ // limitations under the License. pub mod state_backend; +pub mod stream_catalog; pub mod task; diff --git a/src/storage/stream_catalog/codec.rs b/src/storage/stream_catalog/codec.rs new file mode 100644 index 00000000..34c2c4ba --- /dev/null +++ b/src/storage/stream_catalog/codec.rs @@ -0,0 +1,57 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Arrow Schema IPC and [`LogicalProgram`] bincode payloads for stream catalog rows. + +use std::io::Cursor; +use std::sync::Arc; + +use datafusion::arrow::datatypes::Schema; +use datafusion::arrow::ipc::reader::StreamReader; +use datafusion::arrow::ipc::writer::StreamWriter; +use datafusion::arrow::record_batch::RecordBatch; +use datafusion::common::{DataFusionError, Result}; + +use crate::sql::logical_node::logical::LogicalProgram; + +pub struct CatalogCodec; + +impl CatalogCodec { + pub fn encode_schema(schema: &Arc) -> Result> { + let mut buffer = Vec::new(); + let empty_batch = RecordBatch::new_empty(Arc::clone(schema)); + let mut writer = StreamWriter::try_new(&mut buffer, schema.as_ref()) + .map_err(|e| DataFusionError::External(Box::new(e)))?; + writer + .write(&empty_batch) + .map_err(|e| DataFusionError::External(Box::new(e)))?; + writer + .finish() + .map_err(|e| DataFusionError::External(Box::new(e)))?; + Ok(buffer) + } + + pub fn decode_schema(bytes: &[u8]) -> Result> { + let cursor = Cursor::new(bytes); + let reader = StreamReader::try_new(cursor, None) + .map_err(|e| DataFusionError::External(Box::new(e)))?; + Ok(reader.schema()) + } + + pub fn encode_logical_program(program: &LogicalProgram) -> Result> { + program.encode_for_catalog() + } + + pub fn decode_logical_program(bytes: &[u8]) -> Result { + LogicalProgram::decode_for_catalog(bytes) + } +} diff --git a/src/storage/stream_catalog/manager.rs b/src/storage/stream_catalog/manager.rs new file mode 100644 index 00000000..a0eb9b49 --- /dev/null +++ 
b/src/storage/stream_catalog/manager.rs @@ -0,0 +1,624 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::{Arc, OnceLock}; + +use anyhow::{anyhow, bail, Context}; +use datafusion::common::{internal_err, plan_err, Result as DFResult}; +use prost::Message; +use protocol::grpc::api::FsProgram; +use protocol::storage::{self as pb, table_definition}; +use tracing::{info, warn}; +use unicase::UniCase; + +use crate::sql::common::constants::sql_field; +use crate::sql::schema::column_descriptor::ColumnDescriptor; +use crate::sql::schema::connection_type::ConnectionType; +use crate::sql::schema::source_table::SourceTable; +use crate::sql::schema::table::Table as CatalogTable; +use crate::sql::schema::{StreamPlanningContext, StreamTable}; + +use super::codec::CatalogCodec; +use super::meta_store::MetaStore; + +const CATALOG_KEY_PREFIX: &str = "catalog:stream_table:"; +const STREAMING_JOB_KEY_PREFIX: &str = "streaming_job:"; + +pub struct CatalogManager { + store: Arc, +} + +static GLOBAL_CATALOG: OnceLock> = OnceLock::new(); + +impl CatalogManager { + pub fn new(store: Arc) -> Self { + Self { store } + } + + pub fn init_global_in_memory() -> anyhow::Result<()> { + Self::init_global(Arc::new(super::InMemoryMetaStore::new())) + } + + pub fn init_global(store: Arc) -> anyhow::Result<()> { + if GLOBAL_CATALOG.get().is_some() { + bail!("CatalogManager already initialized"); + } + + let mgr = Arc::new(CatalogManager::new(store)); + GLOBAL_CATALOG + .set(mgr) + 
.map_err(|_| anyhow!("CatalogManager global install failed"))?; + + Ok(()) + } + + pub fn try_global() -> Option> { + GLOBAL_CATALOG.get().cloned() + } + + pub fn global() -> anyhow::Result> { + Self::try_global().ok_or_else(|| anyhow!("CatalogManager not initialized")) + } + + #[inline] + fn build_store_key(table_name: &str) -> String { + format!("{CATALOG_KEY_PREFIX}{}", table_name.to_lowercase()) + } + + #[inline] + fn build_streaming_job_key(table_name: &str) -> String { + format!("{STREAMING_JOB_KEY_PREFIX}{}", table_name.to_lowercase()) + } + + // ======================================================================== + // Streaming job persistence (CREATE STREAMING TABLE / DROP STREAMING TABLE) + // ======================================================================== + + pub fn persist_streaming_job( + &self, + table_name: &str, + fs_program: &FsProgram, + comment: &str, + ) -> DFResult<()> { + let program_bytes = fs_program.encode_to_vec(); + let def = pb::StreamingTableDefinition { + table_name: table_name.to_string(), + created_at_millis: chrono::Utc::now().timestamp_millis(), + fs_program_bytes: program_bytes, + comment: comment.to_string(), + }; + let payload = def.encode_to_vec(); + let key = Self::build_streaming_job_key(table_name); + self.store.put(&key, payload)?; + info!(table = %table_name, "Streaming job definition persisted"); + Ok(()) + } + + pub fn remove_streaming_job(&self, table_name: &str) -> DFResult<()> { + let key = Self::build_streaming_job_key(table_name); + self.store.delete(&key)?; + info!(table = %table_name, "Streaming job definition removed from store"); + Ok(()) + } + + pub fn load_streaming_job_definitions( + &self, + ) -> DFResult> { + let records = self.store.scan_prefix(STREAMING_JOB_KEY_PREFIX)?; + let mut out = Vec::with_capacity(records.len()); + for (key, payload) in records { + let def = match pb::StreamingTableDefinition::decode(payload.as_slice()) { + Ok(v) => v, + Err(e) => { + warn!( + key = %key, + error = 
%e, + "Skipping corrupted streaming job record" + ); + continue; + } + }; + let program = match FsProgram::decode(def.fs_program_bytes.as_slice()) { + Ok(v) => v, + Err(e) => { + warn!( + table = %def.table_name, + error = %e, + "Skipping streaming job with corrupted FsProgram" + ); + continue; + } + }; + out.push((def.table_name, program)); + } + Ok(out) + } + + // ======================================================================== + // Catalog table persistence (CREATE TABLE / DROP TABLE) + // ======================================================================== + + pub fn add_catalog_table(&self, table: CatalogTable) -> DFResult<()> { + let proto_def = self.encode_catalog_table(&table)?; + let payload = proto_def.encode_to_vec(); + let key = Self::build_store_key(table.name()); + + self.store.put(&key, payload)?; + Ok(()) + } + + pub fn has_catalog_table(&self, name: &str) -> bool { + let key = Self::build_store_key(name); + self.store.get(&key).ok().flatten().is_some() + } + + pub fn drop_catalog_table(&self, table_name: &str, if_exists: bool) -> DFResult<()> { + let key = Self::build_store_key(table_name); + let exists = self.store.get(&key)?.is_some(); + if !exists { + if if_exists { + return Ok(()); + } + return plan_err!("Table '{table_name}' not found"); + } + self.store.delete(&key)?; + Ok(()) + } + + pub fn restore_from_store(&self) -> DFResult<()> { + // No-op by design: the catalog is read-through from storage. + Ok(()) + } + + pub fn acquire_planning_context(&self) -> StreamPlanningContext { + let mut ctx = StreamPlanningContext::new(); + let catalogs = self.load_catalog_tables_map().unwrap_or_default(); + ctx.tables.catalogs = catalogs.clone(); + + for (name, table) in catalogs { + let source = match table.as_ref() { + CatalogTable::ConnectorTable(s) | CatalogTable::LookupTable(s) => s, + CatalogTable::TableFromQuery { .. 
} => continue, + }; + + let schema = Arc::new(source.produce_physical_schema()); + ctx.tables.streams.insert( + name, + Arc::new(StreamTable::Source { + name: source.name().to_string(), + connector: source.connector().to_string(), + schema, + event_time_field: source.event_time_field().map(str::to_string), + watermark_field: source.stream_catalog_watermark_field(), + with_options: source.catalog_with_options().clone(), + }), + ); + } + ctx + } + + /// All persisted catalog tables, sorted by table name. + pub fn list_catalog_tables(&self) -> DFResult>> { + let mut out: Vec> = + self.load_catalog_tables_map()?.into_values().collect(); + out.sort_by(|a, b| a.name().cmp(b.name())); + Ok(out) + } + + pub fn get_catalog_table(&self, name: &str) -> DFResult>> { + let key = UniCase::new(name.to_string()); + Ok(self.load_catalog_tables_map()?.get(&key).cloned()) + } + + pub fn add_table(&self, table: StreamTable) -> DFResult<()> { + match table { + StreamTable::Source { + name, + connector, + schema, + event_time_field, + watermark_field, + with_options, + } => { + let mut source = SourceTable::new(name, connector, ConnectionType::Source); + source.schema_specs = schema + .fields() + .iter() + .map(|f| ColumnDescriptor::new_physical((**f).clone())) + .collect(); + source.inferred_fields = Some(schema.fields().iter().cloned().collect()); + source.temporal_config.event_column = event_time_field; + source.temporal_config.watermark_strategy_column = watermark_field; + source.catalog_with_options = with_options; + self.add_catalog_table(CatalogTable::ConnectorTable(source)) + } + StreamTable::Sink { name, .. 
} => plan_err!( + "Persisting streaming sink '{name}' in stream catalog is no longer supported" + ), + } + } + + pub fn has_stream_table(&self, name: &str) -> bool { + self.has_catalog_table(name) + } + + pub fn drop_table(&self, table_name: &str, if_exists: bool) -> DFResult<()> { + self.drop_catalog_table(table_name, if_exists) + } + + pub fn list_stream_tables(&self) -> Vec> { + self.list_catalog_tables() + .unwrap_or_default() + .into_iter() + .filter_map(|t| match t.as_ref() { + CatalogTable::ConnectorTable(s) | CatalogTable::LookupTable(s) => { + Some(Arc::new(StreamTable::Source { + name: s.name().to_string(), + connector: s.connector().to_string(), + schema: Arc::new(s.produce_physical_schema()), + event_time_field: s.event_time_field().map(str::to_string), + watermark_field: s.stream_catalog_watermark_field(), + with_options: s.catalog_with_options().clone(), + })) + } + CatalogTable::TableFromQuery { .. } => None, + }) + .collect() + } + + pub fn get_stream_table(&self, name: &str) -> Option> { + self.get_catalog_table(name) + .ok() + .flatten() + .and_then(|t| match t.as_ref() { + CatalogTable::ConnectorTable(s) | CatalogTable::LookupTable(s) => { + Some(Arc::new(StreamTable::Source { + name: s.name().to_string(), + connector: s.connector().to_string(), + schema: Arc::new(s.produce_physical_schema()), + event_time_field: s.event_time_field().map(str::to_string), + watermark_field: s.stream_catalog_watermark_field(), + with_options: s.catalog_with_options().clone(), + })) + } + CatalogTable::TableFromQuery { .. 
} => None, + }) + } + + fn encode_catalog_table(&self, table: &CatalogTable) -> DFResult { + let table_type = match table { + CatalogTable::ConnectorTable(source) | CatalogTable::LookupTable(source) => { + let mut opts = source.catalog_with_options().clone(); + opts.entry("connector".to_string()) + .or_insert_with(|| source.connector().to_string()); + let catalog_row = pb::CatalogSourceTable { + arrow_schema_ipc: CatalogCodec::encode_schema(&Arc::new( + source.produce_physical_schema(), + ))?, + event_time_field: source.event_time_field().map(str::to_string), + watermark_field: source.stream_catalog_watermark_field(), + with_options: opts.into_iter().collect(), + connector: source.connector().to_string(), + description: source.description.clone(), + }; + if matches!(table, CatalogTable::LookupTable(_)) { + table_definition::TableType::LookupTable(catalog_row) + } else { + table_definition::TableType::ConnectorTable(catalog_row) + } + } + CatalogTable::TableFromQuery { name, .. } => return plan_err!( + "Persisting query-defined table '{}' is not supported by stream catalog storage", + name + ), + }; + + Ok(pb::TableDefinition { + table_name: table.name().to_string(), + updated_at_millis: chrono::Utc::now().timestamp_millis(), + table_type: Some(table_type), + }) + } + + fn decode_catalog_source_table( + &self, + table_name: String, + source_row: pb::CatalogSourceTable, + as_lookup: bool, + ) -> DFResult { + let connector = if source_row.connector.is_empty() { + source_row + .with_options + .get("connector") + .cloned() + .unwrap_or_else(|| "stream_catalog".to_string()) + } else { + source_row.connector.clone() + }; + let mut source = SourceTable::new( + table_name, + connector, + if as_lookup { + ConnectionType::Lookup + } else { + ConnectionType::Source + }, + ); + let schema = CatalogCodec::decode_schema(&source_row.arrow_schema_ipc)?; + source.schema_specs = schema + .fields() + .iter() + .map(|f| ColumnDescriptor::new_physical((**f).clone())) + .collect(); + 
source.inferred_fields = Some(schema.fields().iter().cloned().collect()); + source.temporal_config.event_column = source_row.event_time_field; + source.temporal_config.watermark_strategy_column = source_row + .watermark_field + .filter(|w| w != sql_field::COMPUTED_WATERMARK); + source.catalog_with_options = source_row.with_options.into_iter().collect(); + source.description = source_row.description; + + // Rebuild strongly-typed ConnectorConfig from persisted WITH options. + if source.connector().eq_ignore_ascii_case("kafka") { + use crate::sql::schema::kafka_operator_config::build_kafka_proto_config_from_string_map; + use crate::sql::schema::ConnectorConfig; + let opts_map: std::collections::HashMap = + source.catalog_with_options.iter().map(|(k, v)| (k.clone(), v.clone())).collect(); + let physical = source.produce_physical_schema(); + if let Ok(proto_cfg) = build_kafka_proto_config_from_string_map(opts_map, &physical) { + source.connector_config = match proto_cfg { + protocol::grpc::api::connector_op::Config::KafkaSource(cfg) => { + ConnectorConfig::KafkaSource(cfg) + } + protocol::grpc::api::connector_op::Config::KafkaSink(cfg) => { + ConnectorConfig::KafkaSink(cfg) + } + protocol::grpc::api::connector_op::Config::Generic(g) => { + ConnectorConfig::Generic(g.properties) + } + }; + } + } else { + use crate::sql::schema::ConnectorConfig; + source.connector_config = ConnectorConfig::Generic( + source.catalog_with_options.iter().map(|(k, v)| (k.clone(), v.clone())).collect(), + ); + } + + if as_lookup { + Ok(CatalogTable::LookupTable(source)) + } else { + Ok(CatalogTable::ConnectorTable(source)) + } + } + + fn decode_catalog_table(&self, proto_def: pb::TableDefinition) -> DFResult { + let Some(table_type) = proto_def.table_type else { + return internal_err!( + "Corrupted catalog row: missing table_type for {}", + proto_def.table_name + ); + }; + + match table_type { + table_definition::TableType::ConnectorTable(src) => { + 
self.decode_catalog_source_table(proto_def.table_name, src, false) + } + table_definition::TableType::LookupTable(src) => { + self.decode_catalog_source_table(proto_def.table_name, src, true) + } + } + } + + fn load_catalog_tables_map( + &self, + ) -> DFResult>> { + let mut out = std::collections::HashMap::new(); + let records = self.store.scan_prefix(CATALOG_KEY_PREFIX)?; + for (key, payload) in records { + let proto_def = match pb::TableDefinition::decode(payload.as_slice()) { + Ok(v) => v, + Err(e) => { + warn!( + catalog_key = %key, + error = %e, + "Skipping corrupted stream catalog row: protobuf decode failed" + ); + continue; + } + }; + let table = match self.decode_catalog_table(proto_def) { + Ok(v) => v, + Err(e) => { + warn!( + catalog_key = %key, + error = %e, + "Skipping unsupported/corrupted stream catalog row" + ); + continue; + } + }; + let object_name = UniCase::new(table.name().to_string()); + out.insert(object_name, Arc::new(table)); + } + Ok(out) + } +} + +pub fn restore_global_catalog_from_store() { + let Some(mgr) = CatalogManager::try_global() else { + return; + }; + match mgr.restore_from_store() { + Ok(()) => { + let n = mgr.list_catalog_tables().map(|t| t.len()).unwrap_or(0); + info!(catalog_tables = n, "Catalog loaded from durable store"); + } + Err(e) => warn!("Stream catalog restore_from_store failed: {e:#}"), + } +} + +pub fn restore_streaming_jobs_from_store() { + use crate::runtime::streaming::job::JobManager; + + let Some(catalog) = CatalogManager::try_global() else { + warn!("CatalogManager not available; skipping streaming job restore"); + return; + }; + let job_manager = match JobManager::global() { + Ok(jm) => jm, + Err(e) => { + warn!(error = %e, "JobManager not available; skipping streaming job restore"); + return; + } + }; + + let definitions = match catalog.load_streaming_job_definitions() { + Ok(defs) => defs, + Err(e) => { + warn!(error = %e, "Failed to load streaming job definitions from store"); + return; + } + }; + + if 
definitions.is_empty() { + info!("No persisted streaming jobs to restore"); + return; + } + + let total = definitions.len(); + info!(count = total, "Restoring persisted streaming jobs"); + + let rt = tokio::runtime::Handle::current(); + let mut restored = 0usize; + let mut failed = 0usize; + + for (table_name, fs_program) in definitions { + let jm = job_manager.clone(); + let name = table_name.clone(); + match rt.block_on(jm.submit_job(name.clone(), fs_program)) { + Ok(job_id) => { + info!(table = %table_name, job_id = %job_id, "Streaming job restored"); + restored += 1; + } + Err(e) => { + warn!(table = %table_name, error = %e, "Failed to restore streaming job"); + failed += 1; + } + } + } + + info!( + restored = restored, + failed = failed, + total = total, + "Streaming job restore complete" + ); +} + +pub fn initialize_stream_catalog(config: &crate::config::GlobalConfig) -> anyhow::Result<()> { + if !config.stream_catalog.persist { + return CatalogManager::init_global_in_memory() + .context("Stream catalog (CatalogManager) in-memory init failed"); + } + + let path = config + .stream_catalog + .db_path + .as_ref() + .map(|p| crate::config::resolve_path(p)) + .unwrap_or_else(|| crate::config::get_data_dir().join("stream_catalog")); + + std::fs::create_dir_all(&path).with_context(|| { + format!( + "Failed to create stream catalog directory {}", + path.display() + ) + })?; + + let store = std::sync::Arc::new( + super::RocksDbMetaStore::open(&path).with_context(|| { + format!( + "Failed to open stream catalog RocksDB at {}", + path.display() + ) + })?, + ); + + CatalogManager::init_global(store).context("Stream catalog (CatalogManager) init failed") +} + +pub fn planning_schema_provider() -> StreamPlanningContext { + CatalogManager::try_global() + .map(|m| m.acquire_planning_context()) + .unwrap_or_else(StreamPlanningContext::new) +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use datafusion::arrow::datatypes::{DataType, Field}; + + use 
crate::sql::schema::column_descriptor::ColumnDescriptor; + use crate::sql::schema::connection_type::ConnectionType; + use crate::sql::schema::source_table::SourceTable; + use crate::sql::schema::table::Table as CatalogTable; + use crate::storage::stream_catalog::InMemoryMetaStore; + + use super::CatalogManager; + + fn create_test_manager() -> CatalogManager { + CatalogManager::new(Arc::new(InMemoryMetaStore::new())) + } + + #[test] + fn add_table_roundtrip_snapshot() { + let mgr = create_test_manager(); + let mut source = SourceTable::new("t1", "kafka", ConnectionType::Source); + source.schema_specs = vec![ColumnDescriptor::new_physical(Field::new( + "a", + DataType::Int32, + false, + ))]; + source.temporal_config.event_column = Some("ts".into()); + let table = CatalogTable::ConnectorTable(source); + + mgr.add_catalog_table(table).unwrap(); + + let got = mgr + .get_catalog_table("t1") + .unwrap() + .expect("table present"); + assert_eq!(got.name(), "t1"); + } + + #[test] + fn drop_table_if_exists() { + let mgr = create_test_manager(); + let mut source = SourceTable::new("t_drop", "kafka", ConnectionType::Source); + source.schema_specs = vec![ColumnDescriptor::new_physical(Field::new( + "a", + DataType::Int32, + false, + ))]; + mgr.add_catalog_table(CatalogTable::ConnectorTable(source)) + .unwrap(); + + mgr.drop_catalog_table("t_drop", false).unwrap(); + assert!(!mgr.has_catalog_table("t_drop")); + + mgr.drop_catalog_table("t_drop", true).unwrap(); + assert!(mgr.drop_catalog_table("nope", false).is_err()); + mgr.drop_catalog_table("nope", true).unwrap(); + } +} diff --git a/src/storage/stream_catalog/meta_store.rs b/src/storage/stream_catalog/meta_store.rs new file mode 100644 index 00000000..6f61b3f7 --- /dev/null +++ b/src/storage/stream_catalog/meta_store.rs @@ -0,0 +1,70 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Pluggable metadata KV backend (memory, etcd, Redis, …). + +use std::collections::HashMap; + +use datafusion::common::Result; +use parking_lot::RwLock; + +/// Synchronous metadata store for catalog records. +pub trait MetaStore: Send + Sync { + fn put(&self, key: &str, value: Vec) -> Result<()>; + fn get(&self, key: &str) -> Result>>; + fn delete(&self, key: &str) -> Result<()>; + fn scan_prefix(&self, prefix: &str) -> Result)>>; +} + +/// In-process KV store for single-node deployments and tests. +pub struct InMemoryMetaStore { + db: RwLock>>, +} + +impl InMemoryMetaStore { + pub fn new() -> Self { + Self { + db: RwLock::new(HashMap::new()), + } + } +} + +impl Default for InMemoryMetaStore { + fn default() -> Self { + Self::new() + } +} + +impl MetaStore for InMemoryMetaStore { + fn put(&self, key: &str, value: Vec) -> Result<()> { + self.db.write().insert(key.to_string(), value); + Ok(()) + } + + fn get(&self, key: &str) -> Result>> { + Ok(self.db.read().get(key).cloned()) + } + + fn delete(&self, key: &str) -> Result<()> { + self.db.write().remove(key); + Ok(()) + } + + fn scan_prefix(&self, prefix: &str) -> Result)>> { + let db = self.db.read(); + Ok(db + .iter() + .filter(|(k, _)| k.starts_with(prefix)) + .map(|(k, v)| (k.clone(), v.clone())) + .collect()) + } +} diff --git a/src/storage/stream_catalog/mod.rs b/src/storage/stream_catalog/mod.rs new file mode 100644 index 00000000..b99f3080 --- /dev/null +++ b/src/storage/stream_catalog/mod.rs @@ -0,0 +1,26 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may 
not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Stream table catalog: protobuf persistence, MVCC-style planning snapshots for the coordinator. + +mod codec; +mod manager; +mod meta_store; +mod rocksdb_meta_store; + +pub use manager::{ + CatalogManager, initialize_stream_catalog, + restore_global_catalog_from_store, + restore_streaming_jobs_from_store, +}; +pub use meta_store::{InMemoryMetaStore, MetaStore}; +pub use rocksdb_meta_store::RocksDbMetaStore; diff --git a/src/storage/stream_catalog/rocksdb_meta_store.rs b/src/storage/stream_catalog/rocksdb_meta_store.rs new file mode 100644 index 00000000..98a518a3 --- /dev/null +++ b/src/storage/stream_catalog/rocksdb_meta_store.rs @@ -0,0 +1,131 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! RocksDB-backed [`super::MetaStore`] for durable stream catalog rows. 
+ +use std::path::Path; +use std::sync::Arc; + +use anyhow::Context; +use datafusion::common::Result; +use rocksdb::{DB, Direction, IteratorMode, Options}; + +use super::MetaStore; + +/// Single-node durable KV used by [`crate::storage::stream_catalog::CatalogManager`]. +pub struct RocksDbMetaStore { + db: Arc, +} + +impl RocksDbMetaStore { + pub fn open>(path: P) -> anyhow::Result { + let path = path.as_ref(); + if let Some(parent) = path.parent() { + std::fs::create_dir_all(parent).with_context(|| { + format!("stream catalog: create parent directory {parent:?}") + })?; + } + let mut opts = Options::default(); + opts.create_if_missing(true); + let db = DB::open(&opts, path).with_context(|| { + format!("stream catalog: open RocksDB at {}", path.display()) + })?; + Ok(Self { db: Arc::new(db) }) + } +} + +impl MetaStore for RocksDbMetaStore { + fn put(&self, key: &str, value: Vec) -> Result<()> { + self.db + .put(key.as_bytes(), value.as_slice()) + .map_err(|e| datafusion::common::DataFusionError::Execution(format!( + "stream catalog store put: {e}" + ))) + } + + fn get(&self, key: &str) -> Result>> { + self.db + .get(key.as_bytes()) + .map_err(|e| datafusion::common::DataFusionError::Execution(format!( + "stream catalog store get: {e}" + ))) + } + + fn delete(&self, key: &str) -> Result<()> { + self.db + .delete(key.as_bytes()) + .map_err(|e| datafusion::common::DataFusionError::Execution(format!( + "stream catalog store delete: {e}" + ))) + } + + fn scan_prefix(&self, prefix: &str) -> Result)>> { + let mut out = Vec::new(); + let iter = self + .db + .iterator(IteratorMode::From(prefix.as_bytes(), Direction::Forward)); + for item in iter { + let (k, v) = item.map_err(|e| { + datafusion::common::DataFusionError::Execution(format!( + "stream catalog store scan: {e}" + )) + })?; + let key = String::from_utf8(k.to_vec()).map_err(|e| { + datafusion::common::DataFusionError::Execution(format!( + "stream catalog store: invalid utf8 key: {e}" + )) + })?; + if 
!key.starts_with(prefix) { + break; + } + out.push((key, v.to_vec())); + } + Ok(out) + } +} + +#[cfg(test)] +mod tests { + use std::path::PathBuf; + + use uuid::Uuid; + + use super::*; + + #[test] + fn put_get_scan_roundtrip() { + let dir: PathBuf = std::env::temp_dir().join(format!( + "fs_stream_catalog_test_{}", + Uuid::new_v4() + )); + let _ = std::fs::remove_dir_all(&dir); + + let store = RocksDbMetaStore::open(&dir).expect("open"); + store.put("catalog:stream_table:a", vec![1, 2, 3]).unwrap(); + store.put("catalog:stream_table:b", vec![4]).unwrap(); + store.put("other:x", vec![9]).unwrap(); + + assert_eq!( + store.get("catalog:stream_table:a").unwrap(), + Some(vec![1, 2, 3]) + ); + + let prefixed = store.scan_prefix("catalog:stream_table:").unwrap(); + assert_eq!(prefixed.len(), 2); + assert!(prefixed.iter().any(|(k, _)| k.ends_with(":a"))); + assert!(prefixed.iter().any(|(k, _)| k.ends_with(":b"))); + + store.delete("catalog:stream_table:a").unwrap(); + assert!(store.get("catalog:stream_table:a").unwrap().is_none()); + + let _ = std::fs::remove_dir_all(&dir); + } +} diff --git a/src/storage/task/mod.rs b/src/storage/task/mod.rs index b4b3680f..3123415a 100644 --- a/src/storage/task/mod.rs +++ b/src/storage/task/mod.rs @@ -16,6 +16,7 @@ pub mod factory; mod function_info; +mod proto_codec; mod rocksdb_storage; pub mod storage; diff --git a/src/storage/task/proto_codec.rs b/src/storage/task/proto_codec.rs new file mode 100644 index 00000000..1e0bedb3 --- /dev/null +++ b/src/storage/task/proto_codec.rs @@ -0,0 +1,271 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Protobuf wire format for RocksDB task rows, with legacy bincode read support. + +use anyhow::{Context, Result, anyhow}; +use prost::Message; +use protocol::storage::{ + ComponentStateKind, ComponentStateProto, TaskMetadataProto, TaskModulePayloadProto, + TaskModulePython, TaskModuleWasm, task_module_payload_proto, +}; +use serde::{Deserialize, Serialize}; + +use crate::runtime::common::ComponentState; + +use super::storage::TaskModuleBytes; + +/// Magic prefix for protobuf-encoded task values (meta + payload). Legacy rows have no prefix. +pub const TASK_STORAGE_PROTO_MAGIC: &[u8; 4] = b"FSP1"; + +#[derive(Debug, Clone, Serialize, Deserialize)] +struct LegacyTaskMetadata { + task_type: String, + state: ComponentState, + created_at: u64, + checkpoint_id: Option, +} + +fn component_state_to_proto(state: &ComponentState) -> ComponentStateProto { + let (kind, error_message) = match state { + ComponentState::Uninitialized => (ComponentStateKind::Uninitialized, String::new()), + ComponentState::Initialized => (ComponentStateKind::Initialized, String::new()), + ComponentState::Starting => (ComponentStateKind::Starting, String::new()), + ComponentState::Running => (ComponentStateKind::Running, String::new()), + ComponentState::Checkpointing => (ComponentStateKind::Checkpointing, String::new()), + ComponentState::Stopping => (ComponentStateKind::Stopping, String::new()), + ComponentState::Stopped => (ComponentStateKind::Stopped, String::new()), + ComponentState::Closing => (ComponentStateKind::Closing, String::new()), + ComponentState::Closed => 
(ComponentStateKind::Closed, String::new()), + ComponentState::Error { error } => (ComponentStateKind::Error, error.clone()), + }; + ComponentStateProto { + kind: kind as i32, + error_message, + } +} + +fn component_state_from_proto(p: &ComponentStateProto) -> ComponentState { + let kind = ComponentStateKind::try_from(p.kind).unwrap_or(ComponentStateKind::Unspecified); + match kind { + ComponentStateKind::Unspecified | ComponentStateKind::Uninitialized => { + ComponentState::Uninitialized + } + ComponentStateKind::Initialized => ComponentState::Initialized, + ComponentStateKind::Starting => ComponentState::Starting, + ComponentStateKind::Running => ComponentState::Running, + ComponentStateKind::Checkpointing => ComponentState::Checkpointing, + ComponentStateKind::Stopping => ComponentState::Stopping, + ComponentStateKind::Stopped => ComponentState::Stopped, + ComponentStateKind::Closing => ComponentState::Closing, + ComponentStateKind::Closed => ComponentState::Closed, + ComponentStateKind::Error => ComponentState::Error { + error: if p.error_message.is_empty() { + "unknown error".to_string() + } else { + p.error_message.clone() + }, + }, + } +} + +/// Encode task metadata for `task_meta` column family (always protobuf + magic). +pub fn encode_task_metadata_bytes( + task_type: &str, + state: &ComponentState, + created_at: u64, + checkpoint_id: Option, +) -> Result> { + let proto = TaskMetadataProto { + task_type: task_type.to_string(), + state: Some(component_state_to_proto(state)), + created_at, + checkpoint_id, + }; + let mut out = TASK_STORAGE_PROTO_MAGIC.to_vec(); + proto + .encode(&mut out) + .context("encode TaskMetadataProto")?; + Ok(out) +} + +pub struct DecodedTaskMetadata { + pub task_type: String, + pub state: ComponentState, + pub created_at: u64, + pub checkpoint_id: Option, +} + +/// Decode metadata written by this version (protobuf) or legacy bincode+serde. 
+pub fn decode_task_metadata_bytes(raw: &[u8]) -> Result { + if raw.len() >= TASK_STORAGE_PROTO_MAGIC.len() + && &raw[..TASK_STORAGE_PROTO_MAGIC.len()] == TASK_STORAGE_PROTO_MAGIC.as_slice() + { + let proto = TaskMetadataProto::decode(&raw[TASK_STORAGE_PROTO_MAGIC.len()..]) + .context("decode TaskMetadataProto")?; + let state = proto + .state + .as_ref() + .map(component_state_from_proto) + .unwrap_or_default(); + return Ok(DecodedTaskMetadata { + task_type: proto.task_type, + state, + created_at: proto.created_at, + checkpoint_id: proto.checkpoint_id, + }); + } + + let (legacy, _): (LegacyTaskMetadata, _) = bincode::serde::decode_from_slice( + raw, + bincode::config::standard(), + ) + .map_err(|e| anyhow!("legacy task metadata bincode decode failed: {e}"))?; + Ok(DecodedTaskMetadata { + task_type: legacy.task_type, + state: legacy.state, + created_at: legacy.created_at, + checkpoint_id: legacy.checkpoint_id, + }) +} + +fn module_to_proto(module: &TaskModuleBytes) -> TaskModulePayloadProto { + match module { + TaskModuleBytes::Wasm(bytes) => TaskModulePayloadProto { + payload: Some(task_module_payload_proto::Payload::Wasm(TaskModuleWasm { + wasm_binary: bytes.clone(), + })), + }, + TaskModuleBytes::Python { + class_name, + module, + bytes, + } => TaskModulePayloadProto { + payload: Some(task_module_payload_proto::Payload::Python(TaskModulePython { + class_name: class_name.clone(), + module_path: module.clone(), + embedded_code: bytes.clone(), + })), + }, + } +} + +/// Encode module payload for `task_payload` column family (always protobuf + magic). +pub fn encode_task_module_bytes(module: &TaskModuleBytes) -> Result> { + let proto = module_to_proto(module); + let mut out = TASK_STORAGE_PROTO_MAGIC.to_vec(); + proto + .encode(&mut out) + .context("encode TaskModulePayloadProto")?; + Ok(out) +} + +/// Decode module payload: protobuf+magic or legacy bincode+serde [`TaskModuleBytes`]. 
+pub fn decode_task_module_bytes(raw: &[u8]) -> Result { + if raw.len() >= TASK_STORAGE_PROTO_MAGIC.len() + && &raw[..TASK_STORAGE_PROTO_MAGIC.len()] == TASK_STORAGE_PROTO_MAGIC.as_slice() + { + let proto = TaskModulePayloadProto::decode(&raw[TASK_STORAGE_PROTO_MAGIC.len()..]) + .context("decode TaskModulePayloadProto")?; + return proto.try_into_task_module(); + } + + let (legacy, _): (TaskModuleBytes, _) = bincode::serde::decode_from_slice( + raw, + bincode::config::standard(), + ) + .map_err(|e| anyhow!("legacy task module bincode decode failed: {e}"))?; + Ok(legacy) +} + +trait TryIntoTaskModule { + fn try_into_task_module(self) -> Result; +} + +impl TryIntoTaskModule for TaskModulePayloadProto { + fn try_into_task_module(self) -> Result { + match self.payload { + Some(task_module_payload_proto::Payload::Wasm(w)) => { + Ok(TaskModuleBytes::Wasm(w.wasm_binary)) + } + Some(task_module_payload_proto::Payload::Python(p)) => Ok(TaskModuleBytes::Python { + class_name: p.class_name, + module: p.module_path, + bytes: p.embedded_code, + }), + None => Err(anyhow!("TaskModulePayloadProto missing payload")), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn metadata_roundtrip_proto() { + let enc = encode_task_metadata_bytes( + "wasm", + &ComponentState::Running, + 42, + Some(7), + ) + .unwrap(); + let dec = decode_task_metadata_bytes(&enc).unwrap(); + assert_eq!(dec.task_type, "wasm"); + assert_eq!(dec.state, ComponentState::Running); + assert_eq!(dec.created_at, 42); + assert_eq!(dec.checkpoint_id, Some(7)); + } + + #[test] + fn module_roundtrip_wasm_proto() { + let m = TaskModuleBytes::Wasm(vec![1, 2, 3]); + let enc = encode_task_module_bytes(&m).unwrap(); + let dec = decode_task_module_bytes(&enc).unwrap(); + assert_eq!(dec, m); + } + + #[test] + fn module_roundtrip_python_proto() { + let m = TaskModuleBytes::Python { + class_name: "C".into(), + module: "m".into(), + bytes: Some(vec![9]), + }; + let enc = encode_task_module_bytes(&m).unwrap(); + 
let dec = decode_task_module_bytes(&enc).unwrap(); + assert_eq!(dec, m); + } + + #[test] + fn legacy_bincode_metadata_still_decodes() { + let legacy = LegacyTaskMetadata { + task_type: "legacy".into(), + state: ComponentState::Stopped, + created_at: 99, + checkpoint_id: None, + }; + let raw = bincode::serde::encode_to_vec(&legacy, bincode::config::standard()).unwrap(); + let dec = decode_task_metadata_bytes(&raw).unwrap(); + assert_eq!(dec.task_type, "legacy"); + assert_eq!(dec.state, ComponentState::Stopped); + assert_eq!(dec.created_at, 99); + } + + #[test] + fn legacy_bincode_module_still_decodes() { + let m = TaskModuleBytes::Wasm(vec![8, 9]); + let raw = bincode::serde::encode_to_vec(&m, bincode::config::standard()).unwrap(); + assert_eq!(decode_task_module_bytes(&raw).unwrap(), m); + } +} diff --git a/src/storage/task/rocksdb_storage.rs b/src/storage/task/rocksdb_storage.rs index 31709a51..cea0ceb9 100644 --- a/src/storage/task/rocksdb_storage.rs +++ b/src/storage/task/rocksdb_storage.rs @@ -14,12 +14,15 @@ //! //! Uses three column families: task_meta, task_config, task_payload. 
-use super::storage::{StoredTaskInfo, TaskModuleBytes, TaskStorage}; +use super::proto_codec::{ + decode_task_metadata_bytes, decode_task_module_bytes, encode_task_metadata_bytes, + encode_task_module_bytes, +}; +use super::storage::{StoredTaskInfo, TaskStorage}; use crate::config::storage::RocksDBStorageConfig; use crate::runtime::common::ComponentState; use anyhow::{Context, Result, anyhow}; use rocksdb::{ColumnFamilyDescriptor, DB, IteratorMode, Options, WriteBatch}; -use serde::{Deserialize, Serialize}; use std::path::Path; use std::sync::Arc; @@ -27,14 +30,6 @@ const CF_METADATA: &str = "task_meta"; const CF_CONFIG: &str = "task_config"; const CF_PAYLOAD: &str = "task_payload"; -#[derive(Debug, Clone, Serialize, Deserialize)] -struct TaskMetadata { - task_type: String, - state: ComponentState, - created_at: u64, - checkpoint_id: Option, -} - pub struct RocksDBTaskStorage { db: Arc, } @@ -95,19 +90,19 @@ impl TaskStorage for RocksDBTaskStorage { return Err(anyhow!("Task uniqueness violation: {}", task_info.name)); } - let meta = TaskMetadata { - task_type: task_info.task_type.clone(), - state: task_info.state.clone(), - created_at: task_info.created_at, - checkpoint_id: task_info.checkpoint_id, - }; + let meta_bytes = encode_task_metadata_bytes( + &task_info.task_type, + &task_info.state, + task_info.created_at, + task_info.checkpoint_id, + )?; let mut batch = WriteBatch::default(); - batch.put_cf(&cf_meta, key, bincode::serialize(&meta)?); + batch.put_cf(&cf_meta, key, meta_bytes); batch.put_cf(&cf_conf, key, &task_info.config_bytes); if let Some(ref module) = task_info.module_bytes { - batch.put_cf(&cf_payl, key, bincode::serialize(module)?); + batch.put_cf(&cf_payl, key, encode_task_module_bytes(module)?); } self.db @@ -124,10 +119,19 @@ impl TaskStorage for RocksDBTaskStorage { .get_cf(&cf, key)? 
.ok_or_else(|| anyhow!("Task {} not found", task_name))?; - let mut meta: TaskMetadata = bincode::deserialize(&raw)?; - meta.state = new_state; - - self.db.put_cf(&cf, key, bincode::serialize(&meta)?)?; + let mut decoded = decode_task_metadata_bytes(&raw)?; + decoded.state = new_state; + + self.db.put_cf( + &cf, + key, + encode_task_metadata_bytes( + &decoded.task_type, + &decoded.state, + decoded.created_at, + decoded.checkpoint_id, + )?, + )?; Ok(()) } @@ -140,10 +144,19 @@ impl TaskStorage for RocksDBTaskStorage { .get_cf(&cf, key)? .ok_or_else(|| anyhow!("Task {} not found", task_name))?; - let mut meta: TaskMetadata = bincode::deserialize(&raw)?; - meta.checkpoint_id = checkpoint_id; - - self.db.put_cf(&cf, key, bincode::serialize(&meta)?)?; + let mut decoded = decode_task_metadata_bytes(&raw)?; + decoded.checkpoint_id = checkpoint_id; + + self.db.put_cf( + &cf, + key, + encode_task_metadata_bytes( + &decoded.task_type, + &decoded.state, + decoded.created_at, + decoded.checkpoint_id, + )?, + )?; Ok(()) } @@ -171,12 +184,12 @@ impl TaskStorage for RocksDBTaskStorage { .get_cf(&self.get_cf(CF_CONFIG)?, key)? .ok_or_else(|| anyhow!("Config missing: {}", task_name))?; - let module_bytes = self - .db - .get_cf(&self.get_cf(CF_PAYLOAD)?, key)? - .and_then(|b| bincode::deserialize::(&b).ok()); + let module_bytes = match self.db.get_cf(&self.get_cf(CF_PAYLOAD)?, key)? 
{ + None => None, + Some(b) => Some(decode_task_module_bytes(&b)?), + }; - let meta: TaskMetadata = bincode::deserialize(&meta_raw)?; + let meta = decode_task_metadata_bytes(&meta_raw)?; Ok(StoredTaskInfo { name: task_name.to_string(), diff --git a/src/storage/task/storage.rs b/src/storage/task/storage.rs index 3c9e4080..156ee5d8 100644 --- a/src/storage/task/storage.rs +++ b/src/storage/task/storage.rs @@ -15,7 +15,7 @@ use anyhow::Result; use serde::{Deserialize, Serialize}; #[allow(dead_code)] -#[derive(Debug, Clone, Serialize, Deserialize)] +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] pub enum TaskModuleBytes { Wasm(Vec), Python {