diff --git a/common/src/main/scala/org/apache/comet/CometConf.scala b/common/src/main/scala/org/apache/comet/CometConf.scala index 046ccf0b1c..9c8c85196a 100644 --- a/common/src/main/scala/org/apache/comet/CometConf.scala +++ b/common/src/main/scala/org/apache/comet/CometConf.scala @@ -434,11 +434,11 @@ object CometConf extends ShimCometConf { conf(s"$COMET_EXEC_CONFIG_PREFIX.shuffle.compression.codec") .category(CATEGORY_SHUFFLE) .doc( - "The codec of Comet native shuffle used to compress shuffle data. lz4, zstd, and " + - "snappy are supported. Compression can be disabled by setting " + + "The codec of Comet native shuffle used to compress shuffle data. " + + "Supported codecs: lz4, zstd. Compression can be disabled by setting " + "spark.shuffle.compress=false.") .stringConf - .checkValues(Set("zstd", "lz4", "snappy")) + .checkValues(Set("zstd", "lz4")) .createWithDefault("lz4") val COMET_EXEC_SHUFFLE_COMPRESSION_ZSTD_LEVEL: ConfigEntry[Int] = @@ -528,11 +528,10 @@ object CometConf extends ShimCometConf { .category(CATEGORY_SHUFFLE) .doc("Size of the write buffer in bytes used by the native shuffle writer when writing " + "shuffle data to disk. Larger values may improve write performance by reducing " + - "the number of system calls, but will use more memory. " + - "The default is 1MB which provides a good balance between performance and memory usage.") - .bytesConf(ByteUnit.MiB) + "the number of system calls, but will use more memory.") + .bytesConf(ByteUnit.KiB) .checkValue(v => v > 0, "Write buffer size must be positive") - .createWithDefault(1) + .createWithDefault(8) val COMET_SHUFFLE_PREFER_DICTIONARY_RATIO: ConfigEntry[Double] = conf( "spark.comet.shuffle.preferDictionary.ratio") diff --git a/native/Cargo.lock b/native/Cargo.lock index 480f7ad06d..38761f92e2 100644 --- a/native/Cargo.lock +++ b/native/Cargo.lock @@ -122,9 +122,9 @@ checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" [[package]] name = "anstream" -version = "1.0.0" +version = "0.6.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "824a212faf96e9acacdbd09febd34438f8f711fb84e09a8916013cd7815ca28d" +checksum = "43d5b281e737544384e969a5ccad3f1cdd24b48086a0fc1b2a5262a26b8f4f4a" dependencies = [ "anstyle", "anstyle-parse", @@ -137,15 +137,15 @@ dependencies = [ [[package]] name = "anstyle" -version = "1.0.14" +version = "1.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "940b3a0ca603d1eade50a4846a2afffd5ef57a9feac2c0e2ec2e14f9ead76000" +checksum = "5192cca8006f1fd4f7237516f40fa183bb07f8fbdfedaa0036de5ea9b0b45e78" [[package]] name = "anstyle-parse" -version = "1.0.0" +version = "0.2.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "52ce7f38b242319f7cabaa6813055467063ecdc9d355bbb4ce0c68908cd8130e" +checksum = "4e7644824f0aa2c7b9384579234ef10eb7efb6a0deb83f9630a49594dd9c15c2" dependencies = [ "utf8parse", ] @@ -196,16 +196,16 @@ dependencies = [ "serde_json", "strum", "strum_macros", - "thiserror 2.0.18", + "thiserror", "uuid", "zstd", ] [[package]] name = "arc-swap" -version = "1.9.1" +version = "1.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a3a1fd6f75306b68087b831f025c712524bcb19aad54e557b1129cfa0a2b207" +checksum = "f9f3647c145568cec02c42054e07bdf9a5a698e15b466fb2341bfc393cd24aa5" dependencies = [ "rustversion", ] @@ -357,6 +357,7 @@ dependencies = [ "arrow-select", "flatbuffers", "lz4_flex", + "zstd", ] [[package]] @@ -372,7 +373,7 @@ dependencies = [ "arrow-schema", "chrono", "half", - "indexmap 2.13.1", + "indexmap 2.13.0", "itoa", "lexical-core", "memchr", @@ -668,9 +669,9 @@ dependencies = [ [[package]] name = "aws-lc-rs" -version = "1.16.2" +version = "1.16.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a054912289d18629dc78375ba2c3726a3afe3ff71b4edba9dedfca0e3446d1fc" +checksum = "94bffc006df10ac2a68c83692d734a465f8ee6c5b384d8545a636f81d858f4bf" dependencies = [ "aws-lc-sys", "zeroize", @@ -678,9 +679,9 @@ dependencies = [ [[package]] name = "aws-lc-sys" -version = "0.39.1" +version = "0.38.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "83a25cf98105baa966497416dbd42565ce3a8cf8dbfd59803ec9ad46f3126399" +checksum = "4321e568ed89bb5a7d291a7f37997c2c0df89809d7b6d12062c81ddb54aa782e" dependencies = [ "cc", "cmake", @@ -715,9 +716,9 @@ dependencies = [ [[package]] name = "aws-sdk-sso" -version = "1.97.0" +version = "1.96.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9aadc669e184501caaa6beafb28c6267fc1baef0810fb58f9b205485ca3f2567" +checksum = "f64a6eded248c6b453966e915d32aeddb48ea63ad17932682774eb026fbef5b1" dependencies = [ "aws-credential-types", "aws-runtime", @@ -739,9 +740,9 @@ dependencies = [ [[package]] name = "aws-sdk-ssooidc" -version = "1.99.0" +version = "1.98.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1342a7db8f358d3de0aed2007a0b54e875458e39848d54cc1d46700b2bfcb0a8" +checksum = "db96d720d3c622fcbe08bae1c4b04a72ce6257d8b0584cb5418da00ae20a344f" dependencies = [ "aws-credential-types", "aws-runtime", @@ -763,9 +764,9 @@ dependencies = [ [[package]] name = "aws-sdk-sts" -version = "1.101.0" +version = "1.100.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ab41ad64e4051ecabeea802d6a17845a91e83287e1dd249e6963ea1ba78c428a" +checksum = "fafbdda43b93f57f699c5dfe8328db590b967b8a820a13ccdd6687355dfcc7ca" dependencies = [ "aws-credential-types", "aws-runtime", @@ -936,9 +937,9 @@ dependencies = [ [[package]] name = "aws-smithy-types" -version = "1.4.7" +version = "1.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d73dbfbaa8e4bc57b9045137680b958d274823509a360abfd8e1d514d40c95c" +checksum = "d2b1117b3b2bbe166d11199b540ceed0d0f7676e36e7b962b5a437a9971eac75" dependencies = [ "base64-simd", "bytes", @@ -1085,7 +1086,7 @@ dependencies = [ "proc-macro2", "quote", "regex", - "rustc-hash 2.1.2", + "rustc-hash 2.1.1", "shlex", "syn 2.0.117", ] @@ -1113,16 +1114,16 @@ dependencies = [ [[package]] name = "blake3" -version = "1.8.4" +version = "1.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4d2d5991425dfd0785aed03aedcf0b321d61975c9b5b3689c774a2610ae0b51e" +checksum = "2468ef7d57b3fb7e16b576e8377cdbde2320c60e1491e961d11da40fc4f02a2d" dependencies = [ "arrayref", "arrayvec", "cc", "cfg-if", "constant_time_eq", - "cpufeatures 0.3.0", + "cpufeatures 0.2.17", ] [[package]] @@ -1168,9 +1169,9 @@ dependencies = [ [[package]] name = "bon" -version = "3.9.1" +version = "3.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f47dbe92550676ee653353c310dfb9cf6ba17ee70396e1f7cf0a2020ad49b2fe" +checksum = "2d13a61f2963b88eef9c1be03df65d42f6996dfeac1054870d950fcf66686f83" dependencies = [ "bon-macros", "rustversion", @@ -1178,9 +1179,9 @@ dependencies = [ [[package]] name = "bon-macros" -version = "3.9.1" +version = "3.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "519bd3116aeeb42d5372c29d982d16d0170d3d4a5ed85fc7dd91642ffff3c67c" +checksum = "d314cc62af2b6b0c65780555abb4d02a03dd3b799cd42419044f0c38d99738c0" dependencies = [ "darling 0.23.0", "ident_case", @@ -1272,9 +1273,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.2.59" +version = "1.2.58" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b7a4d3ec6524d28a329fc53654bbadc9bdd7b0431f5d65f1a56ffb28a1ee5283" +checksum = "e1e928d4b69e3077709075a938a05ffbedfa53a84c8f766efbf8220bb1ff60e1" dependencies = [ "find-msvc-tools", "jobserver", @@ -1282,12 +1283,6 @@ dependencies = [ "shlex", ] -[[package]] -name = "cesu8" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6d43a04d8753f35258c91f8ec639f792891f748a1edbd759cf1dcea3382ad83c" - [[package]] name = "cexpr" version = "0.6.0" @@ -1394,9 +1389,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.6.0" +version = "4.5.60" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b193af5b67834b676abd72466a96c1024e6a6ad978a1f484bd90b85c94041351" +checksum = "2797f34da339ce31042b27d23607e051786132987f595b02ba4f6a6dffb7030a" dependencies = [ "clap_builder", "clap_derive", @@ -1404,9 +1399,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.6.0" +version = "4.5.60" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "714a53001bf66416adb0e2ef5ac857140e7dc3a0c48fb28b2f10762fc4b5069f" +checksum = "24a241312cea5059b13574bb9b3861cabf758b879c15190b37b6d6fd63ab6876" dependencies = [ "anstream", "anstyle", @@ -1416,9 +1411,9 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.6.0" +version = "4.5.55" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1110bd8a634a1ab8cb04345d8d878267d57c3cf1b38d91b71af6686408bbca6a" +checksum = "a92793da1a46a5f2a02a6f4c46c6496b28c43638adea8306fcb0caa1634f24e5" dependencies = [ "heck", "proc-macro2", @@ -1428,15 +1423,15 @@ dependencies = [ [[package]] name = "clap_lex" -version = "1.1.0" +version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9" +checksum = "3a822ea5bc7590f9d40f1ba12c0dc3c2760f3482c6984db1573ad11031420831" [[package]] name = "cmake" -version = "0.1.58" +version = "0.1.57" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c0f78a02292a74a88ac736019ab962ece0bc380e3f977bf72e376c5d78ff0678" +checksum = "75443c44cd6b379beb8c5b45d85d0773baf31cce901fe7bb252f4eff3008ef7d" dependencies = [ "cc", ] @@ -1732,6 +1727,16 @@ dependencies = [ "darling_macro 0.20.11", ] +[[package]] +name = "darling" +version = "0.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9cdf337090841a411e2a7f3deb9187445851f91b309c0c0a29e05f74a00a48c0" +dependencies = [ + "darling_core 0.21.3", + "darling_macro 0.21.3", +] + [[package]] name = "darling" version = "0.23.0" @@ -1756,6 +1761,20 @@ dependencies = [ "syn 2.0.117", ] +[[package]] +name = "darling_core" +version = "0.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1247195ecd7e3c85f83c8d2a366e4210d588e802133e1e355180a9870b517ea4" +dependencies = [ + "fnv", + "ident_case", + "proc-macro2", + "quote", + "strsim", + "syn 2.0.117", +] + [[package]] name = "darling_core" version = "0.23.0" @@ -1780,6 +1799,17 @@ dependencies = [ "syn 2.0.117", ] +[[package]] +name = "darling_macro" +version = "0.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d38308df82d1080de0afee5d069fa14b0326a88c14f15c5ccda35b4a6c414c81" +dependencies = [ + "darling_core 0.21.3", + "quote", + "syn 2.0.117", +] + [[package]] name = "darling_macro" version = "0.23.0" @@ -1933,7 +1963,7 @@ dependencies = [ "iceberg", "iceberg-storage-opendal", "itertools 0.14.0", - "jni 0.22.4", + "jni", "lazy_static", "log", "log4rs", @@ -1968,7 +1998,7 @@ dependencies = [ "datafusion", "serde", "serde_json", - "thiserror 2.0.18", + "thiserror", ] [[package]] @@ -1994,14 +2024,14 @@ dependencies = [ "assertables", "datafusion", "datafusion-comet-common", - "jni 0.22.4", + "jni", "lazy_static", "once_cell", "parquet", "paste", "prost", "regex", - "thiserror 2.0.18", + "thiserror", ] [[package]] @@ -2043,15 +2073,12 @@ dependencies = [ "datafusion-comet-spark-expr", "futures", "itertools 0.14.0", - "jni 0.21.1", + "jni", "log", - "lz4_flex", "parquet", "simd-adler32", - "snap", "tempfile", "tokio", - "zstd", ] [[package]] @@ -2090,7 +2117,7 @@ dependencies = [ "half", "hashbrown 0.16.1", "hex", - "indexmap 2.13.1", + "indexmap 2.13.0", "itertools 0.14.0", "libc", "log", @@ -2294,7 +2321,7 @@ dependencies = [ "datafusion-functions-aggregate-common", "datafusion-functions-window-common", "datafusion-physical-expr-common", - "indexmap 2.13.1", + "indexmap 2.13.0", "itertools 0.14.0", "paste", "serde_json", @@ -2309,7 +2336,7 @@ checksum = "ab05fdd00e05d5a6ee362882546d29d6d3df43a6c55355164a7fbee12d163bc9" dependencies = [ "arrow", "datafusion-common", - "indexmap 2.13.1", + "indexmap 2.13.0", "itertools 0.14.0", "paste", ] @@ -2473,7 +2500,7 @@ dependencies = [ "datafusion-expr", "datafusion-expr-common", "datafusion-physical-expr", - "indexmap 2.13.1", + "indexmap 2.13.0", "itertools 0.14.0", "log", "regex", @@ -2495,7 +2522,7 @@ dependencies = [ "datafusion-physical-expr-common", "half", "hashbrown 0.16.1", - "indexmap 2.13.1", + "indexmap 2.13.0", "itertools 0.14.0", "parking_lot", "paste", @@ -2530,7 +2557,7 @@ dependencies = [ "datafusion-common", "datafusion-expr-common", "hashbrown 0.16.1", - "indexmap 2.13.1", + "indexmap 2.13.0", "itertools 0.14.0", "parking_lot", ] @@ -2576,7 +2603,7 @@ dependencies = [ "futures", "half", "hashbrown 0.16.1", - "indexmap 2.13.1", + "indexmap 2.13.0", "itertools 0.14.0", "log", "num-traits", @@ -2655,7 +2682,7 @@ dependencies = [ "datafusion-common", "datafusion-expr", "datafusion-functions-nested", - "indexmap 2.13.1", + "indexmap 2.13.0", "log", "regex", "sqlparser", @@ -2775,9 +2802,9 @@ dependencies = [ [[package]] name = "dissimilar" -version = "1.0.11" +version = "1.0.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aeda16ab4059c5fd2a83f2b9c9e9c981327b18aa8e3b313f7e6563799d4f093e" +checksum = "8975ffdaa0ef3661bfe02dbdcc06c9f829dfafe6a3c474de366a8d5e44276921" [[package]] name = "dlv-list" @@ -2919,9 +2946,9 @@ dependencies = [ [[package]] name = "fastrand" -version = "2.4.1" +version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6" +checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" [[package]] name = "find-msvc-tools" @@ -3213,7 +3240,7 @@ dependencies = [ "futures-core", "futures-sink", "http 1.4.0", - "indexmap 2.13.1", + "indexmap 2.13.0", "slab", "tokio", "tokio-util", @@ -3393,9 +3420,9 @@ checksum = "135b12329e5e3ce057a9f972339ea52bc954fe1e9358ef27f95e89716fbc5424" [[package]] name = "hyper" -version = "1.9.0" +version = "1.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6299f016b246a94207e63da54dbe807655bf9e00044f73ded42c3ac5305fbcca" +checksum = "2ab2d4f250c3d7b1c9fcdff1cece94ea4e2dfbec68614f7b87cb205f24ca9d11" dependencies = [ "atomic-waker", "bytes", @@ -3407,6 +3434,7 @@ dependencies = [ "httparse", "itoa", "pin-project-lite", + "pin-utils", "smallvec", "tokio", "want", @@ -3553,13 +3581,12 @@ dependencies = [ [[package]] name = "icu_collections" -version = "2.2.0" +version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2984d1cd16c883d7935b9e07e44071dca8d917fd52ecc02c04d5fa0b5a3f191c" +checksum = "4c6b649701667bbe825c3b7e6388cb521c23d88644678e83c0c4d0a621a34b43" dependencies = [ "displaydoc", "potential_utf", - "utf8_iter", "yoke", "zerofrom", "zerovec", @@ -3567,9 +3594,9 @@ dependencies = [ [[package]] name = "icu_locale_core" -version = "2.2.0" +version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92219b62b3e2b4d88ac5119f8904c10f8f61bf7e95b640d25ba3075e6cac2c29" +checksum = "edba7861004dd3714265b4db54a3c390e880ab658fec5f7db895fae2046b5bb6" dependencies = [ "displaydoc", "litemap", @@ -3580,9 +3607,9 @@ dependencies = [ [[package]] name = "icu_normalizer" -version = "2.2.0" +version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c56e5ee99d6e3d33bd91c5d85458b6005a22140021cc324cea84dd0e72cff3b4" +checksum = "5f6c8828b67bf8908d82127b2054ea1b4427ff0230ee9141c54251934ab1b599" dependencies = [ "icu_collections", "icu_normalizer_data", @@ -3594,15 +3621,15 @@ dependencies = [ [[package]] name = "icu_normalizer_data" -version = "2.2.0" +version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da3be0ae77ea334f4da67c12f149704f19f81d1adf7c51cf482943e84a2bad38" +checksum = "7aedcccd01fc5fe81e6b489c15b247b8b0690feb23304303a9e560f37efc560a" [[package]] name = "icu_properties" -version = "2.2.0" +version = "2.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bee3b67d0ea5c2cca5003417989af8996f8604e34fb9ddf96208a033901e70de" +checksum = "020bfc02fe870ec3a66d93e677ccca0562506e5872c650f893269e08615d74ec" dependencies = [ "icu_collections", "icu_locale_core", @@ -3614,15 +3641,15 @@ dependencies = [ [[package]] name = "icu_properties_data" -version = "2.2.0" +version = "2.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e2bbb201e0c04f7b4b3e14382af113e17ba4f63e2c9d2ee626b720cbce54a14" +checksum = "616c294cf8d725c6afcd8f55abc17c56464ef6211f9ed59cccffe534129c77af" [[package]] name = "icu_provider" -version = "2.2.0" +version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "139c4cf31c8b5f33d7e199446eff9c1e02decfc2f0eec2c8d71f65befa45b421" +checksum = "85962cf0ce02e1e0a629cc34e7ca3e373ce20dda4c4d7294bbd0bf1fdb59e614" dependencies = [ "displaydoc", "icu_locale_core", @@ -3679,9 +3706,9 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.13.1" +version = "2.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "45a8a2b9cb3e0b0c1803dbb0758ffac5de2f425b23c28f518faabd9d805342ff" +checksum = "7714e70437a7dc3ac8eb7e6f8df75fd8eb422675fc7678aff7364301092b1017" dependencies = [ "equivalent", "hashbrown 0.16.1", @@ -3696,7 +3723,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "232929e1d75fe899576a3d5c7416ad0d88dbfbb3c3d6aa00873a7408a50ddb88" dependencies = [ "ahash", - "indexmap 2.13.1", + "indexmap 2.13.0", "is-terminal", "itoa", "log", @@ -3725,9 +3752,9 @@ checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" [[package]] name = "inventory" -version = "0.3.24" +version = "0.3.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4f0c30c76f2f4ccee3fe55a2435f691ca00c0e4bd87abe4f4a851b1d4dac39b" +checksum = "009ae045c87e7082cb72dab0ccd01ae075dd00141ddc108f43a0ea150a9e7227" dependencies = [ "rustversion", ] @@ -3740,9 +3767,9 @@ checksum = "d98f6fed1fde3f8c21bc40a1abb88dd75e67924f9cffc3ef95607bad8017f8e2" [[package]] name = "iri-string" -version = "0.7.12" +version = "0.7.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "25e659a4bb38e810ebc252e53b5814ff908a8c58c2a9ce2fae1bbec24cbf4e20" +checksum = "c91338f0783edbd6195decb37bae672fd3b165faffb89bf7b9e6942f8b1a731a" dependencies = [ "memchr", "serde", @@ -3785,9 +3812,9 @@ dependencies = [ [[package]] name = "itoa" -version = "1.0.18" +version = "1.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" +checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2" [[package]] name = "java-locator" @@ -3841,22 +3868,6 @@ dependencies = [ "jiff-tzdb", ] -[[package]] -name = "jni" -version = "0.21.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a87aa2bb7d2af34197c04845522473242e1aa17c12f4935d5856491a7fb8c97" -dependencies = [ - "cesu8", - "cfg-if", - "combine", - "jni-sys 0.3.1", - "log", - "thiserror 1.0.69", - "walkdir", - "windows-sys 0.45.0", -] - [[package]] name = "jni" version = "0.22.4" @@ -3867,11 +3878,11 @@ dependencies = [ "combine", "java-locator", "jni-macros", - "jni-sys 0.4.1", + "jni-sys", "libloading", "log", "simd_cesu8", - "thiserror 2.0.18", + "thiserror", "walkdir", "windows-link", ] @@ -3889,15 +3900,6 @@ dependencies = [ "syn 2.0.117", ] -[[package]] -name = "jni-sys" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41a652e1f9b6e0275df1f15b32661cf0d4b78d4d87ddec5e0c3c20f097433258" -dependencies = [ - "jni-sys 0.4.1", -] - [[package]] name = "jni-sys" version = "0.4.1" @@ -3929,12 +3931,10 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.94" +version = "0.3.91" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e04e2ef80ce82e13552136fabeef8a5ed1f985a96805761cbb9a2c34e7664d9" +checksum = "b49715b7073f385ba4bc528e5747d02e66cb39c6146efb66b781f131f0fb399c" dependencies = [ - "cfg-if", - "futures-util", "once_cell", "wasm-bindgen", ] @@ -4049,9 +4049,9 @@ checksum = "2c4a545a15244c7d945065b5d392b2d2d7f21526fba56ce51467b06ed445e8f7" [[package]] name = "libc" -version = "0.2.184" +version = "0.2.183" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "48f5d2a454e16a5ea0f4ced81bd44e4cfc7bd3a507b61887c99fd3538b28e4af" +checksum = "b5b646652bf6661599e1da8901b3b9522896f01e736bad5f723fe7a3a27f899d" [[package]] name = "libloading" @@ -4074,9 +4074,9 @@ dependencies = [ [[package]] name = "liblzma-sys" -version = "0.4.6" +version = "0.4.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a60851d15cd8c5346eca4ab8babff585be2ae4bc8097c067291d3ffe2add3b6" +checksum = "9f2db66f3268487b5033077f266da6777d057949b8f93c8ad82e441df25e6186" dependencies = [ "cc", "libc", @@ -4113,9 +4113,9 @@ checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53" [[package]] name = "litemap" -version = "0.8.2" +version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92daf443525c4cce67b150400bc2316076100ce0b3686209eb8cf3c31612e6f0" +checksum = "6373607a59f0be73a39b6fe456b8192fcc3585f602af20751600e974dd455e77" [[package]] name = "lock_api" @@ -4164,7 +4164,7 @@ dependencies = [ "serde-value", "serde_json", "serde_yaml", - "thiserror 2.0.18", + "thiserror", "thread-id", "typemap-ors", "unicode-segmentation", @@ -4247,9 +4247,9 @@ dependencies = [ [[package]] name = "mio" -version = "1.2.0" +version = "1.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50b7e5b27aa02a74bac8c3f23f448f8d87ff11f92d3aac1a6ed369ee08cc56c1" +checksum = "a69bcab0ad47271a0234d9422b131806bf3968021e5dc9328caf2d4cd58557fc" dependencies = [ "libc", "wasi", @@ -4264,9 +4264,9 @@ checksum = "dce6dd36094cac388f119d2e9dc82dc730ef91c32a6222170d630e5414b956e6" [[package]] name = "moka" -version = "0.12.15" +version = "0.12.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "957228ad12042ee839f93c8f257b62b4c0ab5eaae1d4fa60de53b27c9d7c5046" +checksum = "85f8024e1c8e71c778968af91d43700ce1d11b219d127d79fb2934153b82b42b" dependencies = [ "async-lock", "crossbeam-channel", @@ -4367,9 +4367,9 @@ dependencies = [ [[package]] name = "num-conv" -version = "0.2.1" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c6673768db2d862beb9b39a78fdcb1a69439615d5794a1be50caa9bc92c81967" +checksum = "cf97ec579c3c42f953ef76dbf8d55ac91fb219dde70e49aa4a6b7d74e9919050" [[package]] name = "num-format" @@ -4462,7 +4462,7 @@ dependencies = [ "serde", "serde_json", "serde_urlencoded", - "thiserror 2.0.18", + "thiserror", "tokio", "tracing", "url", @@ -4752,7 +4752,7 @@ dependencies = [ "arrow-schema", "chrono", "half", - "indexmap 2.13.1", + "indexmap 2.13.0", "simdutf8", "uuid", ] @@ -4767,7 +4767,7 @@ dependencies = [ "arrow-schema", "chrono", "half", - "indexmap 2.13.1", + "indexmap 2.13.0", "parquet-variant", "parquet-variant-json", "serde_json", @@ -4843,7 +4843,7 @@ checksum = "8701b58ea97060d5e5b155d383a69952a60943f0e6dfe30b04c287beb0b27455" dependencies = [ "fixedbitset", "hashbrown 0.15.5", - "indexmap 2.13.1", + "indexmap 2.13.0", "serde", ] @@ -5014,18 +5014,18 @@ checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49" [[package]] name = "portable-atomic-util" -version = "0.2.6" +version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "091397be61a01d4be58e7841595bd4bfedb15f1cd54977d79b8271e94ed799a3" +checksum = "7a9db96d7fa8782dd8c15ce32ffe8680bbd1e978a43bf51a34d39483540495f5" dependencies = [ "portable-atomic", ] [[package]] name = "potential_utf" -version = "0.1.5" +version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0103b1cef7ec0cf76490e969665504990193874ea05c85ff9bab8b911d0a0564" +checksum = "b73949432f5e2a09657003c25bca5e19a0e9c84f8058ca374f49e0ebe605af77" dependencies = [ "zerovec", ] @@ -5055,7 +5055,7 @@ dependencies = [ "spin 0.10.0", "symbolic-demangle", "tempfile", - "thiserror 2.0.18", + "thiserror", ] [[package]] @@ -5217,10 +5217,10 @@ dependencies = [ "pin-project-lite", "quinn-proto", "quinn-udp", - "rustc-hash 2.1.2", + "rustc-hash 2.1.1", "rustls", "socket2", - "thiserror 2.0.18", + "thiserror", "tokio", "tracing", "web-time", @@ -5237,11 +5237,11 @@ dependencies = [ "lru-slab", "rand 0.9.2", "ring", - "rustc-hash 2.1.2", + "rustc-hash 2.1.1", "rustls", "rustls-pki-types", "slab", - "thiserror 2.0.18", + "thiserror", "tinyvec", "tracing", "web-time", @@ -5617,9 +5617,9 @@ checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" [[package]] name = "rustc-hash" -version = "2.1.2" +version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94300abf3f1ae2e2b8ffb7b58043de3d399c73fa6f4b73826402a5c457614dbe" +checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d" [[package]] name = "rustc_version" @@ -5810,9 +5810,9 @@ dependencies = [ [[package]] name = "semver" -version = "1.0.28" +version = "1.0.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a7852d02fc848982e0c167ef163aaff9cd91dc640ba85e263cb1ce46fae51cd" +checksum = "d767eb0aabc880b29956c35734170f26ed551a859dbd361d140cdbeca61ab1e2" [[package]] name = "seq-macro" @@ -5918,15 +5918,15 @@ dependencies = [ [[package]] name = "serde_with" -version = "3.18.0" +version = "3.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd5414fad8e6907dbdd5bc441a50ae8d6e26151a03b1de04d89a5576de61d01f" +checksum = "381b283ce7bc6b476d903296fb59d0d36633652b633b27f64db4fb46dcbfc3b9" dependencies = [ "base64", "chrono", "hex", "indexmap 1.9.3", - "indexmap 2.13.1", + "indexmap 2.13.0", "schemars 0.9.0", "schemars 1.2.1", "serde_core", @@ -5937,11 +5937,11 @@ dependencies = [ [[package]] name = "serde_with_macros" -version = "3.18.0" +version = "3.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d3db8978e608f1fe7357e211969fd9abdcae80bac1ba7a3369bb7eb6b404eb65" +checksum = "a6d4e30573c8cb306ed6ab1dca8423eec9a463ea0e155f45399455e0368b27e0" dependencies = [ - "darling 0.23.0", + "darling 0.21.3", "proc-macro2", "quote", "syn 2.0.117", @@ -5953,7 +5953,7 @@ version = "0.9.34+deprecated" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6a8b1a1a2ebf674015cc02edccce75287f1a0130d394307b36743c2f5d504b47" dependencies = [ - "indexmap 2.13.1", + "indexmap 2.13.0", "itoa", "ryu", "serde", @@ -6038,7 +6038,7 @@ checksum = "0d585997b0ac10be3c5ee635f1bab02d512760d14b7c468801ac8a01d9ae5f1d" dependencies = [ "num-bigint", "num-traits", - "thiserror 2.0.18", + "thiserror", "time", ] @@ -6169,9 +6169,9 @@ checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" [[package]] name = "symbolic-common" -version = "12.17.4" +version = "12.17.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "803d14d7cb9c6fa5b95a6f3de8af95b356a528d391998fa45a07d320a5573e51" +checksum = "751a2823d606b5d0a7616499e4130a516ebd01a44f39811be2b9600936509c23" dependencies = [ "debugid", "memmap2", @@ -6181,9 +6181,9 @@ dependencies = [ [[package]] name = "symbolic-demangle" -version = "12.17.4" +version = "12.17.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "39505731ae891b2dde47b0e4ae2ec40a7ced3476ab1129f1bf829e3fba62bb83" +checksum = "79b237cfbe320601dd24b4ac817a5b68bb28f5508e33f08d42be0682cadc8ac9" dependencies = [ "cpp_demangle", "rustc-demangle", @@ -6251,33 +6251,13 @@ dependencies = [ "windows-sys 0.61.2", ] -[[package]] -name = "thiserror" -version = "1.0.69" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" -dependencies = [ - "thiserror-impl 1.0.69", -] - [[package]] name = "thiserror" version = "2.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4" dependencies = [ - "thiserror-impl 2.0.18", -] - -[[package]] -name = "thiserror-impl" -version = "1.0.69" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.117", + "thiserror-impl", ] [[package]] @@ -6385,9 +6365,9 @@ dependencies = [ [[package]] name = "tinystr" -version = "0.8.3" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8323304221c2a851516f22236c5722a72eaa19749016521d6dff0824447d96d" +checksum = "42d3e9c45c09de15d06dd8acf5f4e0e399e85927b7f00711024eb7ae10fa4869" dependencies = [ "displaydoc", "zerovec", @@ -6405,9 +6385,9 @@ dependencies = [ [[package]] name = "tinyvec" -version = "1.11.0" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3e61e67053d25a4e82c844e8424039d9745781b3fc4f32b8d55ed50f5f667ef3" +checksum = "bfa5fdc3bce6191a1dbc8c02d5c8bffcf557bafa17c124c5264a458f1b0613fa" dependencies = [ "tinyvec_macros", ] @@ -6420,9 +6400,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.51.1" +version = "1.50.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f66bf9585cda4b724d3e78ab34b73fb2bbaba9011b9bfdf69dc836382ea13b8c" +checksum = "27ad5e34374e03cfffefc301becb44e9dc3c17584f414349ebe29ed26661822d" dependencies = [ "bytes", "libc", @@ -6437,9 +6417,9 @@ dependencies = [ [[package]] name = "tokio-macros" -version = "2.7.0" +version = "2.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "385a6cb71ab9ab790c5fe8d67f1645e6c450a7ce006a33de03daa956cf70a496" +checksum = "5c55a2eff8b69ce66c84f85e1da1c233edc36ceb85a2058d11b0d6a3c7e7569c" dependencies = [ "proc-macro2", "quote", @@ -6645,9 +6625,9 @@ checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" [[package]] name = "unicode-segmentation" -version = "1.13.2" +version = "1.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9629274872b2bfaf8d66f5f15725007f635594914870f65218920345aa11aa8c" +checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493" [[package]] name = "unicode-width" @@ -6797,9 +6777,9 @@ dependencies = [ [[package]] name = "wasm-bindgen" -version = "0.2.117" +version = "0.2.114" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0551fc1bb415591e3372d0bc4780db7e587d84e2a7e79da121051c5c4b89d0b0" +checksum = "6532f9a5c1ece3798cb1c2cfdba640b9b3ba884f5db45973a6f442510a87d38e" dependencies = [ "cfg-if", "once_cell", @@ -6810,19 +6790,23 @@ dependencies = [ [[package]] name = "wasm-bindgen-futures" -version = "0.4.67" +version = "0.4.64" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03623de6905b7206edd0a75f69f747f134b7f0a2323392d664448bf2d3c5d87e" +checksum = "e9c5522b3a28661442748e09d40924dfb9ca614b21c00d3fd135720e48b67db8" dependencies = [ + "cfg-if", + "futures-util", "js-sys", + "once_cell", "wasm-bindgen", + "web-sys", ] [[package]] name = "wasm-bindgen-macro" -version = "0.2.117" +version = "0.2.114" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fbdf9a35adf44786aecd5ff89b4563a90325f9da0923236f6104e603c7e86be" +checksum = "18a2d50fcf105fb33bb15f00e7a77b772945a2ee45dcf454961fd843e74c18e6" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -6830,9 +6814,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.117" +version = "0.2.114" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dca9693ef2bab6d4e6707234500350d8dad079eb508dca05530c85dc3a529ff2" +checksum = "03ce4caeaac547cdf713d280eda22a730824dd11e6b8c3ca9e42247b25c631e3" dependencies = [ "bumpalo", "proc-macro2", @@ -6843,9 +6827,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.117" +version = "0.2.114" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "39129a682a6d2d841b6c429d0c51e5cb0ed1a03829d8b3d1e69a011e62cb3d3b" +checksum = "75a326b8c223ee17883a4251907455a2431acc2791c98c26279376490c378c16" dependencies = [ "unicode-ident", ] @@ -6867,7 +6851,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909" dependencies = [ "anyhow", - "indexmap 2.13.1", + "indexmap 2.13.0", "wasm-encoder", "wasmparser", ] @@ -6893,15 +6877,15 @@ checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe" dependencies = [ "bitflags 2.11.0", "hashbrown 0.15.5", - "indexmap 2.13.1", + "indexmap 2.13.0", "semver", ] [[package]] name = "web-sys" -version = "0.3.94" +version = "0.3.91" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd70027e39b12f0849461e08ffc50b9cd7688d942c1c8e3c7b22273236b4dd0a" +checksum = "854ba17bb104abfb26ba36da9729addc7ce7f06f5c0f90f3c391f8461cca21f9" dependencies = [ "js-sys", "wasm-bindgen", @@ -7028,15 +7012,6 @@ dependencies = [ "windows-link", ] -[[package]] -name = "windows-sys" -version = "0.45.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0" -dependencies = [ - "windows-targets 0.42.2", -] - [[package]] name = "windows-sys" version = "0.52.0" @@ -7073,21 +7048,6 @@ dependencies = [ "windows-link", ] -[[package]] -name = "windows-targets" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e5180c00cd44c9b1c88adb3693291f1cd93605ded80c250a75d472756b4d071" -dependencies = [ - "windows_aarch64_gnullvm 0.42.2", - "windows_aarch64_msvc 0.42.2", - "windows_i686_gnu 0.42.2", - "windows_i686_msvc 0.42.2", - "windows_x86_64_gnu 0.42.2", - "windows_x86_64_gnullvm 0.42.2", - "windows_x86_64_msvc 0.42.2", -] - [[package]] name = "windows-targets" version = "0.52.6" @@ -7121,12 +7081,6 @@ dependencies = [ "windows_x86_64_msvc 0.53.1", ] -[[package]] -name = "windows_aarch64_gnullvm" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8" - [[package]] name = "windows_aarch64_gnullvm" version = "0.52.6" @@ -7139,12 +7093,6 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a9d8416fa8b42f5c947f8482c43e7d89e73a173cead56d044f6a56104a6d1b53" -[[package]] -name = "windows_aarch64_msvc" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43" - [[package]] name = "windows_aarch64_msvc" version = "0.52.6" @@ -7157,12 +7105,6 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b9d782e804c2f632e395708e99a94275910eb9100b2114651e04744e9b125006" -[[package]] -name = "windows_i686_gnu" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f" - [[package]] name = "windows_i686_gnu" version = "0.52.6" @@ -7187,12 +7129,6 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fa7359d10048f68ab8b09fa71c3daccfb0e9b559aed648a8f95469c27057180c" -[[package]] -name = "windows_i686_msvc" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060" - [[package]] name = "windows_i686_msvc" version = "0.52.6" @@ -7205,12 +7141,6 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e7ac75179f18232fe9c285163565a57ef8d3c89254a30685b57d83a38d326c2" -[[package]] -name = "windows_x86_64_gnu" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36" - [[package]] name = "windows_x86_64_gnu" version = "0.52.6" @@ -7223,12 +7153,6 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c3842cdd74a865a8066ab39c8a7a473c0778a3f29370b5fd6b4b9aa7df4a499" -[[package]] -name = "windows_x86_64_gnullvm" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3" - [[package]] name = "windows_x86_64_gnullvm" version = "0.52.6" @@ -7241,12 +7165,6 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ffa179e2d07eee8ad8f57493436566c7cc30ac536a3379fdf008f47f6bb7ae1" -[[package]] -name = "windows_x86_64_msvc" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0" - [[package]] name = "windows_x86_64_msvc" version = "0.52.6" @@ -7287,7 +7205,7 @@ checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21" dependencies = [ "anyhow", "heck", - "indexmap 2.13.1", + "indexmap 2.13.0", "prettyplease", "syn 2.0.117", "wasm-metadata", @@ -7318,7 +7236,7 @@ checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2" dependencies = [ "anyhow", "bitflags 2.11.0", - "indexmap 2.13.1", + "indexmap 2.13.0", "log", "serde", "serde_derive", @@ -7337,7 +7255,7 @@ checksum = "ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736" dependencies = [ "anyhow", "id-arena", - "indexmap 2.13.1", + "indexmap 2.13.0", "log", "semver", "serde", @@ -7349,9 +7267,9 @@ dependencies = [ [[package]] name = "writeable" -version = "0.6.3" +version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ffae5123b2d3fc086436f8834ae3ab053a283cfac8fe0a0b8eaae044768a4c4" +checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9" [[package]] name = "xmlparser" @@ -7361,9 +7279,9 @@ checksum = "66fee0b777b0f5ac1c69bb06d361268faafa61cd4682ae064a171c16c433e9e4" [[package]] name = "yoke" -version = "0.8.2" +version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "abe8c5fda708d9ca3df187cae8bfb9ceda00dd96231bed36e445a1a48e66f9ca" +checksum = "72d6e5c6afb84d73944e5cedb052c4680d5657337201555f9f2a16b7406d4954" dependencies = [ "stable_deref_trait", "yoke-derive", @@ -7372,9 +7290,9 @@ dependencies = [ [[package]] name = "yoke-derive" -version = "0.8.2" +version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de844c262c8848816172cef550288e7dc6c7b7814b4ee56b3e1553f275f1858e" +checksum = "b659052874eb698efe5b9e8cf382204678a0086ebf46982b79d6ca3182927e5d" dependencies = [ "proc-macro2", "quote", @@ -7384,18 +7302,18 @@ dependencies = [ [[package]] name = "zerocopy" -version = "0.8.48" +version = "0.8.42" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eed437bf9d6692032087e337407a86f04cd8d6a16a37199ed57949d415bd68e9" +checksum = "f2578b716f8a7a858b7f02d5bd870c14bf4ddbbcf3a4c05414ba6503640505e3" dependencies = [ "zerocopy-derive", ] [[package]] name = "zerocopy-derive" -version = "0.8.48" +version = "0.8.42" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70e3cd084b1788766f53af483dd21f93881ff30d7320490ec3ef7526d203bad4" +checksum = "7e6cc098ea4d3bd6246687de65af3f920c430e236bee1e3bf2e441463f08a02f" dependencies = [ "proc-macro2", "quote", @@ -7404,18 +7322,18 @@ dependencies = [ [[package]] name = "zerofrom" -version = "0.1.7" +version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69faa1f2a1ea75661980b013019ed6687ed0e83d069bc1114e2cc74c6c04c4df" +checksum = "50cc42e0333e05660c3587f3bf9d0478688e15d870fab3346451ce7f8c9fbea5" dependencies = [ "zerofrom-derive", ] [[package]] name = "zerofrom-derive" -version = "0.1.7" +version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "11532158c46691caf0f2593ea8358fed6bbf68a0315e80aae9bd41fbade684a1" +checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502" dependencies = [ "proc-macro2", "quote", @@ -7431,9 +7349,9 @@ checksum = "b97154e67e32c85465826e8bcc1c59429aaaf107c1e4a9e53c8d8ccd5eff88d0" [[package]] name = "zerotrie" -version = "0.2.4" +version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0f9152d31db0792fa83f70fb2f83148effb5c1f5b8c7686c3459e361d9bc20bf" +checksum = "2a59c17a5562d507e4b54960e8569ebee33bee890c70aa3fe7b97e85a9fd7851" dependencies = [ "displaydoc", "yoke", @@ -7442,9 +7360,9 @@ dependencies = [ [[package]] name = "zerovec" -version = "0.11.6" +version = "0.11.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90f911cbc359ab6af17377d242225f4d75119aec87ea711a880987b18cd7b239" +checksum = "6c28719294829477f525be0186d13efa9a3c602f7ec202ca9e353d310fb9a002" dependencies = [ "yoke", "zerofrom", @@ -7453,9 +7371,9 @@ dependencies = [ [[package]] name = "zerovec-derive" -version = "0.11.3" +version = "0.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "625dc425cab0dca6dc3c3319506e6593dcb08a9f387ea3b284dbd52a92c40555" +checksum = "eadce39539ca5cb3985590102671f2567e659fca9666581ad3411d59207951f3" dependencies = [ "proc-macro2", "quote", diff --git a/native/Cargo.toml b/native/Cargo.toml index b71bc0c73c..c12b6ea22d 100644 --- a/native/Cargo.toml +++ b/native/Cargo.toml @@ -34,7 +34,7 @@ edition = "2021" rust-version = "1.88" [workspace.dependencies] -arrow = { version = "58.1.0", features = ["prettyprint", "ffi", "chrono-tz"] } +arrow = { version = "58.1.0", features = ["prettyprint", "ffi", "chrono-tz", "ipc_compression"] } async-trait = { version = "0.1" } bytes = { version = "1.11.1" } parquet = { version = "58.1.0", default-features = false, features = ["experimental"] } diff --git a/native/core/src/execution/jni_api.rs b/native/core/src/execution/jni_api.rs index 93f75bae96..5192ce2c31 100644 --- a/native/core/src/execution/jni_api.rs +++ b/native/core/src/execution/jni_api.rs @@ -62,7 +62,6 @@ use datafusion_spark::function::string::space::SparkSpace; use futures::poll; use futures::stream::StreamExt; use futures::FutureExt; -use jni::objects::JByteBuffer; use jni::sys::{jlongArray, JNI_FALSE}; use jni::{ errors::Result as JNIResult, @@ -85,7 +84,7 @@ use crate::execution::memory_pools::{ create_memory_pool, handle_task_shared_pool_release, parse_memory_pool_config, MemoryPoolConfig, }; use crate::execution::operators::{ScanExec, ShuffleScanExec}; -use crate::execution::shuffle::{read_ipc_compressed, CompressionCodec}; +use crate::execution::shuffle::{CompressionCodec, ShuffleStreamReader}; use crate::execution::spark_plan::SparkPlan; use crate::execution::tracing::{ @@ -962,7 +961,6 @@ pub unsafe extern "system" fn Java_org_apache_comet_Native_writeSortedFileNative let compression_codec = match compression_codec.as_str() { "zstd" => CompressionCodec::Zstd(compression_level), "lz4" => CompressionCodec::Lz4Frame, - "snappy" => CompressionCodec::Snappy, _ => CompressionCodec::Lz4Frame, }; @@ -1029,29 +1027,72 @@ pub extern "system" fn Java_org_apache_comet_Native_sortRowPartitionsNative( } #[no_mangle] -/// Used by Comet native shuffle reader +/// Open a shuffle stream reader over a JVM InputStream. +/// Returns an opaque handle (pointer) to a `ShuffleStreamReader`. /// # Safety /// This function is inherently unsafe since it deals with raw pointers passed from JNI. -pub unsafe extern "system" fn Java_org_apache_comet_Native_decodeShuffleBlock( +pub unsafe extern "system" fn Java_org_apache_comet_Native_openShuffleStream( e: EnvUnowned, _class: JClass, - byte_buffer: JByteBuffer, - length: jint, + input_stream: JObject, +) -> jlong { + try_unwrap_or_throw(&e, |env| { + let reader = ShuffleStreamReader::new(env, &input_stream).map_err(CometError::Internal)?; + let handle = Box::into_raw(Box::new(reader)); + Ok(handle as jlong) + }) +} + +#[no_mangle] +/// Read the next batch from a shuffle stream, exporting via Arrow FFI. +/// Returns the row count, or -1 if the stream is exhausted. +/// # Safety +/// This function is inherently unsafe since it deals with raw pointers passed from JNI. +pub unsafe extern "system" fn Java_org_apache_comet_Native_nextShuffleStreamBatch( + e: EnvUnowned, + _class: JClass, + handle: jlong, array_addrs: JLongArray, schema_addrs: JLongArray, - tracing_enabled: jboolean, ) -> jlong { try_unwrap_or_throw(&e, |env| { - with_trace("decodeShuffleBlock", tracing_enabled != JNI_FALSE, || { - let raw_pointer = env.get_direct_buffer_address(&byte_buffer)?; - let length = length as usize; - let slice: &[u8] = unsafe { std::slice::from_raw_parts(raw_pointer, length) }; - let batch = read_ipc_compressed(slice)?; - prepare_output(env, array_addrs, schema_addrs, batch, false) - }) + let reader = unsafe { &mut *(handle as *mut ShuffleStreamReader) }; + match reader.next_batch().map_err(CometError::Internal)? { + Some(batch) => prepare_output(env, array_addrs, schema_addrs, batch, false), + None => Ok(-1_i64), + } }) } +#[no_mangle] +/// Get the number of fields in the shuffle stream's schema. +/// # Safety +/// This function is inherently unsafe since it deals with raw pointers passed from JNI. +pub unsafe extern "system" fn Java_org_apache_comet_Native_shuffleStreamNumFields( + _e: EnvUnowned, + _class: JClass, + handle: jlong, +) -> jlong { + let reader = unsafe { &*(handle as *mut ShuffleStreamReader) }; + reader.num_fields() as jlong +} + +#[no_mangle] +/// Close and drop a shuffle stream reader. +/// # Safety +/// This function is inherently unsafe since it deals with raw pointers passed from JNI. +pub unsafe extern "system" fn Java_org_apache_comet_Native_closeShuffleStream( + _e: EnvUnowned, + _class: JClass, + handle: jlong, +) { + if handle != 0 { + unsafe { + let _ = Box::from_raw(handle as *mut ShuffleStreamReader); + } + } +} + #[no_mangle] /// # Safety /// This function is inherently unsafe since it deals with raw pointers passed from JNI. diff --git a/native/core/src/execution/operators/shuffle_scan.rs b/native/core/src/execution/operators/shuffle_scan.rs index 92c4dc8780..abe63ba8b8 100644 --- a/native/core/src/execution/operators/shuffle_scan.rs +++ b/native/core/src/execution/operators/shuffle_scan.rs @@ -18,9 +18,9 @@ use crate::{ errors::CometError, execution::{ - operators::ExecutionError, planner::TEST_EXEC_CONTEXT_ID, shuffle::ipc::read_ipc_compressed, + operators::ExecutionError, planner::TEST_EXEC_CONTEXT_ID, shuffle::ShuffleStreamReader, }, - jvm_bridge::{jni_call, JVMClasses}, + jvm_bridge::JVMClasses, }; use arrow::array::ArrayRef; use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; @@ -35,7 +35,7 @@ use datafusion::{ physical_plan::{ExecutionPlan, *}, }; use futures::Stream; -use jni::objects::{Global, JByteBuffer, JObject}; +use jni::objects::{Global, JObject}; use std::{ any::Any, pin::Pin, @@ -45,14 +45,13 @@ use std::{ use super::scan::InputBatch; -/// ShuffleScanExec reads compressed shuffle blocks from JVM via JNI and decodes them natively. -/// Unlike ScanExec which receives Arrow arrays via FFI, ShuffleScanExec receives raw compressed -/// bytes from CometShuffleBlockIterator and decodes them using read_ipc_compressed(). -#[derive(Debug, Clone)] +/// ShuffleScanExec reads Arrow IPC streams from JVM via JNI and decodes them natively. +/// Unlike ScanExec which receives Arrow arrays via FFI, ShuffleScanExec receives a raw +/// InputStream from JVM and reads Arrow IPC streams using ShuffleStreamReader. pub struct ShuffleScanExec { /// The ID of the execution context that owns this subquery. pub exec_context_id: i64, - /// The input source: a global reference to a JVM CometShuffleBlockIterator object. + /// The input source: a global reference to a JVM InputStream object. pub input_source: Option>>>, /// The data types of columns in the shuffle output. pub data_types: Vec, @@ -60,16 +59,48 @@ pub struct ShuffleScanExec { pub schema: SchemaRef, /// The current input batch, populated by get_next_batch() before poll_next(). pub batch: Arc>>, + /// Cached ShuffleStreamReader, created lazily on first get_next call. + stream_reader: Option, /// Cache of plan properties. cache: Arc, /// Metrics collector. metrics: ExecutionPlanMetricsSet, /// Baseline metrics. baseline_metrics: BaselineMetrics, - /// Time spent decoding compressed shuffle blocks. + /// Time spent decoding shuffle batches. decode_time: Time, } +impl std::fmt::Debug for ShuffleScanExec { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("ShuffleScanExec") + .field("exec_context_id", &self.exec_context_id) + .field("data_types", &self.data_types) + .field("schema", &self.schema) + .field("stream_reader", &self.stream_reader.is_some()) + .finish() + } +} + +impl Clone for ShuffleScanExec { + fn clone(&self) -> Self { + Self { + exec_context_id: self.exec_context_id, + input_source: self.input_source.clone(), + data_types: self.data_types.clone(), + schema: Arc::clone(&self.schema), + batch: Arc::clone(&self.batch), + // stream_reader is not cloneable; cloned instances start without one + // and will lazily create their own if needed. + stream_reader: None, + cache: Arc::clone(&self.cache), + metrics: self.metrics.clone(), + baseline_metrics: self.baseline_metrics.clone(), + decode_time: self.decode_time.clone(), + } + } +} + impl ShuffleScanExec { pub fn new( exec_context_id: i64, @@ -94,6 +125,7 @@ impl ShuffleScanExec { input_source, data_types, batch: Arc::new(Mutex::new(None)), + stream_reader: None, cache, metrics: metrics_set, baseline_metrics, @@ -114,90 +146,87 @@ impl ShuffleScanExec { // Unit test mode - no JNI calls needed. return Ok(()); } - let mut timer = self.baseline_metrics.elapsed_compute().timer(); - let mut current_batch = self.batch.try_lock().unwrap(); - if current_batch.is_none() { - let next_batch = Self::get_next( - self.exec_context_id, - self.input_source.as_ref().unwrap().as_obj(), - &self.data_types, - &self.decode_time, - )?; + // Check if a batch is already pending without holding the lock during get_next + let needs_batch = { + let current_batch = self.batch.try_lock().unwrap(); + current_batch.is_none() + }; + + if needs_batch { + let start = std::time::Instant::now(); + let next_batch = self.get_next()?; + self.baseline_metrics + .elapsed_compute() + .add_duration(start.elapsed()); + let mut current_batch = self.batch.try_lock().unwrap(); *current_batch = Some(next_batch); } - timer.stop(); - Ok(()) } - /// Invokes JNI calls to get the next compressed shuffle block and decode it. - fn get_next( - exec_context_id: i64, - iter: &JObject, - data_types: &[DataType], - decode_time: &Time, - ) -> Result { - if exec_context_id == TEST_EXEC_CONTEXT_ID { + /// Reads the next batch from the ShuffleStreamReader, creating it lazily on first call. + fn get_next(&mut self) -> Result { + if self.exec_context_id == TEST_EXEC_CONTEXT_ID { return Ok(InputBatch::EOF); } - if iter.is_null() { - return Err(CometError::from(ExecutionError::GeneralError(format!( - "Null shuffle block iterator object. Plan id: {exec_context_id}" - )))); + // Lazily create the ShuffleStreamReader on first call + if self.stream_reader.is_none() { + let input_source = self.input_source.as_ref().ok_or_else(|| { + CometError::from(ExecutionError::GeneralError(format!( + "Null shuffle input source. Plan id: {}", + self.exec_context_id + ))) + })?; + let input_source = Arc::clone(input_source); + let reader = JVMClasses::with_env(|env| { + ShuffleStreamReader::new(env, input_source.as_obj()).map_err(|e| { + CometError::from(ExecutionError::GeneralError(format!( + "Failed to create ShuffleStreamReader: {e}" + ))) + }) + })?; + self.stream_reader = Some(reader); } - JVMClasses::with_env(|env| { - // has_next() reads the next block and returns its length, or -1 if EOF - let block_length: i32 = unsafe { - jni_call!(env, - comet_shuffle_block_iterator(iter).has_next() -> i32)? - }; - - if block_length == -1 { - return Ok(InputBatch::EOF); - } - - // Get the DirectByteBuffer containing the compressed shuffle block - let buffer: JObject = unsafe { - jni_call!(env, - comet_shuffle_block_iterator(iter).get_buffer() -> JObject)? - }; - - let byte_buffer = unsafe { JByteBuffer::from_raw(env, buffer.into_raw()) }; - let raw_pointer = env.get_direct_buffer_address(&byte_buffer)?; - let length = block_length as usize; - let slice: &[u8] = unsafe { std::slice::from_raw_parts(raw_pointer, length) }; - - // Decode the compressed IPC data - let mut timer = decode_time.timer(); - let batch = read_ipc_compressed(slice)?; - timer.stop(); + let reader = self.stream_reader.as_mut().unwrap(); + + let mut decode_timer = self.decode_time.timer(); + let batch_opt = reader.next_batch().map_err(|e| { + CometError::from(ExecutionError::GeneralError(format!( + "Failed to read shuffle batch: {e}" + ))) + })?; + decode_timer.stop(); + + match batch_opt { + None => Ok(InputBatch::EOF), + Some(batch) => { + let num_rows = batch.num_rows(); + + // Extract column arrays, unpacking any dictionary-encoded columns. + // Native shuffle may dictionary-encode string/binary columns for efficiency, + // but downstream DataFusion operators expect the value types declared in the + // schema (e.g. Utf8, not Dictionary). + let columns: Vec = batch + .columns() + .iter() + .map(|col| unpack_dictionary(col)) + .collect(); - let num_rows = batch.num_rows(); - - // Extract column arrays, unpacking any dictionary-encoded columns. - // Native shuffle may dictionary-encode string/binary columns for efficiency, - // but downstream DataFusion operators expect the value types declared in the - // schema (e.g. Utf8, not Dictionary). - let columns: Vec = batch - .columns() - .iter() - .map(|col| unpack_dictionary(col)) - .collect(); - - debug_assert_eq!( - columns.len(), - data_types.len(), - "Shuffle block column count mismatch: got {} but expected {}", - columns.len(), - data_types.len() - ); + debug_assert_eq!( + columns.len(), + self.data_types.len(), + "Shuffle block column count mismatch: got {} but expected {}", + columns.len(), + self.data_types.len() + ); - Ok(InputBatch::new(columns, Some(num_rows))) - }) + Ok(InputBatch::new(columns, Some(num_rows))) + } + } } } @@ -351,16 +380,15 @@ impl RecordBatchStream for ShuffleScanStream { #[cfg(test)] mod tests { - use crate::execution::shuffle::{CompressionCodec, ShuffleBlockWriter}; + use crate::execution::shuffle::CompressionCodec; use arrow::array::{Int32Array, StringArray}; use arrow::datatypes::{DataType, Field, Schema}; + use arrow::ipc::reader::StreamReader; + use arrow::ipc::writer::StreamWriter; use arrow::record_batch::RecordBatch; - use datafusion::physical_plan::metrics::Time; use std::io::Cursor; use std::sync::Arc; - use crate::execution::shuffle::ipc::read_ipc_compressed; - #[test] #[cfg_attr(miri, ignore)] // Miri cannot call FFI functions (zstd) fn test_read_compressed_ipc_block() { @@ -377,18 +405,18 @@ mod tests { ) .unwrap(); - // Write as compressed IPC - let writer = - ShuffleBlockWriter::try_new(&batch.schema(), CompressionCodec::Zstd(1)).unwrap(); - let mut buf = Cursor::new(Vec::new()); - let ipc_time = Time::new(); - writer.write_batch(&batch, &mut buf, &ipc_time).unwrap(); - - // Read back (skip 16-byte header: 8 compressed_length + 8 field_count) - let bytes = buf.into_inner(); - let body = &bytes[16..]; - - let decoded = read_ipc_compressed(body).unwrap(); + // Write as Arrow IPC stream with compression + let write_options = CompressionCodec::Zstd(1).ipc_write_options().unwrap(); + let mut buf = Vec::new(); + let mut writer = + StreamWriter::try_new_with_options(&mut buf, &batch.schema(), write_options).unwrap(); + writer.write(&batch).unwrap(); + writer.finish().unwrap(); + + // Read back using standard StreamReader + let cursor = Cursor::new(&buf); + let mut reader = StreamReader::try_new(cursor, None).unwrap(); + let decoded = reader.next().unwrap().unwrap(); assert_eq!(decoded.num_rows(), 3); assert_eq!(decoded.num_columns(), 2); @@ -404,9 +432,6 @@ mod tests { } /// Tests that ShuffleScanExec correctly unpacks dictionary-encoded columns. - /// Native shuffle may dictionary-encode string/binary columns, but the schema - /// declares value types (e.g. Utf8). Without unpacking, RecordBatch creation - /// fails with a schema mismatch. #[test] #[cfg_attr(miri, ignore)] fn test_dictionary_encoded_shuffle_block_is_unpacked() { @@ -416,15 +441,12 @@ mod tests { use datafusion::physical_plan::ExecutionPlan; use futures::StreamExt; - // Build a batch with a dictionary-encoded string column (simulating what - // the native shuffle writer produces for string columns). let mut dict_builder = StringDictionaryBuilder::::new(); dict_builder.append_value("hello"); dict_builder.append_value("world"); - dict_builder.append_value("hello"); // repeated value, good for dictionary + dict_builder.append_value("hello"); let dict_array = dict_builder.finish(); - // The IPC schema includes the dictionary type let dict_schema = Arc::new(Schema::new(vec![ Field::new("id", DataType::Int32, false), Field::new( @@ -442,19 +464,19 @@ mod tests { ) .unwrap(); - // Write as compressed IPC (preserves dictionary encoding) - let writer = - ShuffleBlockWriter::try_new(&dict_batch.schema(), CompressionCodec::Zstd(1)).unwrap(); - let mut buf = Cursor::new(Vec::new()); - let ipc_time = Time::new(); - writer - .write_batch(&dict_batch, &mut buf, &ipc_time) - .unwrap(); - let bytes = buf.into_inner(); - let body = &bytes[16..]; - - // Confirm that read_ipc_compressed returns dictionary-encoded arrays - let decoded = read_ipc_compressed(body).unwrap(); + // Write as Arrow IPC stream with compression + let write_options = CompressionCodec::Zstd(1).ipc_write_options().unwrap(); + let mut buf = Vec::new(); + let mut writer = + StreamWriter::try_new_with_options(&mut buf, &dict_batch.schema(), write_options) + .unwrap(); + writer.write(&dict_batch).unwrap(); + writer.finish().unwrap(); + + // Read back using standard StreamReader + let cursor = Cursor::new(&buf); + let mut reader = StreamReader::try_new(cursor, None).unwrap(); + let decoded = reader.next().unwrap().unwrap(); assert!( matches!(decoded.column(1).data_type(), DataType::Dictionary(_, _)), "Expected dictionary-encoded column from IPC, got {:?}", diff --git a/native/core/src/execution/planner.rs b/native/core/src/execution/planner.rs index ac35925ace..6da6e65100 100644 --- a/native/core/src/execution/planner.rs +++ b/native/core/src/execution/planner.rs @@ -1359,7 +1359,6 @@ impl PhysicalPlanner { let codec = match writer.codec.try_into() { Ok(SparkCompressionCodec::None) => Ok(CompressionCodec::None), - Ok(SparkCompressionCodec::Snappy) => Ok(CompressionCodec::Snappy), Ok(SparkCompressionCodec::Zstd) => { Ok(CompressionCodec::Zstd(writer.compression_level)) } diff --git a/native/jni-bridge/src/lib.rs b/native/jni-bridge/src/lib.rs index 5b0c0a4a56..4581fb71fd 100644 --- a/native/jni-bridge/src/lib.rs +++ b/native/jni-bridge/src/lib.rs @@ -181,12 +181,9 @@ pub use comet_exec::*; mod batch_iterator; mod comet_metric_node; mod comet_task_memory_manager; -mod shuffle_block_iterator; - use batch_iterator::CometBatchIterator; pub use comet_metric_node::*; pub use comet_task_memory_manager::*; -use shuffle_block_iterator::CometShuffleBlockIterator; /// The JVM classes that are used in the JNI calls. #[allow(dead_code)] // we need to keep references to Java items to prevent GC @@ -212,8 +209,6 @@ pub struct JVMClasses<'a> { pub comet_exec: CometExec<'a>, /// The CometBatchIterator class. Used for iterating over the batches. pub comet_batch_iterator: CometBatchIterator<'a>, - /// The CometShuffleBlockIterator class. Used for iterating over shuffle blocks. - pub comet_shuffle_block_iterator: CometShuffleBlockIterator<'a>, /// The CometTaskMemoryManager used for interacting with JVM side to /// acquire & release native memory. pub comet_task_memory_manager: CometTaskMemoryManager<'a>, @@ -285,7 +280,6 @@ impl JVMClasses<'_> { comet_metric_node: CometMetricNode::new(env).unwrap(), comet_exec: CometExec::new(env).unwrap(), comet_batch_iterator: CometBatchIterator::new(env).unwrap(), - comet_shuffle_block_iterator: CometShuffleBlockIterator::new(env).unwrap(), comet_task_memory_manager: CometTaskMemoryManager::new(env).unwrap(), } }); diff --git a/native/jni-bridge/src/shuffle_block_iterator.rs b/native/jni-bridge/src/shuffle_block_iterator.rs deleted file mode 100644 index fb65bf7251..0000000000 --- a/native/jni-bridge/src/shuffle_block_iterator.rs +++ /dev/null @@ -1,67 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use jni::signature::Primitive; -use jni::{ - errors::Result as JniResult, - objects::{JClass, JMethodID}, - signature::ReturnType, - strings::JNIString, - Env, -}; - -/// A struct that holds all the JNI methods and fields for JVM `CometShuffleBlockIterator` class. -#[allow(dead_code)] // we need to keep references to Java items to prevent GC -pub struct CometShuffleBlockIterator<'a> { - pub class: JClass<'a>, - pub method_has_next: JMethodID, - pub method_has_next_ret: ReturnType, - pub method_get_buffer: JMethodID, - pub method_get_buffer_ret: ReturnType, - pub method_get_current_block_length: JMethodID, - pub method_get_current_block_length_ret: ReturnType, -} - -impl<'a> CometShuffleBlockIterator<'a> { - pub const JVM_CLASS: &'static str = "org/apache/comet/CometShuffleBlockIterator"; - - pub fn new(env: &mut Env<'a>) -> JniResult> { - let class = env.find_class(JNIString::new(Self::JVM_CLASS))?; - - Ok(CometShuffleBlockIterator { - class, - method_has_next: env.get_method_id( - JNIString::new(Self::JVM_CLASS), - jni::jni_str!("hasNext"), - jni::jni_sig!("()I"), - )?, - method_has_next_ret: ReturnType::Primitive(Primitive::Int), - method_get_buffer: env.get_method_id( - JNIString::new(Self::JVM_CLASS), - jni::jni_str!("getBuffer"), - jni::jni_sig!("()Ljava/nio/ByteBuffer;"), - )?, - method_get_buffer_ret: ReturnType::Object, - method_get_current_block_length: env.get_method_id( - JNIString::new(Self::JVM_CLASS), - jni::jni_str!("getCurrentBlockLength"), - jni::jni_sig!("()I"), - )?, - method_get_current_block_length_ret: ReturnType::Primitive(Primitive::Int), - }) - } -} diff --git a/native/shuffle/Cargo.toml b/native/shuffle/Cargo.toml index 505879e319..59bccc84b7 100644 --- a/native/shuffle/Cargo.toml +++ b/native/shuffle/Cargo.toml @@ -41,15 +41,12 @@ datafusion-comet-jni-bridge = { workspace = true } datafusion-comet-spark-expr = { workspace = true } futures = { workspace = true } itertools = "0.14.0" -jni = "0.21" +jni = "0.22.4" log = "0.4" -lz4_flex = { version = "0.13.0", default-features = false, features = ["frame"] } # parquet is only used by the shuffle_bench binary (shuffle-bench feature) parquet = { workspace = true, optional = true } simd-adler32 = "0.3.9" -snap = "1.1" tokio = { version = "1", features = ["rt-multi-thread"] } -zstd = "0.13.3" [dev-dependencies] criterion = { version = "0.7", features = ["async", "async_tokio", "async_std"] } diff --git a/native/shuffle/benches/shuffle_writer.rs b/native/shuffle/benches/shuffle_writer.rs index 27abd919fa..6e4f1b8fc4 100644 --- a/native/shuffle/benches/shuffle_writer.rs +++ b/native/shuffle/benches/shuffle_writer.rs @@ -18,22 +18,19 @@ use arrow::array::builder::{Date32Builder, Decimal128Builder, Int32Builder}; use arrow::array::{builder::StringBuilder, Array, Int32Array, RecordBatch}; use arrow::datatypes::{DataType, Field, Schema}; +use arrow::ipc::writer::StreamWriter; use arrow::row::{RowConverter, SortField}; use criterion::{criterion_group, criterion_main, Criterion}; use datafusion::datasource::memory::MemorySourceConfig; use datafusion::datasource::source::DataSourceExec; use datafusion::physical_expr::expressions::{col, Column}; use datafusion::physical_expr::{LexOrdering, PhysicalSortExpr}; -use datafusion::physical_plan::metrics::Time; use datafusion::{ physical_plan::{common::collect, ExecutionPlan}, prelude::SessionContext, }; -use datafusion_comet_shuffle::{ - CometPartitioning, CompressionCodec, ShuffleBlockWriter, ShuffleWriterExec, -}; +use datafusion_comet_shuffle::{CometPartitioning, CompressionCodec, ShuffleWriterExec}; use itertools::Itertools; -use std::io::Cursor; use std::sync::Arc; use tokio::runtime::Runtime; @@ -43,20 +40,22 @@ fn criterion_benchmark(c: &mut Criterion) { for compression_codec in &[ CompressionCodec::None, CompressionCodec::Lz4Frame, - CompressionCodec::Snappy, CompressionCodec::Zstd(1), CompressionCodec::Zstd(6), ] { let name = format!("shuffle_writer: write encoded (compression={compression_codec:?})"); group.bench_function(name, |b| { - let mut buffer = vec![]; - let ipc_time = Time::default(); - let w = - ShuffleBlockWriter::try_new(&batch.schema(), compression_codec.clone()).unwrap(); + let write_options = compression_codec.ipc_write_options().unwrap(); b.iter(|| { - buffer.clear(); - let mut cursor = Cursor::new(&mut buffer); - w.write_batch(&batch, &mut cursor, &ipc_time).unwrap(); + let mut buffer = Vec::new(); + let mut writer = StreamWriter::try_new_with_options( + &mut buffer, + &batch.schema(), + write_options.clone(), + ) + .unwrap(); + writer.write(&batch).unwrap(); + writer.finish().unwrap(); }); }); } @@ -64,7 +63,6 @@ fn criterion_benchmark(c: &mut Criterion) { for compression_codec in [ CompressionCodec::None, CompressionCodec::Lz4Frame, - CompressionCodec::Snappy, CompressionCodec::Zstd(1), CompressionCodec::Zstd(6), ] { diff --git a/native/shuffle/src/bin/shuffle_bench.rs b/native/shuffle/src/bin/shuffle_bench.rs index bb8c2a0380..0f239d2fcb 100644 --- a/native/shuffle/src/bin/shuffle_bench.rs +++ b/native/shuffle/src/bin/shuffle_bench.rs @@ -15,8 +15,8 @@ // specific language governing permissions and limitations // under the License. -//! Standalone shuffle benchmark tool for profiling Comet shuffle write -//! performance outside of Spark. Streams input directly from Parquet files. +//! Standalone shuffle benchmark tool for profiling Comet shuffle write and read +//! outside of Spark. Streams input directly from Parquet files. //! //! # Usage //! @@ -24,18 +24,20 @@ //! cargo run --release --bin shuffle_bench -- \ //! --input /data/tpch-sf100/lineitem/ \ //! --partitions 200 \ -//! --codec lz4 \ -//! --hash-columns 0,3 +//! --codec zstd --zstd-level 1 \ +//! --hash-columns 0,3 \ +//! --read-back //! ``` //! //! Profile with flamegraph: //! ```sh //! cargo flamegraph --release --bin shuffle_bench -- \ //! --input /data/tpch-sf100/lineitem/ \ -//! --partitions 200 --codec lz4 +//! --partitions 200 --codec zstd --zstd-level 1 //! ``` use arrow::datatypes::{DataType, SchemaRef}; +use arrow::ipc::reader::StreamReader; use clap::Parser; use datafusion::execution::config::SessionConfig; use datafusion::execution::runtime_env::RuntimeEnvBuilder; @@ -55,7 +57,7 @@ use std::time::Instant; #[derive(Parser, Debug)] #[command( name = "shuffle_bench", - about = "Standalone benchmark for Comet shuffle write performance" + about = "Standalone benchmark for Comet shuffle write and read performance" )] struct Args { /// Path to input Parquet file or directory of Parquet files @@ -78,8 +80,8 @@ struct Args { #[arg(long, default_value = "0")] hash_columns: String, - /// Compression codec: none, lz4, zstd, snappy - #[arg(long, default_value = "lz4")] + /// Compression codec: none, lz4, zstd + #[arg(long, default_value = "zstd")] codec: String, /// Zstd compression level (1-22) @@ -90,6 +92,10 @@ struct Args { #[arg(long)] memory_limit: Option, + /// Also benchmark reading back the shuffle output + #[arg(long, default_value_t = false)] + read_back: bool, + /// Number of iterations to run #[arg(long, default_value_t = 1)] iterations: usize, @@ -103,7 +109,7 @@ struct Args { output_dir: PathBuf, /// Write buffer size in bytes - #[arg(long, default_value_t = 1048576)] + #[arg(long, default_value_t = 8192)] write_buffer_size: usize, /// Limit rows processed per iteration (0 = no limit) @@ -114,6 +120,11 @@ struct Args { /// Each task reads the same input and writes to its own output files. #[arg(long, default_value_t = 1)] concurrent_tasks: usize, + + /// Shuffle mode: 'immediate' writes IPC blocks per batch as they arrive, + /// 'buffered' buffers all rows before writing (original behavior). + #[arg(long, default_value = "immediate")] + mode: String, } fn main() { @@ -141,6 +152,7 @@ fn main() { println!("Partitioning: {}", args.partitioning); println!("Partitions: {}", args.partitions); println!("Codec: {:?}", codec); + println!("Mode: {}", args.mode); println!("Hash columns: {:?}", hash_col_indices); if let Some(mem_limit) = args.memory_limit { println!("Memory limit: {}", format_bytes(mem_limit)); @@ -156,6 +168,7 @@ fn main() { let total_iters = args.warmup + args.iterations; let mut write_times = Vec::with_capacity(args.iterations); + let mut read_times = Vec::with_capacity(args.iterations); let mut data_file_sizes = Vec::with_capacity(args.iterations); let mut last_metrics: Option = None; let mut last_input_metrics: Option = None; @@ -202,7 +215,22 @@ fn main() { print!(" output: {}", format_bytes(data_size as usize)); } + if args.read_back && args.concurrent_tasks <= 1 { + let read_elapsed = run_shuffle_read( + data_file.to_str().unwrap(), + index_file.to_str().unwrap(), + args.partitions, + ); + if !is_warmup { + read_times.push(read_elapsed); + } + print!(" read: {:.3}s", read_elapsed); + } println!(); + + // Remove output files after each iteration to avoid filling disk + let _ = fs::remove_file(&data_file); + let _ = fs::remove_file(&index_file); } if args.iterations > 0 { @@ -235,6 +263,24 @@ fn main() { ); } + if !read_times.is_empty() { + let avg_data_size = data_file_sizes.iter().sum::() / data_file_sizes.len() as u64; + let avg_read = read_times.iter().sum::() / read_times.len() as f64; + let read_throughput_bytes = avg_data_size as f64 / avg_read; + + println!("Read:"); + println!(" avg time: {:.3}s", avg_read); + if read_times.len() > 1 { + let min = read_times.iter().cloned().fold(f64::INFINITY, f64::min); + let max = read_times.iter().cloned().fold(f64::NEG_INFINITY, f64::max); + println!(" min/max: {:.3}s / {:.3}s", min, max); + } + println!( + " throughput: {}/s (from compressed)", + format_bytes(read_throughput_bytes as usize) + ); + } + if let Some(ref metrics) = last_input_metrics { println!(); println!("Input Metrics (last iteration):"); @@ -283,6 +329,15 @@ fn print_shuffle_metrics(metrics: &MetricsSet, total_wall_time_secs: f64) { if let Some(nanos) = get_metric("write_time") { println!(" write time: {}", fmt_time(nanos)); } + if let Some(nanos) = get_metric("interleave_time") { + println!(" interleave time: {}", fmt_time(nanos)); + } + if let Some(nanos) = get_metric("coalesce_time") { + println!(" coalesce time: {}", fmt_time(nanos)); + } + if let Some(nanos) = get_metric("memcopy_time") { + println!(" memcopy time: {}", fmt_time(nanos)); + } if let Some(spill_count) = get_metric("spill_count") { if spill_count > 0 { @@ -413,6 +468,7 @@ fn run_shuffle_write( args.limit, data_file.to_string(), index_file.to_string(), + args.mode == "immediate", ) .await .unwrap(); @@ -436,6 +492,7 @@ async fn execute_shuffle_write( limit: usize, data_file: String, index_file: String, + immediate_mode: bool, ) -> datafusion::common::Result<(MetricsSet, MetricsSet)> { let config = SessionConfig::new().with_batch_size(batch_size); let mut runtime_builder = RuntimeEnvBuilder::new(); @@ -477,6 +534,7 @@ async fn execute_shuffle_write( index_file, false, write_buffer_size, + immediate_mode, ) .expect("Failed to create ShuffleWriterExec"); @@ -541,6 +599,7 @@ fn run_concurrent_shuffle_writes( let memory_limit = args.memory_limit; let write_buffer_size = args.write_buffer_size; let limit = args.limit; + let immediate_mode = args.mode == "immediate"; handles.push(tokio::spawn(async move { execute_shuffle_write( @@ -553,6 +612,7 @@ fn run_concurrent_shuffle_writes( limit, data_file, index_file, + immediate_mode, ) .await .unwrap() @@ -572,6 +632,52 @@ fn run_concurrent_shuffle_writes( }) } +fn run_shuffle_read(data_file: &str, index_file: &str, num_partitions: usize) -> f64 { + let start = Instant::now(); + + let index_bytes = fs::read(index_file).expect("Failed to read index file"); + let num_offsets = index_bytes.len() / 8; + let offsets: Vec = (0..num_offsets) + .map(|i| { + let bytes: [u8; 8] = index_bytes[i * 8..(i + 1) * 8].try_into().unwrap(); + i64::from_le_bytes(bytes) + }) + .collect(); + + let data_bytes = fs::read(data_file).expect("Failed to read data file"); + + let mut total_rows = 0usize; + let mut total_batches = 0usize; + + for p in 0..num_partitions.min(offsets.len().saturating_sub(1)) { + let start_offset = offsets[p] as usize; + let end_offset = offsets[p + 1] as usize; + + if start_offset >= end_offset { + continue; + } + + // Each partition's data contains one or more complete IPC streams + let partition_data = &data_bytes[start_offset..end_offset]; + let mut reader = + StreamReader::try_new(partition_data, None).expect("Failed to open IPC stream"); + while let Some(batch_result) = reader.next() { + let batch = batch_result.expect("Failed to decode record batch"); + total_rows += batch.num_rows(); + total_batches += 1; + } + } + + let elapsed = start.elapsed().as_secs_f64(); + eprintln!( + " read back {} rows in {} batches from {} partitions", + format_number(total_rows), + total_batches, + num_partitions + ); + elapsed +} + fn build_partitioning( scheme: &str, num_partitions: usize, @@ -604,7 +710,6 @@ fn parse_codec(codec: &str, zstd_level: i32) -> CompressionCodec { "none" => CompressionCodec::None, "lz4" => CompressionCodec::Lz4Frame, "zstd" => CompressionCodec::Zstd(zstd_level), - "snappy" => CompressionCodec::Snappy, other => { eprintln!("Unknown codec: {other}. Using zstd."); CompressionCodec::Zstd(zstd_level) diff --git a/native/shuffle/src/ipc.rs b/native/shuffle/src/ipc.rs index 81ee41332a..fd07743339 100644 --- a/native/shuffle/src/ipc.rs +++ b/native/shuffle/src/ipc.rs @@ -17,36 +17,247 @@ use arrow::array::RecordBatch; use arrow::ipc::reader::StreamReader; -use datafusion::common::DataFusionError; -use datafusion::error::Result; - -pub fn read_ipc_compressed(bytes: &[u8]) -> Result { - match &bytes[0..4] { - b"SNAP" => { - let decoder = snap::read::FrameDecoder::new(&bytes[4..]); - let mut reader = - unsafe { StreamReader::try_new(decoder, None)?.with_skip_validation(true) }; - reader.next().unwrap().map_err(|e| e.into()) +use jni::objects::{Global, JObject, JPrimitiveArray, JValue}; +use jni::{jni_sig, jni_str, JavaVM}; +use std::io::Read; + +/// Size of the internal read-ahead buffer (64 KB). +const READ_AHEAD_BUF_SIZE: usize = 64 * 1024; + +/// A Rust `Read` implementation that pulls bytes from a JVM `java.io.InputStream` +/// via JNI callbacks, using an internal read-ahead buffer to minimize JNI crossings. +pub struct JniInputStream { + /// Handle to the JVM for attaching threads. + vm: JavaVM, + /// Global reference to the JVM InputStream object. + input_stream: Global>, + /// Global reference to the JVM byte[] used for bulk reads. + jbuf: Global>, + /// Internal Rust-side buffer holding bytes read from JVM. + buf: Vec, + /// Current read position within `buf`. + pos: usize, + /// Number of valid bytes in `buf`. + len: usize, +} + +impl JniInputStream { + /// Create a new `JniInputStream` wrapping a JVM InputStream. + pub fn new(env: &mut jni::Env, input_stream: &JObject) -> jni::errors::Result { + let vm = env.get_java_vm()?; + let input_stream = env.new_global_ref(input_stream)?; + let jbuf_local = env.new_byte_array(READ_AHEAD_BUF_SIZE)?; + let jbuf = env.new_global_ref(&jbuf_local)?; + Ok(Self { + vm, + input_stream, + jbuf, + buf: vec![0u8; READ_AHEAD_BUF_SIZE], + pos: 0, + len: 0, + }) + } + + /// Refill the internal buffer by calling `InputStream.read(byte[], 0, len)` via JNI. + fn refill(&mut self) -> std::io::Result { + let vm = &self.vm; + let input_stream = &self.input_stream; + let jbuf = &self.jbuf; + let buf = &mut self.buf; + + let n: i32 = vm + .attach_current_thread(|env| -> jni::errors::Result { + let n = env + .call_method( + input_stream, + jni_str!("read"), + jni_sig!("([BII)I"), + &[ + JValue::Object(jbuf.as_obj()), + JValue::Int(0), + JValue::Int(READ_AHEAD_BUF_SIZE as i32), + ], + )? + .i()?; + + if n > 0 { + let n_usize = n as usize; + // Copy bytes from JVM byte[] into our Rust buffer. + // jbyte is i8; we read into a temporary i8 slice then reinterpret as u8. + let mut i8_buf = vec![0i8; n_usize]; + jbuf.get_region(env, 0, &mut i8_buf)?; + + let src = unsafe { + std::slice::from_raw_parts(i8_buf.as_ptr() as *const u8, n_usize) + }; + buf[..n_usize].copy_from_slice(src); + } + + Ok(n) + }) + .map_err(|e| std::io::Error::other(e.to_string()))?; + + if n <= 0 { + // -1 means end of stream + self.pos = 0; + self.len = 0; + return Ok(0); } - b"LZ4_" => { - let decoder = lz4_flex::frame::FrameDecoder::new(&bytes[4..]); - let mut reader = - unsafe { StreamReader::try_new(decoder, None)?.with_skip_validation(true) }; - reader.next().unwrap().map_err(|e| e.into()) + + let n = n as usize; + self.pos = 0; + self.len = n; + + Ok(n) + } +} + +impl Read for JniInputStream { + fn read(&mut self, out: &mut [u8]) -> std::io::Result { + if self.pos >= self.len { + // Buffer is empty, refill + let filled = self.refill()?; + if filled == 0 { + return Ok(0); // EOF + } + } + + let available = self.len - self.pos; + let to_copy = available.min(out.len()); + out[..to_copy].copy_from_slice(&self.buf[self.pos..self.pos + to_copy]); + self.pos += to_copy; + Ok(to_copy) + } +} + +/// A wrapper around `JniInputStream` that allows `StreamReader` to borrow +/// it while still being able to create new `StreamReader` instances for +/// concatenated IPC streams. +/// +/// Uses a raw pointer to the `JniInputStream` stored in a `Box` so that +/// the `StreamReader` can take a `Read` impl without taking ownership. +struct SharedJniStream { + inner: *mut JniInputStream, +} + +impl SharedJniStream { + fn new(stream: JniInputStream) -> Self { + Self { + inner: Box::into_raw(Box::new(stream)), } - b"ZSTD" => { - let decoder = zstd::Decoder::new(&bytes[4..])?; - let mut reader = - unsafe { StreamReader::try_new(decoder, None)?.with_skip_validation(true) }; - reader.next().unwrap().map_err(|e| e.into()) + } + + /// Create a Read adapter that delegates to the inner stream. + fn reader(&self) -> StreamReadAdapter { + StreamReadAdapter { inner: self.inner } + } +} + +impl Drop for SharedJniStream { + fn drop(&mut self) { + unsafe { drop(Box::from_raw(self.inner)) }; + } +} + +// SAFETY: SharedJniStream owns the JniInputStream exclusively via a raw pointer. +// It is only accessed from a single thread at a time (the JNI thread that calls +// get_next_batch). The raw pointer is used to allow multiple sequential StreamReader +// instances to borrow the same underlying stream. +unsafe impl Send for SharedJniStream {} +unsafe impl Sync for SharedJniStream {} + +// SAFETY: StreamReadAdapter borrows from the same raw pointer as SharedJniStream. +// Same single-threaded access guarantees apply. +unsafe impl Send for StreamReadAdapter {} +unsafe impl Sync for StreamReadAdapter {} + +/// A Read adapter that delegates to a raw pointer to JniInputStream. +/// Multiple StreamReader instances can be created from this adapter +/// (sequentially, not concurrently). +struct StreamReadAdapter { + inner: *mut JniInputStream, +} + +impl Read for StreamReadAdapter { + fn read(&mut self, buf: &mut [u8]) -> std::io::Result { + unsafe { (*self.inner).read(buf) } + } +} + +/// Manages reading potentially concatenated Arrow IPC streams from a JVM +/// InputStream. A single partition's data may contain multiple IPC streams +/// (e.g., from spills), so when one stream reaches EOS we attempt to open +/// the next one from the same underlying InputStream. +pub struct ShuffleStreamReader { + /// Shared ownership of the JniInputStream. + jni_stream: SharedJniStream, + /// Current Arrow IPC stream reader. `None` when all streams are exhausted. + reader: Option>, + num_fields: usize, +} + +impl ShuffleStreamReader { + /// Create a new `ShuffleStreamReader` over a JVM InputStream. + /// Returns a reader that yields no batches if the stream is empty. + pub fn new(env: &mut jni::Env, input_stream: &JObject) -> Result { + let jni_stream = SharedJniStream::new( + JniInputStream::new(env, input_stream).map_err(|e| format!("JNI error: {e}"))?, + ); + match StreamReader::try_new(jni_stream.reader(), None) { + Ok(reader) => { + let reader = unsafe { reader.with_skip_validation(true) }; + let num_fields = reader.schema().fields().len(); + Ok(Self { + jni_stream, + reader: Some(reader), + num_fields, + }) + } + Err(_) => { + // Empty stream — no data for this partition + Ok(Self { + jni_stream, + reader: None, + num_fields: 0, + }) + } } - b"NONE" => { - let mut reader = - unsafe { StreamReader::try_new(&bytes[4..], None)?.with_skip_validation(true) }; - reader.next().unwrap().map_err(|e| e.into()) + } + + /// Read the next batch from the stream. Returns `None` when all + /// concatenated IPC streams are exhausted. + pub fn next_batch(&mut self) -> Result, String> { + loop { + let reader = match &mut self.reader { + Some(r) => r, + None => return Ok(None), + }; + + match reader.next() { + Some(Ok(batch)) => return Ok(Some(batch)), + Some(Err(e)) => return Err(format!("Arrow IPC read error: {e}")), + None => { + // Current IPC stream exhausted. Drop the old reader and try + // to open the next concatenated stream. + self.reader = None; + + match StreamReader::try_new(self.jni_stream.reader(), None) { + Ok(new_reader) => { + self.reader = Some(unsafe { new_reader.with_skip_validation(true) }); + // Loop back to read from the new reader + } + Err(_) => { + // No more streams — the InputStream is exhausted + return Ok(None); + } + } + } + } } - other => Err(DataFusionError::Execution(format!( - "Failed to decode batch: invalid compression codec: {other:?}" - ))), + } + + /// Return the number of fields in the stream's schema. + pub fn num_fields(&self) -> usize { + self.num_fields } } diff --git a/native/shuffle/src/lib.rs b/native/shuffle/src/lib.rs index dd3b900272..1c31bda5ef 100644 --- a/native/shuffle/src/lib.rs +++ b/native/shuffle/src/lib.rs @@ -25,6 +25,6 @@ pub mod spark_unsafe; pub(crate) mod writers; pub use comet_partitioning::CometPartitioning; -pub use ipc::read_ipc_compressed; +pub use ipc::{JniInputStream, ShuffleStreamReader}; pub use shuffle_writer::ShuffleWriterExec; -pub use writers::{CompressionCodec, ShuffleBlockWriter}; +pub use writers::CompressionCodec; diff --git a/native/shuffle/src/partitioners/empty_schema.rs b/native/shuffle/src/partitioners/empty_schema.rs index 45decfec05..00699b4b05 100644 --- a/native/shuffle/src/partitioners/empty_schema.rs +++ b/native/shuffle/src/partitioners/empty_schema.rs @@ -17,9 +17,9 @@ use crate::metrics::ShufflePartitionerMetrics; use crate::partitioners::ShufflePartitioner; -use crate::ShuffleBlockWriter; use arrow::array::RecordBatch; use arrow::datatypes::SchemaRef; +use arrow::ipc::writer::{IpcWriteOptions, StreamWriter}; use datafusion::common::DataFusionError; use std::fs::OpenOptions; use std::io::{BufWriter, Seek, Write}; @@ -33,7 +33,7 @@ pub(crate) struct EmptySchemaShufflePartitioner { output_data_file: String, output_index_file: String, schema: SchemaRef, - shuffle_block_writer: ShuffleBlockWriter, + write_options: IpcWriteOptions, num_output_partitions: usize, total_rows: usize, metrics: ShufflePartitionerMetrics, @@ -52,12 +52,12 @@ impl EmptySchemaShufflePartitioner { schema.fields().is_empty(), "EmptySchemaShufflePartitioner requires a zero-column schema" ); - let shuffle_block_writer = ShuffleBlockWriter::try_new(schema.as_ref(), codec)?; + let write_options = codec.ipc_write_options()?; Ok(Self { output_data_file, output_index_file, schema, - shuffle_block_writer, + write_options, num_output_partitions, total_rows: 0, metrics, @@ -100,11 +100,15 @@ impl ShufflePartitioner for EmptySchemaShufflePartitioner { vec![], &arrow::array::RecordBatchOptions::new().with_row_count(Some(self.total_rows)), )?; - self.shuffle_block_writer.write_batch( - &batch, + let mut encode_timer = self.metrics.encode_time.timer(); + let mut writer = StreamWriter::try_new_with_options( &mut output_data, - &self.metrics.encode_time, + &self.schema, + self.write_options.clone(), )?; + writer.write(&batch)?; + writer.finish()?; + encode_timer.stop(); } let mut write_timer = self.metrics.write_time.timer(); diff --git a/native/shuffle/src/partitioners/multi_partition.rs b/native/shuffle/src/partitioners/multi_partition.rs index 7de9314f54..68367e95d1 100644 --- a/native/shuffle/src/partitioners/multi_partition.rs +++ b/native/shuffle/src/partitioners/multi_partition.rs @@ -21,9 +21,10 @@ use crate::partitioners::partitioned_batch_iterator::{ }; use crate::partitioners::ShufflePartitioner; use crate::writers::{BufBatchWriter, PartitionWriter}; -use crate::{comet_partitioning, CometPartitioning, CompressionCodec, ShuffleBlockWriter}; +use crate::{comet_partitioning, CometPartitioning, CompressionCodec}; use arrow::array::{ArrayRef, RecordBatch}; use arrow::datatypes::SchemaRef; +use arrow::ipc::writer::IpcWriteOptions; use datafusion::common::utils::proxy::VecAllocExt; use datafusion::common::DataFusionError; use datafusion::execution::memory_pool::{MemoryConsumer, MemoryReservation}; @@ -111,7 +112,10 @@ pub(crate) struct MultiPartitionShuffleRepartitioner { buffered_batches: Vec, partition_indices: Vec>, partition_writers: Vec, - shuffle_block_writer: ShuffleBlockWriter, + /// Schema of the input data + schema: SchemaRef, + /// IPC write options (includes compression settings) + write_options: IpcWriteOptions, /// Partitioning scheme to use partitioning: CometPartitioning, runtime: Arc, @@ -123,7 +127,6 @@ pub(crate) struct MultiPartitionShuffleRepartitioner { /// Reservation for repartitioning reservation: MemoryReservation, tracing_enabled: bool, - /// Size of the write buffer in bytes write_buffer_size: usize, } @@ -165,10 +168,10 @@ impl MultiPartitionShuffleRepartitioner { partition_starts: vec![0; num_output_partitions + 1], }; - let shuffle_block_writer = ShuffleBlockWriter::try_new(schema.as_ref(), codec.clone())?; + let write_options = codec.ipc_write_options()?; let partition_writers = (0..num_output_partitions) - .map(|_| PartitionWriter::try_new(shuffle_block_writer.clone())) + .map(|_| PartitionWriter::try_new(Arc::clone(&schema), write_options.clone())) .collect::>>()?; let reservation = MemoryConsumer::new(format!("ShuffleRepartitioner[{partition}]")) @@ -181,7 +184,8 @@ impl MultiPartitionShuffleRepartitioner { buffered_batches: vec![], partition_indices: vec![vec![]; num_output_partitions], partition_writers, - shuffle_block_writer, + schema: Arc::clone(&schema), + write_options, partitioning, runtime, metrics, @@ -436,24 +440,31 @@ impl MultiPartitionShuffleRepartitioner { fn shuffle_write_partition( partition_iter: &mut PartitionedBatchIterator, - shuffle_block_writer: &mut ShuffleBlockWriter, + schema: &SchemaRef, + write_options: &IpcWriteOptions, output_data: &mut BufWriter, encode_time: &Time, - write_time: &Time, - write_buffer_size: usize, batch_size: usize, ) -> datafusion::common::Result<()> { - let mut buf_batch_writer = BufBatchWriter::new( - shuffle_block_writer, + // Only create the IPC stream writer when there's data to write. + // Empty partitions must have zero bytes so that Spark's MapOutputTracker + // reports zero-size blocks, which affects coalesce partition grouping. + let first_batch = match partition_iter.next() { + Some(batch) => batch?, + None => return Ok(()), + }; + let mut buf_batch_writer = BufBatchWriter::try_new( output_data, - write_buffer_size, + Arc::clone(schema), + write_options.clone(), batch_size, - ); + )?; + buf_batch_writer.write(&first_batch, encode_time)?; for batch in partition_iter { let batch = batch?; - buf_batch_writer.write(&batch, encode_time, write_time)?; + buf_batch_writer.write(&batch, encode_time)?; } - buf_batch_writer.flush(encode_time, write_time)?; + buf_batch_writer.flush(encode_time)?; Ok(()) } @@ -507,13 +518,7 @@ impl MultiPartitionShuffleRepartitioner { for partition_id in 0..num_output_partitions { let partition_writer = &mut self.partition_writers[partition_id]; let mut iter = partitioned_batches.produce(partition_id); - spilled_bytes += partition_writer.spill( - &mut iter, - &self.runtime, - &self.metrics, - self.write_buffer_size, - self.batch_size, - )?; + spilled_bytes += partition_writer.spill(&mut iter, &self.runtime, &self.metrics)?; } self.reservation.free(); @@ -573,7 +578,7 @@ impl ShufflePartitioner for MultiPartitionShuffleRepartitioner { .open(data_file) .map_err(|e| DataFusionError::Execution(format!("shuffle write error: {e:?}")))?; - let mut output_data = BufWriter::new(output_data); + let mut output_data = BufWriter::with_capacity(self.write_buffer_size, output_data); #[allow(clippy::needless_range_loop)] for i in 0..num_output_partitions { @@ -594,11 +599,10 @@ impl ShufflePartitioner for MultiPartitionShuffleRepartitioner { let mut partition_iter = partitioned_batches.produce(i); Self::shuffle_write_partition( &mut partition_iter, - &mut self.shuffle_block_writer, + &self.schema, + &self.write_options, &mut output_data, &self.metrics.encode_time, - &self.metrics.write_time, - self.write_buffer_size, self.batch_size, )?; } diff --git a/native/shuffle/src/partitioners/single_partition.rs b/native/shuffle/src/partitioners/single_partition.rs index 5801ef613b..40478c57ca 100644 --- a/native/shuffle/src/partitioners/single_partition.rs +++ b/native/shuffle/src/partitioners/single_partition.rs @@ -18,7 +18,7 @@ use crate::metrics::ShufflePartitionerMetrics; use crate::partitioners::ShufflePartitioner; use crate::writers::BufBatchWriter; -use crate::{CompressionCodec, ShuffleBlockWriter}; +use crate::CompressionCodec; use arrow::array::RecordBatch; use arrow::datatypes::SchemaRef; use datafusion::common::DataFusionError; @@ -26,19 +26,15 @@ use std::fs::{File, OpenOptions}; use std::io::{BufWriter, Write}; use tokio::time::Instant; -/// A partitioner that writes all shuffle data to a single file and a single index file +/// A partitioner that writes all shuffle data to a single file and a single index file. +/// Uses a persistent Arrow IPC StreamWriter via BufBatchWriter, so the schema is written +/// once and batches are appended with built-in body compression. pub(crate) struct SinglePartitionShufflePartitioner { - // output_data_file: File, - output_data_writer: BufBatchWriter, + output_data_writer: BufBatchWriter>, + output_data_path: String, output_index_path: String, - /// Batches that are smaller than the batch size and to be concatenated - buffered_batches: Vec, - /// Number of rows in the concatenating batches - num_buffered_rows: usize, /// Metrics for the repartitioner metrics: ShufflePartitionerMetrics, - /// The configured batch size - batch_size: usize, } impl SinglePartitionShufflePartitioner { @@ -51,61 +47,25 @@ impl SinglePartitionShufflePartitioner { codec: CompressionCodec, write_buffer_size: usize, ) -> datafusion::common::Result { - let shuffle_block_writer = ShuffleBlockWriter::try_new(schema.as_ref(), codec.clone())?; + let write_options = codec.ipc_write_options()?; let output_data_file = OpenOptions::new() .write(true) .create(true) .truncate(true) - .open(output_data_path)?; + .open(&output_data_path)?; + let buffered_file = BufWriter::with_capacity(write_buffer_size, output_data_file); - let output_data_writer = BufBatchWriter::new( - shuffle_block_writer, - output_data_file, - write_buffer_size, - batch_size, - ); + let output_data_writer = + BufBatchWriter::try_new(buffered_file, schema, write_options, batch_size)?; Ok(Self { output_data_writer, + output_data_path, output_index_path, - buffered_batches: vec![], - num_buffered_rows: 0, metrics, - batch_size, }) } - - /// Add a batch to the buffer of the partitioner, these buffered batches will be concatenated - /// and written to the output data file when the number of rows in the buffer reaches the batch size. - fn add_buffered_batch(&mut self, batch: RecordBatch) { - self.num_buffered_rows += batch.num_rows(); - self.buffered_batches.push(batch); - } - - /// Consumes buffered batches and return a concatenated batch if successful - fn concat_buffered_batches(&mut self) -> datafusion::common::Result> { - if self.buffered_batches.is_empty() { - Ok(None) - } else if self.buffered_batches.len() == 1 { - let batch = self.buffered_batches.remove(0); - self.num_buffered_rows = 0; - Ok(Some(batch)) - } else { - let schema = &self.buffered_batches[0].schema(); - match arrow::compute::concat_batches(schema, self.buffered_batches.iter()) { - Ok(concatenated) => { - self.buffered_batches.clear(); - self.num_buffered_rows = 0; - Ok(Some(concatenated)) - } - Err(e) => Err(DataFusionError::ArrowError( - Box::from(e), - Some(DataFusionError::get_back_trace()), - )), - } - } - } } #[async_trait::async_trait] @@ -118,32 +78,8 @@ impl ShufflePartitioner for SinglePartitionShufflePartitioner { self.metrics.data_size.add(batch.get_array_memory_size()); self.metrics.baseline.record_output(num_rows); - if num_rows >= self.batch_size || num_rows + self.num_buffered_rows > self.batch_size { - let concatenated_batch = self.concat_buffered_batches()?; - - // Write the concatenated buffered batch - if let Some(batch) = concatenated_batch { - self.output_data_writer.write( - &batch, - &self.metrics.encode_time, - &self.metrics.write_time, - )?; - } - - if num_rows >= self.batch_size { - // Write the new batch - self.output_data_writer.write( - &batch, - &self.metrics.encode_time, - &self.metrics.write_time, - )?; - } else { - // Add the new batch to the buffer - self.add_buffered_batch(batch); - } - } else { - self.add_buffered_batch(batch); - } + self.output_data_writer + .write(&batch, &self.metrics.encode_time)?; } self.metrics.input_batches.add(1); @@ -156,32 +92,29 @@ impl ShufflePartitioner for SinglePartitionShufflePartitioner { fn shuffle_write(&mut self) -> datafusion::common::Result<()> { let start_time = Instant::now(); - let concatenated_batch = self.concat_buffered_batches()?; - - // Write the concatenated buffered batch - if let Some(batch) = concatenated_batch { - self.output_data_writer.write( - &batch, - &self.metrics.encode_time, - &self.metrics.write_time, - )?; - } - self.output_data_writer - .flush(&self.metrics.encode_time, &self.metrics.write_time)?; + + self.output_data_writer.flush(&self.metrics.encode_time)?; + + let mut write_timer = self.metrics.write_time.timer(); + + // Get data file length via filesystem metadata + let data_file_length = std::fs::metadata(&self.output_data_path) + .map(|m| m.len()) + .map_err(|e| DataFusionError::Execution(format!("shuffle write error: {e:?}")))?; // Write index file. It should only contain 2 entries: 0 and the total number of bytes written let index_file = OpenOptions::new() .write(true) .create(true) .truncate(true) - .open(self.output_index_path.clone()) + .open(&self.output_index_path) .map_err(|e| DataFusionError::Execution(format!("shuffle write error: {e:?}")))?; let mut index_buf_writer = BufWriter::new(index_file); - let data_file_length = self.output_data_writer.writer_stream_position()?; - for offset in [0, data_file_length] { + for offset in [0u64, data_file_length] { index_buf_writer.write_all(&(offset as i64).to_le_bytes()[..])?; } index_buf_writer.flush()?; + write_timer.stop(); self.metrics .baseline diff --git a/native/shuffle/src/shuffle_writer.rs b/native/shuffle/src/shuffle_writer.rs index 8502c79624..1e7dd1cff0 100644 --- a/native/shuffle/src/shuffle_writer.rs +++ b/native/shuffle/src/shuffle_writer.rs @@ -266,9 +266,9 @@ async fn external_shuffle( #[cfg(test)] mod test { use super::*; - use crate::{read_ipc_compressed, ShuffleBlockWriter}; use arrow::array::{Array, StringArray, StringBuilder}; use arrow::datatypes::{DataType, Field, Schema}; + use arrow::ipc::reader::StreamReader; use arrow::record_batch::RecordBatch; use arrow::row::{RowConverter, SortField}; use datafusion::datasource::memory::MemorySourceConfig; @@ -281,30 +281,36 @@ mod test { use datafusion::physical_plan::metrics::Time; use datafusion::prelude::SessionContext; use itertools::Itertools; - use std::io::Cursor; use tokio::runtime::Runtime; #[test] #[cfg_attr(miri, ignore)] // miri can't call foreign function `ZSTD_createCCtx` fn roundtrip_ipc() { + use crate::writers::BufBatchWriter; + let batch = create_batch(8192); for codec in &[ CompressionCodec::None, CompressionCodec::Zstd(1), - CompressionCodec::Snappy, CompressionCodec::Lz4Frame, ] { - let mut output = vec![]; - let mut cursor = Cursor::new(&mut output); - let writer = - ShuffleBlockWriter::try_new(batch.schema().as_ref(), codec.clone()).unwrap(); - let length = writer - .write_batch(&batch, &mut cursor, &Time::default()) - .unwrap(); - assert_eq!(length, output.len()); + let write_options = codec.ipc_write_options().unwrap(); + let mut output = Vec::new(); + let encode_time = Time::default(); + + { + let mut writer = + BufBatchWriter::try_new(&mut output, batch.schema(), write_options, 8192) + .unwrap(); + writer.write(&batch, &encode_time).unwrap(); + writer.flush(&encode_time).unwrap(); + } + + assert!(!output.is_empty()); - let ipc_without_length_prefix = &output[16..]; - let batch2 = read_ipc_compressed(ipc_without_length_prefix).unwrap(); + // Read back using standard Arrow StreamReader + let mut reader = StreamReader::try_new(&output[..], None).unwrap(); + let batch2 = reader.next().unwrap().unwrap(); assert_eq!(batch, batch2); } } @@ -343,6 +349,7 @@ mod test { } #[tokio::test] + #[cfg_attr(miri, ignore)] // miri can't call foreign function `ZSTD_createCCtx` async fn shuffle_partitioner_memory() { let batch = create_batch(900); assert_eq!(8316, batch.get_array_memory_size()); // Not stable across Arrow versions @@ -362,7 +369,7 @@ mod test { 1024, CompressionCodec::Lz4Frame, false, - 1024 * 1024, // write_buffer_size: 1MB default + 8 * 1024, // write_buffer_size: 8KB default ) .unwrap(); @@ -466,7 +473,7 @@ mod test { "/tmp/data.out".to_string(), "/tmp/index.out".to_string(), false, - 1024 * 1024, // write_buffer_size: 1MB default + 8 * 1024, // write_buffer_size: 8KB default ) .unwrap(); @@ -587,15 +594,17 @@ mod test { let _ = fs::remove_file("/tmp/rr_index_1.out"); } - /// Test that batch coalescing in BufBatchWriter reduces output size by - /// writing fewer, larger IPC blocks instead of many small ones. + /// Test that batch coalescing in BufBatchWriter produces correct output. + /// With the new persistent StreamWriter format, schema is written once per stream + /// regardless of coalescing, but coalescing still reduces the number of record batch + /// messages in the stream. #[test] #[cfg_attr(miri, ignore)] - fn test_batch_coalescing_reduces_size() { + fn test_batch_coalescing_correct_output() { use crate::writers::BufBatchWriter; use arrow::array::Int32Array; - // Create a wide schema to amplify per-block schema overhead + // Create a wide schema to amplify per-batch message overhead let fields: Vec = (0..20) .map(|i| Field::new(format!("col_{i}"), DataType::Int32, false)) .collect(); @@ -617,52 +626,44 @@ mod test { .collect(); let codec = CompressionCodec::Lz4Frame; + let write_options = codec.ipc_write_options().unwrap(); let encode_time = Time::default(); - let write_time = Time::default(); // Write with coalescing (batch_size=8192) let mut coalesced_output = Vec::new(); { - let mut writer = ShuffleBlockWriter::try_new(schema.as_ref(), codec.clone()).unwrap(); - let mut buf_writer = BufBatchWriter::new( - &mut writer, - Cursor::new(&mut coalesced_output), - 1024 * 1024, + let mut buf_writer = BufBatchWriter::try_new( + &mut coalesced_output, + Arc::clone(&schema), + write_options.clone(), 8192, - ); + ) + .unwrap(); for batch in &small_batches { - buf_writer.write(batch, &encode_time, &write_time).unwrap(); + buf_writer.write(batch, &encode_time).unwrap(); } - buf_writer.flush(&encode_time, &write_time).unwrap(); + buf_writer.flush(&encode_time).unwrap(); } // Write without coalescing (batch_size=1) let mut uncoalesced_output = Vec::new(); { - let mut writer = ShuffleBlockWriter::try_new(schema.as_ref(), codec.clone()).unwrap(); - let mut buf_writer = BufBatchWriter::new( - &mut writer, - Cursor::new(&mut uncoalesced_output), - 1024 * 1024, + let mut buf_writer = BufBatchWriter::try_new( + &mut uncoalesced_output, + Arc::clone(&schema), + write_options, 1, - ); + ) + .unwrap(); for batch in &small_batches { - buf_writer.write(batch, &encode_time, &write_time).unwrap(); + buf_writer.write(batch, &encode_time).unwrap(); } - buf_writer.flush(&encode_time, &write_time).unwrap(); + buf_writer.flush(&encode_time).unwrap(); } - // Coalesced output should be smaller due to fewer IPC schema blocks - assert!( - coalesced_output.len() < uncoalesced_output.len(), - "Coalesced output ({} bytes) should be smaller than uncoalesced ({} bytes)", - coalesced_output.len(), - uncoalesced_output.len() - ); - - // Verify both roundtrip correctly by reading all IPC blocks - let coalesced_rows = read_all_ipc_blocks(&coalesced_output); - let uncoalesced_rows = read_all_ipc_blocks(&uncoalesced_output); + // Verify both roundtrip correctly by reading all batches via StreamReader + let coalesced_rows = read_all_ipc_stream_rows(&coalesced_output); + let uncoalesced_rows = read_all_ipc_stream_rows(&uncoalesced_output); assert_eq!( coalesced_rows, 5000, "Coalesced should contain all 5000 rows" @@ -673,24 +674,12 @@ mod test { ); } - /// Read all IPC blocks from a byte buffer written by BufBatchWriter/ShuffleBlockWriter, - /// returning the total number of rows. - fn read_all_ipc_blocks(data: &[u8]) -> usize { - let mut offset = 0; + /// Read all record batches from an Arrow IPC stream, returning total row count. + fn read_all_ipc_stream_rows(data: &[u8]) -> usize { + let reader = StreamReader::try_new(data, None).unwrap(); let mut total_rows = 0; - while offset < data.len() { - // First 8 bytes are the IPC length (little-endian u64) - let ipc_length = - u64::from_le_bytes(data[offset..offset + 8].try_into().unwrap()) as usize; - // Skip the 8-byte length prefix; the next 8 bytes are field_count + codec header - let block_start = offset + 8; - let block_end = block_start + ipc_length; - // read_ipc_compressed expects data starting after the 16-byte header - // (i.e., after length + field_count), at the codec tag - let ipc_data = &data[block_start + 8..block_end]; - let batch = read_ipc_compressed(ipc_data).unwrap(); - total_rows += batch.num_rows(); - offset = block_end; + for batch in reader { + total_rows += batch.unwrap().num_rows(); } total_rows } @@ -750,7 +739,9 @@ mod test { assert!(!data.is_empty(), "Data file should contain IPC data"); // Verify row count survives roundtrip - let total_rows = read_all_ipc_blocks(&data); + let cursor = std::io::Cursor::new(&data); + let reader = arrow::ipc::reader::StreamReader::try_new(cursor, None).unwrap(); + let total_rows: usize = reader.map(|b| b.unwrap().num_rows()).sum(); assert_eq!( total_rows, num_rows * num_batches, diff --git a/native/shuffle/src/spark_unsafe/row.rs b/native/shuffle/src/spark_unsafe/row.rs index 3c98677199..0accf61daf 100644 --- a/native/shuffle/src/spark_unsafe/row.rs +++ b/native/shuffle/src/spark_unsafe/row.rs @@ -23,7 +23,7 @@ use crate::spark_unsafe::{ map::{append_map_elements, get_map_key_value_fields}, }; use crate::writers::Checksum; -use crate::writers::ShuffleBlockWriter; +use crate::CompressionCodec; use arrow::array::{ builder::{ ArrayBuilder, BinaryBuilder, BinaryDictionaryBuilder, BooleanBuilder, Date32Builder, @@ -37,7 +37,6 @@ use arrow::array::{ use arrow::compute::cast; use arrow::datatypes::{DataType, Field, Schema, TimeUnit}; use arrow::error::ArrowError; -use datafusion::physical_plan::metrics::Time; use datafusion_comet_jni_bridge::errors::CometError; use jni::sys::{jint, jlong}; use std::{ @@ -197,7 +196,6 @@ macro_rules! get_field_builder { } // Expose the macro for other modules. -use crate::CompressionCodec; pub(crate) use downcast_builder_ref; /// Appends field of row to the given struct builder. `dt` is the data type of the field. @@ -1313,8 +1311,6 @@ pub fn process_sorted_row_partition( ) -> Result<(i64, Option), CometError> { // The current row number we are reading let mut current_row = 0; - // Total number of bytes written - let mut written = 0; // The current checksum value. This is updated incrementally in the following loop. let mut current_checksum = if checksum_enabled { Some(Checksum::try_new(checksum_algo, initial_checksum)?) @@ -1337,9 +1333,14 @@ pub fn process_sorted_row_partition( .append(true) .open(&output_path)?; - // Reusable buffer for serialized batch data + // Buffer that accumulates all IPC bytes across the single stream let mut frozen: Vec = Vec::new(); + // Build a schema from the first batch's datatypes so we can create the StreamWriter + // up front. We need a placeholder schema; we'll create it from the first batch. + let mut stream_writer: Option>> = None; + let write_options = codec.ipc_write_options()?; + while current_row < row_num { let n = std::cmp::min(batch_size, row_num - current_row); @@ -1368,22 +1369,33 @@ pub fn process_sorted_row_partition( .collect(); let batch = make_batch(array_refs?, n)?; - frozen.clear(); - let mut cursor = Cursor::new(&mut frozen); - - // we do not collect metrics in Native_writeSortedFileNative - let ipc_time = Time::default(); - let block_writer = ShuffleBlockWriter::try_new(batch.schema().as_ref(), codec.clone())?; - written += block_writer.write_batch(&batch, &mut cursor, &ipc_time)?; - - if let Some(checksum) = &mut current_checksum { - checksum.update(&mut cursor)?; + // Create the StreamWriter on the first batch (we need the schema) + if stream_writer.is_none() { + stream_writer = Some(arrow::ipc::writer::StreamWriter::try_new_with_options( + &mut frozen, + &batch.schema(), + write_options.clone(), + )?); } - output_data.write_all(&frozen)?; + stream_writer.as_mut().unwrap().write(&batch)?; current_row += n; } + // Finish the IPC stream and flush remaining bytes + if let Some(mut writer) = stream_writer { + writer.finish()?; + } + + let written = frozen.len(); + + if let Some(checksum) = &mut current_checksum { + let mut cursor = Cursor::new(&mut frozen); + checksum.update(&mut cursor)?; + } + + output_data.write_all(&frozen)?; + Ok((written as i64, current_checksum.map(|c| c.finalize()))) } diff --git a/native/shuffle/src/writers/buf_batch_writer.rs b/native/shuffle/src/writers/buf_batch_writer.rs index cfddb46539..0ca1b9a1d7 100644 --- a/native/shuffle/src/writers/buf_batch_writer.rs +++ b/native/shuffle/src/writers/buf_batch_writer.rs @@ -15,128 +15,65 @@ // specific language governing permissions and limitations // under the License. -use super::ShuffleBlockWriter; use arrow::array::RecordBatch; use arrow::compute::kernels::coalesce::BatchCoalescer; +use arrow::datatypes::SchemaRef; +use arrow::ipc::writer::{IpcWriteOptions, StreamWriter}; use datafusion::physical_plan::metrics::Time; -use std::borrow::Borrow; -use std::io::{Cursor, Seek, SeekFrom, Write}; +use std::io::Write; -/// Write batches to writer while using a buffer to avoid frequent system calls. -/// The record batches were first written by ShuffleBlockWriter into an internal buffer. -/// Once the buffer exceeds the max size, the buffer will be flushed to the writer. -/// -/// Small batches are coalesced using Arrow's [`BatchCoalescer`] before serialization, -/// producing exactly `batch_size`-row output batches to reduce per-block IPC schema overhead. -/// The coalescer is lazily initialized on the first write. -pub(crate) struct BufBatchWriter, W: Write> { - shuffle_block_writer: S, - writer: W, - buffer: Vec, - buffer_max_size: usize, +/// Writes batches to a persistent Arrow IPC `StreamWriter`. The schema is written once +/// when the writer is created. Small batches are coalesced via [`BatchCoalescer`] before +/// serialization, producing `batch_size`-row output batches. +pub(crate) struct BufBatchWriter { + writer: StreamWriter, /// Coalesces small batches into target_batch_size before serialization. - /// Lazily initialized on first write to capture the schema. - coalescer: Option, - /// Target batch size for coalescing - batch_size: usize, + coalescer: BatchCoalescer, } -impl, W: Write> BufBatchWriter { - pub(crate) fn new( - shuffle_block_writer: S, - writer: W, - buffer_max_size: usize, +impl BufBatchWriter { + pub(crate) fn try_new( + target: W, + schema: SchemaRef, + write_options: IpcWriteOptions, batch_size: usize, - ) -> Self { - Self { - shuffle_block_writer, - writer, - buffer: vec![], - buffer_max_size, - coalescer: None, - batch_size, - } + ) -> datafusion::common::Result { + let writer = StreamWriter::try_new_with_options(target, &schema, write_options)?; + let coalescer = BatchCoalescer::new(schema, batch_size); + Ok(Self { writer, coalescer }) } pub(crate) fn write( &mut self, batch: &RecordBatch, encode_time: &Time, - write_time: &Time, - ) -> datafusion::common::Result { - let coalescer = self - .coalescer - .get_or_insert_with(|| BatchCoalescer::new(batch.schema(), self.batch_size)); - coalescer.push_batch(batch.clone())?; + ) -> datafusion::common::Result<()> { + self.coalescer.push_batch(batch.clone())?; - // Drain completed batches into a local vec so the coalescer borrow ends - // before we call write_batch_to_buffer (which borrows &mut self). let mut completed = Vec::new(); - while let Some(batch) = coalescer.next_completed_batch() { + while let Some(batch) = self.coalescer.next_completed_batch() { completed.push(batch); } - let mut bytes_written = 0; for batch in &completed { - bytes_written += self.write_batch_to_buffer(batch, encode_time, write_time)?; + let mut timer = encode_time.timer(); + self.writer.write(batch)?; + timer.stop(); } - Ok(bytes_written) - } - - /// Serialize a single batch into the byte buffer, flushing to the writer if needed. - fn write_batch_to_buffer( - &mut self, - batch: &RecordBatch, - encode_time: &Time, - write_time: &Time, - ) -> datafusion::common::Result { - let mut cursor = Cursor::new(&mut self.buffer); - cursor.seek(SeekFrom::End(0))?; - let bytes_written = - self.shuffle_block_writer - .borrow() - .write_batch(batch, &mut cursor, encode_time)?; - let pos = cursor.position(); - if pos >= self.buffer_max_size as u64 { - let mut write_timer = write_time.timer(); - self.writer.write_all(&self.buffer)?; - write_timer.stop(); - self.buffer.clear(); - } - Ok(bytes_written) + Ok(()) } - pub(crate) fn flush( - &mut self, - encode_time: &Time, - write_time: &Time, - ) -> datafusion::common::Result<()> { + pub(crate) fn flush(&mut self, encode_time: &Time) -> datafusion::common::Result<()> { // Finish any remaining buffered rows in the coalescer - let mut remaining = Vec::new(); - if let Some(coalescer) = &mut self.coalescer { - coalescer.finish_buffered_batch()?; - while let Some(batch) = coalescer.next_completed_batch() { - remaining.push(batch); - } - } - for batch in &remaining { - self.write_batch_to_buffer(batch, encode_time, write_time)?; + self.coalescer.finish_buffered_batch()?; + while let Some(batch) = self.coalescer.next_completed_batch() { + let mut timer = encode_time.timer(); + self.writer.write(&batch)?; + timer.stop(); } - // Flush the byte buffer to the underlying writer - let mut write_timer = write_time.timer(); - if !self.buffer.is_empty() { - self.writer.write_all(&self.buffer)?; - } - self.writer.flush()?; - write_timer.stop(); - self.buffer.clear(); + // Finish the IPC stream (writes the end-of-stream marker) + self.writer.finish()?; Ok(()) } } - -impl, W: Write + Seek> BufBatchWriter { - pub(crate) fn writer_stream_position(&mut self) -> datafusion::common::Result { - self.writer.stream_position().map_err(Into::into) - } -} diff --git a/native/shuffle/src/writers/codec.rs b/native/shuffle/src/writers/codec.rs new file mode 100644 index 0000000000..5e6dc88772 --- /dev/null +++ b/native/shuffle/src/writers/codec.rs @@ -0,0 +1,49 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow::ipc::writer::IpcWriteOptions; +use arrow::ipc::CompressionType; + +/// Compression algorithm applied to shuffle IPC streams and Parquet output. +#[derive(Debug, Clone)] +pub enum CompressionCodec { + None, + Lz4Frame, + Zstd(i32), + /// Snappy is only used for Parquet output, not for shuffle IPC. + Snappy, +} + +impl CompressionCodec { + pub fn ipc_write_options(&self) -> datafusion::error::Result { + let compression = match self { + CompressionCodec::None => None, + CompressionCodec::Lz4Frame => Some(CompressionType::LZ4_FRAME), + CompressionCodec::Zstd(_) => Some(CompressionType::ZSTD), + CompressionCodec::Snappy => { + return Err(datafusion::common::DataFusionError::Execution( + "Snappy is not supported for Arrow IPC compression".to_string(), + )); + } + }; + let options = IpcWriteOptions::try_new(8, false, arrow::ipc::MetadataVersion::V5) + .map_err(|e| datafusion::common::DataFusionError::ArrowError(Box::from(e), None))?; + options + .try_with_compression(compression) + .map_err(|e| datafusion::common::DataFusionError::ArrowError(Box::from(e), None)) + } +} diff --git a/native/shuffle/src/writers/mod.rs b/native/shuffle/src/writers/mod.rs index 75caf9f3a3..ed57562856 100644 --- a/native/shuffle/src/writers/mod.rs +++ b/native/shuffle/src/writers/mod.rs @@ -17,10 +17,10 @@ mod buf_batch_writer; mod checksum; -mod shuffle_block_writer; +mod codec; mod spill; pub(crate) use buf_batch_writer::BufBatchWriter; pub(crate) use checksum::Checksum; -pub use shuffle_block_writer::{CompressionCodec, ShuffleBlockWriter}; +pub use codec::CompressionCodec; pub(crate) use spill::PartitionWriter; diff --git a/native/shuffle/src/writers/shuffle_block_writer.rs b/native/shuffle/src/writers/shuffle_block_writer.rs deleted file mode 100644 index 5ed5330e3a..0000000000 --- a/native/shuffle/src/writers/shuffle_block_writer.rs +++ /dev/null @@ -1,146 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use arrow::array::RecordBatch; -use arrow::datatypes::Schema; -use arrow::ipc::writer::StreamWriter; -use datafusion::common::DataFusionError; -use datafusion::error::Result; -use datafusion::physical_plan::metrics::Time; -use std::io::{Cursor, Seek, SeekFrom, Write}; - -/// Compression algorithm applied to shuffle IPC blocks. -#[derive(Debug, Clone)] -pub enum CompressionCodec { - None, - Lz4Frame, - Zstd(i32), - Snappy, -} - -/// Writes a record batch as a length-prefixed, compressed Arrow IPC block. -#[derive(Clone)] -pub struct ShuffleBlockWriter { - codec: CompressionCodec, - header_bytes: Vec, -} - -impl ShuffleBlockWriter { - pub fn try_new(schema: &Schema, codec: CompressionCodec) -> Result { - let header_bytes = Vec::with_capacity(20); - let mut cursor = Cursor::new(header_bytes); - - // leave space for compressed message length - cursor.seek_relative(8)?; - - // write number of columns because JVM side needs to know how many addresses to allocate - let field_count = schema.fields().len(); - cursor.write_all(&field_count.to_le_bytes())?; - - // write compression codec to header - let codec_header = match &codec { - CompressionCodec::Snappy => b"SNAP", - CompressionCodec::Lz4Frame => b"LZ4_", - CompressionCodec::Zstd(_) => b"ZSTD", - CompressionCodec::None => b"NONE", - }; - cursor.write_all(codec_header)?; - - let header_bytes = cursor.into_inner(); - - Ok(Self { - codec, - header_bytes, - }) - } - - /// Writes given record batch as Arrow IPC bytes into given writer. - /// Returns number of bytes written. - pub fn write_batch( - &self, - batch: &RecordBatch, - output: &mut W, - ipc_time: &Time, - ) -> Result { - if batch.num_rows() == 0 { - return Ok(0); - } - - let mut timer = ipc_time.timer(); - let start_pos = output.stream_position()?; - - // write header - output.write_all(&self.header_bytes)?; - - let output = match &self.codec { - CompressionCodec::None => { - let mut arrow_writer = StreamWriter::try_new(output, &batch.schema())?; - arrow_writer.write(batch)?; - arrow_writer.finish()?; - arrow_writer.into_inner()? - } - CompressionCodec::Lz4Frame => { - let mut wtr = lz4_flex::frame::FrameEncoder::new(output); - let mut arrow_writer = StreamWriter::try_new(&mut wtr, &batch.schema())?; - arrow_writer.write(batch)?; - arrow_writer.finish()?; - wtr.finish().map_err(|e| { - DataFusionError::Execution(format!("lz4 compression error: {e}")) - })? - } - - CompressionCodec::Zstd(level) => { - let encoder = zstd::Encoder::new(output, *level)?; - let mut arrow_writer = StreamWriter::try_new(encoder, &batch.schema())?; - arrow_writer.write(batch)?; - arrow_writer.finish()?; - let zstd_encoder = arrow_writer.into_inner()?; - zstd_encoder.finish()? - } - - CompressionCodec::Snappy => { - let mut wtr = snap::write::FrameEncoder::new(output); - let mut arrow_writer = StreamWriter::try_new(&mut wtr, &batch.schema())?; - arrow_writer.write(batch)?; - arrow_writer.finish()?; - wtr.into_inner().map_err(|e| { - DataFusionError::Execution(format!("snappy compression error: {e}")) - })? - } - }; - - // fill ipc length - let end_pos = output.stream_position()?; - let ipc_length = end_pos - start_pos - 8; - let max_size = i32::MAX as u64; - if ipc_length > max_size { - return Err(DataFusionError::Execution(format!( - "Shuffle block size {ipc_length} exceeds maximum size of {max_size}. \ - Try reducing batch size or increasing compression level" - ))); - } - - // fill ipc length - output.seek(SeekFrom::Start(start_pos))?; - output.write_all(&ipc_length.to_le_bytes())?; - output.seek(SeekFrom::Start(end_pos))?; - - timer.stop(); - - Ok((end_pos - start_pos) as usize) - } -} diff --git a/native/shuffle/src/writers/spill.rs b/native/shuffle/src/writers/spill.rs index c16caddbf9..c6feb34764 100644 --- a/native/shuffle/src/writers/spill.rs +++ b/native/shuffle/src/writers/spill.rs @@ -15,10 +15,10 @@ // specific language governing permissions and limitations // under the License. -use super::ShuffleBlockWriter; use crate::metrics::ShufflePartitionerMetrics; use crate::partitioners::PartitionedBatchIterator; -use crate::writers::buf_batch_writer::BufBatchWriter; +use arrow::datatypes::SchemaRef; +use arrow::ipc::writer::{IpcWriteOptions, StreamWriter}; use datafusion::common::DataFusionError; use datafusion::execution::disk_manager::RefCountedTempFile; use datafusion::execution::runtime_env::RuntimeEnv; @@ -36,17 +36,21 @@ pub(crate) struct PartitionWriter { /// will append to this file and the contents will be copied to the shuffle file at /// the end of processing. spill_file: Option, - /// Writer that performs encoding and compression - shuffle_block_writer: ShuffleBlockWriter, + /// Schema used for creating IPC stream writers + schema: SchemaRef, + /// IPC write options (includes compression settings) + write_options: IpcWriteOptions, } impl PartitionWriter { pub(crate) fn try_new( - shuffle_block_writer: ShuffleBlockWriter, + schema: SchemaRef, + write_options: IpcWriteOptions, ) -> datafusion::common::Result { Ok(Self { spill_file: None, - shuffle_block_writer, + schema, + write_options, }) } @@ -80,34 +84,42 @@ impl PartitionWriter { iter: &mut PartitionedBatchIterator, runtime: &RuntimeEnv, metrics: &ShufflePartitionerMetrics, - write_buffer_size: usize, - batch_size: usize, ) -> datafusion::common::Result { if let Some(batch) = iter.next() { self.ensure_spill_file_created(runtime)?; - let total_bytes_written = { - let mut buf_batch_writer = BufBatchWriter::new( - &mut self.shuffle_block_writer, - &mut self.spill_file.as_mut().unwrap().file, - write_buffer_size, - batch_size, - ); - let mut bytes_written = - buf_batch_writer.write(&batch?, &metrics.encode_time, &metrics.write_time)?; - for batch in iter { - let batch = batch?; - bytes_written += buf_batch_writer.write( - &batch, - &metrics.encode_time, - &metrics.write_time, - )?; - } - buf_batch_writer.flush(&metrics.encode_time, &metrics.write_time)?; - bytes_written - }; + let file = &mut self.spill_file.as_mut().unwrap().file; + let start_pos = file.metadata().map(|m| m.len()).unwrap_or(0); - Ok(total_bytes_written) + let mut writer = + StreamWriter::try_new_with_options(file, &self.schema, self.write_options.clone())?; + + let batch = batch?; + let mut encode_timer = metrics.encode_time.timer(); + writer.write(&batch)?; + encode_timer.stop(); + + for batch in iter { + let batch = batch?; + let mut encode_timer = metrics.encode_time.timer(); + writer.write(&batch)?; + encode_timer.stop(); + } + + let mut write_timer = metrics.write_time.timer(); + writer.finish()?; + write_timer.stop(); + + let end_pos = self + .spill_file + .as_ref() + .unwrap() + .file + .metadata() + .map(|m| m.len()) + .unwrap_or(0); + + Ok((end_pos - start_pos) as usize) } else { Ok(0) } diff --git a/spark/src/main/java/org/apache/comet/CometShuffleBlockIterator.java b/spark/src/main/java/org/apache/comet/CometShuffleBlockIterator.java deleted file mode 100644 index 9f72b20f51..0000000000 --- a/spark/src/main/java/org/apache/comet/CometShuffleBlockIterator.java +++ /dev/null @@ -1,142 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.comet; - -import java.io.Closeable; -import java.io.EOFException; -import java.io.IOException; -import java.io.InputStream; -import java.nio.ByteBuffer; -import java.nio.ByteOrder; -import java.nio.channels.Channels; -import java.nio.channels.ReadableByteChannel; - -/** - * Provides raw compressed shuffle blocks to native code via JNI. - * - *

Reads block headers (compressed length + field count) from a shuffle InputStream and loads the - * compressed body into a DirectByteBuffer. Native code pulls blocks by calling hasNext() and - * getBuffer(). - * - *

The DirectByteBuffer returned by getBuffer() is only valid until the next hasNext() call. - * Native code must fully consume it (via read_ipc_compressed which allocates new memory for the - * decompressed data) before pulling the next block. - */ -public class CometShuffleBlockIterator implements Closeable { - - private static final int INITIAL_BUFFER_SIZE = 128 * 1024; - - private final ReadableByteChannel channel; - private final InputStream inputStream; - private final ByteBuffer headerBuf = ByteBuffer.allocate(16).order(ByteOrder.LITTLE_ENDIAN); - private ByteBuffer dataBuf = ByteBuffer.allocateDirect(INITIAL_BUFFER_SIZE); - private boolean closed = false; - private int currentBlockLength = 0; - - public CometShuffleBlockIterator(InputStream in) { - this.inputStream = in; - this.channel = Channels.newChannel(in); - } - - /** - * Reads the next block header and loads the compressed body into the internal buffer. Called by - * native code via JNI. - * - *

Header format: 8-byte compressedLength (includes field count but not itself) + 8-byte - * fieldCount (discarded, schema comes from protobuf). - * - * @return the compressed body length in bytes (codec prefix + compressed IPC), or -1 if EOF - */ - public int hasNext() throws IOException { - if (closed) { - return -1; - } - - // Read 16-byte header: clear() resets position=0, limit=capacity, - // preparing the buffer for channel.read() to fill it - headerBuf.clear(); - while (headerBuf.hasRemaining()) { - int bytesRead = channel.read(headerBuf); - if (bytesRead < 0) { - if (headerBuf.position() == 0) { - close(); - return -1; - } - throw new EOFException("Data corrupt: unexpected EOF while reading batch header"); - } - } - headerBuf.flip(); - long compressedLength = headerBuf.getLong(); - // Field count discarded - schema determined by ShuffleScan protobuf fields - headerBuf.getLong(); - - // Subtract 8 because compressedLength includes the 8-byte field count we already read - long bytesToRead = compressedLength - 8; - if (bytesToRead > Integer.MAX_VALUE) { - throw new IllegalStateException( - "Native shuffle block size of " - + bytesToRead - + " exceeds maximum of " - + Integer.MAX_VALUE - + ". Try reducing spark.comet.columnar.shuffle.batch.size."); - } - - currentBlockLength = (int) bytesToRead; - - if (dataBuf.capacity() < currentBlockLength) { - int newCapacity = (int) Math.min(bytesToRead * 2L, Integer.MAX_VALUE); - dataBuf = ByteBuffer.allocateDirect(newCapacity); - } - - dataBuf.clear(); - dataBuf.limit(currentBlockLength); - while (dataBuf.hasRemaining()) { - int bytesRead = channel.read(dataBuf); - if (bytesRead < 0) { - throw new EOFException("Data corrupt: unexpected EOF while reading compressed batch"); - } - } - // Note: native side uses get_direct_buffer_address (base pointer) + currentBlockLength, - // not the buffer's position/limit. No flip needed. - - return currentBlockLength; - } - - /** - * Returns the DirectByteBuffer containing the current block's compressed bytes (4-byte codec - * prefix + compressed IPC data). Called by native code via JNI. - */ - public ByteBuffer getBuffer() { - return dataBuf; - } - - /** Returns the length of the current block in bytes. Called by native code via JNI. */ - public int getCurrentBlockLength() { - return currentBlockLength; - } - - @Override - public void close() throws IOException { - if (!closed) { - closed = true; - inputStream.close(); - } - } -} diff --git a/spark/src/main/scala/org/apache/comet/CometExecIterator.scala b/spark/src/main/scala/org/apache/comet/CometExecIterator.scala index f0c6373149..de13265a1a 100644 --- a/spark/src/main/scala/org/apache/comet/CometExecIterator.scala +++ b/spark/src/main/scala/org/apache/comet/CometExecIterator.scala @@ -68,7 +68,7 @@ class CometExecIterator( partitionIndex: Int, broadcastedHadoopConfForEncryption: Option[Broadcast[SerializableConfiguration]] = None, encryptedFilePaths: Seq[String] = Seq.empty, - shuffleBlockIterators: Map[Int, CometShuffleBlockIterator] = Map.empty) + shuffleInputStreams: Map[Int, java.io.InputStream] = Map.empty) extends Iterator[ColumnarBatch] with Logging { @@ -79,11 +79,11 @@ class CometExecIterator( private val taskAttemptId = TaskContext.get().taskAttemptId private val taskCPUs = TaskContext.get().cpus() private val cometTaskMemoryManager = new CometTaskMemoryManager(id, taskAttemptId) - // Build a mixed array of iterators: CometShuffleBlockIterator for shuffle - // scan indices, CometBatchIterator for regular scan indices. + // Build a mixed array of iterators: InputStream for shuffle scan indices, + // CometBatchIterator for regular scan indices. private val inputIterators: Array[Object] = inputs.zipWithIndex.map { - case (_, idx) if shuffleBlockIterators.contains(idx) => - shuffleBlockIterators(idx).asInstanceOf[Object] + case (_, idx) if shuffleInputStreams.contains(idx) => + shuffleInputStreams(idx).asInstanceOf[Object] case (iterator, _) => new CometBatchIterator(iterator, nativeUtil).asInstanceOf[Object] }.toArray @@ -237,7 +237,7 @@ class CometExecIterator( currentBatch = null } nativeUtil.close() - shuffleBlockIterators.values.foreach(_.close()) + shuffleInputStreams.values.foreach(_.close()) nativeLib.releasePlan(plan) if (tracingEnabled) { diff --git a/spark/src/main/scala/org/apache/comet/Native.scala b/spark/src/main/scala/org/apache/comet/Native.scala index c003bcd138..7842f0ab8d 100644 --- a/spark/src/main/scala/org/apache/comet/Native.scala +++ b/spark/src/main/scala/org/apache/comet/Native.scala @@ -19,8 +19,6 @@ package org.apache.comet -import java.nio.ByteBuffer - import org.apache.spark.CometTaskMemoryManager import org.apache.spark.sql.comet.CometMetricNode @@ -172,12 +170,16 @@ class Native extends NativeBase { * @param size * the size of the array. */ - @native def decodeShuffleBlock( - shuffleBlock: ByteBuffer, - length: Int, + @native def openShuffleStream(inputStream: java.io.InputStream): Long + + @native def nextShuffleStreamBatch( + handle: Long, arrayAddrs: Array[Long], - schemaAddrs: Array[Long], - tracingEnabled: Boolean): Long + schemaAddrs: Array[Long]): Long + + @native def shuffleStreamNumFields(handle: Long): Long + + @native def closeShuffleStream(handle: Long): Unit /** * Log the beginning of an event. diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/CometExecRDD.scala b/spark/src/main/scala/org/apache/spark/sql/comet/CometExecRDD.scala index c5014818c4..963505dcf2 100644 --- a/spark/src/main/scala/org/apache/spark/sql/comet/CometExecRDD.scala +++ b/spark/src/main/scala/org/apache/spark/sql/comet/CometExecRDD.scala @@ -111,11 +111,11 @@ private[spark] class CometExecRDD( serializedPlan } - // Create shuffle block iterators for inputs that are CometShuffledBatchRDD - val shuffleBlockIters = shuffleScanIndices.flatMap { idx => + // Create raw InputStreams for inputs that are CometShuffledBatchRDD + val shuffleStreams = shuffleScanIndices.flatMap { idx => inputRDDs(idx) match { case rdd: CometShuffledBatchRDD => - Some(idx -> rdd.computeAsShuffleBlockIterator(partition.inputPartitions(idx), context)) + Some(idx -> rdd.computeAsRawStream(partition.inputPartitions(idx), context)) case _ => None } }.toMap @@ -130,7 +130,7 @@ private[spark] class CometExecRDD( partition.index, broadcastedHadoopConfForEncryption, encryptedFilePaths, - shuffleBlockIters) + shuffleStreams) // Register ScalarSubqueries so native code can look them up subqueries.foreach(sub => CometScalarSubquery.setSubquery(it.id, sub)) diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffledRowRDD.scala b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffledRowRDD.scala index 7604910b06..45677d93fb 100644 --- a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffledRowRDD.scala +++ b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffledRowRDD.scala @@ -27,8 +27,6 @@ import org.apache.spark.sql.execution.metric.{SQLMetric, SQLShuffleReadMetricsRe import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.vectorized.ColumnarBatch -import org.apache.comet.CometShuffleBlockIterator - /** * Different from [[org.apache.spark.sql.execution.ShuffledRowRDD]], this RDD is specialized for * reading shuffled data through [[CometBlockStoreShuffleReader]]. The shuffled data is read in an @@ -149,14 +147,12 @@ class CometShuffledBatchRDD( } /** - * Creates a CometShuffleBlockIterator that provides raw compressed shuffle blocks for direct - * consumption by native code, bypassing Arrow FFI. + * Returns the raw InputStream of concatenated Arrow IPC streams for direct consumption by + * native code via ShuffleStreamReader. */ - def computeAsShuffleBlockIterator( - split: Partition, - context: TaskContext): CometShuffleBlockIterator = { + def computeAsRawStream(split: Partition, context: TaskContext): java.io.InputStream = { val reader = createReader(split, context) - new CometShuffleBlockIterator(reader.readAsRawStream()) + reader.readAsRawStream() } override def compute(split: Partition, context: TaskContext): Iterator[ColumnarBatch] = { diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/NativeBatchDecoderIterator.scala b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/NativeBatchDecoderIterator.scala index f96c8f16dd..22fc14df97 100644 --- a/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/NativeBatchDecoderIterator.scala +++ b/spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/NativeBatchDecoderIterator.scala @@ -19,9 +19,7 @@ package org.apache.spark.sql.comet.execution.shuffle -import java.io.{EOFException, InputStream} -import java.nio.{ByteBuffer, ByteOrder} -import java.nio.channels.{Channels, ReadableByteChannel} +import java.io.InputStream import org.apache.spark.sql.execution.metric.SQLMetric import org.apache.spark.sql.vectorized.ColumnarBatch @@ -43,27 +41,32 @@ case class NativeBatchDecoderIterator( extends Iterator[ColumnarBatch] { private var isClosed = false - private val longBuf = ByteBuffer.allocate(8).order(ByteOrder.LITTLE_ENDIAN) private var currentBatch: ColumnarBatch = null - private var batch = fetchNext() - import NativeBatchDecoderIterator._ + // Open the native stream reader + private val handle: Long = if (in != null) { + nativeLib.openShuffleStream(in) + } else { + 0L + } - private val channel: ReadableByteChannel = if (in != null) { - Channels.newChannel(in) + // Get field count from the native reader (it parsed the schema on open) + private val numFields: Int = if (handle != 0L) { + nativeLib.shuffleStreamNumFields(handle).toInt } else { - null + 0 } + private var batch = fetchNext() + def hasNext(): Boolean = { - if (channel == null || isClosed) { + if (handle == 0L || isClosed) { return false } if (batch.isDefined) { return true } - // Release the previous batch. if (currentBatch != null) { currentBatch.close() currentBatch = null @@ -81,89 +84,24 @@ case class NativeBatchDecoderIterator( if (!hasNext) { throw new NoSuchElementException } - val nextBatch = batch.get - currentBatch = nextBatch batch = None currentBatch } private def fetchNext(): Option[ColumnarBatch] = { - if (channel == null || isClosed) { + if (handle == 0L || isClosed) { return None } - // read compressed batch size from header - try { - longBuf.clear() - while (longBuf.hasRemaining && channel.read(longBuf) >= 0) {} - } catch { - case _: EOFException => - close() - return None - } - - // If we reach the end of the stream, we are done, or if we read partial length - // then the stream is corrupted. - if (longBuf.hasRemaining) { - if (longBuf.position() == 0) { - close() - return None - } - throw new EOFException("Data corrupt: unexpected EOF while reading compressed ipc lengths") - } - - // get compressed length (including headers) - longBuf.flip() - val compressedLength = longBuf.getLong - - // read field count from header - longBuf.clear() - while (longBuf.hasRemaining && channel.read(longBuf) >= 0) {} - if (longBuf.hasRemaining) { - throw new EOFException("Data corrupt: unexpected EOF while reading field count") - } - longBuf.flip() - val fieldCount = longBuf.getLong.toInt - - // read body - val bytesToRead = compressedLength - 8 - if (bytesToRead > Integer.MAX_VALUE) { - // very unlikely that shuffle block will reach 2GB - throw new IllegalStateException( - s"Native shuffle block size of $bytesToRead exceeds " + - s"maximum of ${Integer.MAX_VALUE}. Try reducing shuffle batch size.") - } - var dataBuf = threadLocalDataBuf.get() - if (dataBuf.capacity() < bytesToRead) { - // it is unlikely that we would overflow here since it would - // require a 1GB compressed shuffle block but we check anyway - val newCapacity = (bytesToRead * 2L).min(Integer.MAX_VALUE).toInt - dataBuf = ByteBuffer.allocateDirect(newCapacity) - threadLocalDataBuf.set(dataBuf) - } - dataBuf.clear() - dataBuf.limit(bytesToRead.toInt) - while (dataBuf.hasRemaining && channel.read(dataBuf) >= 0) {} - if (dataBuf.hasRemaining) { - throw new EOFException("Data corrupt: unexpected EOF while reading compressed batch") - } - - // make native call to decode batch val startTime = System.nanoTime() val batch = nativeUtil.getNextBatch( - fieldCount, + numFields, (arrayAddrs, schemaAddrs) => { - nativeLib.decodeShuffleBlock( - dataBuf, - bytesToRead.toInt, - arrayAddrs, - schemaAddrs, - tracingEnabled) + nativeLib.nextShuffleStreamBatch(handle, arrayAddrs, schemaAddrs) }) decodeTime.add(System.nanoTime() - startTime) - batch } @@ -174,25 +112,14 @@ case class NativeBatchDecoderIterator( currentBatch.close() currentBatch = null } - in.close() - resetDataBuf() + if (handle != 0L) { + nativeLib.closeShuffleStream(handle) + } + if (in != null) { + in.close() + } isClosed = true } } } } - -object NativeBatchDecoderIterator { - - private val INITIAL_BUFFER_SIZE = 128 * 1024 - - private val threadLocalDataBuf: ThreadLocal[ByteBuffer] = ThreadLocal.withInitial(() => { - ByteBuffer.allocateDirect(INITIAL_BUFFER_SIZE) - }) - - private def resetDataBuf(): Unit = { - if (threadLocalDataBuf.get().capacity() > INITIAL_BUFFER_SIZE) { - threadLocalDataBuf.set(ByteBuffer.allocateDirect(INITIAL_BUFFER_SIZE)) - } - } -}