diff --git a/.claude/skills/rationalize-deps/SKILL.md b/.claude/skills/rationalize-deps/SKILL.md new file mode 100644 index 00000000000..829a70c67ec --- /dev/null +++ b/.claude/skills/rationalize-deps/SKILL.md @@ -0,0 +1,125 @@ +--- +name: rationalize-deps +description: Analyze Cargo.toml dependencies and attempt to remove unused features to reduce compile times and binary size +--- + +# Rationalize Dependencies + +This skill analyzes Cargo.toml dependencies to identify and remove unused features. + +## Overview + +Many crates enable features by default that may not be needed. This skill: +1. Identifies dependencies with default features enabled +2. Tests if `default-features = false` works +3. Identifies which specific features are actually needed +4. Verifies compilation after changes + +## Step 1: Identify the target + +Ask the user which crate(s) to analyze: +- A specific crate name (e.g., "tokio", "serde") +- A specific workspace member (e.g., "quickwit-search") +- "all" to scan the entire workspace + +## Step 2: Analyze current dependencies + +For the workspace Cargo.toml (`quickwit/Cargo.toml`), list dependencies that: +- Do NOT have `default-features = false` +- Have default features that might be unnecessary + +Run: `cargo tree -p <crate-name> -f "{p} {f}" --edges features` to see what features are actually used. 
+ +## Step 3: For each candidate dependency + +### 3a: Check the crate's default features + +Look up the crate on crates.io or check its Cargo.toml to understand: +- What features are enabled by default +- What each feature provides + +Use: `cargo metadata --format-version=1 | jq '.packages[] | select(.name == "<crate-name>") | .features'` + +### 3b: Try disabling default features + +Modify the dependency in `quickwit/Cargo.toml`: + +From: +```toml +some-crate = { version = "1.0" } +``` + +To: +```toml +some-crate = { version = "1.0", default-features = false } +``` + +### 3c: Run cargo check + +Run: `cargo check --workspace` (or target specific packages for faster feedback) + +If compilation fails: +1. Read the error messages to identify which features are needed +2. Add only the required features explicitly: + ```toml + some-crate = { version = "1.0", default-features = false, features = ["needed-feature"] } + ``` +3. Re-run cargo check + +### 3d: Binary search for minimal features + +If there are many default features, use binary search: +1. Start with no features +2. If it fails, add half the default features +3. 
Continue until you find the minimal set + +## Step 4: Document findings + +For each dependency analyzed, report: +- Original configuration +- New configuration (if changed) +- Features that were removed +- Any features that are required + +## Step 5: Verify full build + +After all changes, run: +```bash +cargo check --workspace --all-targets +cargo test --workspace --no-run +``` + +## Common Patterns + +### Serde +Often only needs `derive`: +```toml +serde = { version = "1.0", default-features = false, features = ["derive", "std"] } +``` + +### Tokio +Identify which runtime features are actually used: +```toml +tokio = { version = "1.0", default-features = false, features = ["rt-multi-thread", "macros", "sync"] } +``` + +### Reqwest +Often doesn't need all TLS backends: +```toml +reqwest = { version = "0.11", default-features = false, features = ["rustls-tls", "json"] } +``` + +## Rollback + +If changes cause issues: +```bash +git checkout quickwit/Cargo.toml +cargo check --workspace +``` + +## Tips + +- Start with large crates that have many default features (tokio, reqwest, hyper) +- Use `cargo bloat --crates` to identify large dependencies +- Check `cargo tree -d` for duplicate dependencies that might indicate feature conflicts +- Some features are needed only for tests - consider using `[dev-dependencies]` features diff --git a/LICENSE-3rdparty.csv b/LICENSE-3rdparty.csv index 13904cb90c2..ed79fbdb132 100644 --- a/LICENSE-3rdparty.csv +++ b/LICENSE-3rdparty.csv @@ -52,7 +52,6 @@ base16ct,https://github.com/RustCrypto/formats/tree/master/base16ct,Apache-2.0 O base64,https://github.com/marshallpierce/rust-base64,MIT OR Apache-2.0,Marshall Pierce base64-simd,https://github.com/Nugine/simd,MIT,The base64-simd Authors base64ct,https://github.com/RustCrypto/formats,Apache-2.0 OR MIT,RustCrypto Developers -bincode,https://github.com/servo/bincode,MIT,"Ty Overby , Francesco Mazzoli , David Tolnay , Zoey Riordan " bit-set,https://github.com/contain-rs/bit-set,Apache-2.0 
OR MIT,Alexis Beingessner bit-vec,https://github.com/contain-rs/bit-vec,Apache-2.0 OR MIT,Alexis Beingessner bitflags,https://github.com/bitflags/bitflags,MIT OR Apache-2.0,The Rust Project Developers @@ -104,8 +103,6 @@ crossbeam-utils,https://github.com/crossbeam-rs/crossbeam,MIT OR Apache-2.0,The crunchy,https://github.com/eira-fransham/crunchy,MIT,Eira Fransham crypto-bigint,https://github.com/RustCrypto/crypto-bigint,Apache-2.0 OR MIT,RustCrypto Developers crypto-common,https://github.com/RustCrypto/traits,MIT OR Apache-2.0,RustCrypto Developers -csv,https://github.com/BurntSushi/rust-csv,Unlicense OR MIT,Andrew Gallant -csv-core,https://github.com/BurntSushi/rust-csv,Unlicense OR MIT,Andrew Gallant darling,https://github.com/TedDriggs/darling,MIT,Ted Driggs darling_core,https://github.com/TedDriggs/darling,MIT,Ted Driggs darling_macro,https://github.com/TedDriggs/darling,MIT,Ted Driggs @@ -130,15 +127,7 @@ elliptic-curve,https://github.com/RustCrypto/traits/tree/master/elliptic-curve,A embedded-io,https://github.com/embassy-rs/embedded-io,MIT OR Apache-2.0,The embedded-io Authors embedded-io,https://github.com/rust-embedded/embedded-hal,MIT OR Apache-2.0,The embedded-io Authors encode_unicode,https://github.com/tormol/encode_unicode,Apache-2.0 OR MIT,Torbjørn Birch Moltu -encoding,https://github.com/lifthrasiir/rust-encoding,MIT,Kang Seonghoon -encoding-index-japanese,https://github.com/lifthrasiir/rust-encoding,CC0-1.0,Kang Seonghoon -encoding-index-korean,https://github.com/lifthrasiir/rust-encoding,CC0-1.0,Kang Seonghoon -encoding-index-simpchinese,https://github.com/lifthrasiir/rust-encoding,CC0-1.0,Kang Seonghoon -encoding-index-singlebyte,https://github.com/lifthrasiir/rust-encoding,CC0-1.0,Kang Seonghoon -encoding-index-tradchinese,https://github.com/lifthrasiir/rust-encoding,CC0-1.0,Kang Seonghoon -encoding_index_tests,https://github.com/lifthrasiir/rust-encoding,CC0-1.0,Kang Seonghoon encoding_rs,https://github.com/hsivonen/encoding_rs,(Apache-2.0 OR 
MIT) AND BSD-3-Clause,Henri Sivonen -encoding_rs_io,https://github.com/BurntSushi/encoding_rs_io,MIT OR Apache-2.0,Andrew Gallant enum-iterator,https://github.com/stephaneyfx/enum-iterator,0BSD,Stephane Raux enum-iterator-derive,https://github.com/stephaneyfx/enum-iterator,0BSD,Stephane Raux env_filter,https://github.com/rust-cli/env_logger,MIT OR Apache-2.0,The env_filter Authors @@ -150,7 +139,6 @@ fail,https://github.com/tikv/fail-rs,Apache-2.0,The TiKV Project Developers fastdivide,https://github.com/fulmicoton/fastdivide,zlib-acknowledgement OR MIT,Paul Masurel fastrand,https://github.com/smol-rs/fastrand,Apache-2.0 OR MIT,Stjepan Glavina ff,https://github.com/zkcrypto/ff,MIT OR Apache-2.0,"Sean Bowe , Jack Grigg " -filetime,https://github.com/alexcrichton/filetime,MIT OR Apache-2.0,Alex Crichton find-msvc-tools,https://github.com/rust-lang/cc-rs,MIT OR Apache-2.0,The find-msvc-tools Authors fixedbitset,https://github.com/petgraph/fixedbitset,MIT OR Apache-2.0,bluss flate2,https://github.com/rust-lang/flate2-rs,MIT OR Apache-2.0,"Alex Crichton , Josh Triplett " @@ -224,8 +212,6 @@ is-terminal,https://github.com/sunfishcode/is-terminal,MIT,"softprops -jiff,https://github.com/BurntSushi/jiff,Unlicense OR MIT,Andrew Gallant -jiff-static,https://github.com/BurntSushi/jiff,Unlicense OR MIT,Andrew Gallant jobserver,https://github.com/rust-lang/jobserver-rs,MIT OR Apache-2.0,Alex Crichton js-sys,https://github.com/wasm-bindgen/wasm-bindgen/tree/master/crates/js-sys,MIT OR Apache-2.0,The wasm-bindgen Developers json_comments,https://github.com/tmccombs/json-comments-rs,Apache-2.0,Thayne McCombs @@ -233,19 +219,6 @@ lazy_static,https://github.com/rust-lang-nursery/lazy-static.rs,MIT OR Apache-2. 
levenshtein_automata,https://github.com/tantivy-search/levenshtein-automata,MIT,Paul Masurel libc,https://github.com/rust-lang/libc,MIT OR Apache-2.0,The Rust Project Developers libm,https://github.com/rust-lang/compiler-builtins,MIT,Jorge Aparicio -libredox,https://gitlab.redox-os.org/redox-os/libredox,MIT,4lDO2 <4lDO2@protonmail.com> -lindera-cc-cedict,https://github.com/lindera-morphology/lindera,MIT,The lindera-cc-cedict Authors -lindera-cc-cedict-builder,https://github.com/lindera-morphology/lindera,MIT,The lindera-cc-cedict-builder Authors -lindera-core,https://github.com/lindera-morphology/lindera,MIT,The lindera-core Authors -lindera-decompress,https://github.com/lindera-morphology/lindera,MIT,The lindera-decompress Authors -lindera-dictionary,https://github.com/lindera-morphology/lindera,MIT,The lindera-dictionary Authors -lindera-ipadic,https://github.com/lindera-morphology/lindera,MIT,The lindera-ipadic Authors -lindera-ipadic-builder,https://github.com/lindera-morphology/lindera,MIT,The lindera-ipadic-builder Authors -lindera-ipadic-neologd-builder,https://github.com/lindera-morphology/lindera,MIT,The lindera-ipadic-neologd-builder Authors -lindera-ko-dic,https://github.com/lindera-morphology/lindera,MIT,The lindera-ko-dic Authors -lindera-ko-dic-builder,https://github.com/lindera-morphology/lindera,MIT,The lindera-ko-dic-builder Authors -lindera-tokenizer,https://github.com/lindera-morphology/lindera,MIT,The lindera-tokenizer Authors -lindera-unidic-builder,https://github.com/lindera-morphology/lindera,MIT,The lindera-unidic-builder Authors linked-hash-map,https://github.com/contain-rs/linked-hash-map,MIT OR Apache-2.0,"Stepan Koltsov , Andrew Paseltiner " linux-raw-sys,https://github.com/sunfishcode/linux-raw-sys,Apache-2.0 WITH LLVM-exception OR Apache-2.0 OR MIT,Dan Gohman litemap,https://github.com/unicode-org/icu4x,Unicode-3.0,The ICU4X Project Developers @@ -330,7 +303,6 @@ pnet_packet,https://github.com/libpnet/libpnet,MIT OR Apache-2.0,Robert 
Clipsham pnet_sys,https://github.com/libpnet/libpnet,MIT OR Apache-2.0,"Robert Clipsham , Linus Färnstrand " pnet_transport,https://github.com/libpnet/libpnet,MIT OR Apache-2.0,Robert Clipsham portable-atomic,https://github.com/taiki-e/portable-atomic,Apache-2.0 OR MIT,The portable-atomic Authors -portable-atomic-util,https://github.com/taiki-e/portable-atomic,Apache-2.0 OR MIT,The portable-atomic-util Authors postcard,https://github.com/jamesmunns/postcard,MIT OR Apache-2.0,James Munns potential_utf,https://github.com/unicode-org/icu4x,Unicode-3.0,The ICU4X Project Developers powerfmt,https://github.com/jhpratt/powerfmt,MIT OR Apache-2.0,Jacob Pratt @@ -353,8 +325,6 @@ prost,https://github.com/tokio-rs/prost,Apache-2.0,"Dan Burkert , Lucio Franco , Casper Meijn , Tokio Contributors " prost-derive,https://github.com/tokio-rs/prost,Apache-2.0,"Dan Burkert , Lucio Franco , Casper Meijn , Tokio Contributors " prost-types,https://github.com/tokio-rs/prost,Apache-2.0,"Dan Burkert , Lucio Franco , Casper Meijn , Tokio Contributors " -protobuf,https://github.com/stepancheg/rust-protobuf,MIT,Stepan Koltsov -protobuf-support,https://github.com/stepancheg/rust-protobuf,MIT,Stepan Koltsov pulldown-cmark,https://github.com/raphlinus/pulldown-cmark,MIT,"Raph Levien , Marcus Klaas de Vries " pulldown-cmark-to-cmark,https://github.com/Byron/pulldown-cmark-to-cmark,Apache-2.0,"Sebastian Thiel , Dylan Owen , Alessandro Ogier , Zixian Cai <2891235+caizixian@users.noreply.github.com>, Andrew Lyjak " quanta,https://github.com/metrics-rs/quanta,MIT,Toby Lawrence @@ -388,7 +358,6 @@ roxmltree,https://github.com/RazrFalcon/roxmltree,MIT OR Apache-2.0,Evgeniy Reiz rust-embed,https://pyrossh.dev/repos/rust-embed,MIT,pyrossh rust-embed-impl,https://pyrossh.dev/repos/rust-embed,MIT,pyrossh rust-embed-utils,https://pyrossh.dev/repos/rust-embed,MIT,pyrossh -rust-stemmers,https://github.com/CurrySoftware/rust-stemmers,MIT OR BSD-3-Clause,"Jakob Demler , CurrySoftware " 
rustc-hash,https://github.com/rust-lang/rustc-hash,Apache-2.0 OR MIT,The Rust Project Developers rustix,https://github.com/bytecodealliance/rustix,Apache-2.0 WITH LLVM-exception OR Apache-2.0 OR MIT,"Dan Gohman , Jakub Konka " rustls,https://github.com/rustls/rustls,Apache-2.0 OR ISC OR MIT,The rustls Authors @@ -448,8 +417,6 @@ syn,https://github.com/dtolnay/syn,MIT OR Apache-2.0,David Tolnay synstructure,https://github.com/mystor/synstructure,MIT,Nika Layzell sysinfo,https://github.com/GuillaumeGomez/sysinfo,MIT,Guillaume Gomez -system-configuration,https://github.com/mullvad/system-configuration-rs,MIT OR Apache-2.0,Mullvad VPN -system-configuration-sys,https://github.com/mullvad/system-configuration-rs,MIT OR Apache-2.0,Mullvad VPN tabled,https://github.com/zhiburt/tabled,MIT,Maxim Zhiburt tabled_derive,https://github.com/zhiburt/tabled,MIT,Maxim Zhiburt tantivy,https://github.com/quickwit-oss/tantivy,MIT,Paul Masurel @@ -545,7 +512,6 @@ wasmtimer,https://github.com/whizsid/wasmtimer-rs,MIT,"WhizSid web-sys,https://github.com/wasm-bindgen/wasm-bindgen/tree/master/crates/web-sys,MIT OR Apache-2.0,The wasm-bindgen Developers web-time,https://github.com/daxpedda/web-time,MIT OR Apache-2.0,The web-time Authors webpki-roots,https://github.com/rustls/webpki-roots,CDLA-Permissive-2.0,The webpki-roots Authors -whichlang,https://github.com/quickwit-oss/whichlang,MIT,"Quickwit, Inc. 
" winapi,https://github.com/retep998/winapi-rs,MIT,Peter Atashian winapi,https://github.com/retep998/winapi-rs,MIT OR Apache-2.0,Peter Atashian winapi-i686-pc-windows-gnu,https://github.com/retep998/winapi-rs,MIT OR Apache-2.0,Peter Atashian @@ -561,7 +527,6 @@ windows-interface,https://github.com/microsoft/windows-rs,MIT OR Apache-2.0,The windows-link,https://github.com/microsoft/windows-rs,MIT OR Apache-2.0,Microsoft windows-link,https://github.com/microsoft/windows-rs,MIT OR Apache-2.0,The windows-link Authors windows-numerics,https://github.com/microsoft/windows-rs,MIT OR Apache-2.0,The windows-numerics Authors -windows-registry,https://github.com/microsoft/windows-rs,MIT OR Apache-2.0,The windows-registry Authors windows-result,https://github.com/microsoft/windows-rs,MIT OR Apache-2.0,Microsoft windows-result,https://github.com/microsoft/windows-rs,MIT OR Apache-2.0,The windows-result Authors windows-strings,https://github.com/microsoft/windows-rs,MIT OR Apache-2.0,Microsoft @@ -590,9 +555,7 @@ windows_x86_64_msvc,https://github.com/microsoft/windows-rs,MIT OR Apache-2.0,Th winnow,https://github.com/winnow-rs/winnow,MIT,The winnow Authors wit-bindgen,https://github.com/bytecodealliance/wit-bindgen,Apache-2.0 WITH LLVM-exception OR Apache-2.0 OR MIT,Alex Crichton writeable,https://github.com/unicode-org/icu4x,Unicode-3.0,The ICU4X Project Developers -xattr,https://github.com/Stebalien/xattr,MIT OR Apache-2.0,Steven Allen xmlparser,https://github.com/RazrFalcon/xmlparser,MIT OR Apache-2.0,Yevhenii Reizner -yada,https://github.com/takuyaa/yada,MIT OR Apache-2.0,Takuya Asano yansi,https://github.com/SergioBenitez/yansi,MIT OR Apache-2.0,Sergio Benitez yoke,https://github.com/unicode-org/icu4x,Unicode-3.0,Manish Goregaokar yoke-derive,https://github.com/unicode-org/icu4x,Unicode-3.0,Manish Goregaokar diff --git a/quickwit/Cargo.lock b/quickwit/Cargo.lock index a0f47b86c7d..d05bafb3f7f 100644 --- a/quickwit/Cargo.lock +++ b/quickwit/Cargo.lock @@ -1186,15 +1186,6 @@ 
version = "1.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7d809780667f4410e7c41b07f52439b94d2bdf8528eeedc287fa38d3b7f95d82" -[[package]] -name = "bincode" -version = "1.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad" -dependencies = [ - "serde", -] - [[package]] name = "bindgen" version = "0.72.1" @@ -1376,9 +1367,9 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "bytes" -version = "1.11.0" +version = "1.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b35204fbdc0b3f4446b89fc1ac2cf84a8a68971995d0bf2e925ec7cd960f9cb3" +checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33" dependencies = [ "serde", ] @@ -2312,8 +2303,6 @@ checksum = "25f104b501bf2364e78d0d3974cbc774f738f5865306ed128e1e0d7499c0ad96" dependencies = [ "console", "shell-words", - "tempfile", - "zeroize", ] [[package]] @@ -2598,70 +2587,6 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0" -[[package]] -name = "encoding" -version = "0.2.33" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b0d943856b990d12d3b55b359144ff341533e516d94098b1d3fc1ac666d36ec" -dependencies = [ - "encoding-index-japanese", - "encoding-index-korean", - "encoding-index-simpchinese", - "encoding-index-singlebyte", - "encoding-index-tradchinese", -] - -[[package]] -name = "encoding-index-japanese" -version = "1.20141219.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04e8b2ff42e9a05335dbf8b5c6f7567e5591d0d916ccef4e0b1710d32a0d0c91" -dependencies = [ - "encoding_index_tests", -] - -[[package]] -name = "encoding-index-korean" -version = "1.20141219.5" -source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "4dc33fb8e6bcba213fe2f14275f0963fd16f0a02c878e3095ecfdf5bee529d81" -dependencies = [ - "encoding_index_tests", -] - -[[package]] -name = "encoding-index-simpchinese" -version = "1.20141219.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d87a7194909b9118fc707194baa434a4e3b0fb6a5a757c73c3adb07aa25031f7" -dependencies = [ - "encoding_index_tests", -] - -[[package]] -name = "encoding-index-singlebyte" -version = "1.20141219.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3351d5acffb224af9ca265f435b859c7c01537c0849754d3db3fdf2bfe2ae84a" -dependencies = [ - "encoding_index_tests", -] - -[[package]] -name = "encoding-index-tradchinese" -version = "1.20141219.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd0e20d5688ce3cab59eb3ef3a2083a5c77bf496cb798dc6fcdb75f323890c18" -dependencies = [ - "encoding_index_tests", -] - -[[package]] -name = "encoding_index_tests" -version = "0.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a246d82be1c9d791c5dfde9a2bd045fc3cbba3fa2b11ad558f27d01712f00569" - [[package]] name = "encoding_rs" version = "0.8.35" @@ -2671,15 +2596,6 @@ dependencies = [ "cfg-if", ] -[[package]] -name = "encoding_rs_io" -version = "0.1.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1cc3c5651fb62ab8aa3103998dade57efdd028544bd300516baa31840c252a83" -dependencies = [ - "encoding_rs", -] - [[package]] name = "enum-iterator" version = "2.3.0" @@ -2707,7 +2623,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1bf3c259d255ca70051b30e2e95b5446cdb8949ac4cd22c0d7fd634d89f568e2" dependencies = [ "log", - "regex", ] [[package]] @@ -2719,7 +2634,6 @@ dependencies = [ "anstream", "anstyle", "env_filter", - "jiff", "log", ] @@ -2892,18 +2806,6 @@ version = "0.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"28dea519a9695b9977216879a3ebfddf92f1c08c05d984f8996aecd6ecdc811d" -[[package]] -name = "filetime" -version = "0.2.26" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc0505cd1b6fa6580283f6bdf70a73fcf4aba1184038c90902b92b3dd0df63ed" -dependencies = [ - "cfg-if", - "libc", - "libredox", - "windows-sys 0.60.2", -] - [[package]] name = "find-msvc-tools" version = "0.1.6" @@ -3824,12 +3726,9 @@ dependencies = [ "percent-encoding", "pin-project-lite", "socket2 0.6.1", - "system-configuration", "tokio", - "tower-layer", "tower-service", "tracing", - "windows-registry", ] [[package]] @@ -4292,9 +4191,9 @@ dependencies = [ [[package]] name = "keccak" -version = "0.1.5" +version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ecc2af9a1119c51f12a14607e783cb977bde58bc069ff0c3da1095e635d70654" +checksum = "cb26cec98cce3a3d96cbb7bced3c4b16e3d13f27ec56dbd62cbc8f39cfb9d653" dependencies = [ "cpufeatures", ] @@ -4418,219 +4317,6 @@ dependencies = [ "vcpkg", ] -[[package]] -name = "lindera-cc-cedict" -version = "0.29.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7595a377b9723e837711366721b02662dac64d734af3dac1c01941e779e95a6b" -dependencies = [ - "bincode", - "byteorder", - "encoding", - "flate2", - "lindera-cc-cedict-builder", - "lindera-core", - "once_cell", - "tar", - "ureq", -] - -[[package]] -name = "lindera-cc-cedict-builder" -version = "0.29.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1c6fbd76a65b5df73574898e871d7cff3e34bf89f544f6e1a1087cba82e25cce" -dependencies = [ - "anyhow", - "bincode", - "byteorder", - "csv", - "encoding", - "env_logger", - "glob", - "lindera-core", - "lindera-decompress", - "log", - "yada", -] - -[[package]] -name = "lindera-core" -version = "0.29.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85af015d15c25cb3b7af82ba181908f4afbec6a2636f0fdfcca6d173c1b2c7fe" -dependencies = 
[ - "anyhow", - "bincode", - "byteorder", - "encoding_rs", - "log", - "once_cell", - "serde", - "thiserror 1.0.69", - "yada", -] - -[[package]] -name = "lindera-decompress" -version = "0.29.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3dfc054b2f3f3eb21a24ce062a3d5f969339ddf50652038ea33993b1b97d4ba" -dependencies = [ - "anyhow", - "flate2", - "serde", -] - -[[package]] -name = "lindera-dictionary" -version = "0.29.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d6b1a5d8f4cba37dcca18dc0e827233ff46695a6d878d716f16f755d264d588a" -dependencies = [ - "anyhow", - "bincode", - "byteorder", - "lindera-cc-cedict", - "lindera-cc-cedict-builder", - "lindera-core", - "lindera-ipadic", - "lindera-ipadic-builder", - "lindera-ipadic-neologd-builder", - "lindera-ko-dic", - "lindera-ko-dic-builder", - "lindera-unidic-builder", - "serde", -] - -[[package]] -name = "lindera-ipadic" -version = "0.29.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e5f1d26aba22d8a9193dcd2d087205d89e0ffb19490bc305b341e25c037f353" -dependencies = [ - "bincode", - "byteorder", - "encoding", - "flate2", - "lindera-core", - "lindera-ipadic-builder", - "once_cell", - "tar", - "ureq", -] - -[[package]] -name = "lindera-ipadic-builder" -version = "0.29.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "184a9769b05ae857bd55f5e8a94b2ae2ba8816c5c6b78c73f161b4d7490c0461" -dependencies = [ - "anyhow", - "bincode", - "byteorder", - "csv", - "encoding_rs", - "encoding_rs_io", - "env_logger", - "glob", - "lindera-core", - "lindera-decompress", - "log", - "serde", - "yada", -] - -[[package]] -name = "lindera-ipadic-neologd-builder" -version = "0.29.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b8cd28b5402425184d0f719d5bd81af87a7e36e2032b5bcceddf55011b1b22c" -dependencies = [ - "anyhow", - "bincode", - "byteorder", - "csv", - "encoding_rs", - 
"encoding_rs_io", - "env_logger", - "glob", - "lindera-core", - "lindera-decompress", - "log", - "serde", - "yada", -] - -[[package]] -name = "lindera-ko-dic" -version = "0.29.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a6d718720a28ac5d93b449661d8844f7858b2b71595e3198bc90e437f01e5ce" -dependencies = [ - "bincode", - "byteorder", - "encoding", - "flate2", - "lindera-core", - "lindera-ko-dic-builder", - "once_cell", - "tar", - "ureq", -] - -[[package]] -name = "lindera-ko-dic-builder" -version = "0.29.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f22de1fcdc33de258037145ae86686125214206b98d04c6dfe01f36c136c0022" -dependencies = [ - "anyhow", - "bincode", - "byteorder", - "csv", - "encoding", - "env_logger", - "glob", - "lindera-core", - "lindera-decompress", - "log", - "yada", -] - -[[package]] -name = "lindera-tokenizer" -version = "0.29.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cca45cbc1af512ce2aa9dea9a1d694430480a53bb53e37165ba143e27e81f7dd" -dependencies = [ - "bincode", - "lindera-core", - "lindera-dictionary", - "once_cell", - "serde", - "serde_json", -] - -[[package]] -name = "lindera-unidic-builder" -version = "0.29.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "359425c8dff54164ff1b068122d26df358ce18533e4771eb5c5ce68888d988f2" -dependencies = [ - "anyhow", - "bincode", - "byteorder", - "csv", - "encoding", - "env_logger", - "glob", - "lindera-core", - "lindera-decompress", - "log", - "yada", -] - [[package]] name = "linked-hash-map" version = "0.5.6" @@ -5090,9 +4776,9 @@ dependencies = [ [[package]] name = "num-conv" -version = "0.1.0" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9" +checksum = "cf97ec579c3c42f953ef76dbf8d55ac91fb219dde70e49aa4a6b7d74e9919050" [[package]] name = "num-format" @@ 
-5297,9 +4983,9 @@ checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" [[package]] name = "oneshot" -version = "0.1.11" +version = "0.1.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4ce411919553d3f9fa53a0880544cda985a112117a0444d5ff1e870a893d6ea" +checksum = "269bca4c2591a28585d6bf10d9ed0332b7d76900a1b02bec41bdc3a2cdcda107" [[package]] name = "onig" @@ -6364,7 +6050,6 @@ dependencies = [ "memchr", "parking_lot 0.12.5", "procfs", - "protobuf", "thiserror 2.0.17", ] @@ -6504,26 +6189,6 @@ dependencies = [ "prost 0.14.1", ] -[[package]] -name = "protobuf" -version = "3.7.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d65a1d4ddae7d8b5de68153b48f6aa3bba8cb002b243dbdbc55a5afbc98f99f4" -dependencies = [ - "once_cell", - "protobuf-support", - "thiserror 1.0.69", -] - -[[package]] -name = "protobuf-support" -version = "3.7.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3e36c2f31e0a47f9280fb347ef5e461ffcd2c52dd520d8e216b52f93b0b0d7d6" -dependencies = [ - "thiserror 1.0.69", -] - [[package]] name = "psl" version = "2.1.176" @@ -6719,7 +6384,6 @@ dependencies = [ "quickwit-cluster", "quickwit-common", "quickwit-config", - "quickwit-doc-mapper", "quickwit-index-management", "quickwit-indexing", "quickwit-ingest", @@ -7328,9 +6992,6 @@ dependencies = [ "bitpacking", "criterion", "hex", - "lindera-core", - "lindera-dictionary", - "lindera-tokenizer", "once_cell", "proptest", "quickwit-common", @@ -7346,7 +7007,6 @@ dependencies = [ "thiserror 2.0.17", "time", "tracing", - "whichlang", ] [[package]] @@ -8195,16 +7855,6 @@ dependencies = [ "walkdir", ] -[[package]] -name = "rust-stemmers" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e46a2036019fdb888131db7a4c847a1063a7493f971ed94ea82c67eada63ca54" -dependencies = [ - "serde", - "serde_derive", -] - [[package]] name = "rust_decimal" version = 
"1.39.0" @@ -9428,27 +9078,6 @@ dependencies = [ "nom 8.0.0", ] -[[package]] -name = "system-configuration" -version = "0.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c879d448e9d986b661742763247d3693ed13609438cf3d006f51f5368a5ba6b" -dependencies = [ - "bitflags 2.10.0", - "core-foundation 0.9.4", - "system-configuration-sys", -] - -[[package]] -name = "system-configuration-sys" -version = "0.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e1d1b10ced5ca923a1fcb8d03e96b8d3268065d724548c0211415ff6ac6bac4" -dependencies = [ - "core-foundation-sys", - "libc", -] - [[package]] name = "tabled" version = "0.20.0" @@ -9514,7 +9143,6 @@ dependencies = [ "oneshot", "rayon", "regex", - "rust-stemmers", "rustc-hash", "serde", "serde_json", @@ -9624,17 +9252,6 @@ dependencies = [ "serde", ] -[[package]] -name = "tar" -version = "0.4.44" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d863878d212c87a19c1a610eb53bb01fe12951c0501cf5a0d65f724914a667a" -dependencies = [ - "filetime", - "libc", - "xattr", -] - [[package]] name = "tempfile" version = "3.24.0" @@ -9770,9 +9387,9 @@ dependencies = [ [[package]] name = "time" -version = "0.3.44" +version = "0.3.47" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91e7d9e3bb61134e77bde20dd4825b97c010155709965fedf0f49bb138e52a9d" +checksum = "743bd48c283afc0388f9b8827b976905fb217ad9e647fae3a379a9283c4def2c" dependencies = [ "deranged", "itoa", @@ -9781,16 +9398,16 @@ dependencies = [ "num-conv", "num_threads", "powerfmt", - "serde", + "serde_core", "time-core", "time-macros", ] [[package]] name = "time-core" -version = "0.1.6" +version = "0.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "40868e7c1d2f0b8d73e4a8c7f0ff63af4f6d19be117e90bd73eb1d62cf831c6b" +checksum = "7694e1cfe791f8d31026952abf09c69ca6f6fa4e1a1229e18988f06a04a12dca" [[package]] name = "time-fmt" @@ -9804,9 
+9421,9 @@ dependencies = [ [[package]] name = "time-macros" -version = "0.2.24" +version = "0.2.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30cfb0125f12d9c277f35663a0a33f8c30190f4e4574868a330595412d34ebf3" +checksum = "2e70e4c5a0e0a8a4823ad65dfe1a6930e4f4d756dcd9dd7939022b5e8c501215" dependencies = [ "num-conv", "time-core", @@ -9950,10 +9567,7 @@ dependencies = [ "futures-core", "futures-io", "futures-sink", - "futures-util", - "hashbrown 0.15.5", "pin-project-lite", - "slab", "tokio", ] @@ -10496,21 +10110,6 @@ version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" -[[package]] -name = "ureq" -version = "2.12.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "02d1a66277ed75f640d608235660df48c8e3c19f3b4edb6a263315626cc3c01d" -dependencies = [ - "base64 0.22.1", - "log", - "once_cell", - "rustls 0.23.36", - "rustls-pki-types", - "url", - "webpki-roots 0.26.11", -] - [[package]] name = "url" version = "2.5.8" @@ -10984,12 +10583,6 @@ dependencies = [ "rustls-pki-types", ] -[[package]] -name = "whichlang" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b9aa3ad29c3d08283ac6b769e3ec15ad1ddb88af7d2e9bc402c574973b937e7" - [[package]] name = "whoami" version = "1.6.1" @@ -11146,17 +10739,6 @@ dependencies = [ "windows-link 0.1.3", ] -[[package]] -name = "windows-registry" -version = "0.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "02752bf7fbdcce7f2a27a742f798510f3e5ad88dbe84871e5168e2120c3d5720" -dependencies = [ - "windows-link 0.2.1", - "windows-result 0.4.1", - "windows-strings 0.5.1", -] - [[package]] name = "windows-result" version = "0.3.4" @@ -11496,16 +11078,6 @@ version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9" -[[package]] -name = "xattr" -version = "1.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32e45ad4206f6d2479085147f02bc2ef834ac85886624a23575ae137c8aa8156" -dependencies = [ - "libc", - "rustix 1.1.3", -] - [[package]] name = "xmlparser" version = "0.13.6" @@ -11518,12 +11090,6 @@ version = "0.8.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fdd20c5420375476fbd4394763288da7eb0cc0b8c11deed431a91562af7335d3" -[[package]] -name = "yada" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aed111bd9e48a802518765906cbdadf0b45afb72b9c81ab049a3b86252adffdd" - [[package]] name = "yansi" version = "1.0.1" diff --git a/quickwit/Cargo.toml b/quickwit/Cargo.toml index 453b5850761..0a9b40c55e8 100644 --- a/quickwit/Cargo.toml +++ b/quickwit/Cargo.toml @@ -102,11 +102,11 @@ colored = "3.0" console-subscriber = "0.5" criterion = { version = "0.8", features = ["async_tokio"] } cron = "0.15" -dialoguer = "0.12" +dialoguer = { version = "0.12", default-features = false } dotenvy = "0.15" dyn-clone = "1.0" enum-iterator = "2.3" -env_logger = "0.11" +env_logger = { version = "0.11", default-features = false, features = ["auto-color"] } fail = "0.5" flate2 = "1.1" flume = "0.12" @@ -131,23 +131,18 @@ http-serde = "2.1" humantime = "2.3" hyper = { version = "1.8", features = ["client", "http1", "http2", "server"] } hyper-rustls = "0.27" -hyper-util = { version = "0.1", features = ["full"] } +hyper-util = { version = "0.1", default-features = false, features = [ + "client-legacy", + "server-auto", + "server-graceful", + "service", + "tokio", +] } indexmap = { version = "2.12", features = ["serde"] } indicatif = "0.18" itertools = "0.14" json_comments = "0.2" libz-sys = "1.1" -# Lindera tokenizer 0.30+ versions (tested up to 0.32.3) are currently broken due to upstream build failures. 
-# The dictionary crates attempt to download artifacts from S3 URLs that return 404 Not Found. -# Version 0.29.0 is the latest version that builds correctly. It also explicitly depends on lindera-core 0.29 -# and lindera-dictionary 0.29. -lindera-core = "0.29" -lindera-dictionary = "0.29" -lindera-tokenizer = { version = "0.29", features = [ - "cc-cedict", - "ipadic", - "ko-dic", -] } lru = "0.16" matches = "0.1" md5 = "0.8" @@ -175,7 +170,7 @@ pprof = { version = "0.15", features = ["flamegraph"] } predicates = "3" prettyplease = "0.2" proc-macro2 = "1.0" -prometheus = { version = "0.14", features = ["process"] } +prometheus = { version = "0.14", default-features = false, features = ["process"] } proptest = "1" prost = { version = "0.14", default-features = false, features = [ "derive", @@ -245,7 +240,10 @@ tokio = { version = "1.48", features = ["full"] } tokio-metrics = { version = "0.4", features = ["rt"] } tokio-rustls = { version = "0.26", default-features = false } tokio-stream = { version = "0.1", features = ["sync"] } -tokio-util = { version = "0.7", features = ["full"] } +tokio-util = { version = "0.7", default-features = false, features = [ + "compat", + "io-util", +] } toml = "0.9" tonic = { version = "0.14", features = [ "_tls-any", @@ -295,9 +293,8 @@ vrl = { version = "0.29", default-features = false, features = [ "value", ] } warp = { version = "0.4", features = ["server", "test"] } -whichlang = "0.1" wiremock = "0.6" -zstd = "0.13" +zstd = { version = "0.13", default-features = false } aws-config = "1.8" aws-credential-types = { version = "1.2", features = ["hardcoded-credentials"] } diff --git a/quickwit/quickwit-actors/src/actor.rs b/quickwit/quickwit-actors/src/actor.rs index 2fa32d7f2a5..bb5a48239a4 100644 --- a/quickwit/quickwit-actors/src/actor.rs +++ b/quickwit/quickwit-actors/src/actor.rs @@ -18,7 +18,6 @@ use std::sync::Arc; use async_trait::async_trait; use thiserror::Error; -use tracing::error; use crate::{ActorContext, QueueCapacity, 
SendError}; diff --git a/quickwit/quickwit-cli/Cargo.toml b/quickwit/quickwit-cli/Cargo.toml index c595cb7e90a..8819d92ec97 100644 --- a/quickwit/quickwit-cli/Cargo.toml +++ b/quickwit/quickwit-cli/Cargo.toml @@ -59,7 +59,6 @@ quickwit-actors = { workspace = true } quickwit-cluster = { workspace = true } quickwit-common = { workspace = true } quickwit-config = { workspace = true } -quickwit-doc-mapper = { workspace = true } quickwit-index-management = { workspace = true } quickwit-indexing = { workspace = true } quickwit-ingest = { workspace = true } @@ -105,7 +104,6 @@ release-feature-set = [ "quickwit-storage/azure", "quickwit-storage/gcs", "quickwit-metastore/postgres", - "quickwit-doc-mapper/multilang", ] release-feature-vendored-set = [ "jemalloc", @@ -119,7 +117,6 @@ release-feature-vendored-set = [ "quickwit-storage/azure", "quickwit-storage/gcs", "quickwit-metastore/postgres", - "quickwit-doc-mapper/multilang", ] release-macos-feature-vendored-set = [ "jemalloc", @@ -132,13 +129,8 @@ release-macos-feature-vendored-set = [ "quickwit-storage/azure", "quickwit-storage/gcs", "quickwit-metastore/postgres", - "quickwit-doc-mapper/multilang", ] release-jemalloc-profiled = [ "release-feature-set", "jemalloc-profiled", ] - -[package.metadata.cargo-machete] -# used to enable the `multilang` feature -ignored = ["quickwit-doc-mapper"] diff --git a/quickwit/quickwit-common/src/rate_limited_tracing.rs b/quickwit/quickwit-common/src/rate_limited_tracing.rs index c9a323f9ec2..198c2bf8bdd 100644 --- a/quickwit/quickwit-common/src/rate_limited_tracing.rs +++ b/quickwit/quickwit-common/src/rate_limited_tracing.rs @@ -179,12 +179,13 @@ fn _check_macro_works() { #[doc(hidden)] pub use coarsetime::Instant as CoarsetimeInstant; +pub use rate_limited_debug; +pub use rate_limited_error; +pub use rate_limited_info; +pub use rate_limited_trace; #[doc(hidden)] pub use rate_limited_tracing; -pub use { - rate_limited_debug, rate_limited_error, rate_limited_info, rate_limited_trace, - 
rate_limited_warn, -}; +pub use rate_limited_warn; #[cfg(test)] mod tests { diff --git a/quickwit/quickwit-config/src/storage_config.rs b/quickwit/quickwit-config/src/storage_config.rs index 52daffdb537..7a9af4b1cdf 100644 --- a/quickwit/quickwit-config/src/storage_config.rs +++ b/quickwit/quickwit-config/src/storage_config.rs @@ -425,6 +425,7 @@ impl fmt::Debug for S3StorageConfig { "disable_multi_object_delete", &self.disable_multi_object_delete, ) + .field("encryption", &self.encryption) .finish() } } diff --git a/quickwit/quickwit-datetime/src/java_date_time_format.rs b/quickwit/quickwit-datetime/src/java_date_time_format.rs index 2ef63f32881..a0d6c1cb0f5 100644 --- a/quickwit/quickwit-datetime/src/java_date_time_format.rs +++ b/quickwit/quickwit-datetime/src/java_date_time_format.rs @@ -261,14 +261,17 @@ fn resolve_java_datetime_format_alias(java_datetime_format: &str) -> &str { OnceLock::new(); let java_datetime_format_map = JAVA_DATE_FORMAT_ALIASES.get_or_init(|| { let mut m = HashMap::new(); - m.insert("date_optional_time", "yyyy-MM-dd['T'HH:mm:ss.SSSZ]"); + m.insert( + "date_optional_time", + "yyyy[-MM[-dd['T'HH[:mm[:ss[.SSS][Z]]]]]]", + ); m.insert( "strict_date_optional_time", - "yyyy[-MM[-dd['T'HH[:mm[:ss[.SSS[Z]]]]]]]", + "yyyy[-MM[-dd['T'HH[:mm[:ss[.SSS][Z]]]]]]", ); m.insert( "strict_date_optional_time_nanos", - "yyyy[-MM[-dd['T'HH:mm:ss.SSSSSSZ]]]", + "yyyy[-MM[-dd['T'HH[:mm[:ss[.SSSSSS][Z]]]]]]", ); m.insert("basic_date", "yyyyMMdd"); @@ -660,6 +663,7 @@ mod tests { "2019-03-23T21:35:46.123+00:00", "2019-03-23T21:36:46.123+03:00", "2019-03-23T21:37:46.123+0300", + "2019-03-23T21:38:46+00:00", ]; let expected = [ datetime!(2019-01-01 00:00:00 UTC), @@ -671,6 +675,7 @@ mod tests { datetime!(2019-03-23 21:35:46.123 UTC), datetime!(2019-03-23 21:36:46.123 +03:00:00), datetime!(2019-03-23 21:37:46.123 +03:00:00), + datetime!(2019-03-23 21:38:46 UTC), ]; for (date_str, &expected_dt) in dates.iter().zip(expected.iter()) { let parsed_dt = parser @@ -692,6 
+697,7 @@ mod tests { "2019-03-23T21:35:46.123456789+00:00", "2019-03-23T21:36:46.123456789+03:00", "2019-03-23T21:37:46.123456789+0300", + "2019-03-23T21:38:46+00:00", ]; let expected = [ datetime!(2019-01-01 00:00:00 UTC), @@ -701,6 +707,7 @@ mod tests { datetime!(2019-03-23 21:35:46.123456789 UTC), datetime!(2019-03-23 21:36:46.123456789 +03:00:00), datetime!(2019-03-23 21:37:46.123456789 +03:00:00), + datetime!(2019-03-23 21:38:46 UTC), ]; for (date_str, &expected_dt) in dates.iter().zip(expected.iter()) { let parsed_dt = parser diff --git a/quickwit/quickwit-doc-mapper/Cargo.toml b/quickwit/quickwit-doc-mapper/Cargo.toml index ae0239e53c5..92c977fe4da 100644 --- a/quickwit/quickwit-doc-mapper/Cargo.toml +++ b/quickwit/quickwit-doc-mapper/Cargo.toml @@ -42,10 +42,9 @@ serde_yaml = { workspace = true } time = { workspace = true } quickwit-common = { workspace = true, features = ["testsuite"] } -quickwit-query = { workspace = true, features = ["multilang"] } +quickwit-query = { workspace = true } [features] -multilang = ["quickwit-query/multilang"] testsuite = [] [[bench]] diff --git a/quickwit/quickwit-doc-mapper/src/doc_mapper/field_mapping_entry.rs b/quickwit/quickwit-doc-mapper/src/doc_mapper/field_mapping_entry.rs index ae3388aee32..e69d337a616 100644 --- a/quickwit/quickwit-doc-mapper/src/doc_mapper/field_mapping_entry.rs +++ b/quickwit/quickwit-doc-mapper/src/doc_mapper/field_mapping_entry.rs @@ -1152,7 +1152,7 @@ mod tests { "type": "text", "stored": true, "record": "basic", - "tokenizer": "en_stem" + "tokenizer": "lowercase" } "#, )?; @@ -1161,7 +1161,7 @@ mod tests { FieldMappingType::Text(options, _) => { assert_eq!(options.stored, true); let indexing_options = options.indexing_options.unwrap(); - assert_eq!(indexing_options.tokenizer.name(), "en_stem"); + assert_eq!(indexing_options.tokenizer.name(), "lowercase"); assert_eq!(indexing_options.record, IndexRecordOption::Basic); } _ => panic!("wrong property type"), diff --git 
a/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs b/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs index bed4b18b90f..749dde228a7 100644 --- a/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs +++ b/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs @@ -28,8 +28,6 @@ use std::ops::Bound; pub use doc_mapper_builder::DocMapperBuilder; pub use doc_mapper_impl::DocMapper; -#[cfg(all(test, feature = "multilang"))] -pub(crate) use field_mapping_entry::TextIndexingOptions; pub use field_mapping_entry::{ BinaryFormat, FastFieldOptions, FieldMappingEntry, QuickwitBytesOptions, QuickwitJsonOptions, QuickwitTextNormalizer, @@ -812,55 +810,4 @@ mod tests { warmup_info.simplify(); assert_eq!(warmup_info, expected); } - - #[test] - #[cfg(feature = "multilang")] - fn test_doc_mapper_query_with_multilang_field() { - use quickwit_query::query_ast::TermQuery; - use tantivy::schema::IndexRecordOption; - - use crate::doc_mapper::{ - QuickwitTextOptions, QuickwitTextTokenizer, TextIndexingOptions, TokenizerType, - }; - use crate::{TokenizerConfig, TokenizerEntry}; - let mut doc_mapper_builder = DocMapperBuilder::default(); - doc_mapper_builder - .doc_mapping - .field_mappings - .push(FieldMappingEntry { - name: "multilang".to_string(), - mapping_type: FieldMappingType::Text( - QuickwitTextOptions { - indexing_options: Some(TextIndexingOptions { - tokenizer: QuickwitTextTokenizer::from_static("multilang"), - record: IndexRecordOption::Basic, - fieldnorms: false, - }), - ..Default::default() - }, - Cardinality::SingleValued, - ), - }); - doc_mapper_builder - .doc_mapping - .tokenizers - .push(TokenizerEntry { - name: "multilang".to_string(), - config: TokenizerConfig { - tokenizer_type: TokenizerType::Multilang, - filters: Vec::new(), - }, - }); - let doc_mapper = doc_mapper_builder.try_build().unwrap(); - let schema = doc_mapper.schema(); - let query_ast = quickwit_query::query_ast::QueryAst::Term(TermQuery { - field: "multilang".to_string(), - value: "JPN:す".to_string(), - }); 
- let (query, _) = doc_mapper.query(schema, query_ast, false, None).unwrap(); - assert_eq!( - format!("{query:?}"), - r#"TermQuery(Term(field=2, type=Str, "JPN:す"))"# - ); - } } diff --git a/quickwit/quickwit-doc-mapper/src/doc_mapper/tokenizer_entry.rs b/quickwit/quickwit-doc-mapper/src/doc_mapper/tokenizer_entry.rs index b9793dc9548..0488d118c9f 100644 --- a/quickwit/quickwit-doc-mapper/src/doc_mapper/tokenizer_entry.rs +++ b/quickwit/quickwit-doc-mapper/src/doc_mapper/tokenizer_entry.rs @@ -44,10 +44,6 @@ impl TokenizerConfig { pub fn text_analyzer(&self) -> anyhow::Result { let mut text_analyzer_builder = match &self.tokenizer_type { TokenizerType::Simple => TextAnalyzer::builder(SimpleTokenizer::default()).dynamic(), - #[cfg(any(test, feature = "multilang"))] - TokenizerType::Multilang => { - TextAnalyzer::builder(quickwit_query::MultiLangTokenizer::default()).dynamic() - } TokenizerType::SourceCode => TextAnalyzer::builder(CodeTokenizer::default()).dynamic(), TokenizerType::Ngram(options) => { let tokenizer = @@ -120,8 +116,6 @@ impl TokenFilterType { #[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize, utoipa::ToSchema)] #[serde(tag = "type", rename_all = "snake_case")] pub enum TokenizerType { - #[cfg(any(test, feature = "multilang"))] - Multilang, Ngram(NgramTokenizerOption), Regex(RegexTokenizerOption), Simple, diff --git a/quickwit/quickwit-indexing/src/source/ingest/mod.rs b/quickwit/quickwit-indexing/src/source/ingest/mod.rs index d9e21affb87..63c746aabe0 100644 --- a/quickwit/quickwit-indexing/src/source/ingest/mod.rs +++ b/quickwit/quickwit-indexing/src/source/ingest/mod.rs @@ -410,9 +410,8 @@ impl IngestSource { .assigned_shards .keys() .filter(|&shard_id| !new_assigned_shard_ids.contains(shard_id)) - .cloned() .any(|removed_shard_id| { - let Some(assigned_shard) = self.assigned_shards.get(&removed_shard_id) else { + let Some(assigned_shard) = self.assigned_shards.get(removed_shard_id) else { return false; }; assigned_shard.status != 
IndexingStatus::Complete diff --git a/quickwit/quickwit-proto/src/codegen/jaeger/opentelemetry.proto.trace.v1.rs b/quickwit/quickwit-proto/src/codegen/jaeger/opentelemetry.proto.trace.v1.rs index 6736d97c7e2..afa08ca3c9d 100644 --- a/quickwit/quickwit-proto/src/codegen/jaeger/opentelemetry.proto.trace.v1.rs +++ b/quickwit/quickwit-proto/src/codegen/jaeger/opentelemetry.proto.trace.v1.rs @@ -120,10 +120,12 @@ pub struct Span { /// attributes is a collection of key/value pairs. Note, global attributes /// like server name can be set using the resource API. Examples of attributes: /// - /// "/http/user_agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36" - /// "/http/server_latency": 300 - /// "abc.com/myattribute": true - /// "abc.com/score": 10.239 + /// ```text + /// "/http/user_agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36" + /// "/http/server_latency": 300 + /// "abc.com/myattribute": true + /// "abc.com/score": 10.239 + /// ``` /// /// The OpenTelemetry API specification further restricts the allowed value types: /// @@ -276,7 +278,7 @@ pub mod span { } /// The Status type defines a logical error model that is suitable for different /// programming environments, including REST APIs and RPC APIs. -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] pub struct Status { /// A developer-facing human readable error message. 
#[prost(string, tag = "2")] diff --git a/quickwit/quickwit-proto/src/lib.rs b/quickwit/quickwit-proto/src/lib.rs index f4ddb734d2a..f89fdb97687 100644 --- a/quickwit/quickwit-proto/src/lib.rs +++ b/quickwit/quickwit-proto/src/lib.rs @@ -28,7 +28,8 @@ use tracing_opentelemetry::OpenTelemetrySpanExt; pub mod cluster; pub mod control_plane; -pub use {bytes, tonic}; +pub use bytes; +pub use tonic; pub mod developer; pub mod error; mod getters; diff --git a/quickwit/quickwit-query/Cargo.toml b/quickwit/quickwit-query/Cargo.toml index 066c00c0ff7..f24d8662715 100644 --- a/quickwit/quickwit-query/Cargo.toml +++ b/quickwit/quickwit-query/Cargo.toml @@ -15,9 +15,6 @@ anyhow = { workspace = true } base64 = { workspace = true } bitpacking = { workspace = true } hex = { workspace = true } -lindera-core = { workspace = true, optional = true } -lindera-dictionary = { workspace = true, optional = true } -lindera-tokenizer = { workspace = true, optional = true } once_cell = { workspace = true } regex = { workspace = true } serde = { workspace = true } @@ -29,7 +26,6 @@ tracing = { workspace = true } time = { workspace = true } thiserror = { workspace = true } rustc-hash = { workspace = true } -whichlang = { workspace = true, optional = true } quickwit-common = { workspace = true } quickwit-datetime = { workspace = true } @@ -42,19 +38,6 @@ time = { workspace = true } quickwit-common = { workspace = true, features = ["testsuite"] } -[features] -multilang = [ - "lindera-core", - "lindera-dictionary", - "lindera-tokenizer", - "whichlang", - "tantivy/stemmer", -] - [[bench]] name = "tokenizers_bench" harness = false - -[[bench]] -name = "multilang_tokenizers_bench" -harness = false diff --git a/quickwit/quickwit-query/benches/multilang_tokenizers_bench.rs b/quickwit/quickwit-query/benches/multilang_tokenizers_bench.rs deleted file mode 100644 index 61755dea556..00000000000 --- a/quickwit/quickwit-query/benches/multilang_tokenizers_bench.rs +++ /dev/null @@ -1,167 +0,0 @@ -// Copyright 
2021-Present Datadog, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use criterion::{Criterion, Throughput, black_box, criterion_group, criterion_main}; -use quickwit_query::create_default_quickwit_tokenizer_manager; -use tantivy::tokenizer::{TextAnalyzer, Token, TokenStream}; - -// A random ascii string of length 100 chars. -const ASCII_SHORT: &str = "It is a long established fact"; -static ASCII_LONG: &str = r#"It is a long established fact that a reader will be distracted by the readable content of a - page when looking at its layout. The point of using Lorem Ipsum is that it has a - more-or-less normal distribution of letters, as opposed to using 'Content here, content - here', making it look like readable English. Many desktop publishing packages and web page - editors now use Lorem Ipsum as their default model text, and a search for 'lorem ipsum' will - uncover many web sites still in their infancy. 
Various versions have evolved over the years, - sometimes by accident, sometimes on purpose (injected humour and the like)."#; -const JPN_SHORT: &str = "日本ごです。 とても素敵な言葉ですね"; -const JPN_LONG: &str = r#"日本ごです。 和名の由来は、 - 太陽の動きにつれてその方向を追うように花が回るといわれたことから。 - ただしこの動きは生長に伴うものであるため、 - 実際に太陽を追って動くのは生長が盛んな若い時期だけである。 - 若いヒマワリの茎の上部の葉は太陽に正対になるように動き、 - 朝には東を向いていたのが夕方には西を向く。日没後はまもなく起きあがり、 - 夜明け前にはふたたび東に向く。この運動はつぼみを付ける頃まで続くが、 - つぼみが大きくなり花が開く素敵な言葉ですね."#; -const CMN_SHORT: &str = "滚滚长江东逝水,浪花淘尽英雄。"; -const CMN_LONG: &str = r#"滚滚长江东逝水,浪花淘尽英雄。是非成败转头空,青山依旧在,几度夕阳红。 - 白发渔樵江渚上,惯看秋月春风。一壶浊酒喜相逢,古今多少事,都付笑谈中。 - 是非成败转头空,青山依旧在,惯看秋月春风。一壶浊酒喜相逢,古今多少事, - 滚滚长江东逝水,浪花淘尽英雄。 几度夕阳红。白发渔樵江渚上,都付笑谈中。"#; -const KOR_SHORT: &str = "안녕하세요. 반갑습니다."; -const KOR_LONG: &str = r#" -포근히 내려오는 눈밭속에서는 -낯이 붉은 處女아이들도 깃들이어 오는 소리… -울고 -웃고 -수구리고 -새파라니 얼어서 -運命들이 모두다 안끼어 드는 소리… -큰놈에겐 큰 눈물자국, 작은놈에겐 작은 웃음 흔적 -큰이얘기 작은이얘기들이 오부록이 도란 그리며 안끼어 오는 소리 -끊임없이 내리는 눈발 속에서는 -山도 山도 靑山도 안끼어 드는 소리 -"#; - -fn process_tokens(analyzer: &mut TextAnalyzer, text: &str) -> Vec { - let mut token_stream = analyzer.token_stream(text); - let mut tokens: Vec = Vec::new(); - token_stream.process(&mut |token: &Token| tokens.push(token.clone())); - tokens -} - -pub fn tokenizers_throughput_benchmark(c: &mut Criterion) { - let mut group = c.benchmark_group("multilang"); - let tokenizer_manager = create_default_quickwit_tokenizer_manager(); - let mut default_tokenizer = tokenizer_manager.get_tokenizer("default").unwrap(); - let mut multilang_tokenizer = tokenizer_manager.get_tokenizer("multilang").unwrap(); - let mut chinese_tokenizer = tokenizer_manager - .get_tokenizer("chinese_compatible") - .unwrap(); - - group - .throughput(Throughput::Bytes(ASCII_SHORT.len() as u64)) - .bench_with_input("default-tokenize-short", ASCII_SHORT, |b, text| { - b.iter(|| process_tokens(&mut default_tokenizer, black_box(text))); - }); - group - .throughput(Throughput::Bytes(ASCII_LONG.len() as u64)) - .bench_with_input("default-tokenize-long", ASCII_LONG, |b, text| { - b.iter(|| 
process_tokens(&mut default_tokenizer, black_box(text))); - }); - group - .throughput(Throughput::Bytes(ASCII_SHORT.len() as u64)) - .bench_with_input("multilang-eng-tokenize-short", ASCII_SHORT, |b, text| { - b.iter(|| process_tokens(&mut multilang_tokenizer, black_box(text))); - }); - group - .throughput(Throughput::Bytes(ASCII_LONG.len() as u64)) - .bench_with_input("multilang-eng-tokenize-long", ASCII_LONG, |b, text| { - b.iter(|| process_tokens(&mut multilang_tokenizer, black_box(text))); - }); - let short_with_prefix = "ENG:".to_string() + ASCII_SHORT; - group - .throughput(Throughput::Bytes(ASCII_SHORT.len() as u64)) - .bench_with_input( - "multilang-tokenize-short-with-prefix", - &short_with_prefix, - |b, text| { - b.iter(|| process_tokens(&mut multilang_tokenizer, black_box(text))); - }, - ); - let long_with_prefix = "ENG:".to_string() + ASCII_LONG; - group - .throughput(Throughput::Bytes(ASCII_LONG.len() as u64)) - .bench_with_input( - "multilang-tokenize-long-with-prefix", - &long_with_prefix, - |b, text| { - b.iter(|| process_tokens(&mut multilang_tokenizer, black_box(text))); - }, - ); - group - .throughput(Throughput::Bytes(JPN_SHORT.len() as u64)) - .bench_with_input("multilang-tokenize-jpn-short", JPN_SHORT, |b, text| { - b.iter(|| process_tokens(&mut multilang_tokenizer, black_box(text))); - }); - group - .throughput(Throughput::Bytes(JPN_LONG.len() as u64)) - .bench_with_input("multilang-tokenize-jpn-long", JPN_LONG, |b, text| { - b.iter(|| process_tokens(&mut multilang_tokenizer, black_box(text))); - }); - group - .throughput(Throughput::Bytes(CMN_SHORT.len() as u64)) - .bench_with_input("multilang-tokenize-cmn-short", CMN_SHORT, |b, text| { - b.iter(|| process_tokens(&mut multilang_tokenizer, black_box(text))); - }); - group - .throughput(Throughput::Bytes(CMN_LONG.len() as u64)) - .bench_with_input("multilang-tokenize-cmn-long", CMN_LONG, |b, text| { - b.iter(|| process_tokens(&mut multilang_tokenizer, black_box(text))); - }); - group - 
.throughput(Throughput::Bytes(KOR_SHORT.len() as u64)) - .bench_with_input("multilang-tokenize-kor-short", KOR_SHORT, |b, text| { - b.iter(|| process_tokens(&mut multilang_tokenizer, black_box(text))); - }); - group - .throughput(Throughput::Bytes(KOR_LONG.len() as u64)) - .bench_with_input("multilang-tokenize-kor-long", KOR_LONG, |b, text| { - b.iter(|| process_tokens(&mut multilang_tokenizer, black_box(text))); - }); - group - .throughput(Throughput::Bytes(CMN_SHORT.len() as u64)) - .bench_with_input( - "chinese-compatible-tokenize-cmn-short", - CMN_SHORT, - |b, text| { - b.iter(|| process_tokens(&mut chinese_tokenizer, black_box(text))); - }, - ); - group - .throughput(Throughput::Bytes(CMN_LONG.len() as u64)) - .bench_with_input( - "chinese-compatible-tokenize-cmn-long", - CMN_LONG, - |b, text| { - b.iter(|| process_tokens(&mut chinese_tokenizer, black_box(text))); - }, - ); -} - -criterion_group!( - tokenizers_throughput_benches, - tokenizers_throughput_benchmark -); -criterion_main!(tokenizers_throughput_benches); diff --git a/quickwit/quickwit-query/src/lib.rs b/quickwit/quickwit-query/src/lib.rs index b2040f73daa..8f70e155933 100644 --- a/quickwit/quickwit-query/src/lib.rs +++ b/quickwit/quickwit-query/src/lib.rs @@ -38,8 +38,6 @@ pub(crate) use not_nan_f32::NotNaNf32; pub use query_ast::utils::find_field_or_hit_dynamic; use serde::{Deserialize, Serialize}; pub use tantivy::query::Query as TantivyQuery; -#[cfg(feature = "multilang")] -pub use tokenizers::MultiLangTokenizer; pub use tokenizers::{ CodeTokenizer, DEFAULT_REMOVE_TOKEN_LENGTH, create_default_quickwit_tokenizer_manager, get_quickwit_fastfield_normalizer_manager, diff --git a/quickwit/quickwit-query/src/query_ast/wildcard_query.rs b/quickwit/quickwit-query/src/query_ast/wildcard_query.rs index 84176f4a4aa..7b24a66163d 100644 --- a/quickwit/quickwit-query/src/query_ast/wildcard_query.rs +++ b/quickwit/quickwit-query/src/query_ast/wildcard_query.rs @@ -247,7 +247,6 @@ mod tests { "raw_lowercase", 
"lowercase", "default", - "en_stem", "chinese_compatible", "source_code_default", "source_code_with_hex", @@ -290,7 +289,6 @@ mod tests { "raw_lowercase", "lowercase", "default", - "en_stem", "chinese_compatible", "source_code_default", "source_code_with_hex", @@ -335,7 +333,6 @@ mod tests { "raw_lowercase", "lowercase", "default", - "en_stem", "chinese_compatible", "source_code_default", "source_code_with_hex", @@ -398,7 +395,6 @@ mod tests { "raw_lowercase", "lowercase", "default", - "en_stem", "chinese_compatible", "source_code_default", "source_code_with_hex", diff --git a/quickwit/quickwit-query/src/tokenizers/mod.rs b/quickwit/quickwit-query/src/tokenizers/mod.rs index d086c36a977..5a90715075e 100644 --- a/quickwit/quickwit-query/src/tokenizers/mod.rs +++ b/quickwit/quickwit-query/src/tokenizers/mod.rs @@ -14,8 +14,6 @@ mod chinese_compatible; mod code_tokenizer; -#[cfg(feature = "multilang")] -mod multilang; mod tokenizer_manager; use once_cell::sync::Lazy; @@ -26,8 +24,6 @@ use tantivy::tokenizer::{ use self::chinese_compatible::ChineseTokenizer; pub use self::code_tokenizer::CodeTokenizer; -#[cfg(feature = "multilang")] -pub use self::multilang::MultiLangTokenizer; pub use self::tokenizer_manager::{RAW_TOKENIZER_NAME, TokenizerManager}; pub const DEFAULT_REMOVE_TOKEN_LENGTH: usize = 255; @@ -58,17 +54,6 @@ pub fn create_default_quickwit_tokenizer_manager() -> TokenizerManager { .filter(LowerCaser) .build(); tokenizer_manager.register("default", default_tokenizer, true); - #[cfg(feature = "multilang")] - { - let en_stem_tokenizer = TextAnalyzer::builder(SimpleTokenizer::default()) - .filter(RemoveLongFilter::limit(DEFAULT_REMOVE_TOKEN_LENGTH)) - .filter(LowerCaser) - .filter(tantivy::tokenizer::Stemmer::new( - tantivy::tokenizer::Language::English, - )) - .build(); - tokenizer_manager.register("en_stem", en_stem_tokenizer, true); - } tokenizer_manager.register("whitespace", WhitespaceTokenizer::default(), false); let chinese_tokenizer = 
TextAnalyzer::builder(ChineseTokenizer) @@ -94,15 +79,6 @@ pub fn create_default_quickwit_tokenizer_manager() -> TokenizerManager { .build(), true, ); - #[cfg(feature = "multilang")] - tokenizer_manager.register( - "multilang_default", - TextAnalyzer::builder(MultiLangTokenizer::default()) - .filter(RemoveLongFilter::limit(DEFAULT_REMOVE_TOKEN_LENGTH)) - .filter(LowerCaser) - .build(), - true, - ); tokenizer_manager } diff --git a/quickwit/quickwit-query/src/tokenizers/multilang.rs b/quickwit/quickwit-query/src/tokenizers/multilang.rs deleted file mode 100644 index a62d2ff151c..00000000000 --- a/quickwit/quickwit-query/src/tokenizers/multilang.rs +++ /dev/null @@ -1,334 +0,0 @@ -// Copyright 2021-Present Datadog, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use lindera_core::mode::Mode; -use lindera_dictionary::{DictionaryConfig, DictionaryKind, load_dictionary_from_config}; -use lindera_tokenizer::token::Token as LinderaToken; -use lindera_tokenizer::tokenizer::Tokenizer as LinderaTokenizer; -use once_cell::sync::Lazy; -use tantivy::tokenizer::{SimpleTokenStream, SimpleTokenizer, Token, TokenStream, Tokenizer}; -use whichlang::{Lang, detect_language}; - -// Note(fmassot): we use `lindera_tokenizer::tokenizer::Tokenizer` and not -// `use lindera_tantivy::tokenizer::LinderaTokenizer` to avoid -// costly copy of lindera dictionaries each time we clone the `MultiLangTokenizer`. - -/// Mandarin chinese tokenizer. 
-static CMN_TOKENIZER: Lazy = Lazy::new(|| { - let cmn_dictionary_config = DictionaryConfig { - kind: Some(DictionaryKind::CcCedict), - path: None, - }; - let cmn_dictionary = load_dictionary_from_config(cmn_dictionary_config) - .expect("Lindera `CcCedict` dictionary must be present"); - LinderaTokenizer::new(cmn_dictionary, None, Mode::Normal) -}); - -/// Japanese tokenizer. -static JPN_TOKENIZER: Lazy = Lazy::new(|| { - let jpn_dictionary_config = DictionaryConfig { - kind: Some(DictionaryKind::IPADIC), - path: None, - }; - let jpn_dictionary = load_dictionary_from_config(jpn_dictionary_config) - .expect("Lindera `IPADIC` dictionary must be present"); - LinderaTokenizer::new(jpn_dictionary, None, Mode::Normal) -}); - -/// Korean tokenizer. -static KOR_TOKENIZER: Lazy = Lazy::new(|| { - let kor_dictionary_config = DictionaryConfig { - kind: Some(DictionaryKind::KoDic), - path: None, - }; - let kor_dictionary = load_dictionary_from_config(kor_dictionary_config) - .expect("Lindera `KoDic` dictionary must be present"); - LinderaTokenizer::new(kor_dictionary, None, Mode::Normal) -}); - -/// Multilanguage tokenizer that uses the `whichlang` to detect the language of the text -/// and uses the appropriate tokenizer for the detected language: -/// - lindera for Chinese, Japanese, and Korean. -/// - Quickwit's default tokenizer for other languages. -/// -/// It is possible to bypass the language detection by prefixing the text with the language code -/// followed by a colon. For example, `KOR:일본입니다` will be tokenized by the korean tokenizer. 
-/// Current supported prefix are: -/// - `KOR:` for Korean tokenizer -/// - `JPN:` for Japanese tokenizer -/// - `CMN:` for Chinese tokenizer -/// - `ENG:` for Quickwit's default tokenizer -#[derive(Clone, Default)] -pub struct MultiLangTokenizer { - default_tokenizer: SimpleTokenizer, - token: Token, -} - -impl Tokenizer for MultiLangTokenizer { - type TokenStream<'a> = MultiLanguageTokenStream<'a>; - fn token_stream<'a>(&'a mut self, text: &'a str) -> MultiLanguageTokenStream<'a> { - self.token.reset(); - let (language_prefix, text_to_tokenize) = get_language_from_prefix(text); - // If the text is empty, we return an empty token stream. - // `whichlang::detect_language` panicks if the text is empty. - if text.trim().is_empty() { - return MultiLanguageTokenStream::Empty; - } - let language = language_prefix.unwrap_or_else(|| detect_language(text_to_tokenize)); - match language { - Lang::Cmn => { - let lindera_token_stream = LinderaTokenStream { - tokens: CMN_TOKENIZER - .tokenize(text_to_tokenize) - .expect("tokenize method should never fail"), - token: &mut self.token, - }; - MultiLanguageTokenStream::Lindera(lindera_token_stream) - } - Lang::Jpn => { - let lindera_token_stream = LinderaTokenStream { - tokens: JPN_TOKENIZER - .tokenize(text_to_tokenize) - .expect("tokenize method should never fail"), - token: &mut self.token, - }; - MultiLanguageTokenStream::Lindera(lindera_token_stream) - } - Lang::Kor => { - let lindera_token_stream = LinderaTokenStream { - tokens: KOR_TOKENIZER - .tokenize(text_to_tokenize) - .expect("tokenize method should never fail"), - token: &mut self.token, - }; - MultiLanguageTokenStream::Lindera(lindera_token_stream) - } - _ => MultiLanguageTokenStream::Simple( - self.default_tokenizer.token_stream(text_to_tokenize), - ), - } - } -} - -/// Gets the language defined by a prefix `{ID}:text` where ID being the 3-letter language used by -/// whichlang) and returns the language and the text without the prefix. 
If the prefix is not -/// recognized, the language is `None` and the text is the original. -fn get_language_from_prefix(text: &str) -> (Option, &str) { - let prefix_bytes = &text.as_bytes()[0..std::cmp::min(4, text.len())]; - // TODO: refactor. - let prefix_language = match prefix_bytes { - b"CMN:" => Some(Lang::Cmn), - b"ENG:" => Some(Lang::Eng), - b"JPN:" => Some(Lang::Jpn), - b"KOR:" => Some(Lang::Kor), - _ => None, - }; - let text_without_prefix = if prefix_language.is_some() { - // This is safe as we know that the prefix is made of 4 ascii characters. - &text[4..] - } else { - text - }; - (prefix_language, text_without_prefix) -} -pub enum MultiLanguageTokenStream<'a> { - Empty, - Lindera(LinderaTokenStream<'a>), - Simple(SimpleTokenStream<'a>), -} - -impl TokenStream for MultiLanguageTokenStream<'_> { - fn advance(&mut self) -> bool { - match self { - MultiLanguageTokenStream::Empty => false, - MultiLanguageTokenStream::Lindera(tokenizer) => tokenizer.advance(), - MultiLanguageTokenStream::Simple(tokenizer) => tokenizer.advance(), - } - } - - fn token(&self) -> &Token { - match self { - MultiLanguageTokenStream::Empty => { - panic!("Cannot call token() on an empty token stream.") - } - MultiLanguageTokenStream::Lindera(tokenizer) => tokenizer.token(), - MultiLanguageTokenStream::Simple(tokenizer) => tokenizer.token(), - } - } - - fn token_mut(&mut self) -> &mut Token { - match self { - MultiLanguageTokenStream::Empty => { - panic!("Cannot call token_mut() on an empty token stream.") - } - MultiLanguageTokenStream::Lindera(tokenizer) => tokenizer.token_mut(), - MultiLanguageTokenStream::Simple(tokenizer) => tokenizer.token_mut(), - } - } -} - -pub struct LinderaTokenStream<'a> { - pub tokens: Vec>, - pub token: &'a mut Token, -} - -impl TokenStream for LinderaTokenStream<'_> { - fn advance(&mut self) -> bool { - if self.tokens.is_empty() { - return false; - } - let token = self.tokens.remove(0); - self.token.text = token.text.to_string(); - 
self.token.offset_from = token.byte_start; - self.token.offset_to = token.byte_end; - self.token.position = token.position; - self.token.position_length = token.position_length; - - true - } - - fn token(&self) -> &Token { - self.token - } - - fn token_mut(&mut self) -> &mut Token { - self.token - } -} - -#[cfg(test)] -mod tests { - use tantivy::tokenizer::{Token, TokenStream, Tokenizer}; - - use super::{MultiLangTokenizer, MultiLanguageTokenStream, get_language_from_prefix}; - - fn test_helper(mut tokenizer: MultiLanguageTokenStream) -> Vec { - let mut tokens: Vec = Vec::new(); - tokenizer.process(&mut |token: &Token| tokens.push(token.clone())); - tokens - } - - #[test] - fn test_multilanguage_tokenizer_cmn() { - let mut tokenizer = MultiLangTokenizer::default(); - let tokens = test_helper( - tokenizer.token_stream("地址1,包含無效的字元 (包括符號與不標準的asci阿爾發字元"), - ); - assert_eq!(tokens.len(), 19); - { - let token = &tokens[0]; - assert_eq!(token.text, "地址"); - assert_eq!(token.offset_from, 0); - assert_eq!(token.offset_to, 6); - assert_eq!(token.position, 0); - assert_eq!(token.position_length, 1); - } - } - - #[test] - fn test_multilanguage_tokenizer_jpn() { - let mut tokenizer = MultiLangTokenizer::default(); - { - let tokens = test_helper(tokenizer.token_stream("すもももももももものうち")); - assert_eq!(tokens.len(), 7); - { - let token = &tokens[0]; - assert_eq!(token.text, "すもも"); - assert_eq!(token.offset_from, 0); - assert_eq!(token.offset_to, 9); - assert_eq!(token.position, 0); - assert_eq!(token.position_length, 1); - } - } - { - // Force usage of JPN tokenizer. - let tokens = test_helper(tokenizer.token_stream("JPN:すもももももももものうち")); - assert_eq!(tokens.len(), 7); - } - { - // Force usage of ENG tokenizer. - // This tokenizer will return only one token. 
- let tokens = test_helper(tokenizer.token_stream("ENG:すもももももももものうち")); - assert_eq!(tokens.len(), 1); - } - } - - #[test] - fn test_multilanguage_tokenizer_kor() { - let mut tokenizer = MultiLangTokenizer::default(); - { - let tokens = test_helper(tokenizer.token_stream("일본입니다. 매우 멋진 단어입니다.")); - assert_eq!(tokens.len(), 11); - { - let token = &tokens[0]; - assert_eq!(token.text, "일본"); - assert_eq!(token.offset_from, 0); - assert_eq!(token.offset_to, 6); - assert_eq!(token.position, 0); - assert_eq!(token.position_length, 1); - } - } - { - let tokens = - test_helper(tokenizer.token_stream("KOR:일본입니다. 매우 멋진 단어입니다.")); - assert_eq!(tokens.len(), 11); - } - { - let tokens = test_helper(tokenizer.token_stream("ENG:일본입니다")); - assert_eq!(tokens.len(), 1); - } - } - - #[test] - fn test_multilanguage_tokenizer_with_empty_string() { - let mut tokenizer = MultiLangTokenizer::default(); - { - let tokens = test_helper(tokenizer.token_stream("")); - assert_eq!(tokens.len(), 0); - } - { - let tokens = test_helper(tokenizer.token_stream(" ")); - assert_eq!(tokens.len(), 0); - } - } - - #[test] - fn test_multilanguage_process_language_prefix() { - { - let (lang, text) = get_language_from_prefix("JPN:すもももももももものうち"); - assert_eq!(lang, Some(whichlang::Lang::Jpn)); - assert_eq!(text, "すもももももももものうち"); - } - { - let (lang, text) = get_language_from_prefix("CMN:地址1,包含無效的字元"); - assert_eq!(lang, Some(whichlang::Lang::Cmn)); - assert_eq!(text, "地址1,包含無效的字元"); - } - { - let (lang, text) = get_language_from_prefix("ENG:my address"); - assert_eq!(lang, Some(whichlang::Lang::Eng)); - assert_eq!(text, "my address"); - } - { - let (lang, text) = get_language_from_prefix("UNK:my address"); - assert!(lang.is_none()); - assert_eq!(text, "UNK:my address"); - } - { - let (lang, text) = get_language_from_prefix(""); - assert!(lang.is_none()); - assert_eq!(text, ""); - } - } -} diff --git a/quickwit/quickwit-search/src/leaf.rs b/quickwit/quickwit-search/src/leaf.rs index 3d9e5d00cce..2db727d5bc5 
100644 --- a/quickwit/quickwit-search/src/leaf.rs +++ b/quickwit/quickwit-search/src/leaf.rs @@ -809,28 +809,24 @@ fn remove_redundant_timestamp_range( } } (Bound::Unbounded, Some(_)) => Bound::Unbounded, - (timestamp, None) => timestamp, + (query_bound, None) => query_bound, }; - let final_end_timestamp = match ( - visitor.end_timestamp, - split.timestamp_end.map(DateTime::from_timestamp_secs), - ) { - (Bound::Included(query_ts), Some(split_ts)) => { - if query_ts < split_ts { - Bound::Included(query_ts) - } else { - Bound::Unbounded - } - } - (Bound::Excluded(query_ts), Some(split_ts)) => { - if query_ts <= split_ts { - Bound::Excluded(query_ts) + let final_end_timestamp = match (visitor.end_timestamp, split.timestamp_end) { + ( + query_bound @ (Bound::Included(query_ts) | Bound::Excluded(query_ts)), + Some(split_end), + ) => { + // split.timestamp_end is the truncation of the highest timestamp in the split, + // so the actual known bound for the split is split.timestamp_end+1 (exclusive) + let split_end_exclusive = DateTime::from_timestamp_secs(split_end + 1); + if query_ts < split_end_exclusive { + query_bound } else { Bound::Unbounded } } (Bound::Unbounded, Some(_)) => Bound::Unbounded, - (timestamp, None) => timestamp, + (query_bound, None) => query_bound, }; if final_start_timestamp != Bound::Unbounded || final_end_timestamp != Bound::Unbounded { let range = RangeQuery { @@ -1688,6 +1684,11 @@ mod tests { }; remove_timestamp_test_case(&search_request, &split, None); + let expected_upper_inclusive = RangeQuery { + field: timestamp_field.to_string(), + lower_bound: Bound::Unbounded, + upper_bound: Bound::Included((time3 * S_TO_NS).into()), + }; let search_request = SearchRequest { query_ast: serde_json::to_string(&QueryAst::Range(RangeQuery { field: timestamp_field.to_string(), @@ -1697,7 +1698,7 @@ mod tests { .unwrap(), ..SearchRequest::default() }; - remove_timestamp_test_case(&search_request, &split, None); + remove_timestamp_test_case(&search_request, 
&split, Some(expected_upper_inclusive)); let search_request = SearchRequest { query_ast: serde_json::to_string(&QueryAst::MatchAll).unwrap(), @@ -1740,10 +1741,10 @@ mod tests { Some(expected_upper_exclusive.clone()), ); - let expected_lower_exclusive = RangeQuery { + let expected_lower_excl_upper_incl = RangeQuery { field: timestamp_field.to_string(), lower_bound: Bound::Excluded((time2 * S_TO_NS).into()), - upper_bound: Bound::Unbounded, + upper_bound: Bound::Included((time3 * S_TO_NS).into()), }; let search_request = SearchRequest { query_ast: serde_json::to_string(&QueryAst::Range(RangeQuery { @@ -1757,10 +1758,22 @@ mod tests { remove_timestamp_test_case( &search_request, &split, - Some(expected_lower_exclusive.clone()), + Some(expected_lower_excl_upper_incl.clone()), ); + } + + #[test] + fn test_remove_timestamp_range_multiple_bounds() { + // When bounds are defined both in the AST and in the search request, + // make sure we take the most restrictive ones. + const S_TO_NS: i64 = 1_000_000_000; + let time1 = 1700001000; + let time2 = 1700002000; + let time3 = 1700003000; + let time4 = 1700004000; + + let timestamp_field = "timestamp".to_string(); - // we take the most restrictive bounds let split = SplitIdAndFooterOffsets { timestamp_start: Some(time1), timestamp_end: Some(time4), @@ -1803,10 +1816,10 @@ mod tests { }; remove_timestamp_test_case(&search_request, &split, Some(expected_upper_2_inc)); - let expected_lower_3 = RangeQuery { + let expected_lower_3_upper_4 = RangeQuery { field: timestamp_field.to_string(), lower_bound: Bound::Included((time3 * S_TO_NS).into()), - upper_bound: Bound::Unbounded, + upper_bound: Bound::Included((time4 * S_TO_NS).into()), }; let search_request = SearchRequest { @@ -1820,7 +1833,11 @@ mod tests { end_timestamp: Some(time4 + 1), ..SearchRequest::default() }; - remove_timestamp_test_case(&search_request, &split, Some(expected_lower_3.clone())); + remove_timestamp_test_case( + &search_request, + &split, + 
Some(expected_lower_3_upper_4.clone()), + ); let search_request = SearchRequest { query_ast: serde_json::to_string(&QueryAst::Range(RangeQuery { @@ -1833,7 +1850,7 @@ mod tests { end_timestamp: Some(time4 + 1), ..SearchRequest::default() }; - remove_timestamp_test_case(&search_request, &split, Some(expected_lower_3)); + remove_timestamp_test_case(&search_request, &split, Some(expected_lower_3_upper_4)); let mut search_request = SearchRequest { query_ast: serde_json::to_string(&QueryAst::MatchAll).unwrap(), diff --git a/quickwit/quickwit-serve/src/otlp_api/rest_handler.rs b/quickwit/quickwit-serve/src/otlp_api/rest_handler.rs index 1654a840dad..4ec47c15847 100644 --- a/quickwit/quickwit-serve/src/otlp_api/rest_handler.rs +++ b/quickwit/quickwit-serve/src/otlp_api/rest_handler.rs @@ -25,7 +25,6 @@ use quickwit_proto::opentelemetry::proto::collector::trace::v1::{ use quickwit_proto::types::IndexId; use quickwit_proto::{ServiceError, ServiceErrorCode, tonic}; use serde::{self, Serialize}; -use tracing::error; use warp::{Filter, Rejection}; use crate::decompression::get_body_bytes; diff --git a/quickwit/quickwit-ui/src/components/IndexSummary.tsx b/quickwit/quickwit-ui/src/components/IndexSummary.tsx index c3eca2da261..7be3b8b01ee 100644 --- a/quickwit/quickwit-ui/src/components/IndexSummary.tsx +++ b/quickwit/quickwit-ui/src/components/IndexSummary.tsx @@ -13,7 +13,7 @@ // limitations under the License. import styled from "@emotion/styled"; -import { Paper } from "@mui/material"; +import { Alert, Paper } from "@mui/material"; import dayjs from "dayjs"; import utc from "dayjs/plugin/utc"; import { FC, ReactNode } from "react"; @@ -75,6 +75,12 @@ export function IndexSummary({ index }: { index: Index }) { return ( + {index.split_limit_reached && ( + + Split limit reached. Only the first 10,000 splits were retrieved. + The actual total may be higher. Statistics shown are incomplete. 
+ + )} {dayjs .unix(index.metadata.create_timestamp) diff --git a/quickwit/quickwit-ui/src/services/client.ts b/quickwit/quickwit-ui/src/services/client.ts index cc7643b6687..95baaceed99 100644 --- a/quickwit/quickwit-ui/src/services/client.ts +++ b/quickwit/quickwit-ui/src/services/client.ts @@ -81,7 +81,8 @@ export class Client { ]); return { metadata: metadata, - splits: splits, + splits: splits[0], + split_limit_reached: splits[1], }; } @@ -89,14 +90,16 @@ export class Client { return this.fetch(`${this.apiRoot()}indexes/${indexId}`, {}); } - async getAllSplits(indexId: string): Promise> { + async getAllSplits( + indexId: string, + ): Promise<[Array, boolean]> { // TODO: restrieve all the splits. const results: { splits: Array } = await this.fetch( `${this.apiRoot()}indexes/${indexId}/splits?limit=10000`, {}, ); - return results["splits"]; + return [results["splits"], results["splits"].length === 10000]; } async listIndexes(): Promise> { diff --git a/quickwit/quickwit-ui/src/utils/models.ts b/quickwit/quickwit-ui/src/utils/models.ts index 67e77add3de..8abe8acc6e1 100644 --- a/quickwit/quickwit-ui/src/utils/models.ts +++ b/quickwit/quickwit-ui/src/utils/models.ts @@ -282,6 +282,7 @@ export type Range = { export type Index = { metadata: IndexMetadata; splits: SplitMetadata[]; + split_limit_reached: boolean; }; export type Cluster = { diff --git a/quickwit/rest-api-tests/scenarii/qw_search_api/0001_ts_range.yaml b/quickwit/rest-api-tests/scenarii/qw_search_api/0001_ts_range.yaml index dc9765b634e..7dae4d645da 100644 --- a/quickwit/rest-api-tests/scenarii/qw_search_api/0001_ts_range.yaml +++ b/quickwit/rest-api-tests/scenarii/qw_search_api/0001_ts_range.yaml @@ -40,3 +40,9 @@ params: query: "auto_date:>=2023-05-25T00:00:00Z AND auto_date:<2023-05-26T00:00:00Z" expected: num_hits: 2 +--- +endpoint: millisec/search +params: + query: "ts:>=2022-12-16T10:00:57.000Z AND ts:<=2022-12-16T10:00:57.000Z" +expected: + num_hits: 1 \ No newline at end of file diff --git 
a/quickwit/rest-api-tests/scenarii/qw_search_api/_setup.quickwit.yaml b/quickwit/rest-api-tests/scenarii/qw_search_api/_setup.quickwit.yaml index b333ed3c86a..e410ecd96c0 100644 --- a/quickwit/rest-api-tests/scenarii/qw_search_api/_setup.quickwit.yaml +++ b/quickwit/rest-api-tests/scenarii/qw_search_api/_setup.quickwit.yaml @@ -98,3 +98,31 @@ ndjson: - {"text_raw": "indexed with raw tokenizer dashes"} - {"text_fast": "fast-text-value-dashes"} - {"text_fast": "fast text value whitespaces"} +--- +method: DELETE +endpoint: indexes/millisec +status_code: null +--- +method: POST +endpoint: indexes/ +json: + version: "0.7" + index_id: millisec + doc_mapping: + timestamp_field: ts + mode: strict + field_mappings: + - name: ts + type: datetime + fast: true + input_formats: ["rfc3339"] + fast_precision: milliseconds +--- +method: POST +endpoint: millisec/ingest +params: + commit: force +ndjson: + - {"ts": "2022-12-16T10:00:56.297Z"} + - {"ts": "2022-12-16T10:00:57.000Z"} + - {"ts": "2022-12-16T10:00:57.297Z"} \ No newline at end of file diff --git a/quickwit/rest-api-tests/scenarii/qw_search_api/_teardown.quickwit.yaml b/quickwit/rest-api-tests/scenarii/qw_search_api/_teardown.quickwit.yaml index 56cd2bda8a9..ebfa1c4931b 100644 --- a/quickwit/rest-api-tests/scenarii/qw_search_api/_teardown.quickwit.yaml +++ b/quickwit/rest-api-tests/scenarii/qw_search_api/_teardown.quickwit.yaml @@ -3,3 +3,6 @@ endpoint: indexes/simple --- method: DELETE endpoint: indexes/nested +--- +method: DELETE +endpoint: indexes/millisec \ No newline at end of file diff --git a/quickwit/rust-toolchain.toml b/quickwit/rust-toolchain.toml index e54a09951e9..2a30998f14b 100644 --- a/quickwit/rust-toolchain.toml +++ b/quickwit/rust-toolchain.toml @@ -1,4 +1,4 @@ [toolchain] -channel = "1.91" +channel = "1.93" components = ["cargo", "clippy", "rustfmt", "rust-docs"]