From f49756b92113f9d3e1d954622e16947dfbc513f4 Mon Sep 17 00:00:00 2001 From: Qifan Zhang Date: Mon, 22 Jun 2026 22:23:56 +0000 Subject: [PATCH] =?UTF-8?q?Fix=20O(N=C2=B2)=20duplicate-attribute=20check?= =?UTF-8?q?=20in=20`Attributes`=20iterator?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `IterState::check_for_duplicates` did a linear scan of every previously seen attribute name for each new attribute, so a start tag with N distinct names cost O(N²) byte comparisons -- a CPU-exhaustion vector on untrusted XML (#969). Small tags keep the linear scan: for the handful of attributes a real start tag carries it is faster than hashing and needs no allocation (the busiest element in `players.xml` has 22). Once a tag declares more than `SMALL_ATTRIBUTE_COUNT` (32) attributes it switches to a 64-bit hash pre-filter, making the whole tag O(N). The set is seeded from the names already collected, so a duplicate that spans the switch is still caught, and on a hit it falls back to the linear scan to report the exact previous position -- `AttrError::Duplicated` is unchanged. The pre-filter stores SipHash name hashes in a `HashSet` keyed by an identity hasher, since the values are already hashes (no re-hashing). Exercised by a new `benches/issue971.rs` and a unit test covering a duplicate past the hash threshold. Co-Authored-By: Claude Opus 4.8 (1M context) --- Cargo.toml | 5 ++ Changelog.md | 10 ++- benches/issue971.rs | 68 +++++++++++++++++++ src/events/attributes.rs | 143 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 224 insertions(+), 2 deletions(-) create mode 100644 benches/issue971.rs diff --git a/Cargo.toml b/Cargo.toml index 6f6f9a9b..0f1a4178 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -68,6 +68,11 @@ harness = false path = "benches/encoding.rs" required-features = ["encoding"] +[[bench]] +name = "issue971" +harness = false +path = "benches/issue971.rs" + [features] default = [] diff --git a/Changelog.md b/Changelog.md index 6b1fbdc4..c9e78ffa 100644 --- a/Changelog.md +++ b/Changelog.md @@ -21,6 +21,11 @@ ### Bug Fixes +- [#969]: `Attributes` (and anything that iterates `BytesStart::attributes()` + with the default `with_checks(true)`) no longer takes O(N²) time on a start + tag with a large number of attributes. Small tags keep the previous linear + scan; larger ones switch to a 64-bit hash pre-filter, so the whole tag is + O(N). The exact `AttrError::Duplicated(new, prev)` positions are unchanged. - [#970]: `NamespaceResolver::push` (and hence every `NsReader` `Start`/`Empty` event) now rejects a start tag that declares more than `DEFAULT_MAX_DECLARATIONS_PER_ELEMENT` (256) `xmlns` / `xmlns:*` namespace @@ -31,10 +36,11 @@ via `NamespaceResolver::set_max_declarations_per_element` (use `usize::MAX` to disable). -[#970]: https://github.com/tafia/quick-xml/issues/970 - ### Misc Changes +[#969]: https://github.com/tafia/quick-xml/issues/969 +[#970]: https://github.com/tafia/quick-xml/issues/970 + ## 0.40.1 -- 2026-05-15 diff --git a/benches/issue971.rs b/benches/issue971.rs new file mode 100644 index 00000000..80246eb5 --- /dev/null +++ b/benches/issue971.rs @@ -0,0 +1,68 @@ +//! Regression benchmark for [#969] / [#971]: iterating the attributes of a +//! single start tag must stay ~O(N) in the number of attributes. +//! +//! `BytesStart::attributes()` defaults to `with_checks(true)`, which rejects +//! duplicate attribute names. That check used to be a linear scan of every +//! previously seen name, making a tag with N distinct attributes cost O(N²) byte +//! comparisons -- a CPU-exhaustion vector on untrusted XML. This benchmark times +//! the check across a range of attribute counts; if the O(N²) behaviour ever +//! returns, the per-element time for the larger inputs will blow up. +//! +//! Run with `cargo bench --bench issue971`. It is also executed once per case as +//! a smoke test by `cargo test --benches` on CI. +//! +//! [#969]: https://github.com/tafia/quick-xml/issues/969 +//! [#971]: https://github.com/tafia/quick-xml/pull/971 + +use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput}; +use quick_xml::events::Event; +use quick_xml::reader::Reader; + +/// Builds a single empty-element tag carrying `n` distinct attributes, as in the +/// [#969] proof of concept (``). +fn tag_with_attributes(n: usize) -> String { + let mut xml = String::with_capacity(n * 13 + 8); + xml.push_str(""); + xml +} + +/// Iterates every attribute of the single start tag in `xml`, returning the +/// count. `checks` toggles the duplicate-name detection under test. +fn count_attributes(xml: &str, checks: bool) -> usize { + let mut reader = Reader::from_str(xml); + match reader.read_event() { + Ok(Event::Empty(e)) => { + let mut count = 0; + for attr in e.attributes().with_checks(checks) { + attr.expect("valid attribute"); + count += 1; + } + count + } + other => panic!("expected an empty element, got {:?}", other), + } +} + +fn duplicate_check(c: &mut Criterion) { + let mut group = c.benchmark_group("issue971_attributes"); + for n in [16usize, 64, 256, 1024, 4096] { + let xml = tag_with_attributes(n); + assert_eq!(count_attributes(&xml, true), n); + group.throughput(Throughput::Elements(n as u64)); + + group.bench_with_input(BenchmarkId::new("with_checks(true)", n), &xml, |b, xml| { + b.iter(|| count_attributes(xml, true)) + }); + group.bench_with_input(BenchmarkId::new("with_checks(false)", n), &xml, |b, xml| { + b.iter(|| count_attributes(xml, false)) + }); + } + group.finish(); +} + +criterion_group!(benches, duplicate_check); +criterion_main!(benches); diff --git a/src/events/attributes.rs b/src/events/attributes.rs index 1d23cc7e..544d7690 100644 --- a/src/events/attributes.rs +++ b/src/events/attributes.rs @@ -9,7 +9,9 @@ use crate::name::{LocalName, Namespace, NamespaceResolver, QName}; use crate::utils::{is_whitespace, Bytes}; use crate::XmlVersion; +use std::collections::HashSet; use std::fmt::{self, Debug, Display, Formatter}; +use std::hash::{BuildHasherDefault, DefaultHasher, Hasher}; use std::iter::FusedIterator; use std::{borrow::Cow, ops::Range}; @@ -964,6 +966,53 @@ enum State { SkipEqValue(usize), } +/// Number of attributes a start tag may have before the duplicate-name check +/// switches from a direct linear scan of the previously seen names to a hash +/// pre-filter (see [`IterState::check_for_duplicates`]). +/// +/// Real-world start tags carry only a handful of attributes -- the busiest +/// element in our benchmark corpus (`tests/documents/players.xml`) has 22 -- +/// where the scan is faster than hashing and needs no allocation. Larger tags +/// are where the scan became the O(N²) CPU-DoS of [#969], so above this count we +/// pay for a hash set to keep the whole tag O(N). The value sits just above the +/// measured linear-vs-hash crossover. +/// +/// [#969]: https://github.com/tafia/quick-xml/issues/969 +const SMALL_ATTRIBUTE_COUNT: usize = 32; + +/// A no-op [`Hasher`] for the `key_hashes` set, whose values are already 64-bit +/// hashes of attribute names; re-hashing them with the default SipHash would be +/// wasted work. Only `write_u64` is ever exercised (via `u64`'s `Hash` impl). +#[derive(Default)] +struct IdentityHasher(u64); + +impl Hasher for IdentityHasher { + #[inline] + fn finish(&self) -> u64 { + self.0 + } + + #[inline] + fn write(&mut self, _: &[u8]) { + // The set only ever stores `u64` keys, which route through `write_u64`. + unreachable!("IdentityHasher only supports u64 keys") + } + + #[inline] + fn write_u64(&mut self, n: u64) { + self.0 = n; + } +} + +/// Hashes a single attribute name. A fresh [`DefaultHasher`] per name keeps each +/// hash independent (so it is also DoS-resistant on untrusted input). +#[inline] +fn hash_name(name: &[u8]) -> u64 { + let mut hasher = DefaultHasher::new(); + hasher.write(name); + hasher.finish() +} + /// External iterator over spans of attribute key and value #[derive(Clone, Debug)] pub(crate) struct IterState { @@ -979,6 +1028,13 @@ pub(crate) struct IterState { /// names. We store a ranges instead of slices to able to report a previous /// attribute position keys: Vec>, + /// 64-bit hashes of the byte content of `keys`, used as an O(1) pre-filter + /// once a start tag declares more than `SMALL_ATTRIBUTE_COUNT` attributes, so + /// the duplicate check stays O(N) over the whole tag instead of O(N²). The + /// values are already hashes, so the set stores them with `IdentityHasher` + /// instead of re-hashing. Allocated only when the threshold is crossed, so + /// small tags (and [`IterState::new`]) stay allocation-free and `const`. + key_hashes: Option>>, } impl IterState { @@ -988,6 +1044,7 @@ impl IterState { html, check_duplicates: true, keys: Vec::new(), + key_hashes: None, } } @@ -1063,6 +1120,17 @@ impl IterState { } } + /// Checks that the attribute name `key` (a range into `slice`) was not seen + /// earlier in the same start tag, recording it for subsequent checks. + /// + /// Small tags use a direct linear scan of [`Self::keys`]: for a handful of + /// attributes that beats hashing and needs no allocation, which is the + /// overwhelmingly common case. Once a tag declares more than + /// `SMALL_ATTRIBUTE_COUNT` attributes -- where the scan would become the + /// O(N²) CPU-DoS of [#969] -- it switches to a hash pre-filter that keeps the + /// whole tag O(N). + /// + /// [#969]: https://github.com/tafia/quick-xml/issues/969 #[inline] fn check_for_duplicates( &mut self, @@ -1070,6 +1138,9 @@ impl IterState { key: Range, ) -> Result, AttrError> { if self.check_duplicates { + if self.keys.len() >= SMALL_ATTRIBUTE_COUNT { + return self.check_for_duplicates_hashed(slice, key); + } if let Some(prev) = self .keys .iter() @@ -1082,6 +1153,44 @@ impl IterState { Ok(key) } + /// Cold path of [`Self::check_for_duplicates`] for start tags with many + /// attributes: a [`HashSet`] of 64-bit name hashes acts as an O(1) pre-filter + /// so iterating N attributes is O(N) rather than O(N²). + #[cold] + fn check_for_duplicates_hashed( + &mut self, + slice: &[u8], + key: Range, + ) -> Result, AttrError> { + let keys = &self.keys; + let key_hashes = self.key_hashes.get_or_insert_with(|| { + // First time over the threshold: seed the set with the names already + // collected during the linear phase so the pre-filter knows them. + let mut set = HashSet::with_capacity_and_hasher( + keys.len() * 2, + BuildHasherDefault::::default(), + ); + for r in keys { + set.insert(hash_name(&slice[r.clone()])); + } + set + }); + // A fresh hash proves the name is new. On a hit (a real duplicate, or the + // astronomically rare 64-bit collision) fall back to the linear scan to + // recover the exact previous position for `AttrError::Duplicated`. + if !key_hashes.insert(hash_name(&slice[key.clone()])) { + if let Some(prev) = self + .keys + .iter() + .find(|r| slice[(*r).clone()] == slice[key.clone()]) + { + return Err(AttrError::Duplicated(key.start, prev.start)); + } + } + self.keys.push(key.clone()); + Ok(key) + } + /// # Parameters /// /// - `slice`: content of the tag, used for checking for duplicates @@ -1988,6 +2097,40 @@ mod xml { assert_eq!(iter.next(), None); assert_eq!(iter.next(), None); } + + /// Once a start tag declares more than `SMALL_ATTRIBUTE_COUNT` + /// attributes the duplicate check switches to its hash-based path. A + /// duplicate of a name first seen during the earlier linear phase must + /// still be detected, with the original position reported. Regression + /// cover for the cold path of [#969]. + /// + /// [#969]: https://github.com/tafia/quick-xml/issues/969 + #[test] + fn duplicate_past_hash_threshold() { + let dup = SMALL_ATTRIBUTE_COUNT / 2; + let n = SMALL_ATTRIBUTE_COUNT + 8; + + let mut source = String::from("tag"); + let mut positions = Vec::with_capacity(n); + for i in 0..n { + source.push(' '); + positions.push(source.len()); + source.push_str(&format!("k{:04}=''", i)); + } + // Repeat the name first seen at `positions[dup]` (linear phase). + source.push(' '); + let dup_pos = source.len(); + source.push_str(&format!("k{:04}=''", dup)); + + let mut iter = Attributes::new(&source, 3); + for _ in 0..n { + assert!(matches!(iter.next(), Some(Ok(_)))); + } + assert_eq!( + iter.next(), + Some(Err(AttrError::Duplicated(dup_pos, positions[dup]))) + ); + } } /// Check for duplicated names is disabled