From ffcca31db2e82e9c715af1b283f6cb9cad2933bb Mon Sep 17 00:00:00 2001 From: Diogo Sousa Date: Fri, 22 May 2026 20:24:56 +0100 Subject: [PATCH] Reject all browser-recognized comment-closing sequences in Comment::set_text. Previously `Comment::set_text` only rejected the canonical `-->` terminator, but a WHATWG-conformant browser also closes a comment on three more byte sequences: 1. `--!>` anywhere in the body (comment-end-bang state). 2. A leading `>` (abrupt-closing-of-empty-comment from comment-start). 3. A leading `->` (abrupt-closing-of-empty-comment from comment-start-dash). --- CHANGELOG.md | 4 ++ src/rewritable_units/tokens/comment.rs | 68 +++++++++++++++++--------- 2 files changed, 50 insertions(+), 22 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0a42ab5f..e7496e23 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -30,6 +30,10 @@ - Renamed the internal-use feature `integration_test` to `_integration_test`. The leading underscore signals to `cargo-semver-checks` and similar tools that the feature is not part of the public API. +- `Comment::set_text` now also rejects `--!>`, a leading `>`, and a leading `->`, which + WHATWG-conformant browsers treat as comment terminators. Previously only `-->` was + rejected, so a caller passing attacker-influenced data could let an attacker break out + of the comment and inject HTML (security fix). ## v2.9.0 diff --git a/src/rewritable_units/tokens/comment.rs b/src/rewritable_units/tokens/comment.rs index 3182c728..1035e49a 100644 --- a/src/rewritable_units/tokens/comment.rs +++ b/src/rewritable_units/tokens/comment.rs @@ -12,8 +12,9 @@ use thiserror::Error; /// An error that occurs when invalid value is provided for the HTML comment text. #[derive(Error, Debug, Eq, PartialEq, Copy, Clone)] pub enum CommentTextError { - /// The provided value contains the `-->` character sequence that preemptively closes the comment. - #[error("Comment text shouldn't contain comment closing sequence (`-->`).")] + /// The provided value contains a byte sequence that a WHATWG-conformant browser + /// treats as a comment terminator (`-->`, `--!>`, a leading `>`, or a leading `->`). + #[error("Comment text shouldn't contain a comment-closing sequence.")] CommentClosingSequence, /// The provided value contains a character that can't be represented in the document's [`encoding`]. @@ -36,6 +37,20 @@ pub struct Comment<'i> { user_data: Box, } +/// Returns `true` if `text`, when emitted between ``, would let a +/// WHATWG-conformant browser close the comment earlier than the trailing `-->` +/// marker, leaking whatever follows as live markup. +/// +/// Four sequences trigger this: +/// +/// 1. `-->` anywhere in the body (comment-end state). +/// 2. `--!>` anywhere in the body (comment-end-bang state). +/// 3. A leading `>` (abrupt-closing-of-empty-comment from comment-start). +/// 4. A leading `->` (abrupt-closing-of-empty-comment from comment-start-dash). +fn contains_comment_closing_sequence(text: &str) -> bool { + text.contains("-->") || text.contains("--!>") || text.starts_with('>') || text.starts_with("->") +} + impl<'i> Comment<'i> { #[inline] #[must_use] @@ -68,22 +83,22 @@ impl<'i> Comment<'i> { /// Sets the text of the comment. #[inline] pub fn set_text(&mut self, text: &str) -> Result<(), CommentTextError> { - if text.contains("-->") { - Err(CommentTextError::CommentClosingSequence) - } else { - // NOTE: if character can't be represented in the given - // encoding then encoding_rs replaces it with a numeric - // character reference. Character references are not - // supported in comments, so we need to bail. - match BytesCow::owned_from_str_without_replacements(text, self.encoding) { - Ok(text) => { - self.text = text; - self.raw.set_modified(); - - Ok(()) - } - Err(_) => Err(CommentTextError::UnencodableCharacter), + if contains_comment_closing_sequence(text) { + return Err(CommentTextError::CommentClosingSequence); + } + + // NOTE: if character can't be represented in the given + // encoding then encoding_rs replaces it with a numeric + // character reference. Character references are not + // supported in comments, so we need to bail. + match BytesCow::owned_from_str_without_replacements(text, self.encoding) { + Ok(text) => { + self.text = text; + self.raw.set_modified(); + + Ok(()) } + Err(_) => Err(CommentTextError::UnencodableCharacter), } } @@ -314,11 +329,20 @@ mod tests { #[test] fn comment_closing_sequence_in_text() { - rewrite_comment(b"", UTF_8, |c| { - let err = c.set_text("foo -- bar --> baz").unwrap_err(); - - assert_eq!(err, CommentTextError::CommentClosingSequence); - }); + // Every byte sequence here is treated as a comment terminator by a + // WHATWG-conformant browser when emitted between ``. + // See `contains_comment_closing_sequence`. + for payload in ["foo -- bar --> baz", "foo --!> bar", ">foo", "->foo"] { + rewrite_comment(b"", UTF_8, |c| { + let err = c.set_text(payload).unwrap_err(); + + assert_eq!( + err, + CommentTextError::CommentClosingSequence, + "expected `{payload}` to be rejected", + ); + }); + } } #[test]