From 16b9fddf001a7f024f300e2bffa43203ff271318 Mon Sep 17 00:00:00 2001 From: yukang Date: Thu, 26 Mar 2026 11:42:38 +0800 Subject: [PATCH] Improve doc comment unicode guidance --- compiler/rustc_parse/src/errors.rs | 10 +++++++--- compiler/rustc_parse/src/lexer/mod.rs | 5 ++++- .../macro/unicode-control-codepoints-macros.stderr | 10 +++++----- tests/ui/parser/unicode-control-codepoints.stderr | 4 ++-- .../unicode-control-doc-comment-issue-153096.rs | 8 ++++++++ .../unicode-control-doc-comment-issue-153096.stderr | 13 +++++++++++++ 6 files changed, 39 insertions(+), 11 deletions(-) create mode 100644 tests/ui/parser/unicode-control-doc-comment-issue-153096.rs create mode 100644 tests/ui/parser/unicode-control-doc-comment-issue-153096.stderr diff --git a/compiler/rustc_parse/src/errors.rs b/compiler/rustc_parse/src/errors.rs index 0e852d2abda04..6faaafcc01005 100644 --- a/compiler/rustc_parse/src/errors.rs +++ b/compiler/rustc_parse/src/errors.rs @@ -4344,7 +4344,7 @@ impl Subdiagnostic for HiddenUnicodeCodepointsDiagLabels { pub(crate) enum HiddenUnicodeCodepointsDiagSub { Escape { spans: Vec<(char, Span)> }, - NoEscape { spans: Vec<(char, Span)> }, + NoEscape { spans: Vec<(char, Span)>, is_doc_comment: bool }, } // Used because of multiple multipart_suggestion and note @@ -4370,7 +4370,7 @@ impl Subdiagnostic for HiddenUnicodeCodepointsDiagSub { Applicability::MachineApplicable, ); } - HiddenUnicodeCodepointsDiagSub::NoEscape { spans } => { + HiddenUnicodeCodepointsDiagSub::NoEscape { spans, is_doc_comment } => { // FIXME: in other suggestions we've reversed the inner spans of doc comments. We // should do the same here to provide the same good suggestions as we do for // literals above. @@ -4383,7 +4383,11 @@ impl Subdiagnostic for HiddenUnicodeCodepointsDiagSub { .join(", "), ); diag.note(msg!("if their presence wasn't intentional, you can remove them")); - diag.note(msg!("if you want to keep them but make them visible in your source code, you can escape them: {$escaped}")); + if is_doc_comment { + diag.note(msg!(r#"if you need to keep them and make them explicit in source, rewrite this doc comment as a `#[doc = "..."]` attribute and use Unicode escapes such as {$escaped}"#)); + } else { + diag.note(msg!("if you want to keep them but make them visible in your source code, you can escape them: {$escaped}")); + } } } } diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs index 2223e6a265759..5766d25bc86ce 100644 --- a/compiler/rustc_parse/src/lexer/mod.rs +++ b/compiler/rustc_parse/src/lexer/mod.rs @@ -546,6 +546,7 @@ impl<'psess, 'src> Lexer<'psess, 'src> { self.mk_sp(start, self.pos), 0, false, + true, "doc comment", ); } @@ -580,6 +581,7 @@ impl<'psess, 'src> Lexer<'psess, 'src> { span, padding, point_at_inner_spans, + false, label, ); } @@ -590,6 +592,7 @@ impl<'psess, 'src> Lexer<'psess, 'src> { span: Span, padding: u32, point_at_inner_spans: bool, + is_doc_comment: bool, label: &str, ) { // Obtain the `Span`s for each of the forbidden chars. @@ -610,7 +613,7 @@ impl<'psess, 'src> Lexer<'psess, 'src> { let sub = if point_at_inner_spans && !spans.is_empty() { errors::HiddenUnicodeCodepointsDiagSub::Escape { spans } } else { - errors::HiddenUnicodeCodepointsDiagSub::NoEscape { spans } + errors::HiddenUnicodeCodepointsDiagSub::NoEscape { spans, is_doc_comment } }; self.psess.buffer_lint( diff --git a/tests/ui/parser/macro/unicode-control-codepoints-macros.stderr b/tests/ui/parser/macro/unicode-control-codepoints-macros.stderr index 22fb1b945c654..b1d9c9d018744 100644 --- a/tests/ui/parser/macro/unicode-control-codepoints-macros.stderr +++ b/tests/ui/parser/macro/unicode-control-codepoints-macros.stderr @@ -6,7 +6,7 @@ LL | /// �test� RTL in doc in vec | = note: these kind of unicode codepoints change the way text flows on applications that support them, but can cause confusion because they change the order of characters on the screen = note: if their presence wasn't intentional, you can remove them - = note: if you want to keep them but make them visible in your source code, you can escape them: '\u{202e}', '\u{2066}' + = note: if you need to keep them and make them explicit in source, rewrite this doc comment as a `#[doc = "..."]` attribute and use Unicode escapes such as '\u{202e}', '\u{2066}' = note: `#[deny(text_direction_codepoint_in_literal)]` on by default error: unicode codepoint changing visible direction of text present in doc comment @@ -19,7 +19,7 @@ LL | | */ | = note: these kind of unicode codepoints change the way text flows on applications that support them, but can cause confusion because they change the order of characters on the screen = note: if their presence wasn't intentional, you can remove them - = note: if you want to keep them but make them visible in your source code, you can escape them: '\u{202e}', '\u{2066}' + = note: if you need to keep them and make them explicit in source, rewrite this doc comment as a `#[doc = "..."]` attribute and use Unicode escapes such as '\u{202e}', '\u{2066}' error: unicode codepoint changing visible direction of text present in doc comment --> $DIR/unicode-control-codepoints-macros.rs:33:9 @@ -31,7 +31,7 @@ LL | | */ | = note: these kind of unicode codepoints change the way text flows on applications that support them, but can cause confusion because they change the order of characters on the screen = note: if their presence wasn't intentional, you can remove them - = note: if you want to keep them but make them visible in your source code, you can escape them: '\u{202e}', '\u{2066}' + = note: if you need to keep them and make them explicit in source, rewrite this doc comment as a `#[doc = "..."]` attribute and use Unicode escapes such as '\u{202e}', '\u{2066}' error: unicode codepoint changing visible direction of text present in doc comment --> $DIR/unicode-control-codepoints-macros.rs:41:9 @@ -41,7 +41,7 @@ LL | /// �test� RTL in doc in proc macro | = note: these kind of unicode codepoints change the way text flows on applications that support them, but can cause confusion because they change the order of characters on the screen = note: if their presence wasn't intentional, you can remove them - = note: if you want to keep them but make them visible in your source code, you can escape them: '\u{202e}', '\u{2066}' + = note: if you need to keep them and make them explicit in source, rewrite this doc comment as a `#[doc = "..."]` attribute and use Unicode escapes such as '\u{202e}', '\u{2066}' error: unicode codepoint changing visible direction of text present in doc comment --> $DIR/unicode-control-codepoints-macros.rs:46:9 @@ -51,7 +51,7 @@ LL | /// �test� RTL in doc in proc macro | = note: these kind of unicode codepoints change the way text flows on applications that support them, but can cause confusion because they change the order of characters on the screen = note: if their presence wasn't intentional, you can remove them - = note: if you want to keep them but make them visible in your source code, you can escape them: '\u{202e}', '\u{2066}' + = note: if you need to keep them and make them explicit in source, rewrite this doc comment as a `#[doc = "..."]` attribute and use Unicode escapes such as '\u{202e}', '\u{2066}' error: aborting due to 5 previous errors diff --git a/tests/ui/parser/unicode-control-codepoints.stderr b/tests/ui/parser/unicode-control-codepoints.stderr index 7978c1435f609..3b273f8212faa 100644 --- a/tests/ui/parser/unicode-control-codepoints.stderr +++ b/tests/ui/parser/unicode-control-codepoints.stderr @@ -232,7 +232,7 @@ LL | /** '�'); */fn foo() {} | = note: these kind of unicode codepoints change the way text flows on applications that support them, but can cause confusion because they change the order of characters on the screen = note: if their presence wasn't intentional, you can remove them - = note: if you want to keep them but make them visible in your source code, you can escape them: '\u{202e}' + = note: if you need to keep them and make them explicit in source, rewrite this doc comment as a `#[doc = "..."]` attribute and use Unicode escapes such as '\u{202e}' error: unicode codepoint changing visible direction of text present in doc comment --> $DIR/unicode-control-codepoints.rs:46:1 @@ -244,7 +244,7 @@ LL | | * '�'); */fn bar() {} | = note: these kind of unicode codepoints change the way text flows on applications that support them, but can cause confusion because they change the order of characters on the screen = note: if their presence wasn't intentional, you can remove them - = note: if you want to keep them but make them visible in your source code, you can escape them: '\u{202e}' + = note: if you need to keep them and make them explicit in source, rewrite this doc comment as a `#[doc = "..."]` attribute and use Unicode escapes such as '\u{202e}' error: aborting due to 20 previous errors diff --git a/tests/ui/parser/unicode-control-doc-comment-issue-153096.rs b/tests/ui/parser/unicode-control-doc-comment-issue-153096.rs new file mode 100644 index 0000000000000..a7998daa72fdc --- /dev/null +++ b/tests/ui/parser/unicode-control-doc-comment-issue-153096.rs @@ -0,0 +1,8 @@ +//@ edition: 2024 + +#[allow(unused)] +/// ⁨א⁩, ⁨ב⁩, ⁨ג⁩, ⁨ד⁩, ⁨ה⁩ +//~^ ERROR unicode codepoint changing visible direction of text present in doc comment +fn foo() {} + +fn main() {} diff --git a/tests/ui/parser/unicode-control-doc-comment-issue-153096.stderr b/tests/ui/parser/unicode-control-doc-comment-issue-153096.stderr new file mode 100644 index 0000000000000..668a7844da635 --- /dev/null +++ b/tests/ui/parser/unicode-control-doc-comment-issue-153096.stderr @@ -0,0 +1,13 @@ +error: unicode codepoint changing visible direction of text present in doc comment + --> $DIR/unicode-control-doc-comment-issue-153096.rs:4:1 + | +LL | /// �א�, �ב�, �ג�, �ד�, �ה� + | ^^^^^^^^^^^^^^^^^^^^^^^^^^^ this doc comment contains invisible unicode text flow control codepoints + | + = note: these kind of unicode codepoints change the way text flows on applications that support them, but can cause confusion because they change the order of characters on the screen + = note: if their presence wasn't intentional, you can remove them + = note: if you need to keep them and make them explicit in source, rewrite this doc comment as a `#[doc = "..."]` attribute and use Unicode escapes such as '\u{2068}', '\u{2069}', '\u{2068}', '\u{2069}', '\u{2068}', '\u{2069}', '\u{2068}', '\u{2069}', '\u{2068}', '\u{2069}' + = note: `#[deny(text_direction_codepoint_in_literal)]` on by default + +error: aborting due to 1 previous error +