diff --git a/compiler/rustc_parse/messages.ftl b/compiler/rustc_parse/messages.ftl
index 747895c80469..1331d99c01ea 100644
--- a/compiler/rustc_parse/messages.ftl
+++ b/compiler/rustc_parse/messages.ftl
@@ -967,6 +967,7 @@ parse_unknown_start_of_token = unknown start of token: {$escaped}
     .sugg_quotes = Unicode characters '“' (Left Double Quotation Mark) and '”' (Right Double Quotation Mark) look like '{$ascii_str}' ({$ascii_name}), but are not
     .sugg_other = Unicode character '{$ch}' ({$u_name}) looks like '{$ascii_str}' ({$ascii_name}), but it is not
     .help_null = source files must contain UTF-8 encoded text, unexpected null bytes might occur when a different encoding is used
+    .help_invisible_char = invisible characters like '{$escaped}' are not usually visible in text editors
     .note_repeats = character appears {$repeats ->
         [one] once more
         *[other] {$repeats} more times
diff --git a/compiler/rustc_parse/src/errors.rs b/compiler/rustc_parse/src/errors.rs
index 3b72c9802afd..60e4a240c85e 100644
--- a/compiler/rustc_parse/src/errors.rs
+++ b/compiler/rustc_parse/src/errors.rs
@@ -2369,6 +2369,8 @@ pub(crate) struct UnknownTokenStart {
     pub null: Option<UnknownTokenNull>,
     #[subdiagnostic]
     pub repeat: Option<UnknownTokenRepeat>,
+    #[subdiagnostic]
+    pub invisible: Option<InvisibleCharacter>,
 }
 
 #[derive(Subdiagnostic)]
@@ -2409,6 +2411,11 @@ pub(crate) struct UnknownTokenRepeat {
     pub repeats: usize,
 }
 
+/// Help note added to [`UnknownTokenStart`] when the unknown character is an invisible one.
+#[derive(Subdiagnostic)]
+#[help(parse_help_invisible_char)]
+pub(crate) struct InvisibleCharacter;
+
 #[derive(Subdiagnostic)]
 #[help(parse_help_null)]
 pub(crate) struct UnknownTokenNull;
diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs
index 94ae35c19582..7c969dd7f9f4 100644
--- a/compiler/rustc_parse/src/lexer/mod.rs
+++ b/compiler/rustc_parse/src/lexer/mod.rs
@@ -36,6 +36,12 @@ use unescape_error_reporting::{emit_unescape_error, escaped_char};
 #[cfg(target_pointer_width = "64")]
 rustc_data_structures::static_assert_size!(rustc_lexer::Token, 12);
 
+/// Characters that text editors usually do not display. If an unknown token starts with one
+/// of them, the "unknown start of token" error gets an extra help note saying so.
+const INVISIBLE_CHARACTERS: [char; 8] = [
+    '\u{200b}', '\u{200c}', '\u{2060}', '\u{2061}', '\u{2062}', '\u{00ad}', '\u{034f}', '\u{061c}',
+];
+
 #[derive(Clone, Debug)]
 pub(crate) struct UnmatchedDelim {
     pub found_delim: Option<Delimiter>,
@@ -456,6 +462,11 @@ impl<'psess, 'src> Lexer<'psess, 'src> {
                         escaped: escaped_char(c),
                         sugg,
                         null: if c == '\x00' { Some(errors::UnknownTokenNull) } else { None },
+                        invisible: if INVISIBLE_CHARACTERS.contains(&c) {
+                            Some(errors::InvisibleCharacter)
+                        } else {
+                            None
+                        },
                         repeat: if repeats > 0 {
                             swallow_next_invalid = repeats;
                             Some(errors::UnknownTokenRepeat { repeats })
diff --git a/tests/ui/lexer/lex-invisible-characters.rs b/tests/ui/lexer/lex-invisible-characters.rs
new file mode 100644
index 000000000000..2db72b8475dc
--- /dev/null
+++ b/tests/ui/lexer/lex-invisible-characters.rs
@@ -0,0 +1,6 @@
+// Provide extra help when a user has an invisible character in their code
+
+fn main​() {
+    //~^ ERROR unknown start of token: \u{200b}
+    //~| HELP invisible characters like '\u{200b}' are not usually visible in text editors
+}
diff --git a/tests/ui/lexer/lex-invisible-characters.stderr b/tests/ui/lexer/lex-invisible-characters.stderr
new file mode 100644
index 000000000000..ddac0f4e9325
--- /dev/null
+++ b/tests/ui/lexer/lex-invisible-characters.stderr
@@ -0,0 +1,10 @@
+error: unknown start of token: \u{200b}
+  --> $DIR/lex-invisible-characters.rs:3:8
+   |
+LL | fn main​() {
+   |        ^
+   |
+   = help: invisible characters like '\u{200b}' are not usually visible in text editors
+
+error: aborting due to 1 previous error
+