From 6f0da976c5f422ff5b1124d409e7d7e063fc4225 Mon Sep 17 00:00:00 2001 From: Ed Page Date: Thu, 28 Aug 2025 15:15:41 -0500 Subject: [PATCH] fix(lexer): Only allow horizontal whitespace in frontmatter In writing up the reference for frontmatter, I realized that we probably shouldn't be accepting Unicode Line Ending characters between the code fence and infostring or trailing after the infostring or a code fence. In digging into the unicode specification we use for Whitespace, it divides it up into categories, so I'm deferring to what it says for horizontal whitespace for what should be used within a line. Note, I am leaving out support for Unicde Default Ignorable characters. I figure that can be discussed outside of this change within the reference and tracking issue. --- compiler/rustc_lexer/src/lib.rs | 21 ++++++++++++++++++--- compiler/rustc_parse/src/lexer/mod.rs | 6 +++--- 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/compiler/rustc_lexer/src/lib.rs b/compiler/rustc_lexer/src/lib.rs index e347a76f6a2a..c29ab569f479 100644 --- a/compiler/rustc_lexer/src/lib.rs +++ b/compiler/rustc_lexer/src/lib.rs @@ -350,6 +350,21 @@ pub fn is_whitespace(c: char) -> bool { ) } +/// True if `c` is considered horizontal whitespace according to Rust language definition. +pub fn is_horizontal_whitespace(c: char) -> bool { + // This is Pattern_White_Space. + // + // Note that this set is stable (ie, it doesn't change with different + // Unicode versions), so it's ok to just hard-code the values. + + matches!( + c, + // Horizontal space characters + '\u{0009}' // tab (\t) + | '\u{0020}' // space + ) +} + /// True if `c` is valid as a first character of an identifier. /// See [Rust language reference](https://doc.rust-lang.org/reference/identifiers.html) for /// a formal definition of valid identifier name. @@ -536,7 +551,7 @@ impl Cursor<'_> { debug_assert!(length_opening >= 3); // whitespace between the opening and the infostring. - self.eat_while(|ch| ch != '\n' && is_whitespace(ch)); + self.eat_while(|ch| ch != '\n' && is_horizontal_whitespace(ch)); // copied from `eat_identifier`, but allows `-` and `.` in infostring to allow something like // `---Cargo.toml` as a valid opener @@ -545,7 +560,7 @@ impl Cursor<'_> { self.eat_while(|c| is_id_continue(c) || c == '-' || c == '.'); } - self.eat_while(|ch| ch != '\n' && is_whitespace(ch)); + self.eat_while(|ch| ch != '\n' && is_horizontal_whitespace(ch)); let invalid_infostring = self.first() != '\n'; let mut found = false; @@ -586,7 +601,7 @@ impl Cursor<'_> { // on a standalone line. Might be wrong. while let Some(closing) = rest.find("---") { let preceding_chars_start = rest[..closing].rfind("\n").map_or(0, |i| i + 1); - if rest[preceding_chars_start..closing].chars().all(is_whitespace) { + if rest[preceding_chars_start..closing].chars().all(is_horizontal_whitespace) { // candidate found potential_closing = Some(closing); break; diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs index 9792240a5485..e3bd6a9a3270 100644 --- a/compiler/rustc_parse/src/lexer/mod.rs +++ b/compiler/rustc_parse/src/lexer/mod.rs @@ -6,7 +6,7 @@ use rustc_ast::util::unicode::{TEXT_FLOW_CONTROL_CHARS, contains_text_flow_contr use rustc_errors::codes::*; use rustc_errors::{Applicability, Diag, DiagCtxtHandle, StashKey}; use rustc_lexer::{ - Base, Cursor, DocStyle, FrontmatterAllowed, LiteralKind, RawStrError, is_whitespace, + Base, Cursor, DocStyle, FrontmatterAllowed, LiteralKind, RawStrError, is_horizontal_whitespace, }; use rustc_literal_escaper::{EscapeError, Mode, check_for_errors}; use rustc_session::lint::BuiltinLintDiag; @@ -597,7 +597,7 @@ impl<'psess, 'src> Lexer<'psess, 'src> { let last_line_start = within.rfind('\n').map_or(0, |i| i + 1); let last_line = &within[last_line_start..]; - let last_line_trimmed = last_line.trim_start_matches(is_whitespace); + let last_line_trimmed = last_line.trim_start_matches(is_horizontal_whitespace); let last_line_start_pos = frontmatter_opening_end_pos + BytePos(last_line_start as u32); let frontmatter_span = self.mk_sp(frontmatter_opening_pos, self.pos); @@ -640,7 +640,7 @@ impl<'psess, 'src> Lexer<'psess, 'src> { }); } - if !rest.trim_matches(is_whitespace).is_empty() { + if !rest.trim_matches(is_horizontal_whitespace).is_empty() { let span = self.mk_sp(last_line_start_pos, self.pos); self.dcx().emit_err(errors::FrontmatterExtraCharactersAfterClose { span }); }