Rollup merge of #148321 - Marcondiro:master, r=Mark-Simulacrum

parser/lexer: bump to Unicode 17, use faster unicode-ident

Hello,

Bump the Unicode version used by the lexer/parser to 17.0.0 by updating:
- `unicode-normalization` to 0.1.25
- `unicode-properties` to 0.1.4
- `unicode-width` to 0.2.2

and by replacing `unicode-xid` with `unicode-ident`, which is also 6 times faster. I think it might be worth running the benchmarks to double-check. (`unicode-ident` is already in `src/tools/tidy/src/deps.rs`.)

Thanks!
2025-12-28 22:52:29 +01:00 · 2025-12-28 22:52:29 +01:00 · 30618bb89c
commit 30618bb89c
parent 21cf7fb3ff f7cb82e70a
10 changed files with 74 additions and 22 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -4144,8 +4144,8 @@ version = "0.0.0"
 dependencies = [
 "expect-test",
 "memchr",
+ "unicode-ident",
 "unicode-properties",
- "unicode-xid",
 ]

 [[package]]
@ -5981,24 +5981,24 @@ checksum = "75b844d17643ee918803943289730bec8aac480150456169e647ed0b576ba539"

 [[package]]
 name = "unicode-ident"
-version = "1.0.18"
+version = "1.0.22"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512"
+checksum = "9312f7c4f6ff9069b165498234ce8be658059c6728633667c526e27dc2cf1df5"

 [[package]]
 name = "unicode-normalization"
-version = "0.1.24"
+version = "0.1.25"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5033c97c4262335cded6d6fc3e5c18ab755e1a3dc96376350f3d8e9f009ad956"
+checksum = "5fd4f6878c9cb28d874b009da9e8d183b5abc80117c40bbd187a1fde336be6e8"
 dependencies = [
 "tinyvec",
 ]

 [[package]]
 name = "unicode-properties"
-version = "0.1.3"
+version = "0.1.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e70f2a8b45122e719eb623c01822704c4e0907e7e426a05927e1a1cfff5b75d0"
+checksum = "7df058c713841ad818f1dc5d3fd88063241cc61f49f5fbea4b951e8cf5a8d71d"

 [[package]]
 name = "unicode-script"
--- a/compiler/rustc_lexer/Cargo.toml
+++ b/compiler/rustc_lexer/Cargo.toml
@ -15,8 +15,8 @@ Rust lexer used by rustc. No stability guarantees are provided.
 # Note that this crate purposefully does not depend on other rustc crates
 [dependencies]
 memchr = "2.7.6"
-unicode-properties = { version = "0.1.0", default-features = false, features = ["emoji"] }
-unicode-xid = "0.2.0"
+unicode-properties = { version = "0.1.4", default-features = false, features = ["emoji"] }
+unicode-ident = "1.0.22"

 [dev-dependencies]
 expect-test = "1.4.0"
--- a/compiler/rustc_lexer/src/lib.rs
+++ b/compiler/rustc_lexer/src/lib.rs
@ -34,8 +34,25 @@ use LiteralKind::*;
 use TokenKind::*;
 use cursor::EOF_CHAR;
 pub use cursor::{Cursor, FrontmatterAllowed};
+pub use unicode_ident::UNICODE_VERSION;
 use unicode_properties::UnicodeEmoji;
-pub use unicode_xid::UNICODE_VERSION as UNICODE_XID_VERSION;
+
+// Make sure that the Unicode version of the dependencies is the same.
+const _: () = {
+    let properties = unicode_properties::UNICODE_VERSION;
+    let ident = unicode_ident::UNICODE_VERSION;
+
+    if properties.0 != ident.0 as u64
+        || properties.1 != ident.1 as u64
+        || properties.2 != ident.2 as u64
+    {
+        panic!(
+            "unicode-properties and unicode-ident must use the same Unicode version, \
+            `unicode_properties::UNICODE_VERSION` and `unicode_ident::UNICODE_VERSION` are \
+            different."
+        );
+    }
+};

 /// Parsed token.
 /// It doesn't contain information about data that has been parsed,
@ -370,14 +387,14 @@ pub fn is_horizontal_whitespace(c: char) -> bool {
 /// a formal definition of valid identifier name.
 pub fn is_id_start(c: char) -> bool {
    // This is XID_Start OR '_' (which formally is not a XID_Start).
-    c == '_' || unicode_xid::UnicodeXID::is_xid_start(c)
+    c == '_' || unicode_ident::is_xid_start(c)
 }

 /// True if `c` is valid as a non-first character of an identifier.
 /// See [Rust language reference](https://doc.rust-lang.org/reference/identifiers.html) for
 /// a formal definition of valid identifier name.
 pub fn is_id_continue(c: char) -> bool {
-    unicode_xid::UnicodeXID::is_xid_continue(c)
+    unicode_ident::is_xid_continue(c)
 }

 /// The passed string is lexically an identifier.
--- a/compiler/rustc_parse/Cargo.toml
+++ b/compiler/rustc_parse/Cargo.toml
@ -20,8 +20,8 @@ rustc_session = { path = "../rustc_session" }
 rustc_span = { path = "../rustc_span" }
 thin-vec = "0.2.12"
 tracing = "0.1"
-unicode-normalization = "0.1.11"
-unicode-width = "0.2.0"
+unicode-normalization = "0.1.25"
+unicode-width = "0.2.2"
 # tidy-alphabetical-end

 [dev-dependencies]
--- a/compiler/rustc_parse/src/lib.rs
+++ b/compiler/rustc_parse/src/lib.rs
@ -22,10 +22,10 @@ use rustc_ast::token;
 use rustc_ast::tokenstream::TokenStream;
 use rustc_ast_pretty::pprust;
 use rustc_errors::{Diag, EmissionGuarantee, FatalError, PResult, pluralize};
+pub use rustc_lexer::UNICODE_VERSION;
 use rustc_session::parse::ParseSess;
 use rustc_span::source_map::SourceMap;
 use rustc_span::{FileName, SourceFile, Span};
-pub use unicode_normalization::UNICODE_VERSION as UNICODE_NORMALIZATION_VERSION;

 pub const MACRO_ARGUMENTS: Option<&str> = Some("macro arguments");

@ -39,6 +39,44 @@ pub mod lexer;

 mod errors;

+// Make sure that the Unicode version of the dependencies is the same.
+const _: () = {
+    let rustc_lexer = rustc_lexer::UNICODE_VERSION;
+    let rustc_span = rustc_span::UNICODE_VERSION;
+    let normalization = unicode_normalization::UNICODE_VERSION;
+    let width = unicode_width::UNICODE_VERSION;
+
+    if rustc_lexer.0 != rustc_span.0
+        || rustc_lexer.1 != rustc_span.1
+        || rustc_lexer.2 != rustc_span.2
+    {
+        panic!(
+            "rustc_lexer and rustc_span must use the same Unicode version, \
+            `rustc_lexer::UNICODE_VERSION` and `rustc_span::UNICODE_VERSION` are \
+            different."
+        );
+    }
+
+    if rustc_lexer.0 != normalization.0
+        || rustc_lexer.1 != normalization.1
+        || rustc_lexer.2 != normalization.2
+    {
+        panic!(
+            "rustc_lexer and unicode-normalization must use the same Unicode version, \
+            `rustc_lexer::UNICODE_VERSION` and `unicode_normalization::UNICODE_VERSION` are \
+            different."
+        );
+    }
+
+    if rustc_lexer.0 != width.0 || rustc_lexer.1 != width.1 || rustc_lexer.2 != width.2 {
+        panic!(
+            "rustc_lexer and unicode-width must use the same Unicode version, \
+            `rustc_lexer::UNICODE_VERSION` and `unicode_width::UNICODE_VERSION` are \
+            different."
+        );
+    }
+};
+
 rustc_fluent_macro::fluent_messages! { "../messages.ftl" }

 // Unwrap the result if `Ok`, otherwise emit the diagnostics and abort.
--- a/compiler/rustc_span/Cargo.toml
+++ b/compiler/rustc_span/Cargo.toml
@ -21,5 +21,5 @@ scoped-tls = "1.0"
 sha1 = "0.10.0"
 sha2 = "0.10.1"
 tracing = "0.1"
-unicode-width = "0.2.0"
+unicode-width = "0.2.2"
 # tidy-alphabetical-end
--- a/compiler/rustc_span/src/lib.rs
+++ b/compiler/rustc_span/src/lib.rs
@ -39,6 +39,7 @@ use rustc_macros::{Decodable, Encodable, HashStable_Generic};
 use rustc_serialize::opaque::{FileEncoder, MemDecoder};
 use rustc_serialize::{Decodable, Decoder, Encodable, Encoder};
 use tracing::debug;
+pub use unicode_width::UNICODE_VERSION;

 mod caching_source_map_view;
 pub mod source_map;
--- a/src/tools/tidy/src/deps.rs
+++ b/src/tools/tidy/src/deps.rs
@ -466,7 +466,6 @@ const PERMITTED_RUSTC_DEPENDENCIES: &[&str] = &[
    "unicode-script",
    "unicode-security",
    "unicode-width",
-    "unicode-xid",
    "utf8parse",
    "valuable",
    "version_check",
--- a/tests/ui-fulldeps/lexer/unicode-version.rs
+++ b/tests/ui-fulldeps/lexer/unicode-version.rs
@ -12,7 +12,6 @@
 #![feature(rustc_private)]

 extern crate rustc_driver;
-extern crate rustc_lexer;
 extern crate rustc_parse;

 fn main() {
@ -22,6 +21,5 @@ fn main() {
         it should also be updated in the reference at \
         https://github.com/rust-lang/reference/blob/HEAD/src/identifiers.md."
    );
-    println!("Unicode XID version is: {:?}", rustc_lexer::UNICODE_XID_VERSION);
-    println!("Unicode normalization version is: {:?}", rustc_parse::UNICODE_NORMALIZATION_VERSION);
+    println!("Unicode version used in rustc_parse is: {:?}", rustc_parse::UNICODE_VERSION);
 }
--- a/tests/ui-fulldeps/lexer/unicode-version.run.stdout
+++ b/tests/ui-fulldeps/lexer/unicode-version.run.stdout
@ -1,4 +1,3 @@
 Checking if Unicode version changed.
 If the Unicode version changes are intentional, it should also be updated in the reference at https://github.com/rust-lang/reference/blob/HEAD/src/identifiers.md.
-Unicode XID version is: (16, 0, 0)
-Unicode normalization version is: (16, 0, 0)
+Unicode version used in rustc_parse is: (17, 0, 0)