Revert "unicode_data refactors RUST-147622"
This PR reverts RUST-147622 for several reasons: 1. The RUST-147622 PR would format the generated core library code using an arbitrary `rustfmt` picked up from `PATH`, which will cause hard-to-debug failures when the `rustfmt` used to format the generated unicode data code differs from the `rustfmt` used to format the in-tree library code. 2. Previously, the `unicode-table-generator` tests were not run under CI as part of `coretests`, and since for the `x86_64-gnu-aux` job we run library `coretests` with `miri`, the generated tests unfortunately caused an unacceptably large Merge CI time regression from ~2 hours to ~3.5 hours, making it the slowest Merge CI job (and thus the new bottleneck). 3. This PR also has the unintended effect of causing a diagnostic regression (RUST-148387), though that's mostly an edge case not properly handled by `rustc` diagnostics. Given that these are three distinct causes with non-trivial fixes, I'm proposing to revert this PR to return us to baseline. This is not prejudice against relanding the changes with these issues addressed, but is meant to alleviate the time pressure of addressing these non-trivial issues.
This commit is contained in:
parent
f2bae990e8
commit
4aeb297064
13 changed files with 1588 additions and 4696 deletions
|
|
@ -18,9 +18,8 @@ pub(crate) use unicode_data::white_space::lookup as White_Space;
|
|||
|
||||
pub(crate) mod printable;
|
||||
|
||||
mod rt;
|
||||
#[allow(unreachable_pub)]
|
||||
pub mod unicode_data;
|
||||
mod unicode_data;
|
||||
|
||||
/// The version of [Unicode](https://www.unicode.org/) that the Unicode parts of
|
||||
/// `char` and `str` methods are based on.
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load diff
|
|
@ -116,7 +116,6 @@
|
|||
#![feature(try_find)]
|
||||
#![feature(try_trait_v2)]
|
||||
#![feature(uint_bit_width)]
|
||||
#![feature(unicode_internals)]
|
||||
#![feature(unsize)]
|
||||
#![feature(unwrap_infallible)]
|
||||
// tidy-alphabetical-end
|
||||
|
|
|
|||
|
|
@ -1,101 +1,5 @@
|
|||
use core::unicode::unicode_data;
|
||||
use std::ops::RangeInclusive;
|
||||
|
||||
mod test_data;
|
||||
|
||||
#[test]
|
||||
pub fn version() {
|
||||
let (major, _minor, _update) = core::char::UNICODE_VERSION;
|
||||
assert!(major >= 10);
|
||||
}
|
||||
|
||||
#[track_caller]
|
||||
fn test_boolean_property(ranges: &[RangeInclusive<char>], lookup: fn(char) -> bool) {
|
||||
let mut start = '\u{80}';
|
||||
for range in ranges {
|
||||
for c in start..*range.start() {
|
||||
assert!(!lookup(c), "{c:?}");
|
||||
}
|
||||
for c in range.clone() {
|
||||
assert!(lookup(c), "{c:?}");
|
||||
}
|
||||
start = char::from_u32(*range.end() as u32 + 1).unwrap();
|
||||
}
|
||||
for c in start..=char::MAX {
|
||||
assert!(!lookup(c), "{c:?}");
|
||||
}
|
||||
}
|
||||
|
||||
#[track_caller]
|
||||
fn test_case_mapping(ranges: &[(char, [char; 3])], lookup: fn(char) -> [char; 3]) {
|
||||
let mut start = '\u{80}';
|
||||
for &(key, val) in ranges {
|
||||
for c in start..key {
|
||||
assert_eq!(lookup(c), [c, '\0', '\0'], "{c:?}");
|
||||
}
|
||||
assert_eq!(lookup(key), val, "{key:?}");
|
||||
start = char::from_u32(key as u32 + 1).unwrap();
|
||||
}
|
||||
for c in start..=char::MAX {
|
||||
assert_eq!(lookup(c), [c, '\0', '\0'], "{c:?}");
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[cfg_attr(miri, ignore)]
|
||||
fn alphabetic() {
|
||||
test_boolean_property(test_data::ALPHABETIC, unicode_data::alphabetic::lookup);
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[cfg_attr(miri, ignore)]
|
||||
fn case_ignorable() {
|
||||
test_boolean_property(test_data::CASE_IGNORABLE, unicode_data::case_ignorable::lookup);
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[cfg_attr(miri, ignore)]
|
||||
fn cased() {
|
||||
test_boolean_property(test_data::CASED, unicode_data::cased::lookup);
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[cfg_attr(miri, ignore)]
|
||||
fn grapheme_extend() {
|
||||
test_boolean_property(test_data::GRAPHEME_EXTEND, unicode_data::grapheme_extend::lookup);
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[cfg_attr(miri, ignore)]
|
||||
fn lowercase() {
|
||||
test_boolean_property(test_data::LOWERCASE, unicode_data::lowercase::lookup);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn n() {
|
||||
test_boolean_property(test_data::N, unicode_data::n::lookup);
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[cfg_attr(miri, ignore)]
|
||||
fn uppercase() {
|
||||
test_boolean_property(test_data::UPPERCASE, unicode_data::uppercase::lookup);
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[cfg_attr(miri, ignore)]
|
||||
fn white_space() {
|
||||
test_boolean_property(test_data::WHITE_SPACE, unicode_data::white_space::lookup);
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[cfg_attr(miri, ignore)]
|
||||
fn to_lowercase() {
|
||||
test_case_mapping(test_data::TO_LOWER, unicode_data::conversions::to_lower);
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[cfg_attr(miri, ignore)]
|
||||
fn to_uppercase() {
|
||||
test_case_mapping(test_data::TO_UPPER, unicode_data::conversions::to_upper);
|
||||
}
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load diff
|
|
@ -374,7 +374,6 @@ impl Step for UnicodeTableGenerator {
|
|||
fn run(self, builder: &Builder<'_>) {
|
||||
let mut cmd = builder.tool_cmd(Tool::UnicodeTableGenerator);
|
||||
cmd.arg(builder.src.join("library/core/src/unicode/unicode_data.rs"));
|
||||
cmd.arg(builder.src.join("library/coretests/tests/unicode/test_data.rs"));
|
||||
cmd.run(builder);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,8 +1,9 @@
|
|||
use std::collections::HashMap;
|
||||
use std::fmt::Write as _;
|
||||
use std::ops::Range;
|
||||
|
||||
use crate::fmt_list;
|
||||
use crate::raw_emitter::RawEmitter;
|
||||
use crate::writeln;
|
||||
|
||||
impl RawEmitter {
|
||||
pub fn emit_cascading_map(&mut self, ranges: &[Range<u32>]) -> bool {
|
||||
|
|
@ -23,6 +24,8 @@ impl RawEmitter {
|
|||
.flat_map(|r| (r.start..r.end).collect::<Vec<u32>>())
|
||||
.collect::<Vec<u32>>();
|
||||
|
||||
println!("there are {} points", points.len());
|
||||
|
||||
// how many distinct ranges need to be counted?
|
||||
let mut codepoints_by_high_bytes = HashMap::<usize, Vec<u32>>::new();
|
||||
for point in points {
|
||||
|
|
@ -34,7 +37,7 @@ impl RawEmitter {
|
|||
}
|
||||
|
||||
let mut bit_for_high_byte = 1u8;
|
||||
let mut arms = String::new();
|
||||
let mut arms = Vec::<String>::new();
|
||||
|
||||
let mut high_bytes: Vec<usize> = codepoints_by_high_bytes.keys().copied().collect();
|
||||
high_bytes.sort();
|
||||
|
|
@ -42,33 +45,33 @@ impl RawEmitter {
|
|||
let codepoints = codepoints_by_high_bytes.get_mut(&high_byte).unwrap();
|
||||
if codepoints.len() == 1 {
|
||||
let ch = codepoints.pop().unwrap();
|
||||
writeln!(arms, "{high_byte:#04x} => c as u32 == {ch:#04x},");
|
||||
arms.push(format!("{high_byte} => c as u32 == {ch:#04x}"));
|
||||
continue;
|
||||
}
|
||||
// more than 1 codepoint in this arm
|
||||
for codepoint in codepoints {
|
||||
map[(*codepoint & 0xff) as usize] |= bit_for_high_byte;
|
||||
}
|
||||
writeln!(
|
||||
arms,
|
||||
"{high_byte:#04x} => WHITESPACE_MAP[c as usize & 0xff] & {bit_for_high_byte} != 0,"
|
||||
);
|
||||
arms.push(format!(
|
||||
"{high_byte} => WHITESPACE_MAP[c as usize & 0xff] & {bit_for_high_byte} != 0"
|
||||
));
|
||||
bit_for_high_byte <<= 1;
|
||||
}
|
||||
|
||||
writeln!(&mut self.file, "static WHITESPACE_MAP: [u8; 256] = [{}];", fmt_list(map.iter()))
|
||||
.unwrap();
|
||||
self.bytes_used += 256;
|
||||
self.file = format!(
|
||||
"static WHITESPACE_MAP: [u8; 256] = {map:?};
|
||||
|
||||
#[inline]
|
||||
pub const fn lookup(c: char) -> bool {{
|
||||
debug_assert!(!c.is_ascii());
|
||||
match c as u32 >> 8 {{
|
||||
{arms}\
|
||||
_ => false,
|
||||
}}
|
||||
}}"
|
||||
);
|
||||
writeln!(&mut self.file, "#[inline]").unwrap();
|
||||
writeln!(&mut self.file, "pub const fn lookup(c: char) -> bool {{").unwrap();
|
||||
writeln!(&mut self.file, " debug_assert!(!c.is_ascii());").unwrap();
|
||||
writeln!(&mut self.file, " match c as u32 >> 8 {{").unwrap();
|
||||
for arm in arms {
|
||||
writeln!(&mut self.file, " {arm},").unwrap();
|
||||
}
|
||||
writeln!(&mut self.file, " _ => false,").unwrap();
|
||||
writeln!(&mut self.file, " }}").unwrap();
|
||||
writeln!(&mut self.file, "}}").unwrap();
|
||||
|
||||
true
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,25 +1,27 @@
|
|||
use std::char;
|
||||
use std::collections::BTreeMap;
|
||||
use std::fmt::{self, Write};
|
||||
|
||||
use crate::fmt_helpers::Hex;
|
||||
use crate::{CharEscape, UnicodeData, fmt_list};
|
||||
use crate::{UnicodeData, fmt_list};
|
||||
|
||||
const INDEX_MASK: u32 = 1 << 22;
|
||||
|
||||
pub(crate) fn generate_case_mapping(data: &UnicodeData) -> (String, [usize; 2]) {
|
||||
let mut file = String::new();
|
||||
|
||||
write!(file, "const INDEX_MASK: u32 = 0x{INDEX_MASK:x};").unwrap();
|
||||
file.push_str("\n\n");
|
||||
file.push_str(HEADER.trim_start());
|
||||
file.push('\n');
|
||||
let (lower_tables, lower_size) = generate_tables("LOWER", &data.to_lower);
|
||||
file.push_str(&lower_tables);
|
||||
file.push_str("\n\n");
|
||||
let (upper_tables, upper_size) = generate_tables("UPPER", &data.to_upper);
|
||||
let file = format!(
|
||||
"{lower_tables}
|
||||
{upper_tables}"
|
||||
);
|
||||
file.push_str(&upper_tables);
|
||||
(file, [lower_size, upper_size])
|
||||
}
|
||||
|
||||
fn generate_tables(case: &str, data: &BTreeMap<u32, [u32; 3]>) -> (String, usize) {
|
||||
let case_lower = case.to_lowercase();
|
||||
let case_upper = case.to_uppercase();
|
||||
|
||||
let mut mappings = Vec::with_capacity(data.len());
|
||||
let mut multis = Vec::new();
|
||||
|
||||
|
|
@ -42,49 +44,77 @@ fn generate_tables(case: &str, data: &BTreeMap<u32, [u32; 3]>) -> (String, usize
|
|||
INDEX_MASK | (u32::try_from(multis.len()).unwrap() - 1)
|
||||
};
|
||||
|
||||
mappings.push((CharEscape(key), Hex(value)));
|
||||
mappings.push((CharEscape(key), value));
|
||||
}
|
||||
|
||||
let size = size_of_val(mappings.as_slice()) + size_of_val(multis.as_slice());
|
||||
let file = format!(
|
||||
"
|
||||
#[rustfmt::skip]
|
||||
static {case}CASE_TABLE: &[(char, u32); {mappings_len}] = &[{mappings}];
|
||||
let mut tables = String::new();
|
||||
let mut size = 0;
|
||||
|
||||
#[rustfmt::skip]
|
||||
static {case}CASE_TABLE_MULTI: &[[char; 3]; {multis_len}] = &[{multis}];
|
||||
size += size_of_val(mappings.as_slice());
|
||||
write!(
|
||||
tables,
|
||||
"static {}CASE_TABLE: &[(char, u32); {}] = &[{}];",
|
||||
case,
|
||||
mappings.len(),
|
||||
fmt_list(mappings),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
#[inline]
|
||||
pub fn to_{case_lower}(c: char) -> [char; 3] {{
|
||||
const {{
|
||||
let mut i = 0;
|
||||
while i < {case_upper}CASE_TABLE.len() {{
|
||||
let (_, val) = {case_upper}CASE_TABLE[i];
|
||||
if val & (1 << 22) == 0 {{
|
||||
assert!(char::from_u32(val).is_some());
|
||||
}} else {{
|
||||
let index = val & ((1 << 22) - 1);
|
||||
assert!((index as usize) < {case_upper}CASE_TABLE_MULTI.len());
|
||||
}}
|
||||
i += 1;
|
||||
}}
|
||||
}}
|
||||
tables.push_str("\n\n");
|
||||
|
||||
// SAFETY: Just checked that the tables are valid
|
||||
unsafe {{
|
||||
super::case_conversion(
|
||||
c,
|
||||
|c| c.to_ascii_{case_lower}case(),
|
||||
{case_upper}CASE_TABLE,
|
||||
{case_upper}CASE_TABLE_MULTI,
|
||||
)
|
||||
}}
|
||||
}}",
|
||||
mappings = fmt_list(&mappings),
|
||||
mappings_len = mappings.len(),
|
||||
multis = fmt_list(&multis),
|
||||
multis_len = multis.len(),
|
||||
);
|
||||
size += size_of_val(multis.as_slice());
|
||||
write!(
|
||||
tables,
|
||||
"static {}CASE_TABLE_MULTI: &[[char; 3]; {}] = &[{}];",
|
||||
case,
|
||||
multis.len(),
|
||||
fmt_list(multis),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
(file, size)
|
||||
(tables, size)
|
||||
}
|
||||
|
||||
struct CharEscape(char);
|
||||
|
||||
impl fmt::Debug for CharEscape {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
write!(f, "'{}'", self.0.escape_default())
|
||||
}
|
||||
}
|
||||
|
||||
static HEADER: &str = r"
|
||||
pub fn to_lower(c: char) -> [char; 3] {
|
||||
if c.is_ascii() {
|
||||
[(c as u8).to_ascii_lowercase() as char, '\0', '\0']
|
||||
} else {
|
||||
LOWERCASE_TABLE
|
||||
.binary_search_by(|&(key, _)| key.cmp(&c))
|
||||
.map(|i| {
|
||||
let u = LOWERCASE_TABLE[i].1;
|
||||
char::from_u32(u).map(|c| [c, '\0', '\0']).unwrap_or_else(|| {
|
||||
// SAFETY: Index comes from statically generated table
|
||||
unsafe { *LOWERCASE_TABLE_MULTI.get_unchecked((u & (INDEX_MASK - 1)) as usize) }
|
||||
})
|
||||
})
|
||||
.unwrap_or([c, '\0', '\0'])
|
||||
}
|
||||
}
|
||||
|
||||
pub fn to_upper(c: char) -> [char; 3] {
|
||||
if c.is_ascii() {
|
||||
[(c as u8).to_ascii_uppercase() as char, '\0', '\0']
|
||||
} else {
|
||||
UPPERCASE_TABLE
|
||||
.binary_search_by(|&(key, _)| key.cmp(&c))
|
||||
.map(|i| {
|
||||
let u = UPPERCASE_TABLE[i].1;
|
||||
char::from_u32(u).map(|c| [c, '\0', '\0']).unwrap_or_else(|| {
|
||||
// SAFETY: Index comes from statically generated table
|
||||
unsafe { *UPPERCASE_TABLE_MULTI.get_unchecked((u & (INDEX_MASK - 1)) as usize) }
|
||||
})
|
||||
})
|
||||
.unwrap_or([c, '\0', '\0'])
|
||||
}
|
||||
}
|
||||
";
|
||||
|
|
|
|||
|
|
@ -1,82 +0,0 @@
|
|||
use std::fmt;
|
||||
|
||||
// Convenience macros for writing and unwrapping.
|
||||
#[macro_export]
|
||||
macro_rules! writeln {
|
||||
($($args:tt)*) => {{
|
||||
use std::fmt::Write as _;
|
||||
std::writeln!($($args)*).unwrap();
|
||||
}};
|
||||
}
|
||||
#[macro_export]
|
||||
macro_rules! write {
|
||||
($($args:tt)*) => {{
|
||||
use std::fmt::Write as _;
|
||||
std::write!($($args)*).unwrap();
|
||||
}};
|
||||
}
|
||||
|
||||
pub fn fmt_list<V: fmt::Debug>(values: impl IntoIterator<Item = V>) -> String {
|
||||
let pieces = values.into_iter().map(|b| format!("{b:?}, "));
|
||||
let mut out = String::new();
|
||||
let mut line = String::from("\n ");
|
||||
for piece in pieces {
|
||||
if line.len() + piece.len() < 98 {
|
||||
line.push_str(&piece);
|
||||
} else {
|
||||
writeln!(out, "{}", line.trim_end());
|
||||
line = format!(" {piece}");
|
||||
}
|
||||
}
|
||||
writeln!(out, "{}", line.trim_end());
|
||||
out
|
||||
}
|
||||
|
||||
/// Wrapper type for formatting a `T` using its `Binary` implementation.
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct Bin<T>(pub T);
|
||||
|
||||
impl<T: fmt::Binary> fmt::Debug for Bin<T> {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
let bits = size_of::<T>() * 8;
|
||||
std::write!(f, "0b{:0bits$b}", self.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: fmt::Binary> fmt::Display for Bin<T> {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
fmt::Debug::fmt(self, f)
|
||||
}
|
||||
}
|
||||
|
||||
/// Wrapper type for formatting a `T` using its `LowerHex` implementation.
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct Hex<T>(pub T);
|
||||
|
||||
impl<T: fmt::LowerHex> fmt::Debug for Hex<T> {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
std::write!(f, "{:#x}", self.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: fmt::LowerHex> fmt::Display for Hex<T> {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
fmt::Debug::fmt(self, f)
|
||||
}
|
||||
}
|
||||
|
||||
/// Wrapper type for formatting a `char` using `escape_unicode`.
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct CharEscape(pub char);
|
||||
|
||||
impl fmt::Debug for CharEscape {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
std::write!(f, "'{}'", self.0.escape_unicode())
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for CharEscape {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
fmt::Debug::fmt(self, f)
|
||||
}
|
||||
}
|
||||
|
|
@ -72,18 +72,18 @@
|
|||
//! or not.
|
||||
|
||||
use std::collections::{BTreeMap, HashMap};
|
||||
use std::fmt;
|
||||
use std::fmt::Write;
|
||||
use std::ops::Range;
|
||||
|
||||
use ucd_parse::Codepoints;
|
||||
|
||||
mod cascading_map;
|
||||
mod case_mapping;
|
||||
mod fmt_helpers;
|
||||
mod raw_emitter;
|
||||
mod skiplist;
|
||||
mod unicode_download;
|
||||
|
||||
use fmt_helpers::*;
|
||||
use raw_emitter::{RawEmitter, emit_codepoints, emit_whitespace};
|
||||
|
||||
static PROPERTIES: &[&str] = &[
|
||||
|
|
@ -207,27 +207,29 @@ fn load_data() -> UnicodeData {
|
|||
}
|
||||
|
||||
fn main() {
|
||||
let args = std::env::args().collect::<Vec<_>>();
|
||||
|
||||
if args.len() != 3 {
|
||||
eprintln!("Must provide paths to write unicode tables and tests to");
|
||||
let write_location = std::env::args().nth(1).unwrap_or_else(|| {
|
||||
eprintln!("Must provide path to write unicode tables to");
|
||||
eprintln!(
|
||||
"e.g. {} library/core/src/unicode/unicode_data.rs library/coretests/tests/unicode/test_data.rs",
|
||||
args[0]
|
||||
"e.g. {} library/core/src/unicode/unicode_data.rs",
|
||||
std::env::args().next().unwrap_or_default()
|
||||
);
|
||||
std::process::exit(1);
|
||||
}
|
||||
});
|
||||
|
||||
let data_path = &args[1];
|
||||
let test_path = &args[2];
|
||||
// Optional test path, which is a Rust source file testing that the unicode
|
||||
// property lookups are correct.
|
||||
let test_path = std::env::args().nth(2);
|
||||
|
||||
let unicode_data = load_data();
|
||||
let ranges_by_property = &unicode_data.ranges;
|
||||
|
||||
if let Some(path) = test_path {
|
||||
std::fs::write(&path, generate_tests(&unicode_data).unwrap()).unwrap();
|
||||
}
|
||||
|
||||
let mut table_file = String::new();
|
||||
writeln!(
|
||||
table_file,
|
||||
"//! This file is generated by `./x run src/tools/unicode-table-generator`; do not edit manually!",
|
||||
table_file.push_str(
|
||||
"//! This file is generated by `./x run src/tools/unicode-table-generator`; do not edit manually!\n",
|
||||
);
|
||||
|
||||
let mut total_bytes = 0;
|
||||
|
|
@ -243,9 +245,8 @@ fn main() {
|
|||
}
|
||||
|
||||
modules.push((property.to_lowercase().to_string(), emitter.file));
|
||||
writeln!(
|
||||
table_file,
|
||||
"// {:16}: {:5} bytes, {:6} codepoints in {:3} ranges (U+{:06X} - U+{:06X}) using {}",
|
||||
table_file.push_str(&format!(
|
||||
"// {:16}: {:5} bytes, {:6} codepoints in {:3} ranges (U+{:06X} - U+{:06X}) using {}\n",
|
||||
property,
|
||||
emitter.bytes_used,
|
||||
datapoints,
|
||||
|
|
@ -253,42 +254,47 @@ fn main() {
|
|||
ranges.first().unwrap().start,
|
||||
ranges.last().unwrap().end,
|
||||
emitter.desc,
|
||||
);
|
||||
));
|
||||
total_bytes += emitter.bytes_used;
|
||||
}
|
||||
let (conversions, sizes) = case_mapping::generate_case_mapping(&unicode_data);
|
||||
for (name, size) in ["to_lower", "to_upper"].iter().zip(sizes) {
|
||||
writeln!(table_file, "// {:16}: {:5} bytes", name, size);
|
||||
table_file.push_str(&format!("// {:16}: {:5} bytes\n", name, size));
|
||||
total_bytes += size;
|
||||
}
|
||||
writeln!(table_file, "// {:16}: {:5} bytes\n", "Total", total_bytes);
|
||||
table_file.push_str(&format!("// {:16}: {:5} bytes\n", "Total", total_bytes));
|
||||
|
||||
writeln!(table_file, "use super::rt::*;\n");
|
||||
writeln!(table_file, "{}\n", version());
|
||||
// Include the range search function
|
||||
table_file.push('\n');
|
||||
table_file.push_str(include_str!("range_search.rs"));
|
||||
table_file.push('\n');
|
||||
|
||||
table_file.push_str(&version());
|
||||
|
||||
table_file.push('\n');
|
||||
|
||||
modules.push((String::from("conversions"), conversions));
|
||||
|
||||
for (name, contents) in modules {
|
||||
writeln!(table_file, "pub mod {name} {{");
|
||||
for line in contents.trim().lines() {
|
||||
writeln!(table_file, " {line}");
|
||||
table_file.push_str("#[rustfmt::skip]\n");
|
||||
table_file.push_str(&format!("pub mod {name} {{\n"));
|
||||
for line in contents.lines() {
|
||||
if !line.trim().is_empty() {
|
||||
table_file.push_str(" ");
|
||||
table_file.push_str(line);
|
||||
}
|
||||
table_file.push('\n');
|
||||
}
|
||||
writeln!(table_file, "}}\n");
|
||||
table_file.push_str("}\n\n");
|
||||
}
|
||||
|
||||
let test_file = generate_tests(&unicode_data);
|
||||
|
||||
std::fs::write(&test_path, test_file).unwrap();
|
||||
std::fs::write(&data_path, table_file).unwrap();
|
||||
rustfmt(&data_path);
|
||||
rustfmt(&test_path);
|
||||
}
|
||||
|
||||
fn rustfmt(path: &str) {
|
||||
std::process::Command::new("rustfmt").arg(path).status().expect("rustfmt failed");
|
||||
std::fs::write(&write_location, format!("{}\n", table_file.trim_end())).unwrap();
|
||||
}
|
||||
|
||||
fn version() -> String {
|
||||
let mut out = String::new();
|
||||
out.push_str("pub const UNICODE_VERSION: (u8, u8, u8) = ");
|
||||
|
||||
let readme =
|
||||
std::fs::read_to_string(std::path::Path::new(UNICODE_DIRECTORY).join("ReadMe.txt"))
|
||||
.unwrap();
|
||||
|
|
@ -300,72 +306,109 @@ fn version() -> String {
|
|||
readme[start..end].split('.').map(|v| v.parse::<u32>().expect(v)).collect::<Vec<_>>();
|
||||
let [major, minor, micro] = [version[0], version[1], version[2]];
|
||||
|
||||
format!("pub const UNICODE_VERSION: (u8, u8, u8) = ({major}, {minor}, {micro});")
|
||||
out.push_str(&format!("({major}, {minor}, {micro});\n"));
|
||||
out
|
||||
}
|
||||
|
||||
fn generate_tests(data: &UnicodeData) -> String {
|
||||
fn fmt_list<V: std::fmt::Debug>(values: impl IntoIterator<Item = V>) -> String {
|
||||
let pieces = values.into_iter().map(|b| format!("{b:?}, ")).collect::<Vec<_>>();
|
||||
let mut out = String::new();
|
||||
let mut line = String::from("\n ");
|
||||
for piece in pieces {
|
||||
if line.len() + piece.len() < 98 {
|
||||
line.push_str(&piece);
|
||||
} else {
|
||||
out.push_str(line.trim_end());
|
||||
out.push('\n');
|
||||
line = format!(" {piece}");
|
||||
}
|
||||
}
|
||||
out.push_str(line.trim_end());
|
||||
out.push('\n');
|
||||
out
|
||||
}
|
||||
|
||||
fn generate_tests(data: &UnicodeData) -> Result<String, fmt::Error> {
|
||||
let mut s = String::new();
|
||||
writeln!(
|
||||
s,
|
||||
"//! This file is generated by `./x run src/tools/unicode-table-generator`; do not edit manually!"
|
||||
);
|
||||
writeln!(s, "// ignore-tidy-filelength\n");
|
||||
writeln!(s, "use std::ops::RangeInclusive;\n");
|
||||
writeln!(s, "#![feature(core_intrinsics)]")?;
|
||||
writeln!(s, "#![allow(internal_features, dead_code)]")?;
|
||||
writeln!(s, "// ignore-tidy-filelength")?;
|
||||
writeln!(s, "use std::intrinsics;")?;
|
||||
writeln!(s, "mod unicode_data;")?;
|
||||
writeln!(s, "fn main() {{")?;
|
||||
for (property, ranges) in &data.ranges {
|
||||
let prop_upper = property.to_uppercase();
|
||||
let is_true = (char::MIN..=char::MAX)
|
||||
let prop = property.to_lowercase();
|
||||
writeln!(s, r#" println!("Testing {prop}");"#)?;
|
||||
writeln!(s, " {prop}_true();")?;
|
||||
writeln!(s, " {prop}_false();")?;
|
||||
let (is_true, is_false): (Vec<_>, Vec<_>) = (char::MIN..=char::MAX)
|
||||
.filter(|c| !c.is_ascii())
|
||||
.map(u32::from)
|
||||
.filter(|c| ranges.iter().any(|r| r.contains(c)))
|
||||
.collect::<Vec<_>>();
|
||||
let is_true = ranges_from_set(&is_true);
|
||||
let is_true = is_true
|
||||
.iter()
|
||||
.map(|r| {
|
||||
let start = char::from_u32(r.start).unwrap();
|
||||
let end = char::from_u32(r.end - 1).unwrap();
|
||||
CharEscape(start)..=CharEscape(end)
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
.partition(|c| ranges.iter().any(|r| r.contains(c)));
|
||||
|
||||
writeln!(
|
||||
s,
|
||||
r#"
|
||||
#[rustfmt::skip]
|
||||
pub(super) static {prop_upper}: &[RangeInclusive<char>; {is_true_len}] = &[{is_true}];
|
||||
"#,
|
||||
is_true_len = is_true.len(),
|
||||
is_true = fmt_list(is_true),
|
||||
);
|
||||
writeln!(s, " fn {prop}_true() {{")?;
|
||||
generate_asserts(&mut s, &prop, &is_true, true)?;
|
||||
writeln!(s, " }}")?;
|
||||
|
||||
writeln!(s, " fn {prop}_false() {{")?;
|
||||
generate_asserts(&mut s, &prop, &is_false, false)?;
|
||||
writeln!(s, " }}")?;
|
||||
}
|
||||
|
||||
for (prop_lower, conversion) in
|
||||
["to_lower", "to_upper"].iter().zip([&data.to_lower, &data.to_upper])
|
||||
for (name, conversion) in ["to_lower", "to_upper"].iter().zip([&data.to_lower, &data.to_upper])
|
||||
{
|
||||
let prop_upper = prop_lower.to_uppercase();
|
||||
writeln!(s, r#" println!("Testing {name}");"#)?;
|
||||
for (c, mapping) in conversion {
|
||||
let c = char::from_u32(*c).unwrap();
|
||||
let mapping = mapping.map(|c| char::from_u32(c).unwrap());
|
||||
writeln!(
|
||||
s,
|
||||
r#" assert_eq!(unicode_data::conversions::{name}({c:?}), {mapping:?});"#
|
||||
)?;
|
||||
}
|
||||
let unmapped: Vec<_> = (char::MIN..=char::MAX)
|
||||
.filter(|c| !c.is_ascii())
|
||||
.map(u32::from)
|
||||
.filter(|c| !conversion.contains_key(c))
|
||||
.collect();
|
||||
let unmapped_ranges = ranges_from_set(&unmapped);
|
||||
for range in unmapped_ranges {
|
||||
let start = char::from_u32(range.start).unwrap();
|
||||
let end = char::from_u32(range.end - 1).unwrap();
|
||||
writeln!(s, " for c in {start:?}..={end:?} {{")?;
|
||||
writeln!(
|
||||
s,
|
||||
r#" assert_eq!(unicode_data::conversions::{name}(c), [c, '\0', '\0']);"#
|
||||
)?;
|
||||
|
||||
let mapped = conversion
|
||||
.iter()
|
||||
.map(|(c, chars)| {
|
||||
(
|
||||
CharEscape(char::from_u32(*c).unwrap()),
|
||||
chars.map(|c| CharEscape(char::from_u32(c).unwrap())),
|
||||
)
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
writeln!(
|
||||
s,
|
||||
r#"
|
||||
#[rustfmt::skip]
|
||||
pub(super) static {prop_upper}: &[(char, [char; 3]); {mapped_len}] = &[{mapped}];
|
||||
"#,
|
||||
mapped_len = mapped.len(),
|
||||
mapped = fmt_list(mapped),
|
||||
);
|
||||
writeln!(s, " }}")?;
|
||||
}
|
||||
}
|
||||
|
||||
s
|
||||
writeln!(s, "}}")?;
|
||||
Ok(s)
|
||||
}
|
||||
|
||||
fn generate_asserts(
|
||||
s: &mut String,
|
||||
prop: &str,
|
||||
points: &[u32],
|
||||
truthy: bool,
|
||||
) -> Result<(), fmt::Error> {
|
||||
let truthy = if truthy { "" } else { "!" };
|
||||
for range in ranges_from_set(points) {
|
||||
let start = char::from_u32(range.start).unwrap();
|
||||
let end = char::from_u32(range.end - 1).unwrap();
|
||||
match range.len() {
|
||||
1 => writeln!(s, " assert!({truthy}unicode_data::{prop}::lookup({start:?}));")?,
|
||||
_ => {
|
||||
writeln!(s, " for c in {start:?}..={end:?} {{")?;
|
||||
writeln!(s, " assert!({truthy}unicode_data::{prop}::lookup(c));")?;
|
||||
writeln!(s, " }}")?;
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Group the elements of `set` into contigous ranges
|
||||
|
|
|
|||
|
|
@ -1,7 +1,5 @@
|
|||
//! Runtime support for `unicode_data`.
|
||||
|
||||
#[inline(always)]
|
||||
pub(super) const fn bitset_search<
|
||||
const fn bitset_search<
|
||||
const N: usize,
|
||||
const CHUNK_SIZE: usize,
|
||||
const N1: usize,
|
||||
|
|
@ -48,10 +46,10 @@ pub(super) const fn bitset_search<
|
|||
}
|
||||
|
||||
#[repr(transparent)]
|
||||
pub(super) struct ShortOffsetRunHeader(pub(super) u32);
|
||||
struct ShortOffsetRunHeader(u32);
|
||||
|
||||
impl ShortOffsetRunHeader {
|
||||
pub(super) const fn new(start_index: usize, prefix_sum: u32) -> Self {
|
||||
const fn new(start_index: usize, prefix_sum: u32) -> Self {
|
||||
assert!(start_index < (1 << 11));
|
||||
assert!(prefix_sum < (1 << 21));
|
||||
|
||||
|
|
@ -59,12 +57,12 @@ impl ShortOffsetRunHeader {
|
|||
}
|
||||
|
||||
#[inline]
|
||||
pub(super) const fn start_index(&self) -> usize {
|
||||
const fn start_index(&self) -> usize {
|
||||
(self.0 >> 21) as usize
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub(super) const fn prefix_sum(&self) -> u32 {
|
||||
const fn prefix_sum(&self) -> u32 {
|
||||
self.0 & ((1 << 21) - 1)
|
||||
}
|
||||
}
|
||||
|
|
@ -74,7 +72,7 @@ impl ShortOffsetRunHeader {
|
|||
/// - The last element of `short_offset_runs` must be greater than `std::char::MAX`.
|
||||
/// - The start indices of all elements in `short_offset_runs` must be less than `OFFSETS`.
|
||||
#[inline(always)]
|
||||
pub(super) unsafe fn skip_search<const SOR: usize, const OFFSETS: usize>(
|
||||
unsafe fn skip_search<const SOR: usize, const OFFSETS: usize>(
|
||||
needle: char,
|
||||
short_offset_runs: &[ShortOffsetRunHeader; SOR],
|
||||
offsets: &[u8; OFFSETS],
|
||||
|
|
@ -128,35 +126,3 @@ pub(super) unsafe fn skip_search<const SOR: usize, const OFFSETS: usize>(
|
|||
}
|
||||
offset_idx % 2 == 1
|
||||
}
|
||||
|
||||
/// # Safety
|
||||
/// The second component of each tuple in `table` must either be:
|
||||
/// - A valid `char`
|
||||
/// - A value with the high bit (1 << 22) set, and the lower 22 bits
|
||||
/// being a valid index into `multi`.
|
||||
#[inline(always)]
|
||||
pub(super) unsafe fn case_conversion(
|
||||
c: char,
|
||||
ascii_fn: fn(char) -> char,
|
||||
table: &[(char, u32)],
|
||||
multi: &[[char; 3]],
|
||||
) -> [char; 3] {
|
||||
const INDEX_MASK: u32 = 1 << 22;
|
||||
|
||||
if c.is_ascii() {
|
||||
return [ascii_fn(c), '\0', '\0'];
|
||||
}
|
||||
|
||||
let Ok(i) = table.binary_search_by(|&(key, _)| key.cmp(&c)) else {
|
||||
return [c, '\0', '\0'];
|
||||
};
|
||||
|
||||
let u = table[i].1;
|
||||
match char::from_u32(u) {
|
||||
Option::Some(c) => [c, '\0', '\0'],
|
||||
Option::None => {
|
||||
// SAFETY: Index comes from statically generated table
|
||||
unsafe { *multi.get_unchecked((u & (INDEX_MASK - 1)) as usize) }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1,7 +1,8 @@
|
|||
use std::collections::{BTreeMap, BTreeSet, HashMap};
|
||||
use std::fmt::{self, Write};
|
||||
use std::ops::Range;
|
||||
|
||||
use crate::{Bin, fmt_list, writeln};
|
||||
use crate::fmt_list;
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct RawEmitter {
|
||||
|
|
@ -15,6 +16,13 @@ impl RawEmitter {
|
|||
RawEmitter { file: String::new(), bytes_used: 0, desc: String::new() }
|
||||
}
|
||||
|
||||
fn blank_line(&mut self) {
|
||||
if self.file.is_empty() || self.file.ends_with("\n\n") {
|
||||
return;
|
||||
}
|
||||
writeln!(&mut self.file).unwrap();
|
||||
}
|
||||
|
||||
fn emit_bitset(&mut self, ranges: &[Range<u32>]) -> Result<(), String> {
|
||||
let first_code_point = ranges.first().unwrap().start;
|
||||
let last_code_point = ranges.last().unwrap().end;
|
||||
|
|
@ -60,33 +68,48 @@ impl RawEmitter {
|
|||
}
|
||||
self.emit_chunk_map(word_indices[&0], &compressed_words, best.unwrap().0);
|
||||
|
||||
struct Bits(u64);
|
||||
impl fmt::Debug for Bits {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
write!(f, "0b{:064b}", self.0)
|
||||
}
|
||||
}
|
||||
|
||||
writeln!(
|
||||
&mut self.file,
|
||||
"static BITSET_CANONICAL: [u64; {}] = [{}];",
|
||||
canonicalized.canonical_words.len(),
|
||||
fmt_list(canonicalized.canonical_words.iter().map(|v| Bits(*v))),
|
||||
)
|
||||
.unwrap();
|
||||
self.bytes_used += 8 * canonicalized.canonical_words.len();
|
||||
writeln!(
|
||||
&mut self.file,
|
||||
"static BITSET_MAPPING: [(u8, u8); {}] = [{}];",
|
||||
canonicalized.canonicalized_words.len(),
|
||||
fmt_list(&canonicalized.canonicalized_words),
|
||||
)
|
||||
.unwrap();
|
||||
// 8 bit index into shifted words, 7 bits for shift + optional flip
|
||||
// We only need it for the words that we removed by applying a shift and
|
||||
// flip to them.
|
||||
self.bytes_used += 2 * canonicalized.canonicalized_words.len();
|
||||
|
||||
writeln!(
|
||||
self.file,
|
||||
"static BITSET_CANONICAL: [u64; {canonical_words_len}] = {canonical_words:?};
|
||||
static BITSET_MAPPING: [(u8, u8); {canonicalized_words_len}] = {canonicalized_words:?};
|
||||
self.blank_line();
|
||||
|
||||
pub const fn lookup(c: char) -> bool {{
|
||||
debug_assert!(!c.is_ascii());
|
||||
(c as u32) >= {first_code_point:#04x} &&
|
||||
super::bitset_search(
|
||||
c as u32,
|
||||
&BITSET_CHUNKS_MAP,
|
||||
&BITSET_INDEX_CHUNKS,
|
||||
&BITSET_CANONICAL,
|
||||
&BITSET_MAPPING,
|
||||
)
|
||||
}}",
|
||||
canonical_words = canonicalized.canonical_words,
|
||||
canonical_words_len = canonicalized.canonical_words.len(),
|
||||
canonicalized_words = canonicalized.canonicalized_words,
|
||||
canonicalized_words_len = canonicalized.canonicalized_words.len(),
|
||||
);
|
||||
writeln!(&mut self.file, "pub const fn lookup(c: char) -> bool {{").unwrap();
|
||||
writeln!(&mut self.file, " debug_assert!(!c.is_ascii());").unwrap();
|
||||
if first_code_point > 0x7f {
|
||||
writeln!(&mut self.file, " (c as u32) >= {first_code_point:#04x} &&").unwrap();
|
||||
}
|
||||
writeln!(&mut self.file, " super::bitset_search(").unwrap();
|
||||
writeln!(&mut self.file, " c as u32,").unwrap();
|
||||
writeln!(&mut self.file, " &BITSET_CHUNKS_MAP,").unwrap();
|
||||
writeln!(&mut self.file, " &BITSET_INDEX_CHUNKS,").unwrap();
|
||||
writeln!(&mut self.file, " &BITSET_CANONICAL,").unwrap();
|
||||
writeln!(&mut self.file, " &BITSET_MAPPING,").unwrap();
|
||||
writeln!(&mut self.file, " )").unwrap();
|
||||
writeln!(&mut self.file, "}}").unwrap();
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
|
@ -110,21 +133,29 @@ impl RawEmitter {
|
|||
chunk_indices.push(chunk_map[chunk]);
|
||||
}
|
||||
|
||||
writeln!(
|
||||
&mut self.file,
|
||||
"static BITSET_CHUNKS_MAP: [u8; {}] = [{}];",
|
||||
chunk_indices.len(),
|
||||
fmt_list(&chunk_indices),
|
||||
)
|
||||
.unwrap();
|
||||
self.bytes_used += chunk_indices.len();
|
||||
writeln!(
|
||||
self.file,
|
||||
"static BITSET_CHUNKS_MAP: [u8; {chunk_indices_len}] = {chunk_indices:?};
|
||||
static BITSET_INDEX_CHUNKS: [[u8; {chunk_len}]; {chunks_len}] = [{chunks}];",
|
||||
chunk_indices_len = chunk_indices.len(),
|
||||
chunk_len = chunk_length,
|
||||
chunks_len = chunks.len(),
|
||||
chunks = fmt_list(chunks.iter()),
|
||||
);
|
||||
&mut self.file,
|
||||
"static BITSET_INDEX_CHUNKS: [[u8; {}]; {}] = [{}];",
|
||||
chunk_length,
|
||||
chunks.len(),
|
||||
fmt_list(chunks.iter()),
|
||||
)
|
||||
.unwrap();
|
||||
self.bytes_used += chunk_length * chunks.len();
|
||||
}
|
||||
}
|
||||
|
||||
pub fn emit_codepoints(emitter: &mut RawEmitter, ranges: &[Range<u32>]) {
|
||||
emitter.blank_line();
|
||||
|
||||
let mut bitset = emitter.clone();
|
||||
let bitset_ok = bitset.emit_bitset(ranges).is_ok();
|
||||
|
||||
|
|
@ -141,6 +172,8 @@ pub fn emit_codepoints(emitter: &mut RawEmitter, ranges: &[Range<u32>]) {
|
|||
}
|
||||
|
||||
pub fn emit_whitespace(emitter: &mut RawEmitter, ranges: &[Range<u32>]) {
|
||||
emitter.blank_line();
|
||||
|
||||
let mut cascading = emitter.clone();
|
||||
cascading.emit_cascading_map(ranges);
|
||||
*emitter = cascading;
|
||||
|
|
@ -148,7 +181,7 @@ pub fn emit_whitespace(emitter: &mut RawEmitter, ranges: &[Range<u32>]) {
|
|||
}
|
||||
|
||||
struct Canonicalized {
|
||||
canonical_words: Vec<Bin<u64>>,
|
||||
canonical_words: Vec<u64>,
|
||||
canonicalized_words: Vec<(u8, u8)>,
|
||||
|
||||
/// Maps an input unique word to the associated index (u8) which is into
|
||||
|
|
@ -361,7 +394,6 @@ impl Canonicalized {
|
|||
)
|
||||
})
|
||||
.collect::<Vec<(u8, u8)>>();
|
||||
let canonical_words = canonical_words.into_iter().map(Bin).collect::<Vec<_>>();
|
||||
Canonicalized { unique_mapping, canonical_words, canonicalized_words }
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,8 +1,8 @@
|
|||
use std::fmt::{self};
|
||||
use std::fmt::{self, Write as _};
|
||||
use std::ops::Range;
|
||||
|
||||
use crate::fmt_list;
|
||||
use crate::raw_emitter::RawEmitter;
|
||||
use crate::writeln;
|
||||
|
||||
/// This will get packed into a single u32 before inserting into the data set.
|
||||
#[derive(PartialEq)]
|
||||
|
|
@ -68,45 +68,79 @@ impl RawEmitter {
|
|||
assert!(inserted);
|
||||
}
|
||||
|
||||
writeln!(&mut self.file, "use super::ShortOffsetRunHeader;\n").unwrap();
|
||||
writeln!(
|
||||
&mut self.file,
|
||||
"static SHORT_OFFSET_RUNS: [ShortOffsetRunHeader; {}] = [{}];",
|
||||
short_offset_runs.len(),
|
||||
fmt_list(short_offset_runs.iter())
|
||||
)
|
||||
.unwrap();
|
||||
self.bytes_used += 4 * short_offset_runs.len();
|
||||
writeln!(
|
||||
&mut self.file,
|
||||
"static OFFSETS: [u8; {}] = [{}];",
|
||||
coded_offsets.len(),
|
||||
fmt_list(&coded_offsets)
|
||||
)
|
||||
.unwrap();
|
||||
self.bytes_used += coded_offsets.len();
|
||||
|
||||
// The inlining in this code works like the following:
|
||||
//
|
||||
// The `skip_search` function is always inlined into the parent `lookup_slow` fn,
|
||||
// The `skip_search` function is always inlined into the parent `lookup` fn,
|
||||
// thus the compiler can generate optimal code based on the referenced `static`s.
|
||||
//
|
||||
// The lower-bounds check is inlined into the caller, and slower-path
|
||||
// `skip_search` is outlined into a separate `lookup_slow` fn.
|
||||
assert!(first_code_point > 0x7f);
|
||||
writeln!(self.file,
|
||||
"use super::ShortOffsetRunHeader;
|
||||
|
||||
static SHORT_OFFSET_RUNS: [ShortOffsetRunHeader; {short_offset_runs_len}] = {short_offset_runs:?};
|
||||
static OFFSETS: [u8; {coded_offset_len}] = {coded_offsets:?};
|
||||
|
||||
#[inline]
|
||||
pub fn lookup(c: char) -> bool {{
|
||||
debug_assert!(!c.is_ascii());
|
||||
(c as u32) >= {first_code_point:#04x} && lookup_slow(c)
|
||||
}}
|
||||
|
||||
#[inline(never)]
|
||||
fn lookup_slow(c: char) -> bool {{
|
||||
const {{
|
||||
assert!(SHORT_OFFSET_RUNS.last().unwrap().0 > char::MAX as u32);
|
||||
let mut i = 0;
|
||||
while i < SHORT_OFFSET_RUNS.len() {{
|
||||
assert!(SHORT_OFFSET_RUNS[i].start_index() < OFFSETS.len());
|
||||
i += 1;
|
||||
}}
|
||||
}}
|
||||
// SAFETY: We just ensured the last element of `SHORT_OFFSET_RUNS` is greater than `std::char::MAX`
|
||||
// and the start indices of all elements in `SHORT_OFFSET_RUNS` are smaller than `OFFSETS.len()`.
|
||||
unsafe {{ super::skip_search(c, &SHORT_OFFSET_RUNS, &OFFSETS) }}
|
||||
}}",
|
||||
short_offset_runs_len = short_offset_runs.len(),
|
||||
coded_offset_len = coded_offsets.len(),
|
||||
);
|
||||
// In the case of ASCII optimization, the lower-bounds check is inlined into
|
||||
// the caller, and slower-path `skip_search` is outlined into a separate `lookup_slow` fn.
|
||||
//
|
||||
// Thus, in both cases, the `skip_search` function is specialized for the `static`s,
|
||||
// and outlined into the prebuilt `std`.
|
||||
if first_code_point > 0x7f {
|
||||
writeln!(&mut self.file, "#[inline]").unwrap();
|
||||
writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap();
|
||||
writeln!(&mut self.file, " debug_assert!(!c.is_ascii());").unwrap();
|
||||
writeln!(&mut self.file, " (c as u32) >= {first_code_point:#04x} && lookup_slow(c)")
|
||||
.unwrap();
|
||||
writeln!(&mut self.file, "}}").unwrap();
|
||||
writeln!(&mut self.file).unwrap();
|
||||
writeln!(&mut self.file, "#[inline(never)]").unwrap();
|
||||
writeln!(&mut self.file, "fn lookup_slow(c: char) -> bool {{").unwrap();
|
||||
} else {
|
||||
writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap();
|
||||
writeln!(&mut self.file, " debug_assert!(!c.is_ascii());").unwrap();
|
||||
}
|
||||
writeln!(&mut self.file, " const {{").unwrap();
|
||||
writeln!(
|
||||
&mut self.file,
|
||||
" assert!(SHORT_OFFSET_RUNS.last().unwrap().0 > char::MAX as u32);",
|
||||
)
|
||||
.unwrap();
|
||||
writeln!(&mut self.file, " let mut i = 0;").unwrap();
|
||||
writeln!(&mut self.file, " while i < SHORT_OFFSET_RUNS.len() {{").unwrap();
|
||||
writeln!(
|
||||
&mut self.file,
|
||||
" assert!(SHORT_OFFSET_RUNS[i].start_index() < OFFSETS.len());",
|
||||
)
|
||||
.unwrap();
|
||||
writeln!(&mut self.file, " i += 1;").unwrap();
|
||||
writeln!(&mut self.file, " }}").unwrap();
|
||||
writeln!(&mut self.file, " }}").unwrap();
|
||||
writeln!(
|
||||
&mut self.file,
|
||||
" // SAFETY: We just ensured the last element of `SHORT_OFFSET_RUNS` is greater than `std::char::MAX`",
|
||||
)
|
||||
.unwrap();
|
||||
writeln!(
|
||||
&mut self.file,
|
||||
" // and the start indices of all elements in `SHORT_OFFSET_RUNS` are smaller than `OFFSETS.len()`.",
|
||||
)
|
||||
.unwrap();
|
||||
writeln!(
|
||||
&mut self.file,
|
||||
" unsafe {{ super::skip_search(c, &SHORT_OFFSET_RUNS, &OFFSETS) }}"
|
||||
)
|
||||
.unwrap();
|
||||
writeln!(&mut self.file, "}}").unwrap();
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue