Auto merge of #146232 - bjorn3:lto_allocator_shim, r=lqd

Make the allocator shim participate in LTO again

The allocator shim no longer participating in LTO is likely the cause of the perf regression in https://github.com/rust-lang/rust/pull/145955. It also caused some functional regressions.

Fixes https://github.com/rust-lang/rust/issues/146235
Fixes https://github.com/rust-lang/rust/issues/146239
This commit is contained in:
bors 2025-09-06 15:21:16 +00:00
commit bea625f327
7 changed files with 79 additions and 29 deletions

View file

@ -12,7 +12,9 @@ use cranelift_object::{ObjectBuilder, ObjectModule};
use rustc_codegen_ssa::assert_module_sources::CguReuse;
use rustc_codegen_ssa::back::link::ensure_removed;
use rustc_codegen_ssa::base::determine_cgu_reuse;
use rustc_codegen_ssa::{CodegenResults, CompiledModule, CrateInfo, errors as ssa_errors};
use rustc_codegen_ssa::{
CodegenResults, CompiledModule, CrateInfo, ModuleKind, errors as ssa_errors,
};
use rustc_data_structures::profiling::SelfProfilerRef;
use rustc_data_structures::stable_hasher::{HashStable, StableHasher};
use rustc_data_structures::sync::{IntoDynSyncSend, par_map};
@ -361,6 +363,7 @@ fn emit_cgu(
invocation_temp,
prof,
product.object,
ModuleKind::Regular,
name.clone(),
producer,
)?;
@ -369,6 +372,7 @@ fn emit_cgu(
module_regular,
module_global_asm: global_asm_object_file.map(|global_asm_object_file| CompiledModule {
name: format!("{name}.asm"),
kind: ModuleKind::Regular,
object: Some(global_asm_object_file),
dwarf_object: None,
bytecode: None,
@ -385,6 +389,7 @@ fn emit_module(
invocation_temp: Option<&str>,
prof: &SelfProfilerRef,
mut object: cranelift_object::object::write::Object<'_>,
kind: ModuleKind,
name: String,
producer_str: &str,
) -> Result<CompiledModule, String> {
@ -425,6 +430,7 @@ fn emit_module(
Ok(CompiledModule {
name,
kind,
object: Some(tmp_file),
dwarf_object: None,
bytecode: None,
@ -479,6 +485,7 @@ fn reuse_workproduct_for_cgu(
Ok(ModuleCodegenResult {
module_regular: CompiledModule {
name: cgu.name().to_string(),
kind: ModuleKind::Regular,
object: Some(obj_out_regular),
dwarf_object: None,
bytecode: None,
@ -488,6 +495,7 @@ fn reuse_workproduct_for_cgu(
},
module_global_asm: source_file_global_asm.map(|source_file| CompiledModule {
name: cgu.name().to_string(),
kind: ModuleKind::Regular,
object: Some(obj_out_global_asm),
dwarf_object: None,
bytecode: None,
@ -643,6 +651,7 @@ fn emit_allocator_module(tcx: TyCtxt<'_>) -> Option<CompiledModule> {
tcx.sess.invocation_temp.as_deref(),
&tcx.sess.prof,
product.object,
ModuleKind::Allocator,
"allocator_shim".to_owned(),
&crate::debuginfo::producer(tcx.sess),
) {

View file

@ -11,7 +11,7 @@ use object::{Object, ObjectSection};
use rustc_codegen_ssa::back::lto::{SerializedModule, ThinModule, ThinShared};
use rustc_codegen_ssa::back::write::{CodegenContext, FatLtoInput};
use rustc_codegen_ssa::traits::*;
use rustc_codegen_ssa::{ModuleCodegen, looks_like_rust_object_file};
use rustc_codegen_ssa::{ModuleCodegen, ModuleKind, looks_like_rust_object_file};
use rustc_data_structures::fx::FxHashMap;
use rustc_data_structures::memmap::Mmap;
use rustc_errors::DiagCtxtHandle;
@ -225,9 +225,15 @@ fn fat_lto(
// All the other modules will be serialized and reparsed into the new
// context, so this hopefully avoids serializing and parsing the largest
// codegen unit.
//
// Additionally use a regular module as the base here to ensure that various
// file copy operations in the backend work correctly. The only other kind
// of module here should be an allocator one, and if your crate is smaller
// than the allocator module then the size doesn't really matter anyway.
let costliest_module = in_memory
.iter()
.enumerate()
.filter(|&(_, module)| module.kind == ModuleKind::Regular)
.map(|(i, module)| {
let cost = unsafe { llvm::LLVMRustModuleCost(module.module_llvm.llmod()) };
(cost, i)

View file

@ -1,7 +1,6 @@
use std::ffi::CString;
use std::sync::Arc;
use rustc_ast::expand::allocator::AllocatorKind;
use rustc_data_structures::memmap::Mmap;
use rustc_hir::def_id::{CrateNum, LOCAL_CRATE};
use rustc_middle::middle::exported_symbols::{ExportedSymbol, SymbolExportInfo, SymbolExportLevel};
@ -96,19 +95,6 @@ pub(super) fn exported_symbols_for_lto(
.filter_map(|&(s, info): &(ExportedSymbol<'_>, SymbolExportInfo)| {
if info.level.is_below_threshold(export_threshold) || info.used {
Some(symbol_name_for_instance_in_crate(tcx, s, cnum))
} else if export_threshold == SymbolExportLevel::C
&& info.rustc_std_internal_symbol
&& let Some(AllocatorKind::Default) = allocator_kind_for_codegen(tcx)
{
// Export the __rdl_* exports for usage by the allocator shim when not using
// #[global_allocator]. Most of the conditions above are only used to avoid
// unnecessary expensive symbol_name_for_instance_in_crate calls.
let sym = symbol_name_for_instance_in_crate(tcx, s, cnum);
if sym.contains("__rdl_") || sym.contains("__rg_oom") {
Some(sym)
} else {
None
}
} else {
None
}

View file

@ -334,6 +334,7 @@ pub struct CodegenContext<B: WriteBackendMethods> {
pub output_filenames: Arc<OutputFilenames>,
pub invocation_temp: Option<String>,
pub module_config: Arc<ModuleConfig>,
pub allocator_config: Arc<ModuleConfig>,
pub tm_factory: TargetMachineFactoryFn<B>,
pub msvc_imps_needed: bool,
pub is_pe_coff: bool,
@ -489,7 +490,7 @@ fn copy_all_cgu_workproducts_to_incr_comp_cache_dir(
let _timer = sess.timer("copy_all_cgu_workproducts_to_incr_comp_cache_dir");
for module in &compiled_modules.modules {
for module in compiled_modules.modules.iter().filter(|m| m.kind == ModuleKind::Regular) {
let mut files = Vec::new();
if let Some(object_file_path) = &module.object {
files.push((OutputType::Object.extension(), object_file_path.as_path()));
@ -794,12 +795,19 @@ pub(crate) fn compute_per_cgu_lto_type(
sess_lto: &Lto,
opts: &config::Options,
sess_crate_types: &[CrateType],
module_kind: ModuleKind,
) -> ComputedLtoType {
// If the linker does LTO, we don't have to do it. Note that we
// keep doing full LTO, if it is requested, as not to break the
// assumption that the output will be a single module.
let linker_does_lto = opts.cg.linker_plugin_lto.enabled();
// When we're automatically doing ThinLTO for multi-codegen-unit
// builds we don't actually want to LTO the allocator module if
// it shows up. This is due to various linker shenanigans that
// we'll encounter later.
let is_allocator = module_kind == ModuleKind::Allocator;
// We ignore a request for full crate graph LTO if the crate type
// is only an rlib, as there is no full crate graph to process,
// that'll happen later.
@ -811,7 +819,7 @@ pub(crate) fn compute_per_cgu_lto_type(
let is_rlib = matches!(sess_crate_types, [CrateType::Rlib]);
match sess_lto {
Lto::ThinLocal if !linker_does_lto => ComputedLtoType::Thin,
Lto::ThinLocal if !linker_does_lto && !is_allocator => ComputedLtoType::Thin,
Lto::Thin if !linker_does_lto && !is_rlib => ComputedLtoType::Thin,
Lto::Fat if !is_rlib => ComputedLtoType::Fat,
_ => ComputedLtoType::No,
@ -825,18 +833,23 @@ fn execute_optimize_work_item<B: ExtraBackendMethods>(
let dcx = cgcx.create_dcx();
let dcx = dcx.handle();
B::optimize(cgcx, dcx, &mut module, &cgcx.module_config);
let module_config = match module.kind {
ModuleKind::Regular => &cgcx.module_config,
ModuleKind::Allocator => &cgcx.allocator_config,
};
B::optimize(cgcx, dcx, &mut module, module_config);
// After we've done the initial round of optimizations we need to
// decide whether to synchronously codegen this module or ship it
// back to the coordinator thread for further LTO processing (which
// has to wait for all the initial modules to be optimized).
let lto_type = compute_per_cgu_lto_type(&cgcx.lto, &cgcx.opts, &cgcx.crate_types);
let lto_type = compute_per_cgu_lto_type(&cgcx.lto, &cgcx.opts, &cgcx.crate_types, module.kind);
// If we're doing some form of incremental LTO then we need to be sure to
// save our module to disk first.
let bitcode = if cgcx.module_config.emit_pre_lto_bc {
let bitcode = if module_config.emit_pre_lto_bc {
let filename = pre_lto_bitcode_filename(&module.name);
cgcx.incr_comp_session_dir.as_ref().map(|path| path.join(&filename))
} else {
@ -845,7 +858,7 @@ fn execute_optimize_work_item<B: ExtraBackendMethods>(
match lto_type {
ComputedLtoType::No => {
let module = B::codegen(cgcx, module, &cgcx.module_config);
let module = B::codegen(cgcx, module, module_config);
WorkItemResult::Finished(module)
}
ComputedLtoType::Thin => {
@ -947,6 +960,7 @@ fn execute_copy_from_cache_work_item<B: ExtraBackendMethods>(
WorkItemResult::Finished(CompiledModule {
links_from_incr_cache,
kind: ModuleKind::Regular,
name: module.name,
object,
dwarf_object,
@ -1133,6 +1147,7 @@ fn start_executing_work<B: ExtraBackendMethods>(
diag_emitter: shared_emitter.clone(),
output_filenames: Arc::clone(tcx.output_filenames(())),
module_config: regular_config,
allocator_config,
tm_factory: backend.target_machine_factory(tcx.sess, ol, backend_features),
msvc_imps_needed: msvc_imps_needed(tcx),
is_pe_coff: tcx.sess.target.is_like_windows,
@ -1147,11 +1162,6 @@ fn start_executing_work<B: ExtraBackendMethods>(
invocation_temp: sess.invocation_temp.clone(),
};
let compiled_allocator_module = allocator_module.map(|mut allocator_module| {
B::optimize(&cgcx, tcx.sess.dcx(), &mut allocator_module, &allocator_config);
B::codegen(&cgcx, allocator_module, &allocator_config)
});
// This is the "main loop" of parallel work happening for parallel codegen.
// It's here that we manage parallelism, schedule work, and work with
// messages coming from clients.
@ -1331,6 +1341,17 @@ fn start_executing_work<B: ExtraBackendMethods>(
let mut llvm_start_time: Option<VerboseTimingGuard<'_>> = None;
let compiled_allocator_module = allocator_module.and_then(|allocator_module| {
match execute_optimize_work_item(&cgcx, allocator_module) {
WorkItemResult::Finished(compiled_module) => return Some(compiled_module),
WorkItemResult::NeedsFatLto(fat_lto_input) => needs_fat_lto.push(fat_lto_input),
WorkItemResult::NeedsThinLto(name, thin_buffer) => {
needs_thin_lto.push((name, thin_buffer))
}
}
None
});
// Run the message loop while there's still anything that needs message
// processing. Note that as soon as codegen is aborted we simply want to
// wait for all existing work to finish, so many of the conditions here

View file

@ -46,7 +46,9 @@ use crate::meth::load_vtable;
use crate::mir::operand::OperandValue;
use crate::mir::place::PlaceRef;
use crate::traits::*;
use crate::{CachedModuleCodegen, CodegenLintLevels, CrateInfo, ModuleCodegen, errors, meth, mir};
use crate::{
CachedModuleCodegen, CodegenLintLevels, CrateInfo, ModuleCodegen, ModuleKind, errors, meth, mir,
};
pub(crate) fn bin_op_to_icmp_predicate(op: BinOp, signed: bool) -> IntPredicate {
match (op, signed) {
@ -1124,7 +1126,12 @@ pub fn determine_cgu_reuse<'tcx>(tcx: TyCtxt<'tcx>, cgu: &CodegenUnit<'tcx>) ->
// We can re-use either the pre- or the post-thinlto state. If no LTO is
// being performed then we can use post-LTO artifacts, otherwise we must
// reuse pre-LTO artifacts
match compute_per_cgu_lto_type(&tcx.sess.lto(), &tcx.sess.opts, tcx.crate_types()) {
match compute_per_cgu_lto_type(
&tcx.sess.lto(),
&tcx.sess.opts,
tcx.crate_types(),
ModuleKind::Regular,
) {
ComputedLtoType::No => CguReuse::PostLto,
_ => CguReuse::PreLto,
}

View file

@ -120,6 +120,7 @@ impl<M> ModuleCodegen<M> {
CompiledModule {
name: self.name,
kind: self.kind,
object,
dwarf_object,
bytecode,
@ -133,6 +134,7 @@ impl<M> ModuleCodegen<M> {
#[derive(Debug, Encodable, Decodable)]
pub struct CompiledModule {
pub name: String,
pub kind: ModuleKind,
pub object: Option<PathBuf>,
pub dwarf_object: Option<PathBuf>,
pub bytecode: Option<PathBuf>,

View file

@ -0,0 +1,19 @@
//@ compile-flags: --crate-type cdylib -C lto
//@ build-pass
//@ no-prefer-dynamic
//@ needs-crate-type: cdylib
// Regression test added with this commit: a crate defining a
// #[global_allocator] and built as a cdylib with full LTO must compile and
// link now that the allocator shim participates in LTO again (per the commit
// message, covers rust-lang/rust#146235 / #146239). build-pass: the whole
// test is that this builds; nothing here is ever executed.
use std::alloc::{GlobalAlloc, Layout};
// Minimal custom allocator. Its runtime behavior is irrelevant to this
// build-only test, so alloc may simply panic via todo!() and dealloc is a
// no-op.
struct MyAllocator;
unsafe impl GlobalAlloc for MyAllocator {
unsafe fn alloc(&self, _layout: Layout) -> *mut u8 {
todo!()
}
unsafe fn dealloc(&self, _ptr: *mut u8, _layout: Layout) {}
}
// Registering the allocator forces rustc to route the crate's global
// allocation through it, exercising the allocator-shim + LTO linking path
// this commit fixes.
#[global_allocator]
static GLOBAL: MyAllocator = MyAllocator;