Rollup merge of #143388 - bjorn3:lto_refactors, r=compiler-errors

Various refactors to the LTO handling code In particular reducing the sharing of code paths between fat and thin-LTO and making the fat LTO implementation more self-contained. This also moves some autodiff handling out of cg_ssa into cg_llvm given that Enzyme only works with LLVM anyway and an implementation for another backend may do things entirely differently. This will also make it a bit easier to split LTO handling out of the coordinator thread main loop into a separate loop, which should reduce the complexity of the coordinator thread.
2025-07-17 03:58:28 +02:00 · 2025-07-17 03:58:28 +02:00 · be5f8f299d
commit be5f8f299d
parent 9ac88eabed 21026cae8d
13 changed files with 154 additions and 240 deletions
--- a/compiler/rustc_codegen_gcc/src/back/lto.rs
+++ b/compiler/rustc_codegen_gcc/src/back/lto.rs
@ -24,7 +24,7 @@ use std::sync::Arc;

 use gccjit::{Context, OutputKind};
 use object::read::archive::ArchiveFile;
-use rustc_codegen_ssa::back::lto::{LtoModuleCodegen, SerializedModule, ThinModule, ThinShared};
+use rustc_codegen_ssa::back::lto::{SerializedModule, ThinModule, ThinShared};
 use rustc_codegen_ssa::back::symbol_export;
 use rustc_codegen_ssa::back::write::{CodegenContext, FatLtoInput};
 use rustc_codegen_ssa::traits::*;
@ -176,7 +176,7 @@ pub(crate) fn run_fat(
    cgcx: &CodegenContext<GccCodegenBackend>,
    modules: Vec<FatLtoInput<GccCodegenBackend>>,
    cached_modules: Vec<(SerializedModule<ModuleBuffer>, WorkProduct)>,
-) -> Result<LtoModuleCodegen<GccCodegenBackend>, FatalError> {
+) -> Result<ModuleCodegen<GccContext>, FatalError> {
    let dcx = cgcx.create_dcx();
    let dcx = dcx.handle();
    let lto_data = prepare_lto(cgcx, dcx)?;
@ -201,7 +201,7 @@ fn fat_lto(
    mut serialized_modules: Vec<(SerializedModule<ModuleBuffer>, CString)>,
    tmp_path: TempDir,
    //symbols_below_threshold: &[String],
-) -> Result<LtoModuleCodegen<GccCodegenBackend>, FatalError> {
+) -> Result<ModuleCodegen<GccContext>, FatalError> {
    let _timer = cgcx.prof.generic_activity("GCC_fat_lto_build_monolithic_module");
    info!("going for a fat lto");

@ -334,7 +334,7 @@ fn fat_lto(
    // of now.
    module.module_llvm.temp_dir = Some(tmp_path);

-    Ok(LtoModuleCodegen::Fat(module))
+    Ok(module)
 }

 pub struct ModuleBuffer(PathBuf);
@ -358,7 +358,7 @@ pub(crate) fn run_thin(
    cgcx: &CodegenContext<GccCodegenBackend>,
    modules: Vec<(String, ThinBuffer)>,
    cached_modules: Vec<(SerializedModule<ModuleBuffer>, WorkProduct)>,
-) -> Result<(Vec<LtoModuleCodegen<GccCodegenBackend>>, Vec<WorkProduct>), FatalError> {
+) -> Result<(Vec<ThinModule<GccCodegenBackend>>, Vec<WorkProduct>), FatalError> {
    let dcx = cgcx.create_dcx();
    let dcx = dcx.handle();
    let lto_data = prepare_lto(cgcx, dcx)?;
@ -427,7 +427,7 @@ fn thin_lto(
    tmp_path: TempDir,
    cached_modules: Vec<(SerializedModule<ModuleBuffer>, WorkProduct)>,
    //_symbols_below_threshold: &[String],
-) -> Result<(Vec<LtoModuleCodegen<GccCodegenBackend>>, Vec<WorkProduct>), FatalError> {
+) -> Result<(Vec<ThinModule<GccCodegenBackend>>, Vec<WorkProduct>), FatalError> {
    let _timer = cgcx.prof.generic_activity("LLVM_thin_lto_global_analysis");
    info!("going for that thin, thin LTO");

@ -573,8 +573,7 @@ fn thin_lto(
        }*/

        info!(" - {}: re-compiled", module_name);
-        opt_jobs
-            .push(LtoModuleCodegen::Thin(ThinModule { shared: shared.clone(), idx: module_index }));
+        opt_jobs.push(ThinModule { shared: shared.clone(), idx: module_index });
    }

    // Save the current ThinLTO import information for the next compilation
--- a/compiler/rustc_codegen_gcc/src/back/write.rs
+++ b/compiler/rustc_codegen_gcc/src/back/write.rs
@ -16,10 +16,12 @@ use crate::{GccCodegenBackend, GccContext};

 pub(crate) fn codegen(
    cgcx: &CodegenContext<GccCodegenBackend>,
-    dcx: DiagCtxtHandle<'_>,
    module: ModuleCodegen<GccContext>,
    config: &ModuleConfig,
 ) -> Result<CompiledModule, FatalError> {
+    let dcx = cgcx.create_dcx();
+    let dcx = dcx.handle();
+
    let _timer = cgcx.prof.generic_activity_with_arg("GCC_module_codegen", &*module.name);
    {
        let context = &module.module_llvm.context;
--- a/compiler/rustc_codegen_gcc/src/lib.rs
+++ b/compiler/rustc_codegen_gcc/src/lib.rs
@ -93,7 +93,7 @@ use gccjit::{CType, Context, OptimizationLevel};
 use gccjit::{TargetInfo, Version};
 use rustc_ast::expand::allocator::AllocatorKind;
 use rustc_ast::expand::autodiff_attrs::AutoDiffItem;
-use rustc_codegen_ssa::back::lto::{LtoModuleCodegen, SerializedModule, ThinModule};
+use rustc_codegen_ssa::back::lto::{SerializedModule, ThinModule};
 use rustc_codegen_ssa::back::write::{
    CodegenContext, FatLtoInput, ModuleConfig, TargetMachineFactoryFn,
 };
@ -353,11 +353,16 @@ impl WriteBackendMethods for GccCodegenBackend {
    type ThinData = ThinData;
    type ThinBuffer = ThinBuffer;

-    fn run_fat_lto(
+    fn run_and_optimize_fat_lto(
        cgcx: &CodegenContext<Self>,
        modules: Vec<FatLtoInput<Self>>,
        cached_modules: Vec<(SerializedModule<Self::ModuleBuffer>, WorkProduct)>,
-    ) -> Result<LtoModuleCodegen<Self>, FatalError> {
+        diff_fncs: Vec<AutoDiffItem>,
+    ) -> Result<ModuleCodegen<Self::Module>, FatalError> {
+        if !diff_fncs.is_empty() {
+            unimplemented!();
+        }
+
        back::lto::run_fat(cgcx, modules, cached_modules)
    }

@ -365,7 +370,7 @@ impl WriteBackendMethods for GccCodegenBackend {
        cgcx: &CodegenContext<Self>,
        modules: Vec<(String, Self::ThinBuffer)>,
        cached_modules: Vec<(SerializedModule<Self::ModuleBuffer>, WorkProduct)>,
-    ) -> Result<(Vec<LtoModuleCodegen<Self>>, Vec<WorkProduct>), FatalError> {
+    ) -> Result<(Vec<ThinModule<Self>>, Vec<WorkProduct>), FatalError> {
        back::lto::run_thin(cgcx, modules, cached_modules)
    }

@ -387,14 +392,6 @@ impl WriteBackendMethods for GccCodegenBackend {
        Ok(())
    }

-    fn optimize_fat(
-        _cgcx: &CodegenContext<Self>,
-        _module: &mut ModuleCodegen<Self::Module>,
-    ) -> Result<(), FatalError> {
-        // TODO(antoyo)
-        Ok(())
-    }
-
    fn optimize_thin(
        cgcx: &CodegenContext<Self>,
        thin: ThinModule<Self>,
@ -404,11 +401,10 @@ impl WriteBackendMethods for GccCodegenBackend {

    fn codegen(
        cgcx: &CodegenContext<Self>,
-        dcx: DiagCtxtHandle<'_>,
        module: ModuleCodegen<Self::Module>,
        config: &ModuleConfig,
    ) -> Result<CompiledModule, FatalError> {
-        back::write::codegen(cgcx, dcx, module, config)
+        back::write::codegen(cgcx, module, config)
    }

    fn prepare_thin(
@ -429,15 +425,6 @@ impl WriteBackendMethods for GccCodegenBackend {
    ) -> Result<ModuleCodegen<Self::Module>, FatalError> {
        back::write::link(cgcx, dcx, modules)
    }
-
-    fn autodiff(
-        _cgcx: &CodegenContext<Self>,
-        _module: &ModuleCodegen<Self::Module>,
-        _diff_functions: Vec<AutoDiffItem>,
-        _config: &ModuleConfig,
-    ) -> Result<(), FatalError> {
-        unimplemented!()
-    }
 }

 /// This is the entrypoint for a hot plugged rustc_codegen_gccjit
--- a/compiler/rustc_codegen_llvm/messages.ftl
+++ b/compiler/rustc_codegen_llvm/messages.ftl
@ -1,5 +1,4 @@
 codegen_llvm_autodiff_without_enable = using the autodiff feature requires -Z autodiff=Enable
-codegen_llvm_autodiff_without_lto = using the autodiff feature requires using fat-lto

 codegen_llvm_copy_bitcode = failed to copy bitcode to object file: {$err}

--- a/compiler/rustc_codegen_llvm/src/back/lto.rs
+++ b/compiler/rustc_codegen_llvm/src/back/lto.rs
@ -7,7 +7,7 @@ use std::sync::Arc;
 use std::{io, iter, slice};

 use object::read::archive::ArchiveFile;
-use rustc_codegen_ssa::back::lto::{LtoModuleCodegen, SerializedModule, ThinModule, ThinShared};
+use rustc_codegen_ssa::back::lto::{SerializedModule, ThinModule, ThinShared};
 use rustc_codegen_ssa::back::symbol_export;
 use rustc_codegen_ssa::back::write::{CodegenContext, FatLtoInput};
 use rustc_codegen_ssa::traits::*;
@ -201,7 +201,7 @@ pub(crate) fn run_fat(
    cgcx: &CodegenContext<LlvmCodegenBackend>,
    modules: Vec<FatLtoInput<LlvmCodegenBackend>>,
    cached_modules: Vec<(SerializedModule<ModuleBuffer>, WorkProduct)>,
-) -> Result<LtoModuleCodegen<LlvmCodegenBackend>, FatalError> {
+) -> Result<ModuleCodegen<ModuleLlvm>, FatalError> {
    let dcx = cgcx.create_dcx();
    let dcx = dcx.handle();
    let (symbols_below_threshold, upstream_modules) = prepare_lto(cgcx, dcx)?;
@ -217,7 +217,7 @@ pub(crate) fn run_thin(
    cgcx: &CodegenContext<LlvmCodegenBackend>,
    modules: Vec<(String, ThinBuffer)>,
    cached_modules: Vec<(SerializedModule<ModuleBuffer>, WorkProduct)>,
-) -> Result<(Vec<LtoModuleCodegen<LlvmCodegenBackend>>, Vec<WorkProduct>), FatalError> {
+) -> Result<(Vec<ThinModule<LlvmCodegenBackend>>, Vec<WorkProduct>), FatalError> {
    let dcx = cgcx.create_dcx();
    let dcx = dcx.handle();
    let (symbols_below_threshold, upstream_modules) = prepare_lto(cgcx, dcx)?;
@ -248,7 +248,7 @@ fn fat_lto(
    cached_modules: Vec<(SerializedModule<ModuleBuffer>, WorkProduct)>,
    mut serialized_modules: Vec<(SerializedModule<ModuleBuffer>, CString)>,
    symbols_below_threshold: &[*const libc::c_char],
-) -> Result<LtoModuleCodegen<LlvmCodegenBackend>, FatalError> {
+) -> Result<ModuleCodegen<ModuleLlvm>, FatalError> {
    let _timer = cgcx.prof.generic_activity("LLVM_fat_lto_build_monolithic_module");
    info!("going for a fat lto");

@ -366,7 +366,7 @@ fn fat_lto(
        save_temp_bitcode(cgcx, &module, "lto.after-restriction");
    }

-    Ok(LtoModuleCodegen::Fat(module))
+    Ok(module)
 }

 pub(crate) struct Linker<'a>(&'a mut llvm::Linker<'a>);
@ -436,7 +436,7 @@ fn thin_lto(
    serialized_modules: Vec<(SerializedModule<ModuleBuffer>, CString)>,
    cached_modules: Vec<(SerializedModule<ModuleBuffer>, WorkProduct)>,
    symbols_below_threshold: &[*const libc::c_char],
-) -> Result<(Vec<LtoModuleCodegen<LlvmCodegenBackend>>, Vec<WorkProduct>), FatalError> {
+) -> Result<(Vec<ThinModule<LlvmCodegenBackend>>, Vec<WorkProduct>), FatalError> {
    let _timer = cgcx.prof.generic_activity("LLVM_thin_lto_global_analysis");
    unsafe {
        info!("going for that thin, thin LTO");
@ -568,10 +568,7 @@ fn thin_lto(
            }

            info!(" - {}: re-compiled", module_name);
-            opt_jobs.push(LtoModuleCodegen::Thin(ThinModule {
-                shared: Arc::clone(&shared),
-                idx: module_index,
-            }));
+            opt_jobs.push(ThinModule { shared: Arc::clone(&shared), idx: module_index });
        }

        // Save the current ThinLTO import information for the next compilation
--- a/compiler/rustc_codegen_llvm/src/back/write.rs
+++ b/compiler/rustc_codegen_llvm/src/back/write.rs
@ -817,10 +817,12 @@ pub(crate) fn link(

 pub(crate) fn codegen(
    cgcx: &CodegenContext<LlvmCodegenBackend>,
-    dcx: DiagCtxtHandle<'_>,
    module: ModuleCodegen<ModuleLlvm>,
    config: &ModuleConfig,
 ) -> Result<CompiledModule, FatalError> {
+    let dcx = cgcx.create_dcx();
+    let dcx = dcx.handle();
+
    let _timer = cgcx.prof.generic_activity_with_arg("LLVM_module_codegen", &*module.name);
    {
        let llmod = module.module_llvm.llmod();
--- a/compiler/rustc_codegen_llvm/src/builder/autodiff.rs
+++ b/compiler/rustc_codegen_llvm/src/builder/autodiff.rs
@ -2,7 +2,6 @@ use std::ptr;

 use rustc_ast::expand::autodiff_attrs::{AutoDiffAttrs, AutoDiffItem, DiffActivity, DiffMode};
 use rustc_codegen_ssa::ModuleCodegen;
-use rustc_codegen_ssa::back::write::ModuleConfig;
 use rustc_codegen_ssa::common::TypeKind;
 use rustc_codegen_ssa::traits::BaseTypeCodegenMethods;
 use rustc_errors::FatalError;
@ -461,7 +460,6 @@ pub(crate) fn differentiate<'ll>(
    module: &'ll ModuleCodegen<ModuleLlvm>,
    cgcx: &CodegenContext<LlvmCodegenBackend>,
    diff_items: Vec<AutoDiffItem>,
-    _config: &ModuleConfig,
 ) -> Result<(), FatalError> {
    for item in &diff_items {
        trace!("{}", item);
--- a/compiler/rustc_codegen_llvm/src/errors.rs
+++ b/compiler/rustc_codegen_llvm/src/errors.rs
@ -37,10 +37,6 @@ impl<G: EmissionGuarantee> Diagnostic<'_, G> for ParseTargetMachineConfig<'_> {
    }
 }

-#[derive(Diagnostic)]
-#[diag(codegen_llvm_autodiff_without_lto)]
-pub(crate) struct AutoDiffWithoutLTO;
-
 #[derive(Diagnostic)]
 #[diag(codegen_llvm_autodiff_without_enable)]
 pub(crate) struct AutoDiffWithoutEnable;
--- a/compiler/rustc_codegen_llvm/src/lib.rs
+++ b/compiler/rustc_codegen_llvm/src/lib.rs
@ -26,11 +26,11 @@ use std::mem::ManuallyDrop;
 use back::owned_target_machine::OwnedTargetMachine;
 use back::write::{create_informational_target_machine, create_target_machine};
 use context::SimpleCx;
-use errors::{AutoDiffWithoutLTO, ParseTargetMachineConfig};
+use errors::ParseTargetMachineConfig;
 use llvm_util::target_config;
 use rustc_ast::expand::allocator::AllocatorKind;
 use rustc_ast::expand::autodiff_attrs::AutoDiffItem;
-use rustc_codegen_ssa::back::lto::{LtoModuleCodegen, SerializedModule, ThinModule};
+use rustc_codegen_ssa::back::lto::{SerializedModule, ThinModule};
 use rustc_codegen_ssa::back::write::{
    CodegenContext, FatLtoInput, ModuleConfig, TargetMachineFactoryConfig, TargetMachineFactoryFn,
 };
@ -43,7 +43,7 @@ use rustc_middle::dep_graph::{WorkProduct, WorkProductId};
 use rustc_middle::ty::TyCtxt;
 use rustc_middle::util::Providers;
 use rustc_session::Session;
-use rustc_session::config::{Lto, OptLevel, OutputFilenames, PrintKind, PrintRequest};
+use rustc_session::config::{OptLevel, OutputFilenames, PrintKind, PrintRequest};
 use rustc_span::Symbol;

 mod back {
@ -174,18 +174,29 @@ impl WriteBackendMethods for LlvmCodegenBackend {
    ) -> Result<ModuleCodegen<Self::Module>, FatalError> {
        back::write::link(cgcx, dcx, modules)
    }
-    fn run_fat_lto(
+    fn run_and_optimize_fat_lto(
        cgcx: &CodegenContext<Self>,
        modules: Vec<FatLtoInput<Self>>,
        cached_modules: Vec<(SerializedModule<Self::ModuleBuffer>, WorkProduct)>,
-    ) -> Result<LtoModuleCodegen<Self>, FatalError> {
-        back::lto::run_fat(cgcx, modules, cached_modules)
+        diff_fncs: Vec<AutoDiffItem>,
+    ) -> Result<ModuleCodegen<Self::Module>, FatalError> {
+        let mut module = back::lto::run_fat(cgcx, modules, cached_modules)?;
+
+        if !diff_fncs.is_empty() {
+            builder::autodiff::differentiate(&module, cgcx, diff_fncs)?;
+        }
+
+        let dcx = cgcx.create_dcx();
+        let dcx = dcx.handle();
+        back::lto::run_pass_manager(cgcx, dcx, &mut module, false)?;
+
+        Ok(module)
    }
    fn run_thin_lto(
        cgcx: &CodegenContext<Self>,
        modules: Vec<(String, Self::ThinBuffer)>,
        cached_modules: Vec<(SerializedModule<Self::ModuleBuffer>, WorkProduct)>,
-    ) -> Result<(Vec<LtoModuleCodegen<Self>>, Vec<WorkProduct>), FatalError> {
+    ) -> Result<(Vec<ThinModule<Self>>, Vec<WorkProduct>), FatalError> {
        back::lto::run_thin(cgcx, modules, cached_modules)
    }
    fn optimize(
@ -196,14 +207,6 @@ impl WriteBackendMethods for LlvmCodegenBackend {
    ) -> Result<(), FatalError> {
        back::write::optimize(cgcx, dcx, module, config)
    }
-    fn optimize_fat(
-        cgcx: &CodegenContext<Self>,
-        module: &mut ModuleCodegen<Self::Module>,
-    ) -> Result<(), FatalError> {
-        let dcx = cgcx.create_dcx();
-        let dcx = dcx.handle();
-        back::lto::run_pass_manager(cgcx, dcx, module, false)
-    }
    fn optimize_thin(
        cgcx: &CodegenContext<Self>,
        thin: ThinModule<Self>,
@ -212,11 +215,10 @@ impl WriteBackendMethods for LlvmCodegenBackend {
    }
    fn codegen(
        cgcx: &CodegenContext<Self>,
-        dcx: DiagCtxtHandle<'_>,
        module: ModuleCodegen<Self::Module>,
        config: &ModuleConfig,
    ) -> Result<CompiledModule, FatalError> {
-        back::write::codegen(cgcx, dcx, module, config)
+        back::write::codegen(cgcx, module, config)
    }
    fn prepare_thin(
        module: ModuleCodegen<Self::Module>,
@ -227,19 +229,6 @@ impl WriteBackendMethods for LlvmCodegenBackend {
    fn serialize_module(module: ModuleCodegen<Self::Module>) -> (String, Self::ModuleBuffer) {
        (module.name, back::lto::ModuleBuffer::new(module.module_llvm.llmod()))
    }
-    /// Generate autodiff rules
-    fn autodiff(
-        cgcx: &CodegenContext<Self>,
-        module: &ModuleCodegen<Self::Module>,
-        diff_fncs: Vec<AutoDiffItem>,
-        config: &ModuleConfig,
-    ) -> Result<(), FatalError> {
-        if cgcx.lto != Lto::Fat {
-            let dcx = cgcx.create_dcx();
-            return Err(dcx.handle().emit_almost_fatal(AutoDiffWithoutLTO));
-        }
-        builder::autodiff::differentiate(module, cgcx, diff_fncs, config)
-    }
 }

 impl LlvmCodegenBackend {
--- a/compiler/rustc_codegen_ssa/src/back/lto.rs
+++ b/compiler/rustc_codegen_ssa/src/back/lto.rs
@ -1,13 +1,8 @@
 use std::ffi::CString;
 use std::sync::Arc;

-use rustc_ast::expand::autodiff_attrs::AutoDiffItem;
 use rustc_data_structures::memmap::Mmap;
-use rustc_errors::FatalError;

-use super::write::CodegenContext;
-use crate::ModuleCodegen;
-use crate::back::write::ModuleConfig;
 use crate::traits::*;

 pub struct ThinModule<B: WriteBackendMethods> {
@ -42,61 +37,6 @@ pub struct ThinShared<B: WriteBackendMethods> {
    pub module_names: Vec<CString>,
 }

-pub enum LtoModuleCodegen<B: WriteBackendMethods> {
-    Fat(ModuleCodegen<B::Module>),
-    Thin(ThinModule<B>),
-}
-
-impl<B: WriteBackendMethods> LtoModuleCodegen<B> {
-    pub fn name(&self) -> &str {
-        match *self {
-            LtoModuleCodegen::Fat(_) => "everything",
-            LtoModuleCodegen::Thin(ref m) => m.name(),
-        }
-    }
-
-    /// Optimize this module within the given codegen context.
-    pub fn optimize(
-        self,
-        cgcx: &CodegenContext<B>,
-    ) -> Result<ModuleCodegen<B::Module>, FatalError> {
-        match self {
-            LtoModuleCodegen::Fat(mut module) => {
-                B::optimize_fat(cgcx, &mut module)?;
-                Ok(module)
-            }
-            LtoModuleCodegen::Thin(thin) => B::optimize_thin(cgcx, thin),
-        }
-    }
-
-    /// A "gauge" of how costly it is to optimize this module, used to sort
-    /// biggest modules first.
-    pub fn cost(&self) -> u64 {
-        match *self {
-            // Only one module with fat LTO, so the cost doesn't matter.
-            LtoModuleCodegen::Fat(_) => 0,
-            LtoModuleCodegen::Thin(ref m) => m.cost(),
-        }
-    }
-
-    /// Run autodiff on Fat LTO module
-    pub fn autodiff(
-        self,
-        cgcx: &CodegenContext<B>,
-        diff_fncs: Vec<AutoDiffItem>,
-        config: &ModuleConfig,
-    ) -> Result<LtoModuleCodegen<B>, FatalError> {
-        match &self {
-            LtoModuleCodegen::Fat(module) => {
-                B::autodiff(cgcx, &module, diff_fncs, config)?;
-            }
-            _ => panic!("autodiff called with non-fat LTO module"),
-        }
-
-        Ok(self)
-    }
-}
-
 pub enum SerializedModule<M: ModuleBufferMethods> {
    Local(M),
    FromRlib(Vec<u8>),
--- a/compiler/rustc_codegen_ssa/src/back/write.rs
+++ b/compiler/rustc_codegen_ssa/src/back/write.rs
@ -397,50 +397,31 @@ impl<B: WriteBackendMethods> CodegenContext<B> {
    }
 }

-fn generate_lto_work<B: ExtraBackendMethods>(
+fn generate_thin_lto_work<B: ExtraBackendMethods>(
    cgcx: &CodegenContext<B>,
-    autodiff: Vec<AutoDiffItem>,
-    needs_fat_lto: Vec<FatLtoInput<B>>,
    needs_thin_lto: Vec<(String, B::ThinBuffer)>,
    import_only_modules: Vec<(SerializedModule<B::ModuleBuffer>, WorkProduct)>,
 ) -> Vec<(WorkItem<B>, u64)> {
-    let _prof_timer = cgcx.prof.generic_activity("codegen_generate_lto_work");
+    let _prof_timer = cgcx.prof.generic_activity("codegen_thin_generate_lto_work");

-    if !needs_fat_lto.is_empty() {
-        assert!(needs_thin_lto.is_empty());
-        let mut module =
-            B::run_fat_lto(cgcx, needs_fat_lto, import_only_modules).unwrap_or_else(|e| e.raise());
-        if cgcx.lto == Lto::Fat && !autodiff.is_empty() {
-            let config = cgcx.config(ModuleKind::Regular);
-            module = module.autodiff(cgcx, autodiff, config).unwrap_or_else(|e| e.raise());
-        }
-        // We are adding a single work item, so the cost doesn't matter.
-        vec![(WorkItem::LTO(module), 0)]
-    } else {
-        if !autodiff.is_empty() {
-            let dcx = cgcx.create_dcx();
-            dcx.handle().emit_fatal(AutodiffWithoutLto {});
-        }
-        assert!(needs_fat_lto.is_empty());
-        let (lto_modules, copy_jobs) = B::run_thin_lto(cgcx, needs_thin_lto, import_only_modules)
-            .unwrap_or_else(|e| e.raise());
-        lto_modules
-            .into_iter()
-            .map(|module| {
-                let cost = module.cost();
-                (WorkItem::LTO(module), cost)
-            })
-            .chain(copy_jobs.into_iter().map(|wp| {
-                (
-                    WorkItem::CopyPostLtoArtifacts(CachedModuleCodegen {
-                        name: wp.cgu_name.clone(),
-                        source: wp,
-                    }),
-                    0, // copying is very cheap
-                )
-            }))
-            .collect()
-    }
+    let (lto_modules, copy_jobs) =
+        B::run_thin_lto(cgcx, needs_thin_lto, import_only_modules).unwrap_or_else(|e| e.raise());
+    lto_modules
+        .into_iter()
+        .map(|module| {
+            let cost = module.cost();
+            (WorkItem::ThinLto(module), cost)
+        })
+        .chain(copy_jobs.into_iter().map(|wp| {
+            (
+                WorkItem::CopyPostLtoArtifacts(CachedModuleCodegen {
+                    name: wp.cgu_name.clone(),
+                    source: wp,
+                }),
+                0, // copying is very cheap
+            )
+        }))
+        .collect()
 }

 struct CompiledModules {
@ -470,6 +451,7 @@ pub(crate) fn start_async_codegen<B: ExtraBackendMethods>(
    backend: B,
    tcx: TyCtxt<'_>,
    target_cpu: String,
+    autodiff_items: &[AutoDiffItem],
 ) -> OngoingCodegen<B> {
    let (coordinator_send, coordinator_receive) = channel();

@ -488,6 +470,7 @@ pub(crate) fn start_async_codegen<B: ExtraBackendMethods>(
        backend.clone(),
        tcx,
        &crate_info,
+        autodiff_items,
        shared_emitter,
        codegen_worker_send,
        coordinator_receive,
@ -736,15 +719,23 @@ pub(crate) enum WorkItem<B: WriteBackendMethods> {
    /// Copy the post-LTO artifacts from the incremental cache to the output
    /// directory.
    CopyPostLtoArtifacts(CachedModuleCodegen),
-    /// Performs (Thin)LTO on the given module.
-    LTO(lto::LtoModuleCodegen<B>),
+    /// Performs fat LTO on the given module.
+    FatLto {
+        needs_fat_lto: Vec<FatLtoInput<B>>,
+        import_only_modules: Vec<(SerializedModule<B::ModuleBuffer>, WorkProduct)>,
+        autodiff: Vec<AutoDiffItem>,
+    },
+    /// Performs thin-LTO on the given module.
+    ThinLto(lto::ThinModule<B>),
 }

 impl<B: WriteBackendMethods> WorkItem<B> {
    fn module_kind(&self) -> ModuleKind {
        match *self {
            WorkItem::Optimize(ref m) => m.kind,
-            WorkItem::CopyPostLtoArtifacts(_) | WorkItem::LTO(_) => ModuleKind::Regular,
+            WorkItem::CopyPostLtoArtifacts(_) | WorkItem::FatLto { .. } | WorkItem::ThinLto(_) => {
+                ModuleKind::Regular
+            }
        }
    }

@ -792,7 +783,8 @@ impl<B: WriteBackendMethods> WorkItem<B> {
        match self {
            WorkItem::Optimize(m) => desc("opt", "optimize module", &m.name),
            WorkItem::CopyPostLtoArtifacts(m) => desc("cpy", "copy LTO artifacts for", &m.name),
-            WorkItem::LTO(m) => desc("lto", "LTO module", m.name()),
+            WorkItem::FatLto { .. } => desc("lto", "fat LTO module", "everything"),
+            WorkItem::ThinLto(m) => desc("lto", "thin-LTO module", m.name()),
        }
    }
 }
@ -996,12 +988,24 @@ fn execute_copy_from_cache_work_item<B: ExtraBackendMethods>(
    })
 }

-fn execute_lto_work_item<B: ExtraBackendMethods>(
+fn execute_fat_lto_work_item<B: ExtraBackendMethods>(
    cgcx: &CodegenContext<B>,
-    module: lto::LtoModuleCodegen<B>,
+    needs_fat_lto: Vec<FatLtoInput<B>>,
+    import_only_modules: Vec<(SerializedModule<B::ModuleBuffer>, WorkProduct)>,
+    autodiff: Vec<AutoDiffItem>,
    module_config: &ModuleConfig,
 ) -> Result<WorkItemResult<B>, FatalError> {
-    let module = module.optimize(cgcx)?;
+    let module = B::run_and_optimize_fat_lto(cgcx, needs_fat_lto, import_only_modules, autodiff)?;
+    let module = B::codegen(cgcx, module, module_config)?;
+    Ok(WorkItemResult::Finished(module))
+}
+
+fn execute_thin_lto_work_item<B: ExtraBackendMethods>(
+    cgcx: &CodegenContext<B>,
+    module: lto::ThinModule<B>,
+    module_config: &ModuleConfig,
+) -> Result<WorkItemResult<B>, FatalError> {
+    let module = B::optimize_thin(cgcx, module)?;
    finish_intra_module_work(cgcx, module, module_config)
 }

@ -1010,11 +1014,8 @@ fn finish_intra_module_work<B: ExtraBackendMethods>(
    module: ModuleCodegen<B::Module>,
    module_config: &ModuleConfig,
 ) -> Result<WorkItemResult<B>, FatalError> {
-    let dcx = cgcx.create_dcx();
-    let dcx = dcx.handle();
-
    if !cgcx.opts.unstable_opts.combine_cgu || module.kind == ModuleKind::Allocator {
-        let module = B::codegen(cgcx, dcx, module, module_config)?;
+        let module = B::codegen(cgcx, module, module_config)?;
        Ok(WorkItemResult::Finished(module))
    } else {
        Ok(WorkItemResult::NeedsLink(module))
@ -1031,9 +1032,6 @@ pub(crate) enum Message<B: WriteBackendMethods> {
    /// Sent from a backend worker thread.
    WorkItem { result: Result<WorkItemResult<B>, Option<WorkerFatalError>>, worker_id: usize },

-    /// A vector containing all the AutoDiff tasks that we have to pass to Enzyme.
-    AddAutoDiffItems(Vec<AutoDiffItem>),
-
    /// The frontend has finished generating something (backend IR or a
    /// post-LTO artifact) for a codegen unit, and it should be passed to the
    /// backend. Sent from the main thread.
@ -1100,6 +1098,7 @@ fn start_executing_work<B: ExtraBackendMethods>(
    backend: B,
    tcx: TyCtxt<'_>,
    crate_info: &CrateInfo,
+    autodiff_items: &[AutoDiffItem],
    shared_emitter: SharedEmitter,
    codegen_worker_send: Sender<CguMessage>,
    coordinator_receive: Receiver<Box<dyn Any + Send>>,
@ -1109,6 +1108,7 @@ fn start_executing_work<B: ExtraBackendMethods>(
 ) -> thread::JoinHandle<Result<CompiledModules, ()>> {
    let coordinator_send = tx_to_llvm_workers;
    let sess = tcx.sess;
+    let autodiff_items = autodiff_items.to_vec();

    let mut each_linked_rlib_for_lto = Vec::new();
    drop(link::each_linked_rlib(crate_info, None, &mut |cnum, path| {
@ -1362,7 +1362,6 @@ fn start_executing_work<B: ExtraBackendMethods>(

        // This is where we collect codegen units that have gone all the way
        // through codegen and LLVM.
-        let mut autodiff_items = Vec::new();
        let mut compiled_modules = vec![];
        let mut compiled_allocator_module = None;
        let mut needs_link = Vec::new();
@ -1474,20 +1473,37 @@ fn start_executing_work<B: ExtraBackendMethods>(
                    let needs_thin_lto = mem::take(&mut needs_thin_lto);
                    let import_only_modules = mem::take(&mut lto_import_only_modules);

-                    for (work, cost) in generate_lto_work(
-                        &cgcx,
-                        autodiff_items.clone(),
-                        needs_fat_lto,
-                        needs_thin_lto,
-                        import_only_modules,
-                    ) {
-                        let insertion_index = work_items
-                            .binary_search_by_key(&cost, |&(_, cost)| cost)
-                            .unwrap_or_else(|e| e);
-                        work_items.insert(insertion_index, (work, cost));
+                    if !needs_fat_lto.is_empty() {
+                        assert!(needs_thin_lto.is_empty());
+
+                        work_items.push((
+                            WorkItem::FatLto {
+                                needs_fat_lto,
+                                import_only_modules,
+                                autodiff: autodiff_items.clone(),
+                            },
+                            0,
+                        ));
                        if cgcx.parallel {
                            helper.request_token();
                        }
+                    } else {
+                        if !autodiff_items.is_empty() {
+                            let dcx = cgcx.create_dcx();
+                            dcx.handle().emit_fatal(AutodiffWithoutLto {});
+                        }
+
+                        for (work, cost) in
+                            generate_thin_lto_work(&cgcx, needs_thin_lto, import_only_modules)
+                        {
+                            let insertion_index = work_items
+                                .binary_search_by_key(&cost, |&(_, cost)| cost)
+                                .unwrap_or_else(|e| e);
+                            work_items.insert(insertion_index, (work, cost));
+                            if cgcx.parallel {
+                                helper.request_token();
+                            }
+                        }
                    }
                }

@ -1616,10 +1632,6 @@ fn start_executing_work<B: ExtraBackendMethods>(
                    main_thread_state = MainThreadState::Idle;
                }

-                Message::AddAutoDiffItems(mut items) => {
-                    autodiff_items.append(&mut items);
-                }
-
                Message::CodegenComplete => {
                    if codegen_state != Aborted {
                        codegen_state = Completed;
@ -1702,7 +1714,7 @@ fn start_executing_work<B: ExtraBackendMethods>(
            let dcx = dcx.handle();
            let module = B::run_link(&cgcx, dcx, needs_link).map_err(|_| ())?;
            let module =
-                B::codegen(&cgcx, dcx, module, cgcx.config(ModuleKind::Regular)).map_err(|_| ())?;
+                B::codegen(&cgcx, module, cgcx.config(ModuleKind::Regular)).map_err(|_| ())?;
            compiled_modules.push(module);
        }

@ -1842,10 +1854,22 @@ fn spawn_work<'a, B: ExtraBackendMethods>(
                    );
                    Ok(execute_copy_from_cache_work_item(&cgcx, m, module_config))
                }
-                WorkItem::LTO(m) => {
+                WorkItem::FatLto { needs_fat_lto, import_only_modules, autodiff } => {
+                    let _timer = cgcx
+                        .prof
+                        .generic_activity_with_arg("codegen_module_perform_lto", "everything");
+                    execute_fat_lto_work_item(
+                        &cgcx,
+                        needs_fat_lto,
+                        import_only_modules,
+                        autodiff,
+                        module_config,
+                    )
+                }
+                WorkItem::ThinLto(m) => {
                    let _timer =
                        cgcx.prof.generic_activity_with_arg("codegen_module_perform_lto", m.name());
-                    execute_lto_work_item(&cgcx, m, module_config)
+                    execute_thin_lto_work_item(&cgcx, m, module_config)
                }
            })
        };
@ -2082,10 +2106,6 @@ impl<B: ExtraBackendMethods> OngoingCodegen<B> {
        drop(self.coordinator.sender.send(Box::new(Message::CodegenComplete::<B>)));
    }

-    pub(crate) fn submit_autodiff_items(&self, items: Vec<AutoDiffItem>) {
-        drop(self.coordinator.sender.send(Box::new(Message::<B>::AddAutoDiffItems(items))));
-    }
-
    pub(crate) fn check_for_errors(&self, sess: &Session) {
        self.shared_emitter_main.check(sess, false);
    }
--- a/compiler/rustc_codegen_ssa/src/base.rs
+++ b/compiler/rustc_codegen_ssa/src/base.rs
@ -647,7 +647,7 @@ pub fn codegen_crate<B: ExtraBackendMethods>(
 ) -> OngoingCodegen<B> {
    // Skip crate items and just output metadata in -Z no-codegen mode.
    if tcx.sess.opts.unstable_opts.no_codegen || !tcx.sess.opts.output_types.should_codegen() {
-        let ongoing_codegen = start_async_codegen(backend, tcx, target_cpu);
+        let ongoing_codegen = start_async_codegen(backend, tcx, target_cpu, &[]);

        ongoing_codegen.codegen_finished(tcx);

@ -667,7 +667,6 @@ pub fn codegen_crate<B: ExtraBackendMethods>(
    // codegen units.
    let MonoItemPartitions { codegen_units, autodiff_items, .. } =
        tcx.collect_and_partition_mono_items(());
-    let autodiff_fncs = autodiff_items.to_vec();

    // Force all codegen_unit queries so they are already either red or green
    // when compile_codegen_unit accesses them. We are not able to re-execute
@ -680,7 +679,7 @@ pub fn codegen_crate<B: ExtraBackendMethods>(
        }
    }

-    let ongoing_codegen = start_async_codegen(backend.clone(), tcx, target_cpu);
+    let ongoing_codegen = start_async_codegen(backend.clone(), tcx, target_cpu, autodiff_items);

    // Codegen an allocator shim, if necessary.
    if let Some(kind) = allocator_kind_for_codegen(tcx) {
@ -710,10 +709,6 @@ pub fn codegen_crate<B: ExtraBackendMethods>(
        );
    }

-    if !autodiff_fncs.is_empty() {
-        ongoing_codegen.submit_autodiff_items(autodiff_fncs);
-    }
-
    // For better throughput during parallel processing by LLVM, we used to sort
    // CGUs largest to smallest. This would lead to better thread utilization
    // by, for example, preventing a large CGU from being processed last and
--- a/compiler/rustc_codegen_ssa/src/traits/write.rs
+++ b/compiler/rustc_codegen_ssa/src/traits/write.rs
@ -2,7 +2,7 @@ use rustc_ast::expand::autodiff_attrs::AutoDiffItem;
 use rustc_errors::{DiagCtxtHandle, FatalError};
 use rustc_middle::dep_graph::WorkProduct;

-use crate::back::lto::{LtoModuleCodegen, SerializedModule, ThinModule};
+use crate::back::lto::{SerializedModule, ThinModule};
 use crate::back::write::{CodegenContext, FatLtoInput, ModuleConfig};
 use crate::{CompiledModule, ModuleCodegen};

@ -20,13 +20,14 @@ pub trait WriteBackendMethods: Clone + 'static {
        dcx: DiagCtxtHandle<'_>,
        modules: Vec<ModuleCodegen<Self::Module>>,
    ) -> Result<ModuleCodegen<Self::Module>, FatalError>;
-    /// Performs fat LTO by merging all modules into a single one and returning it
-    /// for further optimization.
-    fn run_fat_lto(
+    /// Performs fat LTO by merging all modules into a single one, running autodiff
+    /// if necessary and running any further optimizations
+    fn run_and_optimize_fat_lto(
        cgcx: &CodegenContext<Self>,
        modules: Vec<FatLtoInput<Self>>,
        cached_modules: Vec<(SerializedModule<Self::ModuleBuffer>, WorkProduct)>,
-    ) -> Result<LtoModuleCodegen<Self>, FatalError>;
+        diff_fncs: Vec<AutoDiffItem>,
+    ) -> Result<ModuleCodegen<Self::Module>, FatalError>;
    /// Performs thin LTO by performing necessary global analysis and returning two
    /// lists, one of the modules that need optimization and another for modules that
    /// can simply be copied over from the incr. comp. cache.
@ -34,7 +35,7 @@ pub trait WriteBackendMethods: Clone + 'static {
        cgcx: &CodegenContext<Self>,
        modules: Vec<(String, Self::ThinBuffer)>,
        cached_modules: Vec<(SerializedModule<Self::ModuleBuffer>, WorkProduct)>,
-    ) -> Result<(Vec<LtoModuleCodegen<Self>>, Vec<WorkProduct>), FatalError>;
+    ) -> Result<(Vec<ThinModule<Self>>, Vec<WorkProduct>), FatalError>;
    fn print_pass_timings(&self);
    fn print_statistics(&self);
    fn optimize(
@ -43,17 +44,12 @@ pub trait WriteBackendMethods: Clone + 'static {
        module: &mut ModuleCodegen<Self::Module>,
        config: &ModuleConfig,
    ) -> Result<(), FatalError>;
-    fn optimize_fat(
-        cgcx: &CodegenContext<Self>,
-        llmod: &mut ModuleCodegen<Self::Module>,
-    ) -> Result<(), FatalError>;
    fn optimize_thin(
        cgcx: &CodegenContext<Self>,
        thin: ThinModule<Self>,
    ) -> Result<ModuleCodegen<Self::Module>, FatalError>;
    fn codegen(
        cgcx: &CodegenContext<Self>,
-        dcx: DiagCtxtHandle<'_>,
        module: ModuleCodegen<Self::Module>,
        config: &ModuleConfig,
    ) -> Result<CompiledModule, FatalError>;
@ -62,12 +58,6 @@ pub trait WriteBackendMethods: Clone + 'static {
        want_summary: bool,
    ) -> (String, Self::ThinBuffer);
    fn serialize_module(module: ModuleCodegen<Self::Module>) -> (String, Self::ModuleBuffer);
-    fn autodiff(
-        cgcx: &CodegenContext<Self>,
-        module: &ModuleCodegen<Self::Module>,
-        diff_fncs: Vec<AutoDiffItem>,
-        config: &ModuleConfig,
-    ) -> Result<(), FatalError>;
 }

 pub trait ThinBufferMethods: Send + Sync {