Fix multi-cgu+debug builds using autodiff by delaying autodiff till lto

This commit is contained in:
Manuel Drehwald 2026-02-11 14:08:56 -05:00
parent 8340622e14
commit c89a89bb14
2 changed files with 13 additions and 5 deletions

View file

@ -812,12 +812,12 @@ extern "C" LLVMRustResult LLVMRustOptimize(
auto ThinLTOBuffer = std::make_unique<LLVMRustThinLTOBuffer>(); auto ThinLTOBuffer = std::make_unique<LLVMRustThinLTOBuffer>();
raw_string_ostream ThinLTODataOS(ThinLTOBuffer->data); raw_string_ostream ThinLTODataOS(ThinLTOBuffer->data);
raw_string_ostream ThinLinkDataOS(ThinLTOBuffer->thin_link_data); raw_string_ostream ThinLinkDataOS(ThinLTOBuffer->thin_link_data);
bool IsLTO = OptStage == LLVMRustOptStage::ThinLTO ||
OptStage == LLVMRustOptStage::FatLTO;
if (!NoPrepopulatePasses) { if (!NoPrepopulatePasses) {
// The pre-link pipelines don't support O0 and require using // The pre-link pipelines don't support O0 and require using
// buildO0DefaultPipeline() instead. At the same time, the LTO pipelines do // buildO0DefaultPipeline() instead. At the same time, the LTO pipelines do
// support O0 and using them is required. // support O0 and using them is required.
bool IsLTO = OptStage == LLVMRustOptStage::ThinLTO ||
OptStage == LLVMRustOptStage::FatLTO;
if (OptLevel == OptimizationLevel::O0 && !IsLTO) { if (OptLevel == OptimizationLevel::O0 && !IsLTO) {
for (const auto &C : PipelineStartEPCallbacks) for (const auto &C : PipelineStartEPCallbacks)
PB.registerPipelineStartEPCallback(C); PB.registerPipelineStartEPCallback(C);
@ -908,7 +908,10 @@ extern "C" LLVMRustResult LLVMRustOptimize(
// now load "-enzyme" pass: // now load "-enzyme" pass:
// With dlopen, ENZYME macro may not be defined, so check EnzymePtr directly // With dlopen, ENZYME macro may not be defined, so check EnzymePtr directly
if (EnzymePtr) { // In the case of debug builds with multiple codegen units, we might not
// have all function definitions available during the early compiler
// invocations. We therefore wait for the final LTO step to run Enzyme.
if (EnzymePtr && IsLTO) {
if (PrintBeforeEnzyme) { if (PrintBeforeEnzyme) {
// Handle the Rust flag `-Zautodiff=PrintModBefore`. // Handle the Rust flag `-Zautodiff=PrintModBefore`.
@ -929,6 +932,7 @@ extern "C" LLVMRustResult LLVMRustOptimize(
MPM.addPass(PrintModulePass(outs(), Banner, true, false)); MPM.addPass(PrintModulePass(outs(), Banner, true, false));
} }
} }
if (PrintPasses) { if (PrintPasses) {
// Print all passes from the PM: // Print all passes from the PM:
std::string Pipeline; std::string Pipeline;

View file

@ -1,6 +1,6 @@
//@ revisions: DEBUG RELEASE //@ revisions: DEBUG RELEASE
//@[RELEASE] compile-flags: -Zautodiff=Enable,NoTT -C opt-level=3 -Clto=fat //@[RELEASE] compile-flags: -Zautodiff=Enable,NoTT -C opt-level=3 -Clto=fat
//@[DEBUG] compile-flags: -Zautodiff=Enable,NoTT -C opt-level=0 -Clto=fat -C debuginfo=2 //@[DEBUG] compile-flags: -Zautodiff=Enable,NoTT -Copt-level=0 -Clto=fat -Cdebuginfo=2 -Ccodegen-units=8
//@ needs-enzyme //@ needs-enzyme
//@ incremental //@ incremental
//@ no-prefer-dynamic //@ no-prefer-dynamic
@ -13,6 +13,10 @@
// dropped. We now use globals instead and add this test to verify that incremental // dropped. We now use globals instead and add this test to verify that incremental
// keeps working. Also testing debug mode while at it. // keeps working. Also testing debug mode while at it.
// We extended this test to use 8 codegen-units in debug mode and call an intrinsic like powi,
// rather than just simple arithmetic. This used to cause a compilation failure, since the
// definition of the intrinsic was not available in the same cgu as the function being
// differentiated.
use std::autodiff::autodiff_reverse; use std::autodiff::autodiff_reverse;
#[autodiff_reverse(bar, Duplicated, Duplicated)] #[autodiff_reverse(bar, Duplicated, Duplicated)]
@ -20,7 +24,7 @@ pub fn foo(r: &[f64; 10], res: &mut f64) {
let mut output = [0.0; 10]; let mut output = [0.0; 10];
output[0] = r[0]; output[0] = r[0];
output[1] = r[1] * r[2]; output[1] = r[1] * r[2];
output[2] = r[4] * r[5]; output[2] = r[4] * r[5].powi(2);
output[3] = r[2] * r[6]; output[3] = r[2] * r[6];
output[4] = r[1] * r[7]; output[4] = r[1] * r[7];
output[5] = r[2] * r[8]; output[5] = r[2] * r[8];