Auto merge of #149170 - ZuseZ4:automate-offload-packager, r=oli-obk

automate gpu offloading - part 1

Automates step 1 from the rustc-dev-guide offload section:
https://rustc-dev-guide.rust-lang.org/offload/usage.html#compile-instructions
`"clang-offload-packager" "-o" "host.out" "--image=file=device.bc,triple=amdgcn-amd-amdhsa,arch=gfx90a,kind=openmp"`

Verified on an MI 250X

cc `@jhuber6,` `@kevinsala,` `@jdoerfert,` `@Sa4dUs`

r? oli-obk
This commit is contained in:
bors 2025-11-23 10:45:30 +00:00
commit 23f708107b
11 changed files with 112 additions and 13 deletions

View file

@ -38,6 +38,7 @@ check_only = ['rustc_driver_impl/check_only']
jemalloc = ['dep:tikv-jemalloc-sys']
llvm = ['rustc_driver_impl/llvm']
llvm_enzyme = ['rustc_driver_impl/llvm_enzyme']
llvm_offload = ['rustc_driver_impl/llvm_offload']
max_level_info = ['rustc_driver_impl/max_level_info']
rustc_randomized_layouts = ['rustc_driver_impl/rustc_randomized_layouts']
# tidy-alphabetical-end

View file

@ -47,5 +47,6 @@ tracing = "0.1"
# tidy-alphabetical-start
check_only = ["rustc_llvm/check_only"]
llvm_enzyme = []
llvm_offload = []
# tidy-alphabetical-end

View file

@ -771,6 +771,13 @@ pub(crate) unsafe fn llvm_optimize(
llvm_plugins.len(),
)
};
if cgcx.target_is_like_gpu && config.offload.contains(&config::Offload::Enable) {
unsafe {
llvm::LLVMRustBundleImages(module.module_llvm.llmod(), module.module_llvm.tm.raw());
}
}
result.into_result().unwrap_or_else(|()| llvm_err(dcx, LlvmError::RunLlvmPasses))
}

View file

@ -1718,6 +1718,37 @@ unsafe extern "C" {
) -> &'a Value;
}
#[cfg(feature = "llvm_offload")]
pub(crate) use self::Offload::*;
#[cfg(feature = "llvm_offload")]
mod Offload {
use super::*;
unsafe extern "C" {
/// Processes the module and writes it in an offload compatible way into a "host.out" file.
pub(crate) fn LLVMRustBundleImages<'a>(M: &'a Module, TM: &'a TargetMachine) -> bool;
pub(crate) fn LLVMRustOffloadMapper<'a>(OldFn: &'a Value, NewFn: &'a Value);
}
}
#[cfg(not(feature = "llvm_offload"))]
pub(crate) use self::Offload_fallback::*;
#[cfg(not(feature = "llvm_offload"))]
mod Offload_fallback {
use super::*;
/// Processes the module and writes it in an offload compatible way into a "host.out" file.
/// Marked as unsafe to match the real offload wrapper which is unsafe due to FFI.
#[allow(unused_unsafe)]
pub(crate) unsafe fn LLVMRustBundleImages<'a>(_M: &'a Module, _TM: &'a TargetMachine) -> bool {
unimplemented!("This rustc version was not built with LLVM Offload support!");
}
#[allow(unused_unsafe)]
pub(crate) unsafe fn LLVMRustOffloadMapper<'a>(_OldFn: &'a Value, _NewFn: &'a Value) {
unimplemented!("This rustc version was not built with LLVM Offload support!");
}
}
// FFI bindings for `DIBuilder` functions in the LLVM-C API.
// Try to keep these in the same order as in `llvm/include/llvm-c/DebugInfo.h`.
//
@ -2025,7 +2056,6 @@ unsafe extern "C" {
) -> &Attribute;
// Operations on functions
pub(crate) fn LLVMRustOffloadMapper<'a>(Fn: &'a Value, Fn: &'a Value);
pub(crate) fn LLVMRustGetOrInsertFunction<'a>(
M: &'a Module,
Name: *const c_char,

View file

@ -75,6 +75,7 @@ ctrlc = "3.4.4"
check_only = ['rustc_interface/check_only']
llvm = ['rustc_interface/llvm']
llvm_enzyme = ['rustc_interface/llvm_enzyme']
llvm_offload = ['rustc_interface/llvm_offload']
max_level_info = ['rustc_log/max_level_info']
rustc_randomized_layouts = [
'rustc_index/rustc_randomized_layouts',

View file

@ -59,4 +59,5 @@ rustc_abi = { path = "../rustc_abi" }
check_only = ['rustc_codegen_llvm?/check_only']
llvm = ['dep:rustc_codegen_llvm']
llvm_enzyme = ['rustc_builtin_macros/llvm_enzyme', 'rustc_codegen_llvm/llvm_enzyme']
llvm_offload = ['rustc_codegen_llvm/llvm_offload']
# tidy-alphabetical-end

View file

@ -214,6 +214,10 @@ fn main() {
cfg.define("ENZYME", None);
}
if tracked_env_var_os("LLVM_OFFLOAD").is_some() {
cfg.define("OFFLOAD", None);
}
if tracked_env_var_os("LLVM_RUSTLLVM").is_some() {
cfg.define("LLVM_RUSTLLVM", None);
}

View file

@ -39,6 +39,14 @@
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <iostream>
// Some of the functions below rely on LLVM modules that may not always be
// available. As such, we only try to build it in the first place, if
// llvm.offload is enabled.
#ifdef OFFLOAD
#include "llvm/Object/OffloadBinary.h"
#include "llvm/Target/TargetMachine.h"
#endif
// for raw `write` in the bad-alloc handler
#ifdef _MSC_VER
#include <io.h>
@ -144,6 +152,55 @@ extern "C" void LLVMRustPrintStatistics(RustStringRef OutBuf) {
llvm::PrintStatistics(OS);
}
// Some of the functions here rely on LLVM modules that may not always be
// available. As such, we only try to build it in the first place, if
// llvm.offload is enabled.
#ifdef OFFLOAD
static Error writeFile(StringRef Filename, StringRef Data) {
Expected<std::unique_ptr<FileOutputBuffer>> OutputOrErr =
FileOutputBuffer::create(Filename, Data.size());
if (!OutputOrErr)
return OutputOrErr.takeError();
std::unique_ptr<FileOutputBuffer> Output = std::move(*OutputOrErr);
llvm::copy(Data, Output->getBufferStart());
if (Error E = Output->commit())
return E;
return Error::success();
}
// This is the first of many steps in creating a binary using llvm offload,
// to run code on the gpu. Concrete, it replaces the following binary use:
// clang-offload-packager -o host.out
// --image=file=device.bc,triple=amdgcn-amd-amdhsa,arch=gfx90a,kind=openmp
// The input module is the rust code compiled for a gpu target like amdgpu.
// Based on clang/tools/clang-offload-packager/ClangOffloadPackager.cpp
extern "C" bool LLVMRustBundleImages(LLVMModuleRef M, TargetMachine &TM) {
std::string Storage;
llvm::raw_string_ostream OS1(Storage);
llvm::WriteBitcodeToFile(*unwrap(M), OS1);
OS1.flush();
auto MB = llvm::MemoryBuffer::getMemBufferCopy(Storage, "module.bc");
SmallVector<char, 1024> BinaryData;
raw_svector_ostream OS2(BinaryData);
OffloadBinary::OffloadingImage ImageBinary{};
ImageBinary.TheImageKind = object::IMG_Bitcode;
ImageBinary.Image = std::move(MB);
ImageBinary.TheOffloadKind = object::OFK_OpenMP;
ImageBinary.StringData["triple"] = TM.getTargetTriple().str();
ImageBinary.StringData["arch"] = TM.getTargetCPU();
llvm::SmallString<0> Buffer = OffloadBinary::write(ImageBinary);
if (Buffer.size() % OffloadBinary::getAlignment() != 0)
// Offload binary has invalid size alignment
return false;
OS2 << Buffer;
if (Error E = writeFile("host.out",
StringRef(BinaryData.begin(), BinaryData.size())))
return false;
return true;
}
extern "C" void LLVMRustOffloadMapper(LLVMValueRef OldFn, LLVMValueRef NewFn) {
llvm::Function *oldFn = llvm::unwrap<llvm::Function>(OldFn);
llvm::Function *newFn = llvm::unwrap<llvm::Function>(NewFn);
@ -163,6 +220,7 @@ extern "C" void LLVMRustOffloadMapper(LLVMValueRef OldFn, LLVMValueRef NewFn) {
llvm::CloneFunctionChangeType::LocalChangesOnly,
returns);
}
#endif
extern "C" LLVMValueRef LLVMRustGetNamedValue(LLVMModuleRef M, const char *Name,
size_t NameLen) {

View file

@ -1436,6 +1436,9 @@ fn rustc_llvm_env(builder: &Builder<'_>, cargo: &mut Cargo, target: TargetSelect
if builder.config.llvm_enzyme {
cargo.env("LLVM_ENZYME", "1");
}
if builder.config.llvm_offload {
cargo.env("LLVM_OFFLOAD", "1");
}
let llvm::LlvmResult { host_llvm_config, .. } = builder.ensure(llvm::Llvm { target });
cargo.env("LLVM_CONFIG", &host_llvm_config);

View file

@ -873,6 +873,9 @@ impl Build {
if self.config.llvm_enzyme {
features.push("llvm_enzyme");
}
if self.config.llvm_offload {
features.push("llvm_offload");
}
// keep in sync with `bootstrap/compile.rs:rustc_cargo_env`
if self.config.rust_randomize_layout && check("rustc_randomized_layouts") {
features.push("rustc_randomized_layouts");

View file

@ -79,19 +79,8 @@ Now we generate the device code. Replace the target-cpu with the right code for
RUSTFLAGS="-Ctarget-cpu=gfx90a --emit=llvm-bc,llvm-ir -Zoffload=Enable -Zunstable-options" cargo +offload build -Zunstable-options -r -v --target amdgcn-amd-amdhsa -Zbuild-std=core
```
Now find the `<libname>.ll` under target/amdgcn-amd-amdhsa folder and copy it to a device.ll file (or adjust the file names below).
If you work on an NVIDIA or Intel gpu, please adjust the names acordingly and open an issue to share your results (either if you succeed or fail).
First we compile our .ll files (good for manual inspections) to .bc files and clean up leftover artifacts. The cleanup is important, otherwise caching might interfere on following runs.
```
opt lib.ll -o lib.bc
opt device.ll -o device.bc
rm *.o
rm bare.amdgcn.gfx90a.img*
```
```
"clang-offload-packager" "-o" "host.out" "--image=file=device.bc,triple=amdgcn-amd-amdhsa,arch=gfx90a,kind=openmp"
"clang-21" "-cc1" "-triple" "x86_64-unknown-linux-gnu" "-S" "-save-temps=cwd" "-disable-free" "-clear-ast-before-backend" "-main-file-name" "lib.rs" "-mrelocation-model" "pic" "-pic-level" "2" "-pic-is-pie" "-mframe-pointer=all" "-fmath-errno" "-ffp-contract=on" "-fno-rounding-math" "-mconstructor-aliases" "-funwind-tables=2" "-target-cpu" "x86-64" "-tune-cpu" "generic" "-resource-dir" "/<ABSOLUTE_PATH_TO>/rust/build/x86_64-unknown-linux-gnu/llvm/lib/clang/21" "-ferror-limit" "19" "-fopenmp" "-fopenmp-offload-mandatory" "-fgnuc-version=4.2.1" "-fskip-odr-check-in-gmf" "-fembed-offload-object=host.out" "-fopenmp-targets=amdgcn-amd-amdhsa" "-faddrsig" "-D__GCC_HAVE_DWARF2_CFI_ASM=1" "-o" "host.s" "-x" "ir" "lib.bc"
"clang-21" "-cc1as" "-triple" "x86_64-unknown-linux-gnu" "-filetype" "obj" "-main-file-name" "lib.rs" "-target-cpu" "x86-64" "-mrelocation-model" "pic" "-o" "host.o" "host.s"
@ -99,7 +88,8 @@ rm bare.amdgcn.gfx90a.img*
"clang-linker-wrapper" "--should-extract=gfx90a" "--device-compiler=amdgcn-amd-amdhsa=-g" "--device-compiler=amdgcn-amd-amdhsa=-save-temps=cwd" "--device-linker=amdgcn-amd-amdhsa=-lompdevice" "--host-triple=x86_64-unknown-linux-gnu" "--save-temps" "--linker-path=/ABSOlUTE_PATH_TO/rust/build/x86_64-unknown-linux-gnu/lld/bin/ld.lld" "--hash-style=gnu" "--eh-frame-hdr" "-m" "elf_x86_64" "-pie" "-dynamic-linker" "/lib64/ld-linux-x86-64.so.2" "-o" "bare" "/lib/../lib64/Scrt1.o" "/lib/../lib64/crti.o" "/ABSOLUTE_PATH_TO/crtbeginS.o" "-L/ABSOLUTE_PATH_TO/rust/build/x86_64-unknown-linux-gnu/llvm/bin/../lib/x86_64-unknown-linux-gnu" "-L/ABSOLUTE_PATH_TO/rust/build/x86_64-unknown-linux-gnu/llvm/lib/clang/21/lib/x86_64-unknown-linux-gnu" "-L/lib/../lib64" "-L/usr/lib64" "-L/lib" "-L/usr/lib" "host.o" "-lstdc++" "-lm" "-lomp" "-lomptarget" "-L/ABSOLUTE_PATH_TO/rust/build/x86_64-unknown-linux-gnu/llvm/lib" "-lgcc_s" "-lgcc" "-lpthread" "-lc" "-lgcc_s" "-lgcc" "/ABSOLUTE_PATH_TO/crtendS.o" "/lib/../lib64/crtn.o"
```
Especially for the last command I recommend to not fix the paths, but rather just re-generate them by copying a bare-mode openmp example and compiling it with your clang. By adding `-###` to your clang invocation, you can see the invidual steps.
Especially for the last three commands I recommend to not fix the paths, but rather just re-generate them by copying a bare-mode openmp example and compiling it with your clang. By adding `-###` to your clang invocation, you can see the invidual steps.
You can ignore other steps, e.g. the invocation of a "clang-offload-packager".
```
myclang++ -fuse-ld=lld -O3 -fopenmp -fopenmp-offload-mandatory --offload-arch=gfx90a omp_bare.cpp -o main -###
```