Auto merge of #149170 - ZuseZ4:automate-offload-packager, r=oli-obk
automate gpu offloading - part 1 Automates step 1 from the rustc-dev-guide offload section: https://rustc-dev-guide.rust-lang.org/offload/usage.html#compile-instructions `"clang-offload-packager" "-o" "host.out" "--image=file=device.bc,triple=amdgcn-amd-amdhsa,arch=gfx90a,kind=openmp"` Verified on an MI 250X cc `@jhuber6,` `@kevinsala,` `@jdoerfert,` `@Sa4dUs` r? oli-obk
This commit is contained in:
commit
23f708107b
11 changed files with 112 additions and 13 deletions
|
|
@ -38,6 +38,7 @@ check_only = ['rustc_driver_impl/check_only']
|
|||
jemalloc = ['dep:tikv-jemalloc-sys']
|
||||
llvm = ['rustc_driver_impl/llvm']
|
||||
llvm_enzyme = ['rustc_driver_impl/llvm_enzyme']
|
||||
llvm_offload = ['rustc_driver_impl/llvm_offload']
|
||||
max_level_info = ['rustc_driver_impl/max_level_info']
|
||||
rustc_randomized_layouts = ['rustc_driver_impl/rustc_randomized_layouts']
|
||||
# tidy-alphabetical-end
|
||||
|
|
|
|||
|
|
@ -47,5 +47,6 @@ tracing = "0.1"
|
|||
# tidy-alphabetical-start
|
||||
check_only = ["rustc_llvm/check_only"]
|
||||
llvm_enzyme = []
|
||||
llvm_offload = []
|
||||
# tidy-alphabetical-end
|
||||
|
||||
|
|
|
|||
|
|
@ -771,6 +771,13 @@ pub(crate) unsafe fn llvm_optimize(
|
|||
llvm_plugins.len(),
|
||||
)
|
||||
};
|
||||
|
||||
if cgcx.target_is_like_gpu && config.offload.contains(&config::Offload::Enable) {
|
||||
unsafe {
|
||||
llvm::LLVMRustBundleImages(module.module_llvm.llmod(), module.module_llvm.tm.raw());
|
||||
}
|
||||
}
|
||||
|
||||
result.into_result().unwrap_or_else(|()| llvm_err(dcx, LlvmError::RunLlvmPasses))
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1718,6 +1718,37 @@ unsafe extern "C" {
|
|||
) -> &'a Value;
|
||||
}
|
||||
|
||||
#[cfg(feature = "llvm_offload")]
|
||||
pub(crate) use self::Offload::*;
|
||||
|
||||
// Real FFI declarations for the offload helpers. This module is only compiled
// when the `llvm_offload` cargo feature is enabled; its items are re-exported
// by the matching `pub(crate) use self::Offload::*;`.
#[cfg(feature = "llvm_offload")]
|
||||
mod Offload {
|
||||
use super::*;
|
||||
unsafe extern "C" {
|
||||
/// Processes the module and writes it in an offload compatible way into a "host.out" file.
/// The returned `bool` is the status reported by the C++ wrapper.
|
||||
pub(crate) fn LLVMRustBundleImages<'a>(M: &'a Module, TM: &'a TargetMachine) -> bool;
|
||||
/// Offload helper taking an old and a new function value.
/// NOTE(review): the exact semantics live in the C++ wrapper; confirm there before relying on them.
pub(crate) fn LLVMRustOffloadMapper<'a>(OldFn: &'a Value, NewFn: &'a Value);
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(not(feature = "llvm_offload"))]
|
||||
pub(crate) use self::Offload_fallback::*;
|
||||
|
||||
// Stand-ins used when the `llvm_offload` cargo feature is disabled. They keep
// the same names and signatures as the real FFI declarations so callers
// compile unchanged, but every stub panics when it is actually invoked.
#[cfg(not(feature = "llvm_offload"))]
|
||||
mod Offload_fallback {
|
||||
use super::*;
|
||||
/// Fallback for the function that writes the module into a "host.out" file.
/// This version does no work: it always panics via `unimplemented!`.
|
||||
/// Marked as unsafe to match the real offload wrapper which is unsafe due to FFI.
|
||||
#[allow(unused_unsafe)]
|
||||
pub(crate) unsafe fn LLVMRustBundleImages<'a>(_M: &'a Module, _TM: &'a TargetMachine) -> bool {
|
||||
unimplemented!("This rustc version was not built with LLVM Offload support!");
|
||||
}
|
||||
/// Fallback for `LLVMRustOffloadMapper`; always panics via `unimplemented!`.
/// Marked as unsafe to match the real offload wrapper which is unsafe due to FFI.
#[allow(unused_unsafe)]
|
||||
pub(crate) unsafe fn LLVMRustOffloadMapper<'a>(_OldFn: &'a Value, _NewFn: &'a Value) {
|
||||
unimplemented!("This rustc version was not built with LLVM Offload support!");
|
||||
}
|
||||
}
|
||||
|
||||
// FFI bindings for `DIBuilder` functions in the LLVM-C API.
|
||||
// Try to keep these in the same order as in `llvm/include/llvm-c/DebugInfo.h`.
|
||||
//
|
||||
|
|
@ -2025,7 +2056,6 @@ unsafe extern "C" {
|
|||
) -> &Attribute;
|
||||
|
||||
// Operations on functions
|
||||
pub(crate) fn LLVMRustOffloadMapper<'a>(Fn: &'a Value, Fn: &'a Value);
|
||||
pub(crate) fn LLVMRustGetOrInsertFunction<'a>(
|
||||
M: &'a Module,
|
||||
Name: *const c_char,
|
||||
|
|
|
|||
|
|
@ -75,6 +75,7 @@ ctrlc = "3.4.4"
|
|||
check_only = ['rustc_interface/check_only']
|
||||
llvm = ['rustc_interface/llvm']
|
||||
llvm_enzyme = ['rustc_interface/llvm_enzyme']
|
||||
llvm_offload = ['rustc_interface/llvm_offload']
|
||||
max_level_info = ['rustc_log/max_level_info']
|
||||
rustc_randomized_layouts = [
|
||||
'rustc_index/rustc_randomized_layouts',
|
||||
|
|
|
|||
|
|
@ -59,4 +59,5 @@ rustc_abi = { path = "../rustc_abi" }
|
|||
check_only = ['rustc_codegen_llvm?/check_only']
|
||||
llvm = ['dep:rustc_codegen_llvm']
|
||||
llvm_enzyme = ['rustc_builtin_macros/llvm_enzyme', 'rustc_codegen_llvm/llvm_enzyme']
|
||||
llvm_offload = ['rustc_codegen_llvm/llvm_offload']
|
||||
# tidy-alphabetical-end
|
||||
|
|
|
|||
|
|
@ -214,6 +214,10 @@ fn main() {
|
|||
cfg.define("ENZYME", None);
|
||||
}
|
||||
|
||||
if tracked_env_var_os("LLVM_OFFLOAD").is_some() {
|
||||
cfg.define("OFFLOAD", None);
|
||||
}
|
||||
|
||||
if tracked_env_var_os("LLVM_RUSTLLVM").is_some() {
|
||||
cfg.define("LLVM_RUSTLLVM", None);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -39,6 +39,14 @@
|
|||
#include "llvm/Transforms/Utils/ValueMapper.h"
|
||||
#include <iostream>
|
||||
|
||||
// Some of the functions below rely on LLVM modules that may not always be
|
||||
// available. As such, we only try to build them in the first place if
|
||||
// llvm.offload is enabled.
|
||||
#ifdef OFFLOAD
|
||||
#include "llvm/Object/OffloadBinary.h"
|
||||
#include "llvm/Target/TargetMachine.h"
|
||||
#endif
|
||||
|
||||
// for raw `write` in the bad-alloc handler
|
||||
#ifdef _MSC_VER
|
||||
#include <io.h>
|
||||
|
|
@ -144,6 +152,55 @@ extern "C" void LLVMRustPrintStatistics(RustStringRef OutBuf) {
|
|||
llvm::PrintStatistics(OS);
|
||||
}
|
||||
|
||||
// Some of the functions here rely on LLVM modules that may not always be
|
||||
// available. As such, we only try to build them in the first place if
|
||||
// llvm.offload is enabled.
|
||||
#ifdef OFFLOAD
|
||||
// Writes `Data` into the file `Filename` through a FileOutputBuffer that is
// created with exactly `Data.size()` bytes. A failure to create the buffer or
// to commit it is handed back to the caller as an Error; otherwise the result
// is Error::success().
static Error writeFile(StringRef Filename, StringRef Data) {
  auto BufferOrErr = FileOutputBuffer::create(Filename, Data.size());
  if (!BufferOrErr)
    return BufferOrErr.takeError();

  std::unique_ptr<FileOutputBuffer> FileBuffer = std::move(BufferOrErr.get());
  llvm::copy(Data, FileBuffer->getBufferStart());

  // commit() itself reports success or failure as an Error, so forward it.
  return FileBuffer->commit();
}
|
||||
|
||||
// This is the first of many steps in creating a binary using llvm offload,
|
||||
// to run code on the gpu. Concrete, it replaces the following binary use:
|
||||
// clang-offload-packager -o host.out
|
||||
// --image=file=device.bc,triple=amdgcn-amd-amdhsa,arch=gfx90a,kind=openmp
|
||||
// The input module is the rust code compiled for a gpu target like amdgpu.
|
||||
// Based on clang/tools/clang-offload-packager/ClangOffloadPackager.cpp
|
||||
extern "C" bool LLVMRustBundleImages(LLVMModuleRef M, TargetMachine &TM) {
|
||||
std::string Storage;
|
||||
llvm::raw_string_ostream OS1(Storage);
|
||||
llvm::WriteBitcodeToFile(*unwrap(M), OS1);
|
||||
OS1.flush();
|
||||
auto MB = llvm::MemoryBuffer::getMemBufferCopy(Storage, "module.bc");
|
||||
|
||||
SmallVector<char, 1024> BinaryData;
|
||||
raw_svector_ostream OS2(BinaryData);
|
||||
|
||||
OffloadBinary::OffloadingImage ImageBinary{};
|
||||
ImageBinary.TheImageKind = object::IMG_Bitcode;
|
||||
ImageBinary.Image = std::move(MB);
|
||||
ImageBinary.TheOffloadKind = object::OFK_OpenMP;
|
||||
ImageBinary.StringData["triple"] = TM.getTargetTriple().str();
|
||||
ImageBinary.StringData["arch"] = TM.getTargetCPU();
|
||||
llvm::SmallString<0> Buffer = OffloadBinary::write(ImageBinary);
|
||||
if (Buffer.size() % OffloadBinary::getAlignment() != 0)
|
||||
// Offload binary has invalid size alignment
|
||||
return false;
|
||||
OS2 << Buffer;
|
||||
if (Error E = writeFile("host.out",
|
||||
StringRef(BinaryData.begin(), BinaryData.size())))
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
extern "C" void LLVMRustOffloadMapper(LLVMValueRef OldFn, LLVMValueRef NewFn) {
|
||||
llvm::Function *oldFn = llvm::unwrap<llvm::Function>(OldFn);
|
||||
llvm::Function *newFn = llvm::unwrap<llvm::Function>(NewFn);
|
||||
|
|
@ -163,6 +220,7 @@ extern "C" void LLVMRustOffloadMapper(LLVMValueRef OldFn, LLVMValueRef NewFn) {
|
|||
llvm::CloneFunctionChangeType::LocalChangesOnly,
|
||||
returns);
|
||||
}
|
||||
#endif
|
||||
|
||||
extern "C" LLVMValueRef LLVMRustGetNamedValue(LLVMModuleRef M, const char *Name,
|
||||
size_t NameLen) {
|
||||
|
|
|
|||
|
|
@ -1436,6 +1436,9 @@ fn rustc_llvm_env(builder: &Builder<'_>, cargo: &mut Cargo, target: TargetSelect
|
|||
if builder.config.llvm_enzyme {
|
||||
cargo.env("LLVM_ENZYME", "1");
|
||||
}
|
||||
if builder.config.llvm_offload {
|
||||
cargo.env("LLVM_OFFLOAD", "1");
|
||||
}
|
||||
let llvm::LlvmResult { host_llvm_config, .. } = builder.ensure(llvm::Llvm { target });
|
||||
cargo.env("LLVM_CONFIG", &host_llvm_config);
|
||||
|
||||
|
|
|
|||
|
|
@ -873,6 +873,9 @@ impl Build {
|
|||
if self.config.llvm_enzyme {
|
||||
features.push("llvm_enzyme");
|
||||
}
|
||||
if self.config.llvm_offload {
|
||||
features.push("llvm_offload");
|
||||
}
|
||||
// keep in sync with `bootstrap/compile.rs:rustc_cargo_env`
|
||||
if self.config.rust_randomize_layout && check("rustc_randomized_layouts") {
|
||||
features.push("rustc_randomized_layouts");
|
||||
|
|
|
|||
|
|
@ -79,19 +79,8 @@ Now we generate the device code. Replace the target-cpu with the right code for
|
|||
RUSTFLAGS="-Ctarget-cpu=gfx90a --emit=llvm-bc,llvm-ir -Zoffload=Enable -Zunstable-options" cargo +offload build -Zunstable-options -r -v --target amdgcn-amd-amdhsa -Zbuild-std=core
|
||||
```
|
||||
|
||||
Now find the `<libname>.ll` under target/amdgcn-amd-amdhsa folder and copy it to a device.ll file (or adjust the file names below).
|
||||
If you work on an NVIDIA or Intel GPU, please adjust the names accordingly and open an issue to share your results (whether you succeed or fail).
|
||||
First we compile our .ll files (good for manual inspections) to .bc files and clean up leftover artifacts. The cleanup is important, otherwise caching might interfere on following runs.
|
||||
```
|
||||
opt lib.ll -o lib.bc
|
||||
opt device.ll -o device.bc
|
||||
rm *.o
|
||||
rm bare.amdgcn.gfx90a.img*
|
||||
```
|
||||
|
||||
```
|
||||
"clang-offload-packager" "-o" "host.out" "--image=file=device.bc,triple=amdgcn-amd-amdhsa,arch=gfx90a,kind=openmp"
|
||||
|
||||
"clang-21" "-cc1" "-triple" "x86_64-unknown-linux-gnu" "-S" "-save-temps=cwd" "-disable-free" "-clear-ast-before-backend" "-main-file-name" "lib.rs" "-mrelocation-model" "pic" "-pic-level" "2" "-pic-is-pie" "-mframe-pointer=all" "-fmath-errno" "-ffp-contract=on" "-fno-rounding-math" "-mconstructor-aliases" "-funwind-tables=2" "-target-cpu" "x86-64" "-tune-cpu" "generic" "-resource-dir" "/<ABSOLUTE_PATH_TO>/rust/build/x86_64-unknown-linux-gnu/llvm/lib/clang/21" "-ferror-limit" "19" "-fopenmp" "-fopenmp-offload-mandatory" "-fgnuc-version=4.2.1" "-fskip-odr-check-in-gmf" "-fembed-offload-object=host.out" "-fopenmp-targets=amdgcn-amd-amdhsa" "-faddrsig" "-D__GCC_HAVE_DWARF2_CFI_ASM=1" "-o" "host.s" "-x" "ir" "lib.bc"
|
||||
|
||||
"clang-21" "-cc1as" "-triple" "x86_64-unknown-linux-gnu" "-filetype" "obj" "-main-file-name" "lib.rs" "-target-cpu" "x86-64" "-mrelocation-model" "pic" "-o" "host.o" "host.s"
|
||||
|
|
@ -99,7 +88,8 @@ rm bare.amdgcn.gfx90a.img*
|
|||
"clang-linker-wrapper" "--should-extract=gfx90a" "--device-compiler=amdgcn-amd-amdhsa=-g" "--device-compiler=amdgcn-amd-amdhsa=-save-temps=cwd" "--device-linker=amdgcn-amd-amdhsa=-lompdevice" "--host-triple=x86_64-unknown-linux-gnu" "--save-temps" "--linker-path=/ABSOLUTE_PATH_TO/rust/build/x86_64-unknown-linux-gnu/lld/bin/ld.lld" "--hash-style=gnu" "--eh-frame-hdr" "-m" "elf_x86_64" "-pie" "-dynamic-linker" "/lib64/ld-linux-x86-64.so.2" "-o" "bare" "/lib/../lib64/Scrt1.o" "/lib/../lib64/crti.o" "/ABSOLUTE_PATH_TO/crtbeginS.o" "-L/ABSOLUTE_PATH_TO/rust/build/x86_64-unknown-linux-gnu/llvm/bin/../lib/x86_64-unknown-linux-gnu" "-L/ABSOLUTE_PATH_TO/rust/build/x86_64-unknown-linux-gnu/llvm/lib/clang/21/lib/x86_64-unknown-linux-gnu" "-L/lib/../lib64" "-L/usr/lib64" "-L/lib" "-L/usr/lib" "host.o" "-lstdc++" "-lm" "-lomp" "-lomptarget" "-L/ABSOLUTE_PATH_TO/rust/build/x86_64-unknown-linux-gnu/llvm/lib" "-lgcc_s" "-lgcc" "-lpthread" "-lc" "-lgcc_s" "-lgcc" "/ABSOLUTE_PATH_TO/crtendS.o" "/lib/../lib64/crtn.o"
|
||||
```
|
||||
|
||||
Especially for the last command, I recommend not fixing the paths by hand, but rather re-generating them by copying a bare-mode openmp example and compiling it with your clang. By adding `-###` to your clang invocation, you can see the individual steps.
|
||||
Especially for the last three commands, I recommend not fixing the paths by hand, but rather re-generating them by copying a bare-mode openmp example and compiling it with your clang. By adding `-###` to your clang invocation, you can see the individual steps.
|
||||
You can ignore other steps, e.g. the invocation of a "clang-offload-packager".
|
||||
```
|
||||
myclang++ -fuse-ld=lld -O3 -fopenmp -fopenmp-offload-mandatory --offload-arch=gfx90a omp_bare.cpp -o main -###
|
||||
```
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue