Rollup merge of #147936 - Sa4dUs:offload-intrinsic, r=ZuseZ4
Offload intrinsic
This PR implements the minimal mechanisms required to run a small subset of arbitrary offload kernels without relying on hardcoded names or metadata.
- `offload(kernel, (..args))`: an intrinsic that generates the necessary host-side LLVM-IR code.
- `rustc_offload_kernel`: a builtin attribute that marks device kernels to be handled appropriately.
Example usage (pseudocode):
```rust
fn kernel(x: *mut [f64; 128]) {
core::intrinsics::offload(kernel_1, (x,))
}
#[cfg(target_os = "linux")]
extern "C" {
pub fn kernel_1(array_b: *mut [f64; 128]);
}
#[cfg(not(target_os = "linux"))]
#[rustc_offload_kernel]
extern "gpu-kernel" fn kernel_1(x: *mut [f64; 128]) {
unsafe { (*x)[0] = 21.0 };
}
```
This commit is contained in:
commit
2b150f2c65
23 changed files with 529 additions and 178 deletions
|
|
@ -18,6 +18,9 @@ codegen_llvm_lto_bitcode_from_rlib = failed to get bitcode from object file for
|
|||
codegen_llvm_mismatch_data_layout =
|
||||
data-layout for target `{$rustc_target}`, `{$rustc_layout}`, differs from LLVM target's `{$llvm_target}` default layout, `{$llvm_layout}`
|
||||
|
||||
codegen_llvm_offload_without_enable = using the offload feature requires -Z offload=Enable
|
||||
codegen_llvm_offload_without_fat_lto = using the offload feature requires -C lto=fat
|
||||
|
||||
codegen_llvm_parse_bitcode = failed to parse bitcode for LTO module
|
||||
codegen_llvm_parse_bitcode_with_llvm_err = failed to parse bitcode for LTO module: {$llvm_err}
|
||||
|
||||
|
|
|
|||
|
|
@ -30,6 +30,14 @@ pub(crate) fn apply_to_callsite(callsite: &Value, idx: AttributePlace, attrs: &[
|
|||
}
|
||||
}
|
||||
|
||||
pub(crate) fn has_string_attr(llfn: &Value, name: &str) -> bool {
|
||||
llvm::HasStringAttribute(llfn, name)
|
||||
}
|
||||
|
||||
pub(crate) fn remove_string_attr_from_llfn(llfn: &Value, name: &str) {
|
||||
llvm::RemoveStringAttrFromFn(llfn, name);
|
||||
}
|
||||
|
||||
/// Get LLVM attribute for the provided inline heuristic.
|
||||
pub(crate) fn inline_attr<'ll, 'tcx>(
|
||||
cx: &SimpleCx<'ll>,
|
||||
|
|
@ -408,6 +416,10 @@ pub(crate) fn llfn_attrs_from_instance<'ll, 'tcx>(
|
|||
to_add.push(llvm::CreateAttrString(cx.llcx, "no-builtins"));
|
||||
}
|
||||
|
||||
if codegen_fn_attrs.flags.contains(CodegenFnAttrFlags::OFFLOAD_KERNEL) {
|
||||
to_add.push(llvm::CreateAttrString(cx.llcx, "offload-kernel"))
|
||||
}
|
||||
|
||||
if codegen_fn_attrs.flags.contains(CodegenFnAttrFlags::COLD) {
|
||||
to_add.push(AttributeKind::Cold.create_attr(cx.llcx));
|
||||
}
|
||||
|
|
|
|||
|
|
@ -26,7 +26,7 @@ use crate::back::write::{
|
|||
};
|
||||
use crate::errors::{LlvmError, LtoBitcodeFromRlib};
|
||||
use crate::llvm::{self, build_string};
|
||||
use crate::{LlvmCodegenBackend, ModuleLlvm, SimpleCx};
|
||||
use crate::{LlvmCodegenBackend, ModuleLlvm};
|
||||
|
||||
/// We keep track of the computed LTO cache keys from the previous
|
||||
/// session to determine which CGUs we can reuse.
|
||||
|
|
@ -601,7 +601,6 @@ pub(crate) fn run_pass_manager(
|
|||
// We then run the llvm_optimize function a second time, to optimize the code which we generated
|
||||
// in the enzyme differentiation pass.
|
||||
let enable_ad = config.autodiff.contains(&config::AutoDiff::Enable);
|
||||
let enable_gpu = config.offload.contains(&config::Offload::Enable);
|
||||
let stage = if thin {
|
||||
write::AutodiffStage::PreAD
|
||||
} else {
|
||||
|
|
@ -616,13 +615,6 @@ pub(crate) fn run_pass_manager(
|
|||
write::llvm_optimize(cgcx, dcx, module, None, config, opt_level, opt_stage, stage);
|
||||
}
|
||||
|
||||
// Here we only handle the GPU host (=cpu) code.
|
||||
if enable_gpu && !thin && !cgcx.target_is_like_gpu {
|
||||
let cx =
|
||||
SimpleCx::new(module.module_llvm.llmod(), &module.module_llvm.llcx, cgcx.pointer_size);
|
||||
crate::builder::gpu_offload::handle_gpu_code(cgcx, &cx);
|
||||
}
|
||||
|
||||
if cfg!(feature = "llvm_enzyme") && enable_ad && !thin {
|
||||
let opt_stage = llvm::OptStage::FatLTO;
|
||||
let stage = write::AutodiffStage::PostAD;
|
||||
|
|
|
|||
|
|
@ -43,7 +43,7 @@ use crate::errors::{
|
|||
use crate::llvm::diagnostic::OptimizationDiagnosticKind::*;
|
||||
use crate::llvm::{self, DiagnosticInfo};
|
||||
use crate::type_::llvm_type_ptr;
|
||||
use crate::{LlvmCodegenBackend, ModuleLlvm, SimpleCx, base, common, llvm_util};
|
||||
use crate::{LlvmCodegenBackend, ModuleLlvm, SimpleCx, attributes, base, common, llvm_util};
|
||||
|
||||
pub(crate) fn llvm_err<'a>(dcx: DiagCtxtHandle<'_>, err: LlvmError<'a>) -> ! {
|
||||
match llvm::last_error() {
|
||||
|
|
@ -712,11 +712,12 @@ pub(crate) unsafe fn llvm_optimize(
|
|||
SimpleCx::new(module.module_llvm.llmod(), module.module_llvm.llcx, cgcx.pointer_size);
|
||||
// For now we only support up to 10 kernels named kernel_0 ... kernel_9, a follow-up PR is
|
||||
// introducing a proper offload intrinsic to solve this limitation.
|
||||
for num in 0..9 {
|
||||
let name = format!("kernel_{num}");
|
||||
if let Some(kernel) = cx.get_function(&name) {
|
||||
handle_offload(&cx, kernel);
|
||||
for func in cx.get_functions() {
|
||||
let offload_kernel = "offload-kernel";
|
||||
if attributes::has_string_attr(func, offload_kernel) {
|
||||
handle_offload(&cx, func);
|
||||
}
|
||||
attributes::remove_string_attr_from_llfn(func, offload_kernel);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -2,37 +2,13 @@ use std::ffi::CString;
|
|||
|
||||
use llvm::Linkage::*;
|
||||
use rustc_abi::Align;
|
||||
use rustc_codegen_ssa::back::write::CodegenContext;
|
||||
use rustc_codegen_ssa::traits::BaseTypeCodegenMethods;
|
||||
use rustc_middle::ty::offload_meta::OffloadMetadata;
|
||||
|
||||
use crate::builder::SBuilder;
|
||||
use crate::common::AsCCharPtr;
|
||||
use crate::llvm::AttributePlace::Function;
|
||||
use crate::llvm::{self, Linkage, Type, Value};
|
||||
use crate::{LlvmCodegenBackend, SimpleCx, attributes};
|
||||
|
||||
pub(crate) fn handle_gpu_code<'ll>(
|
||||
_cgcx: &CodegenContext<LlvmCodegenBackend>,
|
||||
cx: &'ll SimpleCx<'_>,
|
||||
) {
|
||||
// The offload memory transfer type for each kernel
|
||||
let mut memtransfer_types = vec![];
|
||||
let mut region_ids = vec![];
|
||||
let offload_entry_ty = TgtOffloadEntry::new_decl(&cx);
|
||||
// This is a temporary hack, we only search for kernel_0 to kernel_9 functions.
|
||||
// There is a draft PR in progress which will introduce a proper offload intrinsic to remove
|
||||
// this limitation.
|
||||
for num in 0..9 {
|
||||
let kernel = cx.get_function(&format!("kernel_{num}"));
|
||||
if let Some(kernel) = kernel {
|
||||
let (o, k) = gen_define_handling(&cx, kernel, offload_entry_ty, num);
|
||||
memtransfer_types.push(o);
|
||||
region_ids.push(k);
|
||||
}
|
||||
}
|
||||
|
||||
gen_call_handling(&cx, &memtransfer_types, ®ion_ids);
|
||||
}
|
||||
use crate::llvm::{self, BasicBlock, Linkage, Type, Value};
|
||||
use crate::{SimpleCx, attributes};
|
||||
|
||||
// ; Function Attrs: nounwind
|
||||
// declare i32 @__tgt_target_kernel(ptr, i64, i32, i32, ptr, ptr) #2
|
||||
|
|
@ -79,7 +55,7 @@ fn generate_at_one<'ll>(cx: &'ll SimpleCx<'_>) -> &'ll llvm::Value {
|
|||
at_one
|
||||
}
|
||||
|
||||
struct TgtOffloadEntry {
|
||||
pub(crate) struct TgtOffloadEntry {
|
||||
// uint64_t Reserved;
|
||||
// uint16_t Version;
|
||||
// uint16_t Kind;
|
||||
|
|
@ -167,7 +143,7 @@ impl KernelArgsTy {
|
|||
fn new<'ll>(
|
||||
cx: &'ll SimpleCx<'_>,
|
||||
num_args: u64,
|
||||
memtransfer_types: &[&'ll Value],
|
||||
memtransfer_types: &'ll Value,
|
||||
geps: [&'ll Value; 3],
|
||||
) -> [(Align, &'ll Value); 13] {
|
||||
let four = Align::from_bytes(4).expect("4 Byte alignment should work");
|
||||
|
|
@ -181,7 +157,7 @@ impl KernelArgsTy {
|
|||
(eight, geps[0]),
|
||||
(eight, geps[1]),
|
||||
(eight, geps[2]),
|
||||
(eight, memtransfer_types[0]),
|
||||
(eight, memtransfer_types),
|
||||
// The next two are debug infos. FIXME(offload): set them
|
||||
(eight, cx.const_null(cx.type_ptr())), // dbg
|
||||
(eight, cx.const_null(cx.type_ptr())), // dbg
|
||||
|
|
@ -194,6 +170,14 @@ impl KernelArgsTy {
|
|||
}
|
||||
}
|
||||
|
||||
// Contains LLVM values needed to manage offloading for a single kernel.
|
||||
pub(crate) struct OffloadKernelData<'ll> {
|
||||
pub offload_sizes: &'ll llvm::Value,
|
||||
pub memtransfer_types: &'ll llvm::Value,
|
||||
pub region_id: &'ll llvm::Value,
|
||||
pub offload_entry: &'ll llvm::Value,
|
||||
}
|
||||
|
||||
fn gen_tgt_data_mappers<'ll>(
|
||||
cx: &'ll SimpleCx<'_>,
|
||||
) -> (&'ll llvm::Value, &'ll llvm::Value, &'ll llvm::Value, &'ll llvm::Type) {
|
||||
|
|
@ -256,68 +240,68 @@ pub(crate) fn add_global<'ll>(
|
|||
// This function returns a memtransfer value which encodes how arguments to this kernel shall be
|
||||
// mapped to/from the gpu. It also returns a region_id with the name of this kernel, to be
|
||||
// concatenated into the list of region_ids.
|
||||
fn gen_define_handling<'ll>(
|
||||
cx: &'ll SimpleCx<'_>,
|
||||
kernel: &'ll llvm::Value,
|
||||
pub(crate) fn gen_define_handling<'ll>(
|
||||
cx: &SimpleCx<'ll>,
|
||||
offload_entry_ty: &'ll llvm::Type,
|
||||
num: i64,
|
||||
) -> (&'ll llvm::Value, &'ll llvm::Value) {
|
||||
let types = cx.func_params_types(cx.get_type_of_global(kernel));
|
||||
metadata: &[OffloadMetadata],
|
||||
types: &[&Type],
|
||||
symbol: &str,
|
||||
) -> OffloadKernelData<'ll> {
|
||||
// It seems like non-pointer values are automatically mapped. So here, we focus on pointer (or
|
||||
// reference) types.
|
||||
let num_ptr_types = types
|
||||
.iter()
|
||||
.filter(|&x| matches!(cx.type_kind(x), rustc_codegen_ssa::common::TypeKind::Pointer))
|
||||
.count();
|
||||
let ptr_meta = types.iter().zip(metadata).filter_map(|(&x, meta)| match cx.type_kind(x) {
|
||||
rustc_codegen_ssa::common::TypeKind::Pointer => Some(meta),
|
||||
_ => None,
|
||||
});
|
||||
|
||||
// We do not know their size anymore at this level, so hardcode a placeholder.
|
||||
// A follow-up pr will track these from the frontend, where we still have Rust types.
|
||||
// Then, we will be able to figure out that e.g. `&[f32;256]` will result in 4*256 bytes.
|
||||
// I decided that 1024 bytes is a great placeholder value for now.
|
||||
add_priv_unnamed_arr(&cx, &format!(".offload_sizes.{num}"), &vec![1024; num_ptr_types]);
|
||||
// FIXME(Sa4dUs): add `OMP_MAP_TARGET_PARAM = 0x20` only if necessary
|
||||
let (ptr_sizes, ptr_transfer): (Vec<_>, Vec<_>) =
|
||||
ptr_meta.map(|m| (m.payload_size, m.mode.bits() | 0x20)).unzip();
|
||||
|
||||
let offload_sizes = add_priv_unnamed_arr(&cx, &format!(".offload_sizes.{symbol}"), &ptr_sizes);
|
||||
// Here we figure out whether something needs to be copied to the gpu (=1), from the gpu (=2),
|
||||
// or both to and from the gpu (=3). Other values shouldn't affect us for now.
|
||||
// A non-mutable reference or pointer will be 1, an array that's not read, but fully overwritten
|
||||
// will be 2. For now, everything is 3, until we have our frontend set up.
|
||||
// 1+2+32: 1 (MapTo), 2 (MapFrom), 32 (Add one extra input ptr per function, to be used later).
|
||||
let memtransfer_types = add_priv_unnamed_arr(
|
||||
&cx,
|
||||
&format!(".offload_maptypes.{num}"),
|
||||
&vec![1 + 2 + 32; num_ptr_types],
|
||||
);
|
||||
let memtransfer_types =
|
||||
add_priv_unnamed_arr(&cx, &format!(".offload_maptypes.{symbol}"), &ptr_transfer);
|
||||
|
||||
// Next: For each function, generate these three entries. A weak constant,
|
||||
// the llvm.rodata entry name, and the llvm_offload_entries value
|
||||
|
||||
let name = format!(".kernel_{num}.region_id");
|
||||
let name = format!(".{symbol}.region_id");
|
||||
let initializer = cx.get_const_i8(0);
|
||||
let region_id = add_unnamed_global(&cx, &name, initializer, WeakAnyLinkage);
|
||||
|
||||
let c_entry_name = CString::new(format!("kernel_{num}")).unwrap();
|
||||
let c_entry_name = CString::new(symbol).unwrap();
|
||||
let c_val = c_entry_name.as_bytes_with_nul();
|
||||
let offload_entry_name = format!(".offloading.entry_name.{num}");
|
||||
let offload_entry_name = format!(".offloading.entry_name.{symbol}");
|
||||
|
||||
let initializer = crate::common::bytes_in_context(cx.llcx, c_val);
|
||||
let llglobal = add_unnamed_global(&cx, &offload_entry_name, initializer, InternalLinkage);
|
||||
llvm::set_alignment(llglobal, Align::ONE);
|
||||
llvm::set_section(llglobal, c".llvm.rodata.offloading");
|
||||
let name = format!(".offloading.entry.kernel_{num}");
|
||||
|
||||
let name = format!(".offloading.entry.{symbol}");
|
||||
|
||||
// See the __tgt_offload_entry documentation above.
|
||||
let elems = TgtOffloadEntry::new(&cx, region_id, llglobal);
|
||||
|
||||
let initializer = crate::common::named_struct(offload_entry_ty, &elems);
|
||||
let c_name = CString::new(name).unwrap();
|
||||
let llglobal = llvm::add_global(cx.llmod, offload_entry_ty, &c_name);
|
||||
llvm::set_global_constant(llglobal, true);
|
||||
llvm::set_linkage(llglobal, WeakAnyLinkage);
|
||||
llvm::set_initializer(llglobal, initializer);
|
||||
llvm::set_alignment(llglobal, Align::EIGHT);
|
||||
let offload_entry = llvm::add_global(cx.llmod, offload_entry_ty, &c_name);
|
||||
llvm::set_global_constant(offload_entry, true);
|
||||
llvm::set_linkage(offload_entry, WeakAnyLinkage);
|
||||
llvm::set_initializer(offload_entry, initializer);
|
||||
llvm::set_alignment(offload_entry, Align::EIGHT);
|
||||
let c_section_name = CString::new("llvm_offload_entries").unwrap();
|
||||
llvm::set_section(llglobal, &c_section_name);
|
||||
(memtransfer_types, region_id)
|
||||
llvm::set_section(offload_entry, &c_section_name);
|
||||
|
||||
OffloadKernelData { offload_sizes, memtransfer_types, region_id, offload_entry }
|
||||
}
|
||||
|
||||
pub(crate) fn declare_offload_fn<'ll>(
|
||||
fn declare_offload_fn<'ll>(
|
||||
cx: &'ll SimpleCx<'_>,
|
||||
name: &str,
|
||||
ty: &'ll llvm::Type,
|
||||
|
|
@ -333,8 +317,7 @@ pub(crate) fn declare_offload_fn<'ll>(
|
|||
}
|
||||
|
||||
// For each kernel *call*, we now use some of our previous declared globals to move data to and from
|
||||
// the gpu. We don't have a proper frontend yet, so we assume that every call to a kernel function
|
||||
// from main is intended to run on the GPU. For now, we only handle the data transfer part of it.
|
||||
// the gpu. For now, we only handle the data transfer part of it.
|
||||
// If two consecutive kernels use the same memory, we still move it to the host and back to the gpu.
|
||||
// Since in our frontend users (by default) don't have to specify data transfer, this is something
|
||||
// we should optimize in the future! We also assume that everything should be copied back and forth,
|
||||
|
|
@ -352,11 +335,16 @@ pub(crate) fn declare_offload_fn<'ll>(
|
|||
// 4. set insert point after kernel call.
|
||||
// 5. generate all the GEPS and stores, to be used in 6)
|
||||
// 6. generate __tgt_target_data_end calls to move data from the GPU
|
||||
fn gen_call_handling<'ll>(
|
||||
cx: &'ll SimpleCx<'_>,
|
||||
memtransfer_types: &[&'ll llvm::Value],
|
||||
region_ids: &[&'ll llvm::Value],
|
||||
pub(crate) fn gen_call_handling<'ll>(
|
||||
cx: &SimpleCx<'ll>,
|
||||
bb: &BasicBlock,
|
||||
offload_data: &OffloadKernelData<'ll>,
|
||||
args: &[&'ll Value],
|
||||
types: &[&Type],
|
||||
metadata: &[OffloadMetadata],
|
||||
) {
|
||||
let OffloadKernelData { offload_sizes, offload_entry, memtransfer_types, region_id } =
|
||||
offload_data;
|
||||
let (tgt_decl, tgt_target_kernel_ty) = generate_launcher(&cx);
|
||||
// %struct.__tgt_bin_desc = type { i32, ptr, ptr, ptr }
|
||||
let tptr = cx.type_ptr();
|
||||
|
|
@ -368,27 +356,32 @@ fn gen_call_handling<'ll>(
|
|||
let tgt_kernel_decl = KernelArgsTy::new_decl(&cx);
|
||||
let (begin_mapper_decl, _, end_mapper_decl, fn_ty) = gen_tgt_data_mappers(&cx);
|
||||
|
||||
let main_fn = cx.get_function("main");
|
||||
let Some(main_fn) = main_fn else { return };
|
||||
let kernel_name = "kernel_1";
|
||||
let call = unsafe {
|
||||
llvm::LLVMRustGetFunctionCall(main_fn, kernel_name.as_c_char_ptr(), kernel_name.len())
|
||||
};
|
||||
let Some(kernel_call) = call else {
|
||||
return;
|
||||
};
|
||||
let kernel_call_bb = unsafe { llvm::LLVMGetInstructionParent(kernel_call) };
|
||||
let called = unsafe { llvm::LLVMGetCalledValue(kernel_call).unwrap() };
|
||||
let mut builder = SBuilder::build(cx, kernel_call_bb);
|
||||
let mut builder = SBuilder::build(cx, bb);
|
||||
|
||||
let types = cx.func_params_types(cx.get_type_of_global(called));
|
||||
let num_args = types.len() as u64;
|
||||
let ip = unsafe { llvm::LLVMRustGetInsertPoint(&builder.llbuilder) };
|
||||
|
||||
// FIXME(Sa4dUs): dummy loads are a temp workaround, we should find a proper way to prevent these
|
||||
// variables from being optimized away
|
||||
for val in [offload_sizes, offload_entry] {
|
||||
unsafe {
|
||||
let dummy = llvm::LLVMBuildLoad2(
|
||||
&builder.llbuilder,
|
||||
llvm::LLVMTypeOf(val),
|
||||
val,
|
||||
b"dummy\0".as_ptr() as *const _,
|
||||
);
|
||||
llvm::LLVMSetVolatile(dummy, llvm::TRUE);
|
||||
}
|
||||
}
|
||||
|
||||
// Step 0)
|
||||
// %struct.__tgt_bin_desc = type { i32, ptr, ptr, ptr }
|
||||
// %6 = alloca %struct.__tgt_bin_desc, align 8
|
||||
unsafe { llvm::LLVMRustPositionBuilderPastAllocas(builder.llbuilder, main_fn) };
|
||||
|
||||
let llfn = unsafe { llvm::LLVMGetBasicBlockParent(bb) };
|
||||
unsafe {
|
||||
llvm::LLVMRustPositionBuilderPastAllocas(&builder.llbuilder, llfn);
|
||||
}
|
||||
let tgt_bin_desc_alloca = builder.direct_alloca(tgt_bin_desc, Align::EIGHT, "EmptyDesc");
|
||||
|
||||
let ty = cx.type_array(cx.type_ptr(), num_args);
|
||||
|
|
@ -404,15 +397,16 @@ fn gen_call_handling<'ll>(
|
|||
let a5 = builder.direct_alloca(tgt_kernel_decl, Align::EIGHT, "kernel_args");
|
||||
|
||||
// Step 1)
|
||||
unsafe { llvm::LLVMRustPositionBefore(builder.llbuilder, kernel_call) };
|
||||
unsafe {
|
||||
llvm::LLVMRustRestoreInsertPoint(&builder.llbuilder, ip);
|
||||
}
|
||||
builder.memset(tgt_bin_desc_alloca, cx.get_const_i8(0), cx.get_const_i64(32), Align::EIGHT);
|
||||
|
||||
// Now we allocate once per function param, a copy to be passed to one of our maps.
|
||||
let mut vals = vec![];
|
||||
let mut geps = vec![];
|
||||
let i32_0 = cx.get_const_i32(0);
|
||||
for index in 0..types.len() {
|
||||
let v = unsafe { llvm::LLVMGetOperand(kernel_call, index as u32).unwrap() };
|
||||
for &v in args {
|
||||
let gep = builder.inbounds_gep(cx.type_f32(), v, &[i32_0]);
|
||||
vals.push(v);
|
||||
geps.push(gep);
|
||||
|
|
@ -437,10 +431,8 @@ fn gen_call_handling<'ll>(
|
|||
let gep2 = builder.inbounds_gep(ty, a2, &[i32_0, idx]);
|
||||
builder.store(geps[i as usize], gep2, Align::EIGHT);
|
||||
let gep3 = builder.inbounds_gep(ty2, a4, &[i32_0, idx]);
|
||||
// As mentioned above, we don't use Rust type information yet. So for now we will just
|
||||
// assume that we have 1024 bytes, 256 f32 values.
|
||||
// FIXME(offload): write an offload frontend and handle arbitrary types.
|
||||
builder.store(cx.get_const_i64(1024), gep3, Align::EIGHT);
|
||||
builder.store(cx.get_const_i64(metadata[i as usize].payload_size), gep3, Align::EIGHT);
|
||||
}
|
||||
|
||||
// For now we have a very simplistic indexing scheme into our
|
||||
|
|
@ -482,9 +474,17 @@ fn gen_call_handling<'ll>(
|
|||
|
||||
// Step 2)
|
||||
let s_ident_t = generate_at_one(&cx);
|
||||
let o = memtransfer_types[0];
|
||||
let geps = get_geps(&mut builder, &cx, ty, ty2, a1, a2, a4);
|
||||
generate_mapper_call(&mut builder, &cx, geps, o, begin_mapper_decl, fn_ty, num_args, s_ident_t);
|
||||
generate_mapper_call(
|
||||
&mut builder,
|
||||
&cx,
|
||||
geps,
|
||||
memtransfer_types,
|
||||
begin_mapper_decl,
|
||||
fn_ty,
|
||||
num_args,
|
||||
s_ident_t,
|
||||
);
|
||||
let values = KernelArgsTy::new(&cx, num_args, memtransfer_types, geps);
|
||||
|
||||
// Step 3)
|
||||
|
|
@ -501,26 +501,26 @@ fn gen_call_handling<'ll>(
|
|||
// FIXME(offload): Don't hardcode the numbers of threads in the future.
|
||||
cx.get_const_i32(2097152),
|
||||
cx.get_const_i32(256),
|
||||
region_ids[0],
|
||||
region_id,
|
||||
a5,
|
||||
];
|
||||
let offload_success = builder.call(tgt_target_kernel_ty, tgt_decl, &args, None);
|
||||
builder.call(tgt_target_kernel_ty, tgt_decl, &args, None);
|
||||
// %41 = call i32 @__tgt_target_kernel(ptr @1, i64 -1, i32 2097152, i32 256, ptr @.kernel_1.region_id, ptr %kernel_args)
|
||||
unsafe {
|
||||
let next = llvm::LLVMGetNextInstruction(offload_success).unwrap();
|
||||
llvm::LLVMRustPositionAfter(builder.llbuilder, next);
|
||||
llvm::LLVMInstructionEraseFromParent(next);
|
||||
}
|
||||
|
||||
// Step 4)
|
||||
let geps = get_geps(&mut builder, &cx, ty, ty2, a1, a2, a4);
|
||||
generate_mapper_call(&mut builder, &cx, geps, o, end_mapper_decl, fn_ty, num_args, s_ident_t);
|
||||
generate_mapper_call(
|
||||
&mut builder,
|
||||
&cx,
|
||||
geps,
|
||||
memtransfer_types,
|
||||
end_mapper_decl,
|
||||
fn_ty,
|
||||
num_args,
|
||||
s_ident_t,
|
||||
);
|
||||
|
||||
builder.call(mapper_fn_ty, unregister_lib_decl, &[tgt_bin_desc_alloca], None);
|
||||
|
||||
drop(builder);
|
||||
// FIXME(offload) The issue is that we right now add a call to the gpu version of the function,
|
||||
// and then delete the call to the CPU version. In the future, we should use an intrinsic which
|
||||
// directly resolves to a call to the GPU version.
|
||||
unsafe { llvm::LLVMDeleteFunction(called) };
|
||||
}
|
||||
|
|
|
|||
|
|
@ -791,6 +791,16 @@ impl<'ll, CX: Borrow<SCx<'ll>>> GenericCx<'ll, CX> {
|
|||
llvm::LLVMMDStringInContext2(self.llcx(), name.as_ptr() as *const c_char, name.len())
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn get_functions(&self) -> Vec<&'ll Value> {
|
||||
let mut functions = vec![];
|
||||
let mut func = unsafe { llvm::LLVMGetFirstFunction(self.llmod()) };
|
||||
while let Some(f) = func {
|
||||
functions.push(f);
|
||||
func = unsafe { llvm::LLVMGetNextFunction(f) }
|
||||
}
|
||||
functions
|
||||
}
|
||||
}
|
||||
|
||||
impl<'ll, 'tcx> MiscCodegenMethods<'tcx> for CodegenCx<'ll, 'tcx> {
|
||||
|
|
|
|||
|
|
@ -40,6 +40,14 @@ pub(crate) struct AutoDiffWithoutLto;
|
|||
#[diag(codegen_llvm_autodiff_without_enable)]
|
||||
pub(crate) struct AutoDiffWithoutEnable;
|
||||
|
||||
#[derive(Diagnostic)]
|
||||
#[diag(codegen_llvm_offload_without_enable)]
|
||||
pub(crate) struct OffloadWithoutEnable;
|
||||
|
||||
#[derive(Diagnostic)]
|
||||
#[diag(codegen_llvm_offload_without_fat_lto)]
|
||||
pub(crate) struct OffloadWithoutFatLTO;
|
||||
|
||||
#[derive(Diagnostic)]
|
||||
#[diag(codegen_llvm_lto_bitcode_from_rlib)]
|
||||
pub(crate) struct LtoBitcodeFromRlib {
|
||||
|
|
|
|||
|
|
@ -13,6 +13,7 @@ use rustc_hir::def_id::LOCAL_CRATE;
|
|||
use rustc_hir::{self as hir};
|
||||
use rustc_middle::mir::BinOp;
|
||||
use rustc_middle::ty::layout::{FnAbiOf, HasTyCtxt, HasTypingEnv, LayoutOf};
|
||||
use rustc_middle::ty::offload_meta::OffloadMetadata;
|
||||
use rustc_middle::ty::{self, GenericArgsRef, Instance, SimdAlign, Ty, TyCtxt, TypingEnv};
|
||||
use rustc_middle::{bug, span_bug};
|
||||
use rustc_session::config::CrateType;
|
||||
|
|
@ -25,8 +26,11 @@ use tracing::debug;
|
|||
use crate::abi::FnAbiLlvmExt;
|
||||
use crate::builder::Builder;
|
||||
use crate::builder::autodiff::{adjust_activity_to_abi, generate_enzyme_call};
|
||||
use crate::builder::gpu_offload::TgtOffloadEntry;
|
||||
use crate::context::CodegenCx;
|
||||
use crate::errors::{AutoDiffWithoutEnable, AutoDiffWithoutLto};
|
||||
use crate::errors::{
|
||||
AutoDiffWithoutEnable, AutoDiffWithoutLto, OffloadWithoutEnable, OffloadWithoutFatLTO,
|
||||
};
|
||||
use crate::llvm::{self, Metadata, Type, Value};
|
||||
use crate::type_of::LayoutLlvmExt;
|
||||
use crate::va_arg::emit_va_arg;
|
||||
|
|
@ -197,6 +201,24 @@ impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> {
|
|||
codegen_autodiff(self, tcx, instance, args, result);
|
||||
return Ok(());
|
||||
}
|
||||
sym::offload => {
|
||||
if !tcx
|
||||
.sess
|
||||
.opts
|
||||
.unstable_opts
|
||||
.offload
|
||||
.contains(&rustc_session::config::Offload::Enable)
|
||||
{
|
||||
let _ = tcx.dcx().emit_almost_fatal(OffloadWithoutEnable);
|
||||
}
|
||||
|
||||
if tcx.sess.lto() != rustc_session::config::Lto::Fat {
|
||||
let _ = tcx.dcx().emit_almost_fatal(OffloadWithoutFatLTO);
|
||||
}
|
||||
|
||||
codegen_offload(self, tcx, instance, args);
|
||||
return Ok(());
|
||||
}
|
||||
sym::is_val_statically_known => {
|
||||
if let OperandValue::Immediate(imm) = args[0].val {
|
||||
self.call_intrinsic(
|
||||
|
|
@ -1231,6 +1253,62 @@ fn codegen_autodiff<'ll, 'tcx>(
|
|||
);
|
||||
}
|
||||
|
||||
// Generates the LLVM code to offload a Rust function to a target device (e.g., GPU).
|
||||
// For each kernel call, it generates the necessary globals (including metadata such as
|
||||
// size and pass mode), manages memory mapping to and from the device, handles all
|
||||
// data transfers, and launches the kernel on the target device.
|
||||
fn codegen_offload<'ll, 'tcx>(
|
||||
bx: &mut Builder<'_, 'll, 'tcx>,
|
||||
tcx: TyCtxt<'tcx>,
|
||||
instance: ty::Instance<'tcx>,
|
||||
args: &[OperandRef<'tcx, &'ll Value>],
|
||||
) {
|
||||
let cx = bx.cx;
|
||||
let fn_args = instance.args;
|
||||
|
||||
let (target_id, target_args) = match fn_args.into_type_list(tcx)[0].kind() {
|
||||
ty::FnDef(def_id, params) => (def_id, params),
|
||||
_ => bug!("invalid offload intrinsic arg"),
|
||||
};
|
||||
|
||||
let fn_target = match Instance::try_resolve(tcx, cx.typing_env(), *target_id, target_args) {
|
||||
Ok(Some(instance)) => instance,
|
||||
Ok(None) => bug!(
|
||||
"could not resolve ({:?}, {:?}) to a specific offload instance",
|
||||
target_id,
|
||||
target_args
|
||||
),
|
||||
Err(_) => {
|
||||
// An error has already been emitted
|
||||
return;
|
||||
}
|
||||
};
|
||||
|
||||
let args = get_args_from_tuple(bx, args[1], fn_target);
|
||||
let target_symbol = symbol_name_for_instance_in_crate(tcx, fn_target, LOCAL_CRATE);
|
||||
|
||||
let offload_entry_ty = TgtOffloadEntry::new_decl(&cx);
|
||||
|
||||
let sig = tcx.fn_sig(fn_target.def_id()).skip_binder().skip_binder();
|
||||
let inputs = sig.inputs();
|
||||
|
||||
let metadata = inputs.iter().map(|ty| OffloadMetadata::from_ty(tcx, *ty)).collect::<Vec<_>>();
|
||||
|
||||
let types = inputs.iter().map(|ty| cx.layout_of(*ty).llvm_type(cx)).collect::<Vec<_>>();
|
||||
|
||||
let offload_data = crate::builder::gpu_offload::gen_define_handling(
|
||||
cx,
|
||||
offload_entry_ty,
|
||||
&metadata,
|
||||
&types,
|
||||
&target_symbol,
|
||||
);
|
||||
|
||||
// FIXME(Sa4dUs): pass the original builder once we separate kernel launch logic from globals
|
||||
let bb = unsafe { llvm::LLVMGetInsertBlock(bx.llbuilder) };
|
||||
crate::builder::gpu_offload::gen_call_handling(cx, bb, &offload_data, &args, &types, &metadata);
|
||||
}
|
||||
|
||||
fn get_args_from_tuple<'ll, 'tcx>(
|
||||
bx: &mut Builder<'_, 'll, 'tcx>,
|
||||
tuple_op: OperandRef<'tcx, &'ll Value>,
|
||||
|
|
|
|||
|
|
@ -1160,13 +1160,9 @@ unsafe extern "C" {
|
|||
) -> &'a BasicBlock;
|
||||
|
||||
// Operations on instructions
|
||||
pub(crate) fn LLVMGetInstructionParent(Inst: &Value) -> &BasicBlock;
|
||||
pub(crate) fn LLVMGetCalledValue(CallInst: &Value) -> Option<&Value>;
|
||||
pub(crate) fn LLVMIsAInstruction(Val: &Value) -> Option<&Value>;
|
||||
pub(crate) fn LLVMGetFirstBasicBlock(Fn: &Value) -> &BasicBlock;
|
||||
pub(crate) fn LLVMGetOperand(Val: &Value, Index: c_uint) -> Option<&Value>;
|
||||
pub(crate) fn LLVMGetNextInstruction(Val: &Value) -> Option<&Value>;
|
||||
pub(crate) fn LLVMInstructionEraseFromParent(Val: &Value);
|
||||
|
||||
// Operations on call sites
|
||||
pub(crate) fn LLVMSetInstructionCallConv(Instr: &Value, CC: c_uint);
|
||||
|
|
@ -2484,6 +2480,8 @@ unsafe extern "C" {
|
|||
|
||||
pub(crate) fn LLVMRustPositionBuilderPastAllocas<'a>(B: &Builder<'a>, Fn: &'a Value);
|
||||
pub(crate) fn LLVMRustPositionBuilderAtStart<'a>(B: &Builder<'a>, BB: &'a BasicBlock);
|
||||
pub(crate) fn LLVMRustGetInsertPoint<'a>(B: &Builder<'a>) -> &'a Value;
|
||||
pub(crate) fn LLVMRustRestoreInsertPoint<'a>(B: &Builder<'a>, IP: &'a Value);
|
||||
|
||||
pub(crate) fn LLVMRustSetModulePICLevel(M: &Module);
|
||||
pub(crate) fn LLVMRustSetModulePIELevel(M: &Module);
|
||||
|
|
|
|||
|
|
@ -43,6 +43,14 @@ pub(crate) fn AddFunctionAttributes<'ll>(
|
|||
}
|
||||
}
|
||||
|
||||
pub(crate) fn HasStringAttribute<'ll>(llfn: &'ll Value, name: &str) -> bool {
|
||||
unsafe { LLVMRustHasFnAttribute(llfn, name.as_c_char_ptr(), name.len()) }
|
||||
}
|
||||
|
||||
pub(crate) fn RemoveStringAttrFromFn<'ll>(llfn: &'ll Value, name: &str) {
|
||||
unsafe { LLVMRustRemoveFnAttribute(llfn, name.as_c_char_ptr(), name.len()) }
|
||||
}
|
||||
|
||||
pub(crate) fn AddCallSiteAttributes<'ll>(
|
||||
callsite: &'ll Value,
|
||||
idx: AttributePlace,
|
||||
|
|
|
|||
|
|
@ -334,6 +334,9 @@ fn process_builtin_attrs(
|
|||
codegen_fn_attrs.patchable_function_entry =
|
||||
parse_patchable_function_entry(tcx, attr);
|
||||
}
|
||||
sym::rustc_offload_kernel => {
|
||||
codegen_fn_attrs.flags |= CodegenFnAttrFlags::OFFLOAD_KERNEL
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1117,6 +1117,11 @@ pub static BUILTIN_ATTRIBUTES: &[BuiltinAttribute] = &[
|
|||
rustc_autodiff, Normal,
|
||||
template!(Word, List: &[r#""...""#]), DuplicatesOk,
|
||||
EncodeCrossCrate::Yes,
|
||||
),
|
||||
rustc_attr!(
|
||||
rustc_offload_kernel, Normal,
|
||||
template!(Word), DuplicatesOk,
|
||||
EncodeCrossCrate::Yes,
|
||||
),
|
||||
// Traces that are left when `cfg` and `cfg_attr` attributes are expanded.
|
||||
// The attributes are not gated, to avoid stability errors, but they cannot be used in stable
|
||||
|
|
|
|||
|
|
@ -163,6 +163,7 @@ fn intrinsic_operation_unsafety(tcx: TyCtxt<'_>, intrinsic_id: LocalDefId) -> hi
|
|||
| sym::minnumf128
|
||||
| sym::mul_with_overflow
|
||||
| sym::needs_drop
|
||||
| sym::offload
|
||||
| sym::offset_of
|
||||
| sym::overflow_checks
|
||||
| sym::powf16
|
||||
|
|
@ -313,6 +314,7 @@ pub(crate) fn check_intrinsic_type(
|
|||
let type_id = tcx.type_of(tcx.lang_items().type_id().unwrap()).instantiate_identity();
|
||||
(0, 0, vec![type_id, type_id], tcx.types.bool)
|
||||
}
|
||||
sym::offload => (3, 0, vec![param(0), param(1)], param(2)),
|
||||
sym::offset => (2, 0, vec![param(0), param(1)], param(0)),
|
||||
sym::arith_offset => (
|
||||
1,
|
||||
|
|
|
|||
|
|
@ -1436,6 +1436,39 @@ extern "C" void LLVMRustPositionAfter(LLVMBuilderRef B, LLVMValueRef Instr) {
|
|||
}
|
||||
}
|
||||
|
||||
extern "C" LLVMValueRef LLVMRustGetInsertPoint(LLVMBuilderRef B) {
|
||||
llvm::IRBuilderBase &IRB = *unwrap(B);
|
||||
|
||||
llvm::IRBuilderBase::InsertPoint ip = IRB.saveIP();
|
||||
llvm::BasicBlock *BB = ip.getBlock();
|
||||
|
||||
if (!BB)
|
||||
return nullptr;
|
||||
|
||||
auto it = ip.getPoint();
|
||||
|
||||
if (it == BB->end())
|
||||
return nullptr;
|
||||
|
||||
llvm::Instruction *I = &*it;
|
||||
return wrap(I);
|
||||
}
|
||||
|
||||
extern "C" void LLVMRustRestoreInsertPoint(LLVMBuilderRef B,
|
||||
LLVMValueRef Instr) {
|
||||
llvm::IRBuilderBase &IRB = *unwrap(B);
|
||||
|
||||
if (!Instr) {
|
||||
llvm::BasicBlock *BB = IRB.GetInsertBlock();
|
||||
if (BB)
|
||||
IRB.SetInsertPoint(BB);
|
||||
return;
|
||||
}
|
||||
|
||||
llvm::Instruction *I = unwrap<llvm::Instruction>(Instr);
|
||||
IRB.SetInsertPoint(I);
|
||||
}
|
||||
|
||||
extern "C" LLVMValueRef
|
||||
LLVMRustGetFunctionCall(LLVMValueRef Fn, const char *Name, size_t NameLen) {
|
||||
auto targetName = StringRef(Name, NameLen);
|
||||
|
|
|
|||
|
|
@ -190,6 +190,8 @@ bitflags::bitflags! {
|
|||
const NO_BUILTINS = 1 << 15;
|
||||
/// Marks foreign items, to make `contains_extern_indicator` cheaper.
|
||||
const FOREIGN_ITEM = 1 << 16;
|
||||
/// `#[rustc_offload_kernel]`: indicates that this is an offload kernel, an extra ptr arg will be added.
|
||||
const OFFLOAD_KERNEL = 1 << 17;
|
||||
}
|
||||
}
|
||||
rustc_data_structures::external_bitflags_debug! { CodegenFnAttrFlags }
|
||||
|
|
|
|||
|
|
@ -129,6 +129,7 @@ pub mod fast_reject;
|
|||
pub mod inhabitedness;
|
||||
pub mod layout;
|
||||
pub mod normalize_erasing_regions;
|
||||
pub mod offload_meta;
|
||||
pub mod pattern;
|
||||
pub mod print;
|
||||
pub mod relate;
|
||||
|
|
|
|||
119
compiler/rustc_middle/src/ty/offload_meta.rs
Normal file
119
compiler/rustc_middle/src/ty/offload_meta.rs
Normal file
|
|
@ -0,0 +1,119 @@
|
|||
use bitflags::bitflags;
|
||||
|
||||
use crate::ty::{self, PseudoCanonicalInput, Ty, TyCtxt, TypingEnv};
|
||||
|
||||
/// Host-side metadata describing how a single kernel argument is transferred
/// to/from the device by the offload runtime.
pub struct OffloadMetadata {
    /// Size in bytes of the data moved for this argument; pointers and
    /// references are measured by their pointee (see `get_payload_size`).
    pub payload_size: u64,
    /// OpenMP-style mapping flags (`TO`, `FROM`, ...) chosen from the
    /// argument's type (see `MappingFlags::from_ty`).
    pub mode: MappingFlags,
}
|
||||
|
||||
bitflags! {
    /// Mirrors `OpenMPOffloadMappingFlags` from Clang/OpenMP.
    ///
    /// NOTE: these numeric values form an ABI with `libomptarget`; they must
    /// stay bit-for-bit in sync with Clang's definitions and must not be
    /// renumbered independently.
    #[derive(Debug, Copy, Clone)]
    #[repr(transparent)]
    pub struct MappingFlags: u64 {
        /// No flags.
        const NONE = 0x0;
        /// Allocate memory on the device and move data from host to device.
        const TO = 0x01;
        /// Allocate memory on the device and move data from device to host.
        const FROM = 0x02;
        /// Always perform the requested mapping action, even if already mapped.
        const ALWAYS = 0x04;
        /// Delete the element from the device environment, ignoring ref count.
        const DELETE = 0x08;
        /// The element being mapped is a pointer-pointee pair.
        const PTR_AND_OBJ = 0x10;
        /// The base address should be passed to the target kernel as argument.
        const TARGET_PARAM = 0x20;
        /// The runtime must return the device pointer.
        const RETURN_PARAM = 0x40;
        /// The reference being passed is a pointer to private data.
        const PRIVATE = 0x80;
        /// Pass the element by value.
        const LITERAL = 0x100;
        /// Implicit map (generated by compiler, not explicit in code).
        const IMPLICIT = 0x200;
        /// Hint to allocate memory close to the target device.
        const CLOSE = 0x400;
        /// Reserved (0x800 in OpenMP for XLC compatibility).
        const RESERVED = 0x800;
        /// Require that the data is already allocated on the device.
        const PRESENT = 0x1000;
        /// Increment/decrement a separate ref counter (OpenACC compatibility).
        const OMPX_HOLD = 0x2000;
        /// Used for non-contiguous list items in target update.
        const NON_CONTIG = 0x100000000000;
        /// 16 MSBs indicate membership in a struct.
        const MEMBER_OF = 0xffff000000000000;
    }
}
|
||||
|
||||
impl OffloadMetadata {
|
||||
pub fn from_ty<'tcx>(tcx: TyCtxt<'tcx>, ty: Ty<'tcx>) -> Self {
|
||||
OffloadMetadata {
|
||||
payload_size: get_payload_size(tcx, ty),
|
||||
mode: MappingFlags::from_ty(tcx, ty),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// FIXME(Sa4dUs): implement a solid logic to determine the payload size
|
||||
fn get_payload_size<'tcx>(tcx: TyCtxt<'tcx>, ty: Ty<'tcx>) -> u64 {
|
||||
match ty.kind() {
|
||||
ty::RawPtr(inner, _) | ty::Ref(_, inner, _) => get_payload_size(tcx, *inner),
|
||||
_ => tcx
|
||||
.layout_of(PseudoCanonicalInput {
|
||||
typing_env: TypingEnv::fully_monomorphized(),
|
||||
value: ty,
|
||||
})
|
||||
.unwrap()
|
||||
.size
|
||||
.bytes(),
|
||||
}
|
||||
}
|
||||
|
||||
impl MappingFlags {
|
||||
fn from_ty<'tcx>(tcx: TyCtxt<'tcx>, ty: Ty<'tcx>) -> Self {
|
||||
use rustc_ast::Mutability::*;
|
||||
|
||||
match ty.kind() {
|
||||
ty::Bool
|
||||
| ty::Char
|
||||
| ty::Int(_)
|
||||
| ty::Uint(_)
|
||||
| ty::Float(_)
|
||||
| ty::Adt(_, _)
|
||||
| ty::Tuple(_)
|
||||
| ty::Array(_, _)
|
||||
| ty::Alias(_, _)
|
||||
| ty::Param(_) => MappingFlags::TO,
|
||||
|
||||
ty::RawPtr(_, Not) | ty::Ref(_, _, Not) => MappingFlags::TO,
|
||||
|
||||
ty::RawPtr(_, Mut) | ty::Ref(_, _, Mut) => MappingFlags::TO | MappingFlags::FROM,
|
||||
|
||||
ty::Slice(_) | ty::Str | ty::Dynamic(_, _) => MappingFlags::TO | MappingFlags::FROM,
|
||||
|
||||
ty::Foreign(_) | ty::Pat(_, _) | ty::UnsafeBinder(_) => {
|
||||
MappingFlags::TO | MappingFlags::FROM
|
||||
}
|
||||
|
||||
ty::FnDef(_, _)
|
||||
| ty::FnPtr(_, _)
|
||||
| ty::Closure(_, _)
|
||||
| ty::CoroutineClosure(_, _)
|
||||
| ty::Coroutine(_, _)
|
||||
| ty::CoroutineWitness(_, _)
|
||||
| ty::Never
|
||||
| ty::Bound(_, _)
|
||||
| ty::Placeholder(_)
|
||||
| ty::Infer(_)
|
||||
| ty::Error(_) => {
|
||||
tcx.dcx()
|
||||
.span_err(rustc_span::DUMMY_SP, format!("type `{ty:?}` cannot be offloaded"));
|
||||
MappingFlags::empty()
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1584,6 +1584,7 @@ symbols! {
|
|||
object_safe_for_dispatch,
|
||||
of,
|
||||
off,
|
||||
offload,
|
||||
offset,
|
||||
offset_of,
|
||||
offset_of_enum,
|
||||
|
|
@ -1966,6 +1967,7 @@ symbols! {
|
|||
rustc_objc_class,
|
||||
rustc_objc_selector,
|
||||
rustc_object_lifetime_default,
|
||||
rustc_offload_kernel,
|
||||
rustc_on_unimplemented,
|
||||
rustc_outlives,
|
||||
rustc_paren_sugar,
|
||||
|
|
|
|||
|
|
@ -3324,6 +3324,38 @@ pub const fn copysignf128(x: f128, y: f128) -> f128;
|
|||
#[rustc_intrinsic]
|
||||
pub const fn autodiff<F, G, T: crate::marker::Tuple, R>(f: F, df: G, args: T) -> R;
|
||||
|
||||
/// Generates the LLVM body of a wrapper function to offload a kernel `f`.
|
||||
///
|
||||
/// Type Parameters:
|
||||
/// - `F`: The kernel to offload. Must be a function item.
|
||||
/// - `T`: A tuple of arguments passed to `f`.
|
||||
/// - `R`: The return type of the kernel.
|
||||
///
|
||||
/// Example usage (pseudocode):
|
||||
///
|
||||
/// ```rust,ignore (pseudocode)
|
||||
/// fn kernel(x: *mut [f64; 128]) {
|
||||
/// core::intrinsics::offload(kernel_1, (x,))
|
||||
/// }
|
||||
///
|
||||
/// #[cfg(target_os = "linux")]
|
||||
/// extern "C" {
|
||||
/// pub fn kernel_1(array_b: *mut [f64; 128]);
|
||||
/// }
|
||||
///
|
||||
/// #[cfg(not(target_os = "linux"))]
|
||||
/// #[rustc_offload_kernel]
|
||||
/// extern "gpu-kernel" fn kernel_1(x: *mut [f64; 128]) {
|
||||
/// unsafe { (*x)[0] = 21.0 };
|
||||
/// }
|
||||
/// ```
|
||||
///
|
||||
/// For reference, see the Clang documentation on offloading:
|
||||
/// <https://clang.llvm.org/docs/OffloadingDesign.html>.
|
||||
#[rustc_nounwind]
|
||||
#[rustc_intrinsic]
|
||||
pub const fn offload<F, T: crate::marker::Tuple, R>(f: F, args: T) -> R;
|
||||
|
||||
/// Inform Miri that a given pointer definitely has a certain alignment.
|
||||
#[cfg(miri)]
|
||||
#[rustc_allow_const_fn_unstable(const_eval_select)]
|
||||
|
|
|
|||
|
|
@ -5,6 +5,8 @@ We currently work on launching the following Rust kernel on the GPU. To follow a
|
|||
|
||||
```rust
|
||||
#![feature(abi_gpu_kernel)]
|
||||
#![feature(rustc_attrs)]
|
||||
#![feature(core_intrinsics)]
|
||||
#![no_std]
|
||||
|
||||
#[cfg(target_os = "linux")]
|
||||
|
|
@ -12,6 +14,7 @@ extern crate libc;
|
|||
#[cfg(target_os = "linux")]
|
||||
use libc::c_char;
|
||||
|
||||
#[cfg(target_os = "linux")]
|
||||
use core::mem;
|
||||
|
||||
#[panic_handler]
|
||||
|
|
@ -38,7 +41,7 @@ fn main() {
|
|||
}
|
||||
|
||||
unsafe {
|
||||
kernel_1(array_c);
|
||||
kernel(array_c);
|
||||
}
|
||||
core::hint::black_box(&array_c);
|
||||
unsafe {
|
||||
|
|
@ -52,6 +55,11 @@ fn main() {
|
|||
}
|
||||
}
|
||||
|
||||
#[inline(never)]
|
||||
unsafe fn kernel(x: *mut [f64; 256]) {
|
||||
core::intrinsics::offload(kernel_1, (x,))
|
||||
}
|
||||
|
||||
#[cfg(target_os = "linux")]
|
||||
unsafe extern "C" {
|
||||
pub fn kernel_1(array_b: *mut [f64; 256]);
|
||||
|
|
@ -60,6 +68,7 @@ unsafe extern "C" {
|
|||
#[cfg(not(target_os = "linux"))]
|
||||
#[unsafe(no_mangle)]
|
||||
#[inline(never)]
|
||||
#[rustc_offload_kernel]
|
||||
pub extern "gpu-kernel" fn kernel_1(x: *mut [f64; 256]) {
|
||||
unsafe { (*x)[0] = 21.0 };
|
||||
}
|
||||
|
|
|
|||
|
|
@ -11,6 +11,7 @@
|
|||
// when inside of a function called main. This, too, is a temporary workaround for not having a
|
||||
// frontend.
|
||||
|
||||
#![feature(core_intrinsics)]
|
||||
#![no_main]
|
||||
|
||||
#[unsafe(no_mangle)]
|
||||
|
|
@ -25,73 +26,70 @@ fn main() {
|
|||
// CHECK: %struct.__tgt_bin_desc = type { i32, ptr, ptr, ptr }
|
||||
// CHECK: %struct.__tgt_kernel_arguments = type { i32, i32, ptr, ptr, ptr, ptr, ptr, ptr, i64, i64, [3 x i32], [3 x i32], i32 }
|
||||
|
||||
// CHECK: @.offload_sizes.1 = private unnamed_addr constant [1 x i64] [i64 1024]
|
||||
// CHECK: @.offload_maptypes.1 = private unnamed_addr constant [1 x i64] [i64 35]
|
||||
// CHECK: @.kernel_1.region_id = weak unnamed_addr constant i8 0
|
||||
// CHECK: @.offloading.entry_name.1 = internal unnamed_addr constant [9 x i8] c"kernel_1\00", section ".llvm.rodata.offloading", align 1
|
||||
// CHECK: @.offloading.entry.kernel_1 = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 1, i32 0, ptr @.kernel_1.region_id, ptr @.offloading.entry_name.1, i64 0, i64 0, ptr null }, section "llvm_offload_entries", align 8
|
||||
// CHECK: @0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1
|
||||
// CHECK: @1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @0 }, align 8
|
||||
// CHECK: @.offload_sizes._kernel_1 = private unnamed_addr constant [1 x i64] [i64 1024]
|
||||
// CHECK: @.offload_maptypes._kernel_1 = private unnamed_addr constant [1 x i64] [i64 35]
|
||||
// CHECK: @._kernel_1.region_id = internal unnamed_addr constant i8 0
|
||||
// CHECK: @.offloading.entry_name._kernel_1 = internal unnamed_addr constant [10 x i8] c"_kernel_1\00", section ".llvm.rodata.offloading", align 1
|
||||
// CHECK: @.offloading.entry._kernel_1 = internal constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 1, i32 0, ptr @._kernel_1.region_id, ptr @.offloading.entry_name._kernel_1, i64 0, i64 0, ptr null }, section "llvm_offload_entries", align 8
|
||||
|
||||
// CHECK: @anon.{{.*}}.0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1
|
||||
// CHECK: @anon.{{.*}}.1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @anon.{{.*}}.0 }, align 8
|
||||
|
||||
// CHECK: Function Attrs:
|
||||
// CHECK-NEXT: define{{( dso_local)?}} void @main()
|
||||
// CHECK-NEXT: start:
|
||||
// CHECK-NEXT: %0 = alloca [8 x i8], align 8
|
||||
// CHECK-NEXT: %x = alloca [1024 x i8], align 16
|
||||
// CHECK: call void @kernel_1(ptr noalias noundef nonnull align 4 dereferenceable(1024) %x)
|
||||
// CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %0)
|
||||
// CHECK-NEXT: store ptr %x, ptr %0, align 8
|
||||
// CHECK-NEXT: call void asm sideeffect "", "r,~{memory}"(ptr nonnull %0) #4, !srcloc !4
|
||||
// CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %0)
|
||||
// CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 1024, ptr nonnull %x)
|
||||
// CHECK-NEXT: ret void
|
||||
// CHECK-NEXT: }
|
||||
|
||||
// CHECK: define{{( dso_local)?}} void @kernel_1(ptr noalias noundef align 4 dereferenceable(1024) %x)
|
||||
// CHECK-NEXT: start:
|
||||
// CHECK-NEXT: %EmptyDesc = alloca %struct.__tgt_bin_desc, align 8
|
||||
// CHECK-NEXT: %.offload_baseptrs = alloca [1 x ptr], align 8
|
||||
// CHECK-NEXT: %.offload_ptrs = alloca [1 x ptr], align 8
|
||||
// CHECK-NEXT: %.offload_sizes = alloca [1 x i64], align 8
|
||||
// CHECK-NEXT: %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8
|
||||
// CHECK: call void @llvm.memset.p0.i64(ptr align 8 %EmptyDesc, i8 0, i64 32, i1 false)
|
||||
// CHECK-NEXT: %1 = getelementptr inbounds float, ptr %x, i32 0
|
||||
// CHECK-NEXT: call void @__tgt_register_lib(ptr %EmptyDesc)
|
||||
// CHECK-NEXT: %dummy = load volatile ptr, ptr @.offload_sizes._kernel_1, align 8
|
||||
// CHECK-NEXT: %dummy1 = load volatile ptr, ptr @.offloading.entry._kernel_1, align 8
|
||||
// CHECK-NEXT: call void @llvm.memset.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) %EmptyDesc, i8 0, i64 32, i1 false)
|
||||
// CHECK-NEXT: call void @__tgt_register_lib(ptr nonnull %EmptyDesc)
|
||||
// CHECK-NEXT: call void @__tgt_init_all_rtls()
|
||||
// CHECK-NEXT: %2 = getelementptr inbounds [1 x ptr], ptr %.offload_baseptrs, i32 0, i32 0
|
||||
// CHECK-NEXT: store ptr %x, ptr %2, align 8
|
||||
// CHECK-NEXT: %3 = getelementptr inbounds [1 x ptr], ptr %.offload_ptrs, i32 0, i32 0
|
||||
// CHECK-NEXT: store ptr %1, ptr %3, align 8
|
||||
// CHECK-NEXT: %4 = getelementptr inbounds [1 x i64], ptr %.offload_sizes, i32 0, i32 0
|
||||
// CHECK-NEXT: store i64 1024, ptr %4, align 8
|
||||
// CHECK-NEXT: %5 = getelementptr inbounds [1 x ptr], ptr %.offload_baseptrs, i32 0, i32 0
|
||||
// CHECK-NEXT: %6 = getelementptr inbounds [1 x ptr], ptr %.offload_ptrs, i32 0, i32 0
|
||||
// CHECK-NEXT: %7 = getelementptr inbounds [1 x i64], ptr %.offload_sizes, i32 0, i32 0
|
||||
// CHECK-NEXT: call void @__tgt_target_data_begin_mapper(ptr @1, i64 -1, i32 1, ptr %5, ptr %6, ptr %7, ptr @.offload_maptypes.1, ptr null, ptr null)
|
||||
// CHECK-NEXT: %8 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 0
|
||||
// CHECK-NEXT: store i32 3, ptr %8, align 4
|
||||
// CHECK-NEXT: %9 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 1
|
||||
// CHECK-NEXT: store i32 1, ptr %9, align 4
|
||||
// CHECK-NEXT: %10 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 2
|
||||
// CHECK-NEXT: store ptr %5, ptr %10, align 8
|
||||
// CHECK-NEXT: %11 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 3
|
||||
// CHECK-NEXT: store ptr %6, ptr %11, align 8
|
||||
// CHECK-NEXT: %12 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 4
|
||||
// CHECK-NEXT: store ptr %7, ptr %12, align 8
|
||||
// CHECK-NEXT: %13 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 5
|
||||
// CHECK-NEXT: store ptr @.offload_maptypes.1, ptr %13, align 8
|
||||
// CHECK-NEXT: %14 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 6
|
||||
// CHECK-NEXT: store ptr null, ptr %14, align 8
|
||||
// CHECK-NEXT: %15 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 7
|
||||
// CHECK-NEXT: store ptr null, ptr %15, align 8
|
||||
// CHECK-NEXT: %16 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 8
|
||||
// CHECK-NEXT: store i64 0, ptr %16, align 8
|
||||
// CHECK-NEXT: %17 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 9
|
||||
// CHECK-NEXT: store i64 0, ptr %17, align 8
|
||||
// CHECK-NEXT: %18 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 10
|
||||
// CHECK-NEXT: store [3 x i32] [i32 2097152, i32 0, i32 0], ptr %18, align 4
|
||||
// CHECK-NEXT: %19 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 11
|
||||
// CHECK-NEXT: store [3 x i32] [i32 256, i32 0, i32 0], ptr %19, align 4
|
||||
// CHECK-NEXT: %20 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 12
|
||||
// CHECK-NEXT: store i32 0, ptr %20, align 4
|
||||
// CHECK-NEXT: %21 = call i32 @__tgt_target_kernel(ptr @1, i64 -1, i32 2097152, i32 256, ptr @.kernel_1.region_id, ptr %kernel_args)
|
||||
// CHECK-NEXT: %22 = getelementptr inbounds [1 x ptr], ptr %.offload_baseptrs, i32 0, i32 0
|
||||
// CHECK-NEXT: %23 = getelementptr inbounds [1 x ptr], ptr %.offload_ptrs, i32 0, i32 0
|
||||
// CHECK-NEXT: %24 = getelementptr inbounds [1 x i64], ptr %.offload_sizes, i32 0, i32 0
|
||||
// CHECK-NEXT: call void @__tgt_target_data_end_mapper(ptr @1, i64 -1, i32 1, ptr %22, ptr %23, ptr %24, ptr @.offload_maptypes.1, ptr null, ptr null)
|
||||
// CHECK-NEXT: call void @__tgt_unregister_lib(ptr %EmptyDesc)
|
||||
// CHECK: store ptr %x, ptr %0, align 8
|
||||
// CHECK-NEXT: call void asm sideeffect "", "r,~{memory}"(ptr nonnull %0)
|
||||
// CHECK: ret void
|
||||
// CHECK-NEXT: store ptr %x, ptr %.offload_baseptrs, align 8
|
||||
// CHECK-NEXT: store ptr %x, ptr %.offload_ptrs, align 8
|
||||
// CHECK-NEXT: store i64 1024, ptr %.offload_sizes, align 8
|
||||
// CHECK-NEXT: call void @__tgt_target_data_begin_mapper(ptr nonnull @anon.{{.*}}.1, i64 -1, i32 1, ptr nonnull %.offload_baseptrs, ptr nonnull %.offload_ptrs, ptr nonnull %.offload_sizes, ptr nonnull @.offload_maptypes._kernel_1, ptr null, ptr null)
|
||||
// CHECK-NEXT: store i32 3, ptr %kernel_args, align 8
|
||||
// CHECK-NEXT: %0 = getelementptr inbounds nuw i8, ptr %kernel_args, i64 4
|
||||
// CHECK-NEXT: store i32 1, ptr %0, align 4
|
||||
// CHECK-NEXT: %1 = getelementptr inbounds nuw i8, ptr %kernel_args, i64 8
|
||||
// CHECK-NEXT: store ptr %.offload_baseptrs, ptr %1, align 8
|
||||
// CHECK-NEXT: %2 = getelementptr inbounds nuw i8, ptr %kernel_args, i64 16
|
||||
// CHECK-NEXT: store ptr %.offload_ptrs, ptr %2, align 8
|
||||
// CHECK-NEXT: %3 = getelementptr inbounds nuw i8, ptr %kernel_args, i64 24
|
||||
// CHECK-NEXT: store ptr %.offload_sizes, ptr %3, align 8
|
||||
// CHECK-NEXT: %4 = getelementptr inbounds nuw i8, ptr %kernel_args, i64 32
|
||||
// CHECK-NEXT: store ptr @.offload_maptypes._kernel_1, ptr %4, align 8
|
||||
// CHECK-NEXT: %5 = getelementptr inbounds nuw i8, ptr %kernel_args, i64 40
|
||||
// CHECK-NEXT: %6 = getelementptr inbounds nuw i8, ptr %kernel_args, i64 72
|
||||
// CHECK-NEXT: call void @llvm.memset.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) %5, i8 0, i64 32, i1 false)
|
||||
// CHECK-NEXT: store <4 x i32> <i32 2097152, i32 0, i32 0, i32 256>, ptr %6, align 8
|
||||
// CHECK-NEXT: %.fca.1.gep3 = getelementptr inbounds nuw i8, ptr %kernel_args, i64 88
|
||||
// CHECK-NEXT: store i32 0, ptr %.fca.1.gep3, align 8
|
||||
// CHECK-NEXT: %.fca.2.gep4 = getelementptr inbounds nuw i8, ptr %kernel_args, i64 92
|
||||
// CHECK-NEXT: store i32 0, ptr %.fca.2.gep4, align 4
|
||||
// CHECK-NEXT: %7 = getelementptr inbounds nuw i8, ptr %kernel_args, i64 96
|
||||
// CHECK-NEXT: store i32 0, ptr %7, align 8
|
||||
// CHECK-NEXT: %8 = call i32 @__tgt_target_kernel(ptr nonnull @anon.{{.*}}.1, i64 -1, i32 2097152, i32 256, ptr nonnull @._kernel_1.region_id, ptr nonnull %kernel_args)
|
||||
// CHECK-NEXT: call void @__tgt_target_data_end_mapper(ptr nonnull @anon.{{.*}}.1, i64 -1, i32 1, ptr nonnull %.offload_baseptrs, ptr nonnull %.offload_ptrs, ptr nonnull %.offload_sizes, ptr nonnull @.offload_maptypes._kernel_1, ptr null, ptr null)
|
||||
// CHECK-NEXT: call void @__tgt_unregister_lib(ptr nonnull %EmptyDesc)
|
||||
// CHECK-NEXT: ret void
|
||||
// CHECK-NEXT: }
|
||||
|
||||
// CHECK: Function Attrs: nounwind
|
||||
|
|
@ -100,6 +98,12 @@ fn main() {
|
|||
#[unsafe(no_mangle)]
|
||||
#[inline(never)]
|
||||
pub fn kernel_1(x: &mut [f32; 256]) {
|
||||
core::intrinsics::offload(_kernel_1, (x,))
|
||||
}
|
||||
|
||||
#[unsafe(no_mangle)]
|
||||
#[inline(never)]
|
||||
pub fn _kernel_1(x: &mut [f32; 256]) {
|
||||
for i in 0..256 {
|
||||
x[i] = 21.0;
|
||||
}
|
||||
|
|
|
|||
6
tests/ui/offload/check_config.fail.stderr
Normal file
6
tests/ui/offload/check_config.fail.stderr
Normal file
|
|
@ -0,0 +1,6 @@
|
|||
error: using the offload feature requires -Z offload=Enable
|
||||
|
||||
error: using the offload feature requires -C lto=fat
|
||||
|
||||
error: aborting due to 2 previous errors
|
||||
|
||||
23
tests/ui/offload/check_config.rs
Normal file
23
tests/ui/offload/check_config.rs
Normal file
|
|
@ -0,0 +1,23 @@
|
|||
//@ revisions: pass fail
|
||||
//@ no-prefer-dynamic
|
||||
//@ needs-enzyme
|
||||
//@[pass] build-pass
|
||||
//@[fail] build-fail
|
||||
//@[pass] compile-flags: -Zunstable-options -Zoffload=Enable -Clto=fat --emit=metadata
|
||||
//@[fail] compile-flags: -Clto=thin
|
||||
|
||||
//[fail]~? ERROR: using the offload feature requires -Z offload=Enable
|
||||
//[fail]~? ERROR: using the offload feature requires -C lto=fat
|
||||
|
||||
#![feature(core_intrinsics)]
|
||||
|
||||
fn main() {
    let mut x = [3.0; 256];
    kernel_1(&mut x);
}

// Host-side wrapper: lowering the `offload` intrinsic is what triggers the
// `-Z offload=Enable` / `-C lto=fat` configuration checks under test.
fn kernel_1(x: &mut [f32; 256]) {
    core::intrinsics::offload(_kernel_1, (x,))
}

// Device-kernel stand-in; intentionally empty — only the call site matters
// for this configuration-check test.
fn _kernel_1(x: &mut [f32; 256]) {}
|
||||
Loading…
Add table
Add a link
Reference in a new issue