Rollup merge of #147936 - Sa4dUs:offload-intrinsic, r=ZuseZ4

Offload intrinsic

This PR implements the minimal mechanisms required to run a small subset of offload kernels without relying on hardcoded kernel names or metadata.

- `offload(kernel, (..args))`: an intrinsic that generates the necessary host-side LLVM-IR code.
- `rustc_offload_kernel`: a builtin attribute that marks device kernels so codegen can identify them; it is lowered to an `offload-kernel` string attribute on the LLVM function.

Example usage (pseudocode):
```rust
fn kernel(x: *mut [f64; 128]) {
    core::intrinsics::offload(kernel_1, (x,))
}

#[cfg(target_os = "linux")]
extern "C" {
    pub fn kernel_1(array_b: *mut [f64; 128]);
}

#[cfg(not(target_os = "linux"))]
#[rustc_offload_kernel]
extern "gpu-kernel" fn kernel_1(x: *mut [f64; 128]) {
    unsafe { (*x)[0] = 21.0 };
}
```
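Note that the intrinsic is gated behind `-Zoffload=Enable` and requires `-Clto=fat`; otherwise the new `codegen_llvm_offload_without_enable` / `codegen_llvm_offload_without_fat_lto` diagnostics are emitted (see the diffs and the UI test at the end of this commit).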
Commit 2b150f2c65 by Stuart Cook, 2025-11-26 23:32:03 +11:00, committed via GitHub
23 changed files with 529 additions and 178 deletions


@ -18,6 +18,9 @@ codegen_llvm_lto_bitcode_from_rlib = failed to get bitcode from object file for
codegen_llvm_mismatch_data_layout =
data-layout for target `{$rustc_target}`, `{$rustc_layout}`, differs from LLVM target's `{$llvm_target}` default layout, `{$llvm_layout}`
codegen_llvm_offload_without_enable = using the offload feature requires -Z offload=Enable
codegen_llvm_offload_without_fat_lto = using the offload feature requires -C lto=fat
codegen_llvm_parse_bitcode = failed to parse bitcode for LTO module
codegen_llvm_parse_bitcode_with_llvm_err = failed to parse bitcode for LTO module: {$llvm_err}


@ -30,6 +30,14 @@ pub(crate) fn apply_to_callsite(callsite: &Value, idx: AttributePlace, attrs: &[
}
}
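/// Returns `true` if `llfn` carries the given LLVM string attribute.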
pub(crate) fn has_string_attr(llfn: &Value, name: &str) -> bool {
llvm::HasStringAttribute(llfn, name)
}
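/// Removes the given LLVM string attribute from `llfn`.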
pub(crate) fn remove_string_attr_from_llfn(llfn: &Value, name: &str) {
llvm::RemoveStringAttrFromFn(llfn, name);
}
/// Get LLVM attribute for the provided inline heuristic.
pub(crate) fn inline_attr<'ll, 'tcx>(
cx: &SimpleCx<'ll>,
@ -408,6 +416,10 @@ pub(crate) fn llfn_attrs_from_instance<'ll, 'tcx>(
to_add.push(llvm::CreateAttrString(cx.llcx, "no-builtins"));
}
if codegen_fn_attrs.flags.contains(CodegenFnAttrFlags::OFFLOAD_KERNEL) {
to_add.push(llvm::CreateAttrString(cx.llcx, "offload-kernel"))
}
if codegen_fn_attrs.flags.contains(CodegenFnAttrFlags::COLD) {
to_add.push(AttributeKind::Cold.create_attr(cx.llcx));
}


@ -26,7 +26,7 @@ use crate::back::write::{
};
use crate::errors::{LlvmError, LtoBitcodeFromRlib};
use crate::llvm::{self, build_string};
use crate::{LlvmCodegenBackend, ModuleLlvm, SimpleCx};
use crate::{LlvmCodegenBackend, ModuleLlvm};
/// We keep track of the computed LTO cache keys from the previous
/// session to determine which CGUs we can reuse.
@ -601,7 +601,6 @@ pub(crate) fn run_pass_manager(
// We then run the llvm_optimize function a second time, to optimize the code which we generated
// in the enzyme differentiation pass.
let enable_ad = config.autodiff.contains(&config::AutoDiff::Enable);
let enable_gpu = config.offload.contains(&config::Offload::Enable);
let stage = if thin {
write::AutodiffStage::PreAD
} else {
@ -616,13 +615,6 @@ pub(crate) fn run_pass_manager(
write::llvm_optimize(cgcx, dcx, module, None, config, opt_level, opt_stage, stage);
}
// Here we only handle the GPU host (=cpu) code.
if enable_gpu && !thin && !cgcx.target_is_like_gpu {
let cx =
SimpleCx::new(module.module_llvm.llmod(), &module.module_llvm.llcx, cgcx.pointer_size);
crate::builder::gpu_offload::handle_gpu_code(cgcx, &cx);
}
if cfg!(feature = "llvm_enzyme") && enable_ad && !thin {
let opt_stage = llvm::OptStage::FatLTO;
let stage = write::AutodiffStage::PostAD;


@ -43,7 +43,7 @@ use crate::errors::{
use crate::llvm::diagnostic::OptimizationDiagnosticKind::*;
use crate::llvm::{self, DiagnosticInfo};
use crate::type_::llvm_type_ptr;
use crate::{LlvmCodegenBackend, ModuleLlvm, SimpleCx, base, common, llvm_util};
use crate::{LlvmCodegenBackend, ModuleLlvm, SimpleCx, attributes, base, common, llvm_util};
pub(crate) fn llvm_err<'a>(dcx: DiagCtxtHandle<'_>, err: LlvmError<'a>) -> ! {
match llvm::last_error() {
@ -712,11 +712,12 @@ pub(crate) unsafe fn llvm_optimize(
SimpleCx::new(module.module_llvm.llmod(), module.module_llvm.llcx, cgcx.pointer_size);
// For now we only support up to 10 kernels named kernel_0 ... kernel_9, a follow-up PR is
// introducing a proper offload intrinsic to solve this limitation.
for num in 0..9 {
let name = format!("kernel_{num}");
if let Some(kernel) = cx.get_function(&name) {
handle_offload(&cx, kernel);
for func in cx.get_functions() {
let offload_kernel = "offload-kernel";
if attributes::has_string_attr(func, offload_kernel) {
handle_offload(&cx, func);
}
attributes::remove_string_attr_from_llfn(func, offload_kernel);
}
}


@ -2,37 +2,13 @@ use std::ffi::CString;
use llvm::Linkage::*;
use rustc_abi::Align;
use rustc_codegen_ssa::back::write::CodegenContext;
use rustc_codegen_ssa::traits::BaseTypeCodegenMethods;
use rustc_middle::ty::offload_meta::OffloadMetadata;
use crate::builder::SBuilder;
use crate::common::AsCCharPtr;
use crate::llvm::AttributePlace::Function;
use crate::llvm::{self, Linkage, Type, Value};
use crate::{LlvmCodegenBackend, SimpleCx, attributes};
pub(crate) fn handle_gpu_code<'ll>(
_cgcx: &CodegenContext<LlvmCodegenBackend>,
cx: &'ll SimpleCx<'_>,
) {
// The offload memory transfer type for each kernel
let mut memtransfer_types = vec![];
let mut region_ids = vec![];
let offload_entry_ty = TgtOffloadEntry::new_decl(&cx);
// This is a temporary hack, we only search for kernel_0 to kernel_9 functions.
// There is a draft PR in progress which will introduce a proper offload intrinsic to remove
// this limitation.
for num in 0..9 {
let kernel = cx.get_function(&format!("kernel_{num}"));
if let Some(kernel) = kernel {
let (o, k) = gen_define_handling(&cx, kernel, offload_entry_ty, num);
memtransfer_types.push(o);
region_ids.push(k);
}
}
gen_call_handling(&cx, &memtransfer_types, &region_ids);
}
use crate::llvm::{self, BasicBlock, Linkage, Type, Value};
use crate::{SimpleCx, attributes};
// ; Function Attrs: nounwind
// declare i32 @__tgt_target_kernel(ptr, i64, i32, i32, ptr, ptr) #2
@ -79,7 +55,7 @@ fn generate_at_one<'ll>(cx: &'ll SimpleCx<'_>) -> &'ll llvm::Value {
at_one
}
struct TgtOffloadEntry {
pub(crate) struct TgtOffloadEntry {
// uint64_t Reserved;
// uint16_t Version;
// uint16_t Kind;
@ -167,7 +143,7 @@ impl KernelArgsTy {
fn new<'ll>(
cx: &'ll SimpleCx<'_>,
num_args: u64,
memtransfer_types: &[&'ll Value],
memtransfer_types: &'ll Value,
geps: [&'ll Value; 3],
) -> [(Align, &'ll Value); 13] {
let four = Align::from_bytes(4).expect("4 Byte alignment should work");
@ -181,7 +157,7 @@ impl KernelArgsTy {
(eight, geps[0]),
(eight, geps[1]),
(eight, geps[2]),
(eight, memtransfer_types[0]),
(eight, memtransfer_types),
// The next two are debug infos. FIXME(offload): set them
(eight, cx.const_null(cx.type_ptr())), // dbg
(eight, cx.const_null(cx.type_ptr())), // dbg
@ -194,6 +170,14 @@ impl KernelArgsTy {
}
}
// Contains LLVM values needed to manage offloading for a single kernel.
pub(crate) struct OffloadKernelData<'ll> {
pub offload_sizes: &'ll llvm::Value,
pub memtransfer_types: &'ll llvm::Value,
pub region_id: &'ll llvm::Value,
pub offload_entry: &'ll llvm::Value,
}
fn gen_tgt_data_mappers<'ll>(
cx: &'ll SimpleCx<'_>,
) -> (&'ll llvm::Value, &'ll llvm::Value, &'ll llvm::Value, &'ll llvm::Type) {
@ -256,68 +240,68 @@ pub(crate) fn add_global<'ll>(
// This function returns a memtransfer value which encodes how arguments to this kernel shall be
// mapped to/from the gpu. It also returns a region_id with the name of this kernel, to be
// concatenated into the list of region_ids.
fn gen_define_handling<'ll>(
cx: &'ll SimpleCx<'_>,
kernel: &'ll llvm::Value,
pub(crate) fn gen_define_handling<'ll>(
cx: &SimpleCx<'ll>,
offload_entry_ty: &'ll llvm::Type,
num: i64,
) -> (&'ll llvm::Value, &'ll llvm::Value) {
let types = cx.func_params_types(cx.get_type_of_global(kernel));
metadata: &[OffloadMetadata],
types: &[&Type],
symbol: &str,
) -> OffloadKernelData<'ll> {
// It seems like non-pointer values are automatically mapped. So here, we focus on pointer (or
// reference) types.
let num_ptr_types = types
.iter()
.filter(|&x| matches!(cx.type_kind(x), rustc_codegen_ssa::common::TypeKind::Pointer))
.count();
let ptr_meta = types.iter().zip(metadata).filter_map(|(&x, meta)| match cx.type_kind(x) {
rustc_codegen_ssa::common::TypeKind::Pointer => Some(meta),
_ => None,
});
// We do not know their size anymore at this level, so hardcode a placeholder.
// A follow-up pr will track these from the frontend, where we still have Rust types.
// Then, we will be able to figure out that e.g. `&[f32;256]` will result in 4*256 bytes.
// I decided that 1024 bytes is a great placeholder value for now.
add_priv_unnamed_arr(&cx, &format!(".offload_sizes.{num}"), &vec![1024; num_ptr_types]);
// FIXME(Sa4dUs): add `OMP_MAP_TARGET_PARAM = 0x20` only if necessary
let (ptr_sizes, ptr_transfer): (Vec<_>, Vec<_>) =
ptr_meta.map(|m| (m.payload_size, m.mode.bits() | 0x20)).unzip();
let offload_sizes = add_priv_unnamed_arr(&cx, &format!(".offload_sizes.{symbol}"), &ptr_sizes);
// Here we figure out whether something needs to be copied to the gpu (=1), from the gpu (=2),
// or both to and from the gpu (=3). Other values shouldn't affect us for now.
// A non-mutable reference or pointer will be 1, an array that's not read, but fully overwritten
// will be 2. For now, everything is 3, until we have our frontend set up.
// 1+2+32: 1 (MapTo), 2 (MapFrom), 32 (Add one extra input ptr per function, to be used later).
let memtransfer_types = add_priv_unnamed_arr(
&cx,
&format!(".offload_maptypes.{num}"),
&vec![1 + 2 + 32; num_ptr_types],
);
let memtransfer_types =
add_priv_unnamed_arr(&cx, &format!(".offload_maptypes.{symbol}"), &ptr_transfer);
// Next: For each function, generate these three entries. A weak constant,
// the llvm.rodata entry name, and the llvm_offload_entries value
let name = format!(".kernel_{num}.region_id");
let name = format!(".{symbol}.region_id");
let initializer = cx.get_const_i8(0);
let region_id = add_unnamed_global(&cx, &name, initializer, WeakAnyLinkage);
let c_entry_name = CString::new(format!("kernel_{num}")).unwrap();
let c_entry_name = CString::new(symbol).unwrap();
let c_val = c_entry_name.as_bytes_with_nul();
let offload_entry_name = format!(".offloading.entry_name.{num}");
let offload_entry_name = format!(".offloading.entry_name.{symbol}");
let initializer = crate::common::bytes_in_context(cx.llcx, c_val);
let llglobal = add_unnamed_global(&cx, &offload_entry_name, initializer, InternalLinkage);
llvm::set_alignment(llglobal, Align::ONE);
llvm::set_section(llglobal, c".llvm.rodata.offloading");
let name = format!(".offloading.entry.kernel_{num}");
let name = format!(".offloading.entry.{symbol}");
// See the __tgt_offload_entry documentation above.
let elems = TgtOffloadEntry::new(&cx, region_id, llglobal);
let initializer = crate::common::named_struct(offload_entry_ty, &elems);
let c_name = CString::new(name).unwrap();
let llglobal = llvm::add_global(cx.llmod, offload_entry_ty, &c_name);
llvm::set_global_constant(llglobal, true);
llvm::set_linkage(llglobal, WeakAnyLinkage);
llvm::set_initializer(llglobal, initializer);
llvm::set_alignment(llglobal, Align::EIGHT);
let offload_entry = llvm::add_global(cx.llmod, offload_entry_ty, &c_name);
llvm::set_global_constant(offload_entry, true);
llvm::set_linkage(offload_entry, WeakAnyLinkage);
llvm::set_initializer(offload_entry, initializer);
llvm::set_alignment(offload_entry, Align::EIGHT);
let c_section_name = CString::new("llvm_offload_entries").unwrap();
llvm::set_section(llglobal, &c_section_name);
(memtransfer_types, region_id)
llvm::set_section(offload_entry, &c_section_name);
OffloadKernelData { offload_sizes, memtransfer_types, region_id, offload_entry }
}
pub(crate) fn declare_offload_fn<'ll>(
fn declare_offload_fn<'ll>(
cx: &'ll SimpleCx<'_>,
name: &str,
ty: &'ll llvm::Type,
@ -333,8 +317,7 @@ pub(crate) fn declare_offload_fn<'ll>(
}
// For each kernel *call*, we now use some of our previous declared globals to move data to and from
// the gpu. We don't have a proper frontend yet, so we assume that every call to a kernel function
// from main is intended to run on the GPU. For now, we only handle the data transfer part of it.
// the gpu. For now, we only handle the data transfer part of it.
// If two consecutive kernels use the same memory, we still move it to the host and back to the gpu.
// Since in our frontend users (by default) don't have to specify data transfer, this is something
// we should optimize in the future! We also assume that everything should be copied back and forth,
@ -352,11 +335,16 @@ pub(crate) fn declare_offload_fn<'ll>(
// 4. set insert point after kernel call.
// 5. generate all the GEPS and stores, to be used in 6)
// 6. generate __tgt_target_data_end calls to move data from the GPU
fn gen_call_handling<'ll>(
cx: &'ll SimpleCx<'_>,
memtransfer_types: &[&'ll llvm::Value],
region_ids: &[&'ll llvm::Value],
pub(crate) fn gen_call_handling<'ll>(
cx: &SimpleCx<'ll>,
bb: &BasicBlock,
offload_data: &OffloadKernelData<'ll>,
args: &[&'ll Value],
types: &[&Type],
metadata: &[OffloadMetadata],
) {
let OffloadKernelData { offload_sizes, offload_entry, memtransfer_types, region_id } =
offload_data;
let (tgt_decl, tgt_target_kernel_ty) = generate_launcher(&cx);
// %struct.__tgt_bin_desc = type { i32, ptr, ptr, ptr }
let tptr = cx.type_ptr();
@ -368,27 +356,32 @@ fn gen_call_handling<'ll>(
let tgt_kernel_decl = KernelArgsTy::new_decl(&cx);
let (begin_mapper_decl, _, end_mapper_decl, fn_ty) = gen_tgt_data_mappers(&cx);
let main_fn = cx.get_function("main");
let Some(main_fn) = main_fn else { return };
let kernel_name = "kernel_1";
let call = unsafe {
llvm::LLVMRustGetFunctionCall(main_fn, kernel_name.as_c_char_ptr(), kernel_name.len())
};
let Some(kernel_call) = call else {
return;
};
let kernel_call_bb = unsafe { llvm::LLVMGetInstructionParent(kernel_call) };
let called = unsafe { llvm::LLVMGetCalledValue(kernel_call).unwrap() };
let mut builder = SBuilder::build(cx, kernel_call_bb);
let mut builder = SBuilder::build(cx, bb);
let types = cx.func_params_types(cx.get_type_of_global(called));
let num_args = types.len() as u64;
let ip = unsafe { llvm::LLVMRustGetInsertPoint(&builder.llbuilder) };
// FIXME(Sa4dUs): dummy loads are a temp workaround, we should find a proper way to prevent these
// variables from being optimized away
for val in [offload_sizes, offload_entry] {
unsafe {
let dummy = llvm::LLVMBuildLoad2(
&builder.llbuilder,
llvm::LLVMTypeOf(val),
val,
b"dummy\0".as_ptr() as *const _,
);
llvm::LLVMSetVolatile(dummy, llvm::TRUE);
}
}
// Step 0)
// %struct.__tgt_bin_desc = type { i32, ptr, ptr, ptr }
// %6 = alloca %struct.__tgt_bin_desc, align 8
unsafe { llvm::LLVMRustPositionBuilderPastAllocas(builder.llbuilder, main_fn) };
let llfn = unsafe { llvm::LLVMGetBasicBlockParent(bb) };
unsafe {
llvm::LLVMRustPositionBuilderPastAllocas(&builder.llbuilder, llfn);
}
let tgt_bin_desc_alloca = builder.direct_alloca(tgt_bin_desc, Align::EIGHT, "EmptyDesc");
let ty = cx.type_array(cx.type_ptr(), num_args);
@ -404,15 +397,16 @@ fn gen_call_handling<'ll>(
let a5 = builder.direct_alloca(tgt_kernel_decl, Align::EIGHT, "kernel_args");
// Step 1)
unsafe { llvm::LLVMRustPositionBefore(builder.llbuilder, kernel_call) };
unsafe {
llvm::LLVMRustRestoreInsertPoint(&builder.llbuilder, ip);
}
builder.memset(tgt_bin_desc_alloca, cx.get_const_i8(0), cx.get_const_i64(32), Align::EIGHT);
// Now we allocate once per function param, a copy to be passed to one of our maps.
let mut vals = vec![];
let mut geps = vec![];
let i32_0 = cx.get_const_i32(0);
for index in 0..types.len() {
let v = unsafe { llvm::LLVMGetOperand(kernel_call, index as u32).unwrap() };
for &v in args {
let gep = builder.inbounds_gep(cx.type_f32(), v, &[i32_0]);
vals.push(v);
geps.push(gep);
@ -437,10 +431,8 @@ fn gen_call_handling<'ll>(
let gep2 = builder.inbounds_gep(ty, a2, &[i32_0, idx]);
builder.store(geps[i as usize], gep2, Align::EIGHT);
let gep3 = builder.inbounds_gep(ty2, a4, &[i32_0, idx]);
// As mentioned above, we don't use Rust type information yet. So for now we will just
// assume that we have 1024 bytes, 256 f32 values.
// FIXME(offload): write an offload frontend and handle arbitrary types.
builder.store(cx.get_const_i64(1024), gep3, Align::EIGHT);
builder.store(cx.get_const_i64(metadata[i as usize].payload_size), gep3, Align::EIGHT);
}
// For now we have a very simplistic indexing scheme into our
@ -482,9 +474,17 @@ fn gen_call_handling<'ll>(
// Step 2)
let s_ident_t = generate_at_one(&cx);
let o = memtransfer_types[0];
let geps = get_geps(&mut builder, &cx, ty, ty2, a1, a2, a4);
generate_mapper_call(&mut builder, &cx, geps, o, begin_mapper_decl, fn_ty, num_args, s_ident_t);
generate_mapper_call(
&mut builder,
&cx,
geps,
memtransfer_types,
begin_mapper_decl,
fn_ty,
num_args,
s_ident_t,
);
let values = KernelArgsTy::new(&cx, num_args, memtransfer_types, geps);
// Step 3)
@ -501,26 +501,26 @@ fn gen_call_handling<'ll>(
// FIXME(offload): Don't hardcode the numbers of threads in the future.
cx.get_const_i32(2097152),
cx.get_const_i32(256),
region_ids[0],
region_id,
a5,
];
let offload_success = builder.call(tgt_target_kernel_ty, tgt_decl, &args, None);
builder.call(tgt_target_kernel_ty, tgt_decl, &args, None);
// %41 = call i32 @__tgt_target_kernel(ptr @1, i64 -1, i32 2097152, i32 256, ptr @.kernel_1.region_id, ptr %kernel_args)
unsafe {
let next = llvm::LLVMGetNextInstruction(offload_success).unwrap();
llvm::LLVMRustPositionAfter(builder.llbuilder, next);
llvm::LLVMInstructionEraseFromParent(next);
}
// Step 4)
let geps = get_geps(&mut builder, &cx, ty, ty2, a1, a2, a4);
generate_mapper_call(&mut builder, &cx, geps, o, end_mapper_decl, fn_ty, num_args, s_ident_t);
generate_mapper_call(
&mut builder,
&cx,
geps,
memtransfer_types,
end_mapper_decl,
fn_ty,
num_args,
s_ident_t,
);
builder.call(mapper_fn_ty, unregister_lib_decl, &[tgt_bin_desc_alloca], None);
drop(builder);
// FIXME(offload) The issue is that we right now add a call to the gpu version of the function,
// and then delete the call to the CPU version. In the future, we should use an intrinsic which
// directly resolves to a call to the GPU version.
unsafe { llvm::LLVMDeleteFunction(called) };
}


@ -791,6 +791,16 @@ impl<'ll, CX: Borrow<SCx<'ll>>> GenericCx<'ll, CX> {
llvm::LLVMMDStringInContext2(self.llcx(), name.as_ptr() as *const c_char, name.len())
}
}
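/// Collects all functions in the module (definitions and declarations) by walking the module's function list.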
pub(crate) fn get_functions(&self) -> Vec<&'ll Value> {
let mut functions = vec![];
let mut func = unsafe { llvm::LLVMGetFirstFunction(self.llmod()) };
while let Some(f) = func {
functions.push(f);
func = unsafe { llvm::LLVMGetNextFunction(f) }
}
functions
}
}
impl<'ll, 'tcx> MiscCodegenMethods<'tcx> for CodegenCx<'ll, 'tcx> {


@ -40,6 +40,14 @@ pub(crate) struct AutoDiffWithoutLto;
#[diag(codegen_llvm_autodiff_without_enable)]
pub(crate) struct AutoDiffWithoutEnable;
#[derive(Diagnostic)]
#[diag(codegen_llvm_offload_without_enable)]
pub(crate) struct OffloadWithoutEnable;
#[derive(Diagnostic)]
#[diag(codegen_llvm_offload_without_fat_lto)]
pub(crate) struct OffloadWithoutFatLTO;
#[derive(Diagnostic)]
#[diag(codegen_llvm_lto_bitcode_from_rlib)]
pub(crate) struct LtoBitcodeFromRlib {


@ -13,6 +13,7 @@ use rustc_hir::def_id::LOCAL_CRATE;
use rustc_hir::{self as hir};
use rustc_middle::mir::BinOp;
use rustc_middle::ty::layout::{FnAbiOf, HasTyCtxt, HasTypingEnv, LayoutOf};
use rustc_middle::ty::offload_meta::OffloadMetadata;
use rustc_middle::ty::{self, GenericArgsRef, Instance, SimdAlign, Ty, TyCtxt, TypingEnv};
use rustc_middle::{bug, span_bug};
use rustc_session::config::CrateType;
@ -25,8 +26,11 @@ use tracing::debug;
use crate::abi::FnAbiLlvmExt;
use crate::builder::Builder;
use crate::builder::autodiff::{adjust_activity_to_abi, generate_enzyme_call};
use crate::builder::gpu_offload::TgtOffloadEntry;
use crate::context::CodegenCx;
use crate::errors::{AutoDiffWithoutEnable, AutoDiffWithoutLto};
use crate::errors::{
AutoDiffWithoutEnable, AutoDiffWithoutLto, OffloadWithoutEnable, OffloadWithoutFatLTO,
};
use crate::llvm::{self, Metadata, Type, Value};
use crate::type_of::LayoutLlvmExt;
use crate::va_arg::emit_va_arg;
@ -197,6 +201,24 @@ impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> {
codegen_autodiff(self, tcx, instance, args, result);
return Ok(());
}
sym::offload => {
if !tcx
.sess
.opts
.unstable_opts
.offload
.contains(&rustc_session::config::Offload::Enable)
{
let _ = tcx.dcx().emit_almost_fatal(OffloadWithoutEnable);
}
if tcx.sess.lto() != rustc_session::config::Lto::Fat {
let _ = tcx.dcx().emit_almost_fatal(OffloadWithoutFatLTO);
}
codegen_offload(self, tcx, instance, args);
return Ok(());
}
sym::is_val_statically_known => {
if let OperandValue::Immediate(imm) = args[0].val {
self.call_intrinsic(
@ -1231,6 +1253,62 @@ fn codegen_autodiff<'ll, 'tcx>(
);
}
// Generates the LLVM code to offload a Rust function to a target device (e.g., GPU).
// For each kernel call, it generates the necessary globals (including metadata such as
// size and pass mode), manages memory mapping to and from the device, handles all
// data transfers, and launches the kernel on the target device.
fn codegen_offload<'ll, 'tcx>(
bx: &mut Builder<'_, 'll, 'tcx>,
tcx: TyCtxt<'tcx>,
instance: ty::Instance<'tcx>,
args: &[OperandRef<'tcx, &'ll Value>],
) {
let cx = bx.cx;
let fn_args = instance.args;
let (target_id, target_args) = match fn_args.into_type_list(tcx)[0].kind() {
ty::FnDef(def_id, params) => (def_id, params),
_ => bug!("invalid offload intrinsic arg"),
};
let fn_target = match Instance::try_resolve(tcx, cx.typing_env(), *target_id, target_args) {
Ok(Some(instance)) => instance,
Ok(None) => bug!(
"could not resolve ({:?}, {:?}) to a specific offload instance",
target_id,
target_args
),
Err(_) => {
// An error has already been emitted
return;
}
};
let args = get_args_from_tuple(bx, args[1], fn_target);
let target_symbol = symbol_name_for_instance_in_crate(tcx, fn_target, LOCAL_CRATE);
let offload_entry_ty = TgtOffloadEntry::new_decl(&cx);
let sig = tcx.fn_sig(fn_target.def_id()).skip_binder().skip_binder();
let inputs = sig.inputs();
let metadata = inputs.iter().map(|ty| OffloadMetadata::from_ty(tcx, *ty)).collect::<Vec<_>>();
let types = inputs.iter().map(|ty| cx.layout_of(*ty).llvm_type(cx)).collect::<Vec<_>>();
let offload_data = crate::builder::gpu_offload::gen_define_handling(
cx,
offload_entry_ty,
&metadata,
&types,
&target_symbol,
);
// FIXME(Sa4dUs): pass the original builder once we separate kernel launch logic from globals
let bb = unsafe { llvm::LLVMGetInsertBlock(bx.llbuilder) };
crate::builder::gpu_offload::gen_call_handling(cx, bb, &offload_data, &args, &types, &metadata);
}
fn get_args_from_tuple<'ll, 'tcx>(
bx: &mut Builder<'_, 'll, 'tcx>,
tuple_op: OperandRef<'tcx, &'ll Value>,


@ -1160,13 +1160,9 @@ unsafe extern "C" {
) -> &'a BasicBlock;
// Operations on instructions
pub(crate) fn LLVMGetInstructionParent(Inst: &Value) -> &BasicBlock;
pub(crate) fn LLVMGetCalledValue(CallInst: &Value) -> Option<&Value>;
pub(crate) fn LLVMIsAInstruction(Val: &Value) -> Option<&Value>;
pub(crate) fn LLVMGetFirstBasicBlock(Fn: &Value) -> &BasicBlock;
pub(crate) fn LLVMGetOperand(Val: &Value, Index: c_uint) -> Option<&Value>;
pub(crate) fn LLVMGetNextInstruction(Val: &Value) -> Option<&Value>;
pub(crate) fn LLVMInstructionEraseFromParent(Val: &Value);
// Operations on call sites
pub(crate) fn LLVMSetInstructionCallConv(Instr: &Value, CC: c_uint);
@ -2484,6 +2480,8 @@ unsafe extern "C" {
pub(crate) fn LLVMRustPositionBuilderPastAllocas<'a>(B: &Builder<'a>, Fn: &'a Value);
pub(crate) fn LLVMRustPositionBuilderAtStart<'a>(B: &Builder<'a>, BB: &'a BasicBlock);
pub(crate) fn LLVMRustGetInsertPoint<'a>(B: &Builder<'a>) -> &'a Value;
pub(crate) fn LLVMRustRestoreInsertPoint<'a>(B: &Builder<'a>, IP: &'a Value);
pub(crate) fn LLVMRustSetModulePICLevel(M: &Module);
pub(crate) fn LLVMRustSetModulePIELevel(M: &Module);


@ -43,6 +43,14 @@ pub(crate) fn AddFunctionAttributes<'ll>(
}
}
pub(crate) fn HasStringAttribute<'ll>(llfn: &'ll Value, name: &str) -> bool {
unsafe { LLVMRustHasFnAttribute(llfn, name.as_c_char_ptr(), name.len()) }
}
pub(crate) fn RemoveStringAttrFromFn<'ll>(llfn: &'ll Value, name: &str) {
unsafe { LLVMRustRemoveFnAttribute(llfn, name.as_c_char_ptr(), name.len()) }
}
pub(crate) fn AddCallSiteAttributes<'ll>(
callsite: &'ll Value,
idx: AttributePlace,


@ -334,6 +334,9 @@ fn process_builtin_attrs(
codegen_fn_attrs.patchable_function_entry =
parse_patchable_function_entry(tcx, attr);
}
sym::rustc_offload_kernel => {
codegen_fn_attrs.flags |= CodegenFnAttrFlags::OFFLOAD_KERNEL
}
_ => {}
}
}


@ -1117,6 +1117,11 @@ pub static BUILTIN_ATTRIBUTES: &[BuiltinAttribute] = &[
rustc_autodiff, Normal,
template!(Word, List: &[r#""...""#]), DuplicatesOk,
EncodeCrossCrate::Yes,
),
rustc_attr!(
rustc_offload_kernel, Normal,
template!(Word), DuplicatesOk,
EncodeCrossCrate::Yes,
),
// Traces that are left when `cfg` and `cfg_attr` attributes are expanded.
// The attributes are not gated, to avoid stability errors, but they cannot be used in stable


@ -163,6 +163,7 @@ fn intrinsic_operation_unsafety(tcx: TyCtxt<'_>, intrinsic_id: LocalDefId) -> hi
| sym::minnumf128
| sym::mul_with_overflow
| sym::needs_drop
| sym::offload
| sym::offset_of
| sym::overflow_checks
| sym::powf16
@ -313,6 +314,7 @@ pub(crate) fn check_intrinsic_type(
let type_id = tcx.type_of(tcx.lang_items().type_id().unwrap()).instantiate_identity();
(0, 0, vec![type_id, type_id], tcx.types.bool)
}
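// `offload<F, T, R>(f: F, args: T) -> R`: three generic params, two value arguments, returns `R`.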
sym::offload => (3, 0, vec![param(0), param(1)], param(2)),
sym::offset => (2, 0, vec![param(0), param(1)], param(0)),
sym::arith_offset => (
1,


@ -1436,6 +1436,39 @@ extern "C" void LLVMRustPositionAfter(LLVMBuilderRef B, LLVMValueRef Instr) {
}
}
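// Returns the instruction the builder currently points at, or null if the
// insert point is unset or sits at the end of its basic block.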
extern "C" LLVMValueRef LLVMRustGetInsertPoint(LLVMBuilderRef B) {
llvm::IRBuilderBase &IRB = *unwrap(B);
llvm::IRBuilderBase::InsertPoint ip = IRB.saveIP();
llvm::BasicBlock *BB = ip.getBlock();
if (!BB)
return nullptr;
auto it = ip.getPoint();
if (it == BB->end())
return nullptr;
llvm::Instruction *I = &*it;
return wrap(I);
}
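// Restores a saved insert point; a null instruction means the saved point was
// at the end of a block, so fall back to the end of the builder's current block.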
extern "C" void LLVMRustRestoreInsertPoint(LLVMBuilderRef B,
LLVMValueRef Instr) {
llvm::IRBuilderBase &IRB = *unwrap(B);
if (!Instr) {
llvm::BasicBlock *BB = IRB.GetInsertBlock();
if (BB)
IRB.SetInsertPoint(BB);
return;
}
llvm::Instruction *I = unwrap<llvm::Instruction>(Instr);
IRB.SetInsertPoint(I);
}
extern "C" LLVMValueRef
LLVMRustGetFunctionCall(LLVMValueRef Fn, const char *Name, size_t NameLen) {
auto targetName = StringRef(Name, NameLen);


@ -190,6 +190,8 @@ bitflags::bitflags! {
const NO_BUILTINS = 1 << 15;
/// Marks foreign items, to make `contains_extern_indicator` cheaper.
const FOREIGN_ITEM = 1 << 16;
/// `#[rustc_offload_kernel]`: indicates that this is an offload kernel, an extra ptr arg will be added.
const OFFLOAD_KERNEL = 1 << 17;
}
}
rustc_data_structures::external_bitflags_debug! { CodegenFnAttrFlags }


@ -129,6 +129,7 @@ pub mod fast_reject;
pub mod inhabitedness;
pub mod layout;
pub mod normalize_erasing_regions;
pub mod offload_meta;
pub mod pattern;
pub mod print;
pub mod relate;


@ -0,0 +1,119 @@
use bitflags::bitflags;
use crate::ty::{self, PseudoCanonicalInput, Ty, TyCtxt, TypingEnv};
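/// Offload metadata for a single kernel argument: the number of bytes to
/// transfer and the OpenMP-style mapping mode.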
pub struct OffloadMetadata {
pub payload_size: u64,
pub mode: MappingFlags,
}
bitflags! {
/// Mirrors `OpenMPOffloadMappingFlags` from Clang/OpenMP.
#[derive(Debug, Copy, Clone)]
#[repr(transparent)]
pub struct MappingFlags: u64 {
/// No flags.
const NONE = 0x0;
/// Allocate memory on the device and move data from host to device.
const TO = 0x01;
/// Allocate memory on the device and move data from device to host.
const FROM = 0x02;
/// Always perform the requested mapping action, even if already mapped.
const ALWAYS = 0x04;
/// Delete the element from the device environment, ignoring ref count.
const DELETE = 0x08;
/// The element being mapped is a pointer-pointee pair.
const PTR_AND_OBJ = 0x10;
/// The base address should be passed to the target kernel as argument.
const TARGET_PARAM = 0x20;
/// The runtime must return the device pointer.
const RETURN_PARAM = 0x40;
/// The reference being passed is a pointer to private data.
const PRIVATE = 0x80;
/// Pass the element by value.
const LITERAL = 0x100;
/// Implicit map (generated by compiler, not explicit in code).
const IMPLICIT = 0x200;
/// Hint to allocate memory close to the target device.
const CLOSE = 0x400;
/// Reserved (0x800 in OpenMP for XLC compatibility).
const RESERVED = 0x800;
/// Require that the data is already allocated on the device.
const PRESENT = 0x1000;
/// Increment/decrement a separate ref counter (OpenACC compatibility).
const OMPX_HOLD = 0x2000;
/// Used for non-contiguous list items in target update.
const NON_CONTIG = 0x100000000000;
/// 16 MSBs indicate membership in a struct.
const MEMBER_OF = 0xffff000000000000;
}
}
impl OffloadMetadata {
pub fn from_ty<'tcx>(tcx: TyCtxt<'tcx>, ty: Ty<'tcx>) -> Self {
OffloadMetadata {
payload_size: get_payload_size(tcx, ty),
mode: MappingFlags::from_ty(tcx, ty),
}
}
}
// FIXME(Sa4dUs): implement a solid logic to determine the payload size
fn get_payload_size<'tcx>(tcx: TyCtxt<'tcx>, ty: Ty<'tcx>) -> u64 {
match ty.kind() {
ty::RawPtr(inner, _) | ty::Ref(_, inner, _) => get_payload_size(tcx, *inner),
_ => tcx
.layout_of(PseudoCanonicalInput {
typing_env: TypingEnv::fully_monomorphized(),
value: ty,
})
.unwrap()
.size
.bytes(),
}
}
impl MappingFlags {
fn from_ty<'tcx>(tcx: TyCtxt<'tcx>, ty: Ty<'tcx>) -> Self {
use rustc_ast::Mutability::*;
match ty.kind() {
ty::Bool
| ty::Char
| ty::Int(_)
| ty::Uint(_)
| ty::Float(_)
| ty::Adt(_, _)
| ty::Tuple(_)
| ty::Array(_, _)
| ty::Alias(_, _)
| ty::Param(_) => MappingFlags::TO,
ty::RawPtr(_, Not) | ty::Ref(_, _, Not) => MappingFlags::TO,
ty::RawPtr(_, Mut) | ty::Ref(_, _, Mut) => MappingFlags::TO | MappingFlags::FROM,
ty::Slice(_) | ty::Str | ty::Dynamic(_, _) => MappingFlags::TO | MappingFlags::FROM,
ty::Foreign(_) | ty::Pat(_, _) | ty::UnsafeBinder(_) => {
MappingFlags::TO | MappingFlags::FROM
}
ty::FnDef(_, _)
| ty::FnPtr(_, _)
| ty::Closure(_, _)
| ty::CoroutineClosure(_, _)
| ty::Coroutine(_, _)
| ty::CoroutineWitness(_, _)
| ty::Never
| ty::Bound(_, _)
| ty::Placeholder(_)
| ty::Infer(_)
| ty::Error(_) => {
tcx.dcx()
.span_err(rustc_span::DUMMY_SP, format!("type `{ty:?}` cannot be offloaded"));
MappingFlags::empty()
}
}
}
}
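For intuition, here is a minimal standalone sketch (my own example, not part of the PR) of the maptype arithmetic that `gen_define_handling` performs via `m.mode.bits() | 0x20`: a `&mut` argument maps as `TO | FROM`, and ORing in the `TARGET_PARAM` bit yields `1 + 2 + 0x20 = 35`, the value seen in the `.offload_maptypes.*` constants in the codegen test below.
```rust
// Standalone sketch (not compiler code): mirrors the OpenMP mapping-flag
// arithmetic used for `.offload_maptypes.*`. Requires the `bitflags` crate.
bitflags::bitflags! {
    #[derive(Debug, Clone, Copy, PartialEq, Eq)]
    struct MappingFlags: u64 {
        const TO = 0x01;           // host -> device
        const FROM = 0x02;         // device -> host
        const TARGET_PARAM = 0x20; // pass base address as kernel argument
    }
}

fn main() {
    // A `&mut [f32; 256]` argument is mapped both to and from the device.
    let mode = MappingFlags::TO | MappingFlags::FROM;
    // `gen_define_handling` computes `mode.bits() | 0x20` (TARGET_PARAM).
    let maptype = mode.bits() | MappingFlags::TARGET_PARAM.bits();
    assert_eq!(maptype, 35); // matches `[1 x i64] [i64 35]` in the test
}
```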


@ -1584,6 +1584,7 @@ symbols! {
object_safe_for_dispatch,
of,
off,
offload,
offset,
offset_of,
offset_of_enum,
@ -1966,6 +1967,7 @@ symbols! {
rustc_objc_class,
rustc_objc_selector,
rustc_object_lifetime_default,
rustc_offload_kernel,
rustc_on_unimplemented,
rustc_outlives,
rustc_paren_sugar,


@ -3324,6 +3324,38 @@ pub const fn copysignf128(x: f128, y: f128) -> f128;
#[rustc_intrinsic]
pub const fn autodiff<F, G, T: crate::marker::Tuple, R>(f: F, df: G, args: T) -> R;
/// Generates the LLVM body of a wrapper function to offload a kernel `f`.
///
/// Type Parameters:
/// - `F`: The kernel to offload. Must be a function item.
/// - `T`: A tuple of arguments passed to `f`.
/// - `R`: The return type of the kernel.
///
/// Example usage (pseudocode):
///
/// ```rust,ignore (pseudocode)
/// fn kernel(x: *mut [f64; 128]) {
/// core::intrinsics::offload(kernel_1, (x,))
/// }
///
/// #[cfg(target_os = "linux")]
/// extern "C" {
/// pub fn kernel_1(array_b: *mut [f64; 128]);
/// }
///
/// #[cfg(not(target_os = "linux"))]
/// #[rustc_offload_kernel]
/// extern "gpu-kernel" fn kernel_1(x: *mut [f64; 128]) {
/// unsafe { (*x)[0] = 21.0 };
/// }
/// ```
///
/// For reference, see the Clang documentation on offloading:
/// <https://clang.llvm.org/docs/OffloadingDesign.html>.
#[rustc_nounwind]
#[rustc_intrinsic]
pub const fn offload<F, T: crate::marker::Tuple, R>(f: F, args: T) -> R;
/// Inform Miri that a given pointer definitely has a certain alignment.
#[cfg(miri)]
#[rustc_allow_const_fn_unstable(const_eval_select)]


@ -5,6 +5,8 @@ We currently work on launching the following Rust kernel on the GPU. To follow a
```rust
#![feature(abi_gpu_kernel)]
#![feature(rustc_attrs)]
#![feature(core_intrinsics)]
#![no_std]
#[cfg(target_os = "linux")]
@ -12,6 +14,7 @@ extern crate libc;
#[cfg(target_os = "linux")]
use libc::c_char;
#[cfg(target_os = "linux")]
use core::mem;
#[panic_handler]
@ -38,7 +41,7 @@ fn main() {
}
unsafe {
kernel_1(array_c);
kernel(array_c);
}
core::hint::black_box(&array_c);
unsafe {
@ -52,6 +55,11 @@ fn main() {
}
}
#[inline(never)]
unsafe fn kernel(x: *mut [f64; 256]) {
core::intrinsics::offload(kernel_1, (x,))
}
#[cfg(target_os = "linux")]
unsafe extern "C" {
pub fn kernel_1(array_b: *mut [f64; 256]);
@ -60,6 +68,7 @@ unsafe extern "C" {
#[cfg(not(target_os = "linux"))]
#[unsafe(no_mangle)]
#[inline(never)]
#[rustc_offload_kernel]
pub extern "gpu-kernel" fn kernel_1(x: *mut [f64; 256]) {
unsafe { (*x)[0] = 21.0 };
}


@ -11,6 +11,7 @@
// when inside of a function called main. This, too, is a temporary workaround for not having a
// frontend.
#![feature(core_intrinsics)]
#![no_main]
#[unsafe(no_mangle)]
@ -25,73 +26,70 @@ fn main() {
// CHECK: %struct.__tgt_bin_desc = type { i32, ptr, ptr, ptr }
// CHECK: %struct.__tgt_kernel_arguments = type { i32, i32, ptr, ptr, ptr, ptr, ptr, ptr, i64, i64, [3 x i32], [3 x i32], i32 }
// CHECK: @.offload_sizes.1 = private unnamed_addr constant [1 x i64] [i64 1024]
// CHECK: @.offload_maptypes.1 = private unnamed_addr constant [1 x i64] [i64 35]
// CHECK: @.kernel_1.region_id = weak unnamed_addr constant i8 0
// CHECK: @.offloading.entry_name.1 = internal unnamed_addr constant [9 x i8] c"kernel_1\00", section ".llvm.rodata.offloading", align 1
// CHECK: @.offloading.entry.kernel_1 = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 1, i32 0, ptr @.kernel_1.region_id, ptr @.offloading.entry_name.1, i64 0, i64 0, ptr null }, section "llvm_offload_entries", align 8
// CHECK: @0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1
// CHECK: @1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @0 }, align 8
// CHECK: @.offload_sizes._kernel_1 = private unnamed_addr constant [1 x i64] [i64 1024]
// CHECK: @.offload_maptypes._kernel_1 = private unnamed_addr constant [1 x i64] [i64 35]
// CHECK: @._kernel_1.region_id = internal unnamed_addr constant i8 0
// CHECK: @.offloading.entry_name._kernel_1 = internal unnamed_addr constant [10 x i8] c"_kernel_1\00", section ".llvm.rodata.offloading", align 1
// CHECK: @.offloading.entry._kernel_1 = internal constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 1, i32 0, ptr @._kernel_1.region_id, ptr @.offloading.entry_name._kernel_1, i64 0, i64 0, ptr null }, section "llvm_offload_entries", align 8
// CHECK: @anon.{{.*}}.0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1
// CHECK: @anon.{{.*}}.1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @anon.{{.*}}.0 }, align 8
// CHECK: Function Attrs:
// CHECK-NEXT: define{{( dso_local)?}} void @main()
// CHECK-NEXT: start:
// CHECK-NEXT: %0 = alloca [8 x i8], align 8
// CHECK-NEXT: %x = alloca [1024 x i8], align 16
// CHECK: call void @kernel_1(ptr noalias noundef nonnull align 4 dereferenceable(1024) %x)
// CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %0)
// CHECK-NEXT: store ptr %x, ptr %0, align 8
// CHECK-NEXT: call void asm sideeffect "", "r,~{memory}"(ptr nonnull %0) #4, !srcloc !4
// CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %0)
// CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 1024, ptr nonnull %x)
// CHECK-NEXT: ret void
// CHECK-NEXT: }
// CHECK: define{{( dso_local)?}} void @kernel_1(ptr noalias noundef align 4 dereferenceable(1024) %x)
// CHECK-NEXT: start:
// CHECK-NEXT: %EmptyDesc = alloca %struct.__tgt_bin_desc, align 8
// CHECK-NEXT: %.offload_baseptrs = alloca [1 x ptr], align 8
// CHECK-NEXT: %.offload_ptrs = alloca [1 x ptr], align 8
// CHECK-NEXT: %.offload_sizes = alloca [1 x i64], align 8
// CHECK-NEXT: %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8
// CHECK: call void @llvm.memset.p0.i64(ptr align 8 %EmptyDesc, i8 0, i64 32, i1 false)
// CHECK-NEXT: %1 = getelementptr inbounds float, ptr %x, i32 0
// CHECK-NEXT: call void @__tgt_register_lib(ptr %EmptyDesc)
// CHECK-NEXT: %dummy = load volatile ptr, ptr @.offload_sizes._kernel_1, align 8
// CHECK-NEXT: %dummy1 = load volatile ptr, ptr @.offloading.entry._kernel_1, align 8
// CHECK-NEXT: call void @llvm.memset.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) %EmptyDesc, i8 0, i64 32, i1 false)
// CHECK-NEXT: call void @__tgt_register_lib(ptr nonnull %EmptyDesc)
// CHECK-NEXT: call void @__tgt_init_all_rtls()
// CHECK-NEXT: %2 = getelementptr inbounds [1 x ptr], ptr %.offload_baseptrs, i32 0, i32 0
// CHECK-NEXT: store ptr %x, ptr %2, align 8
// CHECK-NEXT: %3 = getelementptr inbounds [1 x ptr], ptr %.offload_ptrs, i32 0, i32 0
// CHECK-NEXT: store ptr %1, ptr %3, align 8
// CHECK-NEXT: %4 = getelementptr inbounds [1 x i64], ptr %.offload_sizes, i32 0, i32 0
// CHECK-NEXT: store i64 1024, ptr %4, align 8
// CHECK-NEXT: %5 = getelementptr inbounds [1 x ptr], ptr %.offload_baseptrs, i32 0, i32 0
// CHECK-NEXT: %6 = getelementptr inbounds [1 x ptr], ptr %.offload_ptrs, i32 0, i32 0
// CHECK-NEXT: %7 = getelementptr inbounds [1 x i64], ptr %.offload_sizes, i32 0, i32 0
// CHECK-NEXT: call void @__tgt_target_data_begin_mapper(ptr @1, i64 -1, i32 1, ptr %5, ptr %6, ptr %7, ptr @.offload_maptypes.1, ptr null, ptr null)
// CHECK-NEXT: %8 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 0
// CHECK-NEXT: store i32 3, ptr %8, align 4
// CHECK-NEXT: %9 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 1
// CHECK-NEXT: store i32 1, ptr %9, align 4
// CHECK-NEXT: %10 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 2
// CHECK-NEXT: store ptr %5, ptr %10, align 8
// CHECK-NEXT: %11 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 3
// CHECK-NEXT: store ptr %6, ptr %11, align 8
// CHECK-NEXT: %12 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 4
// CHECK-NEXT: store ptr %7, ptr %12, align 8
// CHECK-NEXT: %13 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 5
// CHECK-NEXT: store ptr @.offload_maptypes.1, ptr %13, align 8
// CHECK-NEXT: %14 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 6
// CHECK-NEXT: store ptr null, ptr %14, align 8
// CHECK-NEXT: %15 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 7
// CHECK-NEXT: store ptr null, ptr %15, align 8
// CHECK-NEXT: %16 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 8
// CHECK-NEXT: store i64 0, ptr %16, align 8
// CHECK-NEXT: %17 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 9
// CHECK-NEXT: store i64 0, ptr %17, align 8
// CHECK-NEXT: %18 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 10
// CHECK-NEXT: store [3 x i32] [i32 2097152, i32 0, i32 0], ptr %18, align 4
// CHECK-NEXT: %19 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 11
// CHECK-NEXT: store [3 x i32] [i32 256, i32 0, i32 0], ptr %19, align 4
// CHECK-NEXT: %20 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 12
// CHECK-NEXT: store i32 0, ptr %20, align 4
// CHECK-NEXT: %21 = call i32 @__tgt_target_kernel(ptr @1, i64 -1, i32 2097152, i32 256, ptr @.kernel_1.region_id, ptr %kernel_args)
// CHECK-NEXT: %22 = getelementptr inbounds [1 x ptr], ptr %.offload_baseptrs, i32 0, i32 0
// CHECK-NEXT: %23 = getelementptr inbounds [1 x ptr], ptr %.offload_ptrs, i32 0, i32 0
// CHECK-NEXT: %24 = getelementptr inbounds [1 x i64], ptr %.offload_sizes, i32 0, i32 0
// CHECK-NEXT: call void @__tgt_target_data_end_mapper(ptr @1, i64 -1, i32 1, ptr %22, ptr %23, ptr %24, ptr @.offload_maptypes.1, ptr null, ptr null)
// CHECK-NEXT: call void @__tgt_unregister_lib(ptr %EmptyDesc)
// CHECK: store ptr %x, ptr %0, align 8
// CHECK-NEXT: call void asm sideeffect "", "r,~{memory}"(ptr nonnull %0)
// CHECK: ret void
// CHECK-NEXT: store ptr %x, ptr %.offload_baseptrs, align 8
// CHECK-NEXT: store ptr %x, ptr %.offload_ptrs, align 8
// CHECK-NEXT: store i64 1024, ptr %.offload_sizes, align 8
// CHECK-NEXT: call void @__tgt_target_data_begin_mapper(ptr nonnull @anon.{{.*}}.1, i64 -1, i32 1, ptr nonnull %.offload_baseptrs, ptr nonnull %.offload_ptrs, ptr nonnull %.offload_sizes, ptr nonnull @.offload_maptypes._kernel_1, ptr null, ptr null)
// CHECK-NEXT: store i32 3, ptr %kernel_args, align 8
// CHECK-NEXT: %0 = getelementptr inbounds nuw i8, ptr %kernel_args, i64 4
// CHECK-NEXT: store i32 1, ptr %0, align 4
// CHECK-NEXT: %1 = getelementptr inbounds nuw i8, ptr %kernel_args, i64 8
// CHECK-NEXT: store ptr %.offload_baseptrs, ptr %1, align 8
// CHECK-NEXT: %2 = getelementptr inbounds nuw i8, ptr %kernel_args, i64 16
// CHECK-NEXT: store ptr %.offload_ptrs, ptr %2, align 8
// CHECK-NEXT: %3 = getelementptr inbounds nuw i8, ptr %kernel_args, i64 24
// CHECK-NEXT: store ptr %.offload_sizes, ptr %3, align 8
// CHECK-NEXT: %4 = getelementptr inbounds nuw i8, ptr %kernel_args, i64 32
// CHECK-NEXT: store ptr @.offload_maptypes._kernel_1, ptr %4, align 8
// CHECK-NEXT: %5 = getelementptr inbounds nuw i8, ptr %kernel_args, i64 40
// CHECK-NEXT: %6 = getelementptr inbounds nuw i8, ptr %kernel_args, i64 72
// CHECK-NEXT: call void @llvm.memset.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) %5, i8 0, i64 32, i1 false)
// CHECK-NEXT: store <4 x i32> <i32 2097152, i32 0, i32 0, i32 256>, ptr %6, align 8
// CHECK-NEXT: %.fca.1.gep3 = getelementptr inbounds nuw i8, ptr %kernel_args, i64 88
// CHECK-NEXT: store i32 0, ptr %.fca.1.gep3, align 8
// CHECK-NEXT: %.fca.2.gep4 = getelementptr inbounds nuw i8, ptr %kernel_args, i64 92
// CHECK-NEXT: store i32 0, ptr %.fca.2.gep4, align 4
// CHECK-NEXT: %7 = getelementptr inbounds nuw i8, ptr %kernel_args, i64 96
// CHECK-NEXT: store i32 0, ptr %7, align 8
// CHECK-NEXT: %8 = call i32 @__tgt_target_kernel(ptr nonnull @anon.{{.*}}.1, i64 -1, i32 2097152, i32 256, ptr nonnull @._kernel_1.region_id, ptr nonnull %kernel_args)
// CHECK-NEXT: call void @__tgt_target_data_end_mapper(ptr nonnull @anon.{{.*}}.1, i64 -1, i32 1, ptr nonnull %.offload_baseptrs, ptr nonnull %.offload_ptrs, ptr nonnull %.offload_sizes, ptr nonnull @.offload_maptypes._kernel_1, ptr null, ptr null)
// CHECK-NEXT: call void @__tgt_unregister_lib(ptr nonnull %EmptyDesc)
// CHECK-NEXT: ret void
// CHECK-NEXT: }
// CHECK: Function Attrs: nounwind
@ -100,6 +98,12 @@ fn main() {
#[unsafe(no_mangle)]
#[inline(never)]
pub fn kernel_1(x: &mut [f32; 256]) {
core::intrinsics::offload(_kernel_1, (x,))
}
#[unsafe(no_mangle)]
#[inline(never)]
pub fn _kernel_1(x: &mut [f32; 256]) {
for i in 0..256 {
x[i] = 21.0;
}


@ -0,0 +1,6 @@
error: using the offload feature requires -Z offload=Enable
error: using the offload feature requires -C lto=fat
error: aborting due to 2 previous errors


@ -0,0 +1,23 @@
//@ revisions: pass fail
//@ no-prefer-dynamic
//@ needs-enzyme
//@[pass] build-pass
//@[fail] build-fail
//@[pass] compile-flags: -Zunstable-options -Zoffload=Enable -Clto=fat --emit=metadata
//@[fail] compile-flags: -Clto=thin
//[fail]~? ERROR: using the offload feature requires -Z offload=Enable
//[fail]~? ERROR: using the offload feature requires -C lto=fat
#![feature(core_intrinsics)]
fn main() {
let mut x = [3.0; 256];
kernel_1(&mut x);
}
fn kernel_1(x: &mut [f32; 256]) {
core::intrinsics::offload(_kernel_1, (x,))
}
fn _kernel_1(x: &mut [f32; 256]) {}