Rollup merge of #147936 - Sa4dUs:offload-intrinsic, r=ZuseZ4
Offload intrinsic
This PR implements the minimal mechanisms required to run a small subset of arbitrary offload kernels without relying on hardcoded names or metadata.
- `offload(kernel, (..args))`: an intrinsic that generates the necessary host-side LLVM-IR code.
- `rustc_offload_kernel`: a builtin attribute that marks device kernels to be handled appropriately.
Example usage (pseudocode):
```rust
fn kernel(x: *mut [f64; 128]) {
core::intrinsics::offload(kernel_1, (x,))
}
#[cfg(target_os = "linux")]
extern "C" {
pub fn kernel_1(array_b: *mut [f64; 128]);
}
#[cfg(not(target_os = "linux"))]
#[rustc_offload_kernel]
extern "gpu-kernel" fn kernel_1(x: *mut [f64; 128]) {
unsafe { (*x)[0] = 21.0 };
}
```
This commit is contained in:
commit
2b150f2c65
23 changed files with 529 additions and 178 deletions
|
|
@ -18,6 +18,9 @@ codegen_llvm_lto_bitcode_from_rlib = failed to get bitcode from object file for
|
|||
codegen_llvm_mismatch_data_layout =
|
||||
data-layout for target `{$rustc_target}`, `{$rustc_layout}`, differs from LLVM target's `{$llvm_target}` default layout, `{$llvm_layout}`
|
||||
|
||||
codegen_llvm_offload_without_enable = using the offload feature requires -Z offload=Enable
|
||||
codegen_llvm_offload_without_fat_lto = using the offload feature requires -C lto=fat
|
||||
|
||||
codegen_llvm_parse_bitcode = failed to parse bitcode for LTO module
|
||||
codegen_llvm_parse_bitcode_with_llvm_err = failed to parse bitcode for LTO module: {$llvm_err}
|
||||
|
||||
|
|
|
|||
|
|
@ -30,6 +30,14 @@ pub(crate) fn apply_to_callsite(callsite: &Value, idx: AttributePlace, attrs: &[
|
|||
}
|
||||
}
|
||||
|
||||
pub(crate) fn has_string_attr(llfn: &Value, name: &str) -> bool {
|
||||
llvm::HasStringAttribute(llfn, name)
|
||||
}
|
||||
|
||||
pub(crate) fn remove_string_attr_from_llfn(llfn: &Value, name: &str) {
|
||||
llvm::RemoveStringAttrFromFn(llfn, name);
|
||||
}
|
||||
|
||||
/// Get LLVM attribute for the provided inline heuristic.
|
||||
pub(crate) fn inline_attr<'ll, 'tcx>(
|
||||
cx: &SimpleCx<'ll>,
|
||||
|
|
@ -408,6 +416,10 @@ pub(crate) fn llfn_attrs_from_instance<'ll, 'tcx>(
|
|||
to_add.push(llvm::CreateAttrString(cx.llcx, "no-builtins"));
|
||||
}
|
||||
|
||||
if codegen_fn_attrs.flags.contains(CodegenFnAttrFlags::OFFLOAD_KERNEL) {
|
||||
to_add.push(llvm::CreateAttrString(cx.llcx, "offload-kernel"))
|
||||
}
|
||||
|
||||
if codegen_fn_attrs.flags.contains(CodegenFnAttrFlags::COLD) {
|
||||
to_add.push(AttributeKind::Cold.create_attr(cx.llcx));
|
||||
}
|
||||
|
|
|
|||
|
|
@ -26,7 +26,7 @@ use crate::back::write::{
|
|||
};
|
||||
use crate::errors::{LlvmError, LtoBitcodeFromRlib};
|
||||
use crate::llvm::{self, build_string};
|
||||
use crate::{LlvmCodegenBackend, ModuleLlvm, SimpleCx};
|
||||
use crate::{LlvmCodegenBackend, ModuleLlvm};
|
||||
|
||||
/// We keep track of the computed LTO cache keys from the previous
|
||||
/// session to determine which CGUs we can reuse.
|
||||
|
|
@ -601,7 +601,6 @@ pub(crate) fn run_pass_manager(
|
|||
// We then run the llvm_optimize function a second time, to optimize the code which we generated
|
||||
// in the enzyme differentiation pass.
|
||||
let enable_ad = config.autodiff.contains(&config::AutoDiff::Enable);
|
||||
let enable_gpu = config.offload.contains(&config::Offload::Enable);
|
||||
let stage = if thin {
|
||||
write::AutodiffStage::PreAD
|
||||
} else {
|
||||
|
|
@ -616,13 +615,6 @@ pub(crate) fn run_pass_manager(
|
|||
write::llvm_optimize(cgcx, dcx, module, None, config, opt_level, opt_stage, stage);
|
||||
}
|
||||
|
||||
// Here we only handle the GPU host (=cpu) code.
|
||||
if enable_gpu && !thin && !cgcx.target_is_like_gpu {
|
||||
let cx =
|
||||
SimpleCx::new(module.module_llvm.llmod(), &module.module_llvm.llcx, cgcx.pointer_size);
|
||||
crate::builder::gpu_offload::handle_gpu_code(cgcx, &cx);
|
||||
}
|
||||
|
||||
if cfg!(feature = "llvm_enzyme") && enable_ad && !thin {
|
||||
let opt_stage = llvm::OptStage::FatLTO;
|
||||
let stage = write::AutodiffStage::PostAD;
|
||||
|
|
|
|||
|
|
@ -43,7 +43,7 @@ use crate::errors::{
|
|||
use crate::llvm::diagnostic::OptimizationDiagnosticKind::*;
|
||||
use crate::llvm::{self, DiagnosticInfo};
|
||||
use crate::type_::llvm_type_ptr;
|
||||
use crate::{LlvmCodegenBackend, ModuleLlvm, SimpleCx, base, common, llvm_util};
|
||||
use crate::{LlvmCodegenBackend, ModuleLlvm, SimpleCx, attributes, base, common, llvm_util};
|
||||
|
||||
pub(crate) fn llvm_err<'a>(dcx: DiagCtxtHandle<'_>, err: LlvmError<'a>) -> ! {
|
||||
match llvm::last_error() {
|
||||
|
|
@ -712,11 +712,12 @@ pub(crate) unsafe fn llvm_optimize(
|
|||
SimpleCx::new(module.module_llvm.llmod(), module.module_llvm.llcx, cgcx.pointer_size);
|
||||
// For now we only support up to 10 kernels named kernel_0 ... kernel_9, a follow-up PR is
|
||||
// introducing a proper offload intrinsic to solve this limitation.
|
||||
for num in 0..9 {
|
||||
let name = format!("kernel_{num}");
|
||||
if let Some(kernel) = cx.get_function(&name) {
|
||||
handle_offload(&cx, kernel);
|
||||
for func in cx.get_functions() {
|
||||
let offload_kernel = "offload-kernel";
|
||||
if attributes::has_string_attr(func, offload_kernel) {
|
||||
handle_offload(&cx, func);
|
||||
}
|
||||
attributes::remove_string_attr_from_llfn(func, offload_kernel);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -2,37 +2,13 @@ use std::ffi::CString;
|
|||
|
||||
use llvm::Linkage::*;
|
||||
use rustc_abi::Align;
|
||||
use rustc_codegen_ssa::back::write::CodegenContext;
|
||||
use rustc_codegen_ssa::traits::BaseTypeCodegenMethods;
|
||||
use rustc_middle::ty::offload_meta::OffloadMetadata;
|
||||
|
||||
use crate::builder::SBuilder;
|
||||
use crate::common::AsCCharPtr;
|
||||
use crate::llvm::AttributePlace::Function;
|
||||
use crate::llvm::{self, Linkage, Type, Value};
|
||||
use crate::{LlvmCodegenBackend, SimpleCx, attributes};
|
||||
|
||||
pub(crate) fn handle_gpu_code<'ll>(
|
||||
_cgcx: &CodegenContext<LlvmCodegenBackend>,
|
||||
cx: &'ll SimpleCx<'_>,
|
||||
) {
|
||||
// The offload memory transfer type for each kernel
|
||||
let mut memtransfer_types = vec![];
|
||||
let mut region_ids = vec![];
|
||||
let offload_entry_ty = TgtOffloadEntry::new_decl(&cx);
|
||||
// This is a temporary hack, we only search for kernel_0 to kernel_9 functions.
|
||||
// There is a draft PR in progress which will introduce a proper offload intrinsic to remove
|
||||
// this limitation.
|
||||
for num in 0..9 {
|
||||
let kernel = cx.get_function(&format!("kernel_{num}"));
|
||||
if let Some(kernel) = kernel {
|
||||
let (o, k) = gen_define_handling(&cx, kernel, offload_entry_ty, num);
|
||||
memtransfer_types.push(o);
|
||||
region_ids.push(k);
|
||||
}
|
||||
}
|
||||
|
||||
gen_call_handling(&cx, &memtransfer_types, ®ion_ids);
|
||||
}
|
||||
use crate::llvm::{self, BasicBlock, Linkage, Type, Value};
|
||||
use crate::{SimpleCx, attributes};
|
||||
|
||||
// ; Function Attrs: nounwind
|
||||
// declare i32 @__tgt_target_kernel(ptr, i64, i32, i32, ptr, ptr) #2
|
||||
|
|
@ -79,7 +55,7 @@ fn generate_at_one<'ll>(cx: &'ll SimpleCx<'_>) -> &'ll llvm::Value {
|
|||
at_one
|
||||
}
|
||||
|
||||
struct TgtOffloadEntry {
|
||||
pub(crate) struct TgtOffloadEntry {
|
||||
// uint64_t Reserved;
|
||||
// uint16_t Version;
|
||||
// uint16_t Kind;
|
||||
|
|
@ -167,7 +143,7 @@ impl KernelArgsTy {
|
|||
fn new<'ll>(
|
||||
cx: &'ll SimpleCx<'_>,
|
||||
num_args: u64,
|
||||
memtransfer_types: &[&'ll Value],
|
||||
memtransfer_types: &'ll Value,
|
||||
geps: [&'ll Value; 3],
|
||||
) -> [(Align, &'ll Value); 13] {
|
||||
let four = Align::from_bytes(4).expect("4 Byte alignment should work");
|
||||
|
|
@ -181,7 +157,7 @@ impl KernelArgsTy {
|
|||
(eight, geps[0]),
|
||||
(eight, geps[1]),
|
||||
(eight, geps[2]),
|
||||
(eight, memtransfer_types[0]),
|
||||
(eight, memtransfer_types),
|
||||
// The next two are debug infos. FIXME(offload): set them
|
||||
(eight, cx.const_null(cx.type_ptr())), // dbg
|
||||
(eight, cx.const_null(cx.type_ptr())), // dbg
|
||||
|
|
@ -194,6 +170,14 @@ impl KernelArgsTy {
|
|||
}
|
||||
}
|
||||
|
||||
// Contains LLVM values needed to manage offloading for a single kernel.
|
||||
pub(crate) struct OffloadKernelData<'ll> {
|
||||
pub offload_sizes: &'ll llvm::Value,
|
||||
pub memtransfer_types: &'ll llvm::Value,
|
||||
pub region_id: &'ll llvm::Value,
|
||||
pub offload_entry: &'ll llvm::Value,
|
||||
}
|
||||
|
||||
fn gen_tgt_data_mappers<'ll>(
|
||||
cx: &'ll SimpleCx<'_>,
|
||||
) -> (&'ll llvm::Value, &'ll llvm::Value, &'ll llvm::Value, &'ll llvm::Type) {
|
||||
|
|
@ -256,68 +240,68 @@ pub(crate) fn add_global<'ll>(
|
|||
// This function returns a memtransfer value which encodes how arguments to this kernel shall be
|
||||
// mapped to/from the gpu. It also returns a region_id with the name of this kernel, to be
|
||||
// concatenated into the list of region_ids.
|
||||
fn gen_define_handling<'ll>(
|
||||
cx: &'ll SimpleCx<'_>,
|
||||
kernel: &'ll llvm::Value,
|
||||
pub(crate) fn gen_define_handling<'ll>(
|
||||
cx: &SimpleCx<'ll>,
|
||||
offload_entry_ty: &'ll llvm::Type,
|
||||
num: i64,
|
||||
) -> (&'ll llvm::Value, &'ll llvm::Value) {
|
||||
let types = cx.func_params_types(cx.get_type_of_global(kernel));
|
||||
metadata: &[OffloadMetadata],
|
||||
types: &[&Type],
|
||||
symbol: &str,
|
||||
) -> OffloadKernelData<'ll> {
|
||||
// It seems like non-pointer values are automatically mapped. So here, we focus on pointer (or
|
||||
// reference) types.
|
||||
let num_ptr_types = types
|
||||
.iter()
|
||||
.filter(|&x| matches!(cx.type_kind(x), rustc_codegen_ssa::common::TypeKind::Pointer))
|
||||
.count();
|
||||
let ptr_meta = types.iter().zip(metadata).filter_map(|(&x, meta)| match cx.type_kind(x) {
|
||||
rustc_codegen_ssa::common::TypeKind::Pointer => Some(meta),
|
||||
_ => None,
|
||||
});
|
||||
|
||||
// We do not know their size anymore at this level, so hardcode a placeholder.
|
||||
// A follow-up pr will track these from the frontend, where we still have Rust types.
|
||||
// Then, we will be able to figure out that e.g. `&[f32;256]` will result in 4*256 bytes.
|
||||
// I decided that 1024 bytes is a great placeholder value for now.
|
||||
add_priv_unnamed_arr(&cx, &format!(".offload_sizes.{num}"), &vec![1024; num_ptr_types]);
|
||||
// FIXME(Sa4dUs): add `OMP_MAP_TARGET_PARAM = 0x20` only if necessary
|
||||
let (ptr_sizes, ptr_transfer): (Vec<_>, Vec<_>) =
|
||||
ptr_meta.map(|m| (m.payload_size, m.mode.bits() | 0x20)).unzip();
|
||||
|
||||
let offload_sizes = add_priv_unnamed_arr(&cx, &format!(".offload_sizes.{symbol}"), &ptr_sizes);
|
||||
// Here we figure out whether something needs to be copied to the gpu (=1), from the gpu (=2),
|
||||
// or both to and from the gpu (=3). Other values shouldn't affect us for now.
|
||||
// A non-mutable reference or pointer will be 1, an array that's not read, but fully overwritten
|
||||
// will be 2. For now, everything is 3, until we have our frontend set up.
|
||||
// 1+2+32: 1 (MapTo), 2 (MapFrom), 32 (Add one extra input ptr per function, to be used later).
|
||||
let memtransfer_types = add_priv_unnamed_arr(
|
||||
&cx,
|
||||
&format!(".offload_maptypes.{num}"),
|
||||
&vec![1 + 2 + 32; num_ptr_types],
|
||||
);
|
||||
let memtransfer_types =
|
||||
add_priv_unnamed_arr(&cx, &format!(".offload_maptypes.{symbol}"), &ptr_transfer);
|
||||
|
||||
// Next: For each function, generate these three entries. A weak constant,
|
||||
// the llvm.rodata entry name, and the llvm_offload_entries value
|
||||
|
||||
let name = format!(".kernel_{num}.region_id");
|
||||
let name = format!(".{symbol}.region_id");
|
||||
let initializer = cx.get_const_i8(0);
|
||||
let region_id = add_unnamed_global(&cx, &name, initializer, WeakAnyLinkage);
|
||||
|
||||
let c_entry_name = CString::new(format!("kernel_{num}")).unwrap();
|
||||
let c_entry_name = CString::new(symbol).unwrap();
|
||||
let c_val = c_entry_name.as_bytes_with_nul();
|
||||
let offload_entry_name = format!(".offloading.entry_name.{num}");
|
||||
let offload_entry_name = format!(".offloading.entry_name.{symbol}");
|
||||
|
||||
let initializer = crate::common::bytes_in_context(cx.llcx, c_val);
|
||||
let llglobal = add_unnamed_global(&cx, &offload_entry_name, initializer, InternalLinkage);
|
||||
llvm::set_alignment(llglobal, Align::ONE);
|
||||
llvm::set_section(llglobal, c".llvm.rodata.offloading");
|
||||
let name = format!(".offloading.entry.kernel_{num}");
|
||||
|
||||
let name = format!(".offloading.entry.{symbol}");
|
||||
|
||||
// See the __tgt_offload_entry documentation above.
|
||||
let elems = TgtOffloadEntry::new(&cx, region_id, llglobal);
|
||||
|
||||
let initializer = crate::common::named_struct(offload_entry_ty, &elems);
|
||||
let c_name = CString::new(name).unwrap();
|
||||
let llglobal = llvm::add_global(cx.llmod, offload_entry_ty, &c_name);
|
||||
llvm::set_global_constant(llglobal, true);
|
||||
llvm::set_linkage(llglobal, WeakAnyLinkage);
|
||||
llvm::set_initializer(llglobal, initializer);
|
||||
llvm::set_alignment(llglobal, Align::EIGHT);
|
||||
let offload_entry = llvm::add_global(cx.llmod, offload_entry_ty, &c_name);
|
||||
llvm::set_global_constant(offload_entry, true);
|
||||
llvm::set_linkage(offload_entry, WeakAnyLinkage);
|
||||
llvm::set_initializer(offload_entry, initializer);
|
||||
llvm::set_alignment(offload_entry, Align::EIGHT);
|
||||
let c_section_name = CString::new("llvm_offload_entries").unwrap();
|
||||
llvm::set_section(llglobal, &c_section_name);
|
||||
(memtransfer_types, region_id)
|
||||
llvm::set_section(offload_entry, &c_section_name);
|
||||
|
||||
OffloadKernelData { offload_sizes, memtransfer_types, region_id, offload_entry }
|
||||
}
|
||||
|
||||
pub(crate) fn declare_offload_fn<'ll>(
|
||||
fn declare_offload_fn<'ll>(
|
||||
cx: &'ll SimpleCx<'_>,
|
||||
name: &str,
|
||||
ty: &'ll llvm::Type,
|
||||
|
|
@ -333,8 +317,7 @@ pub(crate) fn declare_offload_fn<'ll>(
|
|||
}
|
||||
|
||||
// For each kernel *call*, we now use some of our previous declared globals to move data to and from
|
||||
// the gpu. We don't have a proper frontend yet, so we assume that every call to a kernel function
|
||||
// from main is intended to run on the GPU. For now, we only handle the data transfer part of it.
|
||||
// the gpu. For now, we only handle the data transfer part of it.
|
||||
// If two consecutive kernels use the same memory, we still move it to the host and back to the gpu.
|
||||
// Since in our frontend users (by default) don't have to specify data transfer, this is something
|
||||
// we should optimize in the future! We also assume that everything should be copied back and forth,
|
||||
|
|
@ -352,11 +335,16 @@ pub(crate) fn declare_offload_fn<'ll>(
|
|||
// 4. set insert point after kernel call.
|
||||
// 5. generate all the GEPS and stores, to be used in 6)
|
||||
// 6. generate __tgt_target_data_end calls to move data from the GPU
|
||||
fn gen_call_handling<'ll>(
|
||||
cx: &'ll SimpleCx<'_>,
|
||||
memtransfer_types: &[&'ll llvm::Value],
|
||||
region_ids: &[&'ll llvm::Value],
|
||||
pub(crate) fn gen_call_handling<'ll>(
|
||||
cx: &SimpleCx<'ll>,
|
||||
bb: &BasicBlock,
|
||||
offload_data: &OffloadKernelData<'ll>,
|
||||
args: &[&'ll Value],
|
||||
types: &[&Type],
|
||||
metadata: &[OffloadMetadata],
|
||||
) {
|
||||
let OffloadKernelData { offload_sizes, offload_entry, memtransfer_types, region_id } =
|
||||
offload_data;
|
||||
let (tgt_decl, tgt_target_kernel_ty) = generate_launcher(&cx);
|
||||
// %struct.__tgt_bin_desc = type { i32, ptr, ptr, ptr }
|
||||
let tptr = cx.type_ptr();
|
||||
|
|
@ -368,27 +356,32 @@ fn gen_call_handling<'ll>(
|
|||
let tgt_kernel_decl = KernelArgsTy::new_decl(&cx);
|
||||
let (begin_mapper_decl, _, end_mapper_decl, fn_ty) = gen_tgt_data_mappers(&cx);
|
||||
|
||||
let main_fn = cx.get_function("main");
|
||||
let Some(main_fn) = main_fn else { return };
|
||||
let kernel_name = "kernel_1";
|
||||
let call = unsafe {
|
||||
llvm::LLVMRustGetFunctionCall(main_fn, kernel_name.as_c_char_ptr(), kernel_name.len())
|
||||
};
|
||||
let Some(kernel_call) = call else {
|
||||
return;
|
||||
};
|
||||
let kernel_call_bb = unsafe { llvm::LLVMGetInstructionParent(kernel_call) };
|
||||
let called = unsafe { llvm::LLVMGetCalledValue(kernel_call).unwrap() };
|
||||
let mut builder = SBuilder::build(cx, kernel_call_bb);
|
||||
let mut builder = SBuilder::build(cx, bb);
|
||||
|
||||
let types = cx.func_params_types(cx.get_type_of_global(called));
|
||||
let num_args = types.len() as u64;
|
||||
let ip = unsafe { llvm::LLVMRustGetInsertPoint(&builder.llbuilder) };
|
||||
|
||||
// FIXME(Sa4dUs): dummy loads are a temp workaround, we should find a proper way to prevent these
|
||||
// variables from being optimized away
|
||||
for val in [offload_sizes, offload_entry] {
|
||||
unsafe {
|
||||
let dummy = llvm::LLVMBuildLoad2(
|
||||
&builder.llbuilder,
|
||||
llvm::LLVMTypeOf(val),
|
||||
val,
|
||||
b"dummy\0".as_ptr() as *const _,
|
||||
);
|
||||
llvm::LLVMSetVolatile(dummy, llvm::TRUE);
|
||||
}
|
||||
}
|
||||
|
||||
// Step 0)
|
||||
// %struct.__tgt_bin_desc = type { i32, ptr, ptr, ptr }
|
||||
// %6 = alloca %struct.__tgt_bin_desc, align 8
|
||||
unsafe { llvm::LLVMRustPositionBuilderPastAllocas(builder.llbuilder, main_fn) };
|
||||
|
||||
let llfn = unsafe { llvm::LLVMGetBasicBlockParent(bb) };
|
||||
unsafe {
|
||||
llvm::LLVMRustPositionBuilderPastAllocas(&builder.llbuilder, llfn);
|
||||
}
|
||||
let tgt_bin_desc_alloca = builder.direct_alloca(tgt_bin_desc, Align::EIGHT, "EmptyDesc");
|
||||
|
||||
let ty = cx.type_array(cx.type_ptr(), num_args);
|
||||
|
|
@ -404,15 +397,16 @@ fn gen_call_handling<'ll>(
|
|||
let a5 = builder.direct_alloca(tgt_kernel_decl, Align::EIGHT, "kernel_args");
|
||||
|
||||
// Step 1)
|
||||
unsafe { llvm::LLVMRustPositionBefore(builder.llbuilder, kernel_call) };
|
||||
unsafe {
|
||||
llvm::LLVMRustRestoreInsertPoint(&builder.llbuilder, ip);
|
||||
}
|
||||
builder.memset(tgt_bin_desc_alloca, cx.get_const_i8(0), cx.get_const_i64(32), Align::EIGHT);
|
||||
|
||||
// Now we allocate once per function param, a copy to be passed to one of our maps.
|
||||
let mut vals = vec![];
|
||||
let mut geps = vec![];
|
||||
let i32_0 = cx.get_const_i32(0);
|
||||
for index in 0..types.len() {
|
||||
let v = unsafe { llvm::LLVMGetOperand(kernel_call, index as u32).unwrap() };
|
||||
for &v in args {
|
||||
let gep = builder.inbounds_gep(cx.type_f32(), v, &[i32_0]);
|
||||
vals.push(v);
|
||||
geps.push(gep);
|
||||
|
|
@ -437,10 +431,8 @@ fn gen_call_handling<'ll>(
|
|||
let gep2 = builder.inbounds_gep(ty, a2, &[i32_0, idx]);
|
||||
builder.store(geps[i as usize], gep2, Align::EIGHT);
|
||||
let gep3 = builder.inbounds_gep(ty2, a4, &[i32_0, idx]);
|
||||
// As mentioned above, we don't use Rust type information yet. So for now we will just
|
||||
// assume that we have 1024 bytes, 256 f32 values.
|
||||
// FIXME(offload): write an offload frontend and handle arbitrary types.
|
||||
builder.store(cx.get_const_i64(1024), gep3, Align::EIGHT);
|
||||
builder.store(cx.get_const_i64(metadata[i as usize].payload_size), gep3, Align::EIGHT);
|
||||
}
|
||||
|
||||
// For now we have a very simplistic indexing scheme into our
|
||||
|
|
@ -482,9 +474,17 @@ fn gen_call_handling<'ll>(
|
|||
|
||||
// Step 2)
|
||||
let s_ident_t = generate_at_one(&cx);
|
||||
let o = memtransfer_types[0];
|
||||
let geps = get_geps(&mut builder, &cx, ty, ty2, a1, a2, a4);
|
||||
generate_mapper_call(&mut builder, &cx, geps, o, begin_mapper_decl, fn_ty, num_args, s_ident_t);
|
||||
generate_mapper_call(
|
||||
&mut builder,
|
||||
&cx,
|
||||
geps,
|
||||
memtransfer_types,
|
||||
begin_mapper_decl,
|
||||
fn_ty,
|
||||
num_args,
|
||||
s_ident_t,
|
||||
);
|
||||
let values = KernelArgsTy::new(&cx, num_args, memtransfer_types, geps);
|
||||
|
||||
// Step 3)
|
||||
|
|
@ -501,26 +501,26 @@ fn gen_call_handling<'ll>(
|
|||
// FIXME(offload): Don't hardcode the numbers of threads in the future.
|
||||
cx.get_const_i32(2097152),
|
||||
cx.get_const_i32(256),
|
||||
region_ids[0],
|
||||
region_id,
|
||||
a5,
|
||||
];
|
||||
let offload_success = builder.call(tgt_target_kernel_ty, tgt_decl, &args, None);
|
||||
builder.call(tgt_target_kernel_ty, tgt_decl, &args, None);
|
||||
// %41 = call i32 @__tgt_target_kernel(ptr @1, i64 -1, i32 2097152, i32 256, ptr @.kernel_1.region_id, ptr %kernel_args)
|
||||
unsafe {
|
||||
let next = llvm::LLVMGetNextInstruction(offload_success).unwrap();
|
||||
llvm::LLVMRustPositionAfter(builder.llbuilder, next);
|
||||
llvm::LLVMInstructionEraseFromParent(next);
|
||||
}
|
||||
|
||||
// Step 4)
|
||||
let geps = get_geps(&mut builder, &cx, ty, ty2, a1, a2, a4);
|
||||
generate_mapper_call(&mut builder, &cx, geps, o, end_mapper_decl, fn_ty, num_args, s_ident_t);
|
||||
generate_mapper_call(
|
||||
&mut builder,
|
||||
&cx,
|
||||
geps,
|
||||
memtransfer_types,
|
||||
end_mapper_decl,
|
||||
fn_ty,
|
||||
num_args,
|
||||
s_ident_t,
|
||||
);
|
||||
|
||||
builder.call(mapper_fn_ty, unregister_lib_decl, &[tgt_bin_desc_alloca], None);
|
||||
|
||||
drop(builder);
|
||||
// FIXME(offload) The issue is that we right now add a call to the gpu version of the function,
|
||||
// and then delete the call to the CPU version. In the future, we should use an intrinsic which
|
||||
// directly resolves to a call to the GPU version.
|
||||
unsafe { llvm::LLVMDeleteFunction(called) };
|
||||
}
|
||||
|
|
|
|||
|
|
@ -791,6 +791,16 @@ impl<'ll, CX: Borrow<SCx<'ll>>> GenericCx<'ll, CX> {
|
|||
llvm::LLVMMDStringInContext2(self.llcx(), name.as_ptr() as *const c_char, name.len())
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn get_functions(&self) -> Vec<&'ll Value> {
|
||||
let mut functions = vec![];
|
||||
let mut func = unsafe { llvm::LLVMGetFirstFunction(self.llmod()) };
|
||||
while let Some(f) = func {
|
||||
functions.push(f);
|
||||
func = unsafe { llvm::LLVMGetNextFunction(f) }
|
||||
}
|
||||
functions
|
||||
}
|
||||
}
|
||||
|
||||
impl<'ll, 'tcx> MiscCodegenMethods<'tcx> for CodegenCx<'ll, 'tcx> {
|
||||
|
|
|
|||
|
|
@ -40,6 +40,14 @@ pub(crate) struct AutoDiffWithoutLto;
|
|||
#[diag(codegen_llvm_autodiff_without_enable)]
|
||||
pub(crate) struct AutoDiffWithoutEnable;
|
||||
|
||||
#[derive(Diagnostic)]
|
||||
#[diag(codegen_llvm_offload_without_enable)]
|
||||
pub(crate) struct OffloadWithoutEnable;
|
||||
|
||||
#[derive(Diagnostic)]
|
||||
#[diag(codegen_llvm_offload_without_fat_lto)]
|
||||
pub(crate) struct OffloadWithoutFatLTO;
|
||||
|
||||
#[derive(Diagnostic)]
|
||||
#[diag(codegen_llvm_lto_bitcode_from_rlib)]
|
||||
pub(crate) struct LtoBitcodeFromRlib {
|
||||
|
|
|
|||
|
|
@ -13,6 +13,7 @@ use rustc_hir::def_id::LOCAL_CRATE;
|
|||
use rustc_hir::{self as hir};
|
||||
use rustc_middle::mir::BinOp;
|
||||
use rustc_middle::ty::layout::{FnAbiOf, HasTyCtxt, HasTypingEnv, LayoutOf};
|
||||
use rustc_middle::ty::offload_meta::OffloadMetadata;
|
||||
use rustc_middle::ty::{self, GenericArgsRef, Instance, SimdAlign, Ty, TyCtxt, TypingEnv};
|
||||
use rustc_middle::{bug, span_bug};
|
||||
use rustc_session::config::CrateType;
|
||||
|
|
@ -25,8 +26,11 @@ use tracing::debug;
|
|||
use crate::abi::FnAbiLlvmExt;
|
||||
use crate::builder::Builder;
|
||||
use crate::builder::autodiff::{adjust_activity_to_abi, generate_enzyme_call};
|
||||
use crate::builder::gpu_offload::TgtOffloadEntry;
|
||||
use crate::context::CodegenCx;
|
||||
use crate::errors::{AutoDiffWithoutEnable, AutoDiffWithoutLto};
|
||||
use crate::errors::{
|
||||
AutoDiffWithoutEnable, AutoDiffWithoutLto, OffloadWithoutEnable, OffloadWithoutFatLTO,
|
||||
};
|
||||
use crate::llvm::{self, Metadata, Type, Value};
|
||||
use crate::type_of::LayoutLlvmExt;
|
||||
use crate::va_arg::emit_va_arg;
|
||||
|
|
@ -197,6 +201,24 @@ impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> {
|
|||
codegen_autodiff(self, tcx, instance, args, result);
|
||||
return Ok(());
|
||||
}
|
||||
sym::offload => {
|
||||
if !tcx
|
||||
.sess
|
||||
.opts
|
||||
.unstable_opts
|
||||
.offload
|
||||
.contains(&rustc_session::config::Offload::Enable)
|
||||
{
|
||||
let _ = tcx.dcx().emit_almost_fatal(OffloadWithoutEnable);
|
||||
}
|
||||
|
||||
if tcx.sess.lto() != rustc_session::config::Lto::Fat {
|
||||
let _ = tcx.dcx().emit_almost_fatal(OffloadWithoutFatLTO);
|
||||
}
|
||||
|
||||
codegen_offload(self, tcx, instance, args);
|
||||
return Ok(());
|
||||
}
|
||||
sym::is_val_statically_known => {
|
||||
if let OperandValue::Immediate(imm) = args[0].val {
|
||||
self.call_intrinsic(
|
||||
|
|
@ -1231,6 +1253,62 @@ fn codegen_autodiff<'ll, 'tcx>(
|
|||
);
|
||||
}
|
||||
|
||||
// Generates the LLVM code to offload a Rust function to a target device (e.g., GPU).
|
||||
// For each kernel call, it generates the necessary globals (including metadata such as
|
||||
// size and pass mode), manages memory mapping to and from the device, handles all
|
||||
// data transfers, and launches the kernel on the target device.
|
||||
fn codegen_offload<'ll, 'tcx>(
|
||||
bx: &mut Builder<'_, 'll, 'tcx>,
|
||||
tcx: TyCtxt<'tcx>,
|
||||
instance: ty::Instance<'tcx>,
|
||||
args: &[OperandRef<'tcx, &'ll Value>],
|
||||
) {
|
||||
let cx = bx.cx;
|
||||
let fn_args = instance.args;
|
||||
|
||||
let (target_id, target_args) = match fn_args.into_type_list(tcx)[0].kind() {
|
||||
ty::FnDef(def_id, params) => (def_id, params),
|
||||
_ => bug!("invalid offload intrinsic arg"),
|
||||
};
|
||||
|
||||
let fn_target = match Instance::try_resolve(tcx, cx.typing_env(), *target_id, target_args) {
|
||||
Ok(Some(instance)) => instance,
|
||||
Ok(None) => bug!(
|
||||
"could not resolve ({:?}, {:?}) to a specific offload instance",
|
||||
target_id,
|
||||
target_args
|
||||
),
|
||||
Err(_) => {
|
||||
// An error has already been emitted
|
||||
return;
|
||||
}
|
||||
};
|
||||
|
||||
let args = get_args_from_tuple(bx, args[1], fn_target);
|
||||
let target_symbol = symbol_name_for_instance_in_crate(tcx, fn_target, LOCAL_CRATE);
|
||||
|
||||
let offload_entry_ty = TgtOffloadEntry::new_decl(&cx);
|
||||
|
||||
let sig = tcx.fn_sig(fn_target.def_id()).skip_binder().skip_binder();
|
||||
let inputs = sig.inputs();
|
||||
|
||||
let metadata = inputs.iter().map(|ty| OffloadMetadata::from_ty(tcx, *ty)).collect::<Vec<_>>();
|
||||
|
||||
let types = inputs.iter().map(|ty| cx.layout_of(*ty).llvm_type(cx)).collect::<Vec<_>>();
|
||||
|
||||
let offload_data = crate::builder::gpu_offload::gen_define_handling(
|
||||
cx,
|
||||
offload_entry_ty,
|
||||
&metadata,
|
||||
&types,
|
||||
&target_symbol,
|
||||
);
|
||||
|
||||
// FIXME(Sa4dUs): pass the original builder once we separate kernel launch logic from globals
|
||||
let bb = unsafe { llvm::LLVMGetInsertBlock(bx.llbuilder) };
|
||||
crate::builder::gpu_offload::gen_call_handling(cx, bb, &offload_data, &args, &types, &metadata);
|
||||
}
|
||||
|
||||
fn get_args_from_tuple<'ll, 'tcx>(
|
||||
bx: &mut Builder<'_, 'll, 'tcx>,
|
||||
tuple_op: OperandRef<'tcx, &'ll Value>,
|
||||
|
|
|
|||
|
|
@ -1160,13 +1160,9 @@ unsafe extern "C" {
|
|||
) -> &'a BasicBlock;
|
||||
|
||||
// Operations on instructions
|
||||
pub(crate) fn LLVMGetInstructionParent(Inst: &Value) -> &BasicBlock;
|
||||
pub(crate) fn LLVMGetCalledValue(CallInst: &Value) -> Option<&Value>;
|
||||
pub(crate) fn LLVMIsAInstruction(Val: &Value) -> Option<&Value>;
|
||||
pub(crate) fn LLVMGetFirstBasicBlock(Fn: &Value) -> &BasicBlock;
|
||||
pub(crate) fn LLVMGetOperand(Val: &Value, Index: c_uint) -> Option<&Value>;
|
||||
pub(crate) fn LLVMGetNextInstruction(Val: &Value) -> Option<&Value>;
|
||||
pub(crate) fn LLVMInstructionEraseFromParent(Val: &Value);
|
||||
|
||||
// Operations on call sites
|
||||
pub(crate) fn LLVMSetInstructionCallConv(Instr: &Value, CC: c_uint);
|
||||
|
|
@ -2484,6 +2480,8 @@ unsafe extern "C" {
|
|||
|
||||
pub(crate) fn LLVMRustPositionBuilderPastAllocas<'a>(B: &Builder<'a>, Fn: &'a Value);
|
||||
pub(crate) fn LLVMRustPositionBuilderAtStart<'a>(B: &Builder<'a>, BB: &'a BasicBlock);
|
||||
pub(crate) fn LLVMRustGetInsertPoint<'a>(B: &Builder<'a>) -> &'a Value;
|
||||
pub(crate) fn LLVMRustRestoreInsertPoint<'a>(B: &Builder<'a>, IP: &'a Value);
|
||||
|
||||
pub(crate) fn LLVMRustSetModulePICLevel(M: &Module);
|
||||
pub(crate) fn LLVMRustSetModulePIELevel(M: &Module);
|
||||
|
|
|
|||
|
|
@ -43,6 +43,14 @@ pub(crate) fn AddFunctionAttributes<'ll>(
|
|||
}
|
||||
}
|
||||
|
||||
pub(crate) fn HasStringAttribute<'ll>(llfn: &'ll Value, name: &str) -> bool {
|
||||
unsafe { LLVMRustHasFnAttribute(llfn, name.as_c_char_ptr(), name.len()) }
|
||||
}
|
||||
|
||||
pub(crate) fn RemoveStringAttrFromFn<'ll>(llfn: &'ll Value, name: &str) {
|
||||
unsafe { LLVMRustRemoveFnAttribute(llfn, name.as_c_char_ptr(), name.len()) }
|
||||
}
|
||||
|
||||
pub(crate) fn AddCallSiteAttributes<'ll>(
|
||||
callsite: &'ll Value,
|
||||
idx: AttributePlace,
|
||||
|
|
|
|||
|
|
@ -334,6 +334,9 @@ fn process_builtin_attrs(
|
|||
codegen_fn_attrs.patchable_function_entry =
|
||||
parse_patchable_function_entry(tcx, attr);
|
||||
}
|
||||
sym::rustc_offload_kernel => {
|
||||
codegen_fn_attrs.flags |= CodegenFnAttrFlags::OFFLOAD_KERNEL
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1117,6 +1117,11 @@ pub static BUILTIN_ATTRIBUTES: &[BuiltinAttribute] = &[
|
|||
rustc_autodiff, Normal,
|
||||
template!(Word, List: &[r#""...""#]), DuplicatesOk,
|
||||
EncodeCrossCrate::Yes,
|
||||
),
|
||||
rustc_attr!(
|
||||
rustc_offload_kernel, Normal,
|
||||
template!(Word), DuplicatesOk,
|
||||
EncodeCrossCrate::Yes,
|
||||
),
|
||||
// Traces that are left when `cfg` and `cfg_attr` attributes are expanded.
|
||||
// The attributes are not gated, to avoid stability errors, but they cannot be used in stable
|
||||
|
|
|
|||
|
|
@ -163,6 +163,7 @@ fn intrinsic_operation_unsafety(tcx: TyCtxt<'_>, intrinsic_id: LocalDefId) -> hi
|
|||
| sym::minnumf128
|
||||
| sym::mul_with_overflow
|
||||
| sym::needs_drop
|
||||
| sym::offload
|
||||
| sym::offset_of
|
||||
| sym::overflow_checks
|
||||
| sym::powf16
|
||||
|
|
@ -313,6 +314,7 @@ pub(crate) fn check_intrinsic_type(
|
|||
let type_id = tcx.type_of(tcx.lang_items().type_id().unwrap()).instantiate_identity();
|
||||
(0, 0, vec![type_id, type_id], tcx.types.bool)
|
||||
}
|
||||
sym::offload => (3, 0, vec![param(0), param(1)], param(2)),
|
||||
sym::offset => (2, 0, vec![param(0), param(1)], param(0)),
|
||||
sym::arith_offset => (
|
||||
1,
|
||||
|
|
|
|||
|
|
@ -1436,6 +1436,39 @@ extern "C" void LLVMRustPositionAfter(LLVMBuilderRef B, LLVMValueRef Instr) {
|
|||
}
|
||||
}
|
||||
|
||||
extern "C" LLVMValueRef LLVMRustGetInsertPoint(LLVMBuilderRef B) {
|
||||
llvm::IRBuilderBase &IRB = *unwrap(B);
|
||||
|
||||
llvm::IRBuilderBase::InsertPoint ip = IRB.saveIP();
|
||||
llvm::BasicBlock *BB = ip.getBlock();
|
||||
|
||||
if (!BB)
|
||||
return nullptr;
|
||||
|
||||
auto it = ip.getPoint();
|
||||
|
||||
if (it == BB->end())
|
||||
return nullptr;
|
||||
|
||||
llvm::Instruction *I = &*it;
|
||||
return wrap(I);
|
||||
}
|
||||
|
||||
extern "C" void LLVMRustRestoreInsertPoint(LLVMBuilderRef B,
|
||||
LLVMValueRef Instr) {
|
||||
llvm::IRBuilderBase &IRB = *unwrap(B);
|
||||
|
||||
if (!Instr) {
|
||||
llvm::BasicBlock *BB = IRB.GetInsertBlock();
|
||||
if (BB)
|
||||
IRB.SetInsertPoint(BB);
|
||||
return;
|
||||
}
|
||||
|
||||
llvm::Instruction *I = unwrap<llvm::Instruction>(Instr);
|
||||
IRB.SetInsertPoint(I);
|
||||
}
|
||||
|
||||
extern "C" LLVMValueRef
|
||||
LLVMRustGetFunctionCall(LLVMValueRef Fn, const char *Name, size_t NameLen) {
|
||||
auto targetName = StringRef(Name, NameLen);
|
||||
|
|
|
|||
|
|
@ -190,6 +190,8 @@ bitflags::bitflags! {
|
|||
const NO_BUILTINS = 1 << 15;
|
||||
/// Marks foreign items, to make `contains_extern_indicator` cheaper.
|
||||
const FOREIGN_ITEM = 1 << 16;
|
||||
/// `#[rustc_offload_kernel]`: indicates that this is an offload kernel, an extra ptr arg will be added.
|
||||
const OFFLOAD_KERNEL = 1 << 17;
|
||||
}
|
||||
}
|
||||
rustc_data_structures::external_bitflags_debug! { CodegenFnAttrFlags }
|
||||
|
|
|
|||
|
|
@ -129,6 +129,7 @@ pub mod fast_reject;
|
|||
pub mod inhabitedness;
|
||||
pub mod layout;
|
||||
pub mod normalize_erasing_regions;
|
||||
pub mod offload_meta;
|
||||
pub mod pattern;
|
||||
pub mod print;
|
||||
pub mod relate;
|
||||
|
|
|
|||
119
compiler/rustc_middle/src/ty/offload_meta.rs
Normal file
119
compiler/rustc_middle/src/ty/offload_meta.rs
Normal file
|
|
@ -0,0 +1,119 @@
|
|||
use bitflags::bitflags;
|
||||
|
||||
use crate::ty::{self, PseudoCanonicalInput, Ty, TyCtxt, TypingEnv};
|
||||
|
||||
/// Host-side metadata describing how a single kernel argument is transferred
/// to/from the device by the offload runtime.
pub struct OffloadMetadata {
    /// Size in bytes of the data moved for this argument; pointers and
    /// references are measured by their pointee (see `get_payload_size`).
    pub payload_size: u64,
    /// OpenMP-style mapping flags (`TO`, `FROM`, ...) chosen from the
    /// argument's type (see `MappingFlags::from_ty`).
    pub mode: MappingFlags,
}
|
||||
|
||||
bitflags! {
    /// Mirrors `OpenMPOffloadMappingFlags` from Clang/OpenMP.
    ///
    /// NOTE: these numeric values form an ABI with `libomptarget`; they must
    /// stay bit-for-bit in sync with Clang's definitions and must not be
    /// renumbered independently.
    #[derive(Debug, Copy, Clone)]
    #[repr(transparent)]
    pub struct MappingFlags: u64 {
        /// No flags.
        const NONE = 0x0;
        /// Allocate memory on the device and move data from host to device.
        const TO = 0x01;
        /// Allocate memory on the device and move data from device to host.
        const FROM = 0x02;
        /// Always perform the requested mapping action, even if already mapped.
        const ALWAYS = 0x04;
        /// Delete the element from the device environment, ignoring ref count.
        const DELETE = 0x08;
        /// The element being mapped is a pointer-pointee pair.
        const PTR_AND_OBJ = 0x10;
        /// The base address should be passed to the target kernel as argument.
        const TARGET_PARAM = 0x20;
        /// The runtime must return the device pointer.
        const RETURN_PARAM = 0x40;
        /// The reference being passed is a pointer to private data.
        const PRIVATE = 0x80;
        /// Pass the element by value.
        const LITERAL = 0x100;
        /// Implicit map (generated by compiler, not explicit in code).
        const IMPLICIT = 0x200;
        /// Hint to allocate memory close to the target device.
        const CLOSE = 0x400;
        /// Reserved (0x800 in OpenMP for XLC compatibility).
        const RESERVED = 0x800;
        /// Require that the data is already allocated on the device.
        const PRESENT = 0x1000;
        /// Increment/decrement a separate ref counter (OpenACC compatibility).
        const OMPX_HOLD = 0x2000;
        /// Used for non-contiguous list items in target update.
        const NON_CONTIG = 0x100000000000;
        /// 16 MSBs indicate membership in a struct.
        const MEMBER_OF = 0xffff000000000000;
    }
}
|
||||
|
||||
impl OffloadMetadata {
|
||||
pub fn from_ty<'tcx>(tcx: TyCtxt<'tcx>, ty: Ty<'tcx>) -> Self {
|
||||
OffloadMetadata {
|
||||
payload_size: get_payload_size(tcx, ty),
|
||||
mode: MappingFlags::from_ty(tcx, ty),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// FIXME(Sa4dUs): implement a solid logic to determine the payload size
|
||||
fn get_payload_size<'tcx>(tcx: TyCtxt<'tcx>, ty: Ty<'tcx>) -> u64 {
|
||||
match ty.kind() {
|
||||
ty::RawPtr(inner, _) | ty::Ref(_, inner, _) => get_payload_size(tcx, *inner),
|
||||
_ => tcx
|
||||
.layout_of(PseudoCanonicalInput {
|
||||
typing_env: TypingEnv::fully_monomorphized(),
|
||||
value: ty,
|
||||
})
|
||||
.unwrap()
|
||||
.size
|
||||
.bytes(),
|
||||
}
|
||||
}
|
||||
|
||||
impl MappingFlags {
|
||||
fn from_ty<'tcx>(tcx: TyCtxt<'tcx>, ty: Ty<'tcx>) -> Self {
|
||||
use rustc_ast::Mutability::*;
|
||||
|
||||
match ty.kind() {
|
||||
ty::Bool
|
||||
| ty::Char
|
||||
| ty::Int(_)
|
||||
| ty::Uint(_)
|
||||
| ty::Float(_)
|
||||
| ty::Adt(_, _)
|
||||
| ty::Tuple(_)
|
||||
| ty::Array(_, _)
|
||||
| ty::Alias(_, _)
|
||||
| ty::Param(_) => MappingFlags::TO,
|
||||
|
||||
ty::RawPtr(_, Not) | ty::Ref(_, _, Not) => MappingFlags::TO,
|
||||
|
||||
ty::RawPtr(_, Mut) | ty::Ref(_, _, Mut) => MappingFlags::TO | MappingFlags::FROM,
|
||||
|
||||
ty::Slice(_) | ty::Str | ty::Dynamic(_, _) => MappingFlags::TO | MappingFlags::FROM,
|
||||
|
||||
ty::Foreign(_) | ty::Pat(_, _) | ty::UnsafeBinder(_) => {
|
||||
MappingFlags::TO | MappingFlags::FROM
|
||||
}
|
||||
|
||||
ty::FnDef(_, _)
|
||||
| ty::FnPtr(_, _)
|
||||
| ty::Closure(_, _)
|
||||
| ty::CoroutineClosure(_, _)
|
||||
| ty::Coroutine(_, _)
|
||||
| ty::CoroutineWitness(_, _)
|
||||
| ty::Never
|
||||
| ty::Bound(_, _)
|
||||
| ty::Placeholder(_)
|
||||
| ty::Infer(_)
|
||||
| ty::Error(_) => {
|
||||
tcx.dcx()
|
||||
.span_err(rustc_span::DUMMY_SP, format!("type `{ty:?}` cannot be offloaded"));
|
||||
MappingFlags::empty()
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1584,6 +1584,7 @@ symbols! {
|
|||
object_safe_for_dispatch,
|
||||
of,
|
||||
off,
|
||||
offload,
|
||||
offset,
|
||||
offset_of,
|
||||
offset_of_enum,
|
||||
|
|
@ -1966,6 +1967,7 @@ symbols! {
|
|||
rustc_objc_class,
|
||||
rustc_objc_selector,
|
||||
rustc_object_lifetime_default,
|
||||
rustc_offload_kernel,
|
||||
rustc_on_unimplemented,
|
||||
rustc_outlives,
|
||||
rustc_paren_sugar,
|
||||
|
|
|
|||
|
|
@ -3324,6 +3324,38 @@ pub const fn copysignf128(x: f128, y: f128) -> f128;
|
|||
#[rustc_intrinsic]
|
||||
pub const fn autodiff<F, G, T: crate::marker::Tuple, R>(f: F, df: G, args: T) -> R;
|
||||
|
||||
/// Generates the LLVM body of a wrapper function to offload a kernel `f`.
|
||||
///
|
||||
/// Type Parameters:
|
||||
/// - `F`: The kernel to offload. Must be a function item.
|
||||
/// - `T`: A tuple of arguments passed to `f`.
|
||||
/// - `R`: The return type of the kernel.
|
||||
///
|
||||
/// Example usage (pseudocode):
|
||||
///
|
||||
/// ```rust,ignore (pseudocode)
|
||||
/// fn kernel(x: *mut [f64; 128]) {
|
||||
/// core::intrinsics::offload(kernel_1, (x,))
|
||||
/// }
|
||||
///
|
||||
/// #[cfg(target_os = "linux")]
|
||||
/// extern "C" {
|
||||
/// pub fn kernel_1(array_b: *mut [f64; 128]);
|
||||
/// }
|
||||
///
|
||||
/// #[cfg(not(target_os = "linux"))]
|
||||
/// #[rustc_offload_kernel]
|
||||
/// extern "gpu-kernel" fn kernel_1(x: *mut [f64; 128]) {
|
||||
/// unsafe { (*x)[0] = 21.0 };
|
||||
/// }
|
||||
/// ```
|
||||
///
|
||||
/// For reference, see the Clang documentation on offloading:
|
||||
/// <https://clang.llvm.org/docs/OffloadingDesign.html>.
|
||||
#[rustc_nounwind]
|
||||
#[rustc_intrinsic]
|
||||
pub const fn offload<F, T: crate::marker::Tuple, R>(f: F, args: T) -> R;
|
||||
|
||||
/// Inform Miri that a given pointer definitely has a certain alignment.
|
||||
#[cfg(miri)]
|
||||
#[rustc_allow_const_fn_unstable(const_eval_select)]
|
||||
|
|
|
|||
|
|
@ -5,6 +5,8 @@ We currently work on launching the following Rust kernel on the GPU. To follow a
|
|||
|
||||
```rust
|
||||
#![feature(abi_gpu_kernel)]
|
||||
#![feature(rustc_attrs)]
|
||||
#![feature(core_intrinsics)]
|
||||
#![no_std]
|
||||
|
||||
#[cfg(target_os = "linux")]
|
||||
|
|
@ -12,6 +14,7 @@ extern crate libc;
|
|||
#[cfg(target_os = "linux")]
|
||||
use libc::c_char;
|
||||
|
||||
#[cfg(target_os = "linux")]
|
||||
use core::mem;
|
||||
|
||||
#[panic_handler]
|
||||
|
|
@ -38,7 +41,7 @@ fn main() {
|
|||
}
|
||||
|
||||
unsafe {
|
||||
kernel_1(array_c);
|
||||
kernel(array_c);
|
||||
}
|
||||
core::hint::black_box(&array_c);
|
||||
unsafe {
|
||||
|
|
@ -52,6 +55,11 @@ fn main() {
|
|||
}
|
||||
}
|
||||
|
||||
#[inline(never)]
|
||||
unsafe fn kernel(x: *mut [f64; 256]) {
|
||||
core::intrinsics::offload(kernel_1, (x,))
|
||||
}
|
||||
|
||||
#[cfg(target_os = "linux")]
|
||||
unsafe extern "C" {
|
||||
pub fn kernel_1(array_b: *mut [f64; 256]);
|
||||
|
|
@ -60,6 +68,7 @@ unsafe extern "C" {
|
|||
#[cfg(not(target_os = "linux"))]
|
||||
#[unsafe(no_mangle)]
|
||||
#[inline(never)]
|
||||
#[rustc_offload_kernel]
|
||||
pub extern "gpu-kernel" fn kernel_1(x: *mut [f64; 256]) {
|
||||
unsafe { (*x)[0] = 21.0 };
|
||||
}
|
||||
|
|
|
|||
|
|
@ -11,6 +11,7 @@
|
|||
// when inside of a function called main. This, too, is a temporary workaround for not having a
|
||||
// frontend.
|
||||
|
||||
#![feature(core_intrinsics)]
|
||||
#![no_main]
|
||||
|
||||
#[unsafe(no_mangle)]
|
||||
|
|
@ -25,73 +26,70 @@ fn main() {
|
|||
// CHECK: %struct.__tgt_bin_desc = type { i32, ptr, ptr, ptr }
|
||||
// CHECK: %struct.__tgt_kernel_arguments = type { i32, i32, ptr, ptr, ptr, ptr, ptr, ptr, i64, i64, [3 x i32], [3 x i32], i32 }
|
||||
|
||||
// CHECK: @.offload_sizes.1 = private unnamed_addr constant [1 x i64] [i64 1024]
|
||||
// CHECK: @.offload_maptypes.1 = private unnamed_addr constant [1 x i64] [i64 35]
|
||||
// CHECK: @.kernel_1.region_id = weak unnamed_addr constant i8 0
|
||||
// CHECK: @.offloading.entry_name.1 = internal unnamed_addr constant [9 x i8] c"kernel_1\00", section ".llvm.rodata.offloading", align 1
|
||||
// CHECK: @.offloading.entry.kernel_1 = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 1, i32 0, ptr @.kernel_1.region_id, ptr @.offloading.entry_name.1, i64 0, i64 0, ptr null }, section "llvm_offload_entries", align 8
|
||||
// CHECK: @0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1
|
||||
// CHECK: @1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @0 }, align 8
|
||||
// CHECK: @.offload_sizes._kernel_1 = private unnamed_addr constant [1 x i64] [i64 1024]
|
||||
// CHECK: @.offload_maptypes._kernel_1 = private unnamed_addr constant [1 x i64] [i64 35]
|
||||
// CHECK: @._kernel_1.region_id = internal unnamed_addr constant i8 0
|
||||
// CHECK: @.offloading.entry_name._kernel_1 = internal unnamed_addr constant [10 x i8] c"_kernel_1\00", section ".llvm.rodata.offloading", align 1
|
||||
// CHECK: @.offloading.entry._kernel_1 = internal constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 1, i32 0, ptr @._kernel_1.region_id, ptr @.offloading.entry_name._kernel_1, i64 0, i64 0, ptr null }, section "llvm_offload_entries", align 8
|
||||
|
||||
// CHECK: @anon.{{.*}}.0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1
|
||||
// CHECK: @anon.{{.*}}.1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @anon.{{.*}}.0 }, align 8
|
||||
|
||||
// CHECK: Function Attrs:
|
||||
// CHECK-NEXT: define{{( dso_local)?}} void @main()
|
||||
// CHECK-NEXT: start:
|
||||
// CHECK-NEXT: %0 = alloca [8 x i8], align 8
|
||||
// CHECK-NEXT: %x = alloca [1024 x i8], align 16
|
||||
// CHECK: call void @kernel_1(ptr noalias noundef nonnull align 4 dereferenceable(1024) %x)
|
||||
// CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %0)
|
||||
// CHECK-NEXT: store ptr %x, ptr %0, align 8
|
||||
// CHECK-NEXT: call void asm sideeffect "", "r,~{memory}"(ptr nonnull %0) #4, !srcloc !4
|
||||
// CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %0)
|
||||
// CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 1024, ptr nonnull %x)
|
||||
// CHECK-NEXT: ret void
|
||||
// CHECK-NEXT: }
|
||||
|
||||
// CHECK: define{{( dso_local)?}} void @kernel_1(ptr noalias noundef align 4 dereferenceable(1024) %x)
|
||||
// CHECK-NEXT: start:
|
||||
// CHECK-NEXT: %EmptyDesc = alloca %struct.__tgt_bin_desc, align 8
|
||||
// CHECK-NEXT: %.offload_baseptrs = alloca [1 x ptr], align 8
|
||||
// CHECK-NEXT: %.offload_ptrs = alloca [1 x ptr], align 8
|
||||
// CHECK-NEXT: %.offload_sizes = alloca [1 x i64], align 8
|
||||
// CHECK-NEXT: %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8
|
||||
// CHECK: call void @llvm.memset.p0.i64(ptr align 8 %EmptyDesc, i8 0, i64 32, i1 false)
|
||||
// CHECK-NEXT: %1 = getelementptr inbounds float, ptr %x, i32 0
|
||||
// CHECK-NEXT: call void @__tgt_register_lib(ptr %EmptyDesc)
|
||||
// CHECK-NEXT: %dummy = load volatile ptr, ptr @.offload_sizes._kernel_1, align 8
|
||||
// CHECK-NEXT: %dummy1 = load volatile ptr, ptr @.offloading.entry._kernel_1, align 8
|
||||
// CHECK-NEXT: call void @llvm.memset.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) %EmptyDesc, i8 0, i64 32, i1 false)
|
||||
// CHECK-NEXT: call void @__tgt_register_lib(ptr nonnull %EmptyDesc)
|
||||
// CHECK-NEXT: call void @__tgt_init_all_rtls()
|
||||
// CHECK-NEXT: %2 = getelementptr inbounds [1 x ptr], ptr %.offload_baseptrs, i32 0, i32 0
|
||||
// CHECK-NEXT: store ptr %x, ptr %2, align 8
|
||||
// CHECK-NEXT: %3 = getelementptr inbounds [1 x ptr], ptr %.offload_ptrs, i32 0, i32 0
|
||||
// CHECK-NEXT: store ptr %1, ptr %3, align 8
|
||||
// CHECK-NEXT: %4 = getelementptr inbounds [1 x i64], ptr %.offload_sizes, i32 0, i32 0
|
||||
// CHECK-NEXT: store i64 1024, ptr %4, align 8
|
||||
// CHECK-NEXT: %5 = getelementptr inbounds [1 x ptr], ptr %.offload_baseptrs, i32 0, i32 0
|
||||
// CHECK-NEXT: %6 = getelementptr inbounds [1 x ptr], ptr %.offload_ptrs, i32 0, i32 0
|
||||
// CHECK-NEXT: %7 = getelementptr inbounds [1 x i64], ptr %.offload_sizes, i32 0, i32 0
|
||||
// CHECK-NEXT: call void @__tgt_target_data_begin_mapper(ptr @1, i64 -1, i32 1, ptr %5, ptr %6, ptr %7, ptr @.offload_maptypes.1, ptr null, ptr null)
|
||||
// CHECK-NEXT: %8 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 0
|
||||
// CHECK-NEXT: store i32 3, ptr %8, align 4
|
||||
// CHECK-NEXT: %9 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 1
|
||||
// CHECK-NEXT: store i32 1, ptr %9, align 4
|
||||
// CHECK-NEXT: %10 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 2
|
||||
// CHECK-NEXT: store ptr %5, ptr %10, align 8
|
||||
// CHECK-NEXT: %11 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 3
|
||||
// CHECK-NEXT: store ptr %6, ptr %11, align 8
|
||||
// CHECK-NEXT: %12 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 4
|
||||
// CHECK-NEXT: store ptr %7, ptr %12, align 8
|
||||
// CHECK-NEXT: %13 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 5
|
||||
// CHECK-NEXT: store ptr @.offload_maptypes.1, ptr %13, align 8
|
||||
// CHECK-NEXT: %14 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 6
|
||||
// CHECK-NEXT: store ptr null, ptr %14, align 8
|
||||
// CHECK-NEXT: %15 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 7
|
||||
// CHECK-NEXT: store ptr null, ptr %15, align 8
|
||||
// CHECK-NEXT: %16 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 8
|
||||
// CHECK-NEXT: store i64 0, ptr %16, align 8
|
||||
// CHECK-NEXT: %17 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 9
|
||||
// CHECK-NEXT: store i64 0, ptr %17, align 8
|
||||
// CHECK-NEXT: %18 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 10
|
||||
// CHECK-NEXT: store [3 x i32] [i32 2097152, i32 0, i32 0], ptr %18, align 4
|
||||
// CHECK-NEXT: %19 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 11
|
||||
// CHECK-NEXT: store [3 x i32] [i32 256, i32 0, i32 0], ptr %19, align 4
|
||||
// CHECK-NEXT: %20 = getelementptr inbounds %struct.__tgt_kernel_arguments, ptr %kernel_args, i32 0, i32 12
|
||||
// CHECK-NEXT: store i32 0, ptr %20, align 4
|
||||
// CHECK-NEXT: %21 = call i32 @__tgt_target_kernel(ptr @1, i64 -1, i32 2097152, i32 256, ptr @.kernel_1.region_id, ptr %kernel_args)
|
||||
// CHECK-NEXT: %22 = getelementptr inbounds [1 x ptr], ptr %.offload_baseptrs, i32 0, i32 0
|
||||
// CHECK-NEXT: %23 = getelementptr inbounds [1 x ptr], ptr %.offload_ptrs, i32 0, i32 0
|
||||
// CHECK-NEXT: %24 = getelementptr inbounds [1 x i64], ptr %.offload_sizes, i32 0, i32 0
|
||||
// CHECK-NEXT: call void @__tgt_target_data_end_mapper(ptr @1, i64 -1, i32 1, ptr %22, ptr %23, ptr %24, ptr @.offload_maptypes.1, ptr null, ptr null)
|
||||
// CHECK-NEXT: call void @__tgt_unregister_lib(ptr %EmptyDesc)
|
||||
// CHECK: store ptr %x, ptr %0, align 8
|
||||
// CHECK-NEXT: call void asm sideeffect "", "r,~{memory}"(ptr nonnull %0)
|
||||
// CHECK: ret void
|
||||
// CHECK-NEXT: store ptr %x, ptr %.offload_baseptrs, align 8
|
||||
// CHECK-NEXT: store ptr %x, ptr %.offload_ptrs, align 8
|
||||
// CHECK-NEXT: store i64 1024, ptr %.offload_sizes, align 8
|
||||
// CHECK-NEXT: call void @__tgt_target_data_begin_mapper(ptr nonnull @anon.{{.*}}.1, i64 -1, i32 1, ptr nonnull %.offload_baseptrs, ptr nonnull %.offload_ptrs, ptr nonnull %.offload_sizes, ptr nonnull @.offload_maptypes._kernel_1, ptr null, ptr null)
|
||||
// CHECK-NEXT: store i32 3, ptr %kernel_args, align 8
|
||||
// CHECK-NEXT: %0 = getelementptr inbounds nuw i8, ptr %kernel_args, i64 4
|
||||
// CHECK-NEXT: store i32 1, ptr %0, align 4
|
||||
// CHECK-NEXT: %1 = getelementptr inbounds nuw i8, ptr %kernel_args, i64 8
|
||||
// CHECK-NEXT: store ptr %.offload_baseptrs, ptr %1, align 8
|
||||
// CHECK-NEXT: %2 = getelementptr inbounds nuw i8, ptr %kernel_args, i64 16
|
||||
// CHECK-NEXT: store ptr %.offload_ptrs, ptr %2, align 8
|
||||
// CHECK-NEXT: %3 = getelementptr inbounds nuw i8, ptr %kernel_args, i64 24
|
||||
// CHECK-NEXT: store ptr %.offload_sizes, ptr %3, align 8
|
||||
// CHECK-NEXT: %4 = getelementptr inbounds nuw i8, ptr %kernel_args, i64 32
|
||||
// CHECK-NEXT: store ptr @.offload_maptypes._kernel_1, ptr %4, align 8
|
||||
// CHECK-NEXT: %5 = getelementptr inbounds nuw i8, ptr %kernel_args, i64 40
|
||||
// CHECK-NEXT: %6 = getelementptr inbounds nuw i8, ptr %kernel_args, i64 72
|
||||
// CHECK-NEXT: call void @llvm.memset.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) %5, i8 0, i64 32, i1 false)
|
||||
// CHECK-NEXT: store <4 x i32> <i32 2097152, i32 0, i32 0, i32 256>, ptr %6, align 8
|
||||
// CHECK-NEXT: %.fca.1.gep3 = getelementptr inbounds nuw i8, ptr %kernel_args, i64 88
|
||||
// CHECK-NEXT: store i32 0, ptr %.fca.1.gep3, align 8
|
||||
// CHECK-NEXT: %.fca.2.gep4 = getelementptr inbounds nuw i8, ptr %kernel_args, i64 92
|
||||
// CHECK-NEXT: store i32 0, ptr %.fca.2.gep4, align 4
|
||||
// CHECK-NEXT: %7 = getelementptr inbounds nuw i8, ptr %kernel_args, i64 96
|
||||
// CHECK-NEXT: store i32 0, ptr %7, align 8
|
||||
// CHECK-NEXT: %8 = call i32 @__tgt_target_kernel(ptr nonnull @anon.{{.*}}.1, i64 -1, i32 2097152, i32 256, ptr nonnull @._kernel_1.region_id, ptr nonnull %kernel_args)
|
||||
// CHECK-NEXT: call void @__tgt_target_data_end_mapper(ptr nonnull @anon.{{.*}}.1, i64 -1, i32 1, ptr nonnull %.offload_baseptrs, ptr nonnull %.offload_ptrs, ptr nonnull %.offload_sizes, ptr nonnull @.offload_maptypes._kernel_1, ptr null, ptr null)
|
||||
// CHECK-NEXT: call void @__tgt_unregister_lib(ptr nonnull %EmptyDesc)
|
||||
// CHECK-NEXT: ret void
|
||||
// CHECK-NEXT: }
|
||||
|
||||
// CHECK: Function Attrs: nounwind
|
||||
|
|
@ -100,6 +98,12 @@ fn main() {
|
|||
#[unsafe(no_mangle)]
|
||||
#[inline(never)]
|
||||
pub fn kernel_1(x: &mut [f32; 256]) {
|
||||
core::intrinsics::offload(_kernel_1, (x,))
|
||||
}
|
||||
|
||||
#[unsafe(no_mangle)]
|
||||
#[inline(never)]
|
||||
pub fn _kernel_1(x: &mut [f32; 256]) {
|
||||
for i in 0..256 {
|
||||
x[i] = 21.0;
|
||||
}
|
||||
|
|
|
|||
6
tests/ui/offload/check_config.fail.stderr
Normal file
6
tests/ui/offload/check_config.fail.stderr
Normal file
|
|
@ -0,0 +1,6 @@
|
|||
error: using the offload feature requires -Z offload=Enable
|
||||
|
||||
error: using the offload feature requires -C lto=fat
|
||||
|
||||
error: aborting due to 2 previous errors
|
||||
|
||||
23
tests/ui/offload/check_config.rs
Normal file
23
tests/ui/offload/check_config.rs
Normal file
|
|
@ -0,0 +1,23 @@
|
|||
//@ revisions: pass fail
|
||||
//@ no-prefer-dynamic
|
||||
//@ needs-enzyme
|
||||
//@[pass] build-pass
|
||||
//@[fail] build-fail
|
||||
//@[pass] compile-flags: -Zunstable-options -Zoffload=Enable -Clto=fat --emit=metadata
|
||||
//@[fail] compile-flags: -Clto=thin
|
||||
|
||||
//[fail]~? ERROR: using the offload feature requires -Z offload=Enable
|
||||
//[fail]~? ERROR: using the offload feature requires -C lto=fat
|
||||
|
||||
#![feature(core_intrinsics)]
|
||||
|
||||
fn main() {
    let mut x = [3.0; 256];
    kernel_1(&mut x);
}

// Host-side wrapper: lowering the `offload` intrinsic is what triggers the
// `-Z offload=Enable` / `-C lto=fat` configuration checks under test.
fn kernel_1(x: &mut [f32; 256]) {
    core::intrinsics::offload(_kernel_1, (x,))
}

// Device-kernel stand-in; intentionally empty — only the call site matters
// for this configuration-check test.
fn _kernel_1(x: &mut [f32; 256]) {}
|
||||
Loading…
Add table
Add a link
Reference in a new issue