Implement arm64 vaddlvq_u8 and vld1q_u8_x4 vendor intrinsics

This is required for using the bytecount crate on arm64.
This commit is contained in:
bjorn3 2025-02-07 11:01:59 +00:00
parent ed91b73179
commit b004312ee4

View file

@@ -17,6 +17,14 @@ pub(crate) fn codegen_aarch64_llvm_intrinsic_call<'tcx>(
fx.bcx.ins().fence();
}
"llvm.aarch64.neon.ld1x4.v16i8.p0i8" => {
    // LLVM intrinsic behind the `vld1q_u8_x4` vendor intrinsic: read the
    // whole return value from the memory the single pointer operand refers to.
    intrinsic_args!(fx, args => (ptr); intrinsic);

    // Turn the pointer operand into a plain scalar address.
    let addr = ptr.load_scalar(fx);

    // View that address as a place with the return value's layout, so the
    // load below covers exactly what `ret` expects.
    let src_place = CPlace::for_ptr(Pointer::new(addr), ret.layout());
    let loaded = src_place.to_cvalue(fx);

    ret.write_cvalue(fx, loaded);
}
_ if intrinsic.starts_with("llvm.aarch64.neon.abs.v") => {
intrinsic_args!(fx, args => (a); intrinsic);
@@ -115,6 +123,22 @@ pub(crate) fn codegen_aarch64_llvm_intrinsic_call<'tcx>(
);
}
"llvm.aarch64.neon.uaddlv.i32.v16i8" => {
    // LLVM intrinsic behind the `vaddlvq_u8` vendor intrinsic: unsigned sum
    // across all 16 lanes of the vector operand, returned as a 32-bit value.
    intrinsic_args!(fx, args => (v); intrinsic);

    // Accumulate in 16 bits. Assuming 8-bit lanes (per the `v16i8` suffix),
    // the total is at most 16 * 255 = 4080, so the accumulator cannot wrap.
    let mut sum = fx.bcx.ins().iconst(types::I16, 0);
    for idx in 0..16 {
        let narrow = v.value_lane(fx, idx).load_scalar(fx);
        // Zero-extend each lane before adding so the addition is unsigned.
        let widened = fx.bcx.ins().uextend(types::I16, narrow);
        sum = fx.bcx.ins().iadd(sum, widened);
    }

    // Widen the 16-bit total to the 32-bit result and hand it back as a `u32`.
    let sum32 = fx.bcx.ins().uextend(types::I32, sum);
    let u32_layout = fx.layout_of(fx.tcx.types.u32);
    let total = CValue::by_val(sum32, u32_layout);
    ret.write_cvalue(fx, total);
}
_ if intrinsic.starts_with("llvm.aarch64.neon.faddv.f32.v") => {
intrinsic_args!(fx, args => (v); intrinsic);