core-arch: Add NEON fp16 intrinsics

This commit is contained in:
Kajetan Puchalski 2025-01-30 15:52:26 +00:00 committed by Amanieu d'Antras
parent 7cdc9157e6
commit f4a31fd609
7 changed files with 16001 additions and 89 deletions

File diff suppressed because it is too large Load diff

View file

@ -4783,6 +4783,24 @@ pub unsafe fn vbsl_u64(a: uint64x1_t, b: uint64x1_t, c: uint64x1_t) -> uint64x1_
simd_or(simd_and(a, b), simd_and(simd_xor(a, transmute(not)), c))
}
/// Bitwise Select.
///
/// Builds the result bit by bit: wherever a bit of the mask `a` is set the
/// corresponding bit of `b` is taken, and wherever it is clear the
/// corresponding bit of `c` is taken. The `f16` lanes of `b` and `c` are
/// reinterpreted as raw bits for the selection (no numeric conversion), and
/// the combined bits are reinterpreted back into `f16` lanes.
///
/// # Safety
///
/// The target features enabled by the attributes on this function must be
/// available on the CPU executing it.
#[inline]
#[target_feature(enable = "neon,fp16")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbsl))]
#[cfg_attr(
    all(test, any(target_arch = "aarch64", target_arch = "arm64ec")),
    assert_instr(bsl)
)]
#[unstable(feature = "stdarch_neon_f16", issue = "136306")]
pub unsafe fn vbsl_f16(a: uint16x4_t, b: float16x4_t, c: float16x4_t) -> float16x4_t {
    // All-ones pattern; XOR-ing the mask with it yields the inverted mask.
    let all_ones: uint16x4_t = transmute(int16x4_t::splat(-1));
    let inverted_mask = simd_xor(a, all_ones);
    // Bits kept from `b` (mask set) and from `c` (mask clear).
    let picked_from_b = simd_and(a, transmute(b));
    let picked_from_c = simd_and(inverted_mask, transmute(c));
    transmute(simd_or(picked_from_b, picked_from_c))
}
/// Bitwise Select.
#[inline]
#[target_feature(enable = "neon")]
@ -5096,6 +5114,24 @@ pub unsafe fn vbslq_p16(a: uint16x8_t, b: poly16x8_t, c: poly16x8_t) -> poly16x8
))
}
/// Bitwise Select.
///
/// Builds the result bit by bit: wherever a bit of the mask `a` is set the
/// corresponding bit of `b` is taken, and wherever it is clear the
/// corresponding bit of `c` is taken. The `f16` lanes of `b` and `c` are
/// reinterpreted as raw bits for the selection (no numeric conversion), and
/// the combined bits are reinterpreted back into `f16` lanes.
///
/// # Safety
///
/// The target features enabled by the attributes on this function must be
/// available on the CPU executing it.
#[inline]
#[target_feature(enable = "neon,fp16")]
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vbsl))]
#[cfg_attr(
    all(test, any(target_arch = "aarch64", target_arch = "arm64ec")),
    assert_instr(bsl)
)]
#[unstable(feature = "stdarch_neon_f16", issue = "136306")]
pub unsafe fn vbslq_f16(a: uint16x8_t, b: float16x8_t, c: float16x8_t) -> float16x8_t {
    // All-ones pattern; XOR-ing the mask with it yields the inverted mask.
    let all_ones: uint16x8_t = transmute(int16x8_t::splat(-1));
    let inverted_mask = simd_xor(a, all_ones);
    // Bits kept from `b` (mask set) and from `c` (mask clear).
    let picked_from_b = simd_and(a, transmute(b));
    let picked_from_c = simd_and(inverted_mask, transmute(c));
    transmute(simd_or(picked_from_b, picked_from_c))
}
/// Bitwise Select. (128-bit)
#[inline]
#[target_feature(enable = "neon")]

View file

@ -108,6 +108,74 @@ vsri_n_p64
vsriq_n_p64
vtst_p64
vtstq_p64
vaddh_f16
vsubh_f16
vabsh_f16
vdivh_f16
vmulh_f16
vfmsh_f16
vfmah_f16
vminnmh_f16
vmaxnmh_f16
vrndh_f16
vrndnh_f16
vrndih_f16
vrndah_f16
vrndph_f16
vrndmh_f16
vrndxh_f16
vsqrth_f16
vnegh_f16
vcvth_f16_s32
vcvth_s32_f16
vcvth_n_f16_s32
vcvth_n_s32_f16
vcvth_f16_u32
vcvth_u32_f16
vcvth_n_f16_u32
vcvth_n_u32_f16
vcvtah_s32_f16
vcvtah_u32_f16
vcvtmh_s32_f16
vcvtmh_u32_f16
vcvtpq_s16_f16
vcvtpq_u16_f16
vcvtp_s16_f16
vcvtp_u16_f16
vcvtph_s32_f16
vcvtph_u32_f16
vcvtnh_u32_f16
vcvtnh_s32_f16
vfmlsl_low_f16
vfmlslq_low_f16
vfmlsl_high_f16
vfmlslq_high_f16
vfmlsl_lane_high_f16
vfmlsl_laneq_high_f16
vfmlslq_lane_high_f16
vfmlslq_laneq_high_f16
vfmlsl_lane_low_f16
vfmlsl_laneq_low_f16
vfmlslq_lane_low_f16
vfmlslq_laneq_low_f16
vfmlal_low_f16
vfmlalq_low_f16
vfmlal_high_f16
vfmlalq_high_f16
vfmlal_lane_low_f16
vfmlal_laneq_low_f16
vfmlalq_lane_low_f16
vfmlalq_laneq_low_f16
vfmlal_lane_high_f16
vfmlal_laneq_high_f16
vfmlalq_lane_high_f16
vfmlalq_laneq_high_f16
vreinterpret_f16_p64
vreinterpretq_f16_p64
vreinterpret_p64_f16
vreinterpretq_p64_f16
vreinterpret_p128_f16
vreinterpretq_p128_f16
# Present in Clang header but triggers an ICE due to lack of backend support.
vcmla_f32
@ -134,6 +202,31 @@ vcmlaq_rot270_laneq_f32
vcmlaq_rot90_f32
vcmlaq_rot90_lane_f32
vcmlaq_rot90_laneq_f32
vcmla_f16
vcmlaq_f16
vcmla_laneq_f16
vcmla_lane_f16
vcmla_laneq_f16
vcmlaq_lane_f16
vcmlaq_laneq_f16
vcmla_rot90_f16
vcmlaq_rot90_f16
vcmla_rot180_f16
vcmlaq_rot180_f16
vcmla_rot270_f16
vcmlaq_rot270_f16
vcmla_rot90_lane_f16
vcmla_rot90_laneq_f16
vcmlaq_rot90_lane_f16
vcmlaq_rot90_laneq_f16
vcmla_rot180_lane_f16
vcmla_rot180_laneq_f16
vcmlaq_rot180_lane_f16
vcmlaq_rot180_laneq_f16
vcmla_rot270_lane_f16
vcmla_rot270_laneq_f16
vcmlaq_rot270_lane_f16
vcmlaq_rot270_laneq_f16
# Implemented in stdarch for A64 only, Clang supports both A32/A64
vadd_s64
@ -182,4 +275,46 @@ vrndpq_f32
vrndq_f32
vrndq_f32
vrndx_f32
vrndxq_f32
vrndxq_f32
vrnda_f16
vrnda_f16
vrndaq_f16
vrndaq_f16
vrnd_f16
vrnd_f16
vrndi_f16
vrndi_f16
vrndiq_f16
vrndiq_f16
vrndm_f16
vrndm_f16
vrndmq_f16
vrndmq_f16
vrndns_f16
vrndp_f16
vrndpq_f16
vrndq_f16
vrndx_f16
vrndxq_f16
vpmin_f16
vpmax_f16
vcaddq_rot270_f16
vcaddq_rot90_f16
vcadd_rot270_f16
vcadd_rot90_f16
vcvtm_s16_f16
vcvtmq_s16_f16
vcvtm_u16_f16
vcvtmq_u16_f16
vcvtaq_s16_f16
vcvtaq_u16_f16
vcvtnq_s16_f16
vcvtnq_u16_f16
vcvtn_s16_f16
vcvtn_u16_f16
vcvtaq_s16_f16
vcvtaq_u16_f16
vcvta_s16_f16
vcvta_u16_f16
vceqz_f16
vceqzq_f16

View file

@ -194,7 +194,7 @@ fn generate_rust_program(notices: &str, intrinsic: &Intrinsic, target: &str) ->
#![cfg_attr(any(target_arch = "aarch64", target_arch = "arm64ec"), feature(stdarch_neon_sha3))]
#![cfg_attr(any(target_arch = "aarch64", target_arch = "arm64ec"), feature(stdarch_neon_sm4))]
#![cfg_attr(any(target_arch = "aarch64", target_arch = "arm64ec"), feature(stdarch_neon_ftts))]
#![cfg_attr(any(target_arch = "aarch64", target_arch = "arm64ec"), feature(stdarch_neon_f16))]
#![feature(stdarch_neon_f16)]
#![allow(non_upper_case_globals)]
use core_arch::arch::{target_arch}::*;