From 42ec76a3ff31c3ab5b0d2156359d65e39fbc4537 Mon Sep 17 00:00:00 2001 From: gnzlbg Date: Fri, 22 Dec 2017 17:14:41 +0100 Subject: [PATCH] [sse4a] implement non-immediate-mode intrinsics (#249) --- library/stdarch/coresimd/src/runtime/x86.rs | 9 +- library/stdarch/coresimd/src/x86/i586/tbm.rs | 1 + library/stdarch/coresimd/src/x86/i686/mod.rs | 5 + .../stdarch/coresimd/src/x86/i686/sse4a.rs | 155 ++++++++++++++++++ 4 files changed, 165 insertions(+), 5 deletions(-) create mode 100644 library/stdarch/coresimd/src/x86/i686/sse4a.rs diff --git a/library/stdarch/coresimd/src/runtime/x86.rs b/library/stdarch/coresimd/src/runtime/x86.rs index 73e6f4f68cc9..b952598f4971 100644 --- a/library/stdarch/coresimd/src/runtime/x86.rs +++ b/library/stdarch/coresimd/src/runtime/x86.rs @@ -302,8 +302,7 @@ pub fn detect_features() -> usize { // Contains information about bmi,bmi2, and avx2 support. let (extended_features_ebx, extended_features_ecx) = if max_basic_leaf >= 7 { - let CpuidResult { ebx, ecx, .. } = - unsafe { __cpuid(0x0000_0007_u32) }; + let CpuidResult { ebx, ecx, .. } = unsafe { __cpuid(0x0000_0007_u32) }; (ebx, ecx) } else { (0, 0) // CPUID does not support "Extended Features" @@ -320,8 +319,7 @@ pub fn detect_features() -> usize { // EAX = 0x8000_0001, ECX=0: Queries "Extended Processor Info and Feature // Bits" let extended_proc_info_ecx = if extended_max_basic_leaf >= 1 { - let CpuidResult { ecx, .. } = - unsafe { __cpuid(0x8000_0001_u32) }; + let CpuidResult { ecx, .. 
} = unsafe { __cpuid(0x8000_0001_u32) }; ecx } else { 0 @@ -457,6 +455,7 @@ mod tests { println!("ssse3: {:?}", cfg_feature_enabled!("ssse3")); println!("sse4.1: {:?}", cfg_feature_enabled!("sse4.1")); println!("sse4.2: {:?}", cfg_feature_enabled!("sse4.2")); + println!("sse4a: {:?}", cfg_feature_enabled!("sse4a")); println!("avx: {:?}", cfg_feature_enabled!("avx")); println!("avx2: {:?}", cfg_feature_enabled!("avx2")); println!("avx512f {:?}", cfg_feature_enabled!("avx512f")); @@ -495,6 +494,7 @@ mod tests { assert_eq!(cfg_feature_enabled!("ssse3"), information.ssse3()); assert_eq!(cfg_feature_enabled!("sse4.1"), information.sse4_1()); assert_eq!(cfg_feature_enabled!("sse4.2"), information.sse4_2()); + assert_eq!(cfg_feature_enabled!("sse4a"), information.sse4a()); assert_eq!(cfg_feature_enabled!("avx"), information.avx()); assert_eq!(cfg_feature_enabled!("avx2"), information.avx2()); assert_eq!(cfg_feature_enabled!("avx512f"), information.avx512f()); @@ -520,7 +520,6 @@ mod tests { assert_eq!(cfg_feature_enabled!("bmi"), information.bmi1()); assert_eq!(cfg_feature_enabled!("bmi2"), information.bmi2()); assert_eq!(cfg_feature_enabled!("popcnt"), information.popcnt()); - assert_eq!(cfg_feature_enabled!("sse4a"), information.sse4a()); assert_eq!(cfg_feature_enabled!("abm"), information.lzcnt()); assert_eq!(cfg_feature_enabled!("tbm"), information.tbm()); assert_eq!(cfg_feature_enabled!("lzcnt"), information.lzcnt()); diff --git a/library/stdarch/coresimd/src/x86/i586/tbm.rs b/library/stdarch/coresimd/src/x86/i586/tbm.rs index f42a382abff8..38c044c54377 100644 --- a/library/stdarch/coresimd/src/x86/i586/tbm.rs +++ b/library/stdarch/coresimd/src/x86/i586/tbm.rs @@ -13,6 +13,7 @@ #[cfg(test)] use stdsimd_test::assert_instr; +// FIXME(blocked on #248) // TODO: LLVM-CODEGEN ERROR: LLVM ERROR: Cannot select: // intrinsic %llvm.x86.tbm.bextri.u32 /* diff --git a/library/stdarch/coresimd/src/x86/i686/mod.rs b/library/stdarch/coresimd/src/x86/i686/mod.rs index 
c7d117116de9..1c4430d07862 100644 --- a/library/stdarch/coresimd/src/x86/i686/mod.rs +++ b/library/stdarch/coresimd/src/x86/i686/mod.rs @@ -17,3 +17,8 @@ pub use self::sse41::*; mod sse42; pub use self::sse42::*; + +#[cfg(not(feature = "intel_sde"))] +mod sse4a; +#[cfg(not(feature = "intel_sde"))] +pub use self::sse4a::*; diff --git a/library/stdarch/coresimd/src/x86/i686/sse4a.rs b/library/stdarch/coresimd/src/x86/i686/sse4a.rs new file mode 100644 index 000000000000..f021565f58b2 --- /dev/null +++ b/library/stdarch/coresimd/src/x86/i686/sse4a.rs @@ -0,0 +1,155 @@ +//! `i686`'s Streaming SIMD Extensions 4a (SSE4a) + +use core::mem; +use v128::*; + +#[cfg(test)] +use stdsimd_test::assert_instr; + +#[allow(improper_ctypes)] +extern "C" { + #[link_name = "llvm.x86.sse4a.extrq"] + fn extrq(x: i64x2, y: i8x16) -> i64x2; + #[link_name = "llvm.x86.sse4a.insertq"] + fn insertq(x: i64x2, y: i64x2) -> i64x2; + #[link_name = "llvm.x86.sse4a.movnt.sd"] + fn movntsd(x: *mut f64, y: f64x2); + #[link_name = "llvm.x86.sse4a.movnt.ss"] + fn movntss(x: *mut f32, y: f32x4); +} + +// FIXME(blocked on #248): _mm_extracti_si64(x, len, idx) // EXTRQ +// FIXME(blocked on #248): _mm_inserti_si64(x, y, len, idx) // INSERTQ + +/// Extracts the bit range specified by `y` from the lower 64 bits of `x`. +/// +/// The [13:8] bits of `y` specify the index of the bit-range to extract. The +/// [5:0] bits of `y` specify the length of the bit-range to extract. All other +/// bits are ignored. +/// +/// If the length is zero, it is interpreted as `64`. If the length and index +/// are zero, the lower 64 bits of `x` are extracted. +/// +/// If `length == 0 && index > 0` or `length + index > 64` the result is +/// undefined. +#[inline(always)] +#[target_feature = "+sse4a"] +#[cfg_attr(test, assert_instr(extrq))] +pub unsafe fn _mm_extract_si64(x: i64x2, y: i64x2) -> i64x2 { + extrq(x, mem::transmute(y)) +} + +/// Inserts the `[length:0]` bits of `y` into `x` at `index`. 
+/// +/// The bits of `y`: +/// +/// - `[69:64]` specify the `length`, +/// - `[77:72]` specify the index. +/// +/// If the `length` is zero it is interpreted as `64`. If `index + length > 64` +/// or `index > 0 && length == 0` the result is undefined. +#[inline(always)] +#[target_feature = "+sse4a"] +#[cfg_attr(test, assert_instr(insertq))] +pub unsafe fn _mm_insert_si64(x: i64x2, y: i64x2) -> i64x2 { + insertq(x, mem::transmute(y)) +} + +/// Non-temporal store of `a.0` (the lowest lane of `a`) into `p`. +#[inline(always)] +#[target_feature = "+sse4a"] +#[cfg_attr(test, assert_instr(movntsd))] +pub unsafe fn _mm_stream_sd(p: *mut f64, a: f64x2) { + movntsd(p, a); +} + +/// Non-temporal store of `a.0` (the lowest lane of `a`) into `p`. +#[inline(always)] +#[target_feature = "+sse4a"] +#[cfg_attr(test, assert_instr(movntss))] +pub unsafe fn _mm_stream_ss(p: *mut f32, a: f32x4) { + movntss(p, a); +} + +#[cfg(test)] +mod tests { + use stdsimd_test::simd_test; + use x86::i686::sse4a; + use v128::*; + + #[simd_test = "sse4a"] + unsafe fn _mm_extract_si64() { + let b = 0b0110_0000_0000_i64; + // ^^^^ bit range extracted + let x = i64x2::new(b, 0); + let v = 0b001000___00___000100_i64; + // ^idx: 2^3 = 8 ^length = 2^2 = 4 + let y = i64x2::new(v, 0); + let e = i64x2::new(0b0110_i64, 0); + let r = sse4a::_mm_extract_si64(x, y); + assert_eq!(r, e); + } + + #[simd_test = "sse4a"] + unsafe fn _mm_insert_si64() { + let i = 0b0110_i64; + // ^^^^ bit range inserted + let z = 0b1010_1010_1010i64; + // ^^^^ bit range replaced + let e = 0b0110_1010_1010i64; + // ^^^^ replaced 1010 with 0110 + let x = i64x2::new(z, 0); + let expected = i64x2::new(e, 0); + let v = 0b001000___00___000100_i64; + // ^idx: 2^3 = 8 ^length = 2^2 = 4 + let y = i64x2::new(i, v); + let r = sse4a::_mm_insert_si64(x, y); + assert_eq!(r, expected); + } + + #[repr(align(16))] + struct MemoryF64 { + data: [f64; 2], + } + + #[simd_test = "sse4a"] + unsafe fn _mm_stream_sd() { + let mut mem = MemoryF64 { + data: [1.0_f64, 2.0], + }; + { + let vals = &mut mem.data; 
+ let d = vals.as_mut_ptr(); + + let x = f64x2::new(3.0, 4.0); + + sse4a::_mm_stream_sd(d, x); + } + assert_eq!(mem.data[0], 3.0); + assert_eq!(mem.data[1], 2.0); + } + + #[repr(align(16))] + struct MemoryF32 { + data: [f32; 4], + } + + #[simd_test = "sse4a"] + unsafe fn _mm_stream_ss() { + let mut mem = MemoryF32 { + data: [1.0_f32, 2.0, 3.0, 4.0], + }; + { + let vals = &mut mem.data; + let d = vals.as_mut_ptr(); + + let x = f32x4::new(5.0, 6.0, 7.0, 8.0); + + sse4a::_mm_stream_ss(d, x); + } + assert_eq!(mem.data[0], 5.0); + assert_eq!(mem.data[1], 2.0); + assert_eq!(mem.data[2], 3.0); + assert_eq!(mem.data[3], 4.0); + } +}