Migrate a bunch of i586::sse2 to native types (#273)

This commit is contained in:
Alex Crichton 2018-01-10 12:42:26 -06:00 committed by GitHub
parent baf9d0e7e0
commit 6d8d2f81e9
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
9 changed files with 1574 additions and 1532 deletions

View file

@ -843,7 +843,7 @@ pub unsafe fn _mm256_extractf128_ps(a: f32x8, imm8: i32) -> __m128 {
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vextractf128))]
pub unsafe fn _mm256_extractf128_pd(a: f64x4, imm8: i32) -> f64x2 {
pub unsafe fn _mm256_extractf128_pd(a: f64x4, imm8: i32) -> __m128d {
match imm8 & 1 {
0 => simd_shuffle2(a, _mm256_undefined_pd(), [0, 1]),
_ => simd_shuffle2(a, _mm256_undefined_pd(), [2, 3]),
@ -1068,9 +1068,7 @@ pub unsafe fn _mm256_permute_pd(a: f64x4, imm8: i32) -> f64x4 {
#[inline(always)]
#[target_feature = "+avx,+sse2"]
#[cfg_attr(test, assert_instr(vpermilpd, imm8 = 0x1))]
pub unsafe fn _mm_permute_pd(a: f64x2, imm8: i32) -> f64x2 {
use x86::i586::sse2::_mm_undefined_pd;
pub unsafe fn _mm_permute_pd(a: __m128d, imm8: i32) -> __m128d {
let imm8 = (imm8 & 0xFF) as u8;
macro_rules! shuffle2 {
($a:expr, $b:expr) => {
@ -1194,7 +1192,7 @@ pub unsafe fn _mm256_insertf128_ps(a: f32x8, b: __m128, imm8: i32) -> f32x8 {
#[inline(always)]
#[target_feature = "+avx"]
#[cfg_attr(test, assert_instr(vinsertf128, imm8 = 1))]
pub unsafe fn _mm256_insertf128_pd(a: f64x4, b: f64x2, imm8: i32) -> f64x4 {
pub unsafe fn _mm256_insertf128_pd(a: f64x4, b: __m128d, imm8: i32) -> f64x4 {
match imm8 & 1 {
0 => simd_shuffle4(a, _mm256_castpd128_pd256(b), [4, 5, 2, 3]),
_ => simd_shuffle4(a, _mm256_castpd128_pd256(b), [0, 1, 4, 5]),
@ -2139,7 +2137,7 @@ pub unsafe fn _mm256_castps256_ps128(a: f32x8) -> __m128 {
#[target_feature = "+avx"]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
pub unsafe fn _mm256_castpd256_pd128(a: f64x4) -> f64x2 {
pub unsafe fn _mm256_castpd256_pd128(a: f64x4) -> __m128d {
simd_shuffle2(a, a, [0, 1])
}
@ -2171,7 +2169,7 @@ pub unsafe fn _mm256_castps128_ps256(a: __m128) -> f32x8 {
#[target_feature = "+avx"]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
pub unsafe fn _mm256_castpd128_pd256(a: f64x2) -> f64x4 {
pub unsafe fn _mm256_castpd128_pd256(a: __m128d) -> f64x4 {
// FIXME simd_shuffle4(a, a, [0, 1, -1, -1])
simd_shuffle4(a, a, [0, 1, 0, 0])
}
@ -2221,8 +2219,7 @@ pub unsafe fn _mm256_zextsi128_si256(a: __m128i) -> __m256i {
#[target_feature = "+avx,+sse2"]
// This intrinsic is only used for compilation and does not generate any
// instructions, thus it has zero latency.
pub unsafe fn _mm256_zextpd128_pd256(a: f64x2) -> f64x4 {
use x86::i586::sse2::_mm_setzero_pd;
pub unsafe fn _mm256_zextpd128_pd256(a: __m128d) -> f64x4 {
simd_shuffle4(a, _mm_setzero_pd(), [0, 1, 2, 3])
}
@ -2326,7 +2323,6 @@ pub unsafe fn _mm256_loadu2_m128(
pub unsafe fn _mm256_loadu2_m128d(
hiaddr: *const f64, loaddr: *const f64
) -> f64x4 {
use x86::i586::sse2::_mm_loadu_pd;
let a = _mm256_castpd128_pd256(_mm_loadu_pd(loaddr));
_mm256_insertf128_pd(a, _mm_loadu_pd(hiaddr), 1)
}
@ -2371,7 +2367,6 @@ pub unsafe fn _mm256_storeu2_m128(
pub unsafe fn _mm256_storeu2_m128d(
hiaddr: *mut f64, loaddr: *mut f64, a: f64x4
) {
use x86::i586::sse2::_mm_storeu_pd;
let lo = _mm256_castpd256_pd128(a);
_mm_storeu_pd(loaddr, lo);
let hi = _mm256_extractf128_pd(a, 1);
@ -3104,9 +3099,9 @@ mod tests {
#[simd_test = "avx"]
unsafe fn test_mm256_extractf128_pd() {
let a = f64x4::new(4., 3., 2., 5.);
let r = avx::_mm256_extractf128_pd(a, 0);
let e = f64x2::new(4., 3.);
assert_eq!(r, e);
let r = _mm256_extractf128_pd(a, 0);
let e = _mm_setr_pd(4., 3.);
assert_eq_m128d(r, e);
}
#[simd_test = "avx"]
@ -3189,10 +3184,10 @@ mod tests {
#[simd_test = "avx"]
unsafe fn test_mm_permute_pd() {
let a = f64x2::new(4., 3.);
let r = avx::_mm_permute_pd(a, 1);
let e = f64x2::new(3., 4.);
assert_eq!(r, e);
let a = _mm_setr_pd(4., 3.);
let r = _mm_permute_pd(a, 1);
let e = _mm_setr_pd(3., 4.);
assert_eq_m128d(r, e);
}
#[simd_test = "avx"]
@ -3271,8 +3266,8 @@ mod tests {
#[simd_test = "avx"]
unsafe fn test_mm256_insertf128_pd() {
let a = f64x4::new(1., 2., 3., 4.);
let b = f64x2::new(5., 6.);
let r = avx::_mm256_insertf128_pd(a, b, 0);
let b = _mm_setr_pd(5., 6.);
let r = _mm256_insertf128_pd(a, b, 0);
let e = f64x4::new(5., 6., 3., 4.);
assert_eq!(r, e);
}
@ -4078,8 +4073,8 @@ mod tests {
#[simd_test = "avx"]
unsafe fn test_mm256_castpd256_pd128() {
let a = f64x4::new(1., 2., 3., 4.);
let r = avx::_mm256_castpd256_pd128(a);
assert_eq!(r, f64x2::new(1., 2.));
let r = _mm256_castpd256_pd128(a);
assert_eq_m128d(r, _mm_setr_pd(1., 2.));
}
#[simd_test = "avx"]
@ -4107,8 +4102,8 @@ mod tests {
#[simd_test = "avx"]
unsafe fn test_mm256_zextpd128_pd256() {
let a = f64x2::new(1., 2.);
let r = avx::_mm256_zextpd128_pd256(a);
let a = _mm_setr_pd(1., 2.);
let r = _mm256_zextpd128_pd256(a);
let e = f64x4::new(1., 2., 0., 0.);
assert_eq!(r, e);
}
@ -4271,8 +4266,8 @@ mod tests {
&mut lo as *mut _ as *mut f64,
a,
);
assert_eq!(hi, f64x2::new(3., 4.));
assert_eq!(lo, f64x2::new(1., 2.));
assert_eq_m128d(hi, _mm_setr_pd(3., 4.));
assert_eq_m128d(lo, _mm_setr_pd(1., 2.));
}
#[simd_test = "avx"]

View file

@ -1711,20 +1711,6 @@ mod tests {
use stdsimd_test::simd_test;
use test::black_box; // Used to inhibit constant-folding.
#[target_feature = "+sse"]
unsafe fn assert_eq_m128(a: __m128, b: __m128) {
let r = _mm_cmpeq_ps(a, b);
if _mm_movemask_ps(r) != 0b1111 {
panic!("{:?} != {:?}", a, b);
}
}
#[target_feature = "+sse"]
unsafe fn get_m128(a: __m128, idx: usize) -> f32 {
union A { a: __m128, b: [f32; 4] };
transmute::<__m128, A>(a).b[idx]
}
#[simd_test = "sse"]
unsafe fn test_mm_add_ps() {
let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);

File diff suppressed because it is too large Load diff

View file

@ -2,6 +2,7 @@
use simd_llvm::{simd_shuffle2, simd_shuffle4};
use v128::*;
use x86::*;
#[cfg(test)]
use stdsimd_test::assert_instr;
@ -84,8 +85,7 @@ pub unsafe fn _mm_movedup_pd(a: f64x2) -> f64x2 {
#[inline(always)]
#[target_feature = "+sse3"]
#[cfg_attr(test, assert_instr(movddup))]
pub unsafe fn _mm_loaddup_pd(mem_addr: *const f64) -> f64x2 {
use x86::i586::sse2::_mm_load1_pd;
pub unsafe fn _mm_loaddup_pd(mem_addr: *const f64) -> __m128d {
_mm_load1_pd(mem_addr)
}
@ -131,9 +131,10 @@ mod tests {
use v128::*;
use x86::i586::sse3;
use x86::*;
#[simd_test = "sse3"]
unsafe fn _mm_addsub_ps() {
unsafe fn test_mm_addsub_ps() {
let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
let r = sse3::_mm_addsub_ps(a, b);
@ -141,7 +142,7 @@ mod tests {
}
#[simd_test = "sse3"]
unsafe fn _mm_addsub_pd() {
unsafe fn test_mm_addsub_pd() {
let a = f64x2::new(-1.0, 5.0);
let b = f64x2::new(-100.0, 20.0);
let r = sse3::_mm_addsub_pd(a, b);
@ -149,7 +150,7 @@ mod tests {
}
#[simd_test = "sse3"]
unsafe fn _mm_hadd_pd() {
unsafe fn test_mm_hadd_pd() {
let a = f64x2::new(-1.0, 5.0);
let b = f64x2::new(-100.0, 20.0);
let r = sse3::_mm_hadd_pd(a, b);
@ -157,7 +158,7 @@ mod tests {
}
#[simd_test = "sse3"]
unsafe fn _mm_hadd_ps() {
unsafe fn test_mm_hadd_ps() {
let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
let r = sse3::_mm_hadd_ps(a, b);
@ -165,7 +166,7 @@ mod tests {
}
#[simd_test = "sse3"]
unsafe fn _mm_hsub_pd() {
unsafe fn test_mm_hsub_pd() {
let a = f64x2::new(-1.0, 5.0);
let b = f64x2::new(-100.0, 20.0);
let r = sse3::_mm_hsub_pd(a, b);
@ -173,7 +174,7 @@ mod tests {
}
#[simd_test = "sse3"]
unsafe fn _mm_hsub_ps() {
unsafe fn test_mm_hsub_ps() {
let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
let r = sse3::_mm_hsub_ps(a, b);
@ -181,7 +182,7 @@ mod tests {
}
#[simd_test = "sse3"]
unsafe fn _mm_lddqu_si128() {
unsafe fn test_mm_lddqu_si128() {
#[cfg_attr(rustfmt, rustfmt_skip)]
let a = __m128i::from(i8x16::new(
1, 2, 3, 4,
@ -194,30 +195,30 @@ mod tests {
}
#[simd_test = "sse3"]
unsafe fn _mm_movedup_pd() {
unsafe fn test_mm_movedup_pd() {
let a = f64x2::new(-1.0, 5.0);
let r = sse3::_mm_movedup_pd(a);
assert_eq!(r, f64x2::new(-1.0, -1.0));
}
#[simd_test = "sse3"]
unsafe fn _mm_movehdup_ps() {
unsafe fn test_mm_movehdup_ps() {
let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
let r = sse3::_mm_movehdup_ps(a);
assert_eq!(r, f32x4::new(5.0, 5.0, -10.0, -10.0));
}
#[simd_test = "sse3"]
unsafe fn _mm_moveldup_ps() {
unsafe fn test_mm_moveldup_ps() {
let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
let r = sse3::_mm_moveldup_ps(a);
assert_eq!(r, f32x4::new(-1.0, -1.0, 0.0, 0.0));
}
#[simd_test = "sse3"]
unsafe fn _mm_loaddup_pd() {
unsafe fn test_mm_loaddup_pd() {
let d = -5.0;
let r = sse3::_mm_loaddup_pd(&d);
assert_eq!(r, f64x2::new(d, d));
let r = _mm_loaddup_pd(&d);
assert_eq_m128d(r, _mm_setr_pd(d, d));
}
}

View file

@ -1,6 +1,7 @@
//! `i686`'s Streaming SIMD Extensions 4.1 (SSE4.1)
use v128::*;
use x86::*;
#[cfg(test)]
use stdsimd_test::assert_instr;
@ -111,8 +112,7 @@ pub unsafe fn _mm_test_all_zeros(a: __m128i, mask: __m128i) -> i32 {
#[cfg_attr(test, assert_instr(pcmpeqd))]
#[cfg_attr(test, assert_instr(ptest))]
pub unsafe fn _mm_test_all_ones(a: __m128i) -> i32 {
let b = i32x4::from(a);
_mm_testc_si128(a, __m128i::from(::x86::_mm_cmpeq_epi32(b, b)))
_mm_testc_si128(a, _mm_cmpeq_epi32(a, a))
}
/// Tests whether the specified bits in a 128-bit integer vector are

View file

@ -1,5 +1,7 @@
//! `x86` and `x86_64` intrinsics.
use core::mem;
#[macro_use]
mod macros;
@ -8,6 +10,70 @@ mod macros;
#[allow(non_camel_case_types)]
pub struct __m128(f32, f32, f32, f32);
/// A 128-bit SIMD vector type holding two packed `f64` lanes — the
/// `__m128d` type used by the SSE2 floating-point intrinsics.
#[repr(simd)]
#[derive(Clone, Copy, Debug)]
#[allow(non_camel_case_types)]
pub struct __m128d(f64, f64);
pub use v128::__m128i;
pub use v64::__m64;
#[cfg(test)]
mod test;
#[cfg(test)]
pub use self::test::*;
#[doc(hidden)]
#[allow(non_camel_case_types)]
// Internal convenience trait: reinterprets the raw bits of a 128-bit
// integer vector as any of the lane-typed `v128` vector types. Every
// method is a zero-cost `transmute` (a bit reinterpretation, not a lane
// conversion), which is sound because all of these types are 128 bits
// wide with identical layout.
trait m128iExt: Sized {
// Returns `self` as the canonical `__m128i` that the casts below
// transmute from.
fn as_m128i(self) -> __m128i;
// Reinterpret as 16 unsigned 8-bit lanes.
#[inline(always)]
fn as_u8x16(self) -> ::v128::u8x16 {
unsafe { mem::transmute(self.as_m128i()) }
}
// Reinterpret as 8 unsigned 16-bit lanes.
#[inline(always)]
fn as_u16x8(self) -> ::v128::u16x8 {
unsafe { mem::transmute(self.as_m128i()) }
}
// Reinterpret as 4 unsigned 32-bit lanes.
#[inline(always)]
fn as_u32x4(self) -> ::v128::u32x4 {
unsafe { mem::transmute(self.as_m128i()) }
}
// Reinterpret as 2 unsigned 64-bit lanes.
#[inline(always)]
fn as_u64x2(self) -> ::v128::u64x2 {
unsafe { mem::transmute(self.as_m128i()) }
}
// Reinterpret as 16 signed 8-bit lanes.
#[inline(always)]
fn as_i8x16(self) -> ::v128::i8x16 {
unsafe { mem::transmute(self.as_m128i()) }
}
// Reinterpret as 8 signed 16-bit lanes.
#[inline(always)]
fn as_i16x8(self) -> ::v128::i16x8 {
unsafe { mem::transmute(self.as_m128i()) }
}
// Reinterpret as 4 signed 32-bit lanes.
#[inline(always)]
fn as_i32x4(self) -> ::v128::i32x4 {
unsafe { mem::transmute(self.as_m128i()) }
}
// Reinterpret as 2 signed 64-bit lanes.
#[inline(always)]
fn as_i64x2(self) -> ::v128::i64x2 {
unsafe { mem::transmute(self.as_m128i()) }
}
}
// `__m128i` is itself the canonical integer vector, so the conversion is
// the identity; all the `as_*` reinterpretation methods come for free.
impl m128iExt for __m128i {
#[inline(always)]
fn as_m128i(self) -> __m128i { self }
}
mod i386;
pub use self::i386::*;

View file

@ -0,0 +1,32 @@
//! Utilities used in testing the x86 intrinsics
use std::mem;
use x86::*;
// Test helper: asserts lane-wise equality of two `__m128d` values using
// SSE2 compare + movemask. `_mm_cmpeq_pd` produces an all-ones lane for
// each pair of equal lanes and `_mm_movemask_pd` packs the two lane sign
// bits, so a mask of `0b11` means both lanes compared equal. NOTE: a
// `cmpeq` against NaN is false, so any NaN lane makes this panic.
#[target_feature = "+sse2"]
pub unsafe fn assert_eq_m128d(a: __m128d, b: __m128d) {
if _mm_movemask_pd(_mm_cmpeq_pd(a, b)) != 0b11 {
panic!("{:?} != {:?}", a, b);
}
}
// Test helper: extracts lane `idx` (0 or 1) of a `__m128d` by
// reinterpreting the vector as an `[f64; 2]` through a union; an
// out-of-range `idx` panics on the array bounds check.
#[target_feature = "+sse2"]
pub unsafe fn get_m128d(a: __m128d, idx: usize) -> f64 {
union A { a: __m128d, b: [f64; 2] };
mem::transmute::<__m128d, A>(a).b[idx]
}
// Test helper: asserts lane-wise equality of two `__m128` values.
// `_mm_cmpeq_ps` yields an all-ones lane per equal pair and
// `_mm_movemask_ps` packs the four lane sign bits, so `0b1111` means all
// four lanes matched. NOTE: `cmpeq` on NaN is false, so NaN lanes never
// compare equal and trigger the panic.
#[target_feature = "+sse"]
pub unsafe fn assert_eq_m128(a: __m128, b: __m128) {
let r = _mm_cmpeq_ps(a, b);
if _mm_movemask_ps(r) != 0b1111 {
panic!("{:?} != {:?}", a, b);
}
}
// Test helper: extracts lane `idx` (0..4) of a `__m128` by
// reinterpreting the vector as an `[f32; 4]` through a union; an
// out-of-range `idx` panics on the array bounds check.
#[target_feature = "+sse"]
pub unsafe fn get_m128(a: __m128, idx: usize) -> f32 {
union A { a: __m128, b: [f32; 4] };
mem::transmute::<__m128, A>(a).b[idx]
}

View file

@ -63,9 +63,12 @@
//! #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
//! #[target_feature = "+sse2"]
//! unsafe fn sum_sse2(x: i32x4) -> i32 {
//! let x = vendor::_mm_add_epi32(x, vendor::_mm_srli_si128(x.into(), 8).into());
//! let x = vendor::_mm_add_epi32(x, vendor::_mm_srli_si128(x.into(), 4).into());
//! vendor::_mm_cvtsi128_si32(x)
//! use std::mem;
//! let x: vendor::__m128i = mem::transmute(x);
//! let x = vendor::_mm_add_epi32(x, vendor::_mm_srli_si128(x, 8));
//! let x = vendor::_mm_add_epi32(x, vendor::_mm_srli_si128(x, 4));
//! let ret = vendor::_mm_cvtsi128_si32(x);
//! mem::transmute(ret)
//! }
//!
//! // Uses the SSE2 version if SSE2 is enabled for all target

View file

@ -98,6 +98,7 @@ fn to_type(t: &syn::Type) -> Tokens {
match *t {
syn::Type::Path(ref p) => match extract_path_ident(&p.path).as_ref() {
"__m128" => my_quote! { &F32x4 },
"__m128d" => my_quote! { &F64x2 },
"__m128i" => my_quote! { &I8x16 },
"__m256i" => my_quote! { &I8x32 },
"__m64" => my_quote! { &I8x8 },
@ -178,6 +179,10 @@ fn walk(root: &Path, files: &mut Vec<syn::File>) {
continue;
}
if path.file_name().and_then(|s| s.to_str()) == Some("test.rs") {
continue
}
let mut contents = String::new();
File::open(&path)
.unwrap()