Lower the instruction limit to 20 (#262)

Right now it's 30, which is a bit high; most of the intrinsics requiring all these instructions ended up needing to be fixed anyway.
2018-01-03 17:21:01 -06:00 · 2018-01-03 17:21:01 -06:00 · edbfae36c0
commit edbfae36c0
parent 07ebce51b8
4 changed files with 86 additions and 82 deletions
--- a/library/stdarch/coresimd/src/x86/i686/mmx.rs
+++ b/library/stdarch/coresimd/src/x86/i686/mmx.rs
@ -32,8 +32,8 @@ pub unsafe fn _mm_setzero_si64() -> __m64 {
 #[inline(always)]
 #[target_feature = "+mmx,+sse"]
 #[cfg_attr(test, assert_instr(packsswb))]
-pub unsafe fn _mm_packs_pi16(a: i16x4, b: i16x4) -> i8x8 {
-    mem::transmute(packsswb(mem::transmute(a), mem::transmute(b)))
+pub unsafe fn _mm_packs_pi16(a: __m64, b: __m64) -> __m64 {
+    packsswb(a, b)
 }

 /// Convert packed 32-bit integers from `a` and `b` to packed 16-bit integers
@ -44,8 +44,8 @@ pub unsafe fn _mm_packs_pi16(a: i16x4, b: i16x4) -> i8x8 {
 #[inline(always)]
 #[target_feature = "+mmx,+sse"]
 #[cfg_attr(test, assert_instr(packssdw))]
-pub unsafe fn _mm_packs_pi32(a: i32x2, b: i32x2) -> i16x4 {
-    mem::transmute(packssdw(mem::transmute(a), mem::transmute(b)))
+pub unsafe fn _mm_packs_pi32(a: __m64, b: __m64) -> __m64 {
+    packssdw(a, b)
 }

 /// Compares the 8-bit integer elements of two 64-bit integer vectors of
@ -68,8 +68,8 @@ pub unsafe fn _mm_cmpgt_pi8(a: __m64, b: __m64) -> __m64 {
 #[inline(always)]
 #[target_feature = "+mmx"]
 #[cfg_attr(test, assert_instr(pcmpgtw))]
-pub unsafe fn _mm_cmpgt_pi16(a: i16x4, b: i16x4) -> i16x4 {
-    mem::transmute(pcmpgtw(mem::transmute(a), mem::transmute(b)))
+pub unsafe fn _mm_cmpgt_pi16(a: __m64, b: __m64) -> __m64 {
+    pcmpgtw(a, b)
 }

 /// Unpacks the upper 32 bits from two 64-bit integer vectors of
@ -77,8 +77,8 @@ pub unsafe fn _mm_cmpgt_pi16(a: i16x4, b: i16x4) -> i16x4 {
 #[inline(always)]
 #[target_feature = "+mmx"]
 #[cfg_attr(test, assert_instr(punpckhwd))] // FIXME punpcklbw expected
-pub unsafe fn _mm_unpackhi_pi16(a: i16x4, b: i16x4) -> i16x4 {
-    mem::transmute(punpckhwd(mem::transmute(a), mem::transmute(b)))
+pub unsafe fn _mm_unpackhi_pi16(a: __m64, b: __m64) -> __m64 {
+    punpckhwd(a, b)
 }

 /// Unpacks the lower 32 bits from two 64-bit integer vectors of [8 x i8]
@ -95,8 +95,8 @@ pub unsafe fn _mm_unpacklo_pi8(a: __m64, b: __m64) -> __m64 {
 #[inline(always)]
 #[target_feature = "+mmx"]
 #[cfg_attr(test, assert_instr(punpcklwd))]
-pub unsafe fn _mm_unpacklo_pi16(a: i16x4, b: i16x4) -> i16x4 {
-    mem::transmute(punpcklwd(mem::transmute(a), mem::transmute(b)))
+pub unsafe fn _mm_unpacklo_pi16(a: __m64, b: __m64) -> __m64 {
+    punpcklwd(a, b)
 }

 #[allow(improper_ctypes)]
@ -134,7 +134,7 @@ mod tests {
        let a = i16x4::new(-1, 2, -3, 4);
        let b = i16x4::new(-5, 6, -7, 8);
        let r = i8x8::new(-1, 2, -3, 4, -5, 6, -7, 8);
-        assert_eq!(r, mmx::_mm_packs_pi16(a, b));
+        assert_eq!(r, i8x8::from(mmx::_mm_packs_pi16(a.into(), b.into())));
    }

    #[simd_test = "sse"] // FIXME: should be mmx
@ -142,7 +142,7 @@ mod tests {
        let a = i32x2::new(-1, 2);
        let b = i32x2::new(-5, 6);
        let r = i16x4::new(-1, 2, -5, 6);
-        assert_eq!(r, mmx::_mm_packs_pi32(a, b));
+        assert_eq!(r, i16x4::from(mmx::_mm_packs_pi32(a.into(), b.into())));
    }

    #[simd_test = "mmx"]
@ -158,7 +158,7 @@ mod tests {
        let a = i16x4::new(0, 1, 2, 3);
        let b = i16x4::new(4, 3, 2, 1);
        let r = i16x4::new(0, 0, 0, -1);
-        assert_eq!(r, mmx::_mm_cmpgt_pi16(a, b));
+        assert_eq!(r, i16x4::from(mmx::_mm_cmpgt_pi16(a.into(), b.into())));
    }

    #[simd_test = "mmx"]
@ -166,7 +166,7 @@ mod tests {
        let a = i16x4::new(0, 1, 2, 3);
        let b = i16x4::new(4, 5, 6, 7);
        let r = i16x4::new(2, 6, 3, 7);
-        assert_eq!(r, mmx::_mm_unpackhi_pi16(a, b));
+        assert_eq!(r, i16x4::from(mmx::_mm_unpackhi_pi16(a.into(), b.into())));
    }

    #[simd_test = "mmx"]
@ -182,6 +182,6 @@ mod tests {
        let a = i16x4::new(0, 1, 2, 3);
        let b = i16x4::new(4, 5, 6, 7);
        let r = i16x4::new(0, 4, 1, 5);
-        assert_eq!(r, mmx::_mm_unpacklo_pi16(a, b));
+        assert_eq!(r, i16x4::from(mmx::_mm_unpacklo_pi16(a.into(), b.into())));
    }
 }
--- a/library/stdarch/coresimd/src/x86/i686/sse.rs
+++ b/library/stdarch/coresimd/src/x86/i686/sse.rs
@ -50,8 +50,8 @@ extern "C" {
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(pmaxsw))]
-pub unsafe fn _mm_max_pi16(a: i16x4, b: i16x4) -> i16x4 {
-    mem::transmute(pmaxsw(mem::transmute(a), mem::transmute(b)))
+pub unsafe fn _mm_max_pi16(a: __m64, b: __m64) -> __m64 {
+    pmaxsw(a, b)
 }

 /// Compares the packed 16-bit signed integers of `a` and `b` writing the
@ -59,7 +59,7 @@ pub unsafe fn _mm_max_pi16(a: i16x4, b: i16x4) -> i16x4 {
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(pmaxsw))]
-pub unsafe fn _m_pmaxsw(a: i16x4, b: i16x4) -> i16x4 {
+pub unsafe fn _m_pmaxsw(a: __m64, b: __m64) -> __m64 {
    _mm_max_pi16(a, b)
 }

@ -86,8 +86,8 @@ pub unsafe fn _m_pmaxub(a: __m64, b: __m64) -> __m64 {
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(pminsw))]
-pub unsafe fn _mm_min_pi16(a: i16x4, b: i16x4) -> i16x4 {
-    mem::transmute(pminsw(mem::transmute(a), mem::transmute(b)))
+pub unsafe fn _mm_min_pi16(a: __m64, b: __m64) -> __m64 {
+    pminsw(a, b)
 }

 /// Compares the packed 16-bit signed integers of `a` and `b` writing the
@ -95,7 +95,7 @@ pub unsafe fn _mm_min_pi16(a: i16x4, b: i16x4) -> i16x4 {
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(pminsw))]
-pub unsafe fn _m_pminsw(a: i16x4, b: i16x4) -> i16x4 {
+pub unsafe fn _m_pminsw(a: __m64, b: __m64) -> __m64 {
    _mm_min_pi16(a, b)
 }

@ -123,8 +123,8 @@ pub unsafe fn _m_pminub(a: __m64, b: __m64) -> __m64 {
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(pmulhuw))]
-pub unsafe fn _mm_mulhi_pu16(a: u16x4, b: u16x4) -> u16x4 {
-    mem::transmute(pmulhuw(mem::transmute(a), mem::transmute(b)))
+pub unsafe fn _mm_mulhi_pu16(a: __m64, b: __m64) -> __m64 {
+    pmulhuw(a, b)
 }

 /// Multiplies packed 16-bit unsigned integer values and writes the
@ -133,7 +133,7 @@ pub unsafe fn _mm_mulhi_pu16(a: u16x4, b: u16x4) -> u16x4 {
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(pmulhuw))]
-pub unsafe fn _m_pmulhuw(a: u16x4, b: u16x4) -> u16x4 {
+pub unsafe fn _m_pmulhuw(a: __m64, b: __m64) -> __m64 {
    _mm_mulhi_pu16(a, b)
 }

@ -163,8 +163,8 @@ pub unsafe fn _m_pavgb(a: __m64, b: __m64) -> __m64 {
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(pavgw))]
-pub unsafe fn _mm_avg_pu16(a: u16x4, b: u16x4) -> u16x4 {
-    mem::transmute(pavgw(mem::transmute(a), mem::transmute(b)))
+pub unsafe fn _mm_avg_pu16(a: __m64, b: __m64) -> __m64 {
+    pavgw(a, b)
 }

 /// Computes the rounded averages of the packed unsigned 16-bit integer
@ -173,7 +173,7 @@ pub unsafe fn _mm_avg_pu16(a: u16x4, b: u16x4) -> u16x4 {
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(pavgw))]
-pub unsafe fn _m_pavgw(a: u16x4, b: u16x4) -> u16x4 {
+pub unsafe fn _m_pavgw(a: __m64, b: __m64) -> __m64 {
    _mm_avg_pu16(a, b)
 }

@ -225,7 +225,7 @@ pub unsafe fn _mm_cvt_pi2ps(a: f32x4, b: i32x2) -> f32x4 {
 /// float].
 #[inline(always)]
 #[target_feature = "+sse"]
-pub unsafe fn _mm_cvtpi16_ps(a: i16x4) -> f32x4 {
+pub unsafe fn _mm_cvtpi16_ps(a: __m64) -> f32x4 {
    let b = mmx::_mm_setzero_si64();
    let b = mmx::_mm_cmpgt_pi16(mem::transmute(b), a);
    let c = mmx::_mm_unpackhi_pi16(a, b);
@ -240,14 +240,14 @@ pub unsafe fn _mm_cvtpi16_ps(a: i16x4) -> f32x4 {
 /// 128-bit vector of [4 x float].
 #[inline(always)]
 #[target_feature = "+sse"]
-pub unsafe fn _mm_cvtpu16_ps(a: u16x4) -> f32x4 {
-    let b = mem::transmute(mmx::_mm_setzero_si64());
-    let c = mmx::_mm_unpackhi_pi16(a.as_i16x4(), b);
+pub unsafe fn _mm_cvtpu16_ps(a: __m64) -> f32x4 {
+    let b = mmx::_mm_setzero_si64();
+    let c = mmx::_mm_unpackhi_pi16(a, b);
    let r = i586::_mm_setzero_ps();
-    let r = cvtpi2ps(r, mem::transmute(c));
+    let r = cvtpi2ps(r, c);
    let r = i586::_mm_movelh_ps(r, r);
-    let c = mmx::_mm_unpacklo_pi16(a.as_i16x4(), b);
-    cvtpi2ps(r, mem::transmute(c))
+    let c = mmx::_mm_unpacklo_pi16(a, b);
+    cvtpi2ps(r, c)
 }

 /// Converts the lower four 8-bit values from a 64-bit vector of [8 x i8]
@ -256,9 +256,9 @@ pub unsafe fn _mm_cvtpu16_ps(a: u16x4) -> f32x4 {
 #[target_feature = "+sse"]
 pub unsafe fn _mm_cvtpi8_ps(a: __m64) -> f32x4 {
    let b = mmx::_mm_setzero_si64();
-    let b = mmx::_mm_cmpgt_pi8(mem::transmute(b), a);
+    let b = mmx::_mm_cmpgt_pi8(b, a);
    let b = mmx::_mm_unpacklo_pi8(a, b);
-    _mm_cvtpi16_ps(mem::transmute(b))
+    _mm_cvtpi16_ps(b)
 }

 /// Converts the lower four unsigned 8-bit integer values from a 64-bit
@ -267,8 +267,8 @@ pub unsafe fn _mm_cvtpi8_ps(a: __m64) -> f32x4 {
 #[target_feature = "+sse"]
 pub unsafe fn _mm_cvtpu8_ps(a: __m64) -> f32x4 {
    let b = mmx::_mm_setzero_si64();
-    let b = mmx::_mm_unpacklo_pi8(a, mem::transmute(b));
-    _mm_cvtpi16_ps(mem::transmute(b))
+    let b = mmx::_mm_unpacklo_pi8(a, b);
+    _mm_cvtpi16_ps(b)
 }

 /// Converts the two 32-bit signed integer values from each 64-bit vector
@ -338,9 +338,9 @@ pub unsafe fn _m_pextrw(a: i16x4, imm2: i32) -> i16 {
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(pinsrw, imm2 = 0))]
-pub unsafe fn _mm_insert_pi16(a: i16x4, d: i32, imm2: i32) -> i16x4 {
+pub unsafe fn _mm_insert_pi16(a: __m64, d: i32, imm2: i32) -> __m64 {
    macro_rules! call {
-        ($imm2:expr) => { mem::transmute(pinsrw(mem::transmute(a), d, $imm2)) }
+        ($imm2:expr) => { pinsrw(a, d, $imm2) }
    }
    constify_imm2!(imm2, call)
 }
@ -351,7 +351,7 @@ pub unsafe fn _mm_insert_pi16(a: i16x4, d: i32, imm2: i32) -> i16x4 {
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(pinsrw, imm2 = 0))]
-pub unsafe fn _m_pinsrw(a: i16x4, d: i32, imm2: i32) -> i16x4 {
+pub unsafe fn _m_pinsrw(a: __m64, d: i32, imm2: i32) -> __m64 {
    _mm_insert_pi16(a, d, imm2)
 }

@ -380,9 +380,9 @@ pub unsafe fn _m_pmovmskb(a: i16x4) -> i32 {
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(pshufw, imm8 = 0))]
-pub unsafe fn _mm_shuffle_pi16(a: i16x4, imm8: i32) -> i16x4 {
+pub unsafe fn _mm_shuffle_pi16(a: __m64, imm8: i32) -> __m64 {
    macro_rules! call {
-        ($imm8:expr) => { mem::transmute(pshufw(mem::transmute(a), $imm8)) }
+        ($imm8:expr) => { pshufw(a, $imm8) }
    }
    constify_imm8!(imm8, call)
 }
@ -392,7 +392,7 @@ pub unsafe fn _mm_shuffle_pi16(a: i16x4, imm8: i32) -> i16x4 {
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(pshufw, imm8 = 0))]
-pub unsafe fn _m_pshufw(a: i16x4, imm8: i32) -> i16x4 {
+pub unsafe fn _m_pshufw(a: __m64, imm8: i32) -> __m64 {
    _mm_shuffle_pi16(a, imm8)
 }

@ -419,8 +419,8 @@ pub unsafe fn _mm_cvtt_ps2pi(a: f32x4) -> i32x2 {
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(cvtps2pi))]
-pub unsafe fn _mm_cvtps_pi32(a: f32x4) -> i32x2 {
-    mem::transmute(cvtps2pi(a))
+pub unsafe fn _mm_cvtps_pi32(a: f32x4) -> __m64 {
+    cvtps2pi(a)
 }

 /// Convert the two lower packed single-precision (32-bit) floating-point
@ -428,7 +428,7 @@ pub unsafe fn _mm_cvtps_pi32(a: f32x4) -> i32x2 {
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(cvtps2pi))]
-pub unsafe fn _mm_cvt_ps2pi(a: f32x4) -> i32x2 {
+pub unsafe fn _mm_cvt_ps2pi(a: f32x4) -> __m64 {
    _mm_cvtps_pi32(a)
 }

@ -437,7 +437,7 @@ pub unsafe fn _mm_cvt_ps2pi(a: f32x4) -> i32x2 {
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(cvtps2pi))]
-pub unsafe fn _mm_cvtps_pi16(a: f32x4) -> i16x4 {
+pub unsafe fn _mm_cvtps_pi16(a: f32x4) -> __m64 {
    let b = _mm_cvtps_pi32(a);
    let a = i586::_mm_movehl_ps(a, a);
    let c = _mm_cvtps_pi32(a);
@ -450,10 +450,10 @@ pub unsafe fn _mm_cvtps_pi16(a: f32x4) -> i16x4 {
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(cvtps2pi))]
-pub unsafe fn _mm_cvtps_pi8(a: f32x4) -> i8x8 {
+pub unsafe fn _mm_cvtps_pi8(a: f32x4) -> __m64 {
    let b = _mm_cvtps_pi16(a);
    let c = mmx::_mm_setzero_si64();
-    mmx::_mm_packs_pi16(b, mem::transmute(c))
+    mmx::_mm_packs_pi16(b, c)
 }

 #[cfg(test)]
@ -472,8 +472,8 @@ mod tests {
        let b = i16x4::new(5, -2, 7, -4);
        let r = i16x4::new(5, 6, 7, 8);

-        assert_eq!(r, sse::_mm_max_pi16(a, b));
-        assert_eq!(r, sse::_m_pmaxsw(a, b));
+        assert_eq!(r, i16x4::from(sse::_mm_max_pi16(a.into(), b.into())));
+        assert_eq!(r, i16x4::from(sse::_m_pmaxsw(a.into(), b.into())));
    }

    #[simd_test = "sse"]
@ -492,8 +492,8 @@ mod tests {
        let b = i16x4::new(5, -2, 7, -4);
        let r = i16x4::new(-1, -2, -3, -4);

-        assert_eq!(r, sse::_mm_min_pi16(a, b));
-        assert_eq!(r, sse::_m_pminsw(a, b));
+        assert_eq!(r, i16x4::from(sse::_mm_min_pi16(a.into(), b.into())));
+        assert_eq!(r, i16x4::from(sse::_m_pminsw(a.into(), b.into())));
    }

    #[simd_test = "sse"]
@ -509,7 +509,7 @@ mod tests {
    #[simd_test = "sse"]
    unsafe fn _mm_mulhi_pu16() {
        let (a, b) = (u16x4::splat(1000), u16x4::splat(1001));
-        let r = sse::_mm_mulhi_pu16(a, b);
+        let r = u16x4::from(sse::_mm_mulhi_pu16(a.into(), b.into()));
        assert_eq!(r, u16x4::splat(15));
    }

@ -526,10 +526,10 @@ mod tests {
    #[simd_test = "sse"]
    unsafe fn _mm_avg_pu16() {
        let (a, b) = (u16x4::splat(3), u16x4::splat(9));
-        let r = sse::_mm_avg_pu16(a, b);
+        let r = u16x4::from(sse::_mm_avg_pu16(a.into(), b.into()));
        assert_eq!(r, u16x4::splat(6));

-        let r = sse::_m_pavgw(a, b);
+        let r = u16x4::from(sse::_m_pavgw(a.into(), b.into()));
        assert_eq!(r, u16x4::splat(6));
    }

@ -561,7 +561,7 @@ mod tests {
    unsafe fn _mm_cvtpi16_ps() {
        let a = i16x4::new(1, 2, 3, 4);
        let expected = f32x4::new(1., 2., 3., 4.);
-        let r = sse::_mm_cvtpi16_ps(a);
+        let r = sse::_mm_cvtpi16_ps(a.into());
        assert_eq!(r, expected);
    }

@ -569,7 +569,7 @@ mod tests {
    unsafe fn _mm_cvtpu16_ps() {
        let a = u16x4::new(1, 2, 3, 4);
        let expected = f32x4::new(1., 2., 3., 4.);
-        let r = sse::_mm_cvtpu16_ps(a);
+        let r = sse::_mm_cvtpu16_ps(a.into());
        assert_eq!(r, expected);
    }

@ -626,14 +626,14 @@ mod tests {
    #[simd_test = "sse"]
    unsafe fn _mm_insert_pi16() {
        let a = i16x4::new(1, 2, 3, 4);
-        let r = sse::_mm_insert_pi16(a, 0, 0b0);
+        let r = i16x4::from(sse::_mm_insert_pi16(a.into(), 0, 0b0));
        let expected = i16x4::new(0, 2, 3, 4);
        assert_eq!(r, expected);
-        let r = sse::_mm_insert_pi16(a, 0, 0b10);
+        let r = i16x4::from(sse::_mm_insert_pi16(a.into(), 0, 0b10));
        let expected = i16x4::new(1, 2, 0, 4);
        assert_eq!(r, expected);

-        let r = sse::_m_pinsrw(a, 0, 0b10);
+        let r = i16x4::from(sse::_m_pinsrw(a.into(), 0, 0b10));
        assert_eq!(r, expected);
    }

@ -650,11 +650,11 @@ mod tests {
    #[simd_test = "sse"]
    unsafe fn _mm_shuffle_pi16() {
        let a = i16x4::new(1, 2, 3, 4);
-        let r = sse::_mm_shuffle_pi16(a, 0b00_01_01_11);
+        let r = i16x4::from(sse::_mm_shuffle_pi16(a.into(), 0b00_01_01_11));
        let expected = i16x4::new(4, 2, 2, 1);
        assert_eq!(r, expected);

-        let r = sse::_m_pshufw(a, 0b00_01_01_11);
+        let r = i16x4::from(sse::_m_pshufw(a.into(), 0b00_01_01_11));
        assert_eq!(r, expected);
    }

@ -663,8 +663,8 @@ mod tests {
        let a = f32x4::new(1.0, 2.0, 3.0, 4.0);
        let r = i32x2::new(1, 2);

-        assert_eq!(r, sse::_mm_cvtps_pi32(a));
-        assert_eq!(r, sse::_mm_cvt_ps2pi(a));
+        assert_eq!(r, i32x2::from(sse::_mm_cvtps_pi32(a)));
+        assert_eq!(r, i32x2::from(sse::_mm_cvt_ps2pi(a)));
    }

    #[simd_test = "sse"]
@ -680,13 +680,13 @@ mod tests {
    unsafe fn _mm_cvtps_pi16() {
        let a = f32x4::new(7.0, 2.0, 3.0, 4.0);
        let r = i16x4::new(7, 2, 3, 4);
-        assert_eq!(r, sse::_mm_cvtps_pi16(a));
+        assert_eq!(r, i16x4::from(sse::_mm_cvtps_pi16(a)));
    }

    #[simd_test = "sse"]
    unsafe fn _mm_cvtps_pi8() {
        let a = f32x4::new(7.0, 2.0, 3.0, 4.0);
        let r = i8x8::new(7, 2, 3, 4, 0, 0, 0, 0);
-        assert_eq!(r, sse::_mm_cvtps_pi8(a));
+        assert_eq!(r, i8x8::from(sse::_mm_cvtps_pi8(a)));
    }
 }
--- a/library/stdarch/coresimd/src/x86/i686/ssse3.rs
+++ b/library/stdarch/coresimd/src/x86/i686/ssse3.rs
@ -3,7 +3,6 @@
 #[cfg(test)]
 use stdsimd_test::assert_instr;

-use core::mem;
 use v64::*;

 /// Compute the absolute value of packed 8-bit integers in `a` and
@ -29,8 +28,8 @@ pub unsafe fn _mm_abs_pi16(a: __m64) -> __m64 {
 #[inline(always)]
 #[target_feature = "+ssse3"]
 #[cfg_attr(test, assert_instr(pabsd))]
-pub unsafe fn _mm_abs_pi32(a: i32x2) -> u32x2 {
-    mem::transmute(pabsd(mem::transmute(a)))
+pub unsafe fn _mm_abs_pi32(a: __m64) -> __m64 {
+    pabsd(a)
 }

 /// Shuffle packed 8-bit integers in `a` according to shuffle control mask in
@ -70,8 +69,8 @@ pub unsafe fn _mm_hadd_pi16(a: __m64, b: __m64) -> __m64 {
 #[inline(always)]
 #[target_feature = "+ssse3"]
 #[cfg_attr(test, assert_instr(phaddd))]
-pub unsafe fn _mm_hadd_pi32(a: i32x2, b: i32x2) -> i32x2 {
-    mem::transmute(phaddd(mem::transmute(a), mem::transmute(b)))
+pub unsafe fn _mm_hadd_pi32(a: __m64, b: __m64) -> __m64 {
+    phaddd(a, b)
 }

 /// Horizontally add the adjacent pairs of values contained in 2 packed
@ -98,8 +97,8 @@ pub unsafe fn _mm_hsub_pi16(a: __m64, b: __m64) -> __m64 {
 #[inline(always)]
 #[target_feature = "+ssse3"]
 #[cfg_attr(test, assert_instr(phsubd))]
-pub unsafe fn _mm_hsub_pi32(a: i32x2, b: i32x2) -> i32x2 {
-    mem::transmute(phsubd(mem::transmute(a), mem::transmute(b)))
+pub unsafe fn _mm_hsub_pi32(a: __m64, b: __m64) -> __m64 {
+    phsubd(a, b)
 }

 /// Horizontally subtracts the adjacent pairs of values contained in 2
@ -164,8 +163,8 @@ pub unsafe fn _mm_sign_pi16(a: __m64, b: __m64) -> __m64 {
 #[inline(always)]
 #[target_feature = "+ssse3"]
 #[cfg_attr(test, assert_instr(psignd))]
-pub unsafe fn _mm_sign_pi32(a: i32x2, b: i32x2) -> i32x2 {
-    mem::transmute(psignd(mem::transmute(a), mem::transmute(b)))
+pub unsafe fn _mm_sign_pi32(a: __m64, b: __m64) -> __m64 {
+    psignd(a, b)
 }

 #[allow(improper_ctypes)]
@ -240,7 +239,7 @@ mod tests {

    #[simd_test = "ssse3"]
    unsafe fn _mm_abs_pi32() {
-        let r = ssse3::_mm_abs_pi32(i32x2::splat(-5));
+        let r = u32x2::from(ssse3::_mm_abs_pi32(i32x2::splat(-5).into()));
        assert_eq!(r, u32x2::splat(5));
    }

@ -275,7 +274,7 @@ mod tests {
        let a = i32x2::new(1, 2);
        let b = i32x2::new(4, 128);
        let expected = i32x2::new(3, 132);
-        let r = ssse3::_mm_hadd_pi32(a, b);
+        let r = i32x2::from(ssse3::_mm_hadd_pi32(a.into(), b.into()));
        assert_eq!(r, expected);
    }

@ -302,7 +301,7 @@ mod tests {
        let a = i32x2::new(1, 2);
        let b = i32x2::new(4, 128);
        let expected = i32x2::new(-1, -124);
-        let r = ssse3::_mm_hsub_pi32(a, b);
+        let r = i32x2::from(ssse3::_mm_hsub_pi32(a.into(), b.into()));
        assert_eq!(r, expected);
    }

@ -356,7 +355,7 @@ mod tests {
        let a = i32x2::new(-1, 2);
        let b = i32x2::new(1, 0);
        let expected = i32x2::new(-1, 0);
-        let r = ssse3::_mm_sign_pi32(a, b);
+        let r = i32x2::from(ssse3::_mm_sign_pi32(a.into(), b.into()));
        assert_eq!(r, expected);
    }
 }
--- a/library/stdarch/stdsimd-test/src/lib.rs
+++ b/library/stdarch/stdsimd-test/src/lib.rs
@ -323,7 +323,12 @@ pub fn assert(fnptr: usize, fnname: &str, expected: &str) {
        break;
    }

-    let instruction_limit = 30;
+    let instruction_limit = match expected {
+        // cpuid returns a pretty big aggregate structure so exempt it from the
+        // slightly more restrictive 20 instructions below
+        "cpuid" => 30,
+        _ => 20,
+    };
    let probably_only_one_instruction =
        function.instrs.len() < instruction_limit;