Assert intrinsic implementations are inlined properly (#261)

* assert_instr check for failed inlining

* Fix `call` instructions showing up in some intrinsics

  The ABI of types like `u8x8` as they're defined isn't actually the underlying type we need for LLVM, but only `__m64` currently satisfies that. Apparently this (and the casts involved) caused some extraneous instructions for a number of intrinsics. They've all moved over to the `__m64` type now to ensure that they're what the underlying interface is.

* Allow PIC-relative `call` instructions on x86

  These should be harmless when evaluating whether we failed inlining
2018-01-03 16:37:45 -06:00 · 2018-01-03 16:37:45 -06:00 · 07ebce51b8
commit 07ebce51b8
parent acc8d3de10
5 changed files with 118 additions and 80 deletions
--- a/library/stdarch/coresimd/src/x86/i686/mmx.rs
+++ b/library/stdarch/coresimd/src/x86/i686/mmx.rs
@@ -56,8 +56,8 @@ pub unsafe fn _mm_packs_pi32(a: i32x2, b: i32x2) -> i16x4 {
 #[inline(always)]
 #[target_feature = "+mmx"]
 #[cfg_attr(test, assert_instr(pcmpgtb))]
-pub unsafe fn _mm_cmpgt_pi8(a: i8x8, b: i8x8) -> i8x8 {
-    mem::transmute(pcmpgtb(mem::transmute(a), mem::transmute(b)))
+pub unsafe fn _mm_cmpgt_pi8(a: __m64, b: __m64) -> __m64 {
+    pcmpgtb(a, b)
 }

 /// Compares the 16-bit integer elements of two 64-bit integer vectors of
@@ -86,8 +86,8 @@ pub unsafe fn _mm_unpackhi_pi16(a: i16x4, b: i16x4) -> i16x4 {
 #[inline(always)]
 #[target_feature = "+mmx"]
 #[cfg_attr(test, assert_instr(punpcklbw))]
-pub unsafe fn _mm_unpacklo_pi8(a: i8x8, b: i8x8) -> i8x8 {
-    mem::transmute(punpcklbw(mem::transmute(a), mem::transmute(b)))
+pub unsafe fn _mm_unpacklo_pi8(a: __m64, b: __m64) -> __m64 {
+    punpcklbw(a, b)
 }

 /// Unpacks the lower 32 bits from two 64-bit integer vectors of
@@ -150,7 +150,7 @@ mod tests {
        let a = i8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
        let b = i8x8::new(8, 7, 6, 5, 4, 3, 2, 1);
        let r = i8x8::new(0, 0, 0, 0, 0, -1, -1, -1);
-        assert_eq!(r, mmx::_mm_cmpgt_pi8(a, b));
+        assert_eq!(r, i8x8::from(mmx::_mm_cmpgt_pi8(a.into(), b.into())));
    }

    #[simd_test = "mmx"]
@@ -174,7 +174,7 @@ mod tests {
        let a = i8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
        let b = i8x8::new(8, 9, 10, 11, 12, 13, 14, 15);
        let r = i8x8::new(0, 8, 1, 9, 2, 10, 3, 11);
-        assert_eq!(r, mmx::_mm_unpacklo_pi8(a, b));
+        assert_eq!(r, i8x8::from(mmx::_mm_unpacklo_pi8(a.into(), b.into())));
    }

    #[simd_test = "mmx"]
--- a/library/stdarch/coresimd/src/x86/i686/sse.rs
+++ b/library/stdarch/coresimd/src/x86/i686/sse.rs
@@ -68,8 +68,8 @@ pub unsafe fn _m_pmaxsw(a: i16x4, b: i16x4) -> i16x4 {
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(pmaxub))]
-pub unsafe fn _mm_max_pu8(a: u8x8, b: u8x8) -> u8x8 {
-    mem::transmute(pmaxub(mem::transmute(a), mem::transmute(b)))
+pub unsafe fn _mm_max_pu8(a: __m64, b: __m64) -> __m64 {
+    pmaxub(a, b)
 }

 /// Compares the packed 8-bit signed integers of `a` and `b` writing the
@@ -77,7 +77,7 @@ pub unsafe fn _mm_max_pu8(a: u8x8, b: u8x8) -> u8x8 {
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(pmaxub))]
-pub unsafe fn _m_pmaxub(a: u8x8, b: u8x8) -> u8x8 {
+pub unsafe fn _m_pmaxub(a: __m64, b: __m64) -> __m64 {
    _mm_max_pu8(a, b)
 }

@@ -104,8 +104,8 @@ pub unsafe fn _m_pminsw(a: i16x4, b: i16x4) -> i16x4 {
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(pminub))]
-pub unsafe fn _mm_min_pu8(a: u8x8, b: u8x8) -> u8x8 {
-    mem::transmute(pminub(mem::transmute(a), mem::transmute(b)))
+pub unsafe fn _mm_min_pu8(a: __m64, b: __m64) -> __m64 {
+    pminub(a, b)
 }

 /// Compares the packed 8-bit signed integers of `a` and `b` writing the
@@ -113,7 +113,7 @@ pub unsafe fn _mm_min_pu8(a: u8x8, b: u8x8) -> u8x8 {
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(pminub))]
-pub unsafe fn _m_pminub(a: u8x8, b: u8x8) -> u8x8 {
+pub unsafe fn _m_pminub(a: __m64, b: __m64) -> __m64 {
    _mm_min_pu8(a, b)
 }

@@ -143,8 +143,8 @@ pub unsafe fn _m_pmulhuw(a: u16x4, b: u16x4) -> u16x4 {
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(pavgb))]
-pub unsafe fn _mm_avg_pu8(a: u8x8, b: u8x8) -> u8x8 {
-    mem::transmute(pavgb(mem::transmute(a), mem::transmute(b)))
+pub unsafe fn _mm_avg_pu8(a: __m64, b: __m64) -> __m64 {
+    pavgb(a, b)
 }

 /// Computes the rounded averages of the packed unsigned 8-bit integer
@@ -153,7 +153,7 @@ pub unsafe fn _mm_avg_pu8(a: u8x8, b: u8x8) -> u8x8 {
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(pavgb))]
-pub unsafe fn _m_pavgb(a: u8x8, b: u8x8) -> u8x8 {
+pub unsafe fn _m_pavgb(a: __m64, b: __m64) -> __m64 {
    _mm_avg_pu8(a, b)
 }

@@ -184,8 +184,8 @@ pub unsafe fn _m_pavgw(a: u16x4, b: u16x4) -> u16x4 {
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(psadbw))]
-pub unsafe fn _mm_sad_pu8(a: u8x8, b: u8x8) -> __m64 {
-    mem::transmute(psadbw(mem::transmute(a), mem::transmute(b)))
+pub unsafe fn _mm_sad_pu8(a: __m64, b: __m64) -> __m64 {
+    psadbw(a, b)
 }

 /// Subtracts the corresponding 8-bit unsigned integer values of the two
@@ -195,8 +195,8 @@ pub unsafe fn _mm_sad_pu8(a: u8x8, b: u8x8) -> __m64 {
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(psadbw))]
-pub unsafe fn _m_psadbw(a: u8x8, b: u8x8) -> __m64 {
-    mem::transmute(_mm_sad_pu8(a, b))
+pub unsafe fn _m_psadbw(a: __m64, b: __m64) -> __m64 {
+    _mm_sad_pu8(a, b)
 }

 /// Converts two elements of a 64-bit vector of [2 x i32] into two
@@ -254,7 +254,7 @@ pub unsafe fn _mm_cvtpu16_ps(a: u16x4) -> f32x4 {
 /// into a 128-bit vector of [4 x float].
 #[inline(always)]
 #[target_feature = "+sse"]
-pub unsafe fn _mm_cvtpi8_ps(a: i8x8) -> f32x4 {
+pub unsafe fn _mm_cvtpi8_ps(a: __m64) -> f32x4 {
    let b = mmx::_mm_setzero_si64();
    let b = mmx::_mm_cmpgt_pi8(mem::transmute(b), a);
    let b = mmx::_mm_unpacklo_pi8(a, b);
@@ -265,9 +265,9 @@ pub unsafe fn _mm_cvtpi8_ps(a: i8x8) -> f32x4 {
 /// vector of [8 x u8] into a 128-bit vector of [4 x float].
 #[inline(always)]
 #[target_feature = "+sse"]
-pub unsafe fn _mm_cvtpu8_ps(a: u8x8) -> f32x4 {
+pub unsafe fn _mm_cvtpu8_ps(a: __m64) -> f32x4 {
    let b = mmx::_mm_setzero_si64();
-    let b = mmx::_mm_unpacklo_pi8(a.as_i8x8(), mem::transmute(b));
+    let b = mmx::_mm_unpacklo_pi8(a, mem::transmute(b));
    _mm_cvtpi16_ps(mem::transmute(b))
 }

@@ -293,8 +293,8 @@ pub unsafe fn _mm_cvtpi32x2_ps(a: i32x2, b: i32x2) -> f32x4 {
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(maskmovq))]
-pub unsafe fn _mm_maskmove_si64(a: i8x8, mask: i8x8, mem_addr: *mut i8) {
-    maskmovq(mem::transmute(a), mem::transmute(mask), mem_addr)
+pub unsafe fn _mm_maskmove_si64(a: __m64, mask: __m64, mem_addr: *mut i8) {
+    maskmovq(a, mask, mem_addr)
 }

 /// Conditionally copies the values from each 8-bit element in the first
@@ -307,7 +307,7 @@ pub unsafe fn _mm_maskmove_si64(a: i8x8, mask: i8x8, mem_addr: *mut i8) {
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(maskmovq))]
-pub unsafe fn _m_maskmovq(a: i8x8, mask: i8x8, mem_addr: *mut i8) {
+pub unsafe fn _m_maskmovq(a: __m64, mask: __m64, mem_addr: *mut i8) {
    _mm_maskmove_si64(a, mask, mem_addr)
 }

@@ -482,8 +482,8 @@ mod tests {
        let b = u8x8::new(5, 2, 7, 4, 5, 2, 7, 4);
        let r = u8x8::new(5, 6, 7, 8, 5, 6, 7, 8);

-        assert_eq!(r, sse::_mm_max_pu8(a, b));
-        assert_eq!(r, sse::_m_pmaxub(a, b));
+        assert_eq!(r, u8x8::from(sse::_mm_max_pu8(a.into(), b.into())));
+        assert_eq!(r, u8x8::from(sse::_m_pmaxub(a.into(), b.into())));
    }

    #[simd_test = "sse"]
@@ -502,8 +502,8 @@ mod tests {
        let b = u8x8::new(5, 2, 7, 4, 5, 2, 7, 4);
        let r = u8x8::new(2, 2, 3, 4, 2, 2, 3, 4);

-        assert_eq!(r, sse::_mm_min_pu8(a, b));
-        assert_eq!(r, sse::_m_pminub(a, b));
+        assert_eq!(r, u8x8::from(sse::_mm_min_pu8(a.into(), b.into())));
+        assert_eq!(r, u8x8::from(sse::_m_pminub(a.into(), b.into())));
    }

    #[simd_test = "sse"]
@@ -516,10 +516,10 @@ mod tests {
    #[simd_test = "sse"]
    unsafe fn _mm_avg_pu8() {
        let (a, b) = (u8x8::splat(3), u8x8::splat(9));
-        let r = sse::_mm_avg_pu8(a, b);
+        let r = u8x8::from(sse::_mm_avg_pu8(a.into(), b.into()));
        assert_eq!(r, u8x8::splat(6));

-        let r = sse::_m_pavgb(a, b);
+        let r = u8x8::from(sse::_m_pavgb(a.into(), b.into()));
        assert_eq!(r, u8x8::splat(6));
    }

@@ -538,10 +538,10 @@ mod tests {
    unsafe fn _mm_sad_pu8() {
        let a = u8x8::new(255, 254, 253, 252, 1, 2, 3, 4);
        let b = u8x8::new(0, 0, 0, 0, 2, 1, 2, 1);
-        let r = sse::_mm_sad_pu8(a, b);
+        let r = sse::_mm_sad_pu8(a.into(), b.into());
        assert_eq!(r, mem::transmute(u16x4::new(1020, 0, 0, 0)));

-        let r = sse::_m_psadbw(a, b);
+        let r = sse::_m_psadbw(a.into(), b.into());
        assert_eq!(r, mem::transmute(u16x4::new(1020, 0, 0, 0)));
    }

@@ -577,7 +577,7 @@ mod tests {
    unsafe fn _mm_cvtpi8_ps() {
        let a = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
        let expected = f32x4::new(1., 2., 3., 4.);
-        let r = sse::_mm_cvtpi8_ps(a);
+        let r = sse::_mm_cvtpi8_ps(a.into());
        assert_eq!(r, expected);
    }

@@ -585,7 +585,7 @@ mod tests {
    unsafe fn _mm_cvtpu8_ps() {
        let a = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
        let expected = f32x4::new(1., 2., 3., 4.);
-        let r = sse::_mm_cvtpu8_ps(a);
+        let r = sse::_mm_cvtpu8_ps(a.into());
        assert_eq!(r, expected);
    }

@@ -603,11 +603,11 @@ mod tests {
        let a = i8x8::splat(9);
        let mask = i8x8::splat(0).replace(2, 0x80u8 as i8);
        let mut r = i8x8::splat(0);
-        sse::_mm_maskmove_si64(a, mask, &mut r as *mut _ as *mut i8);
+        sse::_mm_maskmove_si64(a.into(), mask.into(), &mut r as *mut _ as *mut i8);
        assert_eq!(r, i8x8::splat(0).replace(2, 9));

        let mut r = i8x8::splat(0);
-        sse::_m_maskmovq(a, mask, &mut r as *mut _ as *mut i8);
+        sse::_m_maskmovq(a.into(), mask.into(), &mut r as *mut _ as *mut i8);
        assert_eq!(r, i8x8::splat(0).replace(2, 9));
    }

--- a/library/stdarch/coresimd/src/x86/i686/sse41.rs
+++ b/library/stdarch/coresimd/src/x86/i686/sse41.rs
@@ -32,7 +32,7 @@ extern "C" {
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(ptest))]
 pub unsafe fn _mm_testz_si128(a: __m128i, mask: __m128i) -> i32 {
-    ptestz(a.into(), mask.into())
+    ptestz(i64x2::from(a), i64x2::from(mask))
 }

 /// Tests whether the specified bits in a 128-bit integer vector are all
@@ -52,7 +52,7 @@ pub unsafe fn _mm_testz_si128(a: __m128i, mask: __m128i) -> i32 {
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(ptest))]
 pub unsafe fn _mm_testc_si128(a: __m128i, mask: __m128i) -> i32 {
-    ptestc(a.into(), mask.into())
+    ptestc(i64x2::from(a), i64x2::from(mask))
 }

 /// Tests whether the specified bits in a 128-bit integer vector are
@@ -72,7 +72,7 @@ pub unsafe fn _mm_testc_si128(a: __m128i, mask: __m128i) -> i32 {
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(ptest))]
 pub unsafe fn _mm_testnzc_si128(a: __m128i, mask: __m128i) -> i32 {
-    ptestnzc(a.into(), mask.into())
+    ptestnzc(i64x2::from(a), i64x2::from(mask))
 }

 /// Tests whether the specified bits in a 128-bit integer vector are all
@@ -111,7 +111,8 @@ pub unsafe fn _mm_test_all_zeros(a: __m128i, mask: __m128i) -> i32 {
 #[cfg_attr(test, assert_instr(pcmpeqd))]
 #[cfg_attr(test, assert_instr(ptest))]
 pub unsafe fn _mm_test_all_ones(a: __m128i) -> i32 {
-    _mm_testc_si128(a, ::x86::_mm_cmpeq_epi32(a.into(), a.into()).into())
+    let b = i32x4::from(a);
+    _mm_testc_si128(a, __m128i::from(::x86::_mm_cmpeq_epi32(b, b)))
 }

 /// Tests whether the specified bits in a 128-bit integer vector are
--- a/library/stdarch/coresimd/src/x86/i686/ssse3.rs
+++ b/library/stdarch/coresimd/src/x86/i686/ssse3.rs
@@ -11,8 +11,8 @@ use v64::*;
 #[inline(always)]
 #[target_feature = "+ssse3"]
 #[cfg_attr(test, assert_instr(pabsb))]
-pub unsafe fn _mm_abs_pi8(a: i8x8) -> u8x8 {
-    mem::transmute(pabsb(mem::transmute(a)))
+pub unsafe fn _mm_abs_pi8(a: __m64) -> __m64 {
+    pabsb(a)
 }

 /// Compute the absolute value of packed 8-bit integers in `a`, and return the
@@ -20,8 +20,8 @@ pub unsafe fn _mm_abs_pi8(a: i8x8) -> u8x8 {
 #[inline(always)]
 #[target_feature = "+ssse3"]
 #[cfg_attr(test, assert_instr(pabsw))]
-pub unsafe fn _mm_abs_pi16(a: i16x4) -> u16x4 {
-    mem::transmute(pabsw(mem::transmute(a)))
+pub unsafe fn _mm_abs_pi16(a: __m64) -> __m64 {
+    pabsw(a)
 }

 /// Compute the absolute value of packed 32-bit integers in `a`, and return the
@@ -38,8 +38,8 @@ pub unsafe fn _mm_abs_pi32(a: i32x2) -> u32x2 {
 #[inline(always)]
 #[target_feature = "+ssse3"]
 #[cfg_attr(test, assert_instr(pshufb))]
-pub unsafe fn _mm_shuffle_pi8(a: u8x8, b: u8x8) -> u8x8 {
-    mem::transmute(pshufb(mem::transmute(a), mem::transmute(b)))
+pub unsafe fn _mm_shuffle_pi8(a: __m64, b: __m64) -> __m64 {
+    pshufb(a, b)
 }

 /// Concatenates the two 64-bit integer vector operands, and right-shifts
@@ -47,10 +47,10 @@ pub unsafe fn _mm_shuffle_pi8(a: u8x8, b: u8x8) -> u8x8 {
 #[inline(always)]
 #[target_feature = "+ssse3"]
 #[cfg_attr(test, assert_instr(palignr, n = 15))]
-pub unsafe fn _mm_alignr_pi8(a: u8x8, b: u8x8, n: i32) -> u8x8 {
+pub unsafe fn _mm_alignr_pi8(a: __m64, b: __m64, n: i32) -> __m64 {
    macro_rules! call {
        ($imm8:expr) => {
-            mem::transmute(palignrb(mem::transmute(a), mem::transmute(b), $imm8))
+            palignrb(a, b, $imm8)
        }
    }
    constify_imm8!(n, call)
@@ -61,8 +61,8 @@ pub unsafe fn _mm_alignr_pi8(a: u8x8, b: u8x8, n: i32) -> u8x8 {
 #[inline(always)]
 #[target_feature = "+ssse3"]
 #[cfg_attr(test, assert_instr(phaddw))]
-pub unsafe fn _mm_hadd_pi16(a: i16x4, b: i16x4) -> i16x4 {
-    mem::transmute(phaddw(mem::transmute(a), mem::transmute(b)))
+pub unsafe fn _mm_hadd_pi16(a: __m64, b: __m64) -> __m64 {
+    phaddw(a, b)
 }

 /// Horizontally add the adjacent pairs of values contained in 2 packed
@@ -80,8 +80,8 @@ pub unsafe fn _mm_hadd_pi32(a: i32x2, b: i32x2) -> i32x2 {
 #[inline(always)]
 #[target_feature = "+ssse3"]
 #[cfg_attr(test, assert_instr(phaddsw))]
-pub unsafe fn _mm_hadds_pi16(a: i16x4, b: i16x4) -> i16x4 {
-    mem::transmute(phaddsw(mem::transmute(a), mem::transmute(b)))
+pub unsafe fn _mm_hadds_pi16(a: __m64, b: __m64) -> __m64 {
+    phaddsw(a, b)
 }

 /// Horizontally subtracts the adjacent pairs of values contained in 2
@@ -89,8 +89,8 @@ pub unsafe fn _mm_hadds_pi16(a: i16x4, b: i16x4) -> i16x4 {
 #[inline(always)]
 #[target_feature = "+ssse3"]
 #[cfg_attr(test, assert_instr(phsubw))]
-pub unsafe fn _mm_hsub_pi16(a: i16x4, b: i16x4) -> i16x4 {
-    mem::transmute(phsubw(mem::transmute(a), mem::transmute(b)))
+pub unsafe fn _mm_hsub_pi16(a: __m64, b: __m64) -> __m64 {
+    phsubw(a, b)
 }

 /// Horizontally subtracts the adjacent pairs of values contained in 2
@@ -109,8 +109,8 @@ pub unsafe fn _mm_hsub_pi32(a: i32x2, b: i32x2) -> i32x2 {
 #[inline(always)]
 #[target_feature = "+ssse3"]
 #[cfg_attr(test, assert_instr(phsubsw))]
-pub unsafe fn _mm_hsubs_pi16(a: i16x4, b: i16x4) -> i16x4 {
-    mem::transmute(phsubsw(mem::transmute(a), mem::transmute(b)))
+pub unsafe fn _mm_hsubs_pi16(a: __m64, b: __m64) -> __m64 {
+    phsubsw(a, b)
 }

 /// Multiplies corresponding pairs of packed 8-bit unsigned integer
@@ -121,8 +121,8 @@ pub unsafe fn _mm_hsubs_pi16(a: i16x4, b: i16x4) -> i16x4 {
 #[inline(always)]
 #[target_feature = "+ssse3"]
 #[cfg_attr(test, assert_instr(pmaddubsw))]
-pub unsafe fn _mm_maddubs_pi16(a: u8x8, b: i8x8) -> i16x4 {
-    mem::transmute(pmaddubsw(mem::transmute(a), mem::transmute(b)))
+pub unsafe fn _mm_maddubs_pi16(a: __m64, b: __m64) -> __m64 {
+    pmaddubsw(a, b)
 }

 /// Multiplies packed 16-bit signed integer values, truncates the 32-bit
@@ -131,8 +131,8 @@ pub unsafe fn _mm_maddubs_pi16(a: u8x8, b: i8x8) -> i16x4 {
 #[inline(always)]
 #[target_feature = "+ssse3"]
 #[cfg_attr(test, assert_instr(pmulhrsw))]
-pub unsafe fn _mm_mulhrs_pi16(a: i16x4, b: i16x4) -> i16x4 {
-    mem::transmute(pmulhrsw(mem::transmute(a), mem::transmute(b)))
+pub unsafe fn _mm_mulhrs_pi16(a: __m64, b: __m64) -> __m64 {
+    pmulhrsw(a, b)
 }

 /// Negate packed 8-bit integers in `a` when the corresponding signed 8-bit
@@ -142,8 +142,8 @@ pub unsafe fn _mm_mulhrs_pi16(a: i16x4, b: i16x4) -> i16x4 {
 #[inline(always)]
 #[target_feature = "+ssse3"]
 #[cfg_attr(test, assert_instr(psignb))]
-pub unsafe fn _mm_sign_pi8(a: i8x8, b: i8x8) -> i8x8 {
-    mem::transmute(psignb(mem::transmute(a), mem::transmute(b)))
+pub unsafe fn _mm_sign_pi8(a: __m64, b: __m64) -> __m64 {
+    psignb(a, b)
 }

 /// Negate packed 16-bit integers in `a` when the corresponding signed 16-bit
@@ -153,8 +153,8 @@ pub unsafe fn _mm_sign_pi8(a: i8x8, b: i8x8) -> i8x8 {
 #[inline(always)]
 #[target_feature = "+ssse3"]
 #[cfg_attr(test, assert_instr(psignw))]
-pub unsafe fn _mm_sign_pi16(a: i16x4, b: i16x4) -> i16x4 {
-    mem::transmute(psignw(mem::transmute(a), mem::transmute(b)))
+pub unsafe fn _mm_sign_pi16(a: __m64, b: __m64) -> __m64 {
+    psignw(a, b)
 }

 /// Negate packed 32-bit integers in `a` when the corresponding signed 32-bit
@@ -228,13 +228,13 @@ mod tests {

    #[simd_test = "ssse3"]
    unsafe fn _mm_abs_pi8() {
-        let r = ssse3::_mm_abs_pi8(i8x8::splat(-5));
+        let r = u8x8::from(ssse3::_mm_abs_pi8(i8x8::splat(-5).into()));
        assert_eq!(r, u8x8::splat(5));
    }

    #[simd_test = "ssse3"]
    unsafe fn _mm_abs_pi16() {
-        let r = ssse3::_mm_abs_pi16(i16x4::splat(-5));
+        let r = u16x4::from(ssse3::_mm_abs_pi16(i16x4::splat(-5).into()));
        assert_eq!(r, u16x4::splat(5));
    }

@@ -249,7 +249,7 @@ mod tests {
        let a = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
        let b = u8x8::new(4, 128, 4, 3, 24, 12, 6, 19);
        let expected = u8x8::new(5, 0, 5, 4, 1, 5, 7, 4);
-        let r = ssse3::_mm_shuffle_pi8(a, b);
+        let r = u8x8::from(ssse3::_mm_shuffle_pi8(a.into(), b.into()));
        assert_eq!(r, expected);
    }

@@ -257,7 +257,7 @@ mod tests {
    unsafe fn _mm_alignr_pi8() {
        let a = u32x2::new(0x89ABCDEF_u32, 0x01234567_u32);
        let b = u32x2::new(0xBBAA9988_u32, 0xFFDDEECC_u32);
-        let r = ssse3::_mm_alignr_pi8(u8x8::from(a), u8x8::from(b), 4);
+        let r = ssse3::_mm_alignr_pi8(u8x8::from(a).into(), u8x8::from(b).into(), 4);
        assert_eq!(r, ::std::mem::transmute(0x89abcdefffddeecc_u64));
    }

@@ -266,7 +266,7 @@ mod tests {
        let a = i16x4::new(1, 2, 3, 4);
        let b = i16x4::new(4, 128, 4, 3);
        let expected = i16x4::new(3, 7, 132, 7);
-        let r = ssse3::_mm_hadd_pi16(a, b);
+        let r = i16x4::from(ssse3::_mm_hadd_pi16(a.into(), b.into()));
        assert_eq!(r, expected);
    }

@@ -284,7 +284,7 @@ mod tests {
        let a = i16x4::new(1, 2, 3, 4);
        let b = i16x4::new(32767, 1, -32768, -1);
        let expected = i16x4::new(3, 7, 32767, -32768);
-        let r = ssse3::_mm_hadds_pi16(a, b);
+        let r = i16x4::from(ssse3::_mm_hadds_pi16(a.into(), b.into()));
        assert_eq!(r, expected);
    }

@@ -293,7 +293,7 @@ mod tests {
        let a = i16x4::new(1, 2, 3, 4);
        let b = i16x4::new(4, 128, 4, 3);
        let expected = i16x4::new(-1, -1, -124, 1);
-        let r = ssse3::_mm_hsub_pi16(a, b);
+        let r = i16x4::from(ssse3::_mm_hsub_pi16(a.into(), b.into()));
        assert_eq!(r, expected);
    }

@@ -311,7 +311,7 @@ mod tests {
        let a = i16x4::new(1, 2, 3, 4);
        let b = i16x4::new(4, 128, 4, 3);
        let expected = i16x4::new(-1, -1, -124, 1);
-        let r = ssse3::_mm_hsubs_pi16(a, b);
+        let r = i16x4::from(ssse3::_mm_hsubs_pi16(a.into(), b.into()));
        assert_eq!(r, expected);
    }

@@ -320,7 +320,7 @@ mod tests {
        let a = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
        let b = i8x8::new(4, 63, 4, 3, 24, 12, 6, 19);
        let expected = i16x4::new(130, 24, 192, 194);
-        let r = ssse3::_mm_maddubs_pi16(a, b);
+        let r = i16x4::from(ssse3::_mm_maddubs_pi16(a.into(), b.into()));
        assert_eq!(r, expected);
    }

@@ -329,7 +329,7 @@ mod tests {
        let a = i16x4::new(1, 2, 3, 4);
        let b = i16x4::new(4, 32767, -1, -32768);
        let expected = i16x4::new(0, 2, 0, -4);
-        let r = ssse3::_mm_mulhrs_pi16(a, b);
+        let r = i16x4::from(ssse3::_mm_mulhrs_pi16(a.into(), b.into()));
        assert_eq!(r, expected);
    }

@@ -338,7 +338,7 @@ mod tests {
        let a = i8x8::new(1, 2, 3, 4, -5, -6, 7, 8);
        let b = i8x8::new(4, 64, 0, 3, 1, -1, -2, 1);
        let expected = i8x8::new(1, 2, 0, 4, -5, 6, -7, 8);
-        let r = ssse3::_mm_sign_pi8(a, b);
+        let r = i8x8::from(ssse3::_mm_sign_pi8(a.into(), b.into()));
        assert_eq!(r, expected);
    }

@@ -347,7 +347,7 @@ mod tests {
        let a = i16x4::new(-1, 2, 3, 4);
        let b = i16x4::new(1, -1, 1, 0);
        let expected = i16x4::new(-1, -2, 3, 0);
-        let r = ssse3::_mm_sign_pi16(a, b);
+        let r = i16x4::from(ssse3::_mm_sign_pi16(a.into(), b.into()));
        assert_eq!(r, expected);
    }

--- a/library/stdarch/stdsimd-test/src/lib.rs
+++ b/library/stdarch/stdsimd-test/src/lib.rs
@@ -293,9 +293,41 @@ pub fn assert(fnptr: usize, fnname: &str, expected: &str) {
        }
    }

-    let probably_only_one_instruction = function.instrs.len() < 30;
+    // Look for `call` instructions in the disassembly to detect whether
+    // inlining failed: all intrinsics are `#[inline(always)]`, so
+    // calling one intrinsic from another should not generate `call`
+    // instructions.
+    let mut inlining_failed = false;
+    for (i, instr) in function.instrs.iter().enumerate() {
+        let part = match instr.parts.get(0) {
+            Some(part) => part,
+            None => continue,
+        };
+        if !part.contains("call") {
+            continue
+        }

-    if found && probably_only_one_instruction {
+        // On 32-bit x86 position independent code will call itself and be
+        // immediately followed by a `pop` to learn about the current address.
+        // Let's not take that into account when considering whether a function
+        // failed inlining something.
+        let followed_by_pop = function.instrs.get(i + 1)
+            .and_then(|i| i.parts.get(0))
+            .map(|s| s.contains("pop"))
+            .unwrap_or(false);
+        if followed_by_pop && cfg!(target_arch = "x86") {
+            continue
+        }
+
+        inlining_failed = true;
+        break;
+    }
+
+    let instruction_limit = 30;
+    let probably_only_one_instruction =
+        function.instrs.len() < instruction_limit;
+
+    if found && probably_only_one_instruction && !inlining_failed {
        return;
    }

@@ -319,7 +351,12 @@ pub fn assert(fnptr: usize, fnname: &str, expected: &str) {
            expected
        );
    } else if !probably_only_one_instruction {
-        panic!("too many instructions in the disassembly");
+        panic!("instruction found, but the disassembly contains too many \
+                instructions: #instructions = {} >= {} (limit)",
+               function.instrs.len(), instruction_limit);
+    } else if inlining_failed {
+        panic!("instruction found, but the disassembly contains `call` \
+                instructions, which hint that inlining failed");
    }
 }