Rework SIMD zeroing

2024-11-25 22:17:25 +01:00 · 2024-11-25 22:17:25 +01:00 · 4d2911ba4b
commit 4d2911ba4b
parent 91c0dabca3
20 changed files with 1468 additions and 2906 deletions
--- a/library/stdarch/crates/core_arch/src/simd.rs
+++ b/library/stdarch/crates/core_arch/src/simd.rs
@ -10,6 +10,9 @@ macro_rules! simd_ty {

        #[allow(clippy::use_self)]
        impl $id {
+            /// A value of this type where all elements are zeroed out.
+            pub(crate) const ZERO: Self = unsafe { crate::mem::zeroed() };
+
            #[inline(always)]
            pub(crate) const fn new($($param_name: $elem_type),*) -> Self {
                $id([$($param_name),*])
--- a/library/stdarch/crates/core_arch/src/wasm32/simd128.rs
+++ b/library/stdarch/crates/core_arch/src/wasm32/simd128.rs
@ -2232,7 +2232,7 @@ pub fn v128_any_true(a: v128) -> bool {
 pub fn i8x16_abs(a: v128) -> v128 {
    unsafe {
        let a = a.as_i8x16();
-        let zero = simd::i8x16::splat(0);
+        let zero = simd::i8x16::ZERO;
        simd_select::<simd::m8x16, simd::i8x16>(simd_lt(a, zero), simd_sub(zero, a), a).v128()
    }
 }
@ -2524,7 +2524,7 @@ pub use i16x8_extadd_pairwise_u8x16 as u16x8_extadd_pairwise_u8x16;
 #[stable(feature = "wasm_simd", since = "1.54.0")]
 pub fn i16x8_abs(a: v128) -> v128 {
    let a = a.as_i16x8();
-    let zero = simd::i16x8::splat(0);
+    let zero = simd::i16x8::ZERO;
    unsafe {
        simd_select::<simd::m16x8, simd::i16x8>(simd_lt(a, zero), simd_sub(zero, a), a).v128()
    }
@ -3012,7 +3012,7 @@ pub use i32x4_extadd_pairwise_u16x8 as u32x4_extadd_pairwise_u16x8;
 #[stable(feature = "wasm_simd", since = "1.54.0")]
 pub fn i32x4_abs(a: v128) -> v128 {
    let a = a.as_i32x4();
-    let zero = simd::i32x4::splat(0);
+    let zero = simd::i32x4::ZERO;
    unsafe {
        simd_select::<simd::m32x4, simd::i32x4>(simd_lt(a, zero), simd_sub(zero, a), a).v128()
    }
@ -3394,7 +3394,7 @@ pub use i32x4_extmul_high_u16x8 as u32x4_extmul_high_u16x8;
 #[stable(feature = "wasm_simd", since = "1.54.0")]
 pub fn i64x2_abs(a: v128) -> v128 {
    let a = a.as_i64x2();
-    let zero = simd::i64x2::splat(0);
+    let zero = simd::i64x2::ZERO;
    unsafe {
        simd_select::<simd::m64x2, simd::i64x2>(simd_lt(a, zero), simd_sub(zero, a), a).v128()
    }
@ -4105,7 +4105,7 @@ pub fn i32x4_trunc_sat_f64x2_zero(a: v128) -> v128 {
    let ret: simd::i32x4 = unsafe {
        simd_shuffle!(
            llvm_i32x2_trunc_sat_f64x2_s(a.as_f64x2()),
-            simd::i32x2::splat(0),
+            simd::i32x2::ZERO,
            [0, 1, 2, 3],
        )
    };
@ -4129,7 +4129,7 @@ pub fn u32x4_trunc_sat_f64x2_zero(a: v128) -> v128 {
    let ret: simd::i32x4 = unsafe {
        simd_shuffle!(
            llvm_i32x2_trunc_sat_f64x2_u(a.as_f64x2()),
-            simd::i32x2::splat(0),
+            simd::i32x2::ZERO,
            [0, 1, 2, 3],
        )
    };
@ -4176,7 +4176,7 @@ pub fn f32x4_demote_f64x2_zero(a: v128) -> v128 {
    unsafe {
        simd_cast::<simd::f64x4, simd::f32x4>(simd_shuffle!(
            a.as_f64x2(),
-            simd::f64x2::splat(0.0),
+            simd::f64x2::ZERO,
            [0, 1, 2, 3]
        ))
        .v128()
--- a/library/stdarch/crates/core_arch/src/x86/avx.rs
+++ b/library/stdarch/crates/core_arch/src/x86/avx.rs
@ -515,7 +515,7 @@ pub unsafe fn _mm256_blend_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
 #[cfg_attr(test, assert_instr(vblendvpd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_blendv_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d {
-    let mask: i64x4 = simd_lt(transmute::<_, i64x4>(c), i64x4::splat(0));
+    let mask: i64x4 = simd_lt(transmute::<_, i64x4>(c), i64x4::ZERO);
    transmute(simd_select(mask, b.as_f64x4(), a.as_f64x4()))
 }

@ -528,7 +528,7 @@ pub unsafe fn _mm256_blendv_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d {
 #[cfg_attr(test, assert_instr(vblendvps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_blendv_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
-    let mask: i32x8 = simd_lt(transmute::<_, i32x8>(c), i32x8::splat(0));
+    let mask: i32x8 = simd_lt(transmute::<_, i32x8>(c), i32x8::ZERO);
    transmute(simd_select(mask, b.as_f32x8(), a.as_f32x8()))
 }

@ -983,11 +983,7 @@ pub unsafe fn _mm256_extractf128_pd<const IMM1: i32>(a: __m256d) -> __m128d {
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_extractf128_si256<const IMM1: i32>(a: __m256i) -> __m128i {
    static_assert_uimm_bits!(IMM1, 1);
-    let dst: i64x2 = simd_shuffle!(
-        a.as_i64x4(),
-        _mm256_undefined_si256().as_i64x4(),
-        [[0, 1], [2, 3]][IMM1 as usize],
-    );
+    let dst: i64x2 = simd_shuffle!(a.as_i64x4(), i64x4::ZERO, [[0, 1], [2, 3]][IMM1 as usize],);
    transmute(dst)
 }

@ -2139,7 +2135,7 @@ pub unsafe fn _mm_testnzc_ps(a: __m128, b: __m128) -> i32 {
 pub unsafe fn _mm256_movemask_pd(a: __m256d) -> i32 {
    // Propagate the highest bit to the rest, because simd_bitmask
    // requires all-1 or all-0.
-    let mask: i64x4 = simd_lt(transmute(a), i64x4::splat(0));
+    let mask: i64x4 = simd_lt(transmute(a), i64x4::ZERO);
    simd_bitmask::<i64x4, u8>(mask).into()
 }

@ -2155,7 +2151,7 @@ pub unsafe fn _mm256_movemask_pd(a: __m256d) -> i32 {
 pub unsafe fn _mm256_movemask_ps(a: __m256) -> i32 {
    // Propagate the highest bit to the rest, because simd_bitmask
    // requires all-1 or all-0.
-    let mask: i32x8 = simd_lt(transmute(a), i32x8::splat(0));
+    let mask: i32x8 = simd_lt(transmute(a), i32x8::ZERO);
    simd_bitmask::<i32x8, u8>(mask).into()
 }

@ -2167,7 +2163,7 @@ pub unsafe fn _mm256_movemask_ps(a: __m256) -> i32 {
 #[cfg_attr(test, assert_instr(vxorp))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_setzero_pd() -> __m256d {
-    _mm256_set1_pd(0.0)
+    const { mem::zeroed() }
 }

 /// Returns vector of type __m256 with all elements set to zero.
@ -2178,7 +2174,7 @@ pub unsafe fn _mm256_setzero_pd() -> __m256d {
 #[cfg_attr(test, assert_instr(vxorps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_setzero_ps() -> __m256 {
-    _mm256_set1_ps(0.0)
+    const { mem::zeroed() }
 }

 /// Returns vector of type __m256i with all elements set to zero.
@ -2189,7 +2185,7 @@ pub unsafe fn _mm256_setzero_ps() -> __m256 {
 #[cfg_attr(test, assert_instr(vxor))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_setzero_si256() -> __m256i {
-    _mm256_set1_epi8(0)
+    const { mem::zeroed() }
 }

 /// Sets packed double-precision (64-bit) floating-point elements in returned
@ -2722,7 +2718,7 @@ pub unsafe fn _mm256_castpd128_pd256(a: __m128d) -> __m256d {
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_castsi128_si256(a: __m128i) -> __m256i {
    let a = a.as_i64x2();
-    let undefined = _mm_undefined_si128().as_i64x2();
+    let undefined = i64x2::ZERO;
    let dst: i64x4 = simd_shuffle!(a, undefined, [0, 1, 2, 2]);
    transmute(dst)
 }
@ -2752,7 +2748,7 @@ pub unsafe fn _mm256_zextps128_ps256(a: __m128) -> __m256 {
 // instructions, thus it has zero latency.
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_zextsi128_si256(a: __m128i) -> __m256i {
-    let b = _mm_setzero_si128().as_i64x2();
+    let b = i64x2::ZERO;
    let dst: i64x4 = simd_shuffle!(a.as_i64x2(), b, [0, 1, 2, 3]);
    transmute(dst)
 }
@ -2782,7 +2778,7 @@ pub unsafe fn _mm256_zextpd128_pd256(a: __m128d) -> __m256d {
 // This intrinsic has no corresponding instruction.
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_undefined_ps() -> __m256 {
-    _mm256_set1_ps(0.0)
+    const { mem::zeroed() }
 }

 /// Returns vector of type `__m256d` with indeterminate elements.
@ -2795,7 +2791,7 @@ pub unsafe fn _mm256_undefined_ps() -> __m256 {
 // This intrinsic has no corresponding instruction.
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_undefined_pd() -> __m256d {
-    _mm256_set1_pd(0.0)
+    const { mem::zeroed() }
 }

 /// Returns vector of type __m256i with indeterminate elements.
@ -2808,7 +2804,7 @@ pub unsafe fn _mm256_undefined_pd() -> __m256d {
 // This intrinsic has no corresponding instruction.
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_undefined_si256() -> __m256i {
-    __m256i([0, 0, 0, 0])
+    const { mem::zeroed() }
 }

 /// Sets packed __m256 returned vector with the supplied values.
--- a/library/stdarch/crates/core_arch/src/x86/avx2.rs
+++ b/library/stdarch/crates/core_arch/src/x86/avx2.rs
@ -33,8 +33,7 @@ use stdarch_test::assert_instr;
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_abs_epi32(a: __m256i) -> __m256i {
    let a = a.as_i32x8();
-    let zero = i32x8::splat(0);
-    let r = simd_select::<m32x8, _>(simd_lt(a, zero), simd_neg(a), a);
+    let r = simd_select::<m32x8, _>(simd_lt(a, i32x8::ZERO), simd_neg(a), a);
    transmute(r)
 }

@ -47,8 +46,7 @@ pub unsafe fn _mm256_abs_epi32(a: __m256i) -> __m256i {
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_abs_epi16(a: __m256i) -> __m256i {
    let a = a.as_i16x16();
-    let zero = i16x16::splat(0);
-    let r = simd_select::<m16x16, _>(simd_lt(a, zero), simd_neg(a), a);
+    let r = simd_select::<m16x16, _>(simd_lt(a, i16x16::ZERO), simd_neg(a), a);
    transmute(r)
 }

@ -61,8 +59,7 @@ pub unsafe fn _mm256_abs_epi16(a: __m256i) -> __m256i {
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_abs_epi8(a: __m256i) -> __m256i {
    let a = a.as_i8x32();
-    let zero = i8x32::splat(0);
-    let r = simd_select::<m8x32, _>(simd_lt(a, zero), simd_neg(a), a);
+    let r = simd_select::<m8x32, _>(simd_lt(a, i8x32::ZERO), simd_neg(a), a);
    transmute(r)
 }

@ -168,12 +165,12 @@ pub unsafe fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m
    // If palignr is shifting the pair of vectors more than the size of two
    // lanes, emit zero.
    if IMM8 > 32 {
-        return _mm256_set1_epi8(0);
+        return _mm256_setzero_si256();
    }
    // If palignr is shifting the pair of input vectors more than one lane,
    // but less than two lanes, convert to shifting in zeroes.
    let (a, b) = if IMM8 > 16 {
-        (_mm256_set1_epi8(0), a)
+        (_mm256_setzero_si256(), a)
    } else {
        (a, b)
    };
@ -471,7 +468,7 @@ pub unsafe fn _mm256_blend_epi16<const IMM8: i32>(a: __m256i, b: __m256i) -> __m
 #[cfg_attr(test, assert_instr(vpblendvb))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_blendv_epi8(a: __m256i, b: __m256i, mask: __m256i) -> __m256i {
-    let mask: i8x32 = simd_lt(mask.as_i8x32(), i8x32::splat(0));
+    let mask: i8x32 = simd_lt(mask.as_i8x32(), i8x32::ZERO);
    transmute(simd_select(mask, b.as_i8x32(), a.as_i8x32()))
 }

@ -484,8 +481,7 @@ pub unsafe fn _mm256_blendv_epi8(a: __m256i, b: __m256i, mask: __m256i) -> __m25
 #[cfg_attr(test, assert_instr(vpbroadcastb))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_broadcastb_epi8(a: __m128i) -> __m128i {
-    let zero = _mm_setzero_si128();
-    let ret = simd_shuffle!(a.as_i8x16(), zero.as_i8x16(), [0_u32; 16]);
+    let ret = simd_shuffle!(a.as_i8x16(), i8x16::ZERO, [0_u32; 16]);
    transmute::<i8x16, _>(ret)
 }

@ -498,8 +494,7 @@ pub unsafe fn _mm_broadcastb_epi8(a: __m128i) -> __m128i {
 #[cfg_attr(test, assert_instr(vpbroadcastb))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_broadcastb_epi8(a: __m128i) -> __m256i {
-    let zero = _mm_setzero_si128();
-    let ret = simd_shuffle!(a.as_i8x16(), zero.as_i8x16(), [0_u32; 32]);
+    let ret = simd_shuffle!(a.as_i8x16(), i8x16::ZERO, [0_u32; 32]);
    transmute::<i8x32, _>(ret)
 }

@ -514,8 +509,7 @@ pub unsafe fn _mm256_broadcastb_epi8(a: __m128i) -> __m256i {
 #[cfg_attr(test, assert_instr(vbroadcastss))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_broadcastd_epi32(a: __m128i) -> __m128i {
-    let zero = _mm_setzero_si128();
-    let ret = simd_shuffle!(a.as_i32x4(), zero.as_i32x4(), [0_u32; 4]);
+    let ret = simd_shuffle!(a.as_i32x4(), i32x4::ZERO, [0_u32; 4]);
    transmute::<i32x4, _>(ret)
 }

@ -530,8 +524,7 @@ pub unsafe fn _mm_broadcastd_epi32(a: __m128i) -> __m128i {
 #[cfg_attr(test, assert_instr(vbroadcastss))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_broadcastd_epi32(a: __m128i) -> __m256i {
-    let zero = _mm_setzero_si128();
-    let ret = simd_shuffle!(a.as_i32x4(), zero.as_i32x4(), [0_u32; 8]);
+    let ret = simd_shuffle!(a.as_i32x4(), i32x4::ZERO, [0_u32; 8]);
    transmute::<i32x8, _>(ret)
 }

@ -595,8 +588,7 @@ pub unsafe fn _mm256_broadcastsd_pd(a: __m128d) -> __m256d {
 #[target_feature(enable = "avx2")]
 #[stable(feature = "simd_x86_updates", since = "1.82.0")]
 pub unsafe fn _mm_broadcastsi128_si256(a: __m128i) -> __m256i {
-    let zero = _mm_setzero_si128();
-    let ret = simd_shuffle!(a.as_i64x2(), zero.as_i64x2(), [0, 1, 0, 1]);
+    let ret = simd_shuffle!(a.as_i64x2(), i64x2::ZERO, [0, 1, 0, 1]);
    transmute::<i64x4, _>(ret)
 }

@ -610,8 +602,7 @@ pub unsafe fn _mm_broadcastsi128_si256(a: __m128i) -> __m256i {
 #[target_feature(enable = "avx2")]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_broadcastsi128_si256(a: __m128i) -> __m256i {
-    let zero = _mm_setzero_si128();
-    let ret = simd_shuffle!(a.as_i64x2(), zero.as_i64x2(), [0, 1, 0, 1]);
+    let ret = simd_shuffle!(a.as_i64x2(), i64x2::ZERO, [0, 1, 0, 1]);
    transmute::<i64x4, _>(ret)
 }

@ -648,8 +639,7 @@ pub unsafe fn _mm256_broadcastss_ps(a: __m128) -> __m256 {
 #[cfg_attr(test, assert_instr(vpbroadcastw))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_broadcastw_epi16(a: __m128i) -> __m128i {
-    let zero = _mm_setzero_si128();
-    let ret = simd_shuffle!(a.as_i16x8(), zero.as_i16x8(), [0_u32; 8]);
+    let ret = simd_shuffle!(a.as_i16x8(), i16x8::ZERO, [0_u32; 8]);
    transmute::<i16x8, _>(ret)
 }

@ -662,8 +652,7 @@ pub unsafe fn _mm_broadcastw_epi16(a: __m128i) -> __m128i {
 #[cfg_attr(test, assert_instr(vpbroadcastw))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_broadcastw_epi16(a: __m128i) -> __m256i {
-    let zero = _mm_setzero_si128();
-    let ret = simd_shuffle!(a.as_i16x8(), zero.as_i16x8(), [0_u32; 16]);
+    let ret = simd_shuffle!(a.as_i16x8(), i16x8::ZERO, [0_u32; 16]);
    transmute::<i16x16, _>(ret)
 }

@ -917,7 +906,7 @@ pub unsafe fn _mm256_cvtepu8_epi64(a: __m128i) -> __m256i {
 pub unsafe fn _mm256_extracti128_si256<const IMM1: i32>(a: __m256i) -> __m128i {
    static_assert_uimm_bits!(IMM1, 1);
    let a = a.as_i64x4();
-    let b = _mm256_undefined_si256().as_i64x4();
+    let b = i64x4::ZERO;
    let dst: i64x2 = simd_shuffle!(a, b, [[0, 1], [2, 3]][IMM1 as usize]);
    transmute(dst)
 }
@ -1005,7 +994,7 @@ pub unsafe fn _mm_i32gather_epi32<const SCALE: i32>(
    offsets: __m128i,
 ) -> __m128i {
    static_assert_imm8_scale!(SCALE);
-    let zero = _mm_setzero_si128().as_i32x4();
+    let zero = i32x4::ZERO;
    let neg_one = _mm_set1_epi32(-1).as_i32x4();
    let offsets = offsets.as_i32x4();
    let slice = slice as *const i8;
@ -1054,7 +1043,7 @@ pub unsafe fn _mm256_i32gather_epi32<const SCALE: i32>(
    offsets: __m256i,
 ) -> __m256i {
    static_assert_imm8_scale!(SCALE);
-    let zero = _mm256_setzero_si256().as_i32x8();
+    let zero = i32x8::ZERO;
    let neg_one = _mm256_set1_epi32(-1).as_i32x8();
    let offsets = offsets.as_i32x8();
    let slice = slice as *const i8;
@ -1187,7 +1176,7 @@ pub unsafe fn _mm_i32gather_epi64<const SCALE: i32>(
    offsets: __m128i,
 ) -> __m128i {
    static_assert_imm8_scale!(SCALE);
-    let zero = _mm_setzero_si128().as_i64x2();
+    let zero = i64x2::ZERO;
    let neg_one = _mm_set1_epi64x(-1).as_i64x2();
    let offsets = offsets.as_i32x4();
    let slice = slice as *const i8;
@ -1236,7 +1225,7 @@ pub unsafe fn _mm256_i32gather_epi64<const SCALE: i32>(
    offsets: __m128i,
 ) -> __m256i {
    static_assert_imm8_scale!(SCALE);
-    let zero = _mm256_setzero_si256().as_i64x4();
+    let zero = i64x4::ZERO;
    let neg_one = _mm256_set1_epi64x(-1).as_i64x4();
    let offsets = offsets.as_i32x4();
    let slice = slice as *const i8;
@ -1372,7 +1361,7 @@ pub unsafe fn _mm_i64gather_epi32<const SCALE: i32>(
    offsets: __m128i,
 ) -> __m128i {
    static_assert_imm8_scale!(SCALE);
-    let zero = _mm_setzero_si128().as_i32x4();
+    let zero = i32x4::ZERO;
    let neg_one = _mm_set1_epi64x(-1).as_i32x4();
    let offsets = offsets.as_i64x2();
    let slice = slice as *const i8;
@ -1421,7 +1410,7 @@ pub unsafe fn _mm256_i64gather_epi32<const SCALE: i32>(
    offsets: __m256i,
 ) -> __m128i {
    static_assert_imm8_scale!(SCALE);
-    let zero = _mm_setzero_si128().as_i32x4();
+    let zero = i32x4::ZERO;
    let neg_one = _mm_set1_epi64x(-1).as_i32x4();
    let offsets = offsets.as_i64x4();
    let slice = slice as *const i8;
@ -1554,7 +1543,7 @@ pub unsafe fn _mm_i64gather_epi64<const SCALE: i32>(
    offsets: __m128i,
 ) -> __m128i {
    static_assert_imm8_scale!(SCALE);
-    let zero = _mm_setzero_si128().as_i64x2();
+    let zero = i64x2::ZERO;
    let neg_one = _mm_set1_epi64x(-1).as_i64x2();
    let slice = slice as *const i8;
    let offsets = offsets.as_i64x2();
@ -1603,7 +1592,7 @@ pub unsafe fn _mm256_i64gather_epi64<const SCALE: i32>(
    offsets: __m256i,
 ) -> __m256i {
    static_assert_imm8_scale!(SCALE);
-    let zero = _mm256_setzero_si256().as_i64x4();
+    let zero = i64x4::ZERO;
    let neg_one = _mm256_set1_epi64x(-1).as_i64x4();
    let slice = slice as *const i8;
    let offsets = offsets.as_i64x4();
@ -2052,7 +2041,7 @@ pub unsafe fn _mm256_min_epu8(a: __m256i, b: __m256i) -> __m256i {
 #[cfg_attr(test, assert_instr(vpmovmskb))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_movemask_epi8(a: __m256i) -> i32 {
-    let z = i8x32::splat(0);
+    let z = i8x32::ZERO;
    let m: i8x32 = simd_lt(a.as_i8x32(), z);
    simd_bitmask::<_, u32>(m) as i32
 }
@ -2265,7 +2254,7 @@ pub unsafe fn _mm256_permutevar8x32_epi32(a: __m256i, b: __m256i) -> __m256i {
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm256_permute4x64_epi64<const IMM8: i32>(a: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
-    let zero = _mm256_setzero_si256().as_i64x4();
+    let zero = i64x4::ZERO;
    let r: i64x4 = simd_shuffle!(
        a.as_i64x4(),
        zero,
@ -2670,9 +2659,8 @@ pub unsafe fn _mm256_bslli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
        }
    }
    let a = a.as_i8x32();
-    let zero = _mm256_setzero_si256().as_i8x32();
    let r: i8x32 = simd_shuffle!(
-        zero,
+        i8x32::ZERO,
        a,
        [
            mask(IMM8, 0),
@ -2864,7 +2852,7 @@ pub unsafe fn _mm256_srli_si256<const IMM8: i32>(a: __m256i) -> __m256i {
 pub unsafe fn _mm256_bsrli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    let a = a.as_i8x32();
-    let zero = _mm256_setzero_si256().as_i8x32();
+    let zero = i8x32::ZERO;
    let r: i8x32 = match IMM8 % 16 {
        0 => simd_shuffle!(
            a,
--- a/library/stdarch/crates/core_arch/src/x86/avx512bf16.rs
+++ b/library/stdarch/crates/core_arch/src/x86/avx512bf16.rs
@ -66,8 +66,7 @@ pub unsafe fn _mm_mask_cvtne2ps_pbh(src: __m128bh, k: __mmask8, a: __m128, b: __
 #[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))]
 pub unsafe fn _mm_maskz_cvtne2ps_pbh(k: __mmask8, a: __m128, b: __m128) -> __m128bh {
    let cvt = _mm_cvtne2ps_pbh(a, b).as_u16x8();
-    let zero = _mm_setzero_si128().as_u16x8();
-    transmute(simd_select_bitmask(k, cvt, zero))
+    transmute(simd_select_bitmask(k, cvt, u16x8::ZERO))
 }

 /// Convert packed single-precision (32-bit) floating-point elements in two 256-bit vectors
@ -110,8 +109,7 @@ pub unsafe fn _mm256_mask_cvtne2ps_pbh(
 #[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))]
 pub unsafe fn _mm256_maskz_cvtne2ps_pbh(k: __mmask16, a: __m256, b: __m256) -> __m256bh {
    let cvt = _mm256_cvtne2ps_pbh(a, b).as_u16x16();
-    let zero = _mm256_setzero_si256().as_u16x16();
-    transmute(simd_select_bitmask(k, cvt, zero))
+    transmute(simd_select_bitmask(k, cvt, u16x16::ZERO))
 }

 /// Convert packed single-precision (32-bit) floating-point elements in two 512-bit vectors
@ -156,8 +154,7 @@ pub unsafe fn _mm512_mask_cvtne2ps_pbh(
 #[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))]
 pub unsafe fn _mm512_maskz_cvtne2ps_pbh(k: __mmask32, a: __m512, b: __m512) -> __m512bh {
    let cvt = _mm512_cvtne2ps_pbh(a, b).as_u16x32();
-    let zero = _mm512_setzero_si512().as_u16x32();
-    transmute(simd_select_bitmask(k, cvt, zero))
+    transmute(simd_select_bitmask(k, cvt, u16x32::ZERO))
 }

 /// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit)
@ -194,8 +191,7 @@ pub unsafe fn _mm256_mask_cvtneps_pbh(src: __m128bh, k: __mmask8, a: __m256) ->
 #[cfg_attr(test, assert_instr("vcvtneps2bf16"))]
 pub unsafe fn _mm256_maskz_cvtneps_pbh(k: __mmask8, a: __m256) -> __m128bh {
    let cvt = _mm256_cvtneps_pbh(a).as_u16x8();
-    let zero = _mm_setzero_si128().as_u16x8();
-    transmute(simd_select_bitmask(k, cvt, zero))
+    transmute(simd_select_bitmask(k, cvt, u16x8::ZERO))
 }

 /// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit)
@ -232,8 +228,7 @@ pub unsafe fn _mm512_mask_cvtneps_pbh(src: __m256bh, k: __mmask16, a: __m512) ->
 #[cfg_attr(test, assert_instr("vcvtneps2bf16"))]
 pub unsafe fn _mm512_maskz_cvtneps_pbh(k: __mmask16, a: __m512) -> __m256bh {
    let cvt = _mm512_cvtneps_pbh(a).as_u16x16();
-    let zero = _mm256_setzero_si256().as_u16x16();
-    transmute(simd_select_bitmask(k, cvt, zero))
+    transmute(simd_select_bitmask(k, cvt, u16x16::ZERO))
 }

 /// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b,
@ -314,8 +309,7 @@ pub unsafe fn _mm256_mask_dpbf16_ps(src: __m256, k: __mmask8, a: __m256bh, b: __
 #[cfg_attr(test, assert_instr("vdpbf16ps"))]
 pub unsafe fn _mm256_maskz_dpbf16_ps(k: __mmask8, src: __m256, a: __m256bh, b: __m256bh) -> __m256 {
    let rst = _mm256_dpbf16_ps(src, a, b).as_f32x8();
-    let zero = _mm256_setzero_ps().as_f32x8();
-    transmute(simd_select_bitmask(k, rst, zero))
+    transmute(simd_select_bitmask(k, rst, f32x8::ZERO))
 }

 /// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b,
@ -362,8 +356,7 @@ pub unsafe fn _mm512_maskz_dpbf16_ps(
    b: __m512bh,
 ) -> __m512 {
    let rst = _mm512_dpbf16_ps(src, a, b).as_f32x16();
-    let zero = _mm512_setzero_ps().as_f32x16();
-    transmute(simd_select_bitmask(k, rst, zero))
+    transmute(simd_select_bitmask(k, rst, f32x16::ZERO))
 }

 /// Converts packed BF16 (16-bit) floating-point elements in a to packed single-precision (32-bit)
@ -400,8 +393,7 @@ pub unsafe fn _mm512_mask_cvtpbh_ps(src: __m512, k: __mmask16, a: __m256bh) -> _
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm512_maskz_cvtpbh_ps(k: __mmask16, a: __m256bh) -> __m512 {
    let cvt = _mm512_cvtpbh_ps(a);
-    let zero = _mm512_setzero_ps();
-    transmute(simd_select_bitmask(k, cvt.as_f32x16(), zero.as_f32x16()))
+    transmute(simd_select_bitmask(k, cvt.as_f32x16(), f32x16::ZERO))
 }

 /// Converts packed BF16 (16-bit) floating-point elements in a to packed single-precision (32-bit)
@ -438,8 +430,7 @@ pub unsafe fn _mm256_mask_cvtpbh_ps(src: __m256, k: __mmask8, a: __m128bh) -> __
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm256_maskz_cvtpbh_ps(k: __mmask8, a: __m128bh) -> __m256 {
    let cvt = _mm256_cvtpbh_ps(a);
-    let zero = _mm256_setzero_ps();
-    transmute(simd_select_bitmask(k, cvt.as_f32x8(), zero.as_f32x8()))
+    transmute(simd_select_bitmask(k, cvt.as_f32x8(), f32x8::ZERO))
 }

 /// Converts packed BF16 (16-bit) floating-point elements in a to single-precision (32-bit) floating-point
@ -476,8 +467,7 @@ pub unsafe fn _mm_mask_cvtpbh_ps(src: __m128, k: __mmask8, a: __m128bh) -> __m12
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm_maskz_cvtpbh_ps(k: __mmask8, a: __m128bh) -> __m128 {
    let cvt = _mm_cvtpbh_ps(a);
-    let zero = _mm_setzero_ps();
-    transmute(simd_select_bitmask(k, cvt.as_f32x4(), zero.as_f32x4()))
+    transmute(simd_select_bitmask(k, cvt.as_f32x4(), f32x4::ZERO))
 }

 /// Converts a single BF16 (16-bit) floating-point element in a to a single-precision (32-bit) floating-point
--- a/library/stdarch/crates/core_arch/src/x86/avx512bitalg.rs
+++ b/library/stdarch/crates/core_arch/src/x86/avx512bitalg.rs
@ -7,6 +7,9 @@
 //!
 //! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf

+use crate::core_arch::simd::i16x16;
+use crate::core_arch::simd::i16x32;
+use crate::core_arch::simd::i16x8;
 use crate::core_arch::simd::i8x16;
 use crate::core_arch::simd::i8x32;
 use crate::core_arch::simd::i8x64;
@ -17,9 +20,6 @@ use crate::core_arch::x86::__mmask16;
 use crate::core_arch::x86::__mmask32;
 use crate::core_arch::x86::__mmask64;
 use crate::core_arch::x86::__mmask8;
-use crate::core_arch::x86::_mm256_setzero_si256;
-use crate::core_arch::x86::_mm512_setzero_si512;
-use crate::core_arch::x86::_mm_setzero_si128;
 use crate::core_arch::x86::m128iExt;
 use crate::core_arch::x86::m256iExt;
 use crate::core_arch::x86::m512iExt;
@ -61,8 +61,11 @@ pub unsafe fn _mm512_popcnt_epi16(a: __m512i) -> __m512i {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpopcntw))]
 pub unsafe fn _mm512_maskz_popcnt_epi16(k: __mmask32, a: __m512i) -> __m512i {
-    let zero = _mm512_setzero_si512().as_i16x32();
-    transmute(simd_select_bitmask(k, simd_ctpop(a.as_i16x32()), zero))
+    transmute(simd_select_bitmask(
+        k,
+        simd_ctpop(a.as_i16x32()),
+        i16x32::ZERO,
+    ))
 }

 /// For each packed 16-bit integer maps the value to the number of logical 1 bits.
@ -105,8 +108,11 @@ pub unsafe fn _mm256_popcnt_epi16(a: __m256i) -> __m256i {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpopcntw))]
 pub unsafe fn _mm256_maskz_popcnt_epi16(k: __mmask16, a: __m256i) -> __m256i {
-    let zero = _mm256_setzero_si256().as_i16x16();
-    transmute(simd_select_bitmask(k, simd_ctpop(a.as_i16x16()), zero))
+    transmute(simd_select_bitmask(
+        k,
+        simd_ctpop(a.as_i16x16()),
+        i16x16::ZERO,
+    ))
 }

 /// For each packed 16-bit integer maps the value to the number of logical 1 bits.
@ -149,8 +155,11 @@ pub unsafe fn _mm_popcnt_epi16(a: __m128i) -> __m128i {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpopcntw))]
 pub unsafe fn _mm_maskz_popcnt_epi16(k: __mmask8, a: __m128i) -> __m128i {
-    let zero = _mm_setzero_si128().as_i16x8();
-    transmute(simd_select_bitmask(k, simd_ctpop(a.as_i16x8()), zero))
+    transmute(simd_select_bitmask(
+        k,
+        simd_ctpop(a.as_i16x8()),
+        i16x8::ZERO,
+    ))
 }

 /// For each packed 16-bit integer maps the value to the number of logical 1 bits.
@ -193,8 +202,11 @@ pub unsafe fn _mm512_popcnt_epi8(a: __m512i) -> __m512i {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpopcntb))]
 pub unsafe fn _mm512_maskz_popcnt_epi8(k: __mmask64, a: __m512i) -> __m512i {
-    let zero = _mm512_setzero_si512().as_i8x64();
-    transmute(simd_select_bitmask(k, simd_ctpop(a.as_i8x64()), zero))
+    transmute(simd_select_bitmask(
+        k,
+        simd_ctpop(a.as_i8x64()),
+        i8x64::ZERO,
+    ))
 }

 /// For each packed 8-bit integer maps the value to the number of logical 1 bits.
@ -237,8 +249,11 @@ pub unsafe fn _mm256_popcnt_epi8(a: __m256i) -> __m256i {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpopcntb))]
 pub unsafe fn _mm256_maskz_popcnt_epi8(k: __mmask32, a: __m256i) -> __m256i {
-    let zero = _mm256_setzero_si256().as_i8x32();
-    transmute(simd_select_bitmask(k, simd_ctpop(a.as_i8x32()), zero))
+    transmute(simd_select_bitmask(
+        k,
+        simd_ctpop(a.as_i8x32()),
+        i8x32::ZERO,
+    ))
 }

 /// For each packed 8-bit integer maps the value to the number of logical 1 bits.
@ -281,8 +296,11 @@ pub unsafe fn _mm_popcnt_epi8(a: __m128i) -> __m128i {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpopcntb))]
 pub unsafe fn _mm_maskz_popcnt_epi8(k: __mmask16, a: __m128i) -> __m128i {
-    let zero = _mm_setzero_si128().as_i8x16();
-    transmute(simd_select_bitmask(k, simd_ctpop(a.as_i8x16()), zero))
+    transmute(simd_select_bitmask(
+        k,
+        simd_ctpop(a.as_i8x16()),
+        i8x16::ZERO,
+    ))
 }

 /// For each packed 8-bit integer maps the value to the number of logical 1 bits.
--- a/library/stdarch/crates/core_arch/src/x86/avx512bw.rs
+++ b/library/stdarch/crates/core_arch/src/x86/avx512bw.rs
--- a/library/stdarch/crates/core_arch/src/x86/avx512cd.rs
+++ b/library/stdarch/crates/core_arch/src/x86/avx512cd.rs
@ -102,8 +102,7 @@ pub unsafe fn _mm512_mask_conflict_epi32(src: __m512i, k: __mmask16, a: __m512i)
 #[cfg_attr(test, assert_instr(vpconflictd))]
 pub unsafe fn _mm512_maskz_conflict_epi32(k: __mmask16, a: __m512i) -> __m512i {
    let conflict = _mm512_conflict_epi32(a).as_i32x16();
-    let zero = _mm512_setzero_si512().as_i32x16();
-    transmute(simd_select_bitmask(k, conflict, zero))
+    transmute(simd_select_bitmask(k, conflict, i32x16::ZERO))
 }

 /// Test each 32-bit element of a for equality with all other elements in a closer to the least significant bit. Each element's comparison forms a zero extended bit vector in dst.
@ -138,8 +137,7 @@ pub unsafe fn _mm256_mask_conflict_epi32(src: __m256i, k: __mmask8, a: __m256i)
 #[cfg_attr(test, assert_instr(vpconflictd))]
 pub unsafe fn _mm256_maskz_conflict_epi32(k: __mmask8, a: __m256i) -> __m256i {
    let conflict = _mm256_conflict_epi32(a).as_i32x8();
-    let zero = _mm256_setzero_si256().as_i32x8();
-    transmute(simd_select_bitmask(k, conflict, zero))
+    transmute(simd_select_bitmask(k, conflict, i32x8::ZERO))
 }

 /// Test each 32-bit element of a for equality with all other elements in a closer to the least significant bit. Each element's comparison forms a zero extended bit vector in dst.
@ -174,8 +172,7 @@ pub unsafe fn _mm_mask_conflict_epi32(src: __m128i, k: __mmask8, a: __m128i) ->
 #[cfg_attr(test, assert_instr(vpconflictd))]
 pub unsafe fn _mm_maskz_conflict_epi32(k: __mmask8, a: __m128i) -> __m128i {
    let conflict = _mm_conflict_epi32(a).as_i32x4();
-    let zero = _mm_setzero_si128().as_i32x4();
-    transmute(simd_select_bitmask(k, conflict, zero))
+    transmute(simd_select_bitmask(k, conflict, i32x4::ZERO))
 }

 /// Test each 64-bit element of a for equality with all other elements in a closer to the least significant bit. Each element's comparison forms a zero extended bit vector in dst.
@ -210,8 +207,7 @@ pub unsafe fn _mm512_mask_conflict_epi64(src: __m512i, k: __mmask8, a: __m512i)
 #[cfg_attr(test, assert_instr(vpconflictq))]
 pub unsafe fn _mm512_maskz_conflict_epi64(k: __mmask8, a: __m512i) -> __m512i {
    let conflict = _mm512_conflict_epi64(a).as_i64x8();
-    let zero = _mm512_setzero_si512().as_i64x8();
-    transmute(simd_select_bitmask(k, conflict, zero))
+    transmute(simd_select_bitmask(k, conflict, i64x8::ZERO))
 }

 /// Test each 64-bit element of a for equality with all other elements in a closer to the least significant bit. Each element's comparison forms a zero extended bit vector in dst.
@ -246,8 +242,7 @@ pub unsafe fn _mm256_mask_conflict_epi64(src: __m256i, k: __mmask8, a: __m256i)
 #[cfg_attr(test, assert_instr(vpconflictq))]
 pub unsafe fn _mm256_maskz_conflict_epi64(k: __mmask8, a: __m256i) -> __m256i {
    let conflict = _mm256_conflict_epi64(a).as_i64x4();
-    let zero = _mm256_setzero_si256().as_i64x4();
-    transmute(simd_select_bitmask(k, conflict, zero))
+    transmute(simd_select_bitmask(k, conflict, i64x4::ZERO))
 }

 /// Test each 64-bit element of a for equality with all other elements in a closer to the least significant bit. Each element's comparison forms a zero-extended bit vector in dst.
@ -282,8 +277,7 @@ pub unsafe fn _mm_mask_conflict_epi64(src: __m128i, k: __mmask8, a: __m128i) ->
 #[cfg_attr(test, assert_instr(vpconflictq))]
 pub unsafe fn _mm_maskz_conflict_epi64(k: __mmask8, a: __m128i) -> __m128i {
    let conflict = _mm_conflict_epi64(a).as_i64x2();
-    let zero = _mm_setzero_si128().as_i64x2();
-    transmute(simd_select_bitmask(k, conflict, zero))
+    transmute(simd_select_bitmask(k, conflict, i64x2::ZERO))
 }

 /// Counts the number of leading zero bits in each packed 32-bit integer in a, and stores the results in dst.
@ -318,8 +312,7 @@ pub unsafe fn _mm512_mask_lzcnt_epi32(src: __m512i, k: __mmask16, a: __m512i) ->
 #[cfg_attr(test, assert_instr(vplzcntd))]
 pub unsafe fn _mm512_maskz_lzcnt_epi32(k: __mmask16, a: __m512i) -> __m512i {
    let zerocount = _mm512_lzcnt_epi32(a).as_i32x16();
-    let zero = _mm512_setzero_si512().as_i32x16();
-    transmute(simd_select_bitmask(k, zerocount, zero))
+    transmute(simd_select_bitmask(k, zerocount, i32x16::ZERO))
 }

 /// Counts the number of leading zero bits in each packed 32-bit integer in a, and stores the results in dst.
@ -354,8 +347,7 @@ pub unsafe fn _mm256_mask_lzcnt_epi32(src: __m256i, k: __mmask8, a: __m256i) ->
 #[cfg_attr(test, assert_instr(vplzcntd))]
 pub unsafe fn _mm256_maskz_lzcnt_epi32(k: __mmask8, a: __m256i) -> __m256i {
    let zerocount = _mm256_lzcnt_epi32(a).as_i32x8();
-    let zero = _mm256_setzero_si256().as_i32x8();
-    transmute(simd_select_bitmask(k, zerocount, zero))
+    transmute(simd_select_bitmask(k, zerocount, i32x8::ZERO))
 }

 /// Counts the number of leading zero bits in each packed 32-bit integer in a, and stores the results in dst.
@ -390,8 +382,7 @@ pub unsafe fn _mm_mask_lzcnt_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m
 #[cfg_attr(test, assert_instr(vplzcntd))]
 pub unsafe fn _mm_maskz_lzcnt_epi32(k: __mmask8, a: __m128i) -> __m128i {
    let zerocount = _mm_lzcnt_epi32(a).as_i32x4();
-    let zero = _mm_setzero_si128().as_i32x4();
-    transmute(simd_select_bitmask(k, zerocount, zero))
+    transmute(simd_select_bitmask(k, zerocount, i32x4::ZERO))
 }

 /// Counts the number of leading zero bits in each packed 64-bit integer in a, and stores the results in dst.
@ -426,8 +417,7 @@ pub unsafe fn _mm512_mask_lzcnt_epi64(src: __m512i, k: __mmask8, a: __m512i) ->
 #[cfg_attr(test, assert_instr(vplzcntq))]
 pub unsafe fn _mm512_maskz_lzcnt_epi64(k: __mmask8, a: __m512i) -> __m512i {
    let zerocount = _mm512_lzcnt_epi64(a).as_i64x8();
-    let zero = _mm512_setzero_si512().as_i64x8();
-    transmute(simd_select_bitmask(k, zerocount, zero))
+    transmute(simd_select_bitmask(k, zerocount, i64x8::ZERO))
 }

 /// Counts the number of leading zero bits in each packed 64-bit integer in a, and stores the results in dst.
@ -462,8 +452,7 @@ pub unsafe fn _mm256_mask_lzcnt_epi64(src: __m256i, k: __mmask8, a: __m256i) ->
 #[cfg_attr(test, assert_instr(vplzcntq))]
 pub unsafe fn _mm256_maskz_lzcnt_epi64(k: __mmask8, a: __m256i) -> __m256i {
    let zerocount = _mm256_lzcnt_epi64(a).as_i64x4();
-    let zero = _mm256_setzero_si256().as_i64x4();
-    transmute(simd_select_bitmask(k, zerocount, zero))
+    transmute(simd_select_bitmask(k, zerocount, i64x4::ZERO))
 }

 /// Counts the number of leading zero bits in each packed 64-bit integer in a, and stores the results in dst.
@ -498,8 +487,7 @@ pub unsafe fn _mm_mask_lzcnt_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m
 #[cfg_attr(test, assert_instr(vplzcntq))]
 pub unsafe fn _mm_maskz_lzcnt_epi64(k: __mmask8, a: __m128i) -> __m128i {
    let zerocount = _mm_lzcnt_epi64(a).as_i64x2();
-    let zero = _mm_setzero_si128().as_i64x2();
-    transmute(simd_select_bitmask(k, zerocount, zero))
+    transmute(simd_select_bitmask(k, zerocount, i64x2::ZERO))
 }

 #[allow(improper_ctypes)]
--- a/library/stdarch/crates/core_arch/src/x86/avx512dq.rs
+++ b/library/stdarch/crates/core_arch/src/x86/avx512dq.rs
@ -30,8 +30,7 @@ pub unsafe fn _mm_mask_and_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d)
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm_maskz_and_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
    let and = _mm_and_pd(a, b).as_f64x2();
-    let zero = _mm_setzero_pd().as_f64x2();
-    transmute(simd_select_bitmask(k, and, zero))
+    transmute(simd_select_bitmask(k, and, f64x2::ZERO))
 }

 /// Compute the bitwise AND of packed double-precision (64-bit) floating point numbers in a and b
@ -58,8 +57,7 @@ pub unsafe fn _mm256_mask_and_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m25
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm256_maskz_and_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d {
    let and = _mm256_and_pd(a, b).as_f64x4();
-    let zero = _mm256_setzero_pd().as_f64x4();
-    transmute(simd_select_bitmask(k, and, zero))
+    transmute(simd_select_bitmask(k, and, f64x4::ZERO))
 }

 /// Compute the bitwise AND of packed double-precision (64-bit) floating point numbers in a and b
@ -98,8 +96,7 @@ pub unsafe fn _mm512_mask_and_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m51
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm512_maskz_and_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
    let and = _mm512_and_pd(a, b).as_f64x8();
-    let zero = _mm512_setzero_pd().as_f64x8();
-    transmute(simd_select_bitmask(k, and, zero))
+    transmute(simd_select_bitmask(k, and, f64x8::ZERO))
 }

 /// Compute the bitwise AND of packed single-precision (32-bit) floating point numbers in a and b
@ -126,8 +123,7 @@ pub unsafe fn _mm_mask_and_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) ->
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm_maskz_and_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 {
    let and = _mm_and_ps(a, b).as_f32x4();
-    let zero = _mm_setzero_ps().as_f32x4();
-    transmute(simd_select_bitmask(k, and, zero))
+    transmute(simd_select_bitmask(k, and, f32x4::ZERO))
 }

 /// Compute the bitwise AND of packed single-precision (32-bit) floating point numbers in a and b
@ -154,8 +150,7 @@ pub unsafe fn _mm256_mask_and_ps(src: __m256, k: __mmask8, a: __m256, b: __m256)
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm256_maskz_and_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 {
    let and = _mm256_and_ps(a, b).as_f32x8();
-    let zero = _mm256_setzero_ps().as_f32x8();
-    transmute(simd_select_bitmask(k, and, zero))
+    transmute(simd_select_bitmask(k, and, f32x8::ZERO))
 }

 /// Compute the bitwise AND of packed single-precision (32-bit) floating point numbers in a and b
@ -197,8 +192,7 @@ pub unsafe fn _mm512_mask_and_ps(src: __m512, k: __mmask16, a: __m512, b: __m512
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm512_maskz_and_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 {
    let and = _mm512_and_ps(a, b).as_f32x16();
-    let zero = _mm512_setzero_ps().as_f32x16();
-    transmute(simd_select_bitmask(k, and, zero))
+    transmute(simd_select_bitmask(k, and, f32x16::ZERO))
 }

 // Andnot
@ -228,8 +222,7 @@ pub unsafe fn _mm_mask_andnot_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m12
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm_maskz_andnot_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
    let andnot = _mm_andnot_pd(a, b).as_f64x2();
-    let zero = _mm_setzero_pd().as_f64x2();
-    transmute(simd_select_bitmask(k, andnot, zero))
+    transmute(simd_select_bitmask(k, andnot, f64x2::ZERO))
 }

 /// Compute the bitwise NOT of packed double-precision (64-bit) floating point numbers in a and then
@ -257,8 +250,7 @@ pub unsafe fn _mm256_mask_andnot_pd(src: __m256d, k: __mmask8, a: __m256d, b: __
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm256_maskz_andnot_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d {
    let andnot = _mm256_andnot_pd(a, b).as_f64x4();
-    let zero = _mm256_setzero_pd().as_f64x4();
-    transmute(simd_select_bitmask(k, andnot, zero))
+    transmute(simd_select_bitmask(k, andnot, f64x4::ZERO))
 }

 /// Compute the bitwise NOT of packed double-precision (64-bit) floating point numbers in a and then
@ -298,8 +290,7 @@ pub unsafe fn _mm512_mask_andnot_pd(src: __m512d, k: __mmask8, a: __m512d, b: __
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm512_maskz_andnot_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
    let andnot = _mm512_andnot_pd(a, b).as_f64x8();
-    let zero = _mm512_setzero_pd().as_f64x8();
-    transmute(simd_select_bitmask(k, andnot, zero))
+    transmute(simd_select_bitmask(k, andnot, f64x8::ZERO))
 }

 /// Compute the bitwise NOT of packed single-precision (32-bit) floating point numbers in a and then
@ -327,8 +318,7 @@ pub unsafe fn _mm_mask_andnot_ps(src: __m128, k: __mmask8, a: __m128, b: __m128)
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm_maskz_andnot_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 {
    let andnot = _mm_andnot_ps(a, b).as_f32x4();
-    let zero = _mm_setzero_ps().as_f32x4();
-    transmute(simd_select_bitmask(k, andnot, zero))
+    transmute(simd_select_bitmask(k, andnot, f32x4::ZERO))
 }

 /// Compute the bitwise NOT of packed single-precision (32-bit) floating point numbers in a and then
@ -356,8 +346,7 @@ pub unsafe fn _mm256_mask_andnot_ps(src: __m256, k: __mmask8, a: __m256, b: __m2
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm256_maskz_andnot_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 {
    let andnot = _mm256_andnot_ps(a, b).as_f32x8();
-    let zero = _mm256_setzero_ps().as_f32x8();
-    transmute(simd_select_bitmask(k, andnot, zero))
+    transmute(simd_select_bitmask(k, andnot, f32x8::ZERO))
 }

 /// Compute the bitwise NOT of packed single-precision (32-bit) floating point numbers in a and then
@ -397,8 +386,7 @@ pub unsafe fn _mm512_mask_andnot_ps(src: __m512, k: __mmask16, a: __m512, b: __m
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm512_maskz_andnot_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 {
    let andnot = _mm512_andnot_ps(a, b).as_f32x16();
-    let zero = _mm512_setzero_ps().as_f32x16();
-    transmute(simd_select_bitmask(k, andnot, zero))
+    transmute(simd_select_bitmask(k, andnot, f32x16::ZERO))
 }

 // Or
@ -427,8 +415,7 @@ pub unsafe fn _mm_mask_or_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d)
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm_maskz_or_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
    let or = _mm_or_pd(a, b).as_f64x2();
-    let zero = _mm_setzero_pd().as_f64x2();
-    transmute(simd_select_bitmask(k, or, zero))
+    transmute(simd_select_bitmask(k, or, f64x2::ZERO))
 }

 /// Compute the bitwise OR of packed double-precision (64-bit) floating point numbers in a and b
@ -455,8 +442,7 @@ pub unsafe fn _mm256_mask_or_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm256_maskz_or_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d {
    let or = _mm256_or_pd(a, b).as_f64x4();
-    let zero = _mm256_setzero_pd().as_f64x4();
-    transmute(simd_select_bitmask(k, or, zero))
+    transmute(simd_select_bitmask(k, or, f64x4::ZERO))
 }

 /// Compute the bitwise OR of packed double-precision (64-bit) floating point numbers in a and b
@ -495,8 +481,7 @@ pub unsafe fn _mm512_mask_or_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm512_maskz_or_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
    let or = _mm512_or_pd(a, b).as_f64x8();
-    let zero = _mm512_setzero_pd().as_f64x8();
-    transmute(simd_select_bitmask(k, or, zero))
+    transmute(simd_select_bitmask(k, or, f64x8::ZERO))
 }

 /// Compute the bitwise OR of packed single-precision (32-bit) floating point numbers in a and b
@ -523,8 +508,7 @@ pub unsafe fn _mm_mask_or_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) ->
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm_maskz_or_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 {
    let or = _mm_or_ps(a, b).as_f32x4();
-    let zero = _mm_setzero_ps().as_f32x4();
-    transmute(simd_select_bitmask(k, or, zero))
+    transmute(simd_select_bitmask(k, or, f32x4::ZERO))
 }

 /// Compute the bitwise OR of packed single-precision (32-bit) floating point numbers in a and b
@ -551,8 +535,7 @@ pub unsafe fn _mm256_mask_or_ps(src: __m256, k: __mmask8, a: __m256, b: __m256)
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm256_maskz_or_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 {
    let or = _mm256_or_ps(a, b).as_f32x8();
-    let zero = _mm256_setzero_ps().as_f32x8();
-    transmute(simd_select_bitmask(k, or, zero))
+    transmute(simd_select_bitmask(k, or, f32x8::ZERO))
 }

 /// Compute the bitwise OR of packed single-precision (32-bit) floating point numbers in a and b
@ -594,8 +577,7 @@ pub unsafe fn _mm512_mask_or_ps(src: __m512, k: __mmask16, a: __m512, b: __m512)
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm512_maskz_or_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 {
    let or = _mm512_or_ps(a, b).as_f32x16();
-    let zero = _mm512_setzero_ps().as_f32x16();
-    transmute(simd_select_bitmask(k, or, zero))
+    transmute(simd_select_bitmask(k, or, f32x16::ZERO))
 }

 // Xor
@ -624,8 +606,7 @@ pub unsafe fn _mm_mask_xor_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d)
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm_maskz_xor_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
    let xor = _mm_xor_pd(a, b).as_f64x2();
-    let zero = _mm_setzero_pd().as_f64x2();
-    transmute(simd_select_bitmask(k, xor, zero))
+    transmute(simd_select_bitmask(k, xor, f64x2::ZERO))
 }

 /// Compute the bitwise XOR of packed double-precision (64-bit) floating point numbers in a and b
@ -652,8 +633,7 @@ pub unsafe fn _mm256_mask_xor_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m25
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm256_maskz_xor_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d {
    let xor = _mm256_xor_pd(a, b).as_f64x4();
-    let zero = _mm256_setzero_pd().as_f64x4();
-    transmute(simd_select_bitmask(k, xor, zero))
+    transmute(simd_select_bitmask(k, xor, f64x4::ZERO))
 }

 /// Compute the bitwise XOR of packed double-precision (64-bit) floating point numbers in a and b
@ -692,8 +672,7 @@ pub unsafe fn _mm512_mask_xor_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m51
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm512_maskz_xor_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
    let xor = _mm512_xor_pd(a, b).as_f64x8();
-    let zero = _mm512_setzero_pd().as_f64x8();
-    transmute(simd_select_bitmask(k, xor, zero))
+    transmute(simd_select_bitmask(k, xor, f64x8::ZERO))
 }

 /// Compute the bitwise XOR of packed single-precision (32-bit) floating point numbers in a and b
@ -720,8 +699,7 @@ pub unsafe fn _mm_mask_xor_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) ->
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm_maskz_xor_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 {
    let xor = _mm_xor_ps(a, b).as_f32x4();
-    let zero = _mm_setzero_ps().as_f32x4();
-    transmute(simd_select_bitmask(k, xor, zero))
+    transmute(simd_select_bitmask(k, xor, f32x4::ZERO))
 }

 /// Compute the bitwise XOR of packed single-precision (32-bit) floating point numbers in a and b
@ -748,8 +726,7 @@ pub unsafe fn _mm256_mask_xor_ps(src: __m256, k: __mmask8, a: __m256, b: __m256)
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm256_maskz_xor_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 {
    let xor = _mm256_xor_ps(a, b).as_f32x8();
-    let zero = _mm256_setzero_ps().as_f32x8();
-    transmute(simd_select_bitmask(k, xor, zero))
+    transmute(simd_select_bitmask(k, xor, f32x8::ZERO))
 }

 /// Compute the bitwise XOR of packed single-precision (32-bit) floating point numbers in a and b
@ -791,8 +768,7 @@ pub unsafe fn _mm512_mask_xor_ps(src: __m512, k: __mmask16, a: __m512, b: __m512
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm512_maskz_xor_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 {
    let xor = _mm512_xor_ps(a, b).as_f32x16();
-    let zero = _mm512_setzero_ps().as_f32x16();
-    transmute(simd_select_bitmask(k, xor, zero))
+    transmute(simd_select_bitmask(k, xor, f32x16::ZERO))
 }

 // Broadcast
@ -832,8 +808,7 @@ pub unsafe fn _mm256_mask_broadcast_f32x2(src: __m256, k: __mmask8, a: __m128) -
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm256_maskz_broadcast_f32x2(k: __mmask8, a: __m128) -> __m256 {
    let b = _mm256_broadcast_f32x2(a).as_f32x8();
-    let zero = _mm256_setzero_ps().as_f32x8();
-    transmute(simd_select_bitmask(k, b, zero))
+    transmute(simd_select_bitmask(k, b, f32x8::ZERO))
 }

 /// Broadcasts the lower 2 packed single-precision (32-bit) floating-point elements from a to all
@ -871,8 +846,7 @@ pub unsafe fn _mm512_mask_broadcast_f32x2(src: __m512, k: __mmask16, a: __m128)
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm512_maskz_broadcast_f32x2(k: __mmask16, a: __m128) -> __m512 {
    let b = _mm512_broadcast_f32x2(a).as_f32x16();
-    let zero = _mm512_setzero_ps().as_f32x16();
-    transmute(simd_select_bitmask(k, b, zero))
+    transmute(simd_select_bitmask(k, b, f32x16::ZERO))
 }

 /// Broadcasts the 8 packed single-precision (32-bit) floating-point elements from a to all
@ -908,8 +882,7 @@ pub unsafe fn _mm512_mask_broadcast_f32x8(src: __m512, k: __mmask16, a: __m256)
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm512_maskz_broadcast_f32x8(k: __mmask16, a: __m256) -> __m512 {
    let b = _mm512_broadcast_f32x8(a).as_f32x16();
-    let zero = _mm512_setzero_ps().as_f32x16();
-    transmute(simd_select_bitmask(k, b, zero))
+    transmute(simd_select_bitmask(k, b, f32x16::ZERO))
 }

 /// Broadcasts the 2 packed double-precision (64-bit) floating-point elements from a to all
@ -945,8 +918,7 @@ pub unsafe fn _mm256_mask_broadcast_f64x2(src: __m256d, k: __mmask8, a: __m128d)
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm256_maskz_broadcast_f64x2(k: __mmask8, a: __m128d) -> __m256d {
    let b = _mm256_broadcast_f64x2(a).as_f64x4();
-    let zero = _mm256_setzero_pd().as_f64x4();
-    transmute(simd_select_bitmask(k, b, zero))
+    transmute(simd_select_bitmask(k, b, f64x4::ZERO))
 }

 /// Broadcasts the 2 packed double-precision (64-bit) floating-point elements from a to all
@ -982,8 +954,7 @@ pub unsafe fn _mm512_mask_broadcast_f64x2(src: __m512d, k: __mmask8, a: __m128d)
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm512_maskz_broadcast_f64x2(k: __mmask8, a: __m128d) -> __m512d {
    let b = _mm512_broadcast_f64x2(a).as_f64x8();
-    let zero = _mm512_setzero_pd().as_f64x8();
-    transmute(simd_select_bitmask(k, b, zero))
+    transmute(simd_select_bitmask(k, b, f64x8::ZERO))
 }

 /// Broadcasts the lower 2 packed 32-bit integers from a to all elements of dst.
@ -1021,8 +992,7 @@ pub unsafe fn _mm_mask_broadcast_i32x2(src: __m128i, k: __mmask8, a: __m128i) ->
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm_maskz_broadcast_i32x2(k: __mmask8, a: __m128i) -> __m128i {
    let b = _mm_broadcast_i32x2(a).as_i32x4();
-    let zero = _mm_setzero_si128().as_i32x4();
-    transmute(simd_select_bitmask(k, b, zero))
+    transmute(simd_select_bitmask(k, b, i32x4::ZERO))
 }

 /// Broadcasts the lower 2 packed 32-bit integers from a to all elements of dst.
@ -1060,8 +1030,7 @@ pub unsafe fn _mm256_mask_broadcast_i32x2(src: __m256i, k: __mmask8, a: __m128i)
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm256_maskz_broadcast_i32x2(k: __mmask8, a: __m128i) -> __m256i {
    let b = _mm256_broadcast_i32x2(a).as_i32x8();
-    let zero = _mm256_setzero_si256().as_i32x8();
-    transmute(simd_select_bitmask(k, b, zero))
+    transmute(simd_select_bitmask(k, b, i32x8::ZERO))
 }

 /// Broadcasts the lower 2 packed 32-bit integers from a to all elements of dst.
@ -1099,8 +1068,7 @@ pub unsafe fn _mm512_mask_broadcast_i32x2(src: __m512i, k: __mmask16, a: __m128i
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm512_maskz_broadcast_i32x2(k: __mmask16, a: __m128i) -> __m512i {
    let b = _mm512_broadcast_i32x2(a).as_i32x16();
-    let zero = _mm512_setzero_si512().as_i32x16();
-    transmute(simd_select_bitmask(k, b, zero))
+    transmute(simd_select_bitmask(k, b, i32x16::ZERO))
 }

 /// Broadcasts the 8 packed 32-bit integers from a to all elements of dst.
@ -1136,8 +1104,7 @@ pub unsafe fn _mm512_mask_broadcast_i32x8(src: __m512i, k: __mmask16, a: __m256i
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm512_maskz_broadcast_i32x8(k: __mmask16, a: __m256i) -> __m512i {
    let b = _mm512_broadcast_i32x8(a).as_i32x16();
-    let zero = _mm512_setzero_si512().as_i32x16();
-    transmute(simd_select_bitmask(k, b, zero))
+    transmute(simd_select_bitmask(k, b, i32x16::ZERO))
 }

 /// Broadcasts the 2 packed 64-bit integers from a to all elements of dst.
@ -1173,8 +1140,7 @@ pub unsafe fn _mm256_mask_broadcast_i64x2(src: __m256i, k: __mmask8, a: __m128i)
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm256_maskz_broadcast_i64x2(k: __mmask8, a: __m128i) -> __m256i {
    let b = _mm256_broadcast_i64x2(a).as_i64x4();
-    let zero = _mm256_setzero_si256().as_i64x4();
-    transmute(simd_select_bitmask(k, b, zero))
+    transmute(simd_select_bitmask(k, b, i64x4::ZERO))
 }

 /// Broadcasts the 2 packed 64-bit integers from a to all elements of dst.
@ -1210,8 +1176,7 @@ pub unsafe fn _mm512_mask_broadcast_i64x2(src: __m512i, k: __mmask8, a: __m128i)
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm512_maskz_broadcast_i64x2(k: __mmask8, a: __m128i) -> __m512i {
    let b = _mm512_broadcast_i64x2(a).as_i64x8();
-    let zero = _mm512_setzero_si512().as_i64x8();
-    transmute(simd_select_bitmask(k, b, zero))
+    transmute(simd_select_bitmask(k, b, i64x8::ZERO))
 }

 // Extract
@ -1265,8 +1230,7 @@ pub unsafe fn _mm512_mask_extractf32x8_ps<const IMM8: i32>(
 pub unsafe fn _mm512_maskz_extractf32x8_ps<const IMM8: i32>(k: __mmask8, a: __m512) -> __m256 {
    static_assert_uimm_bits!(IMM8, 1);
    let b = _mm512_extractf32x8_ps::<IMM8>(a);
-    let zero = _mm256_setzero_ps().as_f32x8();
-    transmute(simd_select_bitmask(k, b.as_f32x8(), zero))
+    transmute(simd_select_bitmask(k, b.as_f32x8(), f32x8::ZERO))
 }

 /// Extracts 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from a,
@ -1318,8 +1282,7 @@ pub unsafe fn _mm256_mask_extractf64x2_pd<const IMM8: i32>(
 pub unsafe fn _mm256_maskz_extractf64x2_pd<const IMM8: i32>(k: __mmask8, a: __m256d) -> __m128d {
    static_assert_uimm_bits!(IMM8, 1);
    let b = _mm256_extractf64x2_pd::<IMM8>(a);
-    let zero = _mm_setzero_pd().as_f64x2();
-    transmute(simd_select_bitmask(k, b.as_f64x2(), zero))
+    transmute(simd_select_bitmask(k, b.as_f64x2(), f64x2::ZERO))
 }

 /// Extracts 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from a,
@ -1373,8 +1336,7 @@ pub unsafe fn _mm512_mask_extractf64x2_pd<const IMM8: i32>(
 pub unsafe fn _mm512_maskz_extractf64x2_pd<const IMM8: i32>(k: __mmask8, a: __m512d) -> __m128d {
    static_assert_uimm_bits!(IMM8, 2);
    let b = _mm512_extractf64x2_pd::<IMM8>(a).as_f64x2();
-    let zero = _mm_setzero_pd().as_f64x2();
-    transmute(simd_select_bitmask(k, b, zero))
+    transmute(simd_select_bitmask(k, b, f64x2::ZERO))
 }

 /// Extracts 256 bits (composed of 8 packed 32-bit integers) from a, selected with IMM8, and stores
@ -1426,8 +1388,7 @@ pub unsafe fn _mm512_mask_extracti32x8_epi32<const IMM8: i32>(
 pub unsafe fn _mm512_maskz_extracti32x8_epi32<const IMM8: i32>(k: __mmask8, a: __m512i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 1);
    let b = _mm512_extracti32x8_epi32::<IMM8>(a).as_i32x8();
-    let zero = _mm256_setzero_si256().as_i32x8();
-    transmute(simd_select_bitmask(k, b, zero))
+    transmute(simd_select_bitmask(k, b, i32x8::ZERO))
 }

 /// Extracts 128 bits (composed of 2 packed 64-bit integers) from a, selected with IMM8, and stores
@ -1478,8 +1439,7 @@ pub unsafe fn _mm256_mask_extracti64x2_epi64<const IMM8: i32>(
 pub unsafe fn _mm256_maskz_extracti64x2_epi64<const IMM8: i32>(k: __mmask8, a: __m256i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 1);
    let b = _mm256_extracti64x2_epi64::<IMM8>(a).as_i64x2();
-    let zero = _mm_setzero_si128().as_i64x2();
-    transmute(simd_select_bitmask(k, b, zero))
+    transmute(simd_select_bitmask(k, b, i64x2::ZERO))
 }

 /// Extracts 128 bits (composed of 2 packed 64-bit integers) from a, selected with IMM8, and stores
@ -1532,8 +1492,7 @@ pub unsafe fn _mm512_mask_extracti64x2_epi64<const IMM8: i32>(
 pub unsafe fn _mm512_maskz_extracti64x2_epi64<const IMM8: i32>(k: __mmask8, a: __m512i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 2);
    let b = _mm512_extracti64x2_epi64::<IMM8>(a).as_i64x2();
-    let zero = _mm_setzero_si128().as_i64x2();
-    transmute(simd_select_bitmask(k, b, zero))
+    transmute(simd_select_bitmask(k, b, i64x2::ZERO))
 }

 // Insert
@ -1601,8 +1560,7 @@ pub unsafe fn _mm512_maskz_insertf32x8<const IMM8: i32>(
 ) -> __m512 {
    static_assert_uimm_bits!(IMM8, 1);
    let c = _mm512_insertf32x8::<IMM8>(a, b).as_f32x16();
-    let zero = _mm512_setzero_ps().as_f32x16();
-    transmute(simd_select_bitmask(k, c, zero))
+    transmute(simd_select_bitmask(k, c, f32x16::ZERO))
 }

 /// Copy a to dst, then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point
@ -1660,8 +1618,7 @@ pub unsafe fn _mm256_maskz_insertf64x2<const IMM8: i32>(
 ) -> __m256d {
    static_assert_uimm_bits!(IMM8, 1);
    let c = _mm256_insertf64x2::<IMM8>(a, b).as_f64x4();
-    let zero = _mm256_setzero_pd().as_f64x4();
-    transmute(simd_select_bitmask(k, c, zero))
+    transmute(simd_select_bitmask(k, c, f64x4::ZERO))
 }

 /// Copy a to dst, then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point
@ -1721,8 +1678,7 @@ pub unsafe fn _mm512_maskz_insertf64x2<const IMM8: i32>(
 ) -> __m512d {
    static_assert_uimm_bits!(IMM8, 2);
    let c = _mm512_insertf64x2::<IMM8>(a, b).as_f64x8();
-    let zero = _mm512_setzero_pd().as_f64x8();
-    transmute(simd_select_bitmask(k, c, zero))
+    transmute(simd_select_bitmask(k, c, f64x8::ZERO))
 }

 /// Copy a to dst, then insert 256 bits (composed of 8 packed 32-bit integers) from b into dst at the
@ -1790,8 +1746,7 @@ pub unsafe fn _mm512_maskz_inserti32x8<const IMM8: i32>(
 ) -> __m512i {
    static_assert_uimm_bits!(IMM8, 1);
    let c = _mm512_inserti32x8::<IMM8>(a, b).as_i32x16();
-    let zero = _mm512_setzero_si512().as_i32x16();
-    transmute(simd_select_bitmask(k, c, zero))
+    transmute(simd_select_bitmask(k, c, i32x16::ZERO))
 }

 /// Copy a to dst, then insert 128 bits (composed of 2 packed 64-bit integers) from b into dst at the
@ -1850,8 +1805,7 @@ pub unsafe fn _mm256_maskz_inserti64x2<const IMM8: i32>(
 ) -> __m256i {
    static_assert_uimm_bits!(IMM8, 1);
    let c = _mm256_inserti64x2::<IMM8>(a, b).as_i64x4();
-    let zero = _mm256_setzero_si256().as_i64x4();
-    transmute(simd_select_bitmask(k, c, zero))
+    transmute(simd_select_bitmask(k, c, i64x4::ZERO))
 }

 /// Copy a to dst, then insert 128 bits (composed of 2 packed 64-bit integers) from b into dst at the
@ -1912,8 +1866,7 @@ pub unsafe fn _mm512_maskz_inserti64x2<const IMM8: i32>(
 ) -> __m512i {
    static_assert_uimm_bits!(IMM8, 2);
    let c = _mm512_inserti64x2::<IMM8>(a, b).as_i64x8();
-    let zero = _mm512_setzero_si512().as_i64x8();
-    transmute(simd_select_bitmask(k, c, zero))
+    transmute(simd_select_bitmask(k, c, i64x8::ZERO))
 }

 // Convert
@ -1986,8 +1939,7 @@ pub unsafe fn _mm512_maskz_cvt_roundepi64_pd<const ROUNDING: i32>(
 ) -> __m512d {
    static_assert_rounding!(ROUNDING);
    let b = _mm512_cvt_roundepi64_pd::<ROUNDING>(a).as_f64x8();
-    let zero = _mm512_setzero_pd().as_f64x8();
-    transmute(simd_select_bitmask(k, b, zero))
+    transmute(simd_select_bitmask(k, b, f64x8::ZERO))
 }

 /// Convert packed signed 64-bit integers in a to packed double-precision (64-bit) floating-point elements,
@ -2026,8 +1978,7 @@ pub unsafe fn _mm_mask_cvtepi64_pd(src: __m128d, k: __mmask8, a: __m128i) -> __m
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm_maskz_cvtepi64_pd(k: __mmask8, a: __m128i) -> __m128d {
    let b = _mm_cvtepi64_pd(a).as_f64x2();
-    let zero = _mm_setzero_pd().as_f64x2();
-    transmute(simd_select_bitmask(k, b, zero))
+    transmute(simd_select_bitmask(k, b, f64x2::ZERO))
 }

 /// Convert packed signed 64-bit integers in a to packed double-precision (64-bit) floating-point elements,
@ -2066,8 +2017,7 @@ pub unsafe fn _mm256_mask_cvtepi64_pd(src: __m256d, k: __mmask8, a: __m256i) ->
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm256_maskz_cvtepi64_pd(k: __mmask8, a: __m256i) -> __m256d {
    let b = _mm256_cvtepi64_pd(a).as_f64x4();
-    let zero = _mm256_setzero_pd().as_f64x4();
-    transmute(simd_select_bitmask(k, b, zero))
+    transmute(simd_select_bitmask(k, b, f64x4::ZERO))
 }

 /// Convert packed signed 64-bit integers in a to packed double-precision (64-bit) floating-point elements,
@ -2106,8 +2056,7 @@ pub unsafe fn _mm512_mask_cvtepi64_pd(src: __m512d, k: __mmask8, a: __m512i) ->
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm512_maskz_cvtepi64_pd(k: __mmask8, a: __m512i) -> __m512d {
    let b = _mm512_cvtepi64_pd(a).as_f64x8();
-    let zero = _mm512_setzero_pd().as_f64x8();
-    transmute(simd_select_bitmask(k, b, zero))
+    transmute(simd_select_bitmask(k, b, f64x8::ZERO))
 }

 /// Convert packed signed 64-bit integers in a to packed single-precision (32-bit) floating-point elements,
@ -2178,8 +2127,7 @@ pub unsafe fn _mm512_maskz_cvt_roundepi64_ps<const ROUNDING: i32>(
 ) -> __m256 {
    static_assert_rounding!(ROUNDING);
    let b = _mm512_cvt_roundepi64_ps::<ROUNDING>(a).as_f32x8();
-    let zero = _mm256_setzero_ps().as_f32x8();
-    transmute(simd_select_bitmask(k, b, zero))
+    transmute(simd_select_bitmask(k, b, f32x8::ZERO))
 }

 /// Convert packed signed 64-bit integers in a to packed single-precision (32-bit) floating-point elements,
@ -2255,8 +2203,7 @@ pub unsafe fn _mm256_mask_cvtepi64_ps(src: __m128, k: __mmask8, a: __m256i) -> _
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm256_maskz_cvtepi64_ps(k: __mmask8, a: __m256i) -> __m128 {
    let b = _mm256_cvtepi64_ps(a).as_f32x4();
-    let zero = _mm_setzero_ps().as_f32x4();
-    transmute(simd_select_bitmask(k, b, zero))
+    transmute(simd_select_bitmask(k, b, f32x4::ZERO))
 }

 /// Convert packed signed 64-bit integers in a to packed single-precision (32-bit) floating-point elements,
@ -2295,8 +2242,7 @@ pub unsafe fn _mm512_mask_cvtepi64_ps(src: __m256, k: __mmask8, a: __m512i) -> _
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm512_maskz_cvtepi64_ps(k: __mmask8, a: __m512i) -> __m256 {
    let b = _mm512_cvtepi64_ps(a).as_f32x8();
-    let zero = _mm256_setzero_ps().as_f32x8();
-    transmute(simd_select_bitmask(k, b, zero))
+    transmute(simd_select_bitmask(k, b, f32x8::ZERO))
 }

 /// Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements,
@ -2367,8 +2313,7 @@ pub unsafe fn _mm512_maskz_cvt_roundepu64_pd<const ROUNDING: i32>(
 ) -> __m512d {
    static_assert_rounding!(ROUNDING);
    let b = _mm512_cvt_roundepu64_pd::<ROUNDING>(a).as_f64x8();
-    let zero = _mm512_setzero_pd().as_f64x8();
-    transmute(simd_select_bitmask(k, b, zero))
+    transmute(simd_select_bitmask(k, b, f64x8::ZERO))
 }

 /// Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements,
@ -2407,8 +2352,7 @@ pub unsafe fn _mm_mask_cvtepu64_pd(src: __m128d, k: __mmask8, a: __m128i) -> __m
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm_maskz_cvtepu64_pd(k: __mmask8, a: __m128i) -> __m128d {
    let b = _mm_cvtepu64_pd(a).as_f64x2();
-    let zero = _mm_setzero_pd().as_f64x2();
-    transmute(simd_select_bitmask(k, b, zero))
+    transmute(simd_select_bitmask(k, b, f64x2::ZERO))
 }

 /// Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements,
@ -2447,8 +2391,7 @@ pub unsafe fn _mm256_mask_cvtepu64_pd(src: __m256d, k: __mmask8, a: __m256i) ->
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm256_maskz_cvtepu64_pd(k: __mmask8, a: __m256i) -> __m256d {
    let b = _mm256_cvtepu64_pd(a).as_f64x4();
-    let zero = _mm256_setzero_pd().as_f64x4();
-    transmute(simd_select_bitmask(k, b, zero))
+    transmute(simd_select_bitmask(k, b, f64x4::ZERO))
 }

 /// Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements,
@ -2487,8 +2430,7 @@ pub unsafe fn _mm512_mask_cvtepu64_pd(src: __m512d, k: __mmask8, a: __m512i) ->
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm512_maskz_cvtepu64_pd(k: __mmask8, a: __m512i) -> __m512d {
    let b = _mm512_cvtepu64_pd(a).as_f64x8();
-    let zero = _mm512_setzero_pd().as_f64x8();
-    transmute(simd_select_bitmask(k, b, zero))
+    transmute(simd_select_bitmask(k, b, f64x8::ZERO))
 }

 /// Convert packed unsigned 64-bit integers in a to packed single-precision (32-bit) floating-point elements,
@ -2559,8 +2501,7 @@ pub unsafe fn _mm512_maskz_cvt_roundepu64_ps<const ROUNDING: i32>(
 ) -> __m256 {
    static_assert_rounding!(ROUNDING);
    let b = _mm512_cvt_roundepu64_ps::<ROUNDING>(a).as_f32x8();
-    let zero = _mm256_setzero_ps().as_f32x8();
-    transmute(simd_select_bitmask(k, b, zero))
+    transmute(simd_select_bitmask(k, b, f32x8::ZERO))
 }

 /// Convert packed unsigned 64-bit integers in a to packed single-precision (32-bit) floating-point elements,
@ -2636,8 +2577,7 @@ pub unsafe fn _mm256_mask_cvtepu64_ps(src: __m128, k: __mmask8, a: __m256i) -> _
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm256_maskz_cvtepu64_ps(k: __mmask8, a: __m256i) -> __m128 {
    let b = _mm256_cvtepu64_ps(a).as_f32x4();
-    let zero = _mm_setzero_ps().as_f32x4();
-    transmute(simd_select_bitmask(k, b, zero))
+    transmute(simd_select_bitmask(k, b, f32x4::ZERO))
 }

 /// Convert packed unsigned 64-bit integers in a to packed single-precision (32-bit) floating-point elements,
@ -2676,8 +2616,7 @@ pub unsafe fn _mm512_mask_cvtepu64_ps(src: __m256, k: __mmask8, a: __m512i) -> _
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm512_maskz_cvtepu64_ps(k: __mmask8, a: __m512i) -> __m256 {
    let b = _mm512_cvtepu64_ps(a).as_f32x8();
-    let zero = _mm256_setzero_ps().as_f32x8();
-    transmute(simd_select_bitmask(k, b, zero))
+    transmute(simd_select_bitmask(k, b, f32x8::ZERO))
 }

 /// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers,
@ -4131,8 +4070,7 @@ pub unsafe fn _mm_mask_mullo_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm_maskz_mullo_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
    let b = _mm_mullo_epi64(a, b).as_i64x2();
-    let zero = _mm_setzero_si128().as_i64x2();
-    transmute(simd_select_bitmask(k, b, zero))
+    transmute(simd_select_bitmask(k, b, i64x2::ZERO))
 }

 /// Multiply packed 64-bit integers in `a` and `b`, producing intermediate 128-bit integers, and store
@ -4177,8 +4115,7 @@ pub unsafe fn _mm256_mask_mullo_epi64(
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm256_maskz_mullo_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
    let b = _mm256_mullo_epi64(a, b).as_i64x4();
-    let zero = _mm256_setzero_si256().as_i64x4();
-    transmute(simd_select_bitmask(k, b, zero))
+    transmute(simd_select_bitmask(k, b, i64x4::ZERO))
 }

 /// Multiply packed 64-bit integers in `a` and `b`, producing intermediate 128-bit integers, and store
@ -4223,8 +4160,7 @@ pub unsafe fn _mm512_mask_mullo_epi64(
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub unsafe fn _mm512_maskz_mullo_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
    let b = _mm512_mullo_epi64(a, b).as_i64x8();
-    let zero = _mm512_setzero_si512().as_i64x8();
-    transmute(simd_select_bitmask(k, b, zero))
+    transmute(simd_select_bitmask(k, b, i64x8::ZERO))
 }

 // Mask Registers
--- a/library/stdarch/crates/core_arch/src/x86/avx512f.rs
+++ b/library/stdarch/crates/core_arch/src/x86/avx512f.rs
--- a/library/stdarch/crates/core_arch/src/x86/avx512fp16.rs
+++ b/library/stdarch/crates/core_arch/src/x86/avx512fp16.rs
@ -239,7 +239,7 @@ pub unsafe fn _mm512_setr_ph(
 #[target_feature(enable = "avx512fp16,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
 pub unsafe fn _mm_setzero_ph() -> __m128h {
-    transmute(f16x8::splat(0.0))
+    transmute(f16x8::ZERO)
 }

 /// Return vector of type __m256h with all elements set to zero.
@ -249,7 +249,7 @@ pub unsafe fn _mm_setzero_ph() -> __m128h {
 #[target_feature(enable = "avx512fp16,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
 pub unsafe fn _mm256_setzero_ph() -> __m256h {
-    transmute(f16x16::splat(0.0))
+    transmute(f16x16::ZERO)
 }

 /// Return vector of type __m512h with all elements set to zero.
@ -259,7 +259,7 @@ pub unsafe fn _mm256_setzero_ph() -> __m256h {
 #[target_feature(enable = "avx512fp16")]
 #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
 pub unsafe fn _mm512_setzero_ph() -> __m512h {
-    transmute(f16x32::splat(0.0))
+    transmute(f16x32::ZERO)
 }

 /// Return vector of type `__m128h` with undefined elements. In practice, this returns the all-zero
@ -270,7 +270,7 @@ pub unsafe fn _mm512_setzero_ph() -> __m512h {
 #[target_feature(enable = "avx512fp16,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
 pub unsafe fn _mm_undefined_ph() -> __m128h {
-    transmute(f16x8::splat(0.0))
+    transmute(f16x8::ZERO)
 }

 /// Return vector of type `__m256h` with undefined elements. In practice, this returns the all-zero
@ -281,7 +281,7 @@ pub unsafe fn _mm_undefined_ph() -> __m128h {
 #[target_feature(enable = "avx512fp16,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
 pub unsafe fn _mm256_undefined_ph() -> __m256h {
-    transmute(f16x16::splat(0.0))
+    transmute(f16x16::ZERO)
 }

 /// Return vector of type `__m512h` with undefined elements. In practice, this returns the all-zero
@ -292,7 +292,7 @@ pub unsafe fn _mm256_undefined_ph() -> __m256h {
 #[target_feature(enable = "avx512fp16")]
 #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
 pub unsafe fn _mm512_undefined_ph() -> __m512h {
-    transmute(f16x32::splat(0.0))
+    transmute(f16x32::ZERO)
 }

 /// Cast vector of type `__m128d` to type `__m128h`. This intrinsic is only used for compilation and
@ -15986,7 +15986,7 @@ pub unsafe fn _mm_cvtsi128_si16(a: __m128i) -> i16 {
 #[target_feature(enable = "avx512fp16")]
 #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
 pub unsafe fn _mm_cvtsi16_si128(a: i16) -> __m128i {
-    transmute(simd_insert!(i16x8::splat(0), 0, a))
+    transmute(simd_insert!(i16x8::ZERO, 0, a))
 }

 #[allow(improper_ctypes)]
--- a/library/stdarch/crates/core_arch/src/x86/avx512vbmi.rs
+++ b/library/stdarch/crates/core_arch/src/x86/avx512vbmi.rs
@ -46,8 +46,7 @@ pub unsafe fn _mm512_maskz_permutex2var_epi8(
    b: __m512i,
 ) -> __m512i {
    let permute = _mm512_permutex2var_epi8(a, idx, b).as_i8x64();
-    let zero = _mm512_setzero_si512().as_i8x64();
-    transmute(simd_select_bitmask(k, permute, zero))
+    transmute(simd_select_bitmask(k, permute, i8x64::ZERO))
 }

 /// Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
@ -109,8 +108,7 @@ pub unsafe fn _mm256_maskz_permutex2var_epi8(
    b: __m256i,
 ) -> __m256i {
    let permute = _mm256_permutex2var_epi8(a, idx, b).as_i8x32();
-    let zero = _mm256_setzero_si256().as_i8x32();
-    transmute(simd_select_bitmask(k, permute, zero))
+    transmute(simd_select_bitmask(k, permute, i8x32::ZERO))
 }

 /// Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
@ -172,8 +170,7 @@ pub unsafe fn _mm_maskz_permutex2var_epi8(
    b: __m128i,
 ) -> __m128i {
    let permute = _mm_permutex2var_epi8(a, idx, b).as_i8x16();
-    let zero = _mm_setzero_si128().as_i8x16();
-    transmute(simd_select_bitmask(k, permute, zero))
+    transmute(simd_select_bitmask(k, permute, i8x16::ZERO))
 }

 /// Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
@ -230,8 +227,7 @@ pub unsafe fn _mm512_mask_permutexvar_epi8(
 #[cfg_attr(test, assert_instr(vpermb))]
 pub unsafe fn _mm512_maskz_permutexvar_epi8(k: __mmask64, idx: __m512i, a: __m512i) -> __m512i {
    let permute = _mm512_permutexvar_epi8(idx, a).as_i8x64();
-    let zero = _mm512_setzero_si512().as_i8x64();
-    transmute(simd_select_bitmask(k, permute, zero))
+    transmute(simd_select_bitmask(k, permute, i8x64::ZERO))
 }

 /// Shuffle 8-bit integers in a across lanes using the corresponding index in idx, and store the results in dst.
@ -271,8 +267,7 @@ pub unsafe fn _mm256_mask_permutexvar_epi8(
 #[cfg_attr(test, assert_instr(vpermb))]
 pub unsafe fn _mm256_maskz_permutexvar_epi8(k: __mmask32, idx: __m256i, a: __m256i) -> __m256i {
    let permute = _mm256_permutexvar_epi8(idx, a).as_i8x32();
-    let zero = _mm256_setzero_si256().as_i8x32();
-    transmute(simd_select_bitmask(k, permute, zero))
+    transmute(simd_select_bitmask(k, permute, i8x32::ZERO))
 }

 /// Shuffle 8-bit integers in a across lanes using the corresponding index in idx, and store the results in dst.
@ -312,8 +307,7 @@ pub unsafe fn _mm_mask_permutexvar_epi8(
 #[cfg_attr(test, assert_instr(vpermb))]
 pub unsafe fn _mm_maskz_permutexvar_epi8(k: __mmask16, idx: __m128i, a: __m128i) -> __m128i {
    let permute = _mm_permutexvar_epi8(idx, a).as_i8x16();
-    let zero = _mm_setzero_si128().as_i8x16();
-    transmute(simd_select_bitmask(k, permute, zero))
+    transmute(simd_select_bitmask(k, permute, i8x16::ZERO))
 }

 /// For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst.
@ -353,8 +347,7 @@ pub unsafe fn _mm512_mask_multishift_epi64_epi8(
 #[cfg_attr(test, assert_instr(vpmultishiftqb))]
 pub unsafe fn _mm512_maskz_multishift_epi64_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
    let multishift = _mm512_multishift_epi64_epi8(a, b).as_i8x64();
-    let zero = _mm512_setzero_si512().as_i8x64();
-    transmute(simd_select_bitmask(k, multishift, zero))
+    transmute(simd_select_bitmask(k, multishift, i8x64::ZERO))
 }

 /// For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst.
@ -394,8 +387,7 @@ pub unsafe fn _mm256_mask_multishift_epi64_epi8(
 #[cfg_attr(test, assert_instr(vpmultishiftqb))]
 pub unsafe fn _mm256_maskz_multishift_epi64_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
    let multishift = _mm256_multishift_epi64_epi8(a, b).as_i8x32();
-    let zero = _mm256_setzero_si256().as_i8x32();
-    transmute(simd_select_bitmask(k, multishift, zero))
+    transmute(simd_select_bitmask(k, multishift, i8x32::ZERO))
 }

 /// For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst.
@ -435,8 +427,7 @@ pub unsafe fn _mm_mask_multishift_epi64_epi8(
 #[cfg_attr(test, assert_instr(vpmultishiftqb))]
 pub unsafe fn _mm_maskz_multishift_epi64_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
    let multishift = _mm_multishift_epi64_epi8(a, b).as_i8x16();
-    let zero = _mm_setzero_si128().as_i8x16();
-    transmute(simd_select_bitmask(k, multishift, zero))
+    transmute(simd_select_bitmask(k, multishift, i8x16::ZERO))
 }

 #[allow(improper_ctypes)]
--- a/library/stdarch/crates/core_arch/src/x86/avx512vbmi2.rs
+++ b/library/stdarch/crates/core_arch/src/x86/avx512vbmi2.rs
@ -247,11 +247,7 @@ pub unsafe fn _mm512_mask_compress_epi16(src: __m512i, k: __mmask32, a: __m512i)
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcompressw))]
 pub unsafe fn _mm512_maskz_compress_epi16(k: __mmask32, a: __m512i) -> __m512i {
-    transmute(vpcompressw(
-        a.as_i16x32(),
-        _mm512_setzero_si512().as_i16x32(),
-        k,
-    ))
+    transmute(vpcompressw(a.as_i16x32(), i16x32::ZERO, k))
 }

 /// Contiguously store the active 16-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src.
@ -273,11 +269,7 @@ pub unsafe fn _mm256_mask_compress_epi16(src: __m256i, k: __mmask16, a: __m256i)
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcompressw))]
 pub unsafe fn _mm256_maskz_compress_epi16(k: __mmask16, a: __m256i) -> __m256i {
-    transmute(vpcompressw256(
-        a.as_i16x16(),
-        _mm256_setzero_si256().as_i16x16(),
-        k,
-    ))
+    transmute(vpcompressw256(a.as_i16x16(), i16x16::ZERO, k))
 }

 /// Contiguously store the active 16-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src.
@ -299,11 +291,7 @@ pub unsafe fn _mm_mask_compress_epi16(src: __m128i, k: __mmask8, a: __m128i) ->
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcompressw))]
 pub unsafe fn _mm_maskz_compress_epi16(k: __mmask8, a: __m128i) -> __m128i {
-    transmute(vpcompressw128(
-        a.as_i16x8(),
-        _mm_setzero_si128().as_i16x8(),
-        k,
-    ))
+    transmute(vpcompressw128(a.as_i16x8(), i16x8::ZERO, k))
 }

 /// Contiguously store the active 8-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src.
@ -325,11 +313,7 @@ pub unsafe fn _mm512_mask_compress_epi8(src: __m512i, k: __mmask64, a: __m512i)
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcompressb))]
 pub unsafe fn _mm512_maskz_compress_epi8(k: __mmask64, a: __m512i) -> __m512i {
-    transmute(vpcompressb(
-        a.as_i8x64(),
-        _mm512_setzero_si512().as_i8x64(),
-        k,
-    ))
+    transmute(vpcompressb(a.as_i8x64(), i8x64::ZERO, k))
 }

 /// Contiguously store the active 8-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src.
@ -351,11 +335,7 @@ pub unsafe fn _mm256_mask_compress_epi8(src: __m256i, k: __mmask32, a: __m256i)
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcompressb))]
 pub unsafe fn _mm256_maskz_compress_epi8(k: __mmask32, a: __m256i) -> __m256i {
-    transmute(vpcompressb256(
-        a.as_i8x32(),
-        _mm256_setzero_si256().as_i8x32(),
-        k,
-    ))
+    transmute(vpcompressb256(a.as_i8x32(), i8x32::ZERO, k))
 }

 /// Contiguously store the active 8-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src.
@ -377,11 +357,7 @@ pub unsafe fn _mm_mask_compress_epi8(src: __m128i, k: __mmask16, a: __m128i) ->
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcompressb))]
 pub unsafe fn _mm_maskz_compress_epi8(k: __mmask16, a: __m128i) -> __m128i {
-    transmute(vpcompressb128(
-        a.as_i8x16(),
-        _mm_setzero_si128().as_i8x16(),
-        k,
-    ))
+    transmute(vpcompressb128(a.as_i8x16(), i8x16::ZERO, k))
 }

 /// Load contiguous active 16-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -403,11 +379,7 @@ pub unsafe fn _mm512_mask_expand_epi16(src: __m512i, k: __mmask32, a: __m512i) -
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpexpandw))]
 pub unsafe fn _mm512_maskz_expand_epi16(k: __mmask32, a: __m512i) -> __m512i {
-    transmute(vpexpandw(
-        a.as_i16x32(),
-        _mm512_setzero_si512().as_i16x32(),
-        k,
-    ))
+    transmute(vpexpandw(a.as_i16x32(), i16x32::ZERO, k))
 }

 /// Load contiguous active 16-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -429,11 +401,7 @@ pub unsafe fn _mm256_mask_expand_epi16(src: __m256i, k: __mmask16, a: __m256i) -
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpexpandw))]
 pub unsafe fn _mm256_maskz_expand_epi16(k: __mmask16, a: __m256i) -> __m256i {
-    transmute(vpexpandw256(
-        a.as_i16x16(),
-        _mm256_setzero_si256().as_i16x16(),
-        k,
-    ))
+    transmute(vpexpandw256(a.as_i16x16(), i16x16::ZERO, k))
 }

 /// Load contiguous active 16-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -455,11 +423,7 @@ pub unsafe fn _mm_mask_expand_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpexpandw))]
 pub unsafe fn _mm_maskz_expand_epi16(k: __mmask8, a: __m128i) -> __m128i {
-    transmute(vpexpandw128(
-        a.as_i16x8(),
-        _mm_setzero_si128().as_i16x8(),
-        k,
-    ))
+    transmute(vpexpandw128(a.as_i16x8(), i16x8::ZERO, k))
 }

 /// Load contiguous active 8-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -481,11 +445,7 @@ pub unsafe fn _mm512_mask_expand_epi8(src: __m512i, k: __mmask64, a: __m512i) ->
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpexpandb))]
 pub unsafe fn _mm512_maskz_expand_epi8(k: __mmask64, a: __m512i) -> __m512i {
-    transmute(vpexpandb(
-        a.as_i8x64(),
-        _mm512_setzero_si512().as_i8x64(),
-        k,
-    ))
+    transmute(vpexpandb(a.as_i8x64(), i8x64::ZERO, k))
 }

 /// Load contiguous active 8-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -507,11 +467,7 @@ pub unsafe fn _mm256_mask_expand_epi8(src: __m256i, k: __mmask32, a: __m256i) ->
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpexpandb))]
 pub unsafe fn _mm256_maskz_expand_epi8(k: __mmask32, a: __m256i) -> __m256i {
-    transmute(vpexpandb256(
-        a.as_i8x32(),
-        _mm256_setzero_si256().as_i8x32(),
-        k,
-    ))
+    transmute(vpexpandb256(a.as_i8x32(), i8x32::ZERO, k))
 }

 /// Load contiguous active 8-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@ -533,11 +489,7 @@ pub unsafe fn _mm_mask_expand_epi8(src: __m128i, k: __mmask16, a: __m128i) -> __
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpexpandb))]
 pub unsafe fn _mm_maskz_expand_epi8(k: __mmask16, a: __m128i) -> __m128i {
-    transmute(vpexpandb128(
-        a.as_i8x16(),
-        _mm_setzero_si128().as_i8x16(),
-        k,
-    ))
+    transmute(vpexpandb128(a.as_i8x16(), i8x16::ZERO, k))
 }

 /// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst.
@ -572,8 +524,7 @@ pub unsafe fn _mm512_mask_shldv_epi64(a: __m512i, k: __mmask8, b: __m512i, c: __
 #[cfg_attr(test, assert_instr(vpshldvq))]
 pub unsafe fn _mm512_maskz_shldv_epi64(k: __mmask8, a: __m512i, b: __m512i, c: __m512i) -> __m512i {
    let shf = _mm512_shldv_epi64(a, b, c).as_i64x8();
-    let zero = _mm512_setzero_si512().as_i64x8();
-    transmute(simd_select_bitmask(k, shf, zero))
+    transmute(simd_select_bitmask(k, shf, i64x8::ZERO))
 }

 /// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst.
@ -608,8 +559,7 @@ pub unsafe fn _mm256_mask_shldv_epi64(a: __m256i, k: __mmask8, b: __m256i, c: __
 #[cfg_attr(test, assert_instr(vpshldvq))]
 pub unsafe fn _mm256_maskz_shldv_epi64(k: __mmask8, a: __m256i, b: __m256i, c: __m256i) -> __m256i {
    let shf = _mm256_shldv_epi64(a, b, c).as_i64x4();
-    let zero = _mm256_setzero_si256().as_i64x4();
-    transmute(simd_select_bitmask(k, shf, zero))
+    transmute(simd_select_bitmask(k, shf, i64x4::ZERO))
 }

 /// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst.
@ -644,8 +594,7 @@ pub unsafe fn _mm_mask_shldv_epi64(a: __m128i, k: __mmask8, b: __m128i, c: __m12
 #[cfg_attr(test, assert_instr(vpshldvq))]
 pub unsafe fn _mm_maskz_shldv_epi64(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i {
    let shf = _mm_shldv_epi64(a, b, c).as_i64x2();
-    let zero = _mm_setzero_si128().as_i64x2();
-    transmute(simd_select_bitmask(k, shf, zero))
+    transmute(simd_select_bitmask(k, shf, i64x2::ZERO))
 }

 /// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst.
@ -685,8 +634,7 @@ pub unsafe fn _mm512_maskz_shldv_epi32(
    c: __m512i,
 ) -> __m512i {
    let shf = _mm512_shldv_epi32(a, b, c).as_i32x16();
-    let zero = _mm512_setzero_si512().as_i32x16();
-    transmute(simd_select_bitmask(k, shf, zero))
+    transmute(simd_select_bitmask(k, shf, i32x16::ZERO))
 }

 /// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst.
@ -721,8 +669,7 @@ pub unsafe fn _mm256_mask_shldv_epi32(a: __m256i, k: __mmask8, b: __m256i, c: __
 #[cfg_attr(test, assert_instr(vpshldvd))]
 pub unsafe fn _mm256_maskz_shldv_epi32(k: __mmask8, a: __m256i, b: __m256i, c: __m256i) -> __m256i {
    let shf = _mm256_shldv_epi32(a, b, c).as_i32x8();
-    let zero = _mm256_setzero_si256().as_i32x8();
-    transmute(simd_select_bitmask(k, shf, zero))
+    transmute(simd_select_bitmask(k, shf, i32x8::ZERO))
 }

 /// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst.
@ -757,8 +704,7 @@ pub unsafe fn _mm_mask_shldv_epi32(a: __m128i, k: __mmask8, b: __m128i, c: __m12
 #[cfg_attr(test, assert_instr(vpshldvd))]
 pub unsafe fn _mm_maskz_shldv_epi32(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i {
    let shf = _mm_shldv_epi32(a, b, c).as_i32x4();
-    let zero = _mm_setzero_si128().as_i32x4();
-    transmute(simd_select_bitmask(k, shf, zero))
+    transmute(simd_select_bitmask(k, shf, i32x4::ZERO))
 }

 /// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst.
@ -798,8 +744,7 @@ pub unsafe fn _mm512_maskz_shldv_epi16(
    c: __m512i,
 ) -> __m512i {
    let shf = _mm512_shldv_epi16(a, b, c).as_i16x32();
-    let zero = _mm512_setzero_si512().as_i16x32();
-    transmute(simd_select_bitmask(k, shf, zero))
+    transmute(simd_select_bitmask(k, shf, i16x32::ZERO))
 }

 /// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst.
@ -839,8 +784,7 @@ pub unsafe fn _mm256_maskz_shldv_epi16(
    c: __m256i,
 ) -> __m256i {
    let shf = _mm256_shldv_epi16(a, b, c).as_i16x16();
-    let zero = _mm256_setzero_si256().as_i16x16();
-    transmute(simd_select_bitmask(k, shf, zero))
+    transmute(simd_select_bitmask(k, shf, i16x16::ZERO))
 }

 /// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst.
@ -875,8 +819,7 @@ pub unsafe fn _mm_mask_shldv_epi16(a: __m128i, k: __mmask8, b: __m128i, c: __m12
 #[cfg_attr(test, assert_instr(vpshldvw))]
 pub unsafe fn _mm_maskz_shldv_epi16(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i {
    let shf = _mm_shldv_epi16(a, b, c).as_i16x8();
-    let zero = _mm_setzero_si128().as_i16x8();
-    transmute(simd_select_bitmask(k, shf, zero))
+    transmute(simd_select_bitmask(k, shf, i16x8::ZERO))
 }

 /// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst.
@ -911,8 +854,7 @@ pub unsafe fn _mm512_mask_shrdv_epi64(a: __m512i, k: __mmask8, b: __m512i, c: __
 #[cfg_attr(test, assert_instr(vpshrdvq))]
 pub unsafe fn _mm512_maskz_shrdv_epi64(k: __mmask8, a: __m512i, b: __m512i, c: __m512i) -> __m512i {
    let shf = _mm512_shrdv_epi64(a, b, c).as_i64x8();
-    let zero = _mm512_setzero_si512().as_i64x8();
-    transmute(simd_select_bitmask(k, shf, zero))
+    transmute(simd_select_bitmask(k, shf, i64x8::ZERO))
 }

 /// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst.
@ -947,8 +889,7 @@ pub unsafe fn _mm256_mask_shrdv_epi64(a: __m256i, k: __mmask8, b: __m256i, c: __
 #[cfg_attr(test, assert_instr(vpshrdvq))]
 pub unsafe fn _mm256_maskz_shrdv_epi64(k: __mmask8, a: __m256i, b: __m256i, c: __m256i) -> __m256i {
    let shf = _mm256_shrdv_epi64(a, b, c).as_i64x4();
-    let zero = _mm256_setzero_si256().as_i64x4();
-    transmute(simd_select_bitmask(k, shf, zero))
+    transmute(simd_select_bitmask(k, shf, i64x4::ZERO))
 }

 /// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst.
@ -983,8 +924,7 @@ pub unsafe fn _mm_mask_shrdv_epi64(a: __m128i, k: __mmask8, b: __m128i, c: __m12
 #[cfg_attr(test, assert_instr(vpshrdvq))]
 pub unsafe fn _mm_maskz_shrdv_epi64(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i {
    let shf = _mm_shrdv_epi64(a, b, c).as_i64x2();
-    let zero = _mm_setzero_si128().as_i64x2();
-    transmute(simd_select_bitmask(k, shf, zero))
+    transmute(simd_select_bitmask(k, shf, i64x2::ZERO))
 }

 /// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst.
@ -1024,8 +964,7 @@ pub unsafe fn _mm512_maskz_shrdv_epi32(
    c: __m512i,
 ) -> __m512i {
    let shf = _mm512_shrdv_epi32(a, b, c).as_i32x16();
-    let zero = _mm512_setzero_si512().as_i32x16();
-    transmute(simd_select_bitmask(k, shf, zero))
+    transmute(simd_select_bitmask(k, shf, i32x16::ZERO))
 }

 /// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst.
@ -1060,8 +999,7 @@ pub unsafe fn _mm256_mask_shrdv_epi32(a: __m256i, k: __mmask8, b: __m256i, c: __
 #[cfg_attr(test, assert_instr(vpshrdvd))]
 pub unsafe fn _mm256_maskz_shrdv_epi32(k: __mmask8, a: __m256i, b: __m256i, c: __m256i) -> __m256i {
    let shf = _mm256_shrdv_epi32(a, b, c).as_i32x8();
-    let zero = _mm256_setzero_si256().as_i32x8();
-    transmute(simd_select_bitmask(k, shf, zero))
+    transmute(simd_select_bitmask(k, shf, i32x8::ZERO))
 }

 /// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst.
@ -1096,8 +1034,7 @@ pub unsafe fn _mm_mask_shrdv_epi32(a: __m128i, k: __mmask8, b: __m128i, c: __m12
 #[cfg_attr(test, assert_instr(vpshrdvd))]
 pub unsafe fn _mm_maskz_shrdv_epi32(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i {
    let shf = _mm_shrdv_epi32(a, b, c).as_i32x4();
-    let zero = _mm_setzero_si128().as_i32x4();
-    transmute(simd_select_bitmask(k, shf, zero))
+    transmute(simd_select_bitmask(k, shf, i32x4::ZERO))
 }

 /// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst.
@ -1137,8 +1074,7 @@ pub unsafe fn _mm512_maskz_shrdv_epi16(
    c: __m512i,
 ) -> __m512i {
    let shf = _mm512_shrdv_epi16(a, b, c).as_i16x32();
-    let zero = _mm512_setzero_si512().as_i16x32();
-    transmute(simd_select_bitmask(k, shf, zero))
+    transmute(simd_select_bitmask(k, shf, i16x32::ZERO))
 }

 /// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst.
@ -1178,8 +1114,7 @@ pub unsafe fn _mm256_maskz_shrdv_epi16(
    c: __m256i,
 ) -> __m256i {
    let shf = _mm256_shrdv_epi16(a, b, c).as_i16x16();
-    let zero = _mm256_setzero_si256().as_i16x16();
-    transmute(simd_select_bitmask(k, shf, zero))
+    transmute(simd_select_bitmask(k, shf, i16x16::ZERO))
 }

 /// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst.
@ -1214,8 +1149,7 @@ pub unsafe fn _mm_mask_shrdv_epi16(a: __m128i, k: __mmask8, b: __m128i, c: __m12
 #[cfg_attr(test, assert_instr(vpshrdvw))]
 pub unsafe fn _mm_maskz_shrdv_epi16(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i {
    let shf = _mm_shrdv_epi16(a, b, c).as_i16x8();
-    let zero = _mm_setzero_si128().as_i16x8();
-    transmute(simd_select_bitmask(k, shf, zero))
+    transmute(simd_select_bitmask(k, shf, i16x8::ZERO))
 }

 /// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by imm8 bits, and store the upper 64-bits in dst.
@ -1265,8 +1199,7 @@ pub unsafe fn _mm512_maskz_shldi_epi64<const IMM8: i32>(
 ) -> __m512i {
    static_assert_uimm_bits!(IMM8, 8);
    let shf = _mm512_shldi_epi64::<IMM8>(a, b).as_i64x8();
-    let zero = _mm512_setzero_si512().as_i64x8();
-    transmute(simd_select_bitmask(k, shf, zero))
+    transmute(simd_select_bitmask(k, shf, i64x8::ZERO))
 }

 /// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by imm8 bits, and store the upper 64-bits in dst.
@ -1316,8 +1249,7 @@ pub unsafe fn _mm256_maskz_shldi_epi64<const IMM8: i32>(
 ) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    let shf = _mm256_shldi_epi64::<IMM8>(a, b).as_i64x4();
-    let zero = _mm256_setzero_si256().as_i64x4();
-    transmute(simd_select_bitmask(k, shf, zero))
+    transmute(simd_select_bitmask(k, shf, i64x4::ZERO))
 }

 /// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by imm8 bits, and store the upper 64-bits in dst.
@ -1367,8 +1299,7 @@ pub unsafe fn _mm_maskz_shldi_epi64<const IMM8: i32>(
 ) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    let shf = _mm_shldi_epi64::<IMM8>(a, b).as_i64x2();
-    let zero = _mm_setzero_si128().as_i64x2();
-    transmute(simd_select_bitmask(k, shf, zero))
+    transmute(simd_select_bitmask(k, shf, i64x2::ZERO))
 }

 /// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by imm8 bits, and store the upper 32-bits in dst.
@ -1418,8 +1349,7 @@ pub unsafe fn _mm512_maskz_shldi_epi32<const IMM8: i32>(
 ) -> __m512i {
    static_assert_uimm_bits!(IMM8, 8);
    let shf = _mm512_shldi_epi32::<IMM8>(a, b).as_i32x16();
-    let zero = _mm512_setzero_si512().as_i32x16();
-    transmute(simd_select_bitmask(k, shf, zero))
+    transmute(simd_select_bitmask(k, shf, i32x16::ZERO))
 }

 /// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by imm8 bits, and store the upper 32-bits in dst.
@ -1469,8 +1399,7 @@ pub unsafe fn _mm256_maskz_shldi_epi32<const IMM8: i32>(
 ) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    let shf = _mm256_shldi_epi32::<IMM8>(a, b).as_i32x8();
-    let zero = _mm256_setzero_si256().as_i32x8();
-    transmute(simd_select_bitmask(k, shf, zero))
+    transmute(simd_select_bitmask(k, shf, i32x8::ZERO))
 }

 /// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by imm8 bits, and store the upper 32-bits in dst.
@ -1520,8 +1449,7 @@ pub unsafe fn _mm_maskz_shldi_epi32<const IMM8: i32>(
 ) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    let shf = _mm_shldi_epi32::<IMM8>(a, b).as_i32x4();
-    let zero = _mm_setzero_si128().as_i32x4();
-    transmute(simd_select_bitmask(k, shf, zero))
+    transmute(simd_select_bitmask(k, shf, i32x4::ZERO))
 }

 /// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by imm8 bits, and store the upper 16-bits in dst.
@ -1571,8 +1499,7 @@ pub unsafe fn _mm512_maskz_shldi_epi16<const IMM8: i32>(
 ) -> __m512i {
    static_assert_uimm_bits!(IMM8, 8);
    let shf = _mm512_shldi_epi16::<IMM8>(a, b).as_i16x32();
-    let zero = _mm512_setzero_si512().as_i16x32();
-    transmute(simd_select_bitmask(k, shf, zero))
+    transmute(simd_select_bitmask(k, shf, i16x32::ZERO))
 }

 /// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by imm8 bits, and store the upper 16-bits in dst.
@ -1622,8 +1549,7 @@ pub unsafe fn _mm256_maskz_shldi_epi16<const IMM8: i32>(
 ) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    let shf = _mm256_shldi_epi16::<IMM8>(a, b).as_i16x16();
-    let zero = _mm256_setzero_si256().as_i16x16();
-    transmute(simd_select_bitmask(k, shf, zero))
+    transmute(simd_select_bitmask(k, shf, i16x16::ZERO))
 }

 /// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by imm8 bits, and store the upper 16-bits in dst.
@ -1673,8 +1599,7 @@ pub unsafe fn _mm_maskz_shldi_epi16<const IMM8: i32>(
 ) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    let shf = _mm_shldi_epi16::<IMM8>(a, b).as_i16x8();
-    let zero = _mm_setzero_si128().as_i16x8();
-    transmute(simd_select_bitmask(k, shf, zero))
+    transmute(simd_select_bitmask(k, shf, i16x8::ZERO))
 }

 /// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by imm8 bits, and store the lower 64-bits in dst.
@ -1724,8 +1649,7 @@ pub unsafe fn _mm512_maskz_shrdi_epi64<const IMM8: i32>(
 ) -> __m512i {
    static_assert_uimm_bits!(IMM8, 8);
    let shf = _mm512_shrdi_epi64::<IMM8>(a, b).as_i64x8();
-    let zero = _mm512_setzero_si512().as_i64x8();
-    transmute(simd_select_bitmask(k, shf, zero))
+    transmute(simd_select_bitmask(k, shf, i64x8::ZERO))
 }

 /// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by imm8 bits, and store the lower 64-bits in dst.
@ -1775,8 +1699,7 @@ pub unsafe fn _mm256_maskz_shrdi_epi64<const IMM8: i32>(
 ) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    let shf = _mm256_shrdi_epi64::<IMM8>(a, b).as_i64x4();
-    let zero = _mm256_setzero_si256().as_i64x4();
-    transmute(simd_select_bitmask(k, shf, zero))
+    transmute(simd_select_bitmask(k, shf, i64x4::ZERO))
 }

 /// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by imm8 bits, and store the lower 64-bits in dst.
@ -1826,8 +1749,7 @@ pub unsafe fn _mm_maskz_shrdi_epi64<const IMM8: i32>(
 ) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    let shf = _mm_shrdi_epi64::<IMM8>(a, b).as_i64x2();
-    let zero = _mm_setzero_si128().as_i64x2();
-    transmute(simd_select_bitmask(k, shf, zero))
+    transmute(simd_select_bitmask(k, shf, i64x2::ZERO))
 }

 /// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by imm8 bits, and store the lower 32-bits in dst.
@ -1877,8 +1799,7 @@ pub unsafe fn _mm512_maskz_shrdi_epi32<const IMM8: i32>(
 ) -> __m512i {
    static_assert_uimm_bits!(IMM8, 8);
    let shf = _mm512_shrdi_epi32::<IMM8>(a, b).as_i32x16();
-    let zero = _mm512_setzero_si512().as_i32x16();
-    transmute(simd_select_bitmask(k, shf, zero))
+    transmute(simd_select_bitmask(k, shf, i32x16::ZERO))
 }

 /// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by imm8 bits, and store the lower 32-bits in dst.
@ -1928,8 +1849,7 @@ pub unsafe fn _mm256_maskz_shrdi_epi32<const IMM8: i32>(
 ) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    let shf = _mm256_shrdi_epi32::<IMM8>(a, b).as_i32x8();
-    let zero = _mm256_setzero_si256().as_i32x8();
-    transmute(simd_select_bitmask(k, shf, zero))
+    transmute(simd_select_bitmask(k, shf, i32x8::ZERO))
 }

 /// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by imm8 bits, and store the lower 32-bits in dst.
@ -1979,8 +1899,7 @@ pub unsafe fn _mm_maskz_shrdi_epi32<const IMM8: i32>(
 ) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    let shf = _mm_shrdi_epi32::<IMM8>(a, b).as_i32x4();
-    let zero = _mm_setzero_si128().as_i32x4();
-    transmute(simd_select_bitmask(k, shf, zero))
+    transmute(simd_select_bitmask(k, shf, i32x4::ZERO))
 }

 /// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by imm8 bits, and store the lower 16-bits in dst.
@ -2030,8 +1949,7 @@ pub unsafe fn _mm512_maskz_shrdi_epi16<const IMM8: i32>(
 ) -> __m512i {
    static_assert_uimm_bits!(IMM8, 8);
    let shf = _mm512_shrdi_epi16::<IMM8>(a, b).as_i16x32();
-    let zero = _mm512_setzero_si512().as_i16x32();
-    transmute(simd_select_bitmask(k, shf, zero))
+    transmute(simd_select_bitmask(k, shf, i16x32::ZERO))
 }

 /// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by imm8 bits, and store the lower 16-bits in dst.
@ -2081,8 +1999,7 @@ pub unsafe fn _mm256_maskz_shrdi_epi16<const IMM8: i32>(
 ) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    let shf = _mm256_shrdi_epi16::<IMM8>(a, b).as_i16x16();
-    let zero = _mm256_setzero_si256().as_i16x16();
-    transmute(simd_select_bitmask(k, shf, zero))
+    transmute(simd_select_bitmask(k, shf, i16x16::ZERO))
 }

 /// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by imm8 bits, and store the lower 16-bits in dst.
@ -2132,8 +2049,7 @@ pub unsafe fn _mm_maskz_shrdi_epi16<const IMM8: i32>(
 ) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    let shf = _mm_shrdi_epi16::<IMM8>(a, b).as_i16x8();
-    let zero = _mm_setzero_si128().as_i16x8();
-    transmute(simd_select_bitmask(k, shf, zero))
+    transmute(simd_select_bitmask(k, shf, i16x8::ZERO))
 }

 #[allow(improper_ctypes)]
--- a/library/stdarch/crates/core_arch/src/x86/avx512vnni.rs
+++ b/library/stdarch/crates/core_arch/src/x86/avx512vnni.rs
@ -46,8 +46,7 @@ pub unsafe fn _mm512_maskz_dpwssd_epi32(
    b: __m512i,
 ) -> __m512i {
    let r = _mm512_dpwssd_epi32(src, a, b).as_i32x16();
-    let zero = _mm512_setzero_si512().as_i32x16();
-    transmute(simd_select_bitmask(k, r, zero))
+    transmute(simd_select_bitmask(k, r, i32x16::ZERO))
 }

 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
@ -106,8 +105,7 @@ pub unsafe fn _mm256_maskz_dpwssd_epi32(
    b: __m256i,
 ) -> __m256i {
    let r = _mm256_dpwssd_epi32(src, a, b).as_i32x8();
-    let zero = _mm256_setzero_si256().as_i32x8();
-    transmute(simd_select_bitmask(k, r, zero))
+    transmute(simd_select_bitmask(k, r, i32x8::ZERO))
 }

 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
@ -156,8 +154,7 @@ pub unsafe fn _mm_mask_dpwssd_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __
 #[cfg_attr(test, assert_instr(vpdpwssd))]
 pub unsafe fn _mm_maskz_dpwssd_epi32(k: __mmask8, src: __m128i, a: __m128i, b: __m128i) -> __m128i {
    let r = _mm_dpwssd_epi32(src, a, b).as_i32x4();
-    let zero = _mm_setzero_si128().as_i32x4();
-    transmute(simd_select_bitmask(k, r, zero))
+    transmute(simd_select_bitmask(k, r, i32x4::ZERO))
 }

 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
@ -202,8 +199,7 @@ pub unsafe fn _mm512_maskz_dpwssds_epi32(
    b: __m512i,
 ) -> __m512i {
    let r = _mm512_dpwssds_epi32(src, a, b).as_i32x16();
-    let zero = _mm512_setzero_si512().as_i32x16();
-    transmute(simd_select_bitmask(k, r, zero))
+    transmute(simd_select_bitmask(k, r, i32x16::ZERO))
 }

 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
@ -262,8 +258,7 @@ pub unsafe fn _mm256_maskz_dpwssds_epi32(
    b: __m256i,
 ) -> __m256i {
    let r = _mm256_dpwssds_epi32(src, a, b).as_i32x8();
-    let zero = _mm256_setzero_si256().as_i32x8();
-    transmute(simd_select_bitmask(k, r, zero))
+    transmute(simd_select_bitmask(k, r, i32x8::ZERO))
 }

 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
@ -317,8 +312,7 @@ pub unsafe fn _mm_maskz_dpwssds_epi32(
    b: __m128i,
 ) -> __m128i {
    let r = _mm_dpwssds_epi32(src, a, b).as_i32x4();
-    let zero = _mm_setzero_si128().as_i32x4();
-    transmute(simd_select_bitmask(k, r, zero))
+    transmute(simd_select_bitmask(k, r, i32x4::ZERO))
 }

 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
@ -363,8 +357,7 @@ pub unsafe fn _mm512_maskz_dpbusd_epi32(
    b: __m512i,
 ) -> __m512i {
    let r = _mm512_dpbusd_epi32(src, a, b).as_i32x16();
-    let zero = _mm512_setzero_si512().as_i32x16();
-    transmute(simd_select_bitmask(k, r, zero))
+    transmute(simd_select_bitmask(k, r, i32x16::ZERO))
 }

 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
@ -423,8 +416,7 @@ pub unsafe fn _mm256_maskz_dpbusd_epi32(
    b: __m256i,
 ) -> __m256i {
    let r = _mm256_dpbusd_epi32(src, a, b).as_i32x8();
-    let zero = _mm256_setzero_si256().as_i32x8();
-    transmute(simd_select_bitmask(k, r, zero))
+    transmute(simd_select_bitmask(k, r, i32x8::ZERO))
 }

 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
@ -473,8 +465,7 @@ pub unsafe fn _mm_mask_dpbusd_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __
 #[cfg_attr(test, assert_instr(vpdpbusd))]
 pub unsafe fn _mm_maskz_dpbusd_epi32(k: __mmask8, src: __m128i, a: __m128i, b: __m128i) -> __m128i {
    let r = _mm_dpbusd_epi32(src, a, b).as_i32x4();
-    let zero = _mm_setzero_si128().as_i32x4();
-    transmute(simd_select_bitmask(k, r, zero))
+    transmute(simd_select_bitmask(k, r, i32x4::ZERO))
 }

 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
@ -519,8 +510,7 @@ pub unsafe fn _mm512_maskz_dpbusds_epi32(
    b: __m512i,
 ) -> __m512i {
    let r = _mm512_dpbusds_epi32(src, a, b).as_i32x16();
-    let zero = _mm512_setzero_si512().as_i32x16();
-    transmute(simd_select_bitmask(k, r, zero))
+    transmute(simd_select_bitmask(k, r, i32x16::ZERO))
 }

 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
@ -579,8 +569,7 @@ pub unsafe fn _mm256_maskz_dpbusds_epi32(
    b: __m256i,
 ) -> __m256i {
    let r = _mm256_dpbusds_epi32(src, a, b).as_i32x8();
-    let zero = _mm256_setzero_si256().as_i32x8();
-    transmute(simd_select_bitmask(k, r, zero))
+    transmute(simd_select_bitmask(k, r, i32x8::ZERO))
 }

 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
@ -634,8 +623,7 @@ pub unsafe fn _mm_maskz_dpbusds_epi32(
    b: __m128i,
 ) -> __m128i {
    let r = _mm_dpbusds_epi32(src, a, b).as_i32x4();
-    let zero = _mm_setzero_si128().as_i32x4();
-    transmute(simd_select_bitmask(k, r, zero))
+    transmute(simd_select_bitmask(k, r, i32x4::ZERO))
 }

 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding signed 8-bit
--- a/library/stdarch/crates/core_arch/src/x86/avx512vpopcntdq.rs
+++ b/library/stdarch/crates/core_arch/src/x86/avx512vpopcntdq.rs
@ -7,14 +7,12 @@
 //!
 //! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf

+use crate::core_arch::simd::*;
 use crate::core_arch::x86::__m128i;
 use crate::core_arch::x86::__m256i;
 use crate::core_arch::x86::__m512i;
 use crate::core_arch::x86::__mmask16;
 use crate::core_arch::x86::__mmask8;
-use crate::core_arch::x86::_mm256_setzero_si256;
-use crate::core_arch::x86::_mm512_setzero_si512;
-use crate::core_arch::x86::_mm_setzero_si128;
 use crate::core_arch::x86::m128iExt;
 use crate::core_arch::x86::m256iExt;
 use crate::core_arch::x86::m512iExt;
@ -46,8 +44,11 @@ pub unsafe fn _mm512_popcnt_epi32(a: __m512i) -> __m512i {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpopcntd))]
 pub unsafe fn _mm512_maskz_popcnt_epi32(k: __mmask16, a: __m512i) -> __m512i {
-    let zero = _mm512_setzero_si512().as_i32x16();
-    transmute(simd_select_bitmask(k, simd_ctpop(a.as_i32x16()), zero))
+    transmute(simd_select_bitmask(
+        k,
+        simd_ctpop(a.as_i32x16()),
+        i32x16::ZERO,
+    ))
 }

 /// For each packed 32-bit integer maps the value to the number of logical 1 bits.
@ -90,8 +91,11 @@ pub unsafe fn _mm256_popcnt_epi32(a: __m256i) -> __m256i {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpopcntd))]
 pub unsafe fn _mm256_maskz_popcnt_epi32(k: __mmask8, a: __m256i) -> __m256i {
-    let zero = _mm256_setzero_si256().as_i32x8();
-    transmute(simd_select_bitmask(k, simd_ctpop(a.as_i32x8()), zero))
+    transmute(simd_select_bitmask(
+        k,
+        simd_ctpop(a.as_i32x8()),
+        i32x8::ZERO,
+    ))
 }

 /// For each packed 32-bit integer maps the value to the number of logical 1 bits.
@ -134,8 +138,11 @@ pub unsafe fn _mm_popcnt_epi32(a: __m128i) -> __m128i {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpopcntd))]
 pub unsafe fn _mm_maskz_popcnt_epi32(k: __mmask8, a: __m128i) -> __m128i {
-    let zero = _mm_setzero_si128().as_i32x4();
-    transmute(simd_select_bitmask(k, simd_ctpop(a.as_i32x4()), zero))
+    transmute(simd_select_bitmask(
+        k,
+        simd_ctpop(a.as_i32x4()),
+        i32x4::ZERO,
+    ))
 }

 /// For each packed 32-bit integer maps the value to the number of logical 1 bits.
@ -178,8 +185,11 @@ pub unsafe fn _mm512_popcnt_epi64(a: __m512i) -> __m512i {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpopcntq))]
 pub unsafe fn _mm512_maskz_popcnt_epi64(k: __mmask8, a: __m512i) -> __m512i {
-    let zero = _mm512_setzero_si512().as_i64x8();
-    transmute(simd_select_bitmask(k, simd_ctpop(a.as_i64x8()), zero))
+    transmute(simd_select_bitmask(
+        k,
+        simd_ctpop(a.as_i64x8()),
+        i64x8::ZERO,
+    ))
 }

 /// For each packed 64-bit integer maps the value to the number of logical 1 bits.
@ -222,8 +232,11 @@ pub unsafe fn _mm256_popcnt_epi64(a: __m256i) -> __m256i {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpopcntq))]
 pub unsafe fn _mm256_maskz_popcnt_epi64(k: __mmask8, a: __m256i) -> __m256i {
-    let zero = _mm256_setzero_si256().as_i64x4();
-    transmute(simd_select_bitmask(k, simd_ctpop(a.as_i64x4()), zero))
+    transmute(simd_select_bitmask(
+        k,
+        simd_ctpop(a.as_i64x4()),
+        i64x4::ZERO,
+    ))
 }

 /// For each packed 64-bit integer maps the value to the number of logical 1 bits.
@ -266,8 +279,11 @@ pub unsafe fn _mm_popcnt_epi64(a: __m128i) -> __m128i {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpopcntq))]
 pub unsafe fn _mm_maskz_popcnt_epi64(k: __mmask8, a: __m128i) -> __m128i {
-    let zero = _mm_setzero_si128().as_i64x2();
-    transmute(simd_select_bitmask(k, simd_ctpop(a.as_i64x2()), zero))
+    transmute(simd_select_bitmask(
+        k,
+        simd_ctpop(a.as_i64x2()),
+        i64x2::ZERO,
+    ))
 }

 /// For each packed 64-bit integer maps the value to the number of logical 1 bits.
--- a/library/stdarch/crates/core_arch/src/x86/gfni.rs
+++ b/library/stdarch/crates/core_arch/src/x86/gfni.rs
@ -16,9 +16,6 @@ use crate::core_arch::x86::__m512i;
 use crate::core_arch::x86::__mmask16;
 use crate::core_arch::x86::__mmask32;
 use crate::core_arch::x86::__mmask64;
-use crate::core_arch::x86::_mm256_setzero_si256;
-use crate::core_arch::x86::_mm512_setzero_si512;
-use crate::core_arch::x86::_mm_setzero_si128;
 use crate::core_arch::x86::m128iExt;
 use crate::core_arch::x86::m256iExt;
 use crate::core_arch::x86::m512iExt;
@ -110,7 +107,7 @@ pub unsafe fn _mm512_mask_gf2p8mul_epi8(
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vgf2p8mulb))]
 pub unsafe fn _mm512_maskz_gf2p8mul_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
-    let zero = _mm512_setzero_si512().as_i8x64();
+    let zero = i8x64::ZERO;
    transmute(simd_select_bitmask(
        k,
        vgf2p8mulb_512(a.as_i8x64(), b.as_i8x64()),
@ -169,7 +166,7 @@ pub unsafe fn _mm256_mask_gf2p8mul_epi8(
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vgf2p8mulb))]
 pub unsafe fn _mm256_maskz_gf2p8mul_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
-    let zero = _mm256_setzero_si256().as_i8x32();
+    let zero = i8x32::ZERO;
    transmute(simd_select_bitmask(
        k,
        vgf2p8mulb_256(a.as_i8x32(), b.as_i8x32()),
@ -228,7 +225,7 @@ pub unsafe fn _mm_mask_gf2p8mul_epi8(
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vgf2p8mulb))]
 pub unsafe fn _mm_maskz_gf2p8mul_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
-    let zero = _mm_setzero_si128().as_i8x16();
+    let zero = i8x16::ZERO;
    transmute(simd_select_bitmask(
        k,
        vgf2p8mulb_128(a.as_i8x16(), b.as_i8x16()),
@ -277,7 +274,7 @@ pub unsafe fn _mm512_maskz_gf2p8affine_epi64_epi8<const B: i32>(
 ) -> __m512i {
    static_assert_uimm_bits!(B, 8);
    let b = B as u8;
-    let zero = _mm512_setzero_si512().as_i8x64();
+    let zero = i8x64::ZERO;
    let x = x.as_i8x64();
    let a = a.as_i8x64();
    let r = vgf2p8affineqb_512(x, a, b);
@ -353,7 +350,7 @@ pub unsafe fn _mm256_maskz_gf2p8affine_epi64_epi8<const B: i32>(
 ) -> __m256i {
    static_assert_uimm_bits!(B, 8);
    let b = B as u8;
-    let zero = _mm256_setzero_si256().as_i8x32();
+    let zero = i8x32::ZERO;
    let x = x.as_i8x32();
    let a = a.as_i8x32();
    let r = vgf2p8affineqb_256(x, a, b);
@ -429,7 +426,7 @@ pub unsafe fn _mm_maskz_gf2p8affine_epi64_epi8<const B: i32>(
 ) -> __m128i {
    static_assert_uimm_bits!(B, 8);
    let b = B as u8;
-    let zero = _mm_setzero_si128().as_i8x16();
+    let zero = i8x16::ZERO;
    let x = x.as_i8x16();
    let a = a.as_i8x16();
    let r = vgf2p8affineqb_128(x, a, b);
@ -509,7 +506,7 @@ pub unsafe fn _mm512_maskz_gf2p8affineinv_epi64_epi8<const B: i32>(
 ) -> __m512i {
    static_assert_uimm_bits!(B, 8);
    let b = B as u8;
-    let zero = _mm512_setzero_si512().as_i8x64();
+    let zero = i8x64::ZERO;
    let x = x.as_i8x64();
    let a = a.as_i8x64();
    let r = vgf2p8affineinvqb_512(x, a, b);
@ -591,7 +588,7 @@ pub unsafe fn _mm256_maskz_gf2p8affineinv_epi64_epi8<const B: i32>(
 ) -> __m256i {
    static_assert_uimm_bits!(B, 8);
    let b = B as u8;
-    let zero = _mm256_setzero_si256().as_i8x32();
+    let zero = i8x32::ZERO;
    let x = x.as_i8x32();
    let a = a.as_i8x32();
    let r = vgf2p8affineinvqb_256(x, a, b);
@ -673,7 +670,7 @@ pub unsafe fn _mm_maskz_gf2p8affineinv_epi64_epi8<const B: i32>(
 ) -> __m128i {
    static_assert_uimm_bits!(B, 8);
    let b = B as u8;
-    let zero = _mm_setzero_si128().as_i8x16();
+    let zero = i8x16::ZERO;
    let x = x.as_i8x16();
    let a = a.as_i8x16();
    let r = vgf2p8affineinvqb_128(x, a, b);
--- a/library/stdarch/crates/core_arch/src/x86/sse.rs
+++ b/library/stdarch/crates/core_arch/src/x86/sse.rs
@ -983,7 +983,7 @@ pub unsafe fn _mm_setr_ps(a: f32, b: f32, c: f32, d: f32) -> __m128 {
 #[cfg_attr(test, assert_instr(xorps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_setzero_ps() -> __m128 {
-    __m128([0.0, 0.0, 0.0, 0.0])
+    const { mem::zeroed() }
 }

 /// A utility function for creating masks to use with Intel shuffle and
@ -1089,7 +1089,7 @@ pub unsafe fn _mm_movelh_ps(a: __m128, b: __m128) -> __m128 {
 pub unsafe fn _mm_movemask_ps(a: __m128) -> i32 {
    // Propagate the highest bit to the rest, because simd_bitmask
    // requires all-1 or all-0.
-    let mask: i32x4 = simd_lt(transmute(a), i32x4::splat(0));
+    let mask: i32x4 = simd_lt(transmute(a), i32x4::ZERO);
    simd_bitmask::<i32x4, u8>(mask).into()
 }

@ -1881,7 +1881,7 @@ pub unsafe fn _mm_prefetch<const STRATEGY: i32>(p: *const i8) {
 #[target_feature(enable = "sse")]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_undefined_ps() -> __m128 {
-    _mm_set1_ps(0.0)
+    const { mem::zeroed() }
 }

 /// Transpose the 4x4 matrix formed by 4 rows of __m128 in place.
--- a/library/stdarch/crates/core_arch/src/x86/sse2.rs
+++ b/library/stdarch/crates/core_arch/src/x86/sse2.rs
@ -455,9 +455,8 @@ unsafe fn _mm_slli_si128_impl<const IMM8: i32>(a: __m128i) -> __m128i {
            16 - shift + i
        }
    }
-    let zero = _mm_set1_epi8(0).as_i8x16();
    transmute::<i8x16, _>(simd_shuffle!(
-        zero,
+        i8x16::ZERO,
        a.as_i8x16(),
        [
            mask(IMM8, 0),
@ -670,10 +669,9 @@ unsafe fn _mm_srli_si128_impl<const IMM8: i32>(a: __m128i) -> __m128i {
            i + (shift as u32)
        }
    }
-    let zero = _mm_set1_epi8(0).as_i8x16();
    let x: i8x16 = simd_shuffle!(
        a.as_i8x16(),
-        zero,
+        i8x16::ZERO,
        [
            mask(IMM8, 0),
            mask(IMM8, 1),
@ -1191,7 +1189,7 @@ pub unsafe fn _mm_setr_epi8(
 #[cfg_attr(test, assert_instr(xorps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_setzero_si128() -> __m128i {
-    _mm_set1_epi64x(0)
+    const { mem::zeroed() }
 }

 /// Loads 64-bit integer from memory into first element of returned vector.
@ -1359,8 +1357,7 @@ pub unsafe fn _mm_stream_si32(mem_addr: *mut i32, a: i32) {
 )]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_move_epi64(a: __m128i) -> __m128i {
-    let zero = _mm_setzero_si128();
-    let r: i64x2 = simd_shuffle!(a.as_i64x2(), zero.as_i64x2(), [0, 2]);
+    let r: i64x2 = simd_shuffle!(a.as_i64x2(), i64x2::ZERO, [0, 2]);
    transmute(r)
 }

@ -1434,7 +1431,7 @@ pub unsafe fn _mm_insert_epi16<const IMM8: i32>(a: __m128i, i: i32) -> __m128i {
 #[cfg_attr(test, assert_instr(pmovmskb))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_movemask_epi8(a: __m128i) -> i32 {
-    let z = i8x16::splat(0);
+    let z = i8x16::ZERO;
    let m: i8x16 = simd_lt(a.as_i8x16(), z);
    simd_bitmask::<_, u16>(m) as u32 as i32
 }
@ -2267,7 +2264,7 @@ pub unsafe fn _mm_ucomineq_sd(a: __m128d, b: __m128d) -> i32 {
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_cvtpd_ps(a: __m128d) -> __m128 {
    let r = simd_cast::<_, f32x2>(a.as_f64x2());
-    let zero = f32x2::new(0.0, 0.0);
+    let zero = f32x2::ZERO;
    transmute::<f32x4, _>(simd_shuffle!(r, zero, [0, 1, 2, 3]))
 }

@ -2447,7 +2444,7 @@ pub unsafe fn _mm_setr_pd(a: f64, b: f64) -> __m128d {
 #[cfg_attr(test, assert_instr(xorp))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_setzero_pd() -> __m128d {
-    _mm_set_pd(0.0, 0.0)
+    const { mem::zeroed() }
 }

 /// Returns a mask of the most significant bit of each element in `a`.
@ -2463,7 +2460,7 @@ pub unsafe fn _mm_setzero_pd() -> __m128d {
 pub unsafe fn _mm_movemask_pd(a: __m128d) -> i32 {
    // Propagate the highest bit to the rest, because simd_bitmask
    // requires all-1 or all-0.
-    let mask: i64x2 = simd_lt(transmute(a), i64x2::splat(0));
+    let mask: i64x2 = simd_lt(transmute(a), i64x2::ZERO);
    simd_bitmask::<i64x2, u8>(mask).into()
 }

@ -2902,7 +2899,7 @@ pub unsafe fn _mm_castsi128_ps(a: __m128i) -> __m128 {
 #[target_feature(enable = "sse2")]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_undefined_pd() -> __m128d {
-    __m128d([0.0, 0.0])
+    const { mem::zeroed() }
 }

 /// Returns vector of type __m128i with indeterminate elements.
@ -2914,7 +2911,7 @@ pub unsafe fn _mm_undefined_pd() -> __m128d {
 #[target_feature(enable = "sse2")]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_undefined_si128() -> __m128i {
-    __m128i([0, 0])
+    const { mem::zeroed() }
 }

 /// The resulting `__m128d` element is composed by the low-order values of
--- a/library/stdarch/crates/core_arch/src/x86/sse41.rs
+++ b/library/stdarch/crates/core_arch/src/x86/sse41.rs
@ -60,7 +60,7 @@ pub const _MM_FROUND_NEARBYINT: i32 = _MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTI
 #[cfg_attr(test, assert_instr(pblendvb))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_blendv_epi8(a: __m128i, b: __m128i, mask: __m128i) -> __m128i {
-    let mask: i8x16 = simd_lt(mask.as_i8x16(), i8x16::splat(0));
+    let mask: i8x16 = simd_lt(mask.as_i8x16(), i8x16::ZERO);
    transmute(simd_select(mask, b.as_i8x16(), a.as_i8x16()))
 }

@ -103,7 +103,7 @@ pub unsafe fn _mm_blend_epi16<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128
 #[cfg_attr(test, assert_instr(blendvpd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_blendv_pd(a: __m128d, b: __m128d, mask: __m128d) -> __m128d {
-    let mask: i64x2 = simd_lt(transmute::<_, i64x2>(mask), i64x2::splat(0));
+    let mask: i64x2 = simd_lt(transmute::<_, i64x2>(mask), i64x2::ZERO);
    transmute(simd_select(mask, b.as_f64x2(), a.as_f64x2()))
 }

@ -116,7 +116,7 @@ pub unsafe fn _mm_blendv_pd(a: __m128d, b: __m128d, mask: __m128d) -> __m128d {
 #[cfg_attr(test, assert_instr(blendvps))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_blendv_ps(a: __m128, b: __m128, mask: __m128) -> __m128 {
-    let mask: i32x4 = simd_lt(transmute::<_, i32x4>(mask), i32x4::splat(0));
+    let mask: i32x4 = simd_lt(transmute::<_, i32x4>(mask), i32x4::ZERO);
    transmute(simd_select(mask, b.as_f32x4(), a.as_f32x4()))
 }

--- a/library/stdarch/crates/core_arch/src/x86/ssse3.rs
+++ b/library/stdarch/crates/core_arch/src/x86/ssse3.rs
@ -18,7 +18,7 @@ use stdarch_test::assert_instr;
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_abs_epi8(a: __m128i) -> __m128i {
    let a = a.as_i8x16();
-    let zero = i8x16::splat(0);
+    let zero = i8x16::ZERO;
    let r = simd_select::<m8x16, _>(simd_lt(a, zero), simd_neg(a), a);
    transmute(r)
 }
@ -34,7 +34,7 @@ pub unsafe fn _mm_abs_epi8(a: __m128i) -> __m128i {
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_abs_epi16(a: __m128i) -> __m128i {
    let a = a.as_i16x8();
-    let zero = i16x8::splat(0);
+    let zero = i16x8::ZERO;
    let r = simd_select::<m16x8, _>(simd_lt(a, zero), simd_neg(a), a);
    transmute(r)
 }
@ -50,7 +50,7 @@ pub unsafe fn _mm_abs_epi16(a: __m128i) -> __m128i {
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub unsafe fn _mm_abs_epi32(a: __m128i) -> __m128i {
    let a = a.as_i32x4();
-    let zero = i32x4::splat(0);
+    let zero = i32x4::ZERO;
    let r = simd_select::<m32x4, _>(simd_lt(a, zero), simd_neg(a), a);
    transmute(r)
 }
@ -103,12 +103,12 @@ pub unsafe fn _mm_alignr_epi8<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128
    // If palignr is shifting the pair of vectors more than the size of two
    // lanes, emit zero.
    if IMM8 > 32 {
-        return _mm_set1_epi8(0);
+        return _mm_setzero_si128();
    }
    // If palignr is shifting the pair of input vectors more than one lane,
    // but less than two lanes, convert to shifting in zeroes.
    let (a, b) = if IMM8 > 16 {
-        (_mm_set1_epi8(0), a)
+        (_mm_setzero_si128(), a)
    } else {
        (a, b)
    };