diff --git a/library/stdarch/crates/core_arch/avx512f.md b/library/stdarch/crates/core_arch/avx512f.md
index 0fcf4d96417b..b564ec5eecf2 100644
--- a/library/stdarch/crates/core_arch/avx512f.md
+++ b/library/stdarch/crates/core_arch/avx512f.md
@@ -1176,50 +1176,58 @@
   * [x] [`_mm256_mask_andnot_epi64`]
   * [x] [`_mm256_maskz_andnot_epi64`]
   * [x] [`_mm512_andnot_si512`]
-  * [x] [`_mm512_mask_unpackhi_epi32`]
   * [x] [`_mm512_unpackhi_epi32`]
+  * [x] [`_mm512_mask_unpackhi_epi32`]
+  * [x] [`_mm512_maskz_unpackhi_epi32`]
   * [x] [`_mm_mask_unpackhi_epi32`]
   * [x] [`_mm_maskz_unpackhi_epi32`]
   * [x] [`_mm256_mask_unpackhi_epi32`]
   * [x] [`_mm256_maskz_unpackhi_epi32`]
   * [x] [`_mm512_unpackhi_epi64`]
   * [x] [`_mm512_mask_unpackhi_epi64`]
+  * [x] [`_mm512_maskz_unpackhi_epi64`]
   * [x] [`_mm_mask_unpackhi_epi64`]
   * [x] [`_mm_maskz_unpackhi_epi64`]
   * [x] [`_mm256_mask_unpackhi_epi64`]
   * [x] [`_mm256_maskz_unpackhi_epi64`]
   * [x] [`_mm512_unpackhi_ps`]
   * [x] [`_mm512_mask_unpackhi_ps`]
+  * [x] [`_mm512_maskz_unpackhi_ps`]
   * [x] [`_mm_mask_unpackhi_ps`]
   * [x] [`_mm_maskz_unpackhi_ps`]
   * [x] [`_mm256_mask_unpackhi_ps`]
   * [x] [`_mm256_maskz_unpackhi_ps`]
   * [x] [`_mm512_unpackhi_pd`]
   * [x] [`_mm512_mask_unpackhi_pd`]
+  * [x] [`_mm512_maskz_unpackhi_pd`]
   * [x] [`_mm_mask_unpackhi_pd`]
   * [x] [`_mm_maskz_unpackhi_pd`]
   * [x] [`_mm256_mask_unpackhi_pd`]
   * [x] [`_mm256_maskz_unpackhi_pd`]
-  * [x] [`_mm512_mask_unpacklo_epi32`]
   * [x] [`_mm512_unpacklo_epi32`]
+  * [x] [`_mm512_mask_unpacklo_epi32`]
+  * [x] [`_mm512_maskz_unpacklo_epi32`]
   * [x] [`_mm_mask_unpacklo_epi32`]
   * [x] [`_mm_maskz_unpacklo_epi32`]
   * [x] [`_mm256_mask_unpacklo_epi32`]
   * [x] [`_mm256_maskz_unpacklo_epi32`]
   * [x] [`_mm512_unpacklo_epi64`]
   * [x] [`_mm512_mask_unpacklo_epi64`]
+  * [x] [`_mm512_maskz_unpacklo_epi64`]
   * [x] [`_mm_mask_unpacklo_epi64`]
   * [x] [`_mm_maskz_unpacklo_epi64`]
   * [x] [`_mm256_mask_unpacklo_epi64`]
   * [x] [`_mm256_maskz_unpacklo_epi64`]
   * [x] [`_mm512_unpacklo_ps`]
   * [x] [`_mm512_mask_unpacklo_ps`]
+  * [x] [`_mm512_maskz_unpacklo_ps`]
   * [x] [`_mm_mask_unpacklo_ps`]
   * [x] [`_mm_maskz_unpacklo_ps`]
   * [x] [`_mm256_mask_unpacklo_ps`]
   * [x] [`_mm256_maskz_unpacklo_ps`]
   * [x] [`_mm512_unpacklo_pd`]
   * [x] [`_mm512_mask_unpacklo_pd`]
+  * [x] [`_mm512_maskz_unpacklo_pd`]
   * [x] [`_mm_mask_unpacklo_pd`]
   * [x] [`_mm_maskz_unpacklo_pd`]
   * [x] [`_mm256_mask_unpacklo_pd`]
@@ -1282,39 +1290,46 @@
   * [x] [`_mm256_maskz_broadcastsd_pd`]
   * [x] [`_mm512_shuffle_epi32`]
   * [x] [`_mm512_mask_shuffle_epi32`]
+  * [x] [`_mm512_maskz_shuffle_epi32`]
   * [x] [`_mm_mask_shuffle_epi32`]
   * [x] [`_mm_maskz_shuffle_epi32`]
   * [x] [`_mm256_mask_shuffle_epi32`]
   * [x] [`_mm256_maskz_shuffle_epi32`]
   * [x] [`_mm512_shuffle_ps`]
   * [x] [`_mm512_mask_shuffle_ps`]
+  * [x] [`_mm512_maskz_shuffle_ps`]
   * [x] [`_mm_mask_shuffle_ps`]
   * [x] [`_mm_maskz_shuffle_ps`]
   * [x] [`_mm256_mask_shuffle_ps`]
   * [x] [`_mm256_maskz_shuffle_ps`]
   * [x] [`_mm512_shuffle_pd`]
   * [x] [`_mm512_mask_shuffle_pd`]
+  * [x] [`_mm512_maskz_shuffle_pd`]
   * [x] [`_mm_mask_shuffle_pd`]
   * [x] [`_mm_maskz_shuffle_pd`]
   * [x] [`_mm256_mask_shuffle_pd`]
   * [x] [`_mm256_maskz_shuffle_pd`]
   * [x] [`_mm512_shuffle_i32x4`]
   * [x] [`_mm512_mask_shuffle_i32x4`]
+  * [x] [`_mm512_maskz_shuffle_i32x4`]
   * [x] [`_mm256_mask_shuffle_i32x4`]
   * [x] [`_mm256_maskz_shuffle_i32x4`]
   * [x] [`_mm256_shuffle_i32x4`]
   * [x] [`_mm512_shuffle_i64x2`]
   * [x] [`_mm512_mask_shuffle_i64x2`]
+  * [x] [`_mm512_maskz_shuffle_i64x2`]
   * [x] [`_mm256_mask_shuffle_i64x2`]
   * [x] [`_mm256_maskz_shuffle_i64x2`]
   * [x] [`_mm256_shuffle_i64x2`]
   * [x] [`_mm512_shuffle_f32x4`]
   * [x] [`_mm512_mask_shuffle_f32x4`]
+  * [x] [`_mm512_maskz_shuffle_f32x4`]
   * [x] [`_mm256_mask_shuffle_f32x4`]
   * [x] [`_mm256_maskz_shuffle_f32x4`]
   * [x] [`_mm256_shuffle_f32x4`]
   * [x] [`_mm512_shuffle_f64x2`]
   * [x] [`_mm512_mask_shuffle_f64x2`]
+  * [x] [`_mm512_maskz_shuffle_f64x2`]
   * [x] [`_mm256_mask_shuffle_f64x2`]
   * [x] [`_mm256_maskz_shuffle_f64x2`]
   * [x] [`_mm256_shuffle_f64x2`]
@@ -1336,6 +1351,68 @@
   * [x] [`_mm256_alignr_epi64`]
   * [x] [`_mm256_mask_alignr_epi64`]
   * [x] [`_mm256_maskz_alignr_epi64`]
+  * [x] [`_mm512_permute_ps`]
+  * [x] [`_mm512_mask_permute_ps`]
+  * [x] [`_mm512_maskz_permute_ps`]
+
+  * [x] [`_mm512_permute_pd`]
+  * [x] [`_mm512_mask_permute_pd`]
+  * [x] [`_mm512_maskz_permute_pd`]
+
+  * [x] [`_mm512_permutevar_epi32`]
+  * [x] [`_mm512_mask_permutevar_epi32`]
+
+  * [x] [`_mm512_permutevar_ps`]
+  * [x] [`_mm512_mask_permutevar_ps`]
+  * [x] [`_mm512_maskz_permutevar_ps`]
+
+  * [x] [`_mm512_permutevar_pd`]
+  * [x] [`_mm512_mask_permutevar_pd`]
+  * [x] [`_mm512_maskz_permutevar_pd`]
+
+  * [x] [`_mm512_permutex2var_epi32`]
+  * [x] [`_mm512_mask_permutex2var_epi32`]
+  * [x] [`_mm512_maskz_permutex2var_epi32`]
+  * [x] [`_mm512_mask2_permutex2var_epi32`]
+
+  * [x] [`_mm512_permutex2var_epi64`]
+  * [x] [`_mm512_mask_permutex2var_epi64`]
+  * [x] [`_mm512_maskz_permutex2var_epi64`]
+  * [x] [`_mm512_mask2_permutex2var_epi64`]
+
+  * [x] [`_mm512_permutex2var_ps`]
+  * [x] [`_mm512_mask_permutex2var_ps`]
+  * [x] [`_mm512_maskz_permutex2var_ps`]
+  * [x] [`_mm512_mask2_permutex2var_ps`]
+
+  * [x] [`_mm512_permutex2var_pd`]
+  * [x] [`_mm512_mask_permutex2var_pd`]
+  * [x] [`_mm512_maskz_permutex2var_pd`]
+  * [x] [`_mm512_mask2_permutex2var_pd`]
+
+  * [x] [`_mm512_permutex_epi64`]
+  * [x] [`_mm512_mask_permutex_epi64`]
+  * [x] [`_mm512_maskz_permutex_epi64`]
+
+  * [x] [`_mm512_permutex_pd`]
+  * [x] [`_mm512_mask_permutex_pd`]
+  * [x] [`_mm512_maskz_permutex_pd`]
+
+  * [x] [`_mm512_permutexvar_epi32`]
+  * [x] [`_mm512_mask_permutexvar_epi32`]
+  * [x] [`_mm512_maskz_permutexvar_epi32`]
+
+  * [x] [`_mm512_permutexvar_epi64`]
+  * [x] [`_mm512_mask_permutexvar_epi64`]
+  * [x] [`_mm512_maskz_permutexvar_epi64`]
+
+  * [x] [`_mm512_permutexvar_ps`]
+  * [x] [`_mm512_mask_permutexvar_ps`]
+  * [x] [`_mm512_maskz_permutexvar_ps`]
+
+  * [x] [`_mm512_permutexvar_pd`]
+  * [x] [`_mm512_mask_permutexvar_pd`]
+  * [x] [`_mm512_maskz_permutexvar_pd`]
 
   * [x] [`_mm512_castpd128_pd512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castpd128_pd512&expand=5236)
   * [x] [`_mm512_castpd256_pd512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castpd256_pd512&expand=5236)
@@ -1470,10 +1547,6 @@
   * [x] [`_mm512_loadu_epi32`]
   * [x] [`_mm512_loadu_epi64`]
   * [x] [`_mm512_loadu_si512`]
-  * [x] [`_mm512_mask2_permutex2var_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask2_permutex2var_epi32&expand=5236)
-  * [x] [`_mm512_mask2_permutex2var_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask2_permutex2var_epi64&expand=5236)
-  * [x] [`_mm512_mask2_permutex2var_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask2_permutex2var_pd&expand=5236)
-  * [x] [`_mm512_mask2_permutex2var_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask2_permutex2var_ps&expand=5236)
   * [x] [`_mm512_mask2int`]
   * [x] [`_mm512_mask_compress_epi32`]
   * [x] [`_mm512_mask_compress_epi64`]
@@ -1608,21 +1681,6 @@
   * [ ] [`_mm512_mask_loadu_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_loadu_epi64&expand=5236)
   * [ ] [`_mm512_mask_loadu_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_loadu_pd&expand=5236)
   * [ ] [`_mm512_mask_loadu_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_loadu_ps&expand=5236)
-  * [x] [`_mm512_mask_permute_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permute_pd&expand=5236)
-  * [x] [`_mm512_mask_permute_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permute_ps&expand=5236)
-  * [x] [`_mm512_mask_permutevar_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permutevar_epi32&expand=5236)
-  * [x] [`_mm512_mask_permutevar_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permutevar_pd&expand=5236)
-  * [x] [`_mm512_mask_permutevar_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permutevar_ps&expand=5236)
-  * [x] [`_mm512_mask_permutex2var_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permutex2var_epi32&expand=5236)
-  * [x] [`_mm512_mask_permutex2var_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permutex2var_epi64&expand=5236)
-  * [x] [`_mm512_mask_permutex2var_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permutex2var_pd&expand=5236)
-  * [x] [`_mm512_mask_permutex2var_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permutex2var_ps&expand=5236)
-  * [x] [`_mm512_mask_permutex_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permutex_epi64&expand=5236)
-  * [x] [`_mm512_mask_permutex_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permutex_pd&expand=5236)
-  * [x] [`_mm512_mask_permutexvar_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permutexvar_epi32&expand=5236)
-  * [x] [`_mm512_mask_permutexvar_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permutexvar_epi64&expand=5236)
-  * [x] [`_mm512_mask_permutexvar_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permutexvar_pd&expand=5236)
-  * [x] [`_mm512_mask_permutexvar_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permutexvar_ps&expand=5236)
   * [x] [`_mm512_mask_set1_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_set1_epi32&expand=5236)
   * [x] [`_mm512_mask_set1_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_set1_epi64&expand=5236)
   * [ ] [`_mm512_mask_store_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_store_epi32&expand=5236)
@@ -1722,54 +1780,10 @@
   * [ ] [`_mm512_maskz_loadu_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_loadu_epi64&expand=5236)
   * [ ] [`_mm512_maskz_loadu_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_loadu_pd&expand=5236)
   * [ ] [`_mm512_maskz_loadu_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_loadu_ps&expand=5236)
-  * [x] [`_mm512_maskz_permute_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_permute_pd&expand=5236)
-  * [x] [`_mm512_maskz_permute_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_permute_ps&expand=5236)
-  * [x] [`_mm512_maskz_permutevar_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_permutevar_pd&expand=5236)
-  * [x] [`_mm512_maskz_permutevar_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_permutevar_ps&expand=5236)
-  * [x] [`_mm512_maskz_permutex2var_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_permutex2var_epi32&expand=5236)
-  * [x] [`_mm512_maskz_permutex2var_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_permutex2var_epi64&expand=5236)
-  * [x] [`_mm512_maskz_permutex2var_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_permutex2var_pd&expand=5236)
-  * [x] [`_mm512_maskz_permutex2var_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_permutex2var_ps&expand=5236)
-  * [x] [`_mm512_maskz_permutex_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_permutex_epi64&expand=5236)
-  * [x] [`_mm512_maskz_permutex_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_permutex_pd&expand=5236)
-  * [x] [`_mm512_maskz_permutexvar_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_permutexvar_epi32&expand=5236)
-  * [x] [`_mm512_maskz_permutexvar_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_permutexvar_epi64&expand=5236)
-  * [x] [`_mm512_maskz_permutexvar_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_permutexvar_pd&expand=5236)
-  * [x] [`_mm512_maskz_permutexvar_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_permutexvar_ps&expand=5236)
   * [x] [`_mm512_maskz_set1_epi32`]
   * [x] [`_mm512_maskz_set1_epi64`]
-  * [x] [`_mm512_maskz_shuffle_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shuffle_epi32&expand=5236)
-  * [x] [`_mm512_maskz_shuffle_f32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shuffle_f32x4&expand=5236)
-  * [x] [`_mm512_maskz_shuffle_f64x2`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shuffle_f64x2&expand=5236)
-  * [x] [`_mm512_maskz_shuffle_i32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shuffle_i32x4&expand=5236)
-  * [x] [`_mm512_maskz_shuffle_i64x2`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shuffle_i64x2&expand=5236)
-  * [x] [`_mm512_maskz_shuffle_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shuffle_pd&expand=5236)
-  * [x] [`_mm512_maskz_shuffle_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shuffle_ps&expand=5236)
   * [x] [`_mm512_maskz_ternarylogic_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_ternarylogic_epi32&expand=5236)
   * [x] [`_mm512_maskz_ternarylogic_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_ternarylogic_epi64&expand=5236)
-  * [x] [`_mm512_maskz_unpackhi_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpackhi_epi32&expand=5236)
-  * [x] [`_mm512_maskz_unpackhi_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpackhi_epi64&expand=5236)
-  * [x] [`_mm512_maskz_unpackhi_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpackhi_pd&expand=5236)
-  * [x] [`_mm512_maskz_unpackhi_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpackhi_ps&expand=5236)
-  * [x] [`_mm512_maskz_unpacklo_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpacklo_epi32&expand=5236)
-  * [x] [`_mm512_maskz_unpacklo_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpacklo_epi64&expand=5236)
-  * [x] [`_mm512_maskz_unpacklo_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpacklo_pd&expand=5236)
-  * [x] [`_mm512_maskz_unpacklo_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpacklo_ps&expand=5236)
-  * [x] [`_mm512_permute_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permute_pd&expand=5236)
-  * [x] [`_mm512_permute_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permute_ps&expand=5236)
-  * [x] [`_mm512_permutevar_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutevar_epi32&expand=5236)
-  * [x] [`_mm512_permutevar_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutevar_pd&expand=5236)
-  * [x] [`_mm512_permutevar_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutevar_ps&expand=5236)
-  * [x] [`_mm512_permutex2var_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutex2var_epi32&expand=5236)
-  * [x] [`_mm512_permutex2var_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutex2var_epi64&expand=5236)
-  * [x] [`_mm512_permutex2var_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutex2var_pd&expand=5236)
-  * [x] [`_mm512_permutex2var_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutex2var_ps&expand=5236)
-  * [x] [`_mm512_permutex_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutex_epi64&expand=5236)
-  * [x] [`_mm512_permutex_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutex_pd&expand=5236)
-  * [x] [`_mm512_permutexvar_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutexvar_epi32&expand=5236)
-  * [x] [`_mm512_permutexvar_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutexvar_epi64&expand=5236)
-  * [x] [`_mm512_permutexvar_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutexvar_pd&expand=5236)
-  * [x] [`_mm512_permutexvar_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutexvar_ps&expand=5236)
   * [x] [`_mm512_set1_epi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set1_epi16&expand=5236)
   * [x] [`_mm512_set1_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set1_epi32&expand=5236)
   * [x] [`_mm512_set1_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set1_epi64&expand=5236)
diff --git a/library/stdarch/crates/core_arch/src/x86/avx512bw.rs b/library/stdarch/crates/core_arch/src/x86/avx512bw.rs
index 18171168c8d9..134ac1f2b9cf 100644
--- a/library/stdarch/crates/core_arch/src/x86/avx512bw.rs
+++ b/library/stdarch/crates/core_arch/src/x86/avx512bw.rs
@@ -7332,54 +7332,13 @@ pub unsafe fn _mm512_mask_shufflelo_epi16(
     a: __m512i,
     imm8: i32,
 ) -> __m512i {
-    let imm8 = (imm8 & 0xFF) as u8;
-    let a = a.as_i16x32();
-    macro_rules! shuffle_done {
-        ($x01: expr, $x23: expr, $x45: expr, $x67: expr) => {
-            #[rustfmt::skip]
-                        simd_shuffle32(a, a, [
-                            0+$x01, 0+$x23, 0+$x45, 0+$x67, 4, 5, 6, 7, 8+$x01, 8+$x23, 8+$x45, 8+$x67, 12, 13, 14, 15,
-                            16+$x01, 16+$x23, 16+$x45, 16+$x67, 20, 21, 22, 23, 24+$x01, 24+$x23, 24+$x45, 24+$x67, 28, 29, 30, 31,
-                        ])
+    macro_rules! call {
+        ($imm8:expr) => {
+            _mm512_shufflelo_epi16(a, $imm8)
         };
     }
-    macro_rules! shuffle_x67 {
-        ($x01:expr, $x23:expr, $x45:expr) => {
-            match (imm8 >> 6) & 0b11 {
-                0b00 => shuffle_done!($x01, $x23, $x45, 0),
-                0b01 => shuffle_done!($x01, $x23, $x45, 1),
-                0b10 => shuffle_done!($x01, $x23, $x45, 2),
-                _ => shuffle_done!($x01, $x23, $x45, 3),
-            }
-        };
-    }
-    macro_rules! shuffle_x45 {
-        ($x01:expr, $x23:expr) => {
-            match (imm8 >> 4) & 0b11 {
-                0b00 => shuffle_x67!($x01, $x23, 0),
-                0b01 => shuffle_x67!($x01, $x23, 1),
-                0b10 => shuffle_x67!($x01, $x23, 2),
-                _ => shuffle_x67!($x01, $x23, 3),
-            }
-        };
-    }
-    macro_rules! shuffle_x23 {
-        ($x01:expr) => {
-            match (imm8 >> 2) & 0b11 {
-                0b00 => shuffle_x45!($x01, 0),
-                0b01 => shuffle_x45!($x01, 1),
-                0b10 => shuffle_x45!($x01, 2),
-                _ => shuffle_x45!($x01, 3),
-            }
-        };
-    }
-    let r: i16x32 = match imm8 & 0b11 {
-        0b00 => shuffle_x23!(0),
-        0b01 => shuffle_x23!(1),
-        0b10 => shuffle_x23!(2),
-        _ => shuffle_x23!(3),
-    };
-    transmute(simd_select_bitmask(k, r, src.as_i16x32()))
+    let r = constify_imm8_sae!(imm8, call);
+    transmute(simd_select_bitmask(k, r.as_i16x32(), src.as_i16x32()))
 }
 
 /// Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the low 64 bits of 128-bit lanes of dst, with the high 64 bits of 128-bit lanes being copied from from a to dst, using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -7390,58 +7349,14 @@ pub unsafe fn _mm512_mask_shufflelo_epi16(
 #[cfg_attr(test, assert_instr(vpshuflw, imm8 = 0))]
 #[rustc_args_required_const(2)]
 pub unsafe fn _mm512_maskz_shufflelo_epi16(k: __mmask32, a: __m512i, imm8: i32) -> __m512i {
-    let imm8 = (imm8 & 0xFF) as u8;
-    let a = a.as_i16x32();
-    macro_rules! shuffle_done {
-        ($x01: expr, $x23: expr, $x45: expr, $x67: expr) => {
-            #[rustfmt::skip]
-                        simd_shuffle32(a, a, [
-                            0+$x01, 0+$x23, 0+$x45, 0+$x67, 4, 5, 6, 7, 8+$x01, 8+$x23, 8+$x45, 8+$x67, 12, 13, 14, 15,
-                            16+$x01, 16+$x23, 16+$x45, 16+$x67, 20, 21, 22, 23, 24+$x01, 24+$x23, 24+$x45, 24+$x67, 28, 29, 30, 31,
-                        ])
+    macro_rules! call {
+        ($imm8:expr) => {
+            _mm512_shufflelo_epi16(a, $imm8)
         };
     }
-    macro_rules! shuffle_x67 {
-        ($x01:expr, $x23:expr, $x45:expr) => {
-            match (imm8 >> 6) & 0b11 {
-                0b00 => shuffle_done!($x01, $x23, $x45, 0),
-                0b01 => shuffle_done!($x01, $x23, $x45, 1),
-                0b10 => shuffle_done!($x01, $x23, $x45, 2),
-                _ => shuffle_done!($x01, $x23, $x45, 3),
-            }
-        };
-    }
-    macro_rules! shuffle_x45 {
-        ($x01:expr, $x23:expr) => {
-            match (imm8 >> 4) & 0b11 {
-                0b00 => shuffle_x67!($x01, $x23, 0),
-                0b01 => shuffle_x67!($x01, $x23, 1),
-                0b10 => shuffle_x67!($x01, $x23, 2),
-                _ => shuffle_x67!($x01, $x23, 3),
-            }
-        };
-    }
-    macro_rules! shuffle_x23 {
-        ($x01:expr) => {
-            match (imm8 >> 2) & 0b11 {
-                0b00 => shuffle_x45!($x01, 0),
-                0b01 => shuffle_x45!($x01, 1),
-                0b10 => shuffle_x45!($x01, 2),
-                _ => shuffle_x45!($x01, 3),
-            }
-        };
-    }
-    let r: i16x32 = match imm8 & 0b11 {
-        0b00 => shuffle_x23!(0),
-        0b01 => shuffle_x23!(1),
-        0b10 => shuffle_x23!(2),
-        _ => shuffle_x23!(3),
-    };
-    transmute(simd_select_bitmask(
-        k,
-        r,
-        _mm512_setzero_si512().as_i16x32(),
-    ))
+    let r = constify_imm8_sae!(imm8, call);
+    let zero = _mm512_setzero_si512().as_i16x32();
+    transmute(simd_select_bitmask(k, r.as_i16x32(), zero))
 }
 
 /// Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the low 64 bits of 128-bit lanes of dst, with the high 64 bits of 128-bit lanes being copied from from a to dst, using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -7595,54 +7510,13 @@ pub unsafe fn _mm512_mask_shufflehi_epi16(
     a: __m512i,
     imm8: i32,
 ) -> __m512i {
-    let imm8 = (imm8 & 0xFF) as u8;
-    let a = a.as_i16x32();
-    macro_rules! shuffle_done {
-        ($x01: expr, $x23: expr, $x45: expr, $x67: expr) => {
-            #[rustfmt::skip]
-                        simd_shuffle32(a, a, [
-                            0, 1, 2, 3, 4+$x01, 4+$x23, 4+$x45, 4+$x67, 8, 9, 10, 11, 12+$x01, 12+$x23, 12+$x45, 12+$x67,
-                            16, 17, 18, 19, 20+$x01, 20+$x23, 20+$x45, 20+$x67, 24, 25, 26, 27, 28+$x01, 28+$x23, 28+$x45, 28+$x67,
-                        ])
+    macro_rules! call {
+        ($imm8:expr) => {
+            _mm512_shufflehi_epi16(a, $imm8)
         };
     }
-    macro_rules! shuffle_x67 {
-        ($x01:expr, $x23:expr, $x45:expr) => {
-            match (imm8 >> 6) & 0b11 {
-                0b00 => shuffle_done!($x01, $x23, $x45, 0),
-                0b01 => shuffle_done!($x01, $x23, $x45, 1),
-                0b10 => shuffle_done!($x01, $x23, $x45, 2),
-                _ => shuffle_done!($x01, $x23, $x45, 3),
-            }
-        };
-    }
-    macro_rules! shuffle_x45 {
-        ($x01:expr, $x23:expr) => {
-            match (imm8 >> 4) & 0b11 {
-                0b00 => shuffle_x67!($x01, $x23, 0),
-                0b01 => shuffle_x67!($x01, $x23, 1),
-                0b10 => shuffle_x67!($x01, $x23, 2),
-                _ => shuffle_x67!($x01, $x23, 3),
-            }
-        };
-    }
-    macro_rules! shuffle_x23 {
-        ($x01:expr) => {
-            match (imm8 >> 2) & 0b11 {
-                0b00 => shuffle_x45!($x01, 0),
-                0b01 => shuffle_x45!($x01, 1),
-                0b10 => shuffle_x45!($x01, 2),
-                _ => shuffle_x45!($x01, 3),
-            }
-        };
-    }
-    let r: i16x32 = match imm8 & 0b11 {
-        0b00 => shuffle_x23!(0),
-        0b01 => shuffle_x23!(1),
-        0b10 => shuffle_x23!(2),
-        _ => shuffle_x23!(3),
-    };
-    transmute(simd_select_bitmask(k, r, src.as_i16x32()))
+    let r = constify_imm8_sae!(imm8, call);
+    transmute(simd_select_bitmask(k, r.as_i16x32(), src.as_i16x32()))
 }
 
 /// Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the high 64 bits of 128-bit lanes of dst, with the low 64 bits of 128-bit lanes being copied from from a to dst, using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -7653,58 +7527,14 @@ pub unsafe fn _mm512_mask_shufflehi_epi16(
 #[cfg_attr(test, assert_instr(vpshufhw, imm8 = 0))]
 #[rustc_args_required_const(2)]
 pub unsafe fn _mm512_maskz_shufflehi_epi16(k: __mmask32, a: __m512i, imm8: i32) -> __m512i {
-    let imm8 = (imm8 & 0xFF) as u8;
-    let a = a.as_i16x32();
-    macro_rules! shuffle_done {
-        ($x01: expr, $x23: expr, $x45: expr, $x67: expr) => {
-            #[rustfmt::skip]
-                        simd_shuffle32(a, a, [
-                            0, 1, 2, 3, 4+$x01, 4+$x23, 4+$x45, 4+$x67, 8, 9, 10, 11, 12+$x01, 12+$x23, 12+$x45, 12+$x67,
-                            16, 17, 18, 19, 20+$x01, 20+$x23, 20+$x45, 20+$x67, 24, 25, 26, 27, 28+$x01, 28+$x23, 28+$x45, 28+$x67,
-                        ])
+    macro_rules! call {
+        ($imm8:expr) => {
+            _mm512_shufflehi_epi16(a, $imm8)
         };
     }
-    macro_rules! shuffle_x67 {
-        ($x01:expr, $x23:expr, $x45:expr) => {
-            match (imm8 >> 6) & 0b11 {
-                0b00 => shuffle_done!($x01, $x23, $x45, 0),
-                0b01 => shuffle_done!($x01, $x23, $x45, 1),
-                0b10 => shuffle_done!($x01, $x23, $x45, 2),
-                _ => shuffle_done!($x01, $x23, $x45, 3),
-            }
-        };
-    }
-    macro_rules! shuffle_x45 {
-        ($x01:expr, $x23:expr) => {
-            match (imm8 >> 4) & 0b11 {
-                0b00 => shuffle_x67!($x01, $x23, 0),
-                0b01 => shuffle_x67!($x01, $x23, 1),
-                0b10 => shuffle_x67!($x01, $x23, 2),
-                _ => shuffle_x67!($x01, $x23, 3),
-            }
-        };
-    }
-    macro_rules! shuffle_x23 {
-        ($x01:expr) => {
-            match (imm8 >> 2) & 0b11 {
-                0b00 => shuffle_x45!($x01, 0),
-                0b01 => shuffle_x45!($x01, 1),
-                0b10 => shuffle_x45!($x01, 2),
-                _ => shuffle_x45!($x01, 3),
-            }
-        };
-    }
-    let r: i16x32 = match imm8 & 0b11 {
-        0b00 => shuffle_x23!(0),
-        0b01 => shuffle_x23!(1),
-        0b10 => shuffle_x23!(2),
-        _ => shuffle_x23!(3),
-    };
-    transmute(simd_select_bitmask(
-        k,
-        r,
-        _mm512_setzero_si512().as_i16x32(),
-    ))
+    let r = constify_imm8_sae!(imm8, call);
+    let zero = _mm512_setzero_si512().as_i16x32();
+    transmute(simd_select_bitmask(k, r.as_i16x32(), zero))
 }
 
 /// Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the high 64 bits of 128-bit lanes of dst, with the low 64 bits of 128-bit lanes being copied from from a to dst, using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -9446,59 +9276,13 @@ pub unsafe fn _mm512_mask_alignr_epi8(
     b: __m512i,
     imm8: i32,
 ) -> __m512i {
-    // If palignr is shifting the pair of vectors more than the size of two
-    // lanes, emit zero.
-    if imm8 > 32 {
-        return _mm512_set1_epi8(0);
-    }
-    // If palignr is shifting the pair of input vectors more than one lane,
-    // but less than two lanes, convert to shifting in zeroes.
-    let (a, b, imm8) = if imm8 > 16 {
-        (_mm512_set1_epi8(0), a, imm8 - 16)
-    } else {
-        (a, b, imm8)
-    };
-    let a = a.as_i8x64();
-    let b = b.as_i8x64();
-    #[rustfmt::skip]
-    macro_rules! shuffle {
+    macro_rules! call {
         ($imm8:expr) => {
-            simd_shuffle64(
-                b,
-                a,
-                [
-                    0 + ($imm8+48), 1 + ($imm8+48), 2 + ($imm8+48), 3 + ($imm8+48), 4 + ($imm8+48), 5 + ($imm8+48), 6 + ($imm8+48), 7 + ($imm8+48),
-                    8 + ($imm8+48), 9 + ($imm8+48), 10 + ($imm8+48), 11 + ($imm8+48), 12 + ($imm8+48), 13 + ($imm8+48), 14 + ($imm8+48), 15 + ($imm8+48),
-                    16 + ($imm8+32), 17 + ($imm8+32), 18 + ($imm8+32), 19 + ($imm8+32), 20 + ($imm8+32), 21 + ($imm8+32), 22 + ($imm8+32), 23 + ($imm8+32),
-                    24 + ($imm8+32), 25 + ($imm8+32), 26 + ($imm8+32), 27 + ($imm8+32), 28 + ($imm8+32), 29 + ($imm8+32), 30 + ($imm8+32), 31 + ($imm8+32),
-                    32 + ($imm8+16), 33 + ($imm8+16), 34 + ($imm8+16), 35 + ($imm8+16), 36 + ($imm8+16), 37 + ($imm8+16), 38 + ($imm8+16), 39 + ($imm8+16),
-                    40 + ($imm8+16), 41 + ($imm8+16), 42 + ($imm8+16), 43 + ($imm8+16), 44 + ($imm8+16), 45 + ($imm8+16), 46 + ($imm8+16), 47 + ($imm8+16),
-                    48 + $imm8, 49 + $imm8, 50 + $imm8, 51 + $imm8, 52 + $imm8, 53 + $imm8, 54 + $imm8, 55 + $imm8,
-                    56 + $imm8, 57 + $imm8, 58 + $imm8, 59 + $imm8, 60 + $imm8, 61 + $imm8, 62 + $imm8, 63 + $imm8,
-                ],
-            )
+            _mm512_alignr_epi8(a, b, $imm8)
         };
     }
-    let r: i8x64 = match imm8 {
-        0 => shuffle!(0),
-        1 => shuffle!(1),
-        2 => shuffle!(2),
-        3 => shuffle!(3),
-        4 => shuffle!(4),
-        5 => shuffle!(5),
-        6 => shuffle!(6),
-        7 => shuffle!(7),
-        8 => shuffle!(8),
-        9 => shuffle!(9),
-        10 => shuffle!(10),
-        11 => shuffle!(11),
-        12 => shuffle!(12),
-        13 => shuffle!(13),
-        14 => shuffle!(14),
-        15 => shuffle!(15),
-        _ => shuffle!(16),
-    };
-    transmute(simd_select_bitmask(k, r, src.as_i8x64()))
+    let r = constify_imm8_sae!(imm8, call);
+    transmute(simd_select_bitmask(k, r.as_i8x64(), src.as_i8x64()))
 }
 
 /// Concatenate pairs of 16-byte blocks in a and b into a 32-byte temporary result, shift the result right by imm8 bytes, and store the low 16 bytes in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -9509,59 +9293,14 @@ pub unsafe fn _mm512_mask_alignr_epi8(
 #[cfg_attr(test, assert_instr(vpalignr, imm8 = 1))]
 #[rustc_args_required_const(3)]
 pub unsafe fn _mm512_maskz_alignr_epi8(k: __mmask64, a: __m512i, b: __m512i, imm8: i32) -> __m512i {
-    // If palignr is shifting the pair of vectors more than the size of two
-    // lanes, emit zero.
-    if imm8 > 32 {
-        return _mm512_set1_epi8(0);
-    }
-    // If palignr is shifting the pair of input vectors more than one lane,
-    // but less than two lanes, convert to shifting in zeroes.
-    let (a, b, imm8) = if imm8 > 16 {
-        (_mm512_set1_epi8(0), a, imm8 - 16)
-    } else {
-        (a, b, imm8)
-    };
-    let a = a.as_i8x64();
-    let b = b.as_i8x64();
-    #[rustfmt::skip]
-    macro_rules! shuffle {
+    macro_rules! call {
         ($imm8:expr) => {
-            simd_shuffle64(
-                b,
-                a,
-                [
-                    0 + ($imm8+48), 1 + ($imm8+48), 2 + ($imm8+48), 3 + ($imm8+48), 4 + ($imm8+48), 5 + ($imm8+48), 6 + ($imm8+48), 7 + ($imm8+48),
-                    8 + ($imm8+48), 9 + ($imm8+48), 10 + ($imm8+48), 11 + ($imm8+48), 12 + ($imm8+48), 13 + ($imm8+48), 14 + ($imm8+48), 15 + ($imm8+48),
-                    16 + ($imm8+32), 17 + ($imm8+32), 18 + ($imm8+32), 19 + ($imm8+32), 20 + ($imm8+32), 21 + ($imm8+32), 22 + ($imm8+32), 23 + ($imm8+32),
-                    24 + ($imm8+32), 25 + ($imm8+32), 26 + ($imm8+32), 27 + ($imm8+32), 28 + ($imm8+32), 29 + ($imm8+32), 30 + ($imm8+32), 31 + ($imm8+32),
-                    32 + ($imm8+16), 33 + ($imm8+16), 34 + ($imm8+16), 35 + ($imm8+16), 36 + ($imm8+16), 37 + ($imm8+16), 38 + ($imm8+16), 39 + ($imm8+16),
-                    40 + ($imm8+16), 41 + ($imm8+16), 42 + ($imm8+16), 43 + ($imm8+16), 44 + ($imm8+16), 45 + ($imm8+16), 46 + ($imm8+16), 47 + ($imm8+16),
-                    48 + $imm8, 49 + $imm8, 50 + $imm8, 51 + $imm8, 52 + $imm8, 53 + $imm8, 54 + $imm8, 55 + $imm8,
-                    56 + $imm8, 57 + $imm8, 58 + $imm8, 59 + $imm8, 60 + $imm8, 61 + $imm8, 62 + $imm8, 63 + $imm8,
-                ],
-            )
+            _mm512_alignr_epi8(a, b, $imm8)
         };
     }
-    let r: i8x64 = match imm8 {
-        0 => shuffle!(0),
-        1 => shuffle!(1),
-        2 => shuffle!(2),
-        3 => shuffle!(3),
-        4 => shuffle!(4),
-        5 => shuffle!(5),
-        6 => shuffle!(6),
-        7 => shuffle!(7),
-        8 => shuffle!(8),
-        9 => shuffle!(9),
-        10 => shuffle!(10),
-        11 => shuffle!(11),
-        12 => shuffle!(12),
-        13 => shuffle!(13),
-        14 => shuffle!(14),
-        15 => shuffle!(15),
-        _ => shuffle!(16),
-    };
-    transmute(simd_select_bitmask(k, r, _mm512_setzero_si512().as_i8x64()))
+    let r = constify_imm8_sae!(imm8, call);
+    let zero = _mm512_setzero_si512().as_i8x64();
+    transmute(simd_select_bitmask(k, r.as_i8x64(), zero))
 }
 
 /// Concatenate pairs of 16-byte blocks in a and b into a 32-byte temporary result, shift the result right by imm8 bytes, and store the low 16 bytes in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -9645,11 +9384,8 @@ pub unsafe fn _mm_maskz_alignr_epi8(k: __mmask16, a: __m128i, b: __m128i, imm8:
         };
     }
     let r = constify_imm8_sae!(imm8, call);
-    transmute(simd_select_bitmask(
-        k,
-        r.as_i8x16(),
-        _mm_setzero_si128().as_i8x16(),
-    ))
+    let zero = _mm_setzero_si128().as_i8x16();
+    transmute(simd_select_bitmask(k, r.as_i8x16(), zero))
 }
 
 #[allow(improper_ctypes)]
diff --git a/library/stdarch/crates/core_arch/src/x86/avx512f.rs b/library/stdarch/crates/core_arch/src/x86/avx512f.rs
index e2462b496fbe..8cdcb4f204fe 100644
--- a/library/stdarch/crates/core_arch/src/x86/avx512f.rs
+++ b/library/stdarch/crates/core_arch/src/x86/avx512f.rs
@@ -17173,20 +17173,76 @@ pub unsafe fn _mm_maskz_srlv_epi64(k: __mmask8, a: __m128i, count: __m128i) -> _
 
 /// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst.
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_permute_ps&expand=4170)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permute_ps&expand=4170)
 #[inline]
 #[target_feature(enable = "avx512f")]
 #[cfg_attr(test, assert_instr(vpermilps, imm8 = 1))]
 #[rustc_args_required_const(1)]
 pub unsafe fn _mm512_permute_ps(a: __m512, imm8: i32) -> __m512 {
-    let a = a.as_f32x16();
-    macro_rules! call {
-        ($imm8:expr) => {
-            vpermilps(a, _mm512_set1_epi32($imm8).as_i32x16())
+    let imm8 = (imm8 & 0xFF) as u8;
+    let undefined = _mm512_undefined_ps();
+    macro_rules! shuffle4 {
+        ($a:expr, $b:expr, $c:expr, $d:expr) => {
+            simd_shuffle16(
+                a,
+                undefined,
+                [
+                    $a,
+                    $b,
+                    $c,
+                    $d,
+                    $a + 4,
+                    $b + 4,
+                    $c + 4,
+                    $d + 4,
+                    $a + 8,
+                    $b + 8,
+                    $c + 8,
+                    $d + 8,
+                    $a + 12,
+                    $b + 12,
+                    $c + 12,
+                    $d + 12,
+                ],
+            )
         };
     }
-    let r = constify_imm8_sae!(imm8, call);
-    transmute(r)
+    macro_rules! shuffle3 {
+        ($a:expr, $b:expr, $c:expr) => {
+            match (imm8 >> 6) & 0b11 {
+                0b00 => shuffle4!($a, $b, $c, 0),
+                0b01 => shuffle4!($a, $b, $c, 1),
+                0b10 => shuffle4!($a, $b, $c, 2),
+                _ => shuffle4!($a, $b, $c, 3),
+            }
+        };
+    }
+    macro_rules! shuffle2 {
+        ($a:expr, $b:expr) => {
+            match (imm8 >> 4) & 0b11 {
+                0b00 => shuffle3!($a, $b, 0),
+                0b01 => shuffle3!($a, $b, 1),
+                0b10 => shuffle3!($a, $b, 2),
+                _ => shuffle3!($a, $b, 3),
+            }
+        };
+    }
+    macro_rules! shuffle1 {
+        ($a:expr) => {
+            match (imm8 >> 2) & 0b11 {
+                0b00 => shuffle2!($a, 0),
+                0b01 => shuffle2!($a, 1),
+                0b10 => shuffle2!($a, 2),
+                _ => shuffle2!($a, 3),
+            }
+        };
+    }
+    match imm8 & 0b11 {
+        0b00 => shuffle1!(0),
+        0b01 => shuffle1!(1),
+        0b10 => shuffle1!(2),
+        _ => shuffle1!(3),
+    }
 }
 
 /// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -17197,14 +17253,13 @@ pub unsafe fn _mm512_permute_ps(a: __m512, imm8: i32) -> __m512 {
 #[cfg_attr(test, assert_instr(vpermilps, imm8 = 1))]
 #[rustc_args_required_const(3)]
 pub unsafe fn _mm512_mask_permute_ps(src: __m512, k: __mmask16, a: __m512, imm8: i32) -> __m512 {
-    let a = a.as_f32x16();
     macro_rules! call {
         ($imm8:expr) => {
-            vpermilps(a, _mm512_set1_epi32($imm8).as_i32x16())
+            _mm512_permute_ps(a, $imm8)
         };
     }
-    let permute = constify_imm8_sae!(imm8, call);
-    transmute(simd_select_bitmask(k, permute, src.as_f32x16()))
+    let r = constify_imm8_sae!(imm8, call);
+    transmute(simd_select_bitmask(k, r.as_f32x16(), src.as_f32x16()))
 }
 
 /// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -17215,15 +17270,49 @@ pub unsafe fn _mm512_mask_permute_ps(src: __m512, k: __mmask16, a: __m512, imm8:
 #[cfg_attr(test, assert_instr(vpermilps, imm8 = 1))]
 #[rustc_args_required_const(2)]
 pub unsafe fn _mm512_maskz_permute_ps(k: __mmask16, a: __m512, imm8: i32) -> __m512 {
-    let a = a.as_f32x16();
     macro_rules! call {
         ($imm8:expr) => {
-            vpermilps(a, _mm512_set1_epi32($imm8).as_i32x16())
+            _mm512_permute_ps(a, $imm8)
         };
     }
-    let permute = constify_imm8_sae!(imm8, call);
+    let r = constify_imm8_sae!(imm8, call);
     let zero = _mm512_setzero_ps().as_f32x16();
-    transmute(simd_select_bitmask(k, permute, zero))
+    transmute(simd_select_bitmask(k, r.as_f32x16(), zero))
+}
+
+/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_permute_ps&expand=4165)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpermilps, imm8 = 1))]
+#[rustc_args_required_const(3)]
+pub unsafe fn _mm256_mask_permute_ps(src: __m256, k: __mmask8, a: __m256, imm8: i32) -> __m256 {
+    macro_rules! call {
+        ($imm8:expr) => {
+            _mm256_permute_ps(a, $imm8)
+        };
+    }
+    let r = constify_imm8_sae!(imm8, call);
+    transmute(simd_select_bitmask(k, r.as_f32x8(), src.as_f32x8()))
+}
+
+/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_permute_ps&expand=4166)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpermilps, imm8 = 1))]
+#[rustc_args_required_const(2)]
+pub unsafe fn _mm256_maskz_permute_ps(k: __mmask8, a: __m256, imm8: i32) -> __m256 {
+    macro_rules! call {
+        ($imm8:expr) => {
+            _mm256_permute_ps(a, $imm8)
+        };
+    }
+    let r = constify_imm8_sae!(imm8, call);
+    let zero = _mm256_setzero_ps().as_f32x8();
+    transmute(simd_select_bitmask(k, r.as_f32x8(), zero))
 }
 
 /// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst.
@@ -17234,14 +17323,45 @@ pub unsafe fn _mm512_maskz_permute_ps(k: __mmask16, a: __m512, imm8: i32) -> __m
 #[cfg_attr(test, assert_instr(vpermilpd, imm8 = 2))]
 #[rustc_args_required_const(1)]
 pub unsafe fn _mm512_permute_pd(a: __m512d, imm8: i32) -> __m512d {
-    let a = a.as_f64x8();
-    macro_rules! call {
-        ($imm8:expr) => {
-            vpermilpd(a, _mm512_set1_epi64($imm8).as_i64x8())
+    let imm8 = (imm8 & 0xFF) as u8;
+    let undefined = _mm512_undefined_pd();
+    macro_rules! shuffle4 {
+        ($a:expr, $b:expr, $c:expr, $d:expr) => {
+            simd_shuffle8(
+                a,
+                undefined,
+                [$a, $b, $c, $d, $a + 4, $b + 4, $c + 4, $d + 4],
+            )
         };
     }
-    let r = constify_imm8_sae!(imm8, call);
-    transmute(r)
+    macro_rules! shuffle3 {
+        ($a:expr, $b:expr, $c:expr) => {
+            match (imm8 >> 3) & 0x1 {
+                0 => shuffle4!($a, $b, $c, 2),
+                _ => shuffle4!($a, $b, $c, 3),
+            }
+        };
+    }
+    macro_rules! shuffle2 {
+        ($a:expr, $b:expr) => {
+            match (imm8 >> 2) & 0x1 {
+                0 => shuffle3!($a, $b, 2),
+                _ => shuffle3!($a, $b, 3),
+            }
+        };
+    }
+    macro_rules! shuffle1 {
+        ($a:expr) => {
+            match (imm8 >> 1) & 0x1 {
+                0 => shuffle2!($a, 0),
+                _ => shuffle2!($a, 1),
+            }
+        };
+    }
+    match imm8 & 0x1 {
+        0 => shuffle1!(0),
+        _ => shuffle1!(1),
+    }
 }
 
 /// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -17252,14 +17372,13 @@ pub unsafe fn _mm512_permute_pd(a: __m512d, imm8: i32) -> __m512d {
 #[cfg_attr(test, assert_instr(vpermilpd, imm8 = 2))]
 #[rustc_args_required_const(3)]
 pub unsafe fn _mm512_mask_permute_pd(src: __m512d, k: __mmask8, a: __m512d, imm8: i32) -> __m512d {
-    let a = a.as_f64x8();
     macro_rules! call {
         ($imm8:expr) => {
-            vpermilpd(a, _mm512_set1_epi64($imm8).as_i64x8())
+            _mm512_permute_pd(a, $imm8)
         };
     }
-    let permute = constify_imm8_sae!(imm8, call);
-    transmute(simd_select_bitmask(k, permute, src.as_f64x8()))
+    let r = constify_imm8_sae!(imm8, call);
+    transmute(simd_select_bitmask(k, r.as_f64x8(), src.as_f64x8()))
 }
 
 /// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -17270,15 +17389,14 @@ pub unsafe fn _mm512_mask_permute_pd(src: __m512d, k: __mmask8, a: __m512d, imm8
 #[cfg_attr(test, assert_instr(vpermilpd, imm8 = 2))]
 #[rustc_args_required_const(2)]
 pub unsafe fn _mm512_maskz_permute_pd(k: __mmask8, a: __m512d, imm8: i32) -> __m512d {
-    let a = a.as_f64x8();
     macro_rules! call {
         ($imm8:expr) => {
-            vpermilpd(a, _mm512_set1_epi64($imm8).as_i64x8())
+            _mm512_permute_pd(a, $imm8)
         };
     }
-    let permute = constify_imm8_sae!(imm8, call);
+    let r = constify_imm8_sae!(imm8, call);
     let zero = _mm512_setzero_pd().as_f64x8();
-    transmute(simd_select_bitmask(k, permute, zero))
+    transmute(simd_select_bitmask(k, r.as_f64x8(), zero))
 }
 
 /// Shuffle 64-bit integers in a within 256-bit lanes using the control in imm8, and store the results in dst.
@@ -17286,18 +17404,56 @@ pub unsafe fn _mm512_maskz_permute_pd(k: __mmask8, a: __m512d, imm8: i32) -> __m
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_permutex_epi64&expand=4208)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vbroadcast, imm8 = 0b11111111))]
-//shoud be vpermq, but generate vpermpd. It generates vpermq with mask. change to vbroadcast becaise CI Windows
+#[cfg_attr(test, assert_instr(vperm, imm8 = 0b10011011))] //shoud be vpermq
 #[rustc_args_required_const(1)]
 pub unsafe fn _mm512_permutex_epi64(a: __m512i, imm8: i32) -> __m512i {
-    let a = a.as_i64x8();
-    macro_rules! call {
-        ($imm8:expr) => {
-            vpermq(a, _mm512_set1_epi64($imm8).as_i64x8())
+    let imm8 = (imm8 & 0xFF) as u8;
+    let undefined = _mm512_set1_epi64(0);
+    macro_rules! shuffle4 {
+        ($a:expr, $b:expr, $c:expr, $d:expr) => {
+            simd_shuffle8(
+                a,
+                undefined,
+                [$a, $b, $c, $d, $a + 4, $b + 4, $c + 4, $d + 4],
+            )
         };
     }
-    let r = constify_imm8_sae!(imm8, call);
-    transmute(r)
+    macro_rules! shuffle3 {
+        ($a:expr, $b:expr, $c:expr) => {
+            match (imm8 >> 6) & 0b11 {
+                0b00 => shuffle4!($a, $b, $c, 0),
+                0b01 => shuffle4!($a, $b, $c, 1),
+                0b10 => shuffle4!($a, $b, $c, 2),
+                _ => shuffle4!($a, $b, $c, 3),
+            }
+        };
+    }
+    macro_rules! shuffle2 {
+        ($a:expr, $b:expr) => {
+            match (imm8 >> 4) & 0b11 {
+                0b00 => shuffle3!($a, $b, 0),
+                0b01 => shuffle3!($a, $b, 1),
+                0b10 => shuffle3!($a, $b, 2),
+                _ => shuffle3!($a, $b, 3),
+            }
+        };
+    }
+    macro_rules! shuffle1 {
+        ($a:expr) => {
+            match (imm8 >> 2) & 0b11 {
+                0b00 => shuffle2!($a, 0),
+                0b01 => shuffle2!($a, 1),
+                0b10 => shuffle2!($a, 2),
+                _ => shuffle2!($a, 3),
+            }
+        };
+    }
+    match imm8 & 0b11 {
+        0b00 => shuffle1!(0),
+        0b01 => shuffle1!(1),
+        0b10 => shuffle1!(2),
+        _ => shuffle1!(3),
+    }
 }
 
 /// Shuffle 64-bit integers in a within 256-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -17305,7 +17461,7 @@ pub unsafe fn _mm512_permutex_epi64(a: __m512i, imm8: i32) -> __m512i {
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_permutex_epi64&expand=4206)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vpbroadcast, imm8 = 0b11111111))] //shoud be vpermq. change to vpbroadcast becaise CI Windows
+#[cfg_attr(test, assert_instr(vperm, imm8 = 0b11111111))] //should be vpermq
 #[rustc_args_required_const(3)]
 pub unsafe fn _mm512_mask_permutex_epi64(
     src: __m512i,
@@ -17313,14 +17469,13 @@ pub unsafe fn _mm512_mask_permutex_epi64(
     a: __m512i,
     imm8: i32,
 ) -> __m512i {
-    let a = a.as_i64x8();
     macro_rules! call {
         ($imm8:expr) => {
-            vpermq(a, _mm512_set1_epi64($imm8).as_i64x8())
+            _mm512_permutex_epi64(a, $imm8)
         };
     }
-    let permute = constify_imm8_sae!(imm8, call);
-    transmute(simd_select_bitmask(k, permute, src.as_i64x8()))
+    let r = constify_imm8_sae!(imm8, call);
+    transmute(simd_select_bitmask(k, r.as_i64x8(), src.as_i64x8()))
 }
 
 /// Shuffle 64-bit integers in a within 256-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -17328,18 +17483,17 @@ pub unsafe fn _mm512_mask_permutex_epi64(
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_permutex_epi64&expand=4207)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vpbroadcast, imm8 = 0b11111111))] //shoud be vpermq. change to vpbroadcast becaise CI Windows
+#[cfg_attr(test, assert_instr(vperm, imm8 = 0b11111111))] //should be vpermq
 #[rustc_args_required_const(2)]
 pub unsafe fn _mm512_maskz_permutex_epi64(k: __mmask8, a: __m512i, imm8: i32) -> __m512i {
-    let a = a.as_i64x8();
     macro_rules! call {
         ($imm8:expr) => {
-            vpermq(a, _mm512_set1_epi64($imm8).as_i64x8())
+            _mm512_permutex_epi64(a, $imm8)
         };
     }
-    let permute = constify_imm8_sae!(imm8, call);
+    let r = constify_imm8_sae!(imm8, call);
     let zero = _mm512_setzero_si512().as_i64x8();
-    transmute(simd_select_bitmask(k, permute, zero))
+    transmute(simd_select_bitmask(k, r.as_i64x8(), zero))
 }
 
 /// Shuffle double-precision (64-bit) floating-point elements in a within 256-bit lanes using the control in imm8, and store the results in dst.
@@ -17347,17 +17501,56 @@ pub unsafe fn _mm512_maskz_permutex_epi64(k: __mmask8, a: __m512i, imm8: i32) ->
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_permutex_pd&expand=4214)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vbroadcast, imm8 = 0b11111111))] //shoud be vpermpd. change to vbroadcast becaise CI Windows
+#[cfg_attr(test, assert_instr(vperm, imm8 = 0b11111111))] //shoud be vpermpd
 #[rustc_args_required_const(1)]
 pub unsafe fn _mm512_permutex_pd(a: __m512d, imm8: i32) -> __m512d {
-    let a = a.as_f64x8();
-    macro_rules! call {
-        ($imm8:expr) => {
-            vpermpd(a, _mm512_set1_epi64($imm8).as_i64x8())
+    let imm8 = (imm8 & 0xFF) as u8;
+    let undefined = _mm512_undefined_pd();
+    macro_rules! shuffle4 {
+        ($a:expr, $b:expr, $c:expr, $d:expr) => {
+            simd_shuffle8(
+                a,
+                undefined,
+                [$a, $b, $c, $d, $a + 4, $b + 4, $c + 4, $d + 4],
+            )
         };
     }
-    let r = constify_imm8_sae!(imm8, call);
-    transmute(r)
+    macro_rules! shuffle3 {
+        ($a:expr, $b:expr, $c:expr) => {
+            match (imm8 >> 6) & 0b11 {
+                0b00 => shuffle4!($a, $b, $c, 0),
+                0b01 => shuffle4!($a, $b, $c, 1),
+                0b10 => shuffle4!($a, $b, $c, 2),
+                _ => shuffle4!($a, $b, $c, 3),
+            }
+        };
+    }
+    macro_rules! shuffle2 {
+        ($a:expr, $b:expr) => {
+            match (imm8 >> 4) & 0b11 {
+                0b00 => shuffle3!($a, $b, 0),
+                0b01 => shuffle3!($a, $b, 1),
+                0b10 => shuffle3!($a, $b, 2),
+                _ => shuffle3!($a, $b, 3),
+            }
+        };
+    }
+    macro_rules! shuffle1 {
+        ($a:expr) => {
+            match (imm8 >> 2) & 0b11 {
+                0b00 => shuffle2!($a, 0),
+                0b01 => shuffle2!($a, 1),
+                0b10 => shuffle2!($a, 2),
+                _ => shuffle2!($a, 3),
+            }
+        };
+    }
+    match imm8 & 0b11 {
+        0b00 => shuffle1!(0),
+        0b01 => shuffle1!(1),
+        0b10 => shuffle1!(2),
+        _ => shuffle1!(3),
+    }
 }
 
 /// Shuffle double-precision (64-bit) floating-point elements in a within 256-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -17365,17 +17558,16 @@ pub unsafe fn _mm512_permutex_pd(a: __m512d, imm8: i32) -> __m512d {
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_permutex_pd&expand=4212)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vbroadcast, imm8 = 0b11111111))] //shoud be vpermpd. change to vbroadcast becaise CI Windows
+#[cfg_attr(test, assert_instr(vperm, imm8 = 0b11111111))] //shoud be vpermpd
 #[rustc_args_required_const(3)]
 pub unsafe fn _mm512_mask_permutex_pd(src: __m512d, k: __mmask8, a: __m512d, imm8: i32) -> __m512d {
-    let a = a.as_f64x8();
     macro_rules! call {
         ($imm8:expr) => {
-            vpermpd(a, _mm512_set1_epi64($imm8).as_i64x8())
+            _mm512_permutex_pd(a, $imm8)
         };
     }
-    let permute = constify_imm8_sae!(imm8, call);
-    transmute(simd_select_bitmask(k, permute, src.as_f64x8()))
+    let r = constify_imm8_sae!(imm8, call);
+    transmute(simd_select_bitmask(k, r.as_f64x8(), src.as_f64x8()))
 }
 
 /// Shuffle double-precision (64-bit) floating-point elements in a within 256-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -17383,18 +17575,17 @@ pub unsafe fn _mm512_mask_permutex_pd(src: __m512d, k: __mmask8, a: __m512d, imm
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_permutex_pd&expand=4213)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vbroadcast, imm8 = 0b11111111))] //shoud be vpermpd. change to vbroadcast becaise CI Windows
+#[cfg_attr(test, assert_instr(vperm, imm8 = 0b11111111))] //shoud be vpermpd
 #[rustc_args_required_const(2)]
 pub unsafe fn _mm512_maskz_permutex_pd(k: __mmask8, a: __m512d, imm8: i32) -> __m512d {
-    let a = a.as_f64x8();
     macro_rules! call {
         ($imm8:expr) => {
-            vpermpd(a, _mm512_set1_epi64($imm8).as_i64x8())
+            _mm512_permutex_pd(a, $imm8)
         };
     }
-    let permute = constify_imm8_sae!(imm8, call);
+    let r = constify_imm8_sae!(imm8, call);
     let zero = _mm512_setzero_pd().as_f64x8();
-    transmute(simd_select_bitmask(k, permute, zero))
+    transmute(simd_select_bitmask(k, r.as_f64x8(), zero))
 }
 
 /// Shuffle 32-bit integers in a across lanes using the corresponding index in idx, and store the results in dst. Note that this intrinsic shuffles across 128-bit lanes, unlike past intrinsics that use the permutevar name. This intrinsic is identical to _mm512_permutexvar_epi32, and it is recommended that you use that intrinsic name.
@@ -17979,73 +18170,13 @@ pub unsafe fn _mm512_mask_shuffle_epi32(
     a: __m512i,
     imm8: _MM_PERM_ENUM,
 ) -> __m512i {
-    let imm8 = (imm8 & 0xFF) as u8;
-    let a = a.as_i32x16();
-    macro_rules! shuffle4 {
-        (
-            $a:expr,
-            $b:expr,
-            $c:expr,
-            $d:expr,
-            $e:expr,
-            $f:expr,
-            $g:expr,
-            $h:expr,
-            $i:expr,
-            $j:expr,
-            $k:expr,
-            $l:expr,
-            $m:expr,
-            $n:expr,
-            $o:expr,
-            $p:expr
-        ) => {
-            simd_shuffle16(
-                a,
-                a,
-                [
-                    $a, $b, $c, $d, $e, $f, $g, $h, $i, $j, $k, $l, $m, $n, $o, $p,
-                ],
-            )
+    macro_rules! call {
+        ($imm8:expr) => {
+            _mm512_shuffle_epi32(a, $imm8)
         };
     }
-    macro_rules! shuffle3 {
-        ($a:expr, $b:expr, $c:expr, $e:expr, $f:expr, $g:expr, $i:expr, $j:expr, $k:expr, $m:expr, $n:expr, $o:expr) => {
-            match (imm8 >> 6) & 0x3 {
-                0 => shuffle4!($a, $b, $c, 16, $e, $f, $g, 20, $i, $j, $k, 24, $m, $n, $o, 28),
-                1 => shuffle4!($a, $b, $c, 17, $e, $f, $g, 21, $i, $j, $k, 25, $m, $n, $o, 29),
-                2 => shuffle4!($a, $b, $c, 18, $e, $f, $g, 22, $i, $j, $k, 26, $m, $n, $o, 30),
-                _ => shuffle4!($a, $b, $c, 19, $e, $f, $g, 23, $i, $j, $k, 27, $m, $n, $o, 31),
-            }
-        };
-    }
-    macro_rules! shuffle2 {
-        ($a:expr, $b:expr, $e:expr, $f:expr, $i:expr, $j:expr, $m:expr, $n:expr) => {
-            match (imm8 >> 4) & 0x3 {
-                0 => shuffle3!($a, $b, 16, $e, $f, 20, $i, $j, 24, $m, $n, 28),
-                1 => shuffle3!($a, $b, 17, $e, $f, 21, $i, $j, 25, $m, $n, 29),
-                2 => shuffle3!($a, $b, 18, $e, $f, 22, $i, $j, 26, $m, $n, 30),
-                _ => shuffle3!($a, $b, 19, $e, $f, 23, $i, $j, 27, $m, $n, 31),
-            }
-        };
-    }
-    macro_rules! shuffle1 {
-        ($a:expr, $e:expr, $i: expr, $m: expr) => {
-            match (imm8 >> 2) & 0x3 {
-                0 => shuffle2!($a, 0, $e, 4, $i, 8, $m, 12),
-                1 => shuffle2!($a, 1, $e, 5, $i, 9, $m, 13),
-                2 => shuffle2!($a, 2, $e, 6, $i, 10, $m, 14),
-                _ => shuffle2!($a, 3, $e, 7, $i, 11, $m, 15),
-            }
-        };
-    }
-    let shuffle: i32x16 = match imm8 & 0x3 {
-        0 => shuffle1!(0, 4, 8, 12),
-        1 => shuffle1!(1, 5, 9, 13),
-        2 => shuffle1!(2, 6, 10, 14),
-        _ => shuffle1!(3, 7, 11, 15),
-    };
-    transmute(simd_select_bitmask(k, shuffle, src.as_i32x16()))
+    let r = constify_imm8_sae!(imm8, call);
+    transmute(simd_select_bitmask(k, r.as_i32x16(), src.as_i32x16()))
 }
 
 /// Shuffle 32-bit integers in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -18056,74 +18187,14 @@ pub unsafe fn _mm512_mask_shuffle_epi32(
 #[cfg_attr(test, assert_instr(vpshufd, imm8 = 9))]
 #[rustc_args_required_const(2)]
 pub unsafe fn _mm512_maskz_shuffle_epi32(k: __mmask16, a: __m512i, imm8: _MM_PERM_ENUM) -> __m512i {
-    let imm8 = (imm8 & 0xFF) as u8;
-    let a = a.as_i32x16();
-    macro_rules! shuffle4 {
-        (
-            $a:expr,
-            $b:expr,
-            $c:expr,
-            $d:expr,
-            $e:expr,
-            $f:expr,
-            $g:expr,
-            $h:expr,
-            $i:expr,
-            $j:expr,
-            $k:expr,
-            $l:expr,
-            $m:expr,
-            $n:expr,
-            $o:expr,
-            $p:expr
-        ) => {
-            simd_shuffle16(
-                a,
-                a,
-                [
-                    $a, $b, $c, $d, $e, $f, $g, $h, $i, $j, $k, $l, $m, $n, $o, $p,
-                ],
-            )
+    macro_rules! call {
+        ($imm8:expr) => {
+            _mm512_shuffle_epi32(a, $imm8)
         };
     }
-    macro_rules! shuffle3 {
-        ($a:expr, $b:expr, $c:expr, $e:expr, $f:expr, $g:expr, $i:expr, $j:expr, $k:expr, $m:expr, $n:expr, $o:expr) => {
-            match (imm8 >> 6) & 0x3 {
-                0 => shuffle4!($a, $b, $c, 16, $e, $f, $g, 20, $i, $j, $k, 24, $m, $n, $o, 28),
-                1 => shuffle4!($a, $b, $c, 17, $e, $f, $g, 21, $i, $j, $k, 25, $m, $n, $o, 29),
-                2 => shuffle4!($a, $b, $c, 18, $e, $f, $g, 22, $i, $j, $k, 26, $m, $n, $o, 30),
-                _ => shuffle4!($a, $b, $c, 19, $e, $f, $g, 23, $i, $j, $k, 27, $m, $n, $o, 31),
-            }
-        };
-    }
-    macro_rules! shuffle2 {
-        ($a:expr, $b:expr, $e:expr, $f:expr, $i:expr, $j:expr, $m:expr, $n:expr) => {
-            match (imm8 >> 4) & 0x3 {
-                0 => shuffle3!($a, $b, 16, $e, $f, 20, $i, $j, 24, $m, $n, 28),
-                1 => shuffle3!($a, $b, 17, $e, $f, 21, $i, $j, 25, $m, $n, 29),
-                2 => shuffle3!($a, $b, 18, $e, $f, 22, $i, $j, 26, $m, $n, 30),
-                _ => shuffle3!($a, $b, 19, $e, $f, 23, $i, $j, 27, $m, $n, 31),
-            }
-        };
-    }
-    macro_rules! shuffle1 {
-        ($a:expr, $e:expr, $i: expr, $m: expr) => {
-            match (imm8 >> 2) & 0x3 {
-                0 => shuffle2!($a, 0, $e, 4, $i, 8, $m, 12),
-                1 => shuffle2!($a, 1, $e, 5, $i, 9, $m, 13),
-                2 => shuffle2!($a, 2, $e, 6, $i, 10, $m, 14),
-                _ => shuffle2!($a, 3, $e, 7, $i, 11, $m, 15),
-            }
-        };
-    }
-    let shuffle: i32x16 = match imm8 & 0x3 {
-        0 => shuffle1!(0, 4, 8, 12),
-        1 => shuffle1!(1, 5, 9, 13),
-        2 => shuffle1!(2, 6, 10, 14),
-        _ => shuffle1!(3, 7, 11, 15),
-    };
+    let r = constify_imm8_sae!(imm8, call);
     let zero = _mm512_setzero_si512().as_i32x16();
-    transmute(simd_select_bitmask(k, shuffle, zero))
+    transmute(simd_select_bitmask(k, r.as_i32x16(), zero))
 }
 
 /// Shuffle 32-bit integers in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -18296,74 +18367,13 @@ pub unsafe fn _mm512_mask_shuffle_ps(
     b: __m512,
     imm8: i32,
 ) -> __m512 {
-    assert!(imm8 >= 0 && imm8 <= 255);
-    let imm8 = (imm8 & 0xFF) as u8;
-    macro_rules! shuffle4 {
-        (
-            $a:expr,
-            $b:expr,
-            $c:expr,
-            $d:expr,
-            $e:expr,
-            $f:expr,
-            $g:expr,
-            $h:expr,
-            $i:expr,
-            $j:expr,
-            $k:expr,
-            $l:expr,
-            $m:expr,
-            $n:expr,
-            $o:expr,
-            $p:expr
-        ) => {
-            simd_shuffle16(
-                a,
-                b,
-                [
-                    $a, $b, $c, $d, $e, $f, $g, $h, $i, $j, $k, $l, $m, $n, $o, $p,
-                ],
-            )
+    macro_rules! call {
+        ($imm8:expr) => {
+            _mm512_shuffle_ps(a, b, $imm8)
         };
     }
-    macro_rules! shuffle3 {
-        ($a:expr, $b:expr, $c:expr, $e:expr, $f:expr, $g:expr, $i:expr, $j:expr, $k:expr, $m:expr, $n:expr, $o:expr) => {
-            match (imm8 >> 6) & 0x3 {
-                0 => shuffle4!($a, $b, $c, 16, $e, $f, $g, 20, $i, $j, $k, 24, $m, $n, $o, 28),
-                1 => shuffle4!($a, $b, $c, 17, $e, $f, $g, 21, $i, $j, $k, 25, $m, $n, $o, 29),
-                2 => shuffle4!($a, $b, $c, 18, $e, $f, $g, 22, $i, $j, $k, 26, $m, $n, $o, 30),
-                _ => shuffle4!($a, $b, $c, 19, $e, $f, $g, 23, $i, $j, $k, 27, $m, $n, $o, 31),
-            }
-        };
-    }
-    macro_rules! shuffle2 {
-        ($a:expr, $b:expr, $e:expr, $f:expr, $i:expr, $j:expr, $m:expr, $n:expr) => {
-            match (imm8 >> 4) & 0x3 {
-                0 => shuffle3!($a, $b, 16, $e, $f, 20, $i, $j, 24, $m, $n, 28),
-                1 => shuffle3!($a, $b, 17, $e, $f, 21, $i, $j, 25, $m, $n, 29),
-                2 => shuffle3!($a, $b, 18, $e, $f, 22, $i, $j, 26, $m, $n, 30),
-                _ => shuffle3!($a, $b, 19, $e, $f, 23, $i, $j, 27, $m, $n, 31),
-            }
-        };
-    }
-    macro_rules! shuffle1 {
-        ($a:expr, $e:expr, $i: expr, $m: expr) => {
-            match (imm8 >> 2) & 0x3 {
-                0 => shuffle2!($a, 0, $e, 4, $i, 8, $m, 12),
-                1 => shuffle2!($a, 1, $e, 5, $i, 9, $m, 13),
-                2 => shuffle2!($a, 2, $e, 6, $i, 10, $m, 14),
-                _ => shuffle2!($a, 3, $e, 7, $i, 11, $m, 15),
-            }
-        };
-    }
-    let shuffle = match imm8 & 0x3 {
-        0 => shuffle1!(0, 4, 8, 12),
-        1 => shuffle1!(1, 5, 9, 13),
-        2 => shuffle1!(2, 6, 10, 14),
-        _ => shuffle1!(3, 7, 11, 15),
-    };
-
-    transmute(simd_select_bitmask(k, shuffle, src.as_f32x16()))
+    let r = constify_imm8_sae!(imm8, call);
+    transmute(simd_select_bitmask(k, r.as_f32x16(), src.as_f32x16()))
 }
 
 /// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -18374,75 +18384,14 @@ pub unsafe fn _mm512_mask_shuffle_ps(
 #[cfg_attr(test, assert_instr(vshufps, imm8 = 0))]
 #[rustc_args_required_const(3)]
 pub unsafe fn _mm512_maskz_shuffle_ps(k: __mmask16, a: __m512, b: __m512, imm8: i32) -> __m512 {
-    assert!(imm8 >= 0 && imm8 <= 255);
-    let imm8 = (imm8 & 0xFF) as u8;
-    macro_rules! shuffle4 {
-        (
-            $a:expr,
-            $b:expr,
-            $c:expr,
-            $d:expr,
-            $e:expr,
-            $f:expr,
-            $g:expr,
-            $h:expr,
-            $i:expr,
-            $j:expr,
-            $k:expr,
-            $l:expr,
-            $m:expr,
-            $n:expr,
-            $o:expr,
-            $p:expr
-        ) => {
-            simd_shuffle16(
-                a,
-                b,
-                [
-                    $a, $b, $c, $d, $e, $f, $g, $h, $i, $j, $k, $l, $m, $n, $o, $p,
-                ],
-            )
+    macro_rules! call {
+        ($imm8:expr) => {
+            _mm512_shuffle_ps(a, b, $imm8)
         };
     }
-    macro_rules! shuffle3 {
-        ($a:expr, $b:expr, $c:expr, $e:expr, $f:expr, $g:expr, $i:expr, $j:expr, $k:expr, $m:expr, $n:expr, $o:expr) => {
-            match (imm8 >> 6) & 0x3 {
-                0 => shuffle4!($a, $b, $c, 16, $e, $f, $g, 20, $i, $j, $k, 24, $m, $n, $o, 28),
-                1 => shuffle4!($a, $b, $c, 17, $e, $f, $g, 21, $i, $j, $k, 25, $m, $n, $o, 29),
-                2 => shuffle4!($a, $b, $c, 18, $e, $f, $g, 22, $i, $j, $k, 26, $m, $n, $o, 30),
-                _ => shuffle4!($a, $b, $c, 19, $e, $f, $g, 23, $i, $j, $k, 27, $m, $n, $o, 31),
-            }
-        };
-    }
-    macro_rules! shuffle2 {
-        ($a:expr, $b:expr, $e:expr, $f:expr, $i:expr, $j:expr, $m:expr, $n:expr) => {
-            match (imm8 >> 4) & 0x3 {
-                0 => shuffle3!($a, $b, 16, $e, $f, 20, $i, $j, 24, $m, $n, 28),
-                1 => shuffle3!($a, $b, 17, $e, $f, 21, $i, $j, 25, $m, $n, 29),
-                2 => shuffle3!($a, $b, 18, $e, $f, 22, $i, $j, 26, $m, $n, 30),
-                _ => shuffle3!($a, $b, 19, $e, $f, 23, $i, $j, 27, $m, $n, 31),
-            }
-        };
-    }
-    macro_rules! shuffle1 {
-        ($a:expr, $e:expr, $i: expr, $m: expr) => {
-            match (imm8 >> 2) & 0x3 {
-                0 => shuffle2!($a, 0, $e, 4, $i, 8, $m, 12),
-                1 => shuffle2!($a, 1, $e, 5, $i, 9, $m, 13),
-                2 => shuffle2!($a, 2, $e, 6, $i, 10, $m, 14),
-                _ => shuffle2!($a, 3, $e, 7, $i, 11, $m, 15),
-            }
-        };
-    }
-    let shuffle = match imm8 & 0x3 {
-        0 => shuffle1!(0, 4, 8, 12),
-        1 => shuffle1!(1, 5, 9, 13),
-        2 => shuffle1!(2, 6, 10, 14),
-        _ => shuffle1!(3, 7, 11, 15),
-    };
-
+    let r = constify_imm8_sae!(imm8, call);
     let zero = _mm512_setzero_ps().as_f32x16();
-    transmute(simd_select_bitmask(k, shuffle, zero))
+    transmute(simd_select_bitmask(k, r.as_f32x16(), zero))
 }
 
 /// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -18618,75 +18567,13 @@ pub unsafe fn _mm512_mask_shuffle_pd(
     b: __m512d,
     imm8: i32,
 ) -> __m512d {
-    assert!(imm8 >= 0 && imm8 <= 255);
-    let imm8 = (imm8 & 0xFF) as u8;
-    macro_rules! shuffle8 {
-        ($a:expr, $b:expr, $c:expr, $d:expr, $e:expr, $f:expr, $g:expr, $h:expr) => {
-            simd_shuffle8(a, b, [$a, $b, $c, $d, $e, $f, $g, $h])
+    macro_rules! call {
+        ($imm8:expr) => {
+            _mm512_shuffle_pd(a, b, $imm8)
         };
     }
-    macro_rules! shuffle7 {
-        ($a:expr, $b:expr, $c:expr, $d:expr, $e:expr, $f:expr, $g:expr) => {
-            match (imm8 >> 7) & 0x1 {
-                0 => shuffle8!($a, $b, $c, $d, $e, $f, $g, 14),
-                _ => shuffle8!($a, $b, $c, $d, $e, $f, $g, 15),
-            }
-        };
-    }
-    macro_rules! shuffle6 {
-        ($a:expr, $b:expr, $c:expr, $d:expr, $e:expr, $f:expr) => {
-            match (imm8 >> 6) & 0x1 {
-                0 => shuffle7!($a, $b, $c, $d, $e, $f, 6),
-                _ => shuffle7!($a, $b, $c, $d, $e, $f, 7),
-            }
-        };
-    }
-    macro_rules! shuffle5 {
-        ($a:expr, $b:expr, $c:expr, $d:expr, $e:expr) => {
-            match (imm8 >> 5) & 0x1 {
-                0 => shuffle6!($a, $b, $c, $d, $e, 12),
-                _ => shuffle6!($a, $b, $c, $d, $e, 13),
-            }
-        };
-    }
-    macro_rules! shuffle4 {
-        ($a:expr, $b:expr, $c:expr, $d:expr) => {
-            match (imm8 >> 4) & 0x1 {
-                0 => shuffle5!($a, $b, $c, $d, 4),
-                _ => shuffle5!($a, $b, $c, $d, 5),
-            }
-        };
-    }
-    macro_rules! shuffle3 {
-        ($a:expr, $b:expr, $c:expr) => {
-            match (imm8 >> 3) & 0x1 {
-                0 => shuffle4!($a, $b, $c, 10),
-                _ => shuffle4!($a, $b, $c, 11),
-            }
-        };
-    }
-    macro_rules! shuffle2 {
-        ($a:expr, $b:expr) => {
-            match (imm8 >> 2) & 0x1 {
-                0 => shuffle3!($a, $b, 2),
-                _ => shuffle3!($a, $b, 3),
-            }
-        };
-    }
-    macro_rules! shuffle1 {
-        ($a:expr) => {
-            match (imm8 >> 1) & 0x1 {
-                0 => shuffle2!($a, 8),
-                _ => shuffle2!($a, 9),
-            }
-        };
-    }
-    let shuffle = match imm8 & 0x1 {
-        0 => shuffle1!(0),
-        _ => shuffle1!(1),
-    };
-
-    transmute(simd_select_bitmask(k, shuffle, src.as_f64x8()))
+    let r = constify_imm8_sae!(imm8, call);
+    transmute(simd_select_bitmask(k, r.as_f64x8(), src.as_f64x8()))
 }
 
 /// Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -18697,76 +18584,14 @@ pub unsafe fn _mm512_mask_shuffle_pd(
 #[cfg_attr(test, assert_instr(vshufpd, imm8 = 3))]
 #[rustc_args_required_const(3)]
 pub unsafe fn _mm512_maskz_shuffle_pd(k: __mmask8, a: __m512d, b: __m512d, imm8: i32) -> __m512d {
-    assert!(imm8 >= 0 && imm8 <= 255);
-    let imm8 = (imm8 & 0xFF) as u8;
-    macro_rules! shuffle8 {
-        ($a:expr, $b:expr, $c:expr, $d:expr, $e:expr, $f:expr, $g:expr, $h:expr) => {
-            simd_shuffle8(a, b, [$a, $b, $c, $d, $e, $f, $g, $h])
+    macro_rules! call {
+        ($imm8:expr) => {
+            _mm512_shuffle_pd(a, b, $imm8)
         };
     }
-    macro_rules! shuffle7 {
-        ($a:expr, $b:expr, $c:expr, $d:expr, $e:expr, $f:expr, $g:expr) => {
-            match (imm8 >> 7) & 0x1 {
-                0 => shuffle8!($a, $b, $c, $d, $e, $f, $g, 14),
-                _ => shuffle8!($a, $b, $c, $d, $e, $f, $g, 15),
-            }
-        };
-    }
-    macro_rules! shuffle6 {
-        ($a:expr, $b:expr, $c:expr, $d:expr, $e:expr, $f:expr) => {
-            match (imm8 >> 6) & 0x1 {
-                0 => shuffle7!($a, $b, $c, $d, $e, $f, 6),
-                _ => shuffle7!($a, $b, $c, $d, $e, $f, 7),
-            }
-        };
-    }
-    macro_rules! shuffle5 {
-        ($a:expr, $b:expr, $c:expr, $d:expr, $e:expr) => {
-            match (imm8 >> 5) & 0x1 {
-                0 => shuffle6!($a, $b, $c, $d, $e, 12),
-                _ => shuffle6!($a, $b, $c, $d, $e, 13),
-            }
-        };
-    }
-    macro_rules! shuffle4 {
-        ($a:expr, $b:expr, $c:expr, $d:expr) => {
-            match (imm8 >> 4) & 0x1 {
-                0 => shuffle5!($a, $b, $c, $d, 4),
-                _ => shuffle5!($a, $b, $c, $d, 5),
-            }
-        };
-    }
-    macro_rules! shuffle3 {
-        ($a:expr, $b:expr, $c:expr) => {
-            match (imm8 >> 3) & 0x1 {
-                0 => shuffle4!($a, $b, $c, 10),
-                _ => shuffle4!($a, $b, $c, 11),
-            }
-        };
-    }
-    macro_rules! shuffle2 {
-        ($a:expr, $b:expr) => {
-            match (imm8 >> 2) & 0x1 {
-                0 => shuffle3!($a, $b, 2),
-                _ => shuffle3!($a, $b, 3),
-            }
-        };
-    }
-    macro_rules! shuffle1 {
-        ($a:expr) => {
-            match (imm8 >> 1) & 0x1 {
-                0 => shuffle2!($a, 8),
-                _ => shuffle2!($a, 9),
-            }
-        };
-    }
-    let shuffle = match imm8 & 0x1 {
-        0 => shuffle1!(0),
-        _ => shuffle1!(1),
-    };
-
+    let r = constify_imm8_sae!(imm8, call);
     let zero = _mm512_setzero_pd().as_f64x8();
-    transmute(simd_select_bitmask(k, shuffle, zero))
+    transmute(simd_select_bitmask(k, r.as_f64x8(), zero))
 }
 
 /// Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -18945,75 +18770,13 @@ pub unsafe fn _mm512_mask_shuffle_i32x4(
     b: __m512i,
     imm8: i32,
 ) -> __m512i {
-    assert!(imm8 >= 0 && imm8 <= 255);
-    let imm8 = (imm8 & 0xFF) as u8;
-    let a = a.as_i32x16();
-    let b = b.as_i32x16();
-    macro_rules! shuffle4 {
-        (
-            $a:expr,
-            $b:expr,
-            $c:expr,
-            $d:expr,
-            $e:expr,
-            $f:expr,
-            $g:expr,
-            $h:expr,
-            $i:expr,
-            $j:expr,
-            $k:expr,
-            $l:expr,
-            $m:expr,
-            $n:expr,
-            $o:expr,
-            $p:expr
-        ) => {
-            simd_shuffle16(
-                a,
-                b,
-                [
-                    $a, $b, $c, $d, $e, $f, $g, $h, $i, $j, $k, $l, $m, $n, $o, $p,
-                ],
-            )
+    macro_rules! call {
+        ($imm8:expr) => {
+            _mm512_shuffle_i32x4(a, b, $imm8)
         };
     }
-    macro_rules! shuffle3 {
-        ($a:expr, $b:expr, $c:expr, $e:expr, $f:expr, $g:expr, $i:expr, $j:expr, $k:expr, $m:expr, $n:expr, $o:expr) => {
-            match (imm8 >> 6) & 0x3 {
-                0 => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 16, 17, 18, 19),
-                1 => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 20, 21, 22, 23),
-                2 => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 24, 25, 26, 27),
-                _ => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 28, 29, 30, 31),
-            }
-        };
-    }
-    macro_rules! shuffle2 {
-        ($a:expr, $b:expr, $e:expr, $f:expr, $i:expr, $j:expr, $m:expr, $n:expr) => {
-            match (imm8 >> 4) & 0x3 {
-                0 => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 16, 17, 18, 19),
-                1 => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 20, 21, 22, 23),
-                2 => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 24, 25, 26, 27),
-                _ => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 28, 29, 30, 31),
-            }
-        };
-    }
-    macro_rules! shuffle1 {
-        ($a:expr, $e:expr, $i: expr, $m: expr) => {
-            match (imm8 >> 2) & 0x3 {
-                0 => shuffle2!($a, $e, $i, $m, 0, 1, 2, 3),
-                1 => shuffle2!($a, $e, $i, $m, 4, 5, 6, 7),
-                2 => shuffle2!($a, $e, $i, $m, 8, 9, 10, 11),
-                _ => shuffle2!($a, $e, $i, $m, 12, 13, 14, 15),
-            }
-        };
-    }
-    let shuffle = match imm8 & 0x3 {
-        0 => shuffle1!(0, 1, 2, 3),
-        1 => shuffle1!(4, 5, 6, 7),
-        2 => shuffle1!(8, 9, 10, 11),
-        _ => shuffle1!(12, 13, 14, 15),
-    };
-    transmute(simd_select_bitmask(k, shuffle, src.as_i32x16()))
+    let r = constify_imm8_sae!(imm8, call);
+    transmute(simd_select_bitmask(k, r.as_i32x16(), src.as_i32x16()))
 }
 
 /// Shuffle 128-bits (composed of 4 32-bit integers) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -19029,76 +18792,14 @@ pub unsafe fn _mm512_maskz_shuffle_i32x4(
     b: __m512i,
     imm8: i32,
 ) -> __m512i {
-    assert!(imm8 >= 0 && imm8 <= 255);
-    let imm8 = (imm8 & 0xFF) as u8;
-    let a = a.as_i32x16();
-    let b = b.as_i32x16();
-    macro_rules! shuffle4 {
-        (
-            $a:expr,
-            $b:expr,
-            $c:expr,
-            $d:expr,
-            $e:expr,
-            $f:expr,
-            $g:expr,
-            $h:expr,
-            $i:expr,
-            $j:expr,
-            $k:expr,
-            $l:expr,
-            $m:expr,
-            $n:expr,
-            $o:expr,
-            $p:expr
-        ) => {
-            simd_shuffle16(
-                a,
-                b,
-                [
-                    $a, $b, $c, $d, $e, $f, $g, $h, $i, $j, $k, $l, $m, $n, $o, $p,
-                ],
-            )
+    macro_rules! call {
+        ($imm8:expr) => {
+            _mm512_shuffle_i32x4(a, b, $imm8)
         };
     }
-    macro_rules! shuffle3 {
-        ($a:expr, $b:expr, $c:expr, $e:expr, $f:expr, $g:expr, $i:expr, $j:expr, $k:expr, $m:expr, $n:expr, $o:expr) => {
-            match (imm8 >> 6) & 0x3 {
-                0 => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 16, 17, 18, 19),
-                1 => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 20, 21, 22, 23),
-                2 => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 24, 25, 26, 27),
-                _ => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 28, 29, 30, 31),
-            }
-        };
-    }
-    macro_rules! shuffle2 {
-        ($a:expr, $b:expr, $e:expr, $f:expr, $i:expr, $j:expr, $m:expr, $n:expr) => {
-            match (imm8 >> 4) & 0x3 {
-                0 => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 16, 17, 18, 19),
-                1 => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 20, 21, 22, 23),
-                2 => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 24, 25, 26, 27),
-                _ => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 28, 29, 30, 31),
-            }
-        };
-    }
-    macro_rules! shuffle1 {
-        ($a:expr, $e:expr, $i: expr, $m: expr) => {
-            match (imm8 >> 2) & 0x3 {
-                0 => shuffle2!($a, $e, $i, $m, 0, 1, 2, 3),
-                1 => shuffle2!($a, $e, $i, $m, 4, 5, 6, 7),
-                2 => shuffle2!($a, $e, $i, $m, 8, 9, 10, 11),
-                _ => shuffle2!($a, $e, $i, $m, 12, 13, 14, 15),
-            }
-        };
-    }
-    let shuffle = match imm8 & 0x3 {
-        0 => shuffle1!(0, 1, 2, 3),
-        1 => shuffle1!(4, 5, 6, 7),
-        2 => shuffle1!(8, 9, 10, 11),
-        _ => shuffle1!(12, 13, 14, 15),
-    };
+    let r = constify_imm8_sae!(imm8, call);
     let zero = _mm512_setzero_si512().as_i32x16();
-    transmute(simd_select_bitmask(k, shuffle, zero))
+    transmute(simd_select_bitmask(k, r.as_i32x16(), zero))
 }
 
 /// Shuffle 128-bits (composed of 4 32-bit integers) selected by imm8 from a and b, and store the results in dst.
@@ -19156,38 +18857,13 @@ pub unsafe fn _mm256_mask_shuffle_i32x4(
     b: __m256i,
     imm8: i32,
 ) -> __m256i {
-    assert!(imm8 >= 0 && imm8 <= 255);
-    let imm8 = (imm8 & 0xFF) as u8;
-    let a = a.as_i32x8();
-    let b = b.as_i32x8();
-    macro_rules! shuffle2 {
-        (
-            $a:expr,
-            $b:expr,
-            $c:expr,
-            $d:expr,
-            $e:expr,
-            $f:expr,
-            $g:expr,
-            $h:expr
-        ) => {
-            simd_shuffle8(a, b, [$a, $b, $c, $d, $e, $f, $g, $h])
+    macro_rules! call {
+        ($imm8:expr) => {
+            _mm256_shuffle_i32x4(a, b, $imm8)
         };
     }
-    macro_rules! shuffle1 {
-        ($a:expr, $b:expr, $c: expr, $d: expr) => {
-            match (imm8 >> 1) & 0x1 {
-                0 => shuffle2!($a, $b, $c, $d, 8, 9, 10, 11),
-                _ => shuffle2!($a, $b, $c, $d, 12, 13, 14, 15),
-            }
-        };
-    }
-    let r: i32x8 = match imm8 & 0x1 {
-        0 => shuffle1!(0, 1, 2, 3),
-        _ => shuffle1!(4, 5, 6, 7),
-    };
-
-    transmute(simd_select_bitmask(k, r, src.as_i32x8()))
+    let r = constify_imm8_sae!(imm8, call);
+    transmute(simd_select_bitmask(k, r.as_i32x8(), src.as_i32x8()))
 }
 
 /// Shuffle 128-bits (composed of 4 32-bit integers) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -19203,38 +18879,14 @@ pub unsafe fn _mm256_maskz_shuffle_i32x4(
     b: __m256i,
     imm8: i32,
 ) -> __m256i {
-    assert!(imm8 >= 0 && imm8 <= 255);
-    let imm8 = (imm8 & 0xFF) as u8;
-    let a = a.as_i32x8();
-    let b = b.as_i32x8();
-    macro_rules! shuffle2 {
-        (
-            $a:expr,
-            $b:expr,
-            $c:expr,
-            $d:expr,
-            $e:expr,
-            $f:expr,
-            $g:expr,
-            $h:expr
-        ) => {
-            simd_shuffle8(a, b, [$a, $b, $c, $d, $e, $f, $g, $h])
+    macro_rules! call {
+        ($imm8:expr) => {
+            _mm256_shuffle_i32x4(a, b, $imm8)
         };
     }
-    macro_rules! shuffle1 {
-        ($a:expr, $b:expr, $c: expr, $d: expr) => {
-            match (imm8 >> 1) & 0x1 {
-                0 => shuffle2!($a, $b, $c, $d, 8, 9, 10, 11),
-                _ => shuffle2!($a, $b, $c, $d, 12, 13, 14, 15),
-            }
-        };
-    }
-    let r: i32x8 = match imm8 & 0x1 {
-        0 => shuffle1!(0, 1, 2, 3),
-        _ => shuffle1!(4, 5, 6, 7),
-    };
+    let r = constify_imm8_sae!(imm8, call);
     let zero = _mm256_setzero_si256().as_i32x8();
-    transmute(simd_select_bitmask(k, r, zero))
+    transmute(simd_select_bitmask(k, r.as_i32x8(), zero))
 }
 
 /// Shuffle 128-bits (composed of 2 64-bit integers) selected by imm8 from a and b, and store the results in dst.
@@ -19313,59 +18965,13 @@ pub unsafe fn _mm512_mask_shuffle_i64x2(
     b: __m512i,
     imm8: i32,
 ) -> __m512i {
-    assert!(imm8 >= 0 && imm8 <= 255);
-    let imm8 = (imm8 & 0xFF) as u8;
-    macro_rules! shuffle4 {
-        (
-            $a:expr,
-            $b:expr,
-            $c:expr,
-            $d:expr,
-            $e:expr,
-            $f:expr,
-            $g:expr,
-            $h:expr
-        ) => {
-            simd_shuffle8(a, b, [$a, $b, $c, $d, $e, $f, $g, $h])
+    macro_rules! call {
+        ($imm8:expr) => {
+            _mm512_shuffle_i64x2(a, b, $imm8)
         };
     }
-    macro_rules! shuffle3 {
-        ($a:expr, $b:expr, $c:expr, $e:expr, $f:expr, $g:expr) => {
-            match (imm8 >> 6) & 0x3 {
-                0 => shuffle4!($a, $b, $c, $e, $f, $g, 8, 9),
-                1 => shuffle4!($a, $b, $c, $e, $f, $g, 10, 11),
-                2 => shuffle4!($a, $b, $c, $e, $f, $g, 12, 13),
-                _ => shuffle4!($a, $b, $c, $e, $f, $g, 14, 15),
-            }
-        };
-    }
-    macro_rules! shuffle2 {
-        ($a:expr, $b:expr, $e:expr, $f:expr) => {
-            match (imm8 >> 4) & 0x3 {
-                0 => shuffle3!($a, $b, $e, $f, 8, 9),
-                1 => shuffle3!($a, $b, $e, $f, 10, 11),
-                2 => shuffle3!($a, $b, $e, $f, 12, 13),
-                _ => shuffle3!($a, $b, $e, $f, 14, 15),
-            }
-        };
-    }
-    macro_rules! shuffle1 {
-        ($a:expr, $e:expr) => {
-            match (imm8 >> 2) & 0x3 {
-                0 => shuffle2!($a, $e, 0, 1),
-                1 => shuffle2!($a, $e, 2, 3),
-                2 => shuffle2!($a, $e, 4, 5),
-                _ => shuffle2!($a, $e, 6, 7),
-            }
-        };
-    }
-    let shuffle = match imm8 & 0x3 {
-        0 => shuffle1!(0, 1),
-        1 => shuffle1!(2, 3),
-        2 => shuffle1!(4, 5),
-        _ => shuffle1!(6, 7),
-    };
-    transmute(simd_select_bitmask(k, shuffle, src.as_i64x8()))
+    let r = constify_imm8_sae!(imm8, call);
+    transmute(simd_select_bitmask(k, r.as_i64x8(), src.as_i64x8()))
 }
 
 /// Shuffle 128-bits (composed of 2 64-bit integers) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -19381,61 +18987,14 @@ pub unsafe fn _mm512_maskz_shuffle_i64x2(
     b: __m512i,
     imm8: i32,
 ) -> __m512i {
-    assert!(imm8 >= 0 && imm8 <= 255);
-    let imm8 = (imm8 & 0xFF) as u8;
-    macro_rules! shuffle4 {
-        (
-            $a:expr,
-            $b:expr,
-            $c:expr,
-            $d:expr,
-            $e:expr,
-            $f:expr,
-            $g:expr,
-            $h:expr
-        ) => {
-            simd_shuffle8(a, b, [$a, $b, $c, $d, $e, $f, $g, $h])
+    macro_rules! call {
+        ($imm8:expr) => {
+            _mm512_shuffle_i64x2(a, b, $imm8)
         };
     }
-    macro_rules! shuffle3 {
-        ($a:expr, $b:expr, $c:expr, $e:expr, $f:expr, $g:expr) => {
-            match (imm8 >> 6) & 0x3 {
-                0 => shuffle4!($a, $b, $c, $e, $f, $g, 8, 9),
-                1 => shuffle4!($a, $b, $c, $e, $f, $g, 10, 11),
-                2 => shuffle4!($a, $b, $c, $e, $f, $g, 12, 13),
-                _ => shuffle4!($a, $b, $c, $e, $f, $g, 14, 15),
-            }
-        };
-    }
-    macro_rules! shuffle2 {
-        ($a:expr, $b:expr, $e:expr, $f:expr) => {
-            match (imm8 >> 4) & 0x3 {
-                0 => shuffle3!($a, $b, $e, $f, 8, 9),
-                1 => shuffle3!($a, $b, $e, $f, 10, 11),
-                2 => shuffle3!($a, $b, $e, $f, 12, 13),
-                _ => shuffle3!($a, $b, $e, $f, 14, 15),
-            }
-        };
-    }
-    macro_rules! shuffle1 {
-        ($a:expr, $e:expr) => {
-            match (imm8 >> 2) & 0x3 {
-                0 => shuffle2!($a, $e, 0, 1),
-                1 => shuffle2!($a, $e, 2, 3),
-                2 => shuffle2!($a, $e, 4, 5),
-                _ => shuffle2!($a, $e, 6, 7),
-            }
-        };
-    }
-    let shuffle = match imm8 & 0x3 {
-        0 => shuffle1!(0, 1),
-        1 => shuffle1!(2, 3),
-        2 => shuffle1!(4, 5),
-        _ => shuffle1!(6, 7),
-    };
-
+    let r = constify_imm8_sae!(imm8, call);
     let zero = _mm512_setzero_si512().as_i64x8();
-    transmute(simd_select_bitmask(k, shuffle, zero))
+    transmute(simd_select_bitmask(k, r.as_i64x8(), zero))
 }
 
 /// Shuffle 128-bits (composed of 2 64-bit integers) selected by imm8 from a and b, and store the results in dst.
@@ -19489,33 +19048,13 @@ pub unsafe fn _mm256_mask_shuffle_i64x2(
     b: __m256i,
     imm8: i32,
 ) -> __m256i {
-    assert!(imm8 >= 0 && imm8 <= 255);
-    let imm8 = (imm8 & 0xFF) as u8;
-    let a = a.as_i64x4();
-    let b = b.as_i64x4();
-    macro_rules! shuffle2 {
-        (
-            $a:expr,
-            $b:expr,
-            $c:expr,
-            $d:expr
-        ) => {
-            simd_shuffle4(a, b, [$a, $b, $c, $d])
+    macro_rules! call {
+        ($imm8:expr) => {
+            _mm256_shuffle_i64x2(a, b, $imm8)
         };
     }
-    macro_rules! shuffle1 {
-        ($a:expr, $b:expr) => {
-            match (imm8 >> 1) & 0x1 {
-                0 => shuffle2!($a, $b, 4, 5),
-                _ => shuffle2!($a, $b, 6, 7),
-            }
-        };
-    }
-    let r: i64x4 = match imm8 & 0x1 {
-        0 => shuffle1!(0, 1),
-        _ => shuffle1!(2, 3),
-    };
-    transmute(simd_select_bitmask(k, r, src.as_i64x4()))
+    let r = constify_imm8_sae!(imm8, call);
+    transmute(simd_select_bitmask(k, r.as_i64x4(), src.as_i64x4()))
 }
 
 /// Shuffle 128-bits (composed of 2 64-bit integers) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -19531,34 +19070,14 @@ pub unsafe fn _mm256_maskz_shuffle_i64x2(
     b: __m256i,
     imm8: i32,
 ) -> __m256i {
-    assert!(imm8 >= 0 && imm8 <= 255);
-    let imm8 = (imm8 & 0xFF) as u8;
-    let a = a.as_i64x4();
-    let b = b.as_i64x4();
-    macro_rules! shuffle2 {
-        (
-            $a:expr,
-            $b:expr,
-            $c:expr,
-            $d:expr
-        ) => {
-            simd_shuffle4(a, b, [$a, $b, $c, $d])
+    macro_rules! call {
+        ($imm8:expr) => {
+            _mm256_shuffle_i64x2(a, b, $imm8)
         };
     }
-    macro_rules! shuffle1 {
-        ($a:expr, $b:expr) => {
-            match (imm8 >> 1) & 0x1 {
-                0 => shuffle2!($a, $b, 4, 5),
-                _ => shuffle2!($a, $b, 6, 7),
-            }
-        };
-    }
-    let r: i64x4 = match imm8 & 0x1 {
-        0 => shuffle1!(0, 1),
-        _ => shuffle1!(2, 3),
-    };
+    let r = constify_imm8_sae!(imm8, call);
     let zero = _mm256_setzero_si256().as_i64x4();
-    transmute(simd_select_bitmask(k, r, zero))
+    transmute(simd_select_bitmask(k, r.as_i64x4(), zero))
 }
 
 /// Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst.
@@ -19651,73 +19170,13 @@ pub unsafe fn _mm512_mask_shuffle_f32x4(
     b: __m512,
     imm8: i32,
 ) -> __m512 {
-    assert!(imm8 >= 0 && imm8 <= 255);
-    let imm8 = (imm8 & 0xFF) as u8;
-    macro_rules! shuffle4 {
-        (
-            $a:expr,
-            $b:expr,
-            $c:expr,
-            $d:expr,
-            $e:expr,
-            $f:expr,
-            $g:expr,
-            $h:expr,
-            $i:expr,
-            $j:expr,
-            $k:expr,
-            $l:expr,
-            $m:expr,
-            $n:expr,
-            $o:expr,
-            $p:expr
-        ) => {
-            simd_shuffle16(
-                a,
-                b,
-                [
-                    $a, $b, $c, $d, $e, $f, $g, $h, $i, $j, $k, $l, $m, $n, $o, $p,
-                ],
-            )
+    macro_rules! call {
+        ($imm8:expr) => {
+            _mm512_shuffle_f32x4(a, b, $imm8)
         };
     }
-    macro_rules! shuffle3 {
-        ($a:expr, $b:expr, $c:expr, $e:expr, $f:expr, $g:expr, $i:expr, $j:expr, $k:expr, $m:expr, $n:expr, $o:expr) => {
-            match (imm8 >> 6) & 0x3 {
-                0 => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 16, 17, 18, 19),
-                1 => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 20, 21, 22, 23),
-                2 => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 24, 25, 26, 27),
-                _ => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 28, 29, 30, 31),
-            }
-        };
-    }
-    macro_rules! shuffle2 {
-        ($a:expr, $b:expr, $e:expr, $f:expr, $i:expr, $j:expr, $m:expr, $n:expr) => {
-            match (imm8 >> 4) & 0x3 {
-                0 => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 16, 17, 18, 19),
-                1 => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 20, 21, 22, 23),
-                2 => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 24, 25, 26, 27),
-                _ => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 28, 29, 30, 31),
-            }
-        };
-    }
-    macro_rules! shuffle1 {
-        ($a:expr, $e:expr, $i: expr, $m: expr) => {
-            match (imm8 >> 2) & 0x3 {
-                0 => shuffle2!($a, $e, $i, $m, 0, 1, 2, 3),
-                1 => shuffle2!($a, $e, $i, $m, 4, 5, 6, 7),
-                2 => shuffle2!($a, $e, $i, $m, 8, 9, 10, 11),
-                _ => shuffle2!($a, $e, $i, $m, 12, 13, 14, 15),
-            }
-        };
-    }
-    let shuffle = match imm8 & 0x3 {
-        0 => shuffle1!(0, 1, 2, 3),
-        1 => shuffle1!(4, 5, 6, 7),
-        2 => shuffle1!(8, 9, 10, 11),
-        _ => shuffle1!(12, 13, 14, 15),
-    };
-    transmute(simd_select_bitmask(k, shuffle, src.as_f32x16()))
+    let r = constify_imm8_sae!(imm8, call);
+    transmute(simd_select_bitmask(k, r.as_f32x16(), src.as_f32x16()))
 }
 
 /// Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -19728,75 +19187,14 @@ pub unsafe fn _mm512_mask_shuffle_f32x4(
 #[cfg_attr(test, assert_instr(vshuff32x4, imm8 = 0b10111111))]
 #[rustc_args_required_const(3)]
 pub unsafe fn _mm512_maskz_shuffle_f32x4(k: __mmask16, a: __m512, b: __m512, imm8: i32) -> __m512 {
-    assert!(imm8 >= 0 && imm8 <= 255);
-    let imm8 = (imm8 & 0xFF) as u8;
-    macro_rules! shuffle4 {
-        (
-            $a:expr,
-            $b:expr,
-            $c:expr,
-            $d:expr,
-            $e:expr,
-            $f:expr,
-            $g:expr,
-            $h:expr,
-            $i:expr,
-            $j:expr,
-            $k:expr,
-            $l:expr,
-            $m:expr,
-            $n:expr,
-            $o:expr,
-            $p:expr
-        ) => {
-            simd_shuffle16(
-                a,
-                b,
-                [
-                    $a, $b, $c, $d, $e, $f, $g, $h, $i, $j, $k, $l, $m, $n, $o, $p,
-                ],
-            )
+    macro_rules! call {
+        ($imm8:expr) => {
+            _mm512_shuffle_f32x4(a, b, $imm8)
         };
     }
-    macro_rules! shuffle3 {
-        ($a:expr, $b:expr, $c:expr, $e:expr, $f:expr, $g:expr, $i:expr, $j:expr, $k:expr, $m:expr, $n:expr, $o:expr) => {
-            match (imm8 >> 6) & 0x3 {
-                0 => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 16, 17, 18, 19),
-                1 => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 20, 21, 22, 23),
-                2 => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 24, 25, 26, 27),
-                _ => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 28, 29, 30, 31),
-            }
-        };
-    }
-    macro_rules! shuffle2 {
-        ($a:expr, $b:expr, $e:expr, $f:expr, $i:expr, $j:expr, $m:expr, $n:expr) => {
-            match (imm8 >> 4) & 0x3 {
-                0 => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 16, 17, 18, 19),
-                1 => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 20, 21, 22, 23),
-                2 => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 24, 25, 26, 27),
-                _ => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 28, 29, 30, 31),
-            }
-        };
-    }
-    macro_rules! shuffle1 {
-        ($a:expr, $e:expr, $i: expr, $m: expr) => {
-            match (imm8 >> 2) & 0x3 {
-                0 => shuffle2!($a, $e, $i, $m, 0, 1, 2, 3),
-                1 => shuffle2!($a, $e, $i, $m, 4, 5, 6, 7),
-                2 => shuffle2!($a, $e, $i, $m, 8, 9, 10, 11),
-                _ => shuffle2!($a, $e, $i, $m, 12, 13, 14, 15),
-            }
-        };
-    }
-    let shuffle = match imm8 & 0x3 {
-        0 => shuffle1!(0, 1, 2, 3),
-        1 => shuffle1!(4, 5, 6, 7),
-        2 => shuffle1!(8, 9, 10, 11),
-        _ => shuffle1!(12, 13, 14, 15),
-    };
-
+    let r = constify_imm8_sae!(imm8, call);
     let zero = _mm512_setzero_ps().as_f32x16();
-    transmute(simd_select_bitmask(k, shuffle, zero))
+    transmute(simd_select_bitmask(k, r.as_f32x16(), zero))
 }
 
 /// Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst.
@@ -19855,37 +19253,13 @@ pub unsafe fn _mm256_mask_shuffle_f32x4(
     b: __m256,
     imm8: i32,
 ) -> __m256 {
-    assert!(imm8 >= 0 && imm8 <= 255);
-    let imm8 = (imm8 & 0xFF) as u8;
-    let a = a.as_f32x8();
-    let b = b.as_f32x8();
-    macro_rules! shuffle2 {
-        (
-            $a:expr,
-            $b:expr,
-            $c:expr,
-            $d:expr,
-            $e:expr,
-            $f:expr,
-            $g:expr,
-            $h:expr
-        ) => {
-            simd_shuffle8(a, b, [$a, $b, $c, $d, $e, $f, $g, $h])
+    macro_rules! call {
+        ($imm8:expr) => {
+            _mm256_shuffle_f32x4(a, b, $imm8)
         };
     }
-    macro_rules! shuffle1 {
-        ($a:expr, $b:expr, $c: expr, $d: expr) => {
-            match (imm8 >> 1) & 0x1 {
-                0 => shuffle2!($a, $b, $c, $d, 8, 9, 10, 11),
-                _ => shuffle2!($a, $b, $c, $d, 12, 13, 14, 15),
-            }
-        };
-    }
-    let r: f32x8 = match imm8 & 0x1 {
-        0 => shuffle1!(0, 1, 2, 3),
-        _ => shuffle1!(4, 5, 6, 7),
-    };
-    transmute(simd_select_bitmask(k, r, src.as_f32x8()))
+    let r = constify_imm8_sae!(imm8, call);
+    transmute(simd_select_bitmask(k, r.as_f32x8(), src.as_f32x8()))
 }
 
 /// Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -19896,38 +19270,14 @@ pub unsafe fn _mm256_mask_shuffle_f32x4(
 #[cfg_attr(test, assert_instr(vshuff32x4, imm8 = 0b11))]
 #[rustc_args_required_const(3)]
 pub unsafe fn _mm256_maskz_shuffle_f32x4(k: __mmask8, a: __m256, b: __m256, imm8: i32) -> __m256 {
-    assert!(imm8 >= 0 && imm8 <= 255);
-    let imm8 = (imm8 & 0xFF) as u8;
-    let a = a.as_f32x8();
-    let b = b.as_f32x8();
-    macro_rules! shuffle2 {
-        (
-            $a:expr,
-            $b:expr,
-            $c:expr,
-            $d:expr,
-            $e:expr,
-            $f:expr,
-            $g:expr,
-            $h:expr
-        ) => {
-            simd_shuffle8(a, b, [$a, $b, $c, $d, $e, $f, $g, $h])
+    macro_rules! call {
+        ($imm8:expr) => {
+            _mm256_shuffle_f32x4(a, b, $imm8)
         };
     }
-    macro_rules! shuffle1 {
-        ($a:expr, $b:expr, $c: expr, $d: expr) => {
-            match (imm8 >> 1) & 0x1 {
-                0 => shuffle2!($a, $b, $c, $d, 8, 9, 10, 11),
-                _ => shuffle2!($a, $b, $c, $d, 12, 13, 14, 15),
-            }
-        };
-    }
-    let r: f32x8 = match imm8 & 0x1 {
-        0 => shuffle1!(0, 1, 2, 3),
-        _ => shuffle1!(4, 5, 6, 7),
-    };
+    let r = constify_imm8_sae!(imm8, call);
     let zero = _mm256_setzero_ps().as_f32x8();
-    transmute(simd_select_bitmask(k, r, zero))
+    transmute(simd_select_bitmask(k, r.as_f32x8(), zero))
 }
 
 /// Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst.
@@ -20006,59 +19356,13 @@ pub unsafe fn _mm512_mask_shuffle_f64x2(
     b: __m512d,
     imm8: i32,
 ) -> __m512d {
-    assert!(imm8 >= 0 && imm8 <= 255);
-    let imm8 = (imm8 & 0xFF) as u8;
-    macro_rules! shuffle4 {
-        (
-            $a:expr,
-            $b:expr,
-            $c:expr,
-            $d:expr,
-            $e:expr,
-            $f:expr,
-            $g:expr,
-            $h:expr
-        ) => {
-            simd_shuffle8(a, b, [$a, $b, $c, $d, $e, $f, $g, $h])
+    macro_rules! call {
+        ($imm8:expr) => {
+            _mm512_shuffle_f64x2(a, b, $imm8)
         };
     }
-    macro_rules! shuffle3 {
-        ($a:expr, $b:expr, $c:expr, $e:expr, $f:expr, $g:expr) => {
-            match (imm8 >> 6) & 0x3 {
-                0 => shuffle4!($a, $b, $c, $e, $f, $g, 8, 9),
-                1 => shuffle4!($a, $b, $c, $e, $f, $g, 10, 11),
-                2 => shuffle4!($a, $b, $c, $e, $f, $g, 12, 13),
-                _ => shuffle4!($a, $b, $c, $e, $f, $g, 14, 15),
-            }
-        };
-    }
-    macro_rules! shuffle2 {
-        ($a:expr, $b:expr, $e:expr, $f:expr) => {
-            match (imm8 >> 4) & 0x3 {
-                0 => shuffle3!($a, $b, $e, $f, 8, 9),
-                1 => shuffle3!($a, $b, $e, $f, 10, 11),
-                2 => shuffle3!($a, $b, $e, $f, 12, 13),
-                _ => shuffle3!($a, $b, $e, $f, 14, 15),
-            }
-        };
-    }
-    macro_rules! shuffle1 {
-        ($a:expr, $e:expr) => {
-            match (imm8 >> 2) & 0x3 {
-                0 => shuffle2!($a, $e, 0, 1),
-                1 => shuffle2!($a, $e, 2, 3),
-                2 => shuffle2!($a, $e, 4, 5),
-                _ => shuffle2!($a, $e, 6, 7),
-            }
-        };
-    }
-    let shuffle = match imm8 & 0x3 {
-        0 => shuffle1!(0, 1),
-        1 => shuffle1!(2, 3),
-        2 => shuffle1!(4, 5),
-        _ => shuffle1!(6, 7),
-    };
-    transmute(simd_select_bitmask(k, shuffle, src.as_f64x8()))
+    let r = constify_imm8_sae!(imm8, call);
+    transmute(simd_select_bitmask(k, r.as_f64x8(), src.as_f64x8()))
 }
 
 /// Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -20074,60 +19378,14 @@ pub unsafe fn _mm512_maskz_shuffle_f64x2(
     b: __m512d,
     imm8: i32,
 ) -> __m512d {
-    assert!(imm8 >= 0 && imm8 <= 255);
-    let imm8 = (imm8 & 0xFF) as u8;
-    macro_rules! shuffle4 {
-        (
-            $a:expr,
-            $b:expr,
-            $c:expr,
-            $d:expr,
-            $e:expr,
-            $f:expr,
-            $g:expr,
-            $h:expr
-        ) => {
-            simd_shuffle8(a, b, [$a, $b, $c, $d, $e, $f, $g, $h])
+    macro_rules! call {
+        ($imm8:expr) => {
+            _mm512_shuffle_f64x2(a, b, $imm8)
         };
     }
-    macro_rules! shuffle3 {
-        ($a:expr, $b:expr, $c:expr, $e:expr, $f:expr, $g:expr) => {
-            match (imm8 >> 6) & 0x3 {
-                0 => shuffle4!($a, $b, $c, $e, $f, $g, 8, 9),
-                1 => shuffle4!($a, $b, $c, $e, $f, $g, 10, 11),
-                2 => shuffle4!($a, $b, $c, $e, $f, $g, 12, 13),
-                _ => shuffle4!($a, $b, $c, $e, $f, $g, 14, 15),
-            }
-        };
-    }
-    macro_rules! shuffle2 {
-        ($a:expr, $b:expr, $e:expr, $f:expr) => {
-            match (imm8 >> 4) & 0x3 {
-                0 => shuffle3!($a, $b, $e, $f, 8, 9),
-                1 => shuffle3!($a, $b, $e, $f, 10, 11),
-                2 => shuffle3!($a, $b, $e, $f, 12, 13),
-                _ => shuffle3!($a, $b, $e, $f, 14, 15),
-            }
-        };
-    }
-    macro_rules! shuffle1 {
-        ($a:expr, $e:expr) => {
-            match (imm8 >> 2) & 0x3 {
-                0 => shuffle2!($a, $e, 0, 1),
-                1 => shuffle2!($a, $e, 2, 3),
-                2 => shuffle2!($a, $e, 4, 5),
-                _ => shuffle2!($a, $e, 6, 7),
-            }
-        };
-    }
-    let shuffle = match imm8 & 0x3 {
-        0 => shuffle1!(0, 1),
-        1 => shuffle1!(2, 3),
-        2 => shuffle1!(4, 5),
-        _ => shuffle1!(6, 7),
-    };
+    let r = constify_imm8_sae!(imm8, call);
     let zero = _mm512_setzero_pd().as_f64x8();
-    transmute(simd_select_bitmask(k, shuffle, zero))
+    transmute(simd_select_bitmask(k, r.as_f64x8(), zero))
 }
 
 /// Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst.
@@ -20181,33 +19439,13 @@ pub unsafe fn _mm256_mask_shuffle_f64x2(
     b: __m256d,
     imm8: i32,
 ) -> __m256d {
-    assert!(imm8 >= 0 && imm8 <= 255);
-    let imm8 = (imm8 & 0xFF) as u8;
-    let a = a.as_f64x4();
-    let b = b.as_f64x4();
-    macro_rules! shuffle2 {
-        (
-            $a:expr,
-            $b:expr,
-            $c:expr,
-            $d:expr
-        ) => {
-            simd_shuffle4(a, b, [$a, $b, $c, $d])
+    macro_rules! call {
+        ($imm8:expr) => {
+            _mm256_shuffle_f64x2(a, b, $imm8)
         };
     }
-    macro_rules! shuffle1 {
-        ($a:expr, $b:expr) => {
-            match (imm8 >> 1) & 0x1 {
-                0 => shuffle2!($a, $b, 4, 5),
-                _ => shuffle2!($a, $b, 6, 7),
-            }
-        };
-    }
-    let r: f64x4 = match imm8 & 0x1 {
-        0 => shuffle1!(0, 1),
-        _ => shuffle1!(2, 3),
-    };
-    transmute(simd_select_bitmask(k, r, src.as_f64x4()))
+    let r = constify_imm8_sae!(imm8, call);
+    transmute(simd_select_bitmask(k, r.as_f64x4(), src.as_f64x4()))
 }
 
 /// Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -20223,34 +19461,14 @@ pub unsafe fn _mm256_maskz_shuffle_f64x2(
     b: __m256d,
     imm8: i32,
 ) -> __m256d {
-    assert!(imm8 >= 0 && imm8 <= 255);
-    let imm8 = (imm8 & 0xFF) as u8;
-    let a = a.as_f64x4();
-    let b = b.as_f64x4();
-    macro_rules! shuffle2 {
-        (
-            $a:expr,
-            $b:expr,
-            $c:expr,
-            $d:expr
-        ) => {
-            simd_shuffle4(a, b, [$a, $b, $c, $d])
+    macro_rules! call {
+        ($imm8:expr) => {
+            _mm256_shuffle_f64x2(a, b, $imm8)
         };
     }
-    macro_rules! shuffle1 {
-        ($a:expr, $b:expr) => {
-            match (imm8 >> 1) & 0x1 {
-                0 => shuffle2!($a, $b, 4, 5),
-                _ => shuffle2!($a, $b, 6, 7),
-            }
-        };
-    }
-    let r: f64x4 = match imm8 & 0x1 {
-        0 => shuffle1!(0, 1),
-        _ => shuffle1!(2, 3),
-    };
+    let r = constify_imm8_sae!(imm8, call);
     let zero = _mm256_setzero_pd().as_f64x4();
-    transmute(simd_select_bitmask(k, r, zero))
+    transmute(simd_select_bitmask(k, r.as_f64x4(), zero))
 }
 
 /// Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from a, selected with imm8, and store the result in dst.
@@ -20289,14 +19507,13 @@ pub unsafe fn _mm512_mask_extractf32x4_ps(
     a: __m512,
     imm8: i32,
 ) -> __m128 {
-    assert!(imm8 >= 0 && imm8 <= 3);
-    let extract: __m128 = match imm8 & 0x3 {
-        0 => simd_shuffle4(a, _mm512_undefined_ps(), [0, 1, 2, 3]),
-        1 => simd_shuffle4(a, _mm512_undefined_ps(), [4, 5, 6, 7]),
-        2 => simd_shuffle4(a, _mm512_undefined_ps(), [8, 9, 10, 11]),
-        _ => simd_shuffle4(a, _mm512_undefined_ps(), [12, 13, 14, 15]),
-    };
-    transmute(simd_select_bitmask(k, extract.as_f32x4(), src.as_f32x4()))
+    macro_rules! call {
+        ($imm8:expr) => {
+            _mm512_extractf32x4_ps(a, $imm8)
+        };
+    }
+    let r = constify_imm8_sae!(imm8, call);
+    transmute(simd_select_bitmask(k, r.as_f32x4(), src.as_f32x4()))
 }
 
 /// Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from a, selected with imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -20310,15 +19527,14 @@ pub unsafe fn _mm512_mask_extractf32x4_ps(
 )]
 #[rustc_args_required_const(2)]
 pub unsafe fn _mm512_maskz_extractf32x4_ps(k: __mmask8, a: __m512, imm8: i32) -> __m128 {
-    assert!(imm8 >= 0 && imm8 <= 3);
-    let extract: __m128 = match imm8 & 0x3 {
-        0 => simd_shuffle4(a, _mm512_undefined_ps(), [0, 1, 2, 3]),
-        1 => simd_shuffle4(a, _mm512_undefined_ps(), [4, 5, 6, 7]),
-        2 => simd_shuffle4(a, _mm512_undefined_ps(), [8, 9, 10, 11]),
-        _ => simd_shuffle4(a, _mm512_undefined_ps(), [12, 13, 14, 15]),
-    };
+    macro_rules! call {
+        ($imm8:expr) => {
+            _mm512_extractf32x4_ps(a, $imm8)
+        };
+    }
+    let r = constify_imm8_sae!(imm8, call);
     let zero = _mm_setzero_ps().as_f32x4();
-    transmute(simd_select_bitmask(k, extract.as_f32x4(), zero))
+    transmute(simd_select_bitmask(k, r.as_f32x4(), zero))
 }
 
 /// Extract 256 bits (composed of 4 packed 64-bit integers) from a, selected with imm8, and store the result in dst.
@@ -20355,12 +19571,13 @@ pub unsafe fn _mm512_mask_extracti64x4_epi64(
     a: __m512i,
     imm8: i32,
 ) -> __m256i {
-    assert!(imm8 >= 0 && imm8 <= 1);
-    let extract = match imm8 & 0x1 {
-        0 => simd_shuffle4(a, _mm512_set1_epi64(0), [0, 1, 2, 3]),
-        _ => simd_shuffle4(a, _mm512_set1_epi64(0), [4, 5, 6, 7]),
-    };
-    transmute(simd_select_bitmask(k, extract, src.as_i64x4()))
+    macro_rules! call {
+        ($imm8:expr) => {
+            _mm512_extracti64x4_epi64(a, $imm8)
+        };
+    }
+    let r = constify_imm8_sae!(imm8, call);
+    transmute(simd_select_bitmask(k, r.as_i64x4(), src.as_i64x4()))
 }
 
 /// Extract 256 bits (composed of 4 packed 64-bit integers) from a, selected with imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -20374,13 +19591,14 @@ pub unsafe fn _mm512_mask_extracti64x4_epi64(
 )]
 #[rustc_args_required_const(2)]
 pub unsafe fn _mm512_maskz_extracti64x4_epi64(k: __mmask8, a: __m512i, imm8: i32) -> __m256i {
-    assert!(imm8 >= 0 && imm8 <= 1);
-    let extract: __m256i = match imm8 & 0x1 {
-        0 => simd_shuffle4(a, _mm512_set1_epi64(0), [0, 1, 2, 3]),
-        _ => simd_shuffle4(a, _mm512_set1_epi64(0), [4, 5, 6, 7]),
-    };
+    macro_rules! call {
+        ($imm8:expr) => {
+            _mm512_extracti64x4_epi64(a, $imm8)
+        };
+    }
+    let r = constify_imm8_sae!(imm8, call);
     let zero = _mm256_setzero_si256().as_i64x4();
-    transmute(simd_select_bitmask(k, extract.as_i64x4(), zero))
+    transmute(simd_select_bitmask(k, r.as_i64x4(), zero))
 }
 
 /// Extract 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from a, selected with imm8, and store the result in dst.
@@ -20417,12 +19635,13 @@ pub unsafe fn _mm512_mask_extractf64x4_pd(
     a: __m512d,
     imm8: i32,
 ) -> __m256d {
-    assert!(imm8 >= 0 && imm8 <= 1);
-    let extract = match imm8 & 0x1 {
-        0 => simd_shuffle4(a, _mm512_undefined_pd(), [0, 1, 2, 3]),
-        _ => simd_shuffle4(a, _mm512_undefined_pd(), [4, 5, 6, 7]),
-    };
-    transmute(simd_select_bitmask(k, extract, src))
+    macro_rules! call {
+        ($imm8:expr) => {
+            _mm512_extractf64x4_pd(a, $imm8)
+        };
+    }
+    let r = constify_imm8_sae!(imm8, call);
+    transmute(simd_select_bitmask(k, r.as_f64x4(), src.as_f64x4()))
 }
 
 /// Extract 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from a, selected with imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -20436,13 +19655,14 @@ pub unsafe fn _mm512_mask_extractf64x4_pd(
 )]
 #[rustc_args_required_const(2)]
 pub unsafe fn _mm512_maskz_extractf64x4_pd(k: __mmask8, a: __m512d, imm8: i32) -> __m256d {
-    assert!(imm8 >= 0 && imm8 <= 1);
-    let extract = match imm8 & 0x1 {
-        0 => simd_shuffle4(a, _mm512_undefined_pd(), [0, 1, 2, 3]),
-        _ => simd_shuffle4(a, _mm512_undefined_pd(), [4, 5, 6, 7]),
-    };
-    let zero = _mm256_setzero_pd();
-    transmute(simd_select_bitmask(k, extract, zero))
+    macro_rules! call {
+        ($imm8:expr) => {
+            _mm512_extractf64x4_pd(a, $imm8)
+        };
+    }
+    let r = constify_imm8_sae!(imm8, call);
+    let zero = _mm256_setzero_pd().as_f64x4();
+    transmute(simd_select_bitmask(k, r.as_f64x4(), zero))
 }
 
 /// Extract 128 bits (composed of 4 packed 32-bit integers) from a, selected with imm8, and store the result in dst.
@@ -20484,16 +19704,13 @@ pub unsafe fn _mm512_mask_extracti32x4_epi32(
     a: __m512i,
     imm8: i32,
 ) -> __m128i {
-    assert!(imm8 >= 0 && imm8 <= 3);
-    let a = a.as_i32x16();
-    let undefined = _mm512_undefined_epi32().as_i32x16();
-    let extract: i32x4 = match imm8 & 0x3 {
-        0 => simd_shuffle4(a, undefined, [0, 1, 2, 3]),
-        1 => simd_shuffle4(a, undefined, [4, 5, 6, 7]),
-        2 => simd_shuffle4(a, undefined, [8, 9, 10, 11]),
-        _ => simd_shuffle4(a, undefined, [12, 13, 14, 15]),
-    };
-    transmute(simd_select_bitmask(k, extract, src.as_i32x4()))
+    macro_rules! call {
+        ($imm8:expr) => {
+            _mm512_extracti32x4_epi32(a, $imm8)
+        };
+    }
+    let r = constify_imm8_sae!(imm8, call);
+    transmute(simd_select_bitmask(k, r.as_i32x4(), src.as_i32x4()))
 }
 
 /// Extract 128 bits (composed of 4 packed 32-bit integers) from a, selected with imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -20507,17 +19724,14 @@ pub unsafe fn _mm512_mask_extracti32x4_epi32(
 )]
 #[rustc_args_required_const(2)]
 pub unsafe fn _mm512_maskz_extracti32x4_epi32(k: __mmask8, a: __m512i, imm8: i32) -> __m128i {
-    assert!(imm8 >= 0 && imm8 <= 3);
-    let a = a.as_i32x16();
-    let undefined = _mm512_undefined_epi32().as_i32x16();
-    let extract: i32x4 = match imm8 & 0x3 {
-        0 => simd_shuffle4(a, undefined, [0, 1, 2, 3]),
-        1 => simd_shuffle4(a, undefined, [4, 5, 6, 7]),
-        2 => simd_shuffle4(a, undefined, [8, 9, 10, 11]),
-        _ => simd_shuffle4(a, undefined, [12, 13, 14, 15]),
-    };
+    macro_rules! call {
+        ($imm8:expr) => {
+            _mm512_extracti32x4_epi32(a, $imm8)
+        };
+    }
+    let r = constify_imm8_sae!(imm8, call);
     let zero = _mm_setzero_si128().as_i32x4();
-    transmute(simd_select_bitmask(k, extract, zero))
+    transmute(simd_select_bitmask(k, r.as_i32x4(), zero))
 }
 
 /// Duplicate even-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst.
@@ -20806,28 +20020,13 @@ pub unsafe fn _mm512_mask_inserti32x4(
     b: __m128i,
     imm8: i32,
 ) -> __m512i {
-    assert!(imm8 >= 0 && imm8 <= 3);
-    let a = a.as_i32x16();
-    let b = _mm512_castsi128_si512(b).as_i32x16();
-    let insert: i32x16 = match imm8 & 0b11 {
-        0 => simd_shuffle16(
-            a,
-            b,
-            [16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
-        ),
-        1 => simd_shuffle16(
-            a,
-            b,
-            [0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 12, 13, 14, 15],
-        ),
-        2 => simd_shuffle16(
-            a,
-            b,
-            [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15],
-        ),
-        _ => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19]),
-    };
-    transmute(simd_select_bitmask(k, insert, src.as_i32x16()))
+    macro_rules! call {
+        ($imm8:expr) => {
+            _mm512_inserti32x4(a, b, $imm8)
+        };
+    }
+    let r = constify_imm8_sae!(imm8, call);
+    transmute(simd_select_bitmask(k, r.as_i32x16(), src.as_i32x16()))
 }
 
 /// Copy a to tmp, then insert 128 bits (composed of 4 packed 32-bit integers) from b into tmp at the location specified by imm8. Store tmp to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -20838,29 +20037,14 @@ pub unsafe fn _mm512_mask_inserti32x4(
 #[cfg_attr(test, assert_instr(vinserti32x4, imm8 = 2))]
 #[rustc_args_required_const(3)]
 pub unsafe fn _mm512_maskz_inserti32x4(k: __mmask16, a: __m512i, b: __m128i, imm8: i32) -> __m512i {
-    assert!(imm8 >= 0 && imm8 <= 3);
-    let a = a.as_i32x16();
-    let b = _mm512_castsi128_si512(b).as_i32x16();
-    let insert = match imm8 & 0b11 {
-        0 => simd_shuffle16(
-            a,
-            b,
-            [16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
-        ),
-        1 => simd_shuffle16(
-            a,
-            b,
-            [0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 12, 13, 14, 15],
-        ),
-        2 => simd_shuffle16(
-            a,
-            b,
-            [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15],
-        ),
-        _ => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19]),
-    };
+    macro_rules! call {
+        ($imm8:expr) => {
+            _mm512_inserti32x4(a, b, $imm8)
+        };
+    }
+    let r = constify_imm8_sae!(imm8, call);
     let zero = _mm512_setzero_si512().as_i32x16();
-    transmute(simd_select_bitmask(k, insert, zero))
+    transmute(simd_select_bitmask(k, r.as_i32x16(), zero))
 }
 
 /// Copy a to dst, then insert 256 bits (composed of 4 packed 64-bit integers) from b into dst at the location specified by imm8.
@@ -20893,13 +20077,13 @@ pub unsafe fn _mm512_mask_inserti64x4(
     b: __m256i,
     imm8: i32,
 ) -> __m512i {
-    assert!(imm8 >= 0 && imm8 <= 1);
-    let b = _mm512_castsi256_si512(b);
-    let insert = match imm8 & 0b1 {
-        0 => simd_shuffle8(a, b, [8, 9, 10, 11, 4, 5, 6, 7]),
-        _ => simd_shuffle8(a, b, [0, 1, 2, 3, 8, 9, 10, 11]),
-    };
-    transmute(simd_select_bitmask(k, insert, src.as_i64x8()))
+    macro_rules! call {
+        ($imm8:expr) => {
+            _mm512_inserti64x4(a, b, $imm8)
+        };
+    }
+    let r = constify_imm8_sae!(imm8, call);
+    transmute(simd_select_bitmask(k, r.as_i64x8(), src.as_i64x8()))
 }
 
 /// Copy a to tmp, then insert 256 bits (composed of 4 packed 64-bit integers) from b into tmp at the location specified by imm8. Store tmp to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -20910,14 +20094,14 @@ pub unsafe fn _mm512_mask_inserti64x4(
 #[cfg_attr(test, assert_instr(vinserti64x4, imm8 = 1))]
 #[rustc_args_required_const(3)]
 pub unsafe fn _mm512_maskz_inserti64x4(k: __mmask8, a: __m512i, b: __m256i, imm8: i32) -> __m512i {
-    assert!(imm8 >= 0 && imm8 <= 1);
-    let b = _mm512_castsi256_si512(b);
-    let insert = match imm8 & 0b1 {
-        0 => simd_shuffle8(a, b, [8, 9, 10, 11, 4, 5, 6, 7]),
-        _ => simd_shuffle8(a, b, [0, 1, 2, 3, 8, 9, 10, 11]),
-    };
+    macro_rules! call {
+        ($imm8:expr) => {
+            _mm512_inserti64x4(a, b, $imm8)
+        };
+    }
+    let r = constify_imm8_sae!(imm8, call);
     let zero = _mm512_setzero_si512().as_i64x8();
-    transmute(simd_select_bitmask(k, insert, zero))
+    transmute(simd_select_bitmask(k, r.as_i64x8(), zero))
 }
 
 /// Copy a to dst, then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from b into dst at the location specified by imm8.
@@ -20964,27 +20148,13 @@ pub unsafe fn _mm512_mask_insertf32x4(
     b: __m128,
     imm8: i32,
 ) -> __m512 {
-    assert!(imm8 >= 0 && imm8 <= 3);
-    let b = _mm512_castps128_ps512(b);
-    let insert = match imm8 & 0b11 {
-        0 => simd_shuffle16(
-            a,
-            b,
-            [16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
-        ),
-        1 => simd_shuffle16(
-            a,
-            b,
-            [0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 12, 13, 14, 15],
-        ),
-        2 => simd_shuffle16(
-            a,
-            b,
-            [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15],
-        ),
-        _ => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19]),
-    };
-    transmute(simd_select_bitmask(k, insert, src.as_f32x16()))
+    macro_rules! call {
+        ($imm8:expr) => {
+            _mm512_insertf32x4(a, b, $imm8)
+        };
+    }
+    let r = constify_imm8_sae!(imm8, call);
+    transmute(simd_select_bitmask(k, r.as_f32x16(), src.as_f32x16()))
 }
 
 /// Copy a to tmp, then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from b into tmp at the location specified by imm8. Store tmp to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -20995,28 +20165,14 @@ pub unsafe fn _mm512_mask_insertf32x4(
 #[cfg_attr(test, assert_instr(vinsertf32x4, imm8 = 2))]
 #[rustc_args_required_const(3)]
 pub unsafe fn _mm512_maskz_insertf32x4(k: __mmask16, a: __m512, b: __m128, imm8: i32) -> __m512 {
-    assert!(imm8 >= 0 && imm8 <= 3);
-    let b = _mm512_castps128_ps512(b);
-    let insert = match imm8 & 0b11 {
-        0 => simd_shuffle16(
-            a,
-            b,
-            [16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
-        ),
-        1 => simd_shuffle16(
-            a,
-            b,
-            [0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 12, 13, 14, 15],
-        ),
-        2 => simd_shuffle16(
-            a,
-            b,
-            [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15],
-        ),
-        _ => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19]),
-    };
+    macro_rules! call {
+        ($imm8:expr) => {
+            _mm512_insertf32x4(a, b, $imm8)
+        };
+    }
+    let r = constify_imm8_sae!(imm8, call);
     let zero = _mm512_setzero_ps().as_f32x16();
-    transmute(simd_select_bitmask(k, insert, zero))
+    transmute(simd_select_bitmask(k, r.as_f32x16(), zero))
 }
 
 /// Copy a to dst, then insert 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from b into dst at the location specified by imm8.
@@ -21049,13 +20205,13 @@ pub unsafe fn _mm512_mask_insertf64x4(
     b: __m256d,
     imm8: i32,
 ) -> __m512d {
-    assert!(imm8 >= 0 && imm8 <= 1);
-    let b = _mm512_castpd256_pd512(b);
-    let insert = match imm8 & 0b1 {
-        0 => simd_shuffle8(a, b, [8, 9, 10, 11, 4, 5, 6, 7]),
-        _ => simd_shuffle8(a, b, [0, 1, 2, 3, 8, 9, 10, 11]),
-    };
-    transmute(simd_select_bitmask(k, insert, src.as_f64x8()))
+    macro_rules! call {
+        ($imm8:expr) => {
+            _mm512_insertf64x4(a, b, $imm8)
+        };
+    }
+    let r = constify_imm8_sae!(imm8, call);
+    transmute(simd_select_bitmask(k, r.as_f64x8(), src.as_f64x8()))
 }
 
 /// Copy a to tmp, then insert 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from b into tmp at the location specified by imm8. Store tmp to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -21066,14 +20222,14 @@ pub unsafe fn _mm512_mask_insertf64x4(
 #[cfg_attr(test, assert_instr(vinsertf64x4, imm8 = 1))]
 #[rustc_args_required_const(3)]
 pub unsafe fn _mm512_maskz_insertf64x4(k: __mmask8, a: __m512d, b: __m256d, imm8: i32) -> __m512d {
-    assert!(imm8 >= 0 && imm8 <= 1);
-    let b = _mm512_castpd256_pd512(b);
-    let insert = match imm8 & 0b1 {
-        0 => simd_shuffle8(a, b, [8, 9, 10, 11, 4, 5, 6, 7]),
-        _ => simd_shuffle8(a, b, [0, 1, 2, 3, 8, 9, 10, 11]),
-    };
+    macro_rules! call {
+        ($imm8:expr) => {
+            _mm512_insertf64x4(a, b, $imm8)
+        };
+    }
+    let r = constify_imm8_sae!(imm8, call);
     let zero = _mm512_setzero_pd().as_f64x8();
-    transmute(simd_select_bitmask(k, insert, zero))
+    transmute(simd_select_bitmask(k, r.as_f64x8(), zero))
 }
 
 /// Unpack and interleave 32-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst.
@@ -22743,73 +21899,13 @@ pub unsafe fn _mm512_mask_alignr_epi32(
     b: __m512i,
     imm8: i32,
 ) -> __m512i {
-    assert!(imm8 >= 0 && imm8 <= 255);
-    let a = a.as_i32x16();
-    let b = b.as_i32x16();
-    let imm8: i32 = imm8 % 16;
-    let r: i32x16 = match imm8 {
-        0 => simd_shuffle16(
-            a,
-            b,
-            [
-                16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
-            ],
-        ),
-        1 => simd_shuffle16(
-            a,
-            b,
-            [
-                17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0,
-            ],
-        ),
-        2 => simd_shuffle16(
-            a,
-            b,
-            [18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1],
-        ),
-        3 => simd_shuffle16(
-            a,
-            b,
-            [19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2],
-        ),
-        4 => simd_shuffle16(
-            a,
-            b,
-            [20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3],
-        ),
-        5 => simd_shuffle16(
-            a,
-            b,
-            [21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4],
-        ),
-        6 => simd_shuffle16(
-            a,
-            b,
-            [22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5],
-        ),
-        7 => simd_shuffle16(
-            a,
-            b,
-            [23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6],
-        ),
-        8 => simd_shuffle16(
-            a,
-            b,
-            [24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7],
-        ),
-        9 => simd_shuffle16(
-            a,
-            b,
-            [25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8],
-        ),
-        10 => simd_shuffle16(a, b, [26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
-        11 => simd_shuffle16(a, b, [27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]),
-        12 => simd_shuffle16(a, b, [28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]),
-        13 => simd_shuffle16(a, b, [29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]),
-        14 => simd_shuffle16(a, b, [30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]),
-        _ => simd_shuffle16(a, b, [31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]),
-    };
-    transmute(simd_select_bitmask(k, r, src.as_i32x16()))
+    macro_rules! call {
+        ($imm8:expr) => {
+            _mm512_alignr_epi32(a, b, $imm8)
+        };
+    }
+    let r = constify_imm8_sae!(imm8, call);
+    transmute(simd_select_bitmask(k, r.as_i32x16(), src.as_i32x16()))
 }
 
 /// Concatenate a and b into a 128-byte immediate result, shift the result right by imm8 32-bit elements, and stores the low 64 bytes (16 elements) in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -22825,74 +21921,14 @@ pub unsafe fn _mm512_maskz_alignr_epi32(
     b: __m512i,
     imm8: i32,
 ) -> __m512i {
-    assert!(imm8 >= 0 && imm8 <= 255);
-    let a = a.as_i32x16();
-    let b = b.as_i32x16();
-    let imm8: i32 = imm8 % 16;
-    let r: i32x16 = match imm8 {
-        0 => simd_shuffle16(
-            a,
-            b,
-            [
-                16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
-            ],
-        ),
-        1 => simd_shuffle16(
-            a,
-            b,
-            [
-                17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0,
-            ],
-        ),
-        2 => simd_shuffle16(
-            a,
-            b,
-            [18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1],
-        ),
-        3 => simd_shuffle16(
-            a,
-            b,
-            [19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2],
-        ),
-        4 => simd_shuffle16(
-            a,
-            b,
-            [20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3],
-        ),
-        5 => simd_shuffle16(
-            a,
-            b,
-            [21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4],
-        ),
-        6 => simd_shuffle16(
-            a,
-            b,
-            [22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5],
-        ),
-        7 => simd_shuffle16(
-            a,
-            b,
-            [23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6],
-        ),
-        8 => simd_shuffle16(
-            a,
-            b,
-            [24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7],
-        ),
-        9 => simd_shuffle16(
-            a,
-            b,
-            [25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8],
-        ),
-        10 => simd_shuffle16(a, b, [26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
-        11 => simd_shuffle16(a, b, [27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]),
-        12 => simd_shuffle16(a, b, [28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]),
-        13 => simd_shuffle16(a, b, [29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]),
-        14 => simd_shuffle16(a, b, [30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]),
-        _ => simd_shuffle16(a, b, [31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]),
-    };
+    macro_rules! call {
+        ($imm8:expr) => {
+            _mm512_alignr_epi32(a, b, $imm8)
+        };
+    }
+    let r = constify_imm8_sae!(imm8, call);
     let zero = _mm512_setzero_si512().as_i32x16();
-    transmute(simd_select_bitmask(k, r, zero))
+    transmute(simd_select_bitmask(k, r.as_i32x16(), zero))
 }
 
 /// Concatenate a and b into a 64-byte immediate result, shift the result right by imm8 32-bit elements, and store the low 32 bytes (8 elements) in dst.
@@ -22942,29 +21978,13 @@ pub unsafe fn _mm256_mask_alignr_epi32(
     b: __m256i,
     imm8: i32,
 ) -> __m256i {
-    assert!(imm8 >= 0 && imm8 <= 255);
-    let a = a.as_i32x8();
-    let b = b.as_i32x8();
-    let imm8: i32 = imm8 % 16;
-    let r: i32x8 = match imm8 {
-        0 => simd_shuffle8(a, b, [8, 9, 10, 11, 12, 13, 14, 15]),
-        1 => simd_shuffle8(a, b, [9, 10, 11, 12, 13, 14, 15, 0]),
-        2 => simd_shuffle8(a, b, [10, 11, 12, 13, 14, 15, 0, 1]),
-        3 => simd_shuffle8(a, b, [11, 12, 13, 14, 15, 0, 1, 2]),
-        4 => simd_shuffle8(a, b, [12, 13, 14, 15, 0, 1, 2, 3]),
-        5 => simd_shuffle8(a, b, [13, 14, 15, 0, 1, 2, 3, 4]),
-        6 => simd_shuffle8(a, b, [14, 15, 0, 1, 2, 3, 4, 5]),
-        7 => simd_shuffle8(a, b, [15, 0, 1, 2, 3, 4, 5, 6]),
-        8 => simd_shuffle8(a, b, [0, 1, 2, 3, 4, 5, 6, 7]),
-        9 => simd_shuffle8(a, b, [1, 2, 3, 4, 5, 6, 7, 8]),
-        10 => simd_shuffle8(a, b, [2, 3, 4, 5, 6, 7, 8, 9]),
-        11 => simd_shuffle8(a, b, [3, 4, 5, 6, 7, 8, 9, 10]),
-        12 => simd_shuffle8(a, b, [4, 5, 6, 7, 8, 9, 10, 11]),
-        13 => simd_shuffle8(a, b, [5, 6, 7, 8, 9, 10, 11, 12]),
-        14 => simd_shuffle8(a, b, [6, 7, 8, 9, 10, 11, 12, 13]),
-        _ => simd_shuffle8(a, b, [7, 8, 9, 10, 11, 12, 13, 14]),
-    };
-    transmute(simd_select_bitmask(k, r, src.as_i32x8()))
+    macro_rules! call {
+        ($imm8:expr) => {
+            _mm256_alignr_epi32(a, b, $imm8)
+        };
+    }
+    let r = constify_imm8_sae!(imm8, call);
+    transmute(simd_select_bitmask(k, r.as_i32x8(), src.as_i32x8()))
 }
 
 /// Concatenate a and b into a 64-byte immediate result, shift the result right by imm8 32-bit elements, and store the low 32 bytes (8 elements) in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -22975,30 +21995,14 @@ pub unsafe fn _mm256_mask_alignr_epi32(
 #[cfg_attr(test, assert_instr(valignd, imm8 = 1))]
 #[rustc_args_required_const(3)]
 pub unsafe fn _mm256_maskz_alignr_epi32(k: __mmask8, a: __m256i, b: __m256i, imm8: i32) -> __m256i {
-    assert!(imm8 >= 0 && imm8 <= 255);
-    let a = a.as_i32x8();
-    let b = b.as_i32x8();
-    let imm8: i32 = imm8 % 16;
-    let r: i32x8 = match imm8 {
-        0 => simd_shuffle8(a, b, [8, 9, 10, 11, 12, 13, 14, 15]),
-        1 => simd_shuffle8(a, b, [9, 10, 11, 12, 13, 14, 15, 0]),
-        2 => simd_shuffle8(a, b, [10, 11, 12, 13, 14, 15, 0, 1]),
-        3 => simd_shuffle8(a, b, [11, 12, 13, 14, 15, 0, 1, 2]),
-        4 => simd_shuffle8(a, b, [12, 13, 14, 15, 0, 1, 2, 3]),
-        5 => simd_shuffle8(a, b, [13, 14, 15, 0, 1, 2, 3, 4]),
-        6 => simd_shuffle8(a, b, [14, 15, 0, 1, 2, 3, 4, 5]),
-        7 => simd_shuffle8(a, b, [15, 0, 1, 2, 3, 4, 5, 6]),
-        8 => simd_shuffle8(a, b, [0, 1, 2, 3, 4, 5, 6, 7]),
-        9 => simd_shuffle8(a, b, [1, 2, 3, 4, 5, 6, 7, 8]),
-        10 => simd_shuffle8(a, b, [2, 3, 4, 5, 6, 7, 8, 9]),
-        11 => simd_shuffle8(a, b, [3, 4, 5, 6, 7, 8, 9, 10]),
-        12 => simd_shuffle8(a, b, [4, 5, 6, 7, 8, 9, 10, 11]),
-        13 => simd_shuffle8(a, b, [5, 6, 7, 8, 9, 10, 11, 12]),
-        14 => simd_shuffle8(a, b, [6, 7, 8, 9, 10, 11, 12, 13]),
-        _ => simd_shuffle8(a, b, [7, 8, 9, 10, 11, 12, 13, 14]),
-    };
+    macro_rules! call {
+        ($imm8:expr) => {
+            _mm256_alignr_epi32(a, b, $imm8)
+        };
+    }
+    let r = constify_imm8_sae!(imm8, call);
     let zero = _mm256_setzero_si256().as_i32x8();
-    transmute(simd_select_bitmask(k, r, zero))
+    transmute(simd_select_bitmask(k, r.as_i32x8(), zero))
 }
 
 /// Concatenate a and b into a 32-byte immediate result, shift the result right by imm8 32-bit elements, and store the low 16 bytes (4 elements) in dst.
@@ -23040,21 +22044,13 @@ pub unsafe fn _mm_mask_alignr_epi32(
     b: __m128i,
     imm8: i32,
 ) -> __m128i {
-    assert!(imm8 >= 0 && imm8 <= 255);
-    let a = a.as_i32x4();
-    let b = b.as_i32x4();
-    let imm8: i32 = imm8 % 8;
-    let r: i32x4 = match imm8 {
-        0 => simd_shuffle4(a, b, [4, 5, 6, 7]),
-        1 => simd_shuffle4(a, b, [5, 6, 7, 0]),
-        2 => simd_shuffle4(a, b, [6, 7, 0, 1]),
-        3 => simd_shuffle4(a, b, [7, 0, 1, 2]),
-        4 => simd_shuffle4(a, b, [0, 1, 2, 3]),
-        5 => simd_shuffle4(a, b, [1, 2, 3, 0]),
-        6 => simd_shuffle4(a, b, [2, 3, 0, 1]),
-        _ => simd_shuffle4(a, b, [3, 0, 1, 2]),
-    };
-    transmute(simd_select_bitmask(k, r, src.as_i32x4()))
+    macro_rules! call {
+        ($imm8:expr) => {
+            _mm_alignr_epi32(a, b, $imm8)
+        };
+    }
+    let r = constify_imm8_sae!(imm8, call);
+    transmute(simd_select_bitmask(k, r.as_i32x4(), src.as_i32x4()))
 }
 
 /// Concatenate a and b into a 32-byte immediate result, shift the result right by imm8 32-bit elements, and store the low 16 bytes (4 elements) in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -23065,22 +22061,14 @@ pub unsafe fn _mm_mask_alignr_epi32(
 #[cfg_attr(test, assert_instr(valignd, imm8 = 1))]
 #[rustc_args_required_const(3)]
 pub unsafe fn _mm_maskz_alignr_epi32(k: __mmask8, a: __m128i, b: __m128i, imm8: i32) -> __m128i {
-    assert!(imm8 >= 0 && imm8 <= 255);
-    let a = a.as_i32x4();
-    let b = b.as_i32x4();
-    let imm8: i32 = imm8 % 8;
-    let r: i32x4 = match imm8 {
-        0 => simd_shuffle4(a, b, [4, 5, 6, 7]),
-        1 => simd_shuffle4(a, b, [5, 6, 7, 0]),
-        2 => simd_shuffle4(a, b, [6, 7, 0, 1]),
-        3 => simd_shuffle4(a, b, [7, 0, 1, 2]),
-        4 => simd_shuffle4(a, b, [0, 1, 2, 3]),
-        5 => simd_shuffle4(a, b, [1, 2, 3, 0]),
-        6 => simd_shuffle4(a, b, [2, 3, 0, 1]),
-        _ => simd_shuffle4(a, b, [3, 0, 1, 2]),
-    };
+    macro_rules! call {
+        ($imm8:expr) => {
+            _mm_alignr_epi32(a, b, $imm8)
+        };
+    }
+    let r = constify_imm8_sae!(imm8, call);
     let zero = _mm_setzero_si128().as_i32x4();
-    transmute(simd_select_bitmask(k, r, zero))
+    transmute(simd_select_bitmask(k, r.as_i32x4(), zero))
 }
 
 /// Concatenate a and b into a 128-byte immediate result, shift the result right by imm8 64-bit elements, and store the low 64 bytes (8 elements) in dst.
@@ -23120,19 +22108,13 @@ pub unsafe fn _mm512_mask_alignr_epi64(
     b: __m512i,
     imm8: i32,
 ) -> __m512i {
-    assert!(imm8 >= 0 && imm8 <= 255);
-    let imm8: i32 = imm8 % 8;
-    let r: i64x8 = match imm8 {
-        0 => simd_shuffle8(a, b, [8, 9, 10, 11, 12, 13, 14, 15]),
-        1 => simd_shuffle8(a, b, [9, 10, 11, 12, 13, 14, 15, 0]),
-        2 => simd_shuffle8(a, b, [10, 11, 12, 13, 14, 15, 0, 1]),
-        3 => simd_shuffle8(a, b, [11, 12, 13, 14, 15, 0, 1, 2]),
-        4 => simd_shuffle8(a, b, [12, 13, 14, 15, 0, 1, 2, 3]),
-        5 => simd_shuffle8(a, b, [13, 14, 15, 0, 1, 2, 3, 4]),
-        6 => simd_shuffle8(a, b, [14, 15, 0, 1, 2, 3, 4, 5]),
-        _ => simd_shuffle8(a, b, [15, 0, 1, 2, 3, 4, 5, 6]),
-    };
-    transmute(simd_select_bitmask(k, r, src.as_i64x8()))
+    macro_rules! call {
+        ($imm8:expr) => {
+            _mm512_alignr_epi64(a, b, $imm8)
+        };
+    }
+    let r = constify_imm8_sae!(imm8, call);
+    transmute(simd_select_bitmask(k, r.as_i64x8(), src.as_i64x8()))
 }
 
 /// Concatenate a and b into a 128-byte immediate result, shift the result right by imm8 64-bit elements, and stores the low 64 bytes (8 elements) in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -23143,20 +22125,14 @@ pub unsafe fn _mm512_mask_alignr_epi64(
 #[cfg_attr(test, assert_instr(valignq, imm8 = 1))]
 #[rustc_args_required_const(3)]
 pub unsafe fn _mm512_maskz_alignr_epi64(k: __mmask8, a: __m512i, b: __m512i, imm8: i32) -> __m512i {
-    assert!(imm8 >= 0 && imm8 <= 255);
-    let imm8: i32 = imm8 % 8;
-    let r: i64x8 = match imm8 {
-        0 => simd_shuffle8(a, b, [8, 9, 10, 11, 12, 13, 14, 15]),
-        1 => simd_shuffle8(a, b, [9, 10, 11, 12, 13, 14, 15, 0]),
-        2 => simd_shuffle8(a, b, [10, 11, 12, 13, 14, 15, 0, 1]),
-        3 => simd_shuffle8(a, b, [11, 12, 13, 14, 15, 0, 1, 2]),
-        4 => simd_shuffle8(a, b, [12, 13, 14, 15, 0, 1, 2, 3]),
-        5 => simd_shuffle8(a, b, [13, 14, 15, 0, 1, 2, 3, 4]),
-        6 => simd_shuffle8(a, b, [14, 15, 0, 1, 2, 3, 4, 5]),
-        _ => simd_shuffle8(a, b, [15, 0, 1, 2, 3, 4, 5, 6]),
-    };
+    macro_rules! call {
+        ($imm8:expr) => {
+            _mm512_alignr_epi64(a, b, $imm8)
+        };
+    }
+    let r = constify_imm8_sae!(imm8, call);
     let zero = _mm512_setzero_si512().as_i64x8();
-    transmute(simd_select_bitmask(k, r, zero))
+    transmute(simd_select_bitmask(k, r.as_i64x8(), zero))
 }
 
 /// Concatenate a and b into a 64-byte immediate result, shift the result right by imm8 64-bit elements, and store the low 32 bytes (4 elements) in dst.
@@ -23196,19 +22172,13 @@ pub unsafe fn _mm256_mask_alignr_epi64(
     b: __m256i,
     imm8: i32,
 ) -> __m256i {
-    assert!(imm8 >= 0 && imm8 <= 255);
-    let imm8: i32 = imm8 % 8;
-    let r: i64x4 = match imm8 {
-        0 => simd_shuffle4(a, b, [4, 5, 6, 7]),
-        1 => simd_shuffle4(a, b, [5, 6, 7, 0]),
-        2 => simd_shuffle4(a, b, [6, 7, 0, 1]),
-        3 => simd_shuffle4(a, b, [7, 0, 1, 2]),
-        4 => simd_shuffle4(a, b, [0, 1, 2, 3]),
-        5 => simd_shuffle4(a, b, [1, 2, 3, 4]),
-        6 => simd_shuffle4(a, b, [2, 3, 4, 5]),
-        _ => simd_shuffle4(a, b, [3, 4, 5, 6]),
-    };
-    transmute(simd_select_bitmask(k, r, src.as_i64x4()))
+    macro_rules! call {
+        ($imm8:expr) => {
+            _mm256_alignr_epi64(a, b, $imm8)
+        };
+    }
+    let r = constify_imm8_sae!(imm8, call);
+    transmute(simd_select_bitmask(k, r.as_i64x4(), src.as_i64x4()))
 }
 
 /// Concatenate a and b into a 64-byte immediate result, shift the result right by imm8 64-bit elements, and store the low 32 bytes (4 elements) in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -23219,20 +22189,14 @@ pub unsafe fn _mm256_mask_alignr_epi64(
 #[cfg_attr(test, assert_instr(valignq, imm8 = 1))]
 #[rustc_args_required_const(3)]
 pub unsafe fn _mm256_maskz_alignr_epi64(k: __mmask8, a: __m256i, b: __m256i, imm8: i32) -> __m256i {
-    assert!(imm8 >= 0 && imm8 <= 255);
-    let imm8: i32 = imm8 % 8;
-    let r: i64x4 = match imm8 {
-        0 => simd_shuffle4(a, b, [4, 5, 6, 7]),
-        1 => simd_shuffle4(a, b, [5, 6, 7, 0]),
-        2 => simd_shuffle4(a, b, [6, 7, 0, 1]),
-        3 => simd_shuffle4(a, b, [7, 0, 1, 2]),
-        4 => simd_shuffle4(a, b, [0, 1, 2, 3]),
-        5 => simd_shuffle4(a, b, [1, 2, 3, 4]),
-        6 => simd_shuffle4(a, b, [2, 3, 4, 5]),
-        _ => simd_shuffle4(a, b, [3, 4, 5, 6]),
-    };
+    macro_rules! call {
+        ($imm8:expr) => {
+            _mm256_alignr_epi64(a, b, $imm8)
+        };
+    }
+    let r = constify_imm8_sae!(imm8, call);
     let zero = _mm256_setzero_si256().as_i64x4();
-    transmute(simd_select_bitmask(k, r, zero))
+    transmute(simd_select_bitmask(k, r.as_i64x4(), zero))
 }
 
 /// Concatenate a and b into a 32-byte immediate result, shift the result right by imm8 64-bit elements, and store the low 16 bytes (2 elements) in dst.
@@ -23268,15 +22232,13 @@ pub unsafe fn _mm_mask_alignr_epi64(
     b: __m128i,
     imm8: i32,
 ) -> __m128i {
-    assert!(imm8 >= 0 && imm8 <= 255);
-    let imm8: i32 = imm8 % 4;
-    let r: i64x2 = match imm8 {
-        0 => simd_shuffle2(a, b, [2, 3]),
-        1 => simd_shuffle2(a, b, [3, 0]),
-        2 => simd_shuffle2(a, b, [0, 1]),
-        _ => simd_shuffle2(a, b, [1, 2]),
-    };
-    transmute(simd_select_bitmask(k, r, src.as_i64x2()))
+    macro_rules! call {
+        ($imm8:expr) => {
+            _mm_alignr_epi64(a, b, $imm8)
+        };
+    }
+    let r = constify_imm8_sae!(imm8, call);
+    transmute(simd_select_bitmask(k, r.as_i64x2(), src.as_i64x2()))
 }
 
 /// Concatenate a and b into a 32-byte immediate result, shift the result right by imm8 64-bit elements, and store the low 16 bytes (2 elements) in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -23287,16 +22249,14 @@ pub unsafe fn _mm_mask_alignr_epi64(
 #[cfg_attr(test, assert_instr(valignq, imm8 = 1))]
 #[rustc_args_required_const(3)]
 pub unsafe fn _mm_maskz_alignr_epi64(k: __mmask8, a: __m128i, b: __m128i, imm8: i32) -> __m128i {
-    assert!(imm8 >= 0 && imm8 <= 255);
-    let imm8: i32 = imm8 % 4;
-    let r: i64x2 = match imm8 {
-        0 => simd_shuffle2(a, b, [2, 3]),
-        1 => simd_shuffle2(a, b, [3, 0]),
-        2 => simd_shuffle2(a, b, [0, 1]),
-        _ => simd_shuffle2(a, b, [1, 2]),
-    };
+    macro_rules! call {
+        ($imm8:expr) => {
+            _mm_alignr_epi64(a, b, $imm8)
+        };
+    }
+    let r = constify_imm8_sae!(imm8, call);
     let zero = _mm_setzero_si128().as_i64x2();
-    transmute(simd_select_bitmask(k, r, zero))
+    transmute(simd_select_bitmask(k, r.as_i64x2(), zero))
 }
 
 /// Compute the bitwise AND of packed 32-bit integers in a and b, and store the results in dst.
@@ -28999,13 +27959,16 @@ pub unsafe fn _mm_getmant_ss(
     norm: _MM_MANTISSA_NORM_ENUM,
     sign: _MM_MANTISSA_SIGN_ENUM,
 ) -> __m128 {
+    let a = a.as_f32x4();
+    let b = b.as_f32x4();
+    let zero = _mm_setzero_ps().as_f32x4();
     macro_rules! call {
         ($imm4_1:expr, $imm2:expr) => {
             vgetmantss(
-                a.as_f32x4(),
-                b.as_f32x4(),
+                a,
+                b,
                 $imm2 << 2 | $imm4_1,
-                _mm_setzero_ps().as_f32x4(),
+                zero,
                 0b1,
                 _MM_FROUND_CUR_DIRECTION,
             )
@@ -29040,16 +28003,12 @@ pub unsafe fn _mm_mask_getmant_ss(
     norm: _MM_MANTISSA_NORM_ENUM,
     sign: _MM_MANTISSA_SIGN_ENUM,
 ) -> __m128 {
+    let a = a.as_f32x4();
+    let b = b.as_f32x4();
+    let src = src.as_f32x4();
     macro_rules! call {
         ($imm4_1:expr, $imm2:expr) => {
-            vgetmantss(
-                a.as_f32x4(),
-                b.as_f32x4(),
-                $imm2 << 2 | $imm4_1,
-                src.as_f32x4(),
-                k,
-                _MM_FROUND_CUR_DIRECTION,
-            )
+            vgetmantss(a, b, $imm2 << 2 | $imm4_1, src, k, _MM_FROUND_CUR_DIRECTION)
         };
     }
     let r = constify_imm4_mantissas!(norm, sign, call);
@@ -29080,13 +28039,16 @@ pub unsafe fn _mm_maskz_getmant_ss(
     norm: _MM_MANTISSA_NORM_ENUM,
     sign: _MM_MANTISSA_SIGN_ENUM,
 ) -> __m128 {
+    let a = a.as_f32x4();
+    let b = b.as_f32x4();
+    let zero = _mm_setzero_ps().as_f32x4();
     macro_rules! call {
         ($imm4_1:expr, $imm2:expr) => {
             vgetmantss(
-                a.as_f32x4(),
-                b.as_f32x4(),
+                a,
+                b,
                 $imm2 << 2 | $imm4_1,
-                _mm_setzero_ps().as_f32x4(),
+                zero,
                 k,
                 _MM_FROUND_CUR_DIRECTION,
             )
@@ -29119,13 +28081,16 @@ pub unsafe fn _mm_getmant_sd(
     norm: _MM_MANTISSA_NORM_ENUM,
     sign: _MM_MANTISSA_SIGN_ENUM,
 ) -> __m128d {
+    let a = a.as_f64x2();
+    let b = b.as_f64x2();
+    let zero = _mm_setzero_pd().as_f64x2();
     macro_rules! call {
         ($imm4_1:expr, $imm2:expr) => {
             vgetmantsd(
-                a.as_f64x2(),
-                b.as_f64x2(),
+                a,
+                b,
                 $imm2 << 2 | $imm4_1,
-                _mm_setzero_pd().as_f64x2(),
+                zero,
                 0b1,
                 _MM_FROUND_CUR_DIRECTION,
             )
@@ -29160,16 +28125,12 @@ pub unsafe fn _mm_mask_getmant_sd(
     norm: _MM_MANTISSA_NORM_ENUM,
     sign: _MM_MANTISSA_SIGN_ENUM,
 ) -> __m128d {
+    let a = a.as_f64x2();
+    let b = b.as_f64x2();
+    let src = src.as_f64x2();
     macro_rules! call {
         ($imm4_1:expr, $imm2:expr) => {
-            vgetmantsd(
-                a.as_f64x2(),
-                b.as_f64x2(),
-                $imm2 << 2 | $imm4_1,
-                src.as_f64x2(),
-                k,
-                _MM_FROUND_CUR_DIRECTION,
-            )
+            vgetmantsd(a, b, $imm2 << 2 | $imm4_1, src, k, _MM_FROUND_CUR_DIRECTION)
         };
     }
     let r = constify_imm4_mantissas!(norm, sign, call);
@@ -29200,13 +28161,16 @@ pub unsafe fn _mm_maskz_getmant_sd(
     norm: _MM_MANTISSA_NORM_ENUM,
     sign: _MM_MANTISSA_SIGN_ENUM,
 ) -> __m128d {
+    let a = a.as_f64x2();
+    let b = b.as_f64x2();
+    let zero = _mm_setzero_pd().as_f64x2();
     macro_rules! call {
         ($imm4_1:expr, $imm2:expr) => {
             vgetmantsd(
-                a.as_f64x2(),
-                b.as_f64x2(),
+                a,
+                b,
                 $imm2 << 2 | $imm4_1,
-                _mm_setzero_pd().as_f64x2(),
+                zero,
                 k,
                 _MM_FROUND_CUR_DIRECTION,
             )
@@ -29391,13 +28355,10 @@ pub unsafe fn _mm_maskz_roundscale_sd(k: __mmask8, a: __m128d, b: __m128d, imm8:
 #[target_feature(enable = "avx512f")]
 #[cfg_attr(test, assert_instr(vscalefss))]
 pub unsafe fn _mm_scalef_ss(a: __m128, b: __m128) -> __m128 {
-    transmute(vscalefss(
-        a.as_f32x4(),
-        b.as_f32x4(),
-        _mm_setzero_ps().as_f32x4(),
-        0b11111111,
-        _MM_FROUND_CUR_DIRECTION,
-    ))
+    let a = a.as_f32x4();
+    let b = b.as_f32x4();
+    let zero = _mm_setzero_ps().as_f32x4();
+    transmute(vscalefss(a, b, zero, 0b11111111, _MM_FROUND_CUR_DIRECTION))
 }
 
 /// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
@@ -29407,13 +28368,10 @@ pub unsafe fn _mm_scalef_ss(a: __m128, b: __m128) -> __m128 {
 #[target_feature(enable = "avx512f")]
 #[cfg_attr(test, assert_instr(vscalefss))]
 pub unsafe fn _mm_mask_scalef_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 {
-    transmute(vscalefss(
-        a.as_f32x4(),
-        b.as_f32x4(),
-        src.as_f32x4(),
-        k,
-        _MM_FROUND_CUR_DIRECTION,
-    ))
+    let a = a.as_f32x4();
+    let b = b.as_f32x4();
+    let src = src.as_f32x4();
+    transmute(vscalefss(a, b, src, k, _MM_FROUND_CUR_DIRECTION))
 }
 
 /// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
@@ -29935,15 +28893,12 @@ pub unsafe fn _mm_mask3_fnmsub_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask
 #[cfg_attr(test, assert_instr(vaddss, rounding = 8))]
 #[rustc_args_required_const(2)]
 pub unsafe fn _mm_add_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 {
+    let a = a.as_f32x4();
+    let b = b.as_f32x4();
+    let zero = _mm_setzero_ps().as_f32x4();
     macro_rules! call {
         ($imm4:expr) => {
-            vaddss(
-                a.as_f32x4(),
-                b.as_f32x4(),
-                _mm_setzero_ps().as_f32x4(),
-                0b1,
-                $imm4,
-            )
+            vaddss(a, b, zero, 0b1, $imm4)
         };
     }
     transmute(constify_imm4_round!(rounding, call))
@@ -29970,9 +28925,12 @@ pub unsafe fn _mm_mask_add_round_ss(
     b: __m128,
     rounding: i32,
 ) -> __m128 {
+    let a = a.as_f32x4();
+    let b = b.as_f32x4();
+    let src = src.as_f32x4();
     macro_rules! call {
         ($imm4:expr) => {
-            vaddss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4)
+            vaddss(a, b, src, k, $imm4)
         };
     }
     transmute(constify_imm4_round!(rounding, call))
@@ -29993,15 +28951,12 @@ pub unsafe fn _mm_mask_add_round_ss(
 #[cfg_attr(test, assert_instr(vaddss, rounding = 8))]
 #[rustc_args_required_const(3)]
 pub unsafe fn _mm_maskz_add_round_ss(k: __mmask8, a: __m128, b: __m128, rounding: i32) -> __m128 {
+    let a = a.as_f32x4();
+    let b = b.as_f32x4();
+    let zero = _mm_setzero_ps().as_f32x4();
     macro_rules! call {
         ($imm4:expr) => {
-            vaddss(
-                a.as_f32x4(),
-                b.as_f32x4(),
-                _mm_setzero_ps().as_f32x4(),
-                k,
-                $imm4,
-            )
+            vaddss(a, b, zero, k, $imm4)
         };
     }
     transmute(constify_imm4_round!(rounding, call))
@@ -30022,15 +28977,12 @@ pub unsafe fn _mm_maskz_add_round_ss(k: __mmask8, a: __m128, b: __m128, rounding
 #[cfg_attr(test, assert_instr(vaddsd, rounding = 8))]
 #[rustc_args_required_const(2)]
 pub unsafe fn _mm_add_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d {
+    let a = a.as_f64x2();
+    let b = b.as_f64x2();
+    let zero = _mm_setzero_pd().as_f64x2();
     macro_rules! call {
         ($imm4:expr) => {
-            vaddsd(
-                a.as_f64x2(),
-                b.as_f64x2(),
-                _mm_setzero_pd().as_f64x2(),
-                0b1,
-                $imm4,
-            )
+            vaddsd(a, b, zero, 0b1, $imm4)
         };
     }
     transmute(constify_imm4_round!(rounding, call))
@@ -30057,9 +29009,12 @@ pub unsafe fn _mm_mask_add_round_sd(
     b: __m128d,
     rounding: i32,
 ) -> __m128d {
+    let a = a.as_f64x2();
+    let b = b.as_f64x2();
+    let src = src.as_f64x2();
     macro_rules! call {
         ($imm4:expr) => {
-            vaddsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4)
+            vaddsd(a, b, src, k, $imm4)
         };
     }
     transmute(constify_imm4_round!(rounding, call))
@@ -30085,15 +29040,12 @@ pub unsafe fn _mm_maskz_add_round_sd(
     b: __m128d,
     rounding: i32,
 ) -> __m128d {
+    let a = a.as_f64x2();
+    let b = b.as_f64x2();
+    let zero = _mm_setzero_pd().as_f64x2();
     macro_rules! call {
         ($imm4:expr) => {
-            vaddsd(
-                a.as_f64x2(),
-                b.as_f64x2(),
-                _mm_setzero_pd().as_f64x2(),
-                k,
-                $imm4,
-            )
+            vaddsd(a, b, zero, k, $imm4)
         };
     }
     transmute(constify_imm4_round!(rounding, call))
@@ -30114,15 +29066,12 @@ pub unsafe fn _mm_maskz_add_round_sd(
 #[cfg_attr(test, assert_instr(vsubss, rounding = 8))]
 #[rustc_args_required_const(2)]
 pub unsafe fn _mm_sub_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 {
+    let a = a.as_f32x4();
+    let b = b.as_f32x4();
+    let zero = _mm_setzero_ps().as_f32x4();
     macro_rules! call {
         ($imm4:expr) => {
-            vsubss(
-                a.as_f32x4(),
-                b.as_f32x4(),
-                _mm_setzero_ps().as_f32x4(),
-                0b1,
-                $imm4,
-            )
+            vsubss(a, b, zero, 0b1, $imm4)
         };
     }
     transmute(constify_imm4_round!(rounding, call))
@@ -30149,9 +29098,12 @@ pub unsafe fn _mm_mask_sub_round_ss(
     b: __m128,
     rounding: i32,
 ) -> __m128 {
+    let a = a.as_f32x4();
+    let b = b.as_f32x4();
+    let src = src.as_f32x4();
     macro_rules! call {
         ($imm4:expr) => {
-            vsubss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4)
+            vsubss(a, b, src, k, $imm4)
         };
     }
     transmute(constify_imm4_round!(rounding, call))
@@ -30172,15 +29124,12 @@ pub unsafe fn _mm_mask_sub_round_ss(
 #[cfg_attr(test, assert_instr(vsubss, rounding = 8))]
 #[rustc_args_required_const(3)]
 pub unsafe fn _mm_maskz_sub_round_ss(k: __mmask8, a: __m128, b: __m128, rounding: i32) -> __m128 {
+    let a = a.as_f32x4();
+    let b = b.as_f32x4();
+    let zero = _mm_setzero_ps().as_f32x4();
     macro_rules! call {
         ($imm4:expr) => {
-            vsubss(
-                a.as_f32x4(),
-                b.as_f32x4(),
-                _mm_setzero_ps().as_f32x4(),
-                k,
-                $imm4,
-            )
+            vsubss(a, b, zero, k, $imm4)
         };
     }
     transmute(constify_imm4_round!(rounding, call))
@@ -30201,15 +29150,12 @@ pub unsafe fn _mm_maskz_sub_round_ss(k: __mmask8, a: __m128, b: __m128, rounding
 #[cfg_attr(test, assert_instr(vsubsd, rounding = 8))]
 #[rustc_args_required_const(2)]
 pub unsafe fn _mm_sub_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d {
+    let a = a.as_f64x2();
+    let b = b.as_f64x2();
+    let zero = _mm_setzero_pd().as_f64x2();
     macro_rules! call {
         ($imm4:expr) => {
-            vsubsd(
-                a.as_f64x2(),
-                b.as_f64x2(),
-                _mm_setzero_pd().as_f64x2(),
-                0b1,
-                $imm4,
-            )
+            vsubsd(a, b, zero, 0b1, $imm4)
         };
     }
     transmute(constify_imm4_round!(rounding, call))
@@ -30236,9 +29182,12 @@ pub unsafe fn _mm_mask_sub_round_sd(
     b: __m128d,
     rounding: i32,
 ) -> __m128d {
+    let a = a.as_f64x2();
+    let b = b.as_f64x2();
+    let src = src.as_f64x2();
     macro_rules! call {
         ($imm4:expr) => {
-            vsubsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4)
+            vsubsd(a, b, src, k, $imm4)
         };
     }
     transmute(constify_imm4_round!(rounding, call))
@@ -30264,15 +29213,12 @@ pub unsafe fn _mm_maskz_sub_round_sd(
     b: __m128d,
     rounding: i32,
 ) -> __m128d {
+    let a = a.as_f64x2();
+    let b = b.as_f64x2();
+    let zero = _mm_setzero_pd().as_f64x2();
     macro_rules! call {
         ($imm4:expr) => {
-            vsubsd(
-                a.as_f64x2(),
-                b.as_f64x2(),
-                _mm_setzero_pd().as_f64x2(),
-                k,
-                $imm4,
-            )
+            vsubsd(a, b, zero, k, $imm4)
         };
     }
     transmute(constify_imm4_round!(rounding, call))
@@ -30293,15 +29239,12 @@ pub unsafe fn _mm_maskz_sub_round_sd(
 #[cfg_attr(test, assert_instr(vmulss, rounding = 8))]
 #[rustc_args_required_const(2)]
 pub unsafe fn _mm_mul_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 {
+    let a = a.as_f32x4();
+    let b = b.as_f32x4();
+    let zero = _mm_setzero_ps().as_f32x4();
     macro_rules! call {
         ($imm4:expr) => {
-            vmulss(
-                a.as_f32x4(),
-                b.as_f32x4(),
-                _mm_setzero_ps().as_f32x4(),
-                0b1,
-                $imm4,
-            )
+            vmulss(a, b, zero, 0b1, $imm4)
         };
     }
     transmute(constify_imm4_round!(rounding, call))
@@ -30328,9 +29271,12 @@ pub unsafe fn _mm_mask_mul_round_ss(
     b: __m128,
     rounding: i32,
 ) -> __m128 {
+    let a = a.as_f32x4();
+    let b = b.as_f32x4();
+    let src = src.as_f32x4();
     macro_rules! call {
         ($imm4:expr) => {
-            vmulss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4)
+            vmulss(a, b, src, k, $imm4)
         };
     }
     transmute(constify_imm4_round!(rounding, call))
@@ -30351,15 +29297,12 @@ pub unsafe fn _mm_mask_mul_round_ss(
 #[cfg_attr(test, assert_instr(vmulss, rounding = 8))]
 #[rustc_args_required_const(3)]
 pub unsafe fn _mm_maskz_mul_round_ss(k: __mmask8, a: __m128, b: __m128, rounding: i32) -> __m128 {
+    let a = a.as_f32x4();
+    let b = b.as_f32x4();
+    let zero = _mm_setzero_ps().as_f32x4();
     macro_rules! call {
         ($imm4:expr) => {
-            vmulss(
-                a.as_f32x4(),
-                b.as_f32x4(),
-                _mm_setzero_ps().as_f32x4(),
-                k,
-                $imm4,
-            )
+            vmulss(a, b, zero, k, $imm4)
         };
     }
     transmute(constify_imm4_round!(rounding, call))
@@ -30380,15 +29323,12 @@ pub unsafe fn _mm_maskz_mul_round_ss(k: __mmask8, a: __m128, b: __m128, rounding
 #[cfg_attr(test, assert_instr(vmulsd, rounding = 8))]
 #[rustc_args_required_const(2)]
 pub unsafe fn _mm_mul_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d {
+    let a = a.as_f64x2();
+    let b = b.as_f64x2();
+    let zero = _mm_setzero_pd().as_f64x2();
     macro_rules! call {
         ($imm4:expr) => {
-            vmulsd(
-                a.as_f64x2(),
-                b.as_f64x2(),
-                _mm_setzero_pd().as_f64x2(),
-                0b1,
-                $imm4,
-            )
+            vmulsd(a, b, zero, 0b1, $imm4)
         };
     }
     transmute(constify_imm4_round!(rounding, call))
@@ -30415,9 +29355,12 @@ pub unsafe fn _mm_mask_mul_round_sd(
     b: __m128d,
     rounding: i32,
 ) -> __m128d {
+    let a = a.as_f64x2();
+    let b = b.as_f64x2();
+    let src = src.as_f64x2();
     macro_rules! call {
         ($imm4:expr) => {
-            vmulsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4)
+            vmulsd(a, b, src, k, $imm4)
         };
     }
     transmute(constify_imm4_round!(rounding, call))
@@ -30443,15 +29386,12 @@ pub unsafe fn _mm_maskz_mul_round_sd(
     b: __m128d,
     rounding: i32,
 ) -> __m128d {
+    let a = a.as_f64x2();
+    let b = b.as_f64x2();
+    let zero = _mm_setzero_pd().as_f64x2();
     macro_rules! call {
         ($imm4:expr) => {
-            vmulsd(
-                a.as_f64x2(),
-                b.as_f64x2(),
-                _mm_setzero_pd().as_f64x2(),
-                k,
-                $imm4,
-            )
+            vmulsd(a, b, zero, k, $imm4)
         };
     }
     transmute(constify_imm4_round!(rounding, call))
@@ -30472,15 +29412,12 @@ pub unsafe fn _mm_maskz_mul_round_sd(
 #[cfg_attr(test, assert_instr(vdivss, rounding = 8))]
 #[rustc_args_required_const(2)]
 pub unsafe fn _mm_div_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 {
+    let a = a.as_f32x4();
+    let b = b.as_f32x4();
+    let zero = _mm_setzero_ps().as_f32x4();
     macro_rules! call {
         ($imm4:expr) => {
-            vdivss(
-                a.as_f32x4(),
-                b.as_f32x4(),
-                _mm_setzero_ps().as_f32x4(),
-                0b1,
-                $imm4,
-            )
+            vdivss(a, b, zero, 0b1, $imm4)
         };
     }
     transmute(constify_imm4_round!(rounding, call))
@@ -30507,9 +29444,12 @@ pub unsafe fn _mm_mask_div_round_ss(
     b: __m128,
     rounding: i32,
 ) -> __m128 {
+    let a = a.as_f32x4();
+    let b = b.as_f32x4();
+    let src = src.as_f32x4();
     macro_rules! call {
         ($imm4:expr) => {
-            vdivss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4)
+            vdivss(a, b, src, k, $imm4)
         };
     }
     transmute(constify_imm4_round!(rounding, call))
@@ -30530,15 +29470,12 @@ pub unsafe fn _mm_mask_div_round_ss(
 #[cfg_attr(test, assert_instr(vdivss, rounding = 8))]
 #[rustc_args_required_const(3)]
 pub unsafe fn _mm_maskz_div_round_ss(k: __mmask8, a: __m128, b: __m128, rounding: i32) -> __m128 {
+    let a = a.as_f32x4();
+    let b = b.as_f32x4();
+    let zero = _mm_setzero_ps().as_f32x4();
     macro_rules! call {
         ($imm4:expr) => {
-            vdivss(
-                a.as_f32x4(),
-                b.as_f32x4(),
-                _mm_setzero_ps().as_f32x4(),
-                k,
-                $imm4,
-            )
+            vdivss(a, b, zero, k, $imm4)
         };
     }
     transmute(constify_imm4_round!(rounding, call))
@@ -30559,15 +29496,12 @@ pub unsafe fn _mm_maskz_div_round_ss(k: __mmask8, a: __m128, b: __m128, rounding
 #[cfg_attr(test, assert_instr(vdivsd, rounding = 8))]
 #[rustc_args_required_const(2)]
 pub unsafe fn _mm_div_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d {
+    let a = a.as_f64x2();
+    let b = b.as_f64x2();
+    let zero = _mm_setzero_pd().as_f64x2();
     macro_rules! call {
         ($imm4:expr) => {
-            vdivsd(
-                a.as_f64x2(),
-                b.as_f64x2(),
-                _mm_setzero_pd().as_f64x2(),
-                0b1,
-                $imm4,
-            )
+            vdivsd(a, b, zero, 0b1, $imm4)
         };
     }
     transmute(constify_imm4_round!(rounding, call))
@@ -30594,9 +29528,12 @@ pub unsafe fn _mm_mask_div_round_sd(
     b: __m128d,
     rounding: i32,
 ) -> __m128d {
+    let a = a.as_f64x2();
+    let b = b.as_f64x2();
+    let src = src.as_f64x2();
     macro_rules! call {
         ($imm4:expr) => {
-            vdivsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4)
+            vdivsd(a, b, src, k, $imm4)
         };
     }
     transmute(constify_imm4_round!(rounding, call))
@@ -30622,15 +29559,12 @@ pub unsafe fn _mm_maskz_div_round_sd(
     b: __m128d,
     rounding: i32,
 ) -> __m128d {
+    let a = a.as_f64x2();
+    let b = b.as_f64x2();
+    let zero = _mm_setzero_pd().as_f64x2();
     macro_rules! call {
         ($imm4:expr) => {
-            vdivsd(
-                a.as_f64x2(),
-                b.as_f64x2(),
-                _mm_setzero_pd().as_f64x2(),
-                k,
-                $imm4,
-            )
+            vdivsd(a, b, zero, k, $imm4)
         };
     }
     transmute(constify_imm4_round!(rounding, call))
@@ -30645,15 +29579,12 @@ pub unsafe fn _mm_maskz_div_round_sd(
 #[cfg_attr(test, assert_instr(vmaxss, sae = 8))]
 #[rustc_args_required_const(2)]
 pub unsafe fn _mm_max_round_ss(a: __m128, b: __m128, sae: i32) -> __m128 {
+    let a = a.as_f32x4();
+    let b = b.as_f32x4();
+    let zero = _mm_setzero_ps().as_f32x4();
     macro_rules! call {
         ($imm4:expr) => {
-            vmaxss(
-                a.as_f32x4(),
-                b.as_f32x4(),
-                _mm_setzero_ps().as_f32x4(),
-                0b1,
-                $imm4,
-            )
+            vmaxss(a, b, zero, 0b1, $imm4)
         };
     }
     transmute(constify_imm4_sae!(sae, call))
@@ -30674,9 +29605,12 @@ pub unsafe fn _mm_mask_max_round_ss(
     b: __m128,
     sae: i32,
 ) -> __m128 {
+    let a = a.as_f32x4();
+    let b = b.as_f32x4();
+    let src = src.as_f32x4();
     macro_rules! call {
         ($imm4:expr) => {
-            vmaxss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4)
+            vmaxss(a, b, src, k, $imm4)
         };
     }
     transmute(constify_imm4_sae!(sae, call))
@@ -30691,15 +29625,12 @@ pub unsafe fn _mm_mask_max_round_ss(
 #[cfg_attr(test, assert_instr(vmaxss, sae = 8))]
 #[rustc_args_required_const(3)]
 pub unsafe fn _mm_maskz_max_round_ss(k: __mmask8, a: __m128, b: __m128, sae: i32) -> __m128 {
+    let a = a.as_f32x4();
+    let b = b.as_f32x4();
+    let zero = _mm_setzero_ps().as_f32x4();
     macro_rules! call {
         ($imm4:expr) => {
-            vmaxss(
-                a.as_f32x4(),
-                b.as_f32x4(),
-                _mm_setzero_ps().as_f32x4(),
-                k,
-                $imm4,
-            )
+            vmaxss(a, b, zero, k, $imm4)
         };
     }
     transmute(constify_imm4_sae!(sae, call))
@@ -30714,15 +29645,12 @@ pub unsafe fn _mm_maskz_max_round_ss(k: __mmask8, a: __m128, b: __m128, sae: i32
 #[cfg_attr(test, assert_instr(vmaxsd, sae = 8))]
 #[rustc_args_required_const(2)]
 pub unsafe fn _mm_max_round_sd(a: __m128d, b: __m128d, sae: i32) -> __m128d {
+    let a = a.as_f64x2();
+    let b = b.as_f64x2();
+    let zero = _mm_setzero_pd().as_f64x2();
     macro_rules! call {
         ($imm4:expr) => {
-            vmaxsd(
-                a.as_f64x2(),
-                b.as_f64x2(),
-                _mm_setzero_pd().as_f64x2(),
-                0b1,
-                $imm4,
-            )
+            vmaxsd(a, b, zero, 0b1, $imm4)
         };
     }
     transmute(constify_imm4_sae!(sae, call))
@@ -30743,9 +29671,12 @@ pub unsafe fn _mm_mask_max_round_sd(
     b: __m128d,
     sae: i32,
 ) -> __m128d {
+    let a = a.as_f64x2();
+    let b = b.as_f64x2();
+    let src = src.as_f64x2();
     macro_rules! call {
         ($imm4:expr) => {
-            vmaxsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4)
+            vmaxsd(a, b, src, k, $imm4)
         };
     }
     transmute(constify_imm4_sae!(sae, call))
@@ -30760,15 +29691,12 @@ pub unsafe fn _mm_mask_max_round_sd(
 #[cfg_attr(test, assert_instr(vmaxsd, sae = 8))]
 #[rustc_args_required_const(3)]
 pub unsafe fn _mm_maskz_max_round_sd(k: __mmask8, a: __m128d, b: __m128d, sae: i32) -> __m128d {
+    let a = a.as_f64x2();
+    let b = b.as_f64x2();
+    let zero = _mm_setzero_pd().as_f64x2();
     macro_rules! call {
         ($imm4:expr) => {
-            vmaxsd(
-                a.as_f64x2(),
-                b.as_f64x2(),
-                _mm_setzero_pd().as_f64x2(),
-                k,
-                $imm4,
-            )
+            vmaxsd(a, b, zero, k, $imm4)
         };
     }
     transmute(constify_imm4_sae!(sae, call))
@@ -30783,15 +29711,12 @@ pub unsafe fn _mm_maskz_max_round_sd(k: __mmask8, a: __m128d, b: __m128d, sae: i
 #[cfg_attr(test, assert_instr(vminss, sae = 8))]
 #[rustc_args_required_const(2)]
 pub unsafe fn _mm_min_round_ss(a: __m128, b: __m128, sae: i32) -> __m128 {
+    let a = a.as_f32x4();
+    let b = b.as_f32x4();
+    let zero = _mm_setzero_ps().as_f32x4();
     macro_rules! call {
         ($imm4:expr) => {
-            vminss(
-                a.as_f32x4(),
-                b.as_f32x4(),
-                _mm_setzero_ps().as_f32x4(),
-                0b1,
-                $imm4,
-            )
+            vminss(a, b, zero, 0b1, $imm4)
         };
     }
     transmute(constify_imm4_sae!(sae, call))
@@ -30812,9 +29737,12 @@ pub unsafe fn _mm_mask_min_round_ss(
     b: __m128,
     sae: i32,
 ) -> __m128 {
+    let a = a.as_f32x4();
+    let b = b.as_f32x4();
+    let src = src.as_f32x4();
     macro_rules! call {
         ($imm4:expr) => {
-            vminss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4)
+            vminss(a, b, src, k, $imm4)
         };
     }
     transmute(constify_imm4_sae!(sae, call))
@@ -30829,15 +29757,12 @@ pub unsafe fn _mm_mask_min_round_ss(
 #[cfg_attr(test, assert_instr(vminss, sae = 8))]
 #[rustc_args_required_const(3)]
 pub unsafe fn _mm_maskz_min_round_ss(k: __mmask8, a: __m128, b: __m128, sae: i32) -> __m128 {
+    let a = a.as_f32x4();
+    let b = b.as_f32x4();
+    let zero = _mm_setzero_ps().as_f32x4();
     macro_rules! call {
         ($imm4:expr) => {
-            vminss(
-                a.as_f32x4(),
-                b.as_f32x4(),
-                _mm_setzero_ps().as_f32x4(),
-                k,
-                $imm4,
-            )
+            vminss(a, b, zero, k, $imm4)
         };
     }
     transmute(constify_imm4_sae!(sae, call))
@@ -30852,15 +29777,12 @@ pub unsafe fn _mm_maskz_min_round_ss(k: __mmask8, a: __m128, b: __m128, sae: i32
 #[cfg_attr(test, assert_instr(vminsd, sae = 8))]
 #[rustc_args_required_const(2)]
 pub unsafe fn _mm_min_round_sd(a: __m128d, b: __m128d, sae: i32) -> __m128d {
+    let a = a.as_f64x2();
+    let b = b.as_f64x2();
+    let zero = _mm_setzero_pd().as_f64x2();
     macro_rules! call {
         ($imm4:expr) => {
-            vminsd(
-                a.as_f64x2(),
-                b.as_f64x2(),
-                _mm_setzero_pd().as_f64x2(),
-                0b1,
-                $imm4,
-            )
+            vminsd(a, b, zero, 0b1, $imm4)
         };
     }
     transmute(constify_imm4_sae!(sae, call))
@@ -30881,9 +29803,12 @@ pub unsafe fn _mm_mask_min_round_sd(
     b: __m128d,
     sae: i32,
 ) -> __m128d {
+    let a = a.as_f64x2();
+    let b = b.as_f64x2();
+    let src = src.as_f64x2();
     macro_rules! call {
         ($imm4:expr) => {
-            vminsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4)
+            vminsd(a, b, src, k, $imm4)
         };
     }
     transmute(constify_imm4_sae!(sae, call))
@@ -30898,15 +29823,12 @@ pub unsafe fn _mm_mask_min_round_sd(
 #[cfg_attr(test, assert_instr(vminsd, sae = 8))]
 #[rustc_args_required_const(3)]
 pub unsafe fn _mm_maskz_min_round_sd(k: __mmask8, a: __m128d, b: __m128d, sae: i32) -> __m128d {
+    let a = a.as_f64x2();
+    let b = b.as_f64x2();
+    let zero = _mm_setzero_pd().as_f64x2();
     macro_rules! call {
         ($imm4:expr) => {
-            vminsd(
-                a.as_f64x2(),
-                b.as_f64x2(),
-                _mm_setzero_pd().as_f64x2(),
-                k,
-                $imm4,
-            )
+            vminsd(a, b, zero, k, $imm4)
         };
     }
     transmute(constify_imm4_sae!(sae, call))
@@ -30927,15 +29849,12 @@ pub unsafe fn _mm_maskz_min_round_sd(k: __mmask8, a: __m128d, b: __m128d, sae: i
 #[cfg_attr(test, assert_instr(vsqrtss, rounding = 8))]
 #[rustc_args_required_const(2)]
 pub unsafe fn _mm_sqrt_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 {
+    let a = a.as_f32x4();
+    let b = b.as_f32x4();
+    let zero = _mm_setzero_ps().as_f32x4();
     macro_rules! call {
         ($imm4:expr) => {
-            vsqrtss(
-                a.as_f32x4(),
-                b.as_f32x4(),
-                _mm_setzero_ps().as_f32x4(),
-                0b1,
-                $imm4,
-            )
+            vsqrtss(a, b, zero, 0b1, $imm4)
         };
     }
     transmute(constify_imm4_round!(rounding, call))
@@ -30962,9 +29881,12 @@ pub unsafe fn _mm_mask_sqrt_round_ss(
     b: __m128,
     rounding: i32,
 ) -> __m128 {
+    let a = a.as_f32x4();
+    let b = b.as_f32x4();
+    let src = src.as_f32x4();
     macro_rules! call {
         ($imm4:expr) => {
-            vsqrtss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4)
+            vsqrtss(a, b, src, k, $imm4)
         };
     }
     transmute(constify_imm4_round!(rounding, call))
@@ -30985,15 +29907,12 @@ pub unsafe fn _mm_mask_sqrt_round_ss(
 #[cfg_attr(test, assert_instr(vsqrtss, rounding = 8))]
 #[rustc_args_required_const(3)]
 pub unsafe fn _mm_maskz_sqrt_round_ss(k: __mmask8, a: __m128, b: __m128, rounding: i32) -> __m128 {
+    let a = a.as_f32x4();
+    let b = b.as_f32x4();
+    let zero = _mm_setzero_ps().as_f32x4();
     macro_rules! call {
         ($imm4:expr) => {
-            vsqrtss(
-                a.as_f32x4(),
-                b.as_f32x4(),
-                _mm_setzero_ps().as_f32x4(),
-                k,
-                $imm4,
-            )
+            vsqrtss(a, b, zero, k, $imm4)
         };
     }
     transmute(constify_imm4_round!(rounding, call))
@@ -31014,15 +29933,12 @@ pub unsafe fn _mm_maskz_sqrt_round_ss(k: __mmask8, a: __m128, b: __m128, roundin
 #[cfg_attr(test, assert_instr(vsqrtsd, rounding = 8))]
 #[rustc_args_required_const(2)]
 pub unsafe fn _mm_sqrt_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d {
+    let a = a.as_f64x2();
+    let b = b.as_f64x2();
+    let zero = _mm_setzero_pd().as_f64x2();
     macro_rules! call {
         ($imm4:expr) => {
-            vsqrtsd(
-                a.as_f64x2(),
-                b.as_f64x2(),
-                _mm_setzero_pd().as_f64x2(),
-                0b1,
-                $imm4,
-            )
+            vsqrtsd(a, b, zero, 0b1, $imm4)
         };
     }
     transmute(constify_imm4_round!(rounding, call))
@@ -31049,9 +29965,12 @@ pub unsafe fn _mm_mask_sqrt_round_sd(
     b: __m128d,
     rounding: i32,
 ) -> __m128d {
+    let a = a.as_f64x2();
+    let b = b.as_f64x2();
+    let src = src.as_f64x2();
     macro_rules! call {
         ($imm4:expr) => {
-            vsqrtsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4)
+            vsqrtsd(a, b, src, k, $imm4)
         };
     }
     transmute(constify_imm4_round!(rounding, call))
@@ -31077,15 +29996,12 @@ pub unsafe fn _mm_maskz_sqrt_round_sd(
     b: __m128d,
     rounding: i32,
 ) -> __m128d {
+    let a = a.as_f64x2();
+    let b = b.as_f64x2();
+    let zero = _mm_setzero_pd().as_f64x2();
     macro_rules! call {
         ($imm4:expr) => {
-            vsqrtsd(
-                a.as_f64x2(),
-                b.as_f64x2(),
-                _mm_setzero_pd().as_f64x2(),
-                k,
-                $imm4,
-            )
+            vsqrtsd(a, b, zero, k, $imm4)
         };
     }
     transmute(constify_imm4_round!(rounding, call))
@@ -31100,15 +30016,12 @@ pub unsafe fn _mm_maskz_sqrt_round_sd(
 #[cfg_attr(test, assert_instr(vgetexpss, sae = 8))]
 #[rustc_args_required_const(2)]
 pub unsafe fn _mm_getexp_round_ss(a: __m128, b: __m128, sae: i32) -> __m128 {
+    let a = a.as_f32x4();
+    let b = b.as_f32x4();
+    let zero = _mm_setzero_ps().as_f32x4();
     macro_rules! call {
         ($imm4:expr) => {
-            vgetexpss(
-                a.as_f32x4(),
-                b.as_f32x4(),
-                _mm_setzero_ps().as_f32x4(),
-                0b1,
-                $imm4,
-            )
+            vgetexpss(a, b, zero, 0b1, $imm4)
         };
     }
     let r = constify_imm4_sae!(sae, call);
@@ -31130,9 +30043,12 @@ pub unsafe fn _mm_mask_getexp_round_ss(
     b: __m128,
     sae: i32,
 ) -> __m128 {
+    let a = a.as_f32x4();
+    let b = b.as_f32x4();
+    let src = src.as_f32x4();
     macro_rules! call {
         ($imm4:expr) => {
-            vgetexpss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4)
+            vgetexpss(a, b, src, k, $imm4)
         };
     }
     let r = constify_imm4_sae!(sae, call);
@@ -31148,15 +30064,12 @@ pub unsafe fn _mm_mask_getexp_round_ss(
 #[cfg_attr(test, assert_instr(vgetexpss, sae = 8))]
 #[rustc_args_required_const(3)]
 pub unsafe fn _mm_maskz_getexp_round_ss(k: __mmask8, a: __m128, b: __m128, sae: i32) -> __m128 {
+    let a = a.as_f32x4();
+    let b = b.as_f32x4();
+    let zero = _mm_setzero_ps().as_f32x4();
     macro_rules! call {
         ($imm4:expr) => {
-            vgetexpss(
-                a.as_f32x4(),
-                b.as_f32x4(),
-                _mm_setzero_ps().as_f32x4(),
-                k,
-                $imm4,
-            )
+            vgetexpss(a, b, zero, k, $imm4)
         };
     }
     let r = constify_imm4_sae!(sae, call);
@@ -31172,15 +30085,12 @@ pub unsafe fn _mm_maskz_getexp_round_ss(k: __mmask8, a: __m128, b: __m128, sae:
 #[cfg_attr(test, assert_instr(vgetexpsd, sae = 8))]
 #[rustc_args_required_const(2)]
 pub unsafe fn _mm_getexp_round_sd(a: __m128d, b: __m128d, sae: i32) -> __m128d {
+    let a = a.as_f64x2();
+    let b = b.as_f64x2();
+    let zero = _mm_setzero_pd().as_f64x2();
     macro_rules! call {
         ($imm4:expr) => {
-            vgetexpsd(
-                a.as_f64x2(),
-                b.as_f64x2(),
-                _mm_setzero_pd().as_f64x2(),
-                0b1,
-                $imm4,
-            )
+            vgetexpsd(a, b, zero, 0b1, $imm4)
         };
     }
     let r = constify_imm4_sae!(sae, call);
@@ -31202,9 +30112,12 @@ pub unsafe fn _mm_mask_getexp_round_sd(
     b: __m128d,
     sae: i32,
 ) -> __m128d {
+    let a = a.as_f64x2();
+    let b = b.as_f64x2();
+    let src = src.as_f64x2();
     macro_rules! call {
         ($imm4:expr) => {
-            vgetexpsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4)
+            vgetexpsd(a, b, src, k, $imm4)
         };
     }
     let r = constify_imm4_sae!(sae, call);
@@ -31220,15 +30133,12 @@ pub unsafe fn _mm_mask_getexp_round_sd(
 #[cfg_attr(test, assert_instr(vgetexpsd, sae = 8))]
 #[rustc_args_required_const(3)]
 pub unsafe fn _mm_maskz_getexp_round_sd(k: __mmask8, a: __m128d, b: __m128d, sae: i32) -> __m128d {
+    let a = a.as_f64x2();
+    let b = b.as_f64x2();
+    let zero = _mm_setzero_pd().as_f64x2();
     macro_rules! call {
         ($imm4:expr) => {
-            vgetexpsd(
-                a.as_f64x2(),
-                b.as_f64x2(),
-                _mm_setzero_pd().as_f64x2(),
-                k,
-                $imm4,
-            )
+            vgetexpsd(a, b, zero, k, $imm4)
         };
     }
     let r = constify_imm4_sae!(sae, call);
@@ -31259,16 +30169,12 @@ pub unsafe fn _mm_getmant_round_ss(
     sign: _MM_MANTISSA_SIGN_ENUM,
     sae: i32,
 ) -> __m128 {
+    let a = a.as_f32x4();
+    let b = b.as_f32x4();
+    let zero = _mm_setzero_ps().as_f32x4();
     macro_rules! call {
         ($imm4_1:expr, $imm2:expr, $imm4_2:expr) => {
-            vgetmantss(
-                a.as_f32x4(),
-                b.as_f32x4(),
-                $imm2 << 2 | $imm4_1,
-                _mm_setzero_ps().as_f32x4(),
-                0b1,
-                $imm4_2,
-            )
+            vgetmantss(a, b, $imm2 << 2 | $imm4_1, zero, 0b1, $imm4_2)
         };
     }
     let r = constify_imm4_mantissas_sae!(norm, sign, sae, call);
@@ -31301,16 +30207,12 @@ pub unsafe fn _mm_mask_getmant_round_ss(
     sign: _MM_MANTISSA_SIGN_ENUM,
     sae: i32,
 ) -> __m128 {
+    let a = a.as_f32x4();
+    let b = b.as_f32x4();
+    let src = src.as_f32x4();
     macro_rules! call {
         ($imm4_1:expr, $imm2:expr, $imm4_2:expr) => {
-            vgetmantss(
-                a.as_f32x4(),
-                b.as_f32x4(),
-                $imm2 << 2 | $imm4_1,
-                src.as_f32x4(),
-                k,
-                $imm4_2,
-            )
+            vgetmantss(a, b, $imm2 << 2 | $imm4_1, src, k, $imm4_2)
         };
     }
     let r = constify_imm4_mantissas_sae!(norm, sign, sae, call);
@@ -31342,16 +30244,12 @@ pub unsafe fn _mm_maskz_getmant_round_ss(
     sign: _MM_MANTISSA_SIGN_ENUM,
     sae: i32,
 ) -> __m128 {
+    let a = a.as_f32x4();
+    let b = b.as_f32x4();
+    let zero = _mm_setzero_ps().as_f32x4();
     macro_rules! call {
         ($imm4_1:expr, $imm2:expr, $imm4_2:expr) => {
-            vgetmantss(
-                a.as_f32x4(),
-                b.as_f32x4(),
-                $imm2 << 2 | $imm4_1,
-                _mm_setzero_ps().as_f32x4(),
-                k,
-                $imm4_2,
-            )
+            vgetmantss(a, b, $imm2 << 2 | $imm4_1, zero, k, $imm4_2)
         };
     }
     let r = constify_imm4_mantissas_sae!(norm, sign, sae, call);
@@ -31382,16 +30280,12 @@ pub unsafe fn _mm_getmant_round_sd(
     sign: _MM_MANTISSA_SIGN_ENUM,
     sae: i32,
 ) -> __m128d {
+    let a = a.as_f64x2();
+    let b = b.as_f64x2();
+    let zero = _mm_setzero_pd().as_f64x2();
     macro_rules! call {
         ($imm4_1:expr, $imm2:expr, $imm4_2:expr) => {
-            vgetmantsd(
-                a.as_f64x2(),
-                b.as_f64x2(),
-                $imm2 << 2 | $imm4_1,
-                _mm_setzero_pd().as_f64x2(),
-                0b1,
-                $imm4_2,
-            )
+            vgetmantsd(a, b, $imm2 << 2 | $imm4_1, zero, 0b1, $imm4_2)
         };
     }
     let r = constify_imm4_mantissas_sae!(norm, sign, sae, call);
@@ -31424,16 +30318,12 @@ pub unsafe fn _mm_mask_getmant_round_sd(
     sign: _MM_MANTISSA_SIGN_ENUM,
     sae: i32,
 ) -> __m128d {
+    let a = a.as_f64x2();
+    let b = b.as_f64x2();
+    let src = src.as_f64x2();
     macro_rules! call {
         ($imm4_1:expr, $imm2:expr, $imm4_2:expr) => {
-            vgetmantsd(
-                a.as_f64x2(),
-                b.as_f64x2(),
-                $imm2 << 2 | $imm4_1,
-                src.as_f64x2(),
-                k,
-                $imm4_2,
-            )
+            vgetmantsd(a, b, $imm2 << 2 | $imm4_1, src, k, $imm4_2)
         };
     }
     let r = constify_imm4_mantissas_sae!(norm, sign, sae, call);
@@ -31465,16 +30355,12 @@ pub unsafe fn _mm_maskz_getmant_round_sd(
     sign: _MM_MANTISSA_SIGN_ENUM,
     sae: i32,
 ) -> __m128d {
+    let a = a.as_f64x2();
+    let b = b.as_f64x2();
+    let zero = _mm_setzero_pd().as_f64x2();
     macro_rules! call {
         ($imm4_1:expr, $imm2:expr, $imm4_2:expr) => {
-            vgetmantsd(
-                a.as_f64x2(),
-                b.as_f64x2(),
-                $imm2 << 2 | $imm4_1,
-                _mm_setzero_pd().as_f64x2(),
-                k,
-                $imm4_2,
-            )
+            vgetmantsd(a, b, $imm2 << 2 | $imm4_1, zero, k, $imm4_2)
         };
     }
     let r = constify_imm4_mantissas_sae!(norm, sign, sae, call);
@@ -31717,9 +30603,12 @@ pub unsafe fn _mm_mask_scalef_round_ss(
     b: __m128,
     rounding: i32,
 ) -> __m128 {
+    let a = a.as_f32x4();
+    let b = b.as_f32x4();
+    let src = src.as_f32x4();
     macro_rules! call {
         ($imm4:expr) => {
-            vscalefss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4)
+            vscalefss(a, b, src, k, $imm4)
         };
     }
     let r = constify_imm4_round!(rounding, call);
@@ -31746,15 +30635,12 @@ pub unsafe fn _mm_maskz_scalef_round_ss(
     b: __m128,
     rounding: i32,
 ) -> __m128 {
+    let a = a.as_f32x4();
+    let b = b.as_f32x4();
+    let zero = _mm_setzero_ps().as_f32x4();
     macro_rules! call {
         ($imm4:expr) => {
-            vscalefss(
-                a.as_f32x4(),
-                b.as_f32x4(),
-                _mm_setzero_ps().as_f32x4(),
-                k,
-                $imm4,
-            )
+            vscalefss(a, b, zero, k, $imm4)
         };
     }
     let r = constify_imm4_round!(rounding, call);
@@ -31776,15 +30662,12 @@ pub unsafe fn _mm_maskz_scalef_round_ss(
 #[cfg_attr(test, assert_instr(vscalefsd, rounding = 8))]
 #[rustc_args_required_const(2)]
 pub unsafe fn _mm_scalef_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d {
+    let a = a.as_f64x2();
+    let b = b.as_f64x2();
+    let zero = _mm_setzero_pd().as_f64x2();
     macro_rules! call {
         ($imm4:expr) => {
-            vscalefsd(
-                a.as_f64x2(),
-                b.as_f64x2(),
-                _mm_setzero_pd().as_f64x2(),
-                0b11111111,
-                $imm4,
-            )
+            vscalefsd(a, b, zero, 0b11111111, $imm4)
         };
     }
     let r = constify_imm4_round!(rounding, call);
@@ -31812,9 +30695,12 @@ pub unsafe fn _mm_mask_scalef_round_sd(
     b: __m128d,
     rounding: i32,
 ) -> __m128d {
+    let a = a.as_f64x2();
+    let b = b.as_f64x2();
+    let src = src.as_f64x2();
     macro_rules! call {
         ($imm4:expr) => {
-            vscalefsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4)
+            vscalefsd(a, b, src, k, $imm4)
         };
     }
     let r = constify_imm4_round!(rounding, call);
@@ -31841,15 +30727,12 @@ pub unsafe fn _mm_maskz_scalef_round_sd(
     b: __m128d,
     rounding: i32,
 ) -> __m128d {
+    let a = a.as_f64x2();
+    let b = b.as_f64x2();
+    let zero = _mm_setzero_pd().as_f64x2();
     macro_rules! call {
         ($imm4:expr) => {
-            vscalefsd(
-                a.as_f64x2(),
-                b.as_f64x2(),
-                _mm_setzero_pd().as_f64x2(),
-                k,
-                $imm4,
-            )
+            vscalefsd(a, b, zero, k, $imm4)
         };
     }
     let r = constify_imm4_round!(rounding, call);
@@ -33421,9 +32304,12 @@ pub unsafe fn _mm_mask_cvt_roundss_sd(
     b: __m128,
     sae: i32,
 ) -> __m128d {
+    let a = a.as_f64x2();
+    let b = b.as_f32x4();
+    let src = src.as_f64x2();
     macro_rules! call {
         ($imm4:expr) => {
-            vcvtss2sd(a.as_f64x2(), b.as_f32x4(), src.as_f64x2(), k, $imm4)
+            vcvtss2sd(a, b, src, k, $imm4)
         };
     }
     let r = constify_imm4_sae!(sae, call);
@@ -33439,15 +32325,12 @@ pub unsafe fn _mm_mask_cvt_roundss_sd(
 #[cfg_attr(test, assert_instr(vcvtss2sd, sae = 8))]
 #[rustc_args_required_const(3)]
 pub unsafe fn _mm_maskz_cvt_roundss_sd(k: __mmask8, a: __m128d, b: __m128, sae: i32) -> __m128d {
+    let a = a.as_f64x2();
+    let b = b.as_f32x4();
+    let zero = _mm_setzero_pd().as_f64x2();
     macro_rules! call {
         ($imm4:expr) => {
-            vcvtss2sd(
-                a.as_f64x2(),
-                b.as_f32x4(),
-                _mm_setzero_pd().as_f64x2(),
-                k,
-                $imm4,
-            )
+            vcvtss2sd(a, b, zero, k, $imm4)
         };
     }
     let r = constify_imm4_sae!(sae, call);
@@ -33469,15 +32352,12 @@ pub unsafe fn _mm_maskz_cvt_roundss_sd(k: __mmask8, a: __m128d, b: __m128, sae:
 #[cfg_attr(test, assert_instr(vcvtsd2ss, rounding = 8))]
 #[rustc_args_required_const(2)]
 pub unsafe fn _mm_cvt_roundsd_ss(a: __m128, b: __m128d, rounding: i32) -> __m128 {
+    let a = a.as_f32x4();
+    let b = b.as_f64x2();
+    let zero = _mm_setzero_ps().as_f32x4();
     macro_rules! call {
         ($imm4:expr) => {
-            vcvtsd2ss(
-                a.as_f32x4(),
-                b.as_f64x2(),
-                _mm_setzero_ps().as_f32x4(),
-                0b11111111,
-                $imm4,
-            )
+            vcvtsd2ss(a, b, zero, 0b11111111, $imm4)
         };
     }
     let r = constify_imm4_round!(rounding, call);
@@ -33505,9 +32385,12 @@ pub unsafe fn _mm_mask_cvt_roundsd_ss(
     b: __m128d,
     rounding: i32,
 ) -> __m128 {
+    let a = a.as_f32x4();
+    let b = b.as_f64x2();
+    let src = src.as_f32x4();
     macro_rules! call {
         ($imm4:expr) => {
-            vcvtsd2ss(a.as_f32x4(), b.as_f64x2(), src.as_f32x4(), k, $imm4)
+            vcvtsd2ss(a, b, src, k, $imm4)
         };
     }
     let r = constify_imm4_round!(rounding, call);
@@ -33534,15 +32417,12 @@ pub unsafe fn _mm_maskz_cvt_roundsd_ss(
     b: __m128d,
     rounding: i32,
 ) -> __m128 {
+    let a = a.as_f32x4();
+    let b = b.as_f64x2();
+    let zero = _mm_setzero_ps().as_f32x4();
     macro_rules! call {
         ($imm4:expr) => {
-            vcvtsd2ss(
-                a.as_f32x4(),
-                b.as_f64x2(),
-                _mm_setzero_ps().as_f32x4(),
-                k,
-                $imm4,
-            )
+            vcvtsd2ss(a, b, zero, k, $imm4)
         };
     }
     let r = constify_imm4_round!(rounding, call);
@@ -33564,9 +32444,10 @@ pub unsafe fn _mm_maskz_cvt_roundsd_ss(
 #[cfg_attr(test, assert_instr(vcvtss2si, rounding = 8))]
 #[rustc_args_required_const(1)]
 pub unsafe fn _mm_cvt_roundss_si32(a: __m128, rounding: i32) -> i32 {
+    let a = a.as_f32x4();
     macro_rules! call {
         ($imm4:expr) => {
-            vcvtss2si(a.as_f32x4(), $imm4)
+            vcvtss2si(a, $imm4)
         };
     }
     let r = constify_imm4_round!(rounding, call);
@@ -33588,9 +32469,10 @@ pub unsafe fn _mm_cvt_roundss_si32(a: __m128, rounding: i32) -> i32 {
 #[cfg_attr(test, assert_instr(vcvtss2si, rounding = 8))]
 #[rustc_args_required_const(1)]
 pub unsafe fn _mm_cvt_roundss_i32(a: __m128, rounding: i32) -> i32 {
+    let a = a.as_f32x4();
     macro_rules! call {
         ($imm4:expr) => {
-            vcvtss2si(a.as_f32x4(), $imm4)
+            vcvtss2si(a, $imm4)
         };
     }
     let r = constify_imm4_round!(rounding, call);
@@ -33612,9 +32494,10 @@ pub unsafe fn _mm_cvt_roundss_i32(a: __m128, rounding: i32) -> i32 {
 #[cfg_attr(test, assert_instr(vcvtss2usi, rounding = 8))]
 #[rustc_args_required_const(1)]
 pub unsafe fn _mm_cvt_roundss_u32(a: __m128, rounding: i32) -> u32 {
+    let a = a.as_f32x4();
     macro_rules! call {
         ($imm4:expr) => {
-            vcvtss2usi(a.as_f32x4(), $imm4)
+            vcvtss2usi(a, $imm4)
         };
     }
     let r = constify_imm4_round!(rounding, call);
@@ -33656,9 +32539,10 @@ pub unsafe fn _mm_cvtss_u32(a: __m128) -> u32 {
 #[cfg_attr(test, assert_instr(vcvtsd2si, rounding = 8))]
 #[rustc_args_required_const(1)]
 pub unsafe fn _mm_cvt_roundsd_si32(a: __m128d, rounding: i32) -> i32 {
+    let a = a.as_f64x2();
     macro_rules! call {
         ($imm4:expr) => {
-            vcvtsd2si(a.as_f64x2(), $imm4)
+            vcvtsd2si(a, $imm4)
         };
     }
     let r = constify_imm4_round!(rounding, call);
@@ -33680,9 +32564,10 @@ pub unsafe fn _mm_cvt_roundsd_si32(a: __m128d, rounding: i32) -> i32 {
 #[cfg_attr(test, assert_instr(vcvtsd2si, rounding = 8))]
 #[rustc_args_required_const(1)]
 pub unsafe fn _mm_cvt_roundsd_i32(a: __m128d, rounding: i32) -> i32 {
+    let a = a.as_f64x2();
     macro_rules! call {
         ($imm4:expr) => {
-            vcvtsd2si(a.as_f64x2(), $imm4)
+            vcvtsd2si(a, $imm4)
         };
     }
     let r = constify_imm4_round!(rounding, call);
@@ -33704,9 +32589,10 @@ pub unsafe fn _mm_cvt_roundsd_i32(a: __m128d, rounding: i32) -> i32 {
 #[cfg_attr(test, assert_instr(vcvtsd2usi, rounding = 8))]
 #[rustc_args_required_const(1)]
 pub unsafe fn _mm_cvt_roundsd_u32(a: __m128d, rounding: i32) -> u32 {
+    let a = a.as_f64x2();
     macro_rules! call {
         ($imm4:expr) => {
-            vcvtsd2usi(a.as_f64x2(), $imm4)
+            vcvtsd2usi(a, $imm4)
         };
     }
     let r = constify_imm4_round!(rounding, call);
@@ -33748,9 +32634,10 @@ pub unsafe fn _mm_cvtsd_u32(a: __m128d) -> u32 {
 #[cfg_attr(test, assert_instr(vcvtsi2ss, rounding = 8))]
 #[rustc_args_required_const(2)]
 pub unsafe fn _mm_cvt_roundi32_ss(a: __m128, b: i32, rounding: i32) -> __m128 {
+    let a = a.as_f32x4();
     macro_rules! call {
         ($imm4:expr) => {
-            vcvtsi2ss(a.as_f32x4(), b, $imm4)
+            vcvtsi2ss(a, b, $imm4)
         };
     }
     let r = constify_imm4_round!(rounding, call);
@@ -33772,9 +32659,10 @@ pub unsafe fn _mm_cvt_roundi32_ss(a: __m128, b: i32, rounding: i32) -> __m128 {
 #[cfg_attr(test, assert_instr(vcvtsi2ss, rounding = 8))]
 #[rustc_args_required_const(2)]
 pub unsafe fn _mm_cvt_roundsi32_ss(a: __m128, b: i32, rounding: i32) -> __m128 {
+    let a = a.as_f32x4();
     macro_rules! call {
         ($imm4:expr) => {
-            vcvtsi2ss(a.as_f32x4(), b, $imm4)
+            vcvtsi2ss(a, b, $imm4)
         };
     }
     let r = constify_imm4_round!(rounding, call);
@@ -33796,9 +32684,10 @@ pub unsafe fn _mm_cvt_roundsi32_ss(a: __m128, b: i32, rounding: i32) -> __m128 {
 #[cfg_attr(test, assert_instr(vcvtusi2ss, rounding = 8))]
 #[rustc_args_required_const(2)]
 pub unsafe fn _mm_cvt_roundu32_ss(a: __m128, b: u32, rounding: i32) -> __m128 {
+    let a = a.as_f32x4();
     macro_rules! call {
         ($imm4:expr) => {
-            vcvtusi2ss(a.as_f32x4(), b, $imm4)
+            vcvtusi2ss(a, b, $imm4)
         };
     }
     let r = constify_imm4_round!(rounding, call);
@@ -33838,9 +32727,10 @@ pub unsafe fn _mm_cvti32_sd(a: __m128d, b: i32) -> __m128d {
 #[cfg_attr(test, assert_instr(vcvtss2si, sae = 8))]
 #[rustc_args_required_const(1)]
 pub unsafe fn _mm_cvtt_roundss_si32(a: __m128, sae: i32) -> i32 {
+    let a = a.as_f32x4();
     macro_rules! call {
         ($imm4:expr) => {
-            vcvtss2si(a.as_f32x4(), $imm4)
+            vcvtss2si(a, $imm4)
         };
     }
     let r = constify_imm4_sae!(sae, call);
@@ -33856,9 +32746,10 @@ pub unsafe fn _mm_cvtt_roundss_si32(a: __m128, sae: i32) -> i32 {
 #[cfg_attr(test, assert_instr(vcvtss2si, sae = 8))]
 #[rustc_args_required_const(1)]
 pub unsafe fn _mm_cvtt_roundss_i32(a: __m128, sae: i32) -> i32 {
+    let a = a.as_f32x4();
     macro_rules! call {
         ($imm4:expr) => {
-            vcvtss2si(a.as_f32x4(), $imm4)
+            vcvtss2si(a, $imm4)
         };
     }
     let r = constify_imm4_sae!(sae, call);
@@ -33874,9 +32765,10 @@ pub unsafe fn _mm_cvtt_roundss_i32(a: __m128, sae: i32) -> i32 {
 #[cfg_attr(test, assert_instr(vcvtss2usi, sae = 8))]
 #[rustc_args_required_const(1)]
 pub unsafe fn _mm_cvtt_roundss_u32(a: __m128, sae: i32) -> u32 {
+    let a = a.as_f32x4();
     macro_rules! call {
         ($imm4:expr) => {
-            vcvtss2usi(a.as_f32x4(), $imm4)
+            vcvtss2usi(a, $imm4)
         };
     }
     let r = constify_imm4_sae!(sae, call);
@@ -33912,9 +32804,10 @@ pub unsafe fn _mm_cvttss_u32(a: __m128) -> u32 {
 #[cfg_attr(test, assert_instr(vcvtsd2si, sae = 8))]
 #[rustc_args_required_const(1)]
 pub unsafe fn _mm_cvtt_roundsd_si32(a: __m128d, sae: i32) -> i32 {
+    let a = a.as_f64x2();
     macro_rules! call {
         ($imm4:expr) => {
-            vcvtsd2si(a.as_f64x2(), $imm4)
+            vcvtsd2si(a, $imm4)
         };
     }
     let r = constify_imm4_sae!(sae, call);
@@ -33930,9 +32823,10 @@ pub unsafe fn _mm_cvtt_roundsd_si32(a: __m128d, sae: i32) -> i32 {
 #[cfg_attr(test, assert_instr(vcvtsd2si, sae = 8))]
 #[rustc_args_required_const(1)]
 pub unsafe fn _mm_cvtt_roundsd_i32(a: __m128d, sae: i32) -> i32 {
+    let a = a.as_f64x2();
     macro_rules! call {
         ($imm4:expr) => {
-            vcvtsd2si(a.as_f64x2(), $imm4)
+            vcvtsd2si(a, $imm4)
         };
     }
     let r = constify_imm4_sae!(sae, call);
@@ -33948,9 +32842,10 @@ pub unsafe fn _mm_cvtt_roundsd_i32(a: __m128d, sae: i32) -> i32 {
 #[cfg_attr(test, assert_instr(vcvtsd2usi, sae = 8))]
 #[rustc_args_required_const(1)]
 pub unsafe fn _mm_cvtt_roundsd_u32(a: __m128d, sae: i32) -> u32 {
+    let a = a.as_f64x2();
     macro_rules! call {
         ($imm4:expr) => {
-            vcvtsd2usi(a.as_f64x2(), $imm4)
+            vcvtsd2usi(a, $imm4)
         };
     }
     let r = constify_imm4_sae!(sae, call);
@@ -34034,9 +32929,11 @@ pub unsafe fn _mm_cvtu64_sd(a: __m128d, b: u64) -> __m128d {
 #[cfg_attr(test, assert_instr(vcmp, imm8 = 5, sae = 4))] //should be vcomiss
 #[rustc_args_required_const(2, 3)]
 pub unsafe fn _mm_comi_round_ss(a: __m128, b: __m128, imm8: i32, sae: i32) -> i32 {
+    let a = a.as_f32x4();
+    let b = b.as_f32x4();
     macro_rules! call {
         ($imm8:expr, $imm4:expr) => {
-            vcomiss(a.as_f32x4(), b.as_f32x4(), $imm8, $imm4)
+            vcomiss(a, b, $imm8, $imm4)
         };
     }
     let r = constify_imm5_sae!(imm8, sae, call);
@@ -34052,9 +32949,11 @@ pub unsafe fn _mm_comi_round_ss(a: __m128, b: __m128, imm8: i32, sae: i32) -> i3
 #[cfg_attr(test, assert_instr(vcmp, imm8 = 5, sae = 4))] //should be vcomisd
 #[rustc_args_required_const(2, 3)]
 pub unsafe fn _mm_comi_round_sd(a: __m128d, b: __m128d, imm8: i32, sae: i32) -> i32 {
+    let a = a.as_f64x2();
+    let b = b.as_f64x2();
     macro_rules! call {
         ($imm8:expr, $imm4:expr) => {
-            vcomisd(a.as_f64x2(), b.as_f64x2(), $imm8, $imm4)
+            vcomisd(a, b, $imm8, $imm4)
         };
     }
     let r = constify_imm5_sae!(imm8, sae, call);
@@ -43264,44 +42163,64 @@ mod tests {
 
     #[simd_test(enable = "avx512f")]
     unsafe fn test_mm512_permute_ps() {
-        let a = _mm512_set_ps(
+        let a = _mm512_setr_ps(
             0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
         );
-        let r = _mm512_permute_ps(a, 1);
-        let e = _mm512_set_ps(
-            2., 2., 2., 2., 6., 6., 6., 6., 10., 10., 10., 10., 14., 14., 14., 14.,
+        let r = _mm512_permute_ps(a, 0b11111111);
+        let e = _mm512_setr_ps(
+            3., 3., 3., 3., 7., 7., 7., 7., 11., 11., 11., 11., 15., 15., 15., 15.,
         );
         assert_eq_m512(r, e);
     }
 
     #[simd_test(enable = "avx512f")]
     unsafe fn test_mm512_mask_permute_ps() {
-        let a = _mm512_set_ps(
+        let a = _mm512_setr_ps(
             0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
         );
-        let r = _mm512_mask_permute_ps(a, 0b00000000_00000000, a, 1);
+        let r = _mm512_mask_permute_ps(a, 0, a, 0b11111111);
         assert_eq_m512(r, a);
-        let r = _mm512_mask_permute_ps(a, 0b11111111_11111111, a, 1);
-        let e = _mm512_set_ps(
-            2., 2., 2., 2., 6., 6., 6., 6., 10., 10., 10., 10., 14., 14., 14., 14.,
+        let r = _mm512_mask_permute_ps(a, 0b11111111_11111111, a, 0b111111111);
+        let e = _mm512_setr_ps(
+            3., 3., 3., 3., 7., 7., 7., 7., 11., 11., 11., 11., 15., 15., 15., 15.,
         );
         assert_eq_m512(r, e);
     }
 
     #[simd_test(enable = "avx512f")]
     unsafe fn test_mm512_maskz_permute_ps() {
-        let a = _mm512_set_ps(
+        let a = _mm512_setr_ps(
             0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
         );
-        let r = _mm512_maskz_permute_ps(0, a, 1);
+        let r = _mm512_maskz_permute_ps(0, a, 0b11111111);
         assert_eq_m512(r, _mm512_setzero_ps());
-        let r = _mm512_maskz_permute_ps(0b00000000_11111111, a, 1);
-        let e = _mm512_set_ps(
-            0., 0., 0., 0., 0., 0., 0., 0., 10., 10., 10., 10., 14., 14., 14., 14.,
+        let r = _mm512_maskz_permute_ps(0b11111111_11111111, a, 0b11111111);
+        let e = _mm512_setr_ps(
+            3., 3., 3., 3., 7., 7., 7., 7., 11., 11., 11., 11., 15., 15., 15., 15.,
         );
         assert_eq_m512(r, e);
     }
 
+    #[simd_test(enable = "avx512f,avx512vl")]
+    unsafe fn test_mm256_mask_permute_ps() {
+        let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);
+        let r = _mm256_mask_permute_ps(a, 0, a, 0b11111111);
+        assert_eq_m256(r, a);
+        let r = _mm256_mask_permute_ps(a, 0b11111111, a, 0b11111111);
+        let e = _mm256_set_ps(0., 0., 0., 0., 4., 4., 4., 4.);
+        assert_eq_m256(r, e);
+    }
+
+    #[simd_test(enable = "avx512f,avx512vl")]
+    unsafe fn test_mm256_maskz_permute_ps() {
+        let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);
+        let r = _mm256_maskz_permute_ps(0, a, 0b11111111);
+        assert_eq_m256(r, _mm256_setzero_ps());
+        let r = _mm256_maskz_permute_ps(0b11111111, a, 0b11111111);
+        let e = _mm256_set_ps(0., 0., 0., 0., 4., 4., 4., 4.);
+        assert_eq_m256(r, e);
+    }
+
     #[simd_test(enable = "avx512f")]
     unsafe fn test_mm512_permutevar_epi32() {
         let idx = _mm512_set1_epi32(1);
diff --git a/library/stdarch/crates/core_arch/src/x86_64/avx512f.rs b/library/stdarch/crates/core_arch/src/x86_64/avx512f.rs
index 69b497bca8a5..7adb31c1ccb7 100644
--- a/library/stdarch/crates/core_arch/src/x86_64/avx512f.rs
+++ b/library/stdarch/crates/core_arch/src/x86_64/avx512f.rs
@@ -7288,85 +7288,85 @@ mod tests {
 
     #[simd_test(enable = "avx512f")]
     unsafe fn test_mm512_permute_pd() {
-        let a = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.);
-        let r = _mm512_permute_pd(a, 1);
-        let e = _mm512_set_pd(1., 1., 3., 3., 5., 5., 7., 7.);
+        let a = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+        let r = _mm512_permute_pd(a, 0b1111);
+        let e = _mm512_setr_pd(1., 1., 3., 3., 5., 5., 7., 7.);
         assert_eq_m512d(r, e);
     }
 
     #[simd_test(enable = "avx512f")]
     unsafe fn test_mm512_mask_permute_pd() {
-        let a = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.);
-        let r = _mm512_mask_permute_pd(a, 0, a, 1);
+        let a = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+        let r = _mm512_mask_permute_pd(a, 0, a, 0b1111);
         assert_eq_m512d(r, a);
-        let r = _mm512_mask_permute_pd(a, 0b11111111, a, 1);
-        let e = _mm512_set_pd(1., 1., 3., 3., 5., 5., 7., 7.);
+        let r = _mm512_mask_permute_pd(a, 0b11111111, a, 0b1111);
+        let e = _mm512_setr_pd(1., 1., 3., 3., 5., 5., 7., 7.);
         assert_eq_m512d(r, e);
     }
 
     #[simd_test(enable = "avx512f")]
     unsafe fn test_mm512_maskz_permute_pd() {
-        let a = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.);
-        let r = _mm512_maskz_permute_pd(0, a, 1);
+        let a = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+        let r = _mm512_maskz_permute_pd(0, a, 0b1111);
         assert_eq_m512d(r, _mm512_setzero_pd());
-        let r = _mm512_maskz_permute_pd(0b00001111, a, 1);
-        let e = _mm512_set_pd(0., 0., 0., 0., 5., 5., 7., 7.);
+        let r = _mm512_maskz_permute_pd(0b11111111, a, 0b1111);
+        let e = _mm512_setr_pd(1., 1., 3., 3., 5., 5., 7., 7.);
         assert_eq_m512d(r, e);
     }
 
     #[simd_test(enable = "avx512f")]
     unsafe fn test_mm512_permutex_epi64() {
-        let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7);
-        let r = _mm512_permutex_epi64(a, 1);
-        let e = _mm512_set_epi64(6, 6, 6, 6, 6, 6, 6, 6);
+        let a = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+        let r = _mm512_permutex_epi64(a, 0b11111111);
+        let e = _mm512_setr_epi64(3, 3, 3, 3, 7, 7, 7, 7);
         assert_eq_m512i(r, e);
     }
 
     #[simd_test(enable = "avx512f")]
     unsafe fn test_mm512_mask_permutex_epi64() {
-        let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7);
-        let r = _mm512_mask_permutex_epi64(a, 0, a, 1);
+        let a = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+        let r = _mm512_mask_permutex_epi64(a, 0, a, 0b11111111);
         assert_eq_m512i(r, a);
-        let r = _mm512_mask_permutex_epi64(a, 0b11111111, a, 1);
-        let e = _mm512_set_epi64(6, 6, 6, 6, 6, 6, 6, 6);
+        let r = _mm512_mask_permutex_epi64(a, 0b11111111, a, 0b11111111);
+        let e = _mm512_setr_epi64(3, 3, 3, 3, 7, 7, 7, 7);
         assert_eq_m512i(r, e);
     }
 
     #[simd_test(enable = "avx512f")]
     unsafe fn test_mm512_maskz_permutex_epi64() {
-        let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7);
-        let r = _mm512_maskz_permutex_epi64(0, a, 1);
+        let a = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+        let r = _mm512_maskz_permutex_epi64(0, a, 0b11111111);
         assert_eq_m512i(r, _mm512_setzero_si512());
-        let r = _mm512_maskz_permutex_epi64(0b00001111, a, 1);
-        let e = _mm512_set_epi64(0, 0, 0, 0, 6, 6, 6, 6);
+        let r = _mm512_maskz_permutex_epi64(0b11111111, a, 0b11111111);
+        let e = _mm512_setr_epi64(3, 3, 3, 3, 7, 7, 7, 7);
         assert_eq_m512i(r, e);
     }
 
     #[simd_test(enable = "avx512f")]
     unsafe fn test_mm512_permutex_pd() {
-        let a = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.);
-        let r = _mm512_permutex_pd(a, 1);
-        let e = _mm512_set_pd(6., 6., 6., 6., 6., 6., 6., 6.);
+        let a = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+        let r = _mm512_permutex_pd(a, 0b11111111);
+        let e = _mm512_setr_pd(3., 3., 3., 3., 7., 7., 7., 7.);
         assert_eq_m512d(r, e);
     }
 
     #[simd_test(enable = "avx512f")]
     unsafe fn test_mm512_mask_permutex_pd() {
-        let a = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.);
-        let r = _mm512_mask_permutex_pd(a, 0, a, 1);
+        let a = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+        let r = _mm512_mask_permutex_pd(a, 0, a, 0b11111111);
         assert_eq_m512d(r, a);
-        let r = _mm512_mask_permutex_pd(a, 0b11111111, a, 1);
-        let e = _mm512_set_pd(6., 6., 6., 6., 6., 6., 6., 6.);
+        let r = _mm512_mask_permutex_pd(a, 0b11111111, a, 0b11111111);
+        let e = _mm512_setr_pd(3., 3., 3., 3., 7., 7., 7., 7.);
         assert_eq_m512d(r, e);
     }
 
     #[simd_test(enable = "avx512f")]
     unsafe fn test_mm512_maskz_permutex_pd() {
-        let a = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.);
-        let r = _mm512_maskz_permutex_pd(0, a, 1);
+        let a = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+        let r = _mm512_maskz_permutex_pd(0, a, 0b11111111);
         assert_eq_m512d(r, _mm512_setzero_pd());
-        let r = _mm512_maskz_permutex_pd(0b00001111, a, 1);
-        let e = _mm512_set_pd(0., 0., 0., 0., 6., 6., 6., 6.);
+        let r = _mm512_maskz_permutex_pd(0b11111111, a, 0b11111111);
+        let e = _mm512_setr_pd(3., 3., 3., 3., 7., 7., 7., 7.);
         assert_eq_m512d(r, e);
     }