diff --git a/library/stdarch/crates/core_arch/avx512f.md b/library/stdarch/crates/core_arch/avx512f.md
index dc6440a53841..0fcf4d96417b 100644
--- a/library/stdarch/crates/core_arch/avx512f.md
+++ b/library/stdarch/crates/core_arch/avx512f.md
@@ -1176,22 +1176,167 @@
 * [x] [`_mm256_mask_andnot_epi64`]
 * [x] [`_mm256_maskz_andnot_epi64`]
 * [x] [`_mm512_andnot_si512`]
-
+ * [x] [`_mm512_mask_unpackhi_epi32`]
+ * [x] [`_mm512_unpackhi_epi32`]
+ * [x] [`_mm_mask_unpackhi_epi32`]
+ * [x] [`_mm_maskz_unpackhi_epi32`]
+ * [x] [`_mm256_mask_unpackhi_epi32`]
+ * [x] [`_mm256_maskz_unpackhi_epi32`]
+ * [x] [`_mm512_unpackhi_epi64`]
+ * [x] [`_mm512_mask_unpackhi_epi64`]
+ * [x] [`_mm_mask_unpackhi_epi64`]
+ * [x] [`_mm_maskz_unpackhi_epi64`]
+ * [x] [`_mm256_mask_unpackhi_epi64`]
+ * [x] [`_mm256_maskz_unpackhi_epi64`]
+ * [x] [`_mm512_unpackhi_ps`]
+ * [x] [`_mm512_mask_unpackhi_ps`]
+ * [x] [`_mm_mask_unpackhi_ps`]
+ * [x] [`_mm_maskz_unpackhi_ps`]
+ * [x] [`_mm256_mask_unpackhi_ps`]
+ * [x] [`_mm256_maskz_unpackhi_ps`]
+ * [x] [`_mm512_unpackhi_pd`]
+ * [x] [`_mm512_mask_unpackhi_pd`]
+ * [x] [`_mm_mask_unpackhi_pd`]
+ * [x] [`_mm_maskz_unpackhi_pd`]
+ * [x] [`_mm256_mask_unpackhi_pd`]
+ * [x] [`_mm256_maskz_unpackhi_pd`]
+ * [x] [`_mm512_mask_unpacklo_epi32`]
+ * [x] [`_mm512_unpacklo_epi32`]
+ * [x] [`_mm_mask_unpacklo_epi32`]
+ * [x] [`_mm_maskz_unpacklo_epi32`]
+ * [x] [`_mm256_mask_unpacklo_epi32`]
+ * [x] [`_mm256_maskz_unpacklo_epi32`]
+ * [x] [`_mm512_unpacklo_epi64`]
+ * [x] [`_mm512_mask_unpacklo_epi64`]
+ * [x] [`_mm_mask_unpacklo_epi64`]
+ * [x] [`_mm_maskz_unpacklo_epi64`]
+ * [x] [`_mm256_mask_unpacklo_epi64`]
+ * [x] [`_mm256_maskz_unpacklo_epi64`]
+ * [x] [`_mm512_unpacklo_ps`]
+ * [x] [`_mm512_mask_unpacklo_ps`]
+ * [x] [`_mm_mask_unpacklo_ps`]
+ * [x] [`_mm_maskz_unpacklo_ps`]
+ * [x] [`_mm256_mask_unpacklo_ps`]
+ * [x] [`_mm256_maskz_unpacklo_ps`]
+ * [x] [`_mm512_unpacklo_pd`]
+ * [x] [`_mm512_mask_unpacklo_pd`]
+ * [x] [`_mm_mask_unpacklo_pd`]
+ * [x] [`_mm_maskz_unpacklo_pd`]
+ * [x] [`_mm256_mask_unpacklo_pd`]
+ * [x] [`_mm256_maskz_unpacklo_pd`]
+ * [x] [`_mm512_mask_blend_epi32`]
+ * [x] [`_mm_mask_blend_epi32`]
+ * [x] [`_mm256_mask_blend_epi32`]
+ * [x] [`_mm512_mask_blend_epi64`]
+ * [x] [`_mm_mask_blend_epi64`]
+ * [x] [`_mm256_mask_blend_epi64`]
+ * [x] [`_mm512_mask_blend_ps`]
+ * [x] [`_mm_mask_blend_ps`]
+ * [x] [`_mm256_mask_blend_ps`]
+ * [x] [`_mm512_mask_blend_pd`]
+ * [x] [`_mm_mask_blend_pd`]
+ * [x] [`_mm256_mask_blend_pd`]
+ * [x] [`_mm512_broadcast_f32x4`]
+ * [x] [`_mm512_mask_broadcast_f32x4`]
+ * [x] [`_mm512_maskz_broadcast_f32x4`]
+ * [x] [`_mm256_broadcast_f32x4`]
+ * [x] [`_mm256_mask_broadcast_f32x4`]
+ * [x] [`_mm256_maskz_broadcast_f32x4`]
+ * [x] [`_mm512_broadcast_f64x4`]
+ * [x] [`_mm512_mask_broadcast_f64x4`]
+ * [x] [`_mm512_maskz_broadcast_f64x4`]
+ * [x] [`_mm512_broadcast_i32x4`]
+ * [x] [`_mm512_mask_broadcast_i32x4`]
+ * [x] [`_mm512_maskz_broadcast_i32x4`]
+ * [x] [`_mm256_broadcast_i32x4`]
+ * [x] [`_mm256_mask_broadcast_i32x4`]
+ * [x] [`_mm256_maskz_broadcast_i32x4`]
+ * [x] [`_mm512_broadcast_i64x4`]
+ * [x] [`_mm512_mask_broadcast_i64x4`]
+ * [x] [`_mm512_maskz_broadcast_i64x4`]
+ * [x] [`_mm512_broadcastd_epi32`]
+ * [x] [`_mm512_mask_broadcastd_epi32`]
+ * [x] [`_mm512_maskz_broadcastd_epi32`]
+ * [x] [`_mm_mask_broadcastd_epi32`]
+ * [x] [`_mm_maskz_broadcastd_epi32`]
+ * [x] [`_mm256_mask_broadcastd_epi32`]
+ * [x] [`_mm256_maskz_broadcastd_epi32`]
+ * [x] [`_mm512_broadcastq_epi64`]
+ * [x] [`_mm512_mask_broadcastq_epi64`]
+ * [x] [`_mm512_maskz_broadcastq_epi64`]
+ * [x] [`_mm_mask_broadcastq_epi64`]
+ * [x] [`_mm_maskz_broadcastq_epi64`]
+ * [x] [`_mm256_mask_broadcastq_epi64`]
+ * [x] [`_mm256_maskz_broadcastq_epi64`]
+ * [x] [`_mm512_broadcastss_ps`]
+ * [x] [`_mm512_mask_broadcastss_ps`]
+ * [x] [`_mm512_maskz_broadcastss_ps`]
+ * [x] [`_mm_mask_broadcastss_ps`]
+ * [x] [`_mm_maskz_broadcastss_ps`]
+ * [x] [`_mm256_mask_broadcastss_ps`]
+ * [x] [`_mm256_maskz_broadcastss_ps`]
+ * [x] [`_mm512_broadcastsd_pd`]
+ * [x] [`_mm512_mask_broadcastsd_pd`]
+ * [x] [`_mm512_maskz_broadcastsd_pd`]
+ * [x] [`_mm256_mask_broadcastsd_pd`]
+ * [x] [`_mm256_maskz_broadcastsd_pd`]
+ * [x] [`_mm512_shuffle_epi32`]
+ * [x] [`_mm512_mask_shuffle_epi32`]
+ * [x] [`_mm_mask_shuffle_epi32`]
+ * [x] [`_mm_maskz_shuffle_epi32`]
+ * [x] [`_mm256_mask_shuffle_epi32`]
+ * [x] [`_mm256_maskz_shuffle_epi32`]
+ * [x] [`_mm512_shuffle_ps`]
+ * [x] [`_mm512_mask_shuffle_ps`]
+ * [x] [`_mm_mask_shuffle_ps`]
+ * [x] [`_mm_maskz_shuffle_ps`]
+ * [x] [`_mm256_mask_shuffle_ps`]
+ * [x] [`_mm256_maskz_shuffle_ps`]
+ * [x] [`_mm512_shuffle_pd`]
+ * [x] [`_mm512_mask_shuffle_pd`]
+ * [x] [`_mm_mask_shuffle_pd`]
+ * [x] [`_mm_maskz_shuffle_pd`]
+ * [x] [`_mm256_mask_shuffle_pd`]
+ * [x] [`_mm256_maskz_shuffle_pd`]
+ * [x] [`_mm512_shuffle_i32x4`]
+ * [x] [`_mm512_mask_shuffle_i32x4`]
+ * [x] [`_mm256_mask_shuffle_i32x4`]
+ * [x] [`_mm256_maskz_shuffle_i32x4`]
+ * [x] [`_mm256_shuffle_i32x4`]
+ * [x] [`_mm512_shuffle_i64x2`]
+ * [x] [`_mm512_mask_shuffle_i64x2`]
+ * [x] [`_mm256_mask_shuffle_i64x2`]
+ * [x] [`_mm256_maskz_shuffle_i64x2`]
+ * [x] [`_mm256_shuffle_i64x2`]
+ * [x] [`_mm512_shuffle_f32x4`]
+ * [x] [`_mm512_mask_shuffle_f32x4`]
+ * [x] [`_mm256_mask_shuffle_f32x4`]
+ * [x] [`_mm256_maskz_shuffle_f32x4`]
+ * [x] [`_mm256_shuffle_f32x4`]
+ * [x] [`_mm512_shuffle_f64x2`]
+ * [x] [`_mm512_mask_shuffle_f64x2`]
+ * [x] [`_mm256_mask_shuffle_f64x2`]
+ * [x] [`_mm256_maskz_shuffle_f64x2`]
+ * [x] [`_mm256_shuffle_f64x2`]
 * [x] [`_mm512_alignr_epi32`]
 * [x] [`_mm512_mask_alignr_epi32`]
 * [x] [`_mm512_maskz_alignr_epi32`]
+ * [x] [`_mm_alignr_epi32`]
+ * [x] [`_mm_mask_alignr_epi32`]
+ * [x] [`_mm_maskz_alignr_epi32`]
+ * [x] [`_mm256_alignr_epi32`]
+ * [x] [`_mm256_mask_alignr_epi32`]
+ * [x] [`_mm256_maskz_alignr_epi32`]
 * [x] [`_mm512_alignr_epi64`]
 * [x] [`_mm512_mask_alignr_epi64`]
 * [x] [`_mm512_maskz_alignr_epi64`]
+ * [x] [`_mm_alignr_epi64`]
+ * [x] [`_mm_mask_alignr_epi64`]
+ * [x] [`_mm_maskz_alignr_epi64`]
+ * [x] [`_mm256_alignr_epi64`]
+ * [x] [`_mm256_mask_alignr_epi64`]
+ * [x] [`_mm256_maskz_alignr_epi64`]
- * [x] [`_mm512_broadcast_f32x4`]
- * [x] [`_mm512_broadcast_f64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcast_f64x4&expand=5236)
- * [x] [`_mm512_broadcast_i32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcast_i32x4&expand=5236)
- * [x] [`_mm512_broadcast_i64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcast_i64x4&expand=5236)
- * [x] [`_mm512_broadcastd_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcastd_epi32&expand=5236)
- * [x] [`_mm512_broadcastq_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcastq_epi64&expand=5236)
- * [x] [`_mm512_broadcastsd_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcastsd_pd&expand=5236)
- * [x] [`_mm512_broadcastss_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcastss_ps&expand=5236)
 * [x] [`_mm512_castpd128_pd512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castpd128_pd512&expand=5236)
 * [x] [`_mm512_castpd256_pd512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castpd256_pd512&expand=5236)
 * [x] [`_mm512_castpd512_pd128`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castpd512_pd128&expand=5236)
@@ -1330,20 +1475,8 @@
 * [x] [`_mm512_mask2_permutex2var_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask2_permutex2var_pd&expand=5236)
 * [x] [`_mm512_mask2_permutex2var_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask2_permutex2var_ps&expand=5236)
 * [x] [`_mm512_mask2int`]
- * [x] [`_mm512_mask_blend_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_blend_epi32&expand=5236)
- * [x] [`_mm512_mask_blend_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_blend_epi64&expand=5236)
- * [x] [`_mm512_mask_blend_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_blend_pd&expand=5236)
- * [x] [`_mm512_mask_blend_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_blend_ps&expand=5236)
- * [x] [`_mm512_mask_broadcast_f32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcast_f32x4&expand=5236)
- * [x] [`_mm512_mask_broadcast_f64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcast_f64x4&expand=5236)
- * [x] [`_mm512_mask_broadcast_i32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcast_i32x4&expand=5236)
- * [x] [`_mm512_mask_broadcast_i64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcast_i64x4&expand=5236)
- * [x] [`_mm512_mask_broadcastd_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcastd_epi32&expand=5236)
- * [x] [`_mm512_mask_broadcastq_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcastq_epi64&expand=5236)
- * [x] [`_mm512_mask_broadcastsd_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcastsd_pd&expand=5236)
- * [x] [`_mm512_mask_broadcastss_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcastss_ps&expand=5236)
- * [x] [`_mm512_mask_compress_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_compress_epi32&expand=5236)
- * [x] [`_mm512_mask_compress_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_compress_epi64&expand=5236)
+ * [x] [`_mm512_mask_compress_epi32`]
+ * [x] [`_mm512_mask_compress_epi64`]
 * [x] [`_mm512_mask_compress_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_compress_pd&expand=5236)
 * [x] [`_mm512_mask_compress_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_compress_ps&expand=5236)
 * [ ] [`_mm512_mask_compressstoreu_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_compressstoreu_epi32&expand=5236)
@@ -1492,13 +1625,6 @@
 * [x] [`_mm512_mask_permutexvar_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permutexvar_ps&expand=5236)
 * [x] [`_mm512_mask_set1_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_set1_epi32&expand=5236)
 * [x] [`_mm512_mask_set1_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_set1_epi64&expand=5236)
- * [x] [`_mm512_mask_shuffle_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_epi32&expand=5236)
- * [x] [`_mm512_mask_shuffle_f32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_f32x4&expand=5236)
- * [x] [`_mm512_mask_shuffle_f64x2`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_f64x2&expand=5236)
- * [x] [`_mm512_mask_shuffle_i32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_i32x4&expand=5236)
- * [x] [`_mm512_mask_shuffle_i64x2`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_i64x2&expand=5236)
- * [x] [`_mm512_mask_shuffle_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_pd&expand=5236)
- * [x] [`_mm512_mask_shuffle_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_ps&expand=5236)
 * [ ] [`_mm512_mask_store_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_store_epi32&expand=5236)
 * [ ] [`_mm512_mask_store_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_store_epi64&expand=5236)
 * [ ] [`_mm512_mask_store_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_store_pd&expand=5236)
@@ -1513,22 +1639,6 @@
 * [x] [`_mm512_mask_test_epi64_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_test_epi64_mask&expand=5236)
 * [x] [`_mm512_mask_testn_epi32_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_testn_epi32_mask&expand=5236)
 * [x] [`_mm512_mask_testn_epi64_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_testn_epi64_mask&expand=5236)
- * [x] [`_mm512_mask_unpackhi_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpackhi_epi32&expand=5236)
- * [x] [`_mm512_mask_unpackhi_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpackhi_epi64&expand=5236)
- * [x] [`_mm512_mask_unpackhi_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpackhi_pd&expand=5236)
- * [x] [`_mm512_mask_unpackhi_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpackhi_ps&expand=5236)
- * [x] [`_mm512_mask_unpacklo_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpacklo_epi32&expand=5236)
- * [x] [`_mm512_mask_unpacklo_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpacklo_epi64&expand=5236)
- * [x] [`_mm512_mask_unpacklo_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpacklo_pd&expand=5236)
- * [x] [`_mm512_mask_unpacklo_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpacklo_ps&expand=5236)
- * [x] [`_mm512_maskz_broadcast_f32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcast_f32x4&expand=5236)
- * [x] [`_mm512_maskz_broadcast_f64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcast_f64x4&expand=5236)
- * [x] [`_mm512_maskz_broadcast_i32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcast_i32x4&expand=5236)
- * [x] [`_mm512_maskz_broadcast_i64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcast_i64x4&expand=5236)
- * [x] [`_mm512_maskz_broadcastd_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcastd_epi32&expand=5236)
- * [x] [`_mm512_maskz_broadcastq_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcastq_epi64&expand=5236)
- * [x] [`_mm512_maskz_broadcastsd_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcastsd_pd&expand=5236)
- * [x] [`_mm512_maskz_broadcastss_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcastss_ps&expand=5236)
 * [x] [`_mm512_maskz_compress_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_compress_epi32&expand=5236)
 * [x] [`_mm512_maskz_compress_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_compress_epi64&expand=5236)
 * [x] [`_mm512_maskz_compress_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_compress_pd&expand=5236)
@@ -1680,22 +1790,15 @@
 * [x] [`_mm512_setr4_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_setr4_epi64&expand=5236)
 * [x] [`_mm512_setr4_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_setr4_pd&expand=5236)
 * [x] [`_mm512_setr4_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_setr4_ps&expand=5236)
- * [x] [`_mm512_setr_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_setr_epi32&expand=5236)
- * [x] [`_mm512_setr_epi64`](https:/software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_setr_epi64&expand=5236)
- * [x] [`_mm512_setr_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_setr_pd&expand=5236)
- * [x] [`_mm512_setr_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_setr_ps&expand=5236)
- * [x] [`_mm512_setzero_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_setzero_epi32&expand=5236)
- * [x] [`_mm512_setzero_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_setzero_pd&expand=5236)
- * [x] [`_mm512_setzero_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_setzero_ps&expand=5236)
- * [x] [`_mm512_setzero_si512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_setzero_si512&expand=5236)
- * [x] [`_mm512_setzero`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_setzero&expand=5236)
- * [x] [`_mm512_shuffle_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shuffle_epi32&expand=5236)
- * [x] [`_mm512_shuffle_f32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shuffle_f32x4&expand=5236)
- * [x] [`_mm512_shuffle_f64x2`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shuffle_f64x2&expand=5236)
- * [x] [`_mm512_shuffle_i32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shuffle_i32x4&expand=5236)
- * [x] [`_mm512_shuffle_i64x2`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shuffle_i64x2&expand=5236)
- * [x] [`_mm512_shuffle_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shuffle_pd&expand=5236)
- * [x] [`_mm512_shuffle_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shuffle_ps&expand=5236)
+ * [x] [`_mm512_setr_epi32`]
+ * [x] [`_mm512_setr_epi64`]
+ * [x] [`_mm512_setr_pd`]
+ * [x] [`_mm512_setr_ps`]
+ * [x] [`_mm512_setzero_epi32`]
+ * [x] [`_mm512_setzero_pd`]
+ * [x] [`_mm512_setzero_ps`]
+ * [x] [`_mm512_setzero_si512`]
+ * [x] [`_mm512_setzero`]
 * [x] [`_mm512_store_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_store_epi32&expand=5236)
 * [x] [`_mm512_store_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_store_epi64&expand=5236)
 * [x] [`_mm512_store_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_store_pd&expand=5236)
@@ -1721,14 +1824,6 @@
 * [x] [`_mm512_undefined_pd`]
 * [x] [`_mm512_undefined_ps`]
 * [x] [`_mm512_undefined`]
- * [x] [`_mm512_unpackhi_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpackhi_epi32&expand=5236)
- * [x] [`_mm512_unpackhi_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpackhi_epi64&expand=5236)
- * [x] [`_mm512_unpackhi_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpackhi_pd&expand=5236)
- * [x] [`_mm512_unpackhi_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpackhi_ps&expand=5236)
- * [x] [`_mm512_unpacklo_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpacklo_epi32&expand=5236)
- * [x] [`_mm512_unpacklo_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpacklo_epi64&expand=5236)
- * [x] [`_mm512_unpacklo_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpacklo_pd&expand=5236)
- * [x] [`_mm512_unpacklo_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpacklo_ps&expand=5236)
 * [x] [`_mm512_zextpd128_pd512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_zextpd128_pd512&expand=5236)
 * [x] [`_mm512_zextpd256_pd512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_zextpd256_pd512&expand=5236)
 * [x] [`_mm512_zextps128_ps512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_zextps128_ps512&expand=5236)
diff --git a/library/stdarch/crates/core_arch/src/x86/avx512f.rs b/library/stdarch/crates/core_arch/src/x86/avx512f.rs
index e915260ac87d..e2462b496fbe 100644
--- a/library/stdarch/crates/core_arch/src/x86/avx512f.rs
+++ b/library/stdarch/crates/core_arch/src/x86/avx512f.rs
@@ -17898,7 +17898,6 @@ pub unsafe fn _mm512_mask2_permutex2var_pd(
 #[rustc_args_required_const(1)]
 pub unsafe fn _mm512_shuffle_epi32(a: __m512i, imm8: _MM_PERM_ENUM) -> __m512i {
     let imm8 = (imm8 & 0xFF) as u8;
-    let a = a.as_i32x16();
     macro_rules! shuffle4 {
         (
@@ -17969,10 +17968,10 @@ pub unsafe fn _mm512_shuffle_epi32(a: __m512i, imm8: _MM_PERM_ENUM) -> __m512i {
 /// Shuffle 32-bit integers in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_shuffle_epi32&expand=5148)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_epi32&expand=5148)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vpshufd, imm8 = 9))] //should be vpshufd, but generate vpermilps
+#[cfg_attr(test, assert_instr(vpshufd, imm8 = 9))]
 #[rustc_args_required_const(3)]
 pub unsafe fn _mm512_mask_shuffle_epi32(
     src: __m512i,
@@ -17981,7 +17980,6 @@ pub unsafe fn _mm512_mask_shuffle_epi32(
     imm8: _MM_PERM_ENUM,
 ) -> __m512i {
     let imm8 = (imm8 & 0xFF) as u8;
-    let a = a.as_i32x16();
     macro_rules! shuffle4 {
         (
@@ -18052,14 +18050,13 @@ pub unsafe fn _mm512_mask_shuffle_epi32(
 /// Shuffle 32-bit integers in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_shuffle_epi32&expand=5149)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shuffle_epi32&expand=5149)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vpshufd, imm8 = 9))] //should be vpshufd, but generate vpermilps
+#[cfg_attr(test, assert_instr(vpshufd, imm8 = 9))]
 #[rustc_args_required_const(2)]
 pub unsafe fn _mm512_maskz_shuffle_epi32(k: __mmask16, a: __m512i, imm8: _MM_PERM_ENUM) -> __m512i {
     let imm8 = (imm8 & 0xFF) as u8;
-    let a = a.as_i32x16();
     macro_rules! shuffle4 {
         (
@@ -18129,9 +18126,89 @@ pub unsafe fn _mm512_maskz_shuffle_epi32(k: __mmask16, a: __m512i, imm8: _MM_PER
     transmute(simd_select_bitmask(k, shuffle, zero))
 }

+/// Shuffle 32-bit integers in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shuffle_epi32&expand=5145)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshufd, imm8 = 9))]
+#[rustc_args_required_const(3)]
+pub unsafe fn _mm256_mask_shuffle_epi32(
+    src: __m256i,
+    k: __mmask8,
+    a: __m256i,
+    imm8: _MM_PERM_ENUM,
+) -> __m256i {
+    macro_rules! call {
+        ($imm8:expr) => {
+            _mm256_shuffle_epi32(a, $imm8)
+        };
+    }
+    let r = constify_imm8_sae!(imm8, call);
+    transmute(simd_select_bitmask(k, r.as_i32x8(), src.as_i32x8()))
+}
+
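Reviewer note, not part of the patch: a minimal usage sketch of the writemask variant above. The demo function name is hypothetical, and it assumes `_MM_PERM_BADC` (0xB1) swaps adjacent 32-bit pairs within each 128-bit lane, as in the existing `_mm512_shuffle_epi32` tests.

```rust
#[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))]
unsafe fn mask_shuffle_epi32_demo() {
    use core::arch::x86_64::*;
    let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    let src = _mm512_set1_epi32(-1);
    // Bit i of the mask selects the shuffled element for lane i; clear bits keep `src`.
    let r = _mm512_mask_shuffle_epi32(src, 0b00000000_11111111, a, _MM_PERM_BADC);
    // Low eight elements should hold the pair-swapped values of `a`; the high eight stay -1.
}
```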
+/// Shuffle 32-bit integers in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shuffle_epi32&expand=5146)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshufd, imm8 = 9))]
+#[rustc_args_required_const(2)]
+pub unsafe fn _mm256_maskz_shuffle_epi32(k: __mmask8, a: __m256i, imm8: _MM_PERM_ENUM) -> __m256i {
+    macro_rules! call {
+        ($imm8:expr) => {
+            _mm256_shuffle_epi32(a, $imm8)
+        };
+    }
+    let r = constify_imm8_sae!(imm8, call);
+    let zero = _mm256_setzero_si256().as_i32x8();
+    transmute(simd_select_bitmask(k, r.as_i32x8(), zero))
+}
+
+/// Shuffle 32-bit integers in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shuffle_epi32&expand=5142)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshufd, imm8 = 9))]
+#[rustc_args_required_const(3)]
+pub unsafe fn _mm_mask_shuffle_epi32(
+    src: __m128i,
+    k: __mmask8,
+    a: __m128i,
+    imm8: _MM_PERM_ENUM,
+) -> __m128i {
+    macro_rules! call {
+        ($imm8:expr) => {
+            _mm_shuffle_epi32(a, $imm8)
+        };
+    }
+    let r = constify_imm8_sae!(imm8, call);
+    transmute(simd_select_bitmask(k, r.as_i32x4(), src.as_i32x4()))
+}
+
+/// Shuffle 32-bit integers in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shuffle_epi32&expand=5143)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpshufd, imm8 = 9))]
+#[rustc_args_required_const(2)]
+pub unsafe fn _mm_maskz_shuffle_epi32(k: __mmask8, a: __m128i, imm8: _MM_PERM_ENUM) -> __m128i {
+    macro_rules! call {
+        ($imm8:expr) => {
+            _mm_shuffle_epi32(a, $imm8)
+        };
+    }
+    let r = constify_imm8_sae!(imm8, call);
+    let zero = _mm_setzero_si128().as_i32x4();
+    transmute(simd_select_bitmask(k, r.as_i32x4(), zero))
+}
+
 /// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst.
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_shuffle_ps&expand=5203)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shuffle_ps&expand=5203)
 #[inline]
 #[target_feature(enable = "avx512f")]
 #[cfg_attr(test, assert_instr(vshufps, imm8 = 0))]
@@ -18207,7 +18284,7 @@ pub unsafe fn _mm512_shuffle_ps(a: __m512, b: __m512, imm8: i32) -> __m512 {
 /// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_shuffle_ps&expand=5201)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_ps&expand=5201)
 #[inline]
 #[target_feature(enable = "avx512f")]
 #[cfg_attr(test, assert_instr(vshufps, imm8 = 0))]
@@ -18291,7 +18368,7 @@ pub unsafe fn _mm512_mask_shuffle_ps(
 /// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_shuffle_ps&expand=5202)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shuffle_ps&expand=5202)
 #[inline]
 #[target_feature(enable = "avx512f")]
 #[cfg_attr(test, assert_instr(vshufps, imm8 = 0))]
@@ -18368,9 +18445,91 @@ pub unsafe fn _mm512_maskz_shuffle_ps(k: __mmask16, a: __m512, b: __m512, imm8:
     transmute(simd_select_bitmask(k, shuffle, zero))
 }

+/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shuffle_ps&expand=5198)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vshufps, imm8 = 9))]
+#[rustc_args_required_const(4)]
+pub unsafe fn _mm256_mask_shuffle_ps(
+    src: __m256,
+    k: __mmask8,
+    a: __m256,
+    b: __m256,
+    imm8: i32,
+) -> __m256 {
+    macro_rules! call {
+        ($imm8:expr) => {
+            _mm256_shuffle_ps(a, b, $imm8)
+        };
+    }
+    let r = constify_imm8_sae!(imm8, call);
+    transmute(simd_select_bitmask(k, r.as_f32x8(), src.as_f32x8()))
+}
+
+/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shuffle_ps&expand=5199)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vshufps, imm8 = 9))]
+#[rustc_args_required_const(3)]
+pub unsafe fn _mm256_maskz_shuffle_ps(k: __mmask8, a: __m256, b: __m256, imm8: i32) -> __m256 {
+    macro_rules! call {
+        ($imm8:expr) => {
+            _mm256_shuffle_ps(a, b, $imm8)
+        };
+    }
+    let r = constify_imm8_sae!(imm8, call);
+    let zero = _mm256_setzero_ps().as_f32x8();
+    transmute(simd_select_bitmask(k, r.as_f32x8(), zero))
+}
+
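Reviewer note, not part of the patch: a hedged sketch of the masked 256-bit `shuffle_ps`. Per 128-bit lane, the two low elements come from `a` and the two high elements from `b`, each selected by a 2-bit field of imm8; the demo name is hypothetical.

```rust
#[cfg(all(target_arch = "x86_64", target_feature = "avx512f", target_feature = "avx512vl"))]
unsafe fn mask_shuffle_ps_demo() {
    use core::arch::x86_64::*;
    let a = _mm256_setr_ps(0., 1., 2., 3., 4., 5., 6., 7.);
    let b = _mm256_set1_ps(100.);
    let src = _mm256_set1_ps(-1.);
    // imm8 = 0b01_00_11_10 makes each lane [a[2], a[3], b[0], b[1]].
    // Mask bits 0..4 are clear, so elements 0..4 keep `src` (-1.0);
    // bits 4..8 are set, so elements 4..8 take the shuffled values.
    let r = _mm256_mask_shuffle_ps(src, 0b1111_0000, a, b, 0b01_00_11_10);
}
```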
+/// Shuffle single-precision (32-bit) floating-point elements in a using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shuffle_ps&expand=5195)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vshufps, imm8 = 9))]
+#[rustc_args_required_const(4)]
+pub unsafe fn _mm_mask_shuffle_ps(
+    src: __m128,
+    k: __mmask8,
+    a: __m128,
+    b: __m128,
+    imm8: i32,
+) -> __m128 {
+    macro_rules! call {
+        ($imm8:expr) => {
+            _mm_shuffle_ps(a, b, $imm8)
+        };
+    }
+    let r = constify_imm8_sae!(imm8, call);
+    transmute(simd_select_bitmask(k, r.as_f32x4(), src.as_f32x4()))
+}
+
+/// Shuffle single-precision (32-bit) floating-point elements in a using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shuffle_ps&expand=5196)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vshufps, imm8 = 9))]
+#[rustc_args_required_const(3)]
+pub unsafe fn _mm_maskz_shuffle_ps(k: __mmask8, a: __m128, b: __m128, imm8: i32) -> __m128 {
+    macro_rules! call {
+        ($imm8:expr) => {
+            _mm_shuffle_ps(a, b, $imm8)
+        };
+    }
+    let r = constify_imm8_sae!(imm8, call);
+    let zero = _mm_setzero_ps().as_f32x4();
+    transmute(simd_select_bitmask(k, r.as_f32x4(), zero))
+}
+
 /// Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in imm8, and store the results in dst.
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_shuffle_pd&expand=5192)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shuffle_pd&expand=5192)
 #[inline]
 #[target_feature(enable = "avx512f")]
 #[cfg_attr(test, assert_instr(vshufpd, imm8 = 3))]
@@ -18447,7 +18606,7 @@ pub unsafe fn _mm512_shuffle_pd(a: __m512d, b: __m512d, imm8: i32) -> __m512d {
 /// Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_shuffle_pd&expand=5190)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_pd&expand=5190)
 #[inline]
 #[target_feature(enable = "avx512f")]
 #[cfg_attr(test, assert_instr(vshufpd, imm8 = 3))]
@@ -18532,7 +18691,7 @@ pub unsafe fn _mm512_mask_shuffle_pd(
 /// Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_shuffle_pd&expand=5191)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shuffle_pd&expand=5191)
 #[inline]
 #[target_feature(enable = "avx512f")]
 #[cfg_attr(test, assert_instr(vshufpd, imm8 = 3))]
@@ -18610,12 +18769,94 @@ pub unsafe fn _mm512_maskz_shuffle_pd(k: __mmask8, a: __m512d, b: __m512d, imm8:
     transmute(simd_select_bitmask(k, shuffle, zero))
 }

+/// Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shuffle_pd&expand=5187)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vshufpd, imm8 = 9))]
+#[rustc_args_required_const(4)]
+pub unsafe fn _mm256_mask_shuffle_pd(
+    src: __m256d,
+    k: __mmask8,
+    a: __m256d,
+    b: __m256d,
+    imm8: i32,
+) -> __m256d {
+    macro_rules! call {
+        ($imm8:expr) => {
+            _mm256_shuffle_pd(a, b, $imm8)
+        };
+    }
+    let r = constify_imm8_sae!(imm8, call);
+    transmute(simd_select_bitmask(k, r.as_f64x4(), src.as_f64x4()))
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shuffle_pd&expand=5188)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vshufpd, imm8 = 9))]
+#[rustc_args_required_const(3)]
+pub unsafe fn _mm256_maskz_shuffle_pd(k: __mmask8, a: __m256d, b: __m256d, imm8: i32) -> __m256d {
+    macro_rules! call {
+        ($imm8:expr) => {
+            _mm256_shuffle_pd(a, b, $imm8)
+        };
+    }
+    let r = constify_imm8_sae!(imm8, call);
+    let zero = _mm256_setzero_pd().as_f64x4();
+    transmute(simd_select_bitmask(k, r.as_f64x4(), zero))
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shuffle_pd&expand=5184)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vshufpd, imm8 = 9))]
+#[rustc_args_required_const(4)]
+pub unsafe fn _mm_mask_shuffle_pd(
+    src: __m128d,
+    k: __mmask8,
+    a: __m128d,
+    b: __m128d,
+    imm8: i32,
+) -> __m128d {
+    macro_rules! call {
+        ($imm8:expr) => {
+            _mm_shuffle_pd(a, b, $imm8)
+        };
+    }
+    let r = constify_imm8_sae!(imm8, call);
+    transmute(simd_select_bitmask(k, r.as_f64x2(), src.as_f64x2()))
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shuffle_pd&expand=5185)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vshufpd, imm8 = 9))]
+#[rustc_args_required_const(3)]
+pub unsafe fn _mm_maskz_shuffle_pd(k: __mmask8, a: __m128d, b: __m128d, imm8: i32) -> __m128d {
+    macro_rules! call {
+        ($imm8:expr) => {
+            _mm_shuffle_pd(a, b, $imm8)
+        };
+    }
+    let r = constify_imm8_sae!(imm8, call);
+    let zero = _mm_setzero_pd().as_f64x2();
+    transmute(simd_select_bitmask(k, r.as_f64x2(), zero))
+}
+
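Reviewer note, not part of the patch: a hedged sketch of `_mm256_mask_shuffle_pd`. Bit i of imm8 picks the upper (1) or lower (0) double of the pair for position i, alternating between `a` and `b`: dst = [a[imm8&1], b[(imm8>>1)&1], a[2+((imm8>>2)&1)], b[2+((imm8>>3)&1)]]. The demo name is hypothetical.

```rust
#[cfg(all(target_arch = "x86_64", target_feature = "avx512f", target_feature = "avx512vl"))]
unsafe fn mask_shuffle_pd_demo() {
    use core::arch::x86_64::*;
    let a = _mm256_setr_pd(0., 1., 2., 3.);
    let b = _mm256_setr_pd(10., 11., 12., 13.);
    let src = _mm256_set1_pd(-1.);
    // imm8 = 0b0101 selects [a[1], b[0], a[3], b[2]]; k = 0b1111 keeps every shuffled element.
    let r = _mm256_mask_shuffle_pd(src, 0b1111, a, b, 0b0101);
}
```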
 /// Shuffle 128-bits (composed of 4 32-bit integers) selected by imm8 from a and b, and store the results in dst.
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_shuffle_i32&expand=5177)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shuffle_i32&expand=5177)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vshufi64x2, imm8 = 0b10111111))] //should be vshufi32x4, but generate vshufi64x2
+#[cfg_attr(test, assert_instr(vshufi64x2, imm8 = 0b10010101))] //should be vshufi32x4
 #[rustc_args_required_const(2)]
 pub unsafe fn _mm512_shuffle_i32x4(a: __m512i, b: __m512i, imm8: i32) -> __m512i {
     assert!(imm8 >= 0 && imm8 <= 255);
@@ -18692,10 +18933,10 @@ pub unsafe fn _mm512_shuffle_i32x4(a: __m512i, b: __m512i, imm8: i32) -> __m512i
 /// Shuffle 128-bits (composed of 4 32-bit integers) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_shuffle_i32x&expand=5175)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_i32x&expand=5175)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vshufi32x4, imm8 = 0b10111111))]
+#[cfg_attr(test, assert_instr(vshufi32x4, imm8 = 0b10110101))]
 #[rustc_args_required_const(4)]
 pub unsafe fn _mm512_mask_shuffle_i32x4(
     src: __m512i,
@@ -18772,16 +19013,15 @@ pub unsafe fn _mm512_mask_shuffle_i32x4(
         2 => shuffle1!(8, 9, 10, 11),
         _ => shuffle1!(12, 13, 14, 15),
     };
-
     transmute(simd_select_bitmask(k, shuffle, src.as_i32x16()))
 }

 /// Shuffle 128-bits (composed of 4 32-bit integers) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_shuffle_i32&expand=5176)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shuffle_i32&expand=5176)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vshufi32x4, imm8 = 0b10111111))]
+#[cfg_attr(test, assert_instr(vshufi32x4, imm8 = 0b10110101))]
 #[rustc_args_required_const(3)]
 pub unsafe fn _mm512_maskz_shuffle_i32x4(
     k: __mmask16,
@@ -18857,14 +19097,149 @@ pub unsafe fn _mm512_maskz_shuffle_i32x4(
         2 => shuffle1!(8, 9, 10, 11),
         _ => shuffle1!(12, 13, 14, 15),
     };
-
     let zero = _mm512_setzero_si512().as_i32x16();
     transmute(simd_select_bitmask(k, shuffle, zero))
 }

+/// Shuffle 128-bits (composed of 4 32-bit integers) selected by imm8 from a and b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shuffle_i32x4&expand=5174)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vperm, imm8 = 0b01))] //should be vshufi32x4
+#[rustc_args_required_const(2)]
+pub unsafe fn _mm256_shuffle_i32x4(a: __m256i, b: __m256i, imm8: i32) -> __m256i {
+    assert!(imm8 >= 0 && imm8 <= 255);
+    let imm8 = (imm8 & 0xFF) as u8;
+    let a = a.as_i32x8();
+    let b = b.as_i32x8();
+    macro_rules! shuffle2 {
+        (
+            $a:expr,
+            $b:expr,
+            $c:expr,
+            $d:expr,
+            $e:expr,
+            $f:expr,
+            $g:expr,
+            $h:expr
+        ) => {
+            simd_shuffle8(a, b, [$a, $b, $c, $d, $e, $f, $g, $h])
+        };
+    }
+    macro_rules! shuffle1 {
+        ($a:expr, $b:expr, $c: expr, $d: expr) => {
+            match (imm8 >> 1) & 0x1 {
+                0 => shuffle2!($a, $b, $c, $d, 8, 9, 10, 11),
+                _ => shuffle2!($a, $b, $c, $d, 12, 13, 14, 15),
+            }
+        };
+    }
+    let r: i32x8 = match imm8 & 0x1 {
+        0 => shuffle1!(0, 1, 2, 3),
+        _ => shuffle1!(4, 5, 6, 7),
+    };
+    transmute(r)
+}
+
+/// Shuffle 128-bits (composed of 4 32-bit integers) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shuffle_i32x4&expand=5172)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vshufi32x4, imm8 = 0b11))]
+#[rustc_args_required_const(4)]
+pub unsafe fn _mm256_mask_shuffle_i32x4(
+    src: __m256i,
+    k: __mmask8,
+    a: __m256i,
+    b: __m256i,
+    imm8: i32,
+) -> __m256i {
+    assert!(imm8 >= 0 && imm8 <= 255);
+    let imm8 = (imm8 & 0xFF) as u8;
+    let a = a.as_i32x8();
+    let b = b.as_i32x8();
+    macro_rules! shuffle2 {
+        (
+            $a:expr,
+            $b:expr,
+            $c:expr,
+            $d:expr,
+            $e:expr,
+            $f:expr,
+            $g:expr,
+            $h:expr
+        ) => {
+            simd_shuffle8(a, b, [$a, $b, $c, $d, $e, $f, $g, $h])
+        };
+    }
+    macro_rules! shuffle1 {
+        ($a:expr, $b:expr, $c: expr, $d: expr) => {
+            match (imm8 >> 1) & 0x1 {
+                0 => shuffle2!($a, $b, $c, $d, 8, 9, 10, 11),
+                _ => shuffle2!($a, $b, $c, $d, 12, 13, 14, 15),
+            }
+        };
+    }
+    let r: i32x8 = match imm8 & 0x1 {
+        0 => shuffle1!(0, 1, 2, 3),
+        _ => shuffle1!(4, 5, 6, 7),
+    };
+
+    transmute(simd_select_bitmask(k, r, src.as_i32x8()))
+}
+
+/// Shuffle 128-bits (composed of 4 32-bit integers) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shuffle_i32x4&expand=5173)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vshufi32x4, imm8 = 0b11))]
+#[rustc_args_required_const(3)]
+pub unsafe fn _mm256_maskz_shuffle_i32x4(
+    k: __mmask8,
+    a: __m256i,
+    b: __m256i,
+    imm8: i32,
+) -> __m256i {
+    assert!(imm8 >= 0 && imm8 <= 255);
+    let imm8 = (imm8 & 0xFF) as u8;
+    let a = a.as_i32x8();
+    let b = b.as_i32x8();
+    macro_rules! shuffle2 {
+        (
+            $a:expr,
+            $b:expr,
+            $c:expr,
+            $d:expr,
+            $e:expr,
+            $f:expr,
+            $g:expr,
+            $h:expr
+        ) => {
+            simd_shuffle8(a, b, [$a, $b, $c, $d, $e, $f, $g, $h])
+        };
+    }
+    macro_rules! shuffle1 {
+        ($a:expr, $b:expr, $c: expr, $d: expr) => {
+            match (imm8 >> 1) & 0x1 {
+                0 => shuffle2!($a, $b, $c, $d, 8, 9, 10, 11),
+                _ => shuffle2!($a, $b, $c, $d, 12, 13, 14, 15),
+            }
+        };
+    }
+    let r: i32x8 = match imm8 & 0x1 {
+        0 => shuffle1!(0, 1, 2, 3),
+        _ => shuffle1!(4, 5, 6, 7),
+    };
+    let zero = _mm256_setzero_si256().as_i32x8();
+    transmute(simd_select_bitmask(k, r, zero))
+}
+
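Reviewer note, not part of the patch: with 256-bit vectors there are only two 128-bit lanes, so imm8 bit 0 picks `a`'s lane for the low half and bit 1 picks `b`'s lane for the high half, mirroring the shuffle1!/shuffle2! match above. A sketch with a hypothetical demo name:

```rust
#[cfg(all(target_arch = "x86_64", target_feature = "avx512f", target_feature = "avx512vl"))]
unsafe fn shuffle_i32x4_demo() {
    use core::arch::x86_64::*;
    let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
    let b = _mm256_setr_epi32(8, 9, 10, 11, 12, 13, 14, 15);
    // imm8 = 0b01: low half = a's high lane [4, 5, 6, 7],
    //              high half = b's low lane [8, 9, 10, 11].
    let r = _mm256_shuffle_i32x4(a, b, 0b01);
}
```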
 /// Shuffle 128-bits (composed of 2 64-bit integers) selected by imm8 from a and b, and store the results in dst.
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_shuffle_i64x2&expand=5183)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shuffle_i64x2&expand=5183)
 #[inline]
 #[target_feature(enable = "avx512f")]
 #[cfg_attr(test, assert_instr(vshufi64x2, imm8 = 0b10111111))]
@@ -18926,7 +19301,7 @@ pub unsafe fn _mm512_shuffle_i64x2(a: __m512i, b: __m512i, imm8: i32) -> __m512i
 /// Shuffle 128-bits (composed of 2 64-bit integers) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_shuffle_i64x&expand=5181)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_i64x&expand=5181)
 #[inline]
 #[target_feature(enable = "avx512f")]
 #[cfg_attr(test, assert_instr(vshufi64x2, imm8 = 0b10111111))]
@@ -18990,13 +19365,12 @@ pub unsafe fn _mm512_mask_shuffle_i64x2(
         2 => shuffle1!(4, 5),
         _ => shuffle1!(6, 7),
     };
-
     transmute(simd_select_bitmask(k, shuffle, src.as_i64x8()))
 }

 /// Shuffle 128-bits (composed of 2 64-bit integers) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_shuffle_i64&expand=5182)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shuffle_i64&expand=5182)
 #[inline]
 #[target_feature(enable = "avx512f")]
 #[cfg_attr(test, assert_instr(vshufi64x2, imm8 = 0b10111111))]
@@ -19064,9 +19438,132 @@ pub unsafe fn _mm512_maskz_shuffle_i64x2(
     transmute(simd_select_bitmask(k, shuffle, zero))
 }

+/// Shuffle 128-bits (composed of 2 64-bit integers) selected by imm8 from a and b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shuffle_i64x2&expand=5180)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vperm, imm8 = 0b01))] //should be vshufi64x2
+#[rustc_args_required_const(2)]
+pub unsafe fn _mm256_shuffle_i64x2(a: __m256i, b: __m256i, imm8: i32) -> __m256i {
+    assert!(imm8 >= 0 && imm8 <= 255);
+    let imm8 = (imm8 & 0xFF) as u8;
+    let a = a.as_i64x4();
+    let b = b.as_i64x4();
+    macro_rules! shuffle2 {
+        (
+            $a:expr,
+            $b:expr,
+            $c:expr,
+            $d:expr
+        ) => {
+            simd_shuffle4(a, b, [$a, $b, $c, $d])
+        };
+    }
+    macro_rules! shuffle1 {
+        ($a:expr, $b:expr) => {
+            match (imm8 >> 1) & 0x1 {
+                0 => shuffle2!($a, $b, 4, 5),
+                _ => shuffle2!($a, $b, 6, 7),
+            }
+        };
+    }
+    let r: i64x4 = match imm8 & 0x1 {
+        0 => shuffle1!(0, 1),
+        _ => shuffle1!(2, 3),
+    };
+    transmute(r)
+}
+
+/// Shuffle 128-bits (composed of 2 64-bit integers) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shuffle_i64x2&expand=5178)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vshufi64x2, imm8 = 0b11))]
+#[rustc_args_required_const(4)]
+pub unsafe fn _mm256_mask_shuffle_i64x2(
+    src: __m256i,
+    k: __mmask8,
+    a: __m256i,
+    b: __m256i,
+    imm8: i32,
+) -> __m256i {
+    assert!(imm8 >= 0 && imm8 <= 255);
+    let imm8 = (imm8 & 0xFF) as u8;
+    let a = a.as_i64x4();
+    let b = b.as_i64x4();
+    macro_rules! shuffle2 {
+        (
+            $a:expr,
+            $b:expr,
+            $c:expr,
+            $d:expr
+        ) => {
+            simd_shuffle4(a, b, [$a, $b, $c, $d])
+        };
+    }
+    macro_rules! shuffle1 {
+        ($a:expr, $b:expr) => {
+            match (imm8 >> 1) & 0x1 {
+                0 => shuffle2!($a, $b, 4, 5),
+                _ => shuffle2!($a, $b, 6, 7),
+            }
+        };
+    }
+    let r: i64x4 = match imm8 & 0x1 {
+        0 => shuffle1!(0, 1),
+        _ => shuffle1!(2, 3),
+    };
+    transmute(simd_select_bitmask(k, r, src.as_i64x4()))
+}
+
+/// Shuffle 128-bits (composed of 2 64-bit integers) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shuffle_i64x2&expand=5179)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vshufi64x2, imm8 = 0b11))]
+#[rustc_args_required_const(3)]
+pub unsafe fn _mm256_maskz_shuffle_i64x2(
+    k: __mmask8,
+    a: __m256i,
+    b: __m256i,
+    imm8: i32,
+) -> __m256i {
+    assert!(imm8 >= 0 && imm8 <= 255);
+    let imm8 = (imm8 & 0xFF) as u8;
+    let a = a.as_i64x4();
+    let b = b.as_i64x4();
+    macro_rules! shuffle2 {
+        (
+            $a:expr,
+            $b:expr,
+            $c:expr,
+            $d:expr
+        ) => {
+            simd_shuffle4(a, b, [$a, $b, $c, $d])
+        };
+    }
+    macro_rules! shuffle1 {
+        ($a:expr, $b:expr) => {
+            match (imm8 >> 1) & 0x1 {
+                0 => shuffle2!($a, $b, 4, 5),
+                _ => shuffle2!($a, $b, 6, 7),
+            }
+        };
+    }
+    let r: i64x4 = match imm8 & 0x1 {
+        0 => shuffle1!(0, 1),
+        _ => shuffle1!(2, 3),
+    };
+    let zero = _mm256_setzero_si256().as_i64x4();
+    transmute(simd_select_bitmask(k, r, zero))
+}
+
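Reviewer note, not part of the patch: the same lane-selection scheme as `shuffle_i32x4`, but each 128-bit lane holds two 64-bit integers; the zeromask variant zeroes elements whose mask bit is clear. Demo name is hypothetical.

```rust
#[cfg(all(target_arch = "x86_64", target_feature = "avx512f", target_feature = "avx512vl"))]
unsafe fn maskz_shuffle_i64x2_demo() {
    use core::arch::x86_64::*;
    let a = _mm256_setr_epi64x(0, 1, 2, 3);
    let b = _mm256_setr_epi64x(10, 11, 12, 13);
    // imm8 = 0b11 selects the high lane of both inputs: [2, 3, 12, 13].
    // k = 0b0011 then zeroes the upper two elements: [2, 3, 0, 0].
    let r = _mm256_maskz_shuffle_i64x2(0b0011, a, b, 0b11);
}
```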
 /// Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst.
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_shuffle_f32x4&expand=5165)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shuffle_f32x4&expand=5165)
 #[inline]
 #[target_feature(enable = "avx512f")]
 #[cfg_attr(test, assert_instr(vshuff64x2, imm8 = 0b10111111))] //should be vshuff32x4, but generate vshuff64x2
@@ -19142,7 +19639,7 @@ pub unsafe fn _mm512_shuffle_f32x4(a: __m512, b: __m512, imm8: i32) -> __m512 {
 /// Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_shuffle_f32&expand=5163)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_f32&expand=5163)
 #[inline]
 #[target_feature(enable = "avx512f")]
 #[cfg_attr(test, assert_instr(vshuff32x4, imm8 = 0b10111111))]
@@ -19220,13 +19717,12 @@ pub unsafe fn _mm512_mask_shuffle_f32x4(
         2 => shuffle1!(8, 9, 10, 11),
         _ => shuffle1!(12, 13, 14, 15),
     };
-
     transmute(simd_select_bitmask(k, shuffle, src.as_f32x16()))
 }

 /// Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_shuffle_f32&expand=5164)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shuffle_f32&expand=5164)
 #[inline]
 #[target_feature(enable = "avx512f")]
 #[cfg_attr(test, assert_instr(vshuff32x4, imm8 = 0b10111111))]
@@ -19303,9 +19799,140 @@ pub unsafe fn _mm512_maskz_shuffle_f32x4(k: __mmask16, a: __m512, b: __m512, imm
     transmute(simd_select_bitmask(k, shuffle, zero))
 }

+/// Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shuffle_f32x4&expand=5162)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vperm, imm8 = 0b01))] //should be vshuff32x4
+#[rustc_args_required_const(2)]
+pub unsafe fn _mm256_shuffle_f32x4(a: __m256, b: __m256, imm8: i32) -> __m256 {
+    assert!(imm8 >= 0 && imm8 <= 255);
+    let imm8 = (imm8 & 0xFF) as u8;
+    let a = a.as_f32x8();
+    let b = b.as_f32x8();
+    macro_rules! shuffle2 {
+        (
+            $a:expr,
+            $b:expr,
+            $c:expr,
+            $d:expr,
+            $e:expr,
+            $f:expr,
+            $g:expr,
+            $h:expr
+        ) => {
+            simd_shuffle8(a, b, [$a, $b, $c, $d, $e, $f, $g, $h])
+        };
+    }
+    macro_rules! shuffle1 {
+        ($a:expr, $b:expr, $c: expr, $d: expr) => {
+            match (imm8 >> 1) & 0x1 {
+                0 => shuffle2!($a, $b, $c, $d, 8, 9, 10, 11),
+                _ => shuffle2!($a, $b, $c, $d, 12, 13, 14, 15),
+            }
+        };
+    }
+    let r: f32x8 = match imm8 & 0x1 {
+        0 => shuffle1!(0, 1, 2, 3),
+        _ => shuffle1!(4, 5, 6, 7),
+    };
+
+    transmute(r)
+}
+
+/// Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shuffle_f32x4&expand=5160)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vshuff32x4, imm8 = 0b11))]
+#[rustc_args_required_const(4)]
+pub unsafe fn _mm256_mask_shuffle_f32x4(
+    src: __m256,
+    k: __mmask8,
+    a: __m256,
+    b: __m256,
+    imm8: i32,
+) -> __m256 {
+    assert!(imm8 >= 0 && imm8 <= 255);
+    let imm8 = (imm8 & 0xFF) as u8;
+    let a = a.as_f32x8();
+    let b = b.as_f32x8();
+    macro_rules! shuffle2 {
+        (
+            $a:expr,
+            $b:expr,
+            $c:expr,
+            $d:expr,
+            $e:expr,
+            $f:expr,
+            $g:expr,
+            $h:expr
+        ) => {
+            simd_shuffle8(a, b, [$a, $b, $c, $d, $e, $f, $g, $h])
+        };
+    }
+    macro_rules! shuffle1 {
+        ($a:expr, $b:expr, $c: expr, $d: expr) => {
+            match (imm8 >> 1) & 0x1 {
+                0 => shuffle2!($a, $b, $c, $d, 8, 9, 10, 11),
+                _ => shuffle2!($a, $b, $c, $d, 12, 13, 14, 15),
+            }
+        };
+    }
+    let r: f32x8 = match imm8 & 0x1 {
+        0 => shuffle1!(0, 1, 2, 3),
+        _ => shuffle1!(4, 5, 6, 7),
+    };
+    transmute(simd_select_bitmask(k, r, src.as_f32x8()))
+}
+
+/// Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shuffle_f32x4&expand=5161)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vshuff32x4, imm8 = 0b11))]
+#[rustc_args_required_const(3)]
+pub unsafe fn _mm256_maskz_shuffle_f32x4(k: __mmask8, a: __m256, b: __m256, imm8: i32) -> __m256 {
+    assert!(imm8 >= 0 && imm8 <= 255);
+    let imm8 = (imm8 & 0xFF) as u8;
+    let a = a.as_f32x8();
+    let b = b.as_f32x8();
+    macro_rules! shuffle2 {
+        (
+            $a:expr,
+            $b:expr,
+            $c:expr,
+            $d:expr,
+            $e:expr,
+            $f:expr,
+            $g:expr,
+            $h:expr
+        ) => {
+            simd_shuffle8(a, b, [$a, $b, $c, $d, $e, $f, $g, $h])
+        };
+    }
+    macro_rules! shuffle1 {
+        ($a:expr, $b:expr, $c: expr, $d: expr) => {
+            match (imm8 >> 1) & 0x1 {
+                0 => shuffle2!($a, $b, $c, $d, 8, 9, 10, 11),
+                _ => shuffle2!($a, $b, $c, $d, 12, 13, 14, 15),
+            }
+        };
+    }
+    let r: f32x8 = match imm8 & 0x1 {
+        0 => shuffle1!(0, 1, 2, 3),
+        _ => shuffle1!(4, 5, 6, 7),
+    };
+    let zero = _mm256_setzero_ps().as_f32x8();
+    transmute(simd_select_bitmask(k, r, zero))
+}
+
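Reviewer note, not part of the patch: a hedged sketch of the zeromask behavior, which returns 0.0 in every element whose mask bit is clear instead of falling back to a `src` vector. Demo name is hypothetical.

```rust
#[cfg(all(target_arch = "x86_64", target_feature = "avx512f", target_feature = "avx512vl"))]
unsafe fn maskz_shuffle_f32x4_demo() {
    use core::arch::x86_64::*;
    let a = _mm256_setr_ps(0., 1., 2., 3., 4., 5., 6., 7.);
    let b = _mm256_set1_ps(9.);
    // imm8 = 0b00: low half = a's low lane, high half = b's low lane.
    // k = 0b0000_1111 keeps the a-half and zeroes the b-half: [0, 1, 2, 3, 0, 0, 0, 0].
    let r = _mm256_maskz_shuffle_f32x4(0b0000_1111, a, b, 0b00);
}
```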
 /// Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst.
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_shuffle_f64x2&expand=5171)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shuffle_f64x2&expand=5171)
 #[inline]
 #[target_feature(enable = "avx512f")]
 #[cfg_attr(test, assert_instr(vshuff64x2, imm8 = 0b10111111))]
@@ -19367,7 +19994,7 @@ pub unsafe fn _mm512_shuffle_f64x2(a: __m512d, b: __m512d, imm8: i32) -> __m512d
 /// Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_shuffle_f64x2&expand=5169)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_f64x2&expand=5169)
 #[inline]
 #[target_feature(enable = "avx512f")]
 #[cfg_attr(test, assert_instr(vshuff64x2, imm8 = 0b10111111))]
@@ -19431,13 +20058,12 @@ pub unsafe fn _mm512_mask_shuffle_f64x2(
         2 => shuffle1!(4, 5),
         _ => shuffle1!(6, 7),
     };
-
     transmute(simd_select_bitmask(k, shuffle, src.as_f64x8()))
 }

 /// Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_shuffle_f64x2&expand=5170)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shuffle_f64x2&expand=5170)
 #[inline]
 #[target_feature(enable = "avx512f")]
 #[cfg_attr(test, assert_instr(vshuff64x2, imm8 = 0b10111111))]
@@ -19500,11 +20126,133 @@ pub unsafe fn _mm512_maskz_shuffle_f64x2(
         2 => shuffle1!(4, 5),
         _ => shuffle1!(6, 7),
     };
-
     let zero = _mm512_setzero_pd().as_f64x8();
     transmute(simd_select_bitmask(k, shuffle, zero))
 }

+/// Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shuffle_f64x2&expand=5168)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vperm, imm8 = 0b01))] //should be vshuff64x2
+#[rustc_args_required_const(2)]
+pub unsafe fn _mm256_shuffle_f64x2(a: __m256d, b: __m256d, imm8: i32) -> __m256d {
+    assert!(imm8 >= 0 && imm8 <= 255);
+    let imm8 = (imm8 & 0xFF) as u8;
+    let a = a.as_f64x4();
+    let b = b.as_f64x4();
+    macro_rules! shuffle2 {
+        (
+            $a:expr,
+            $b:expr,
+            $c:expr,
+            $d:expr
+        ) => {
+            simd_shuffle4(a, b, [$a, $b, $c, $d])
+        };
+    }
+    macro_rules! shuffle1 {
+        ($a:expr, $b:expr) => {
+            match (imm8 >> 1) & 0x1 {
+                0 => shuffle2!($a, $b, 4, 5),
+                _ => shuffle2!($a, $b, 6, 7),
+            }
+        };
+    }
+    let r: f64x4 = match imm8 & 0x1 {
+        0 => shuffle1!(0, 1),
+        _ => shuffle1!(2, 3),
+    };
+    transmute(r)
+}
+
+/// Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shuffle_f64x2&expand=5166)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vshuff64x2, imm8 = 0b11))]
+#[rustc_args_required_const(4)]
+pub unsafe fn _mm256_mask_shuffle_f64x2(
+    src: __m256d,
+    k: __mmask8,
+    a: __m256d,
+    b: __m256d,
+    imm8: i32,
+) -> __m256d {
+    assert!(imm8 >= 0 && imm8 <= 255);
+    let imm8 = (imm8 & 0xFF) as u8;
+    let a = a.as_f64x4();
+    let b = b.as_f64x4();
+    macro_rules! shuffle2 {
+        (
+            $a:expr,
+            $b:expr,
+            $c:expr,
+            $d:expr
+        ) => {
+            simd_shuffle4(a, b, [$a, $b, $c, $d])
+        };
+    }
+    macro_rules! shuffle1 {
+        ($a:expr, $b:expr) => {
+            match (imm8 >> 1) & 0x1 {
+                0 => shuffle2!($a, $b, 4, 5),
+                _ => shuffle2!($a, $b, 6, 7),
+            }
+        };
+    }
+    let r: f64x4 = match imm8 & 0x1 {
+        0 => shuffle1!(0, 1),
+        _ => shuffle1!(2, 3),
+    };
+    transmute(simd_select_bitmask(k, r, src.as_f64x4()))
+}
+
+/// Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shuffle_f64x2&expand=5167) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vshuff64x2, imm8 = 0b11))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm256_maskz_shuffle_f64x2( + k: __mmask8, + a: __m256d, + b: __m256d, + imm8: i32, +) -> __m256d { + assert!(imm8 >= 0 && imm8 <= 255); + let imm8 = (imm8 & 0xFF) as u8; + let a = a.as_f64x4(); + let b = b.as_f64x4(); + macro_rules! shuffle2 { + ( + $a:expr, + $b:expr, + $c:expr, + $d:expr + ) => { + simd_shuffle4(a, b, [$a, $b, $c, $d]) + }; + } + macro_rules! shuffle1 { + ($a:expr, $b:expr) => { + match (imm8 >> 1) & 0x1 { + 0 => shuffle2!($a, $b, 4, 5), + _ => shuffle2!($a, $b, 6, 7), + } + }; + } + let r: f64x4 = match imm8 & 0x1 { + 0 => shuffle1!(0, 1), + _ => shuffle1!(2, 3), + }; + let zero = _mm256_setzero_pd().as_f64x4(); + transmute(simd_select_bitmask(k, r, zero)) +} + /// Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from a, selected with imm8, and store the result in dst. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_extractf32x4_ps&expand=2442) @@ -20330,41 +21078,27 @@ pub unsafe fn _mm512_maskz_insertf64x4(k: __mmask8, a: __m512d, b: __m256d, imm8 /// Unpack and interleave 32-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_unpackhi_epi32&expand=6021) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpackhi_epi32&expand=6021) #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vunpckhps))] //should be vpunpckhdq pub unsafe fn _mm512_unpackhi_epi32(a: __m512i, b: __m512i) -> __m512i { let a = a.as_i32x16(); let b = b.as_i32x16(); + #[rustfmt::skip] let r: i32x16 = simd_shuffle16( - a, - b, - [ - 2, - 18, - 3, - 19, - 2 + 4, - 18 + 4, - 3 + 4, - 19 + 4, - 2 + 8, - 18 + 8, - 3 + 8, - 19 + 8, - 2 + 12, - 18 + 12, - 3 + 12, - 19 + 12, - ], + a, b, + [ 2, 18, 3, 19, + 2 + 4, 18 + 4, 3 + 4, 19 + 4, + 2 + 8, 18 + 8, 3 + 8, 19 + 8, + 2 + 12, 18 + 12, 3 + 12, 19 + 12], ); transmute(r) } /// Unpack and interleave 32-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_unpackhi_epi32&expand=6019) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpackhi_epi32&expand=6019) #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vpunpckhdq))] @@ -20380,7 +21114,7 @@ pub unsafe fn _mm512_mask_unpackhi_epi32( /// Unpack and interleave 32-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
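The two bits of `imm8` consumed by these 256-bit shuffle macros (bit 0 picks the `a` lane, bit 1 the `b` lane) mirror the 2-bit-per-lane decoding of the 512-bit forms. A standalone sketch of the latter, not part of the patch, assuming the era's nightly toolchain (hence the `stdsimd` gate) and noting that `imm8` must be a literal because of `#[rustc_args_required_const]`:

```rust
#![feature(stdsimd)]
use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn demo_shuffle_f64x2() {
    let a = _mm512_setr_pd(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
    let b = _mm512_setr_pd(10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0);
    // Each 2-bit field of imm8 selects one 128-bit lane; the low two fields
    // index into `a`, the high two into `b`. 0b11_10_01_00 therefore keeps
    // a's lanes 0..=1 and b's lanes 2..=3.
    let r: [f64; 8] = std::mem::transmute(_mm512_shuffle_f64x2(a, b, 0b11_10_01_00));
    assert_eq!(r, [0.0, 1.0, 2.0, 3.0, 14.0, 15.0, 16.0, 17.0]);
    // The maskz form zeroes every element whose mask bit is clear.
    let z: [f64; 8] =
        std::mem::transmute(_mm512_maskz_shuffle_f64x2(0b0000_1111, a, b, 0b11_10_01_00));
    assert_eq!(z, [0.0, 1.0, 2.0, 3.0, 0.0, 0.0, 0.0, 0.0]);
}

fn main() {
    if is_x86_feature_detected!("avx512f") {
        unsafe { demo_shuffle_f64x2() }
    }
}
```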
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_unpackhi_epi32&expand=6020) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpackhi_epi32&expand=6020) #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vpunpckhdq))] @@ -20390,10 +21124,65 @@ pub unsafe fn _mm512_maskz_unpackhi_epi32(k: __mmask16, a: __m512i, b: __m512i) transmute(simd_select_bitmask(k, unpackhi, zero)) } -/// Unpack and interleave 64-bit integers from the high half of each 128-bit lane in a and b, and -/// store the results in dst. +/// Unpack and interleave 32-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_unpackhi_epi64&expand=6030) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_unpackhi_epi32&expand=6016) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpunpckhdq))] +pub unsafe fn _mm256_mask_unpackhi_epi32( + src: __m256i, + k: __mmask8, + a: __m256i, + b: __m256i, +) -> __m256i { + let unpackhi = _mm256_unpackhi_epi32(a, b).as_i32x8(); + transmute(simd_select_bitmask(k, unpackhi, src.as_i32x8())) +} + +/// Unpack and interleave 32-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_unpackhi_epi32&expand=6017) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpunpckhdq))] +pub unsafe fn _mm256_maskz_unpackhi_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + let unpackhi = _mm256_unpackhi_epi32(a, b).as_i32x8(); + let zero = _mm256_setzero_si256().as_i32x8(); + transmute(simd_select_bitmask(k, unpackhi, zero)) +} + +/// Unpack and interleave 32-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_unpackhi_epi32&expand=6013) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpunpckhdq))] +pub unsafe fn _mm_mask_unpackhi_epi32( + src: __m128i, + k: __mmask8, + a: __m128i, + b: __m128i, +) -> __m128i { + let unpackhi = _mm_unpackhi_epi32(a, b).as_i32x4(); + transmute(simd_select_bitmask(k, unpackhi, src.as_i32x4())) +} + +/// Unpack and interleave 32-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_unpackhi_epi32&expand=6014) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpunpckhdq))] +pub unsafe fn _mm_maskz_unpackhi_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + let unpackhi = _mm_unpackhi_epi32(a, b).as_i32x4(); + let zero = _mm_setzero_si128().as_i32x4(); + transmute(simd_select_bitmask(k, unpackhi, zero)) +} + +/// Unpack and interleave 64-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpackhi_epi64&expand=6030) #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vunpckhpd))] //should be vpunpckhqdq @@ -20403,7 +21192,7 @@ pub unsafe fn _mm512_unpackhi_epi64(a: __m512i, b: __m512i) -> __m512i { /// Unpack and interleave 64-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_unpackhi_epi64&expand=6028) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpackhi_epi64&expand=6028) #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vpunpckhqdq))] @@ -20419,7 +21208,7 @@ pub unsafe fn _mm512_mask_unpackhi_epi64( /// Unpack and interleave 64-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_unpackhi_epi64&expand=6029) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpackhi_epi64&expand=6029) #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vpunpckhqdq))] @@ -20429,40 +21218,82 @@ pub unsafe fn _mm512_maskz_unpackhi_epi64(k: __mmask8, a: __m512i, b: __m512i) - transmute(simd_select_bitmask(k, unpackhi, zero)) } +/// Unpack and interleave 64-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_unpackhi_epi64&expand=6025) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpunpckhqdq))] +pub unsafe fn _mm256_mask_unpackhi_epi64( + src: __m256i, + k: __mmask8, + a: __m256i, + b: __m256i, +) -> __m256i { + let unpackhi = _mm256_unpackhi_epi64(a, b).as_i64x4(); + transmute(simd_select_bitmask(k, unpackhi, src.as_i64x4())) +} + +/// Unpack and interleave 64-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
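The `[2, 18, 3, 19, ...]` index table above reads as: within every 128-bit lane, interleave the two high elements of `a` (indices 0..=15) with those of `b` (indices 16..=31). A hedged sketch with concrete values, under the same nightly assumptions as the earlier snippet:

```rust
#![feature(stdsimd)]
use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn demo_unpackhi_epi32() {
    let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    let b = _mm512_setr_epi32(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
    // Per 128-bit lane the result is [a2, b2, a3, b3].
    let r: [i32; 16] = std::mem::transmute(_mm512_unpackhi_epi32(a, b));
    assert_eq!(r, [2, 18, 3, 19, 6, 22, 7, 23, 10, 26, 11, 27, 14, 30, 15, 31]);
    // maskz keeps only the elements whose mask bit is set; here, the b side.
    let z: [i32; 16] =
        std::mem::transmute(_mm512_maskz_unpackhi_epi32(0b1010_1010_1010_1010, a, b));
    assert_eq!(z, [0, 18, 0, 19, 0, 22, 0, 23, 0, 26, 0, 27, 0, 30, 0, 31]);
}

fn main() {
    if is_x86_feature_detected!("avx512f") {
        unsafe { demo_unpackhi_epi32() }
    }
}
```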
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_unpackhi_epi64&expand=6026) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpunpckhqdq))] +pub unsafe fn _mm256_maskz_unpackhi_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + let unpackhi = _mm256_unpackhi_epi64(a, b).as_i64x4(); + let zero = _mm256_setzero_si256().as_i64x4(); + transmute(simd_select_bitmask(k, unpackhi, zero)) +} + +/// Unpack and interleave 64-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_unpackhi_epi64&expand=6022) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpunpckhqdq))] +pub unsafe fn _mm_mask_unpackhi_epi64( + src: __m128i, + k: __mmask8, + a: __m128i, + b: __m128i, +) -> __m128i { + let unpackhi = _mm_unpackhi_epi64(a, b).as_i64x2(); + transmute(simd_select_bitmask(k, unpackhi, src.as_i64x2())) +} + +/// Unpack and interleave 64-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_unpackhi_epi64&expand=6023) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpunpckhqdq))] +pub unsafe fn _mm_maskz_unpackhi_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + let unpackhi = _mm_unpackhi_epi64(a, b).as_i64x2(); + let zero = _mm_setzero_si128().as_i64x2(); + transmute(simd_select_bitmask(k, unpackhi, zero)) +} + /// Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_unpackhi_ps&expand=6060) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpackhi_ps&expand=6060) #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vunpckhps))] pub unsafe fn _mm512_unpackhi_ps(a: __m512, b: __m512) -> __m512 { + #[rustfmt::skip] simd_shuffle16( - a, - b, - [ - 2, - 18, - 3, - 19, - 2 + 4, - 18 + 4, - 3 + 4, - 19 + 4, - 2 + 8, - 18 + 8, - 3 + 8, - 19 + 8, - 2 + 12, - 18 + 12, - 3 + 12, - 19 + 12, - ], + a, b, + [ 2, 18, 3, 19, + 2 + 4, 18 + 4, 3 + 4, 19 + 4, + 2 + 8, 18 + 8, 3 + 8, 19 + 8, + 2 + 12, 18 + 12, 3 + 12, 19 + 12], ) } /// Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
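The new 128-bit masked variants need only `avx512f,avx512vl`: the unpack itself is the plain SSE2 `_mm_unpackhi_epi64`, with the mask blended on top via `simd_select_bitmask`. A sketch, with the same caveats as the earlier snippets:

```rust
#![feature(stdsimd)]
use std::arch::x86_64::*;

#[target_feature(enable = "avx512f,avx512vl")]
unsafe fn demo_mask_unpackhi_epi64() {
    let a = _mm_set_epi64x(1, 0); // elements [0, 1]
    let b = _mm_set_epi64x(11, 10); // elements [10, 11]
    let src = _mm_set_epi64x(-1, -1);
    // unpackhi yields [a1, b1] = [1, 11]; k = 0b01 writes only element 0,
    // so element 1 is copied from src.
    let r: [i64; 2] = std::mem::transmute(_mm_mask_unpackhi_epi64(src, 0b01, a, b));
    assert_eq!(r, [1, -1]);
}

fn main() {
    if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") {
        unsafe { demo_mask_unpackhi_epi64() }
    }
}
```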
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_unpackhi_ps&expand=6058) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpackhi_ps&expand=6058) #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vunpckhps))] @@ -20473,7 +21304,7 @@ pub unsafe fn _mm512_mask_unpackhi_ps(src: __m512, k: __mmask16, a: __m512, b: _ /// Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_unpackhi_ps&expand=6059) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpackhi_ps&expand=6059) #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vunpckhps))] @@ -20483,9 +21314,55 @@ pub unsafe fn _mm512_maskz_unpackhi_ps(k: __mmask16, a: __m512, b: __m512) -> __ transmute(simd_select_bitmask(k, unpackhi, zero)) } +/// Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_unpackhi_ps&expand=6055) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vunpckhps))] +pub unsafe fn _mm256_mask_unpackhi_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 { + let unpackhi = _mm256_unpackhi_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, unpackhi, src.as_f32x8())) +} + +/// Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_unpackhi_ps&expand=6056) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vunpckhps))] +pub unsafe fn _mm256_maskz_unpackhi_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { + let unpackhi = _mm256_unpackhi_ps(a, b).as_f32x8(); + let zero = _mm256_setzero_ps().as_f32x8(); + transmute(simd_select_bitmask(k, unpackhi, zero)) +} + +/// Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_unpackhi_ps&expand=6052) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vunpckhps))] +pub unsafe fn _mm_mask_unpackhi_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + let unpackhi = _mm_unpackhi_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, unpackhi, src.as_f32x4())) +} + +/// Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_unpackhi_ps&expand=6053) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vunpckhps))] +pub unsafe fn _mm_maskz_unpackhi_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { + let unpackhi = _mm_unpackhi_ps(a, b).as_f32x4(); + let zero = _mm_setzero_ps().as_f32x4(); + transmute(simd_select_bitmask(k, unpackhi, zero)) +} + /// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_unpackhi_pd&expand=6048) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpackhi_pd&expand=6048) #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vunpckhpd))] @@ -20495,7 +21372,7 @@ pub unsafe fn _mm512_unpackhi_pd(a: __m512d, b: __m512d) -> __m512d { /// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_unpackhi_pd&expand=6046) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpackhi_pd&expand=6046) #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vunpckhpd))] @@ -20511,7 +21388,7 @@ pub unsafe fn _mm512_mask_unpackhi_pd( /// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_unpackhi_pd&expand=6047) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpackhi_pd&expand=6047) #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vunpckhpd))] @@ -20521,43 +21398,80 @@ pub unsafe fn _mm512_maskz_unpackhi_pd(k: __mmask8, a: __m512d, b: __m512d) -> _ transmute(simd_select_bitmask(k, unpackhi, zero)) } +/// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
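For the floating-point unpackhi family the zeroing mask composes with the existing AVX `_mm256_unpackhi_ps` in the same way. A sketch (same nightly assumptions):

```rust
#![feature(stdsimd)]
use std::arch::x86_64::*;

#[target_feature(enable = "avx512f,avx512vl")]
unsafe fn demo_maskz_unpackhi_ps() {
    let a = _mm256_setr_ps(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
    let b = _mm256_setr_ps(10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0);
    // Plain unpackhi would give [2, 12, 3, 13, 6, 16, 7, 17]; the alternating
    // mask 0b0101_0101 keeps the a-side elements and zeroes the b-side ones.
    let r: [f32; 8] = std::mem::transmute(_mm256_maskz_unpackhi_ps(0b0101_0101, a, b));
    assert_eq!(r, [2.0, 0.0, 3.0, 0.0, 6.0, 0.0, 7.0, 0.0]);
}

fn main() {
    if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") {
        unsafe { demo_maskz_unpackhi_ps() }
    }
}
```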
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_unpackhi_pd&expand=6043) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vunpckhpd))] +pub unsafe fn _mm256_mask_unpackhi_pd( + src: __m256d, + k: __mmask8, + a: __m256d, + b: __m256d, +) -> __m256d { + let unpackhi = _mm256_unpackhi_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, unpackhi, src.as_f64x4())) +} + +/// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_unpackhi_pd&expand=6044) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vunpckhpd))] +pub unsafe fn _mm256_maskz_unpackhi_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + let unpackhi = _mm256_unpackhi_pd(a, b).as_f64x4(); + let zero = _mm256_setzero_pd().as_f64x4(); + transmute(simd_select_bitmask(k, unpackhi, zero)) +} + +/// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_unpackhi_pd&expand=6040) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vunpckhpd))] +pub unsafe fn _mm_mask_unpackhi_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + let unpackhi = _mm_unpackhi_pd(a, b).as_f64x2(); + transmute(simd_select_bitmask(k, unpackhi, src.as_f64x2())) +} + +/// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_unpackhi_pd&expand=6041) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vunpckhpd))] +pub unsafe fn _mm_maskz_unpackhi_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + let unpackhi = _mm_unpackhi_pd(a, b).as_f64x2(); + let zero = _mm_setzero_pd().as_f64x2(); + transmute(simd_select_bitmask(k, unpackhi, zero)) +} + /// Unpack and interleave 32-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst. 
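A writemask sketch for the 512-bit `pd` form, showing the fall-back-to-`src` behavior (not from the patch; same assumptions as above):

```rust
#![feature(stdsimd)]
use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn demo_mask_unpackhi_pd() {
    let a = _mm512_setr_pd(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
    let b = _mm512_setr_pd(10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0);
    let src = _mm512_set1_pd(-1.0);
    // unpackhi is [1, 11, 3, 13, 5, 15, 7, 17]; with k = 0b0000_1111 the
    // upper four elements are copied from src instead.
    let r: [f64; 8] = std::mem::transmute(_mm512_mask_unpackhi_pd(src, 0b0000_1111, a, b));
    assert_eq!(r, [1.0, 11.0, 3.0, 13.0, -1.0, -1.0, -1.0, -1.0]);
}

fn main() {
    if is_x86_feature_detected!("avx512f") {
        unsafe { demo_mask_unpackhi_pd() }
    }
}
```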
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_unpacklo_epi32&expand=6078) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpacklo_epi32&expand=6078) #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vunpcklps))] //should be vpunpckldq pub unsafe fn _mm512_unpacklo_epi32(a: __m512i, b: __m512i) -> __m512i { let a = a.as_i32x16(); let b = b.as_i32x16(); + #[rustfmt::skip] let r: i32x16 = simd_shuffle16( - a, - b, - [ - 0, - 16, - 1, - 17, - 0 + 4, - 16 + 4, - 1 + 4, - 17 + 4, - 0 + 8, - 16 + 8, - 1 + 8, - 17 + 8, - 0 + 12, - 16 + 12, - 1 + 12, - 17 + 12, - ], + a, b, + [ 0, 16, 1, 17, + 0 + 4, 16 + 4, 1 + 4, 17 + 4, + 0 + 8, 16 + 8, 1 + 8, 17 + 8, + 0 + 12, 16 + 12, 1 + 12, 17 + 12], ); transmute(r) } /// Unpack and interleave 32-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_unpacklo_epi32&expand=6076) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpacklo_epi32&expand=6076) #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vpunpckldq))] @@ -20567,25 +21481,81 @@ pub unsafe fn _mm512_mask_unpacklo_epi32( a: __m512i, b: __m512i, ) -> __m512i { - let unpackhi = _mm512_unpacklo_epi32(a, b).as_i32x16(); - transmute(simd_select_bitmask(k, unpackhi, src.as_i32x16())) + let unpacklo = _mm512_unpacklo_epi32(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, unpacklo, src.as_i32x16())) } /// Unpack and interleave 32-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_unpacklo_epi32&expand=6077) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpacklo_epi32&expand=6077) #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vpunpckldq))] pub unsafe fn _mm512_maskz_unpacklo_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { - let unpackhi = _mm512_unpacklo_epi32(a, b).as_i32x16(); + let unpacklo = _mm512_unpacklo_epi32(a, b).as_i32x16(); let zero = _mm512_setzero_si512().as_i32x16(); - transmute(simd_select_bitmask(k, unpackhi, zero)) + transmute(simd_select_bitmask(k, unpacklo, zero)) +} + +/// Unpack and interleave 32-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_unpacklo_epi32&expand=6073) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpunpckldq))] +pub unsafe fn _mm256_mask_unpacklo_epi32( + src: __m256i, + k: __mmask8, + a: __m256i, + b: __m256i, +) -> __m256i { + let unpacklo = _mm256_unpacklo_epi32(a, b).as_i32x8(); + transmute(simd_select_bitmask(k, unpacklo, src.as_i32x8())) +} + +/// Unpack and interleave 32-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_unpacklo_epi32&expand=6074) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpunpckldq))] +pub unsafe fn _mm256_maskz_unpacklo_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + let unpacklo = _mm256_unpacklo_epi32(a, b).as_i32x8(); + let zero = _mm256_setzero_si256().as_i32x8(); + transmute(simd_select_bitmask(k, unpacklo, zero)) +} + +/// Unpack and interleave 32-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_unpacklo_epi32&expand=6070) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpunpckldq))] +pub unsafe fn _mm_mask_unpacklo_epi32( + src: __m128i, + k: __mmask8, + a: __m128i, + b: __m128i, +) -> __m128i { + let unpacklo = _mm_unpacklo_epi32(a, b).as_i32x4(); + transmute(simd_select_bitmask(k, unpacklo, src.as_i32x4())) +} + +/// Unpack and interleave 32-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_unpacklo_epi32&expand=6071) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpunpckldq))] +pub unsafe fn _mm_maskz_unpacklo_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + let unpacklo = _mm_unpacklo_epi32(a, b).as_i32x4(); + let zero = _mm_setzero_si128().as_i32x4(); + transmute(simd_select_bitmask(k, unpacklo, zero)) } /// Unpack and interleave 64-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_unpacklo_epi64&expand=6087) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpacklo_epi64&expand=6087) #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vunpcklpd))] //should be vpunpcklqdq @@ -20595,7 +21565,7 @@ pub unsafe fn _mm512_unpacklo_epi64(a: __m512i, b: __m512i) -> __m512i { /// Unpack and interleave 64-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
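The unpacklo index table `[0, 16, 1, 17, ...]` is the low-half mirror of unpackhi: per 128-bit lane, the two low elements of `a` and `b` interleave. A quick concrete check (same assumptions as the earlier sketches):

```rust
#![feature(stdsimd)]
use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn demo_unpacklo_epi32() {
    let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    let b = _mm512_setr_epi32(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
    // Per 128-bit lane the result is [a0, b0, a1, b1].
    let r: [i32; 16] = std::mem::transmute(_mm512_unpacklo_epi32(a, b));
    assert_eq!(r, [0, 16, 1, 17, 4, 20, 5, 21, 8, 24, 9, 25, 12, 28, 13, 29]);
}

fn main() {
    if is_x86_feature_detected!("avx512f") {
        unsafe { demo_unpacklo_epi32() }
    }
}
```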
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_unpacklo_epi64&expand=6085) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpacklo_epi64&expand=6085) #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vpunpcklqdq))] @@ -20605,79 +21575,166 @@ pub unsafe fn _mm512_mask_unpacklo_epi64( a: __m512i, b: __m512i, ) -> __m512i { - let unpackhi = _mm512_unpacklo_epi64(a, b).as_i64x8(); - transmute(simd_select_bitmask(k, unpackhi, src.as_i64x8())) + let unpacklo = _mm512_unpacklo_epi64(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, unpacklo, src.as_i64x8())) } /// Unpack and interleave 64-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_unpacklo_epi64&expand=6086) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpacklo_epi64&expand=6086) #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vpunpcklqdq))] pub unsafe fn _mm512_maskz_unpacklo_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { - let unpackhi = _mm512_unpacklo_epi64(a, b).as_i64x8(); + let unpacklo = _mm512_unpacklo_epi64(a, b).as_i64x8(); let zero = _mm512_setzero_si512().as_i64x8(); - transmute(simd_select_bitmask(k, unpackhi, zero)) + transmute(simd_select_bitmask(k, unpacklo, zero)) +} + +/// Unpack and interleave 64-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_unpacklo_epi64&expand=6082) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpunpcklqdq))] +pub unsafe fn _mm256_mask_unpacklo_epi64( + src: __m256i, + k: __mmask8, + a: __m256i, + b: __m256i, +) -> __m256i { + let unpacklo = _mm256_unpacklo_epi64(a, b).as_i64x4(); + transmute(simd_select_bitmask(k, unpacklo, src.as_i64x4())) +} + +/// Unpack and interleave 64-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_unpacklo_epi64&expand=6083) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpunpcklqdq))] +pub unsafe fn _mm256_maskz_unpacklo_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + let unpacklo = _mm256_unpacklo_epi64(a, b).as_i64x4(); + let zero = _mm256_setzero_si256().as_i64x4(); + transmute(simd_select_bitmask(k, unpacklo, zero)) +} + +/// Unpack and interleave 64-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_unpacklo_epi64&expand=6079) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpunpcklqdq))] +pub unsafe fn _mm_mask_unpacklo_epi64( + src: __m128i, + k: __mmask8, + a: __m128i, + b: __m128i, +) -> __m128i { + let unpacklo = _mm_unpacklo_epi64(a, b).as_i64x2(); + transmute(simd_select_bitmask(k, unpacklo, src.as_i64x2())) +} + +/// Unpack and interleave 64-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_unpacklo_epi64&expand=6080) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpunpcklqdq))] +pub unsafe fn _mm_maskz_unpacklo_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + let unpacklo = _mm_unpacklo_epi64(a, b).as_i64x2(); + let zero = _mm_setzero_si128().as_i64x2(); + transmute(simd_select_bitmask(k, unpacklo, zero)) } /// Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_unpacklo_ps&expand=6117) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpacklo_ps&expand=6117) #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vunpcklps))] pub unsafe fn _mm512_unpacklo_ps(a: __m512, b: __m512) -> __m512 { - simd_shuffle16( - a, - b, - [ - 0, - 16, - 1, - 17, - 0 + 4, - 16 + 4, - 1 + 4, - 17 + 4, - 0 + 8, - 16 + 8, - 1 + 8, - 17 + 8, - 0 + 12, - 16 + 12, - 1 + 12, - 17 + 12, - ], + #[rustfmt::skip] + simd_shuffle16(a, b, + [ 0, 16, 1, 17, + 0 + 4, 16 + 4, 1 + 4, 17 + 4, + 0 + 8, 16 + 8, 1 + 8, 17 + 8, + 0 + 12, 16 + 12, 1 + 12, 17 + 12], ) } /// Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_unpacklo_ps&expand=6115) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpacklo_ps&expand=6115) #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vunpcklps))] pub unsafe fn _mm512_mask_unpacklo_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 { - let unpackhi = _mm512_unpacklo_ps(a, b).as_f32x16(); - transmute(simd_select_bitmask(k, unpackhi, src.as_f32x16())) + let unpacklo = _mm512_unpacklo_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, unpacklo, src.as_f32x16())) } /// Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
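A writemask sketch for the new 256-bit `epi64` unpacklo variant (not from the patch; nightly and `avx512vl` assumed):

```rust
#![feature(stdsimd)]
use std::arch::x86_64::*;

#[target_feature(enable = "avx512f,avx512vl")]
unsafe fn demo_mask_unpacklo_epi64() {
    let a = _mm256_setr_epi64x(0, 1, 2, 3);
    let b = _mm256_setr_epi64x(10, 11, 12, 13);
    let src = _mm256_set1_epi64x(-1);
    // unpacklo is [a0, b0, a2, b2] = [0, 10, 2, 12]; k = 0b0101 writes the
    // even elements and copies the odd ones from src.
    let r: [i64; 4] = std::mem::transmute(_mm256_mask_unpacklo_epi64(src, 0b0101, a, b));
    assert_eq!(r, [0, -1, 2, -1]);
}

fn main() {
    if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") {
        unsafe { demo_mask_unpacklo_epi64() }
    }
}
```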
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_unpacklo_ps&expand=6116) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpacklo_ps&expand=6116) #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vunpcklps))] pub unsafe fn _mm512_maskz_unpacklo_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { - let unpackhi = _mm512_unpacklo_ps(a, b).as_f32x16(); + let unpacklo = _mm512_unpacklo_ps(a, b).as_f32x16(); let zero = _mm512_setzero_ps().as_f32x16(); - transmute(simd_select_bitmask(k, unpackhi, zero)) + transmute(simd_select_bitmask(k, unpacklo, zero)) +} + +/// Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_unpacklo_ps&expand=6112) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vunpcklps))] +pub unsafe fn _mm256_mask_unpacklo_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 { + let unpacklo = _mm256_unpacklo_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, unpacklo, src.as_f32x8())) +} + +/// Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_unpacklo_ps&expand=6113) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vunpcklps))] +pub unsafe fn _mm256_maskz_unpacklo_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { + let unpacklo = _mm256_unpacklo_ps(a, b).as_f32x8(); + let zero = _mm256_setzero_ps().as_f32x8(); + transmute(simd_select_bitmask(k, unpacklo, zero)) +} + +/// Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_unpacklo_ps&expand=6109) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vunpcklps))] +pub unsafe fn _mm_mask_unpacklo_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + let unpacklo = _mm_unpacklo_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, unpacklo, src.as_f32x4())) +} + +/// Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_unpacklo_ps&expand=6110) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vunpcklps))] +pub unsafe fn _mm_maskz_unpacklo_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { + let unpacklo = _mm_unpacklo_ps(a, b).as_f32x4(); + let zero = _mm_setzero_ps().as_f32x4(); + transmute(simd_select_bitmask(k, unpacklo, zero)) } /// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_unpacklo_pd&expand=6105) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpacklo_pd&expand=6105) #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vunpcklpd))] @@ -20687,7 +21744,7 @@ pub unsafe fn _mm512_unpacklo_pd(a: __m512d, b: __m512d) -> __m512d { /// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_unpacklo_pd&expand=6103) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpacklo_pd&expand=6103) #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vunpcklpd))] @@ -20697,20 +21754,71 @@ pub unsafe fn _mm512_mask_unpacklo_pd( a: __m512d, b: __m512d, ) -> __m512d { - let unpackhi = _mm512_unpacklo_pd(a, b).as_f64x8(); - transmute(simd_select_bitmask(k, unpackhi, src.as_f64x8())) + let unpacklo = _mm512_unpacklo_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, unpacklo, src.as_f64x8())) } /// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_unpacklo_pd&expand=6104) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpacklo_pd&expand=6104) #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vunpcklpd))] pub unsafe fn _mm512_maskz_unpacklo_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { - let unpackhi = _mm512_unpacklo_pd(a, b).as_f64x8(); + let unpacklo = _mm512_unpacklo_pd(a, b).as_f64x8(); let zero = _mm512_setzero_pd().as_f64x8(); - transmute(simd_select_bitmask(k, unpackhi, zero)) + transmute(simd_select_bitmask(k, unpacklo, zero)) +} + +/// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
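And a zeroing sketch for the 128-bit `ps` unpacklo variant (same caveats):

```rust
#![feature(stdsimd)]
use std::arch::x86_64::*;

#[target_feature(enable = "avx512f,avx512vl")]
unsafe fn demo_maskz_unpacklo_ps() {
    let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
    let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
    // unpacklo is [a0, b0, a1, b1] = [1, 5, 2, 6]; k = 0b0110 keeps only the
    // middle two elements and zeroes the rest.
    let r: [f32; 4] = std::mem::transmute(_mm_maskz_unpacklo_ps(0b0110, a, b));
    assert_eq!(r, [0.0, 5.0, 2.0, 0.0]);
}

fn main() {
    if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") {
        unsafe { demo_maskz_unpacklo_ps() }
    }
}
```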
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_unpacklo_pd&expand=6100) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vunpcklpd))] +pub unsafe fn _mm256_mask_unpacklo_pd( + src: __m256d, + k: __mmask8, + a: __m256d, + b: __m256d, +) -> __m256d { + let unpacklo = _mm256_unpacklo_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, unpacklo, src.as_f64x4())) +} + +/// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_unpacklo_pd&expand=6101) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vunpcklpd))] +pub unsafe fn _mm256_maskz_unpacklo_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + let unpacklo = _mm256_unpacklo_pd(a, b).as_f64x4(); + let zero = _mm256_setzero_pd().as_f64x4(); + transmute(simd_select_bitmask(k, unpacklo, zero)) +} + +/// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_unpacklo_pd&expand=6097) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vunpcklpd))] +pub unsafe fn _mm_mask_unpacklo_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + let unpacklo = _mm_unpacklo_pd(a, b).as_f64x2(); + transmute(simd_select_bitmask(k, unpacklo, src.as_f64x2())) +} + +/// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_unpacklo_pd&expand=6098) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vunpcklpd))] +pub unsafe fn _mm_maskz_unpacklo_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + let unpacklo = _mm_unpacklo_pd(a, b).as_f64x2(); + let zero = _mm_setzero_pd().as_f64x2(); + transmute(simd_select_bitmask(k, unpacklo, zero)) } /// Cast vector of type __m128 to type __m512; the upper 384 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. @@ -20947,7 +22055,7 @@ pub unsafe fn _mm512_castsi512_pd(a: __m512i) -> __m512d { /// Broadcast the low packed 32-bit integer from a to all elements of dst. 
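The 512-bit `pd` unpacklo with a zeromask behaves analogously; a sketch under the same assumptions:

```rust
#![feature(stdsimd)]
use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn demo_maskz_unpacklo_pd() {
    let a = _mm512_setr_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
    let b = _mm512_setr_pd(10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0);
    // unpacklo is [1, 10, 3, 12, 5, 14, 7, 16]; k = 0b1111_0000 zeroes the
    // lower half of the result.
    let r: [f64; 8] = std::mem::transmute(_mm512_maskz_unpacklo_pd(0b1111_0000, a, b));
    assert_eq!(r, [0.0, 0.0, 0.0, 0.0, 5.0, 14.0, 7.0, 16.0]);
}

fn main() {
    if is_x86_feature_detected!("avx512f") {
        unsafe { demo_maskz_unpacklo_pd() }
    }
}
```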
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_broadcastd_epi32&expand=545) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcastd_epi32&expand=545) #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vbroadcast))] //should be vpbroadcastd @@ -20959,7 +22067,7 @@ pub unsafe fn _mm512_broadcastd_epi32(a: __m128i) -> __m512i { /// Broadcast the low packed 32-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_broadcastd_epi32&expand=546) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcastd_epi32&expand=546) #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastd @@ -20970,7 +22078,7 @@ pub unsafe fn _mm512_mask_broadcastd_epi32(src: __m512i, k: __mmask16, a: __m128 /// Broadcast the low packed 32-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_broadcastd_epi32&expand=547) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcastd_epi32&expand=547) #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastd @@ -20980,9 +22088,55 @@ pub unsafe fn _mm512_maskz_broadcastd_epi32(k: __mmask16, a: __m128i) -> __m512i transmute(simd_select_bitmask(k, broadcast, zero)) } +/// Broadcast the low packed 32-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_broadcastd_epi32&expand=543) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastd +pub unsafe fn _mm256_mask_broadcastd_epi32(src: __m256i, k: __mmask8, a: __m128i) -> __m256i { + let broadcast = _mm256_broadcastd_epi32(a).as_i32x8(); + transmute(simd_select_bitmask(k, broadcast, src.as_i32x8())) +} + +/// Broadcast the low packed 32-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_broadcastd_epi32&expand=544) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastd +pub unsafe fn _mm256_maskz_broadcastd_epi32(k: __mmask8, a: __m128i) -> __m256i { + let broadcast = _mm256_broadcastd_epi32(a).as_i32x8(); + let zero = _mm256_setzero_si256().as_i32x8(); + transmute(simd_select_bitmask(k, broadcast, zero)) +} + +/// Broadcast the low packed 32-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_broadcastd_epi32&expand=540) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastd +pub unsafe fn _mm_mask_broadcastd_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + let broadcast = _mm_broadcastd_epi32(a).as_i32x4(); + transmute(simd_select_bitmask(k, broadcast, src.as_i32x4())) +} + +/// Broadcast the low packed 32-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_broadcastd_epi32&expand=541) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastd +pub unsafe fn _mm_maskz_broadcastd_epi32(k: __mmask8, a: __m128i) -> __m128i { + let broadcast = _mm_broadcastd_epi32(a).as_i32x4(); + let zero = _mm_setzero_si128().as_i32x4(); + transmute(simd_select_bitmask(k, broadcast, zero)) +} + /// Broadcast the low packed 64-bit integer from a to all elements of dst. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_broadcastq_epi64&expand=560) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcastq_epi64&expand=560) #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vbroadcas))] //should be vpbroadcastq @@ -20992,7 +22146,7 @@ pub unsafe fn _mm512_broadcastq_epi64(a: __m128i) -> __m512i { /// Broadcast the low packed 64-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_broadcastq_epi64&expand=561) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcastq_epi64&expand=561) #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastq @@ -21003,7 +22157,7 @@ pub unsafe fn _mm512_mask_broadcastq_epi64(src: __m512i, k: __mmask8, a: __m128i /// Broadcast the low packed 64-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_broadcastq_epi64&expand=562) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcastq_epi64&expand=562) #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastq @@ -21013,9 +22167,55 @@ pub unsafe fn _mm512_maskz_broadcastq_epi64(k: __mmask8, a: __m128i) -> __m512i transmute(simd_select_bitmask(k, broadcast, zero)) } +/// Broadcast the low packed 64-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
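For the broadcast family, only the low element of the 128-bit source matters; the mask then selects between the broadcast value and `src`. A sketch of the new 256-bit masked form (not from the patch; nightly and `avx512vl` assumed):

```rust
#![feature(stdsimd)]
use std::arch::x86_64::*;

#[target_feature(enable = "avx512f,avx512vl")]
unsafe fn demo_mask_broadcastd_epi32() {
    let a = _mm_setr_epi32(7, 1, 2, 3); // only element 0 is broadcast
    let src = _mm256_set1_epi32(-1);
    // Broadcast a's low element, writing only where k = 0b0000_1111 is set.
    let r: [i32; 8] = std::mem::transmute(_mm256_mask_broadcastd_epi32(src, 0b0000_1111, a));
    assert_eq!(r, [7, 7, 7, 7, -1, -1, -1, -1]);
}

fn main() {
    if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") {
        unsafe { demo_mask_broadcastd_epi32() }
    }
}
```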
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_broadcastq_epi64&expand=558) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastq +pub unsafe fn _mm256_mask_broadcastq_epi64(src: __m256i, k: __mmask8, a: __m128i) -> __m256i { + let broadcast = _mm256_broadcastq_epi64(a).as_i64x4(); + transmute(simd_select_bitmask(k, broadcast, src.as_i64x4())) +} + +/// Broadcast the low packed 64-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_broadcastq_epi64&expand=559) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastq +pub unsafe fn _mm256_maskz_broadcastq_epi64(k: __mmask8, a: __m128i) -> __m256i { + let broadcast = _mm256_broadcastq_epi64(a).as_i64x4(); + let zero = _mm256_setzero_si256().as_i64x4(); + transmute(simd_select_bitmask(k, broadcast, zero)) +} + +/// Broadcast the low packed 64-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_broadcastq_epi64&expand=555) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastq +pub unsafe fn _mm_mask_broadcastq_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + let broadcast = _mm_broadcastq_epi64(a).as_i64x2(); + transmute(simd_select_bitmask(k, broadcast, src.as_i64x2())) +} + +/// Broadcast the low packed 64-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_broadcastq_epi64&expand=556) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastq +pub unsafe fn _mm_maskz_broadcastq_epi64(k: __mmask8, a: __m128i) -> __m128i { + let broadcast = _mm_broadcastq_epi64(a).as_i64x2(); + let zero = _mm_setzero_si128().as_i64x2(); + transmute(simd_select_bitmask(k, broadcast, zero)) +} + /// Broadcast the low single-precision (32-bit) floating-point element from a to all elements of dst. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_broadcastss_ps&expand=578) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcastss_ps&expand=578) #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vbroadcastss))] @@ -21025,7 +22225,7 @@ pub unsafe fn _mm512_broadcastss_ps(a: __m128) -> __m512 { /// Broadcast the low single-precision (32-bit) floating-point element from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
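The 64-bit broadcast works the same way; a minimal zeroing sketch for the 128-bit variant:

```rust
#![feature(stdsimd)]
use std::arch::x86_64::*;

#[target_feature(enable = "avx512f,avx512vl")]
unsafe fn demo_maskz_broadcastq_epi64() {
    let a = _mm_set_epi64x(99, 42); // elements [42, 99]; only 42 is broadcast
    let r: [i64; 2] = std::mem::transmute(_mm_maskz_broadcastq_epi64(0b01, a));
    assert_eq!(r, [42, 0]);
}

fn main() {
    if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") {
        unsafe { demo_maskz_broadcastq_epi64() }
    }
}
```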
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_broadcastss_ps&expand=579) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcastss_ps&expand=579) #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vbroadcastss))] @@ -21036,7 +22236,7 @@ pub unsafe fn _mm512_mask_broadcastss_ps(src: __m512, k: __mmask16, a: __m128) - /// Broadcast the low single-precision (32-bit) floating-point element from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_broadcastss_ps&expand=580) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcastss_ps&expand=580) #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vbroadcastss))] @@ -21046,19 +22246,65 @@ pub unsafe fn _mm512_maskz_broadcastss_ps(k: __mmask16, a: __m128) -> __m512 { transmute(simd_select_bitmask(k, broadcast, zero)) } +/// Broadcast the low single-precision (32-bit) floating-point element from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_broadcastss_ps&expand=576) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vbroadcastss))] +pub unsafe fn _mm256_mask_broadcastss_ps(src: __m256, k: __mmask8, a: __m128) -> __m256 { + let broadcast = _mm256_broadcastss_ps(a).as_f32x8(); + transmute(simd_select_bitmask(k, broadcast, src.as_f32x8())) +} + +/// Broadcast the low single-precision (32-bit) floating-point element from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_broadcastss_ps&expand=577) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vbroadcastss))] +pub unsafe fn _mm256_maskz_broadcastss_ps(k: __mmask8, a: __m128) -> __m256 { + let broadcast = _mm256_broadcastss_ps(a).as_f32x8(); + let zero = _mm256_setzero_ps().as_f32x8(); + transmute(simd_select_bitmask(k, broadcast, zero)) +} + +/// Broadcast the low single-precision (32-bit) floating-point element from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_broadcastss_ps&expand=573) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vbroadcastss))] +pub unsafe fn _mm_mask_broadcastss_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 { + let broadcast = _mm_broadcastss_ps(a).as_f32x4(); + transmute(simd_select_bitmask(k, broadcast, src.as_f32x4())) +} + +/// Broadcast the low single-precision (32-bit) floating-point element from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_broadcastss_ps&expand=574) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vbroadcastss))] +pub unsafe fn _mm_maskz_broadcastss_ps(k: __mmask8, a: __m128) -> __m128 { + let broadcast = _mm_broadcastss_ps(a).as_f32x4(); + let zero = _mm_setzero_ps().as_f32x4(); + transmute(simd_select_bitmask(k, broadcast, zero)) +} + /// Broadcast the low double-precision (64-bit) floating-point element from a to all elements of dst. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_broadcastsd_pd&expand=567) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcastsd_pd&expand=567) #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vbroadcastsd))] pub unsafe fn _mm512_broadcastsd_pd(a: __m128d) -> __m512d { - simd_shuffle8(a, a, [1, 1, 1, 1, 1, 1, 1, 1]) + simd_shuffle8(a, a, [0, 0, 0, 0, 0, 0, 0, 0]) } /// Broadcast the low double-precision (64-bit) floating-point element from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_broadcastsd_pd&expand=568) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcastsd_pd&expand=568) #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vbroadcastsd))] @@ -21069,7 +22315,7 @@ pub unsafe fn _mm512_mask_broadcastsd_pd(src: __m512d, k: __mmask8, a: __m128d) /// Broadcast the low double-precision (64-bit) floating-point element from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_broadcastsd_pd&expand=569) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcastsd_pd&expand=569) #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vbroadcastsd))] @@ -21079,20 +22325,43 @@ pub unsafe fn _mm512_maskz_broadcastsd_pd(k: __mmask8, a: __m128d) -> __m512d { transmute(simd_select_bitmask(k, broadcast, zero)) } +/// Broadcast the low double-precision (64-bit) floating-point element from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_broadcastsd_pd&expand=565) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vbroadcastsd))] +pub unsafe fn _mm256_mask_broadcastsd_pd(src: __m256d, k: __mmask8, a: __m128d) -> __m256d { + let broadcast = _mm256_broadcastsd_pd(a).as_f64x4(); + transmute(simd_select_bitmask(k, broadcast, src.as_f64x4())) +} + +/// Broadcast the low double-precision (64-bit) floating-point element from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
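// Editor's aside (not part of the patch): in simd_shuffle8 each index names a
// source lane, so broadcasting the *low* f64 element needs an all-zeros index
// array; the previous [1, 1, ...] mask replicated the high lane of a instead,
// which is the bug the hunk above corrects.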
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_broadcastsd_pd&expand=566) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vbroadcastsd))] +pub unsafe fn _mm256_maskz_broadcastsd_pd(k: __mmask8, a: __m128d) -> __m256d { + let broadcast = _mm256_broadcastsd_pd(a).as_f64x4(); + let zero = _mm256_setzero_pd().as_f64x4(); + transmute(simd_select_bitmask(k, broadcast, zero)) +} + /// Broadcast the 4 packed 32-bit integers from a to all elements of dst. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_broadcast_i32x4&expand=510) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcast_i32x4&expand=510) #[inline] #[target_feature(enable = "avx512f")] //msvc: vbroadcasti32x4, linux: vshuf pub unsafe fn _mm512_broadcast_i32x4(a: __m128i) -> __m512i { - let a = _mm512_castsi128_si512(a).as_i32x16(); + let a = a.as_i32x4(); let ret: i32x16 = simd_shuffle16(a, a, [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]); transmute(ret) } /// Broadcast the 4 packed 32-bit integers from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_broadcast_i32x4&expand=511) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcast_i32x4&expand=511) #[inline] #[target_feature(enable = "avx512f")] //msvc: vbroadcasti32x4, linux: vshuf pub unsafe fn _mm512_mask_broadcast_i32x4(src: __m512i, k: __mmask16, a: __m128i) -> __m512i { @@ -21102,7 +22371,7 @@ pub unsafe fn _mm512_mask_broadcast_i32x4(src: __m512i, k: __mmask16, a: __m128i /// Broadcast the 4 packed 32-bit integers from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_broadcast_i32x4&expand=512) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcast_i32x4&expand=512) #[inline] #[target_feature(enable = "avx512f")] //msvc: vbroadcasti32x4, linux: vshuf pub unsafe fn _mm512_maskz_broadcast_i32x4(k: __mmask16, a: __m128i) -> __m512i { @@ -21111,6 +22380,38 @@ pub unsafe fn _mm512_maskz_broadcast_i32x4(k: __mmask16, a: __m128i) -> __m512i transmute(simd_select_bitmask(k, broadcast, zero)) } +/// Broadcast the 4 packed 32-bit integers from a to all elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcast_i32x4&expand=507) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] //msvc: vbroadcasti32x4, linux: vshuf +pub unsafe fn _mm256_broadcast_i32x4(a: __m128i) -> __m256i { + let a = a.as_i32x4(); + let ret: i32x8 = simd_shuffle8(a, a, [0, 1, 2, 3, 0, 1, 2, 3]); + transmute(ret) +} + +/// Broadcast the 4 packed 32-bit integers from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
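// Editor's aside (not part of the patch): simd_shuffle16 draws from the lanes of
// both operands (here a twice), so with an i32x4 input the indices 0..=3 name the
// four dwords of the 128-bit block, and repeating [0, 1, 2, 3] tiles that block
// across the full result. E.g. a = _mm_set_epi32(17, 18, 19, 20) (lanes, low to
// high: 20, 19, 18, 17) broadcasts to four copies of that block, which is what
// test_mm512_broadcast_i32x4 further down asserts.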
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_broadcast_i32x4&expand=508) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] //msvc: vbroadcasti32x4, linux: vshuf +pub unsafe fn _mm256_mask_broadcast_i32x4(src: __m256i, k: __mmask8, a: __m128i) -> __m256i { + let broadcast = _mm256_broadcast_i32x4(a).as_i32x8(); + transmute(simd_select_bitmask(k, broadcast, src.as_i32x8())) +} + +/// Broadcast the 4 packed 32-bit integers from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_broadcast_i32x4&expand=509) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] //msvc: vbroadcasti32x4, linux: vshuf +pub unsafe fn _mm256_maskz_broadcast_i32x4(k: __mmask8, a: __m128i) -> __m256i { + let broadcast = _mm256_broadcast_i32x4(a).as_i32x8(); + let zero = _mm256_setzero_si256().as_i32x8(); + transmute(simd_select_bitmask(k, broadcast, zero)) +} + /// Broadcast the 4 packed 64-bit integers from a to all elements of dst. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_broadcast_i64x4&expand=522) @@ -21143,7 +22444,7 @@ pub unsafe fn _mm512_maskz_broadcast_i64x4(k: __mmask8, a: __m256i) -> __m512i { /// Broadcast the 4 packed single-precision (32-bit) floating-point elements from a to all elements of dst. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_broadcast_f32x4&expand=483) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcast_f32x4&expand=483) #[inline] #[target_feature(enable = "avx512f")] //msvc: vbroadcastf32x4, linux: vshuf pub unsafe fn _mm512_broadcast_f32x4(a: __m128) -> __m512 { @@ -21152,7 +22453,7 @@ pub unsafe fn _mm512_broadcast_f32x4(a: __m128) -> __m512 { /// Broadcast the 4 packed single-precision (32-bit) floating-point elements from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_broadcast_f32x4&expand=484) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcast_f32x4&expand=484) #[inline] #[target_feature(enable = "avx512f")] //msvc: vbroadcastf32x4, linux: vshu pub unsafe fn _mm512_mask_broadcast_f32x4(src: __m512, k: __mmask16, a: __m128) -> __m512 { @@ -21162,7 +22463,7 @@ pub unsafe fn _mm512_mask_broadcast_f32x4(src: __m512, k: __mmask16, a: __m128) /// Broadcast the 4 packed single-precision (32-bit) floating-point elements from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_broadcast_f32x4&expand=485)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcast_f32x4&expand=485)
#[inline]
#[target_feature(enable = "avx512f")] //msvc: vbroadcastf32x4, linux: vshu
pub unsafe fn _mm512_maskz_broadcast_f32x4(k: __mmask16, a: __m128) -> __m512 {
@@ -21171,6 +22472,36 @@ pub unsafe fn _mm512_maskz_broadcast_f32x4(k: __mmask16, a: __m128) -> __m512 {
    transmute(simd_select_bitmask(k, broadcast, zero))
}

+/// Broadcast the 4 packed single-precision (32-bit) floating-point elements from a to all elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcast_f32x4&expand=480)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")] //msvc: vbroadcastf32x4, linux: vshuf
+pub unsafe fn _mm256_broadcast_f32x4(a: __m128) -> __m256 {
+    simd_shuffle8(a, a, [0, 1, 2, 3, 0, 1, 2, 3])
+}
+
+/// Broadcast the 4 packed single-precision (32-bit) floating-point elements from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_broadcast_f32x4&expand=481)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")] //msvc: vbroadcastf32x4, linux: vshuf
+pub unsafe fn _mm256_mask_broadcast_f32x4(src: __m256, k: __mmask8, a: __m128) -> __m256 {
+    let broadcast = _mm256_broadcast_f32x4(a).as_f32x8();
+    transmute(simd_select_bitmask(k, broadcast, src.as_f32x8()))
+}
+
+/// Broadcast the 4 packed single-precision (32-bit) floating-point elements from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_broadcast_f32x4&expand=482)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")] //msvc: vbroadcastf32x4, linux: vshuf
+pub unsafe fn _mm256_maskz_broadcast_f32x4(k: __mmask8, a: __m128) -> __m256 {
+    let broadcast = _mm256_broadcast_f32x4(a).as_f32x8();
+    let zero = _mm256_setzero_ps().as_f32x8();
+    transmute(simd_select_bitmask(k, broadcast, zero))
+}
+
/// Broadcast the 4 packed double-precision (64-bit) floating-point elements from a to all elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_broadcast_f64x4&expand=495)
@@ -21203,7 +22534,7 @@ pub unsafe fn _mm512_maskz_broadcast_f64x4(k: __mmask8, a: __m256d) -> __m512d {
/// Blend packed 32-bit integers from a and b using control mask k, and store the results in dst.
///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_blend_epi32&expand=435)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_blend_epi32&expand=435)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmovdqa32))] //should be vpblendmd
@@ -21211,9 +22542,29 @@ pub unsafe fn _mm512_mask_blend_epi32(k: __mmask16, a: __m512i, b: __m512i) -> _
    transmute(simd_select_bitmask(k, b.as_i32x16(), a.as_i32x16()))
}

+/// Blend packed 32-bit integers from a and b using control mask k, and store the results in dst.
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_blend_epi32&expand=434) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqa32))] //should be vpblendmd +pub unsafe fn _mm256_mask_blend_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + transmute(simd_select_bitmask(k, b.as_i32x8(), a.as_i32x8())) +} + +/// Blend packed 32-bit integers from a and b using control mask k, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_blend_epi32&expand=432) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqa32))] //should be vpblendmd +pub unsafe fn _mm_mask_blend_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + transmute(simd_select_bitmask(k, b.as_i32x4(), a.as_i32x4())) +} + /// Blend packed 64-bit integers from a and b using control mask k, and store the results in dst. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_blend_epi64&expand=438) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_blend_epi64&expand=438) #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vmovdqa64))] //should be vpblendmq @@ -21221,9 +22572,29 @@ pub unsafe fn _mm512_mask_blend_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __ transmute(simd_select_bitmask(k, b.as_i64x8(), a.as_i64x8())) } +/// Blend packed 64-bit integers from a and b using control mask k, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_blend_epi64&expand=437) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqa64))] //should be vpblendmq +pub unsafe fn _mm256_mask_blend_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + transmute(simd_select_bitmask(k, b.as_i64x4(), a.as_i64x4())) +} + +/// Blend packed 64-bit integers from a and b using control mask k, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_blend_epi64&expand=436) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovdqa64))] //should be vpblendmq +pub unsafe fn _mm_mask_blend_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + transmute(simd_select_bitmask(k, b.as_i64x2(), a.as_i64x2())) +} + /// Blend packed single-precision (32-bit) floating-point elements from a and b using control mask k, and store the results in dst. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_blend_ps&expand=451) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_blend_ps&expand=451) #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vmovaps))] //should be vpblendmps @@ -21231,9 +22602,29 @@ pub unsafe fn _mm512_mask_blend_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 transmute(simd_select_bitmask(k, b.as_f32x16(), a.as_f32x16())) } +/// Blend packed single-precision (32-bit) floating-point elements from a and b using control mask k, and store the results in dst. 
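// Editor's aside (hypothetical values, not part of the patch): each mask_blend
// above is simd_select_bitmask(k, b, a): bit i of k set takes lane i from b,
// clear takes it from a; no lane is zeroed and there is no src operand.
//
//     let a = _mm256_set1_epi32(1);
//     let b = _mm256_set1_epi32(2);
//     let r = _mm256_mask_blend_epi32(0b00110011, a, b);
//     // r lanes, low to high: 2, 2, 1, 1, 2, 2, 1, 1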
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_blend_ps&expand=450) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovaps))] //should be vpblendmps +pub unsafe fn _mm256_mask_blend_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { + transmute(simd_select_bitmask(k, b.as_f32x8(), a.as_f32x8())) +} + +/// Blend packed single-precision (32-bit) floating-point elements from a and b using control mask k, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_blend_ps&expand=448) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovaps))] //should be vpblendmps +pub unsafe fn _mm_mask_blend_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { + transmute(simd_select_bitmask(k, b.as_f32x4(), a.as_f32x4())) +} + /// Blend packed double-precision (64-bit) floating-point elements from a and b using control mask k, and store the results in dst. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_blend_pd&expand=446) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_blend_pd&expand=446) #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vmovapd))] //should be vpblendmpd @@ -21241,6 +22632,26 @@ pub unsafe fn _mm512_mask_blend_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m51 transmute(simd_select_bitmask(k, b.as_f64x8(), a.as_f64x8())) } +/// Blend packed double-precision (64-bit) floating-point elements from a and b using control mask k, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_blend_pd&expand=445) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovapd))] //should be vpblendmpd +pub unsafe fn _mm256_mask_blend_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + transmute(simd_select_bitmask(k, b.as_f64x4(), a.as_f64x4())) +} + +/// Blend packed double-precision (64-bit) floating-point elements from a and b using control mask k, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_blend_pd&expand=443) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vmovapd))] //should be vpblendmpd +pub unsafe fn _mm_mask_blend_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + transmute(simd_select_bitmask(k, b.as_f64x2(), a.as_f64x2())) +} + /// Concatenate a and b into a 128-byte immediate result, shift the result right by imm8 32-bit elements, and store the low 64 bytes (16 elements) in dst. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_alignr_epi32&expand=245) @@ -21484,6 +22895,194 @@ pub unsafe fn _mm512_maskz_alignr_epi32( transmute(simd_select_bitmask(k, r, zero)) } +/// Concatenate a and b into a 64-byte immediate result, shift the result right by imm8 32-bit elements, and store the low 32 bytes (8 elements) in dst. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_alignr_epi32&expand=242) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(valignd, imm8 = 1))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm256_alignr_epi32(a: __m256i, b: __m256i, imm8: i32) -> __m256i { + assert!(imm8 >= 0 && imm8 <= 255); + let a = a.as_i32x8(); + let b = b.as_i32x8(); + let imm8: i32 = imm8 % 16; + let r: i32x8 = match imm8 { + 0 => simd_shuffle8(a, b, [8, 9, 10, 11, 12, 13, 14, 15]), + 1 => simd_shuffle8(a, b, [9, 10, 11, 12, 13, 14, 15, 0]), + 2 => simd_shuffle8(a, b, [10, 11, 12, 13, 14, 15, 0, 1]), + 3 => simd_shuffle8(a, b, [11, 12, 13, 14, 15, 0, 1, 2]), + 4 => simd_shuffle8(a, b, [12, 13, 14, 15, 0, 1, 2, 3]), + 5 => simd_shuffle8(a, b, [13, 14, 15, 0, 1, 2, 3, 4]), + 6 => simd_shuffle8(a, b, [14, 15, 0, 1, 2, 3, 4, 5]), + 7 => simd_shuffle8(a, b, [15, 0, 1, 2, 3, 4, 5, 6]), + 8 => simd_shuffle8(a, b, [0, 1, 2, 3, 4, 5, 6, 7]), + 9 => simd_shuffle8(a, b, [1, 2, 3, 4, 5, 6, 7, 8]), + 10 => simd_shuffle8(a, b, [2, 3, 4, 5, 6, 7, 8, 9]), + 11 => simd_shuffle8(a, b, [3, 4, 5, 6, 7, 8, 9, 10]), + 12 => simd_shuffle8(a, b, [4, 5, 6, 7, 8, 9, 10, 11]), + 13 => simd_shuffle8(a, b, [5, 6, 7, 8, 9, 10, 11, 12]), + 14 => simd_shuffle8(a, b, [6, 7, 8, 9, 10, 11, 12, 13]), + _ => simd_shuffle8(a, b, [7, 8, 9, 10, 11, 12, 13, 14]), + }; + transmute(r) +} + +/// Concatenate a and b into a 64-byte immediate result, shift the result right by imm8 32-bit elements, and store the low 32 bytes (8 elements) in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_alignr_epi32&expand=243) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(valignd, imm8 = 1))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm256_mask_alignr_epi32( + src: __m256i, + k: __mmask8, + a: __m256i, + b: __m256i, + imm8: i32, +) -> __m256i { + assert!(imm8 >= 0 && imm8 <= 255); + let a = a.as_i32x8(); + let b = b.as_i32x8(); + let imm8: i32 = imm8 % 16; + let r: i32x8 = match imm8 { + 0 => simd_shuffle8(a, b, [8, 9, 10, 11, 12, 13, 14, 15]), + 1 => simd_shuffle8(a, b, [9, 10, 11, 12, 13, 14, 15, 0]), + 2 => simd_shuffle8(a, b, [10, 11, 12, 13, 14, 15, 0, 1]), + 3 => simd_shuffle8(a, b, [11, 12, 13, 14, 15, 0, 1, 2]), + 4 => simd_shuffle8(a, b, [12, 13, 14, 15, 0, 1, 2, 3]), + 5 => simd_shuffle8(a, b, [13, 14, 15, 0, 1, 2, 3, 4]), + 6 => simd_shuffle8(a, b, [14, 15, 0, 1, 2, 3, 4, 5]), + 7 => simd_shuffle8(a, b, [15, 0, 1, 2, 3, 4, 5, 6]), + 8 => simd_shuffle8(a, b, [0, 1, 2, 3, 4, 5, 6, 7]), + 9 => simd_shuffle8(a, b, [1, 2, 3, 4, 5, 6, 7, 8]), + 10 => simd_shuffle8(a, b, [2, 3, 4, 5, 6, 7, 8, 9]), + 11 => simd_shuffle8(a, b, [3, 4, 5, 6, 7, 8, 9, 10]), + 12 => simd_shuffle8(a, b, [4, 5, 6, 7, 8, 9, 10, 11]), + 13 => simd_shuffle8(a, b, [5, 6, 7, 8, 9, 10, 11, 12]), + 14 => simd_shuffle8(a, b, [6, 7, 8, 9, 10, 11, 12, 13]), + _ => simd_shuffle8(a, b, [7, 8, 9, 10, 11, 12, 13, 14]), + }; + transmute(simd_select_bitmask(k, r, src.as_i32x8())) +} + +/// Concatenate a and b into a 64-byte immediate result, shift the result right by imm8 32-bit elements, and store the low 32 bytes (8 elements) in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_alignr_epi32&expand=244)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(valignd, imm8 = 1))]
+#[rustc_args_required_const(3)]
+pub unsafe fn _mm256_maskz_alignr_epi32(k: __mmask8, a: __m256i, b: __m256i, imm8: i32) -> __m256i {
+    assert!(imm8 >= 0 && imm8 <= 255);
+    let a = a.as_i32x8();
+    let b = b.as_i32x8();
+    let imm8: i32 = imm8 % 16;
+    let r: i32x8 = match imm8 {
+        0 => simd_shuffle8(a, b, [8, 9, 10, 11, 12, 13, 14, 15]),
+        1 => simd_shuffle8(a, b, [9, 10, 11, 12, 13, 14, 15, 0]),
+        2 => simd_shuffle8(a, b, [10, 11, 12, 13, 14, 15, 0, 1]),
+        3 => simd_shuffle8(a, b, [11, 12, 13, 14, 15, 0, 1, 2]),
+        4 => simd_shuffle8(a, b, [12, 13, 14, 15, 0, 1, 2, 3]),
+        5 => simd_shuffle8(a, b, [13, 14, 15, 0, 1, 2, 3, 4]),
+        6 => simd_shuffle8(a, b, [14, 15, 0, 1, 2, 3, 4, 5]),
+        7 => simd_shuffle8(a, b, [15, 0, 1, 2, 3, 4, 5, 6]),
+        8 => simd_shuffle8(a, b, [0, 1, 2, 3, 4, 5, 6, 7]),
+        9 => simd_shuffle8(a, b, [1, 2, 3, 4, 5, 6, 7, 8]),
+        10 => simd_shuffle8(a, b, [2, 3, 4, 5, 6, 7, 8, 9]),
+        11 => simd_shuffle8(a, b, [3, 4, 5, 6, 7, 8, 9, 10]),
+        12 => simd_shuffle8(a, b, [4, 5, 6, 7, 8, 9, 10, 11]),
+        13 => simd_shuffle8(a, b, [5, 6, 7, 8, 9, 10, 11, 12]),
+        14 => simd_shuffle8(a, b, [6, 7, 8, 9, 10, 11, 12, 13]),
+        _ => simd_shuffle8(a, b, [7, 8, 9, 10, 11, 12, 13, 14]),
+    };
+    let zero = _mm256_setzero_si256().as_i32x8();
+    transmute(simd_select_bitmask(k, r, zero))
+}
+
+/// Concatenate a and b into a 32-byte immediate result, shift the result right by imm8 32-bit elements, and store the low 16 bytes (4 elements) in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_alignr_epi32&expand=239)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(vpalignr, imm8 = 1))] //should be valignd
+#[rustc_args_required_const(2)]
+pub unsafe fn _mm_alignr_epi32(a: __m128i, b: __m128i, imm8: i32) -> __m128i {
+    assert!(imm8 >= 0 && imm8 <= 255);
+    let a = a.as_i32x4();
+    let b = b.as_i32x4();
+    let imm8: i32 = imm8 % 8;
+    let r: i32x4 = match imm8 {
+        0 => simd_shuffle4(a, b, [4, 5, 6, 7]),
+        1 => simd_shuffle4(a, b, [5, 6, 7, 0]),
+        2 => simd_shuffle4(a, b, [6, 7, 0, 1]),
+        3 => simd_shuffle4(a, b, [7, 0, 1, 2]),
+        4 => simd_shuffle4(a, b, [0, 1, 2, 3]),
+        5 => simd_shuffle4(a, b, [1, 2, 3, 4]),
+        6 => simd_shuffle4(a, b, [2, 3, 4, 5]),
+        _ => simd_shuffle4(a, b, [3, 4, 5, 6]),
+    };
+    transmute(r)
+}
+
+/// Concatenate a and b into a 32-byte immediate result, shift the result right by imm8 32-bit elements, and store the low 16 bytes (4 elements) in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_alignr_epi32&expand=240)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(valignd, imm8 = 1))]
+#[rustc_args_required_const(4)]
+pub unsafe fn _mm_mask_alignr_epi32(
+    src: __m128i,
+    k: __mmask8,
+    a: __m128i,
+    b: __m128i,
+    imm8: i32,
+) -> __m128i {
+    assert!(imm8 >= 0 && imm8 <= 255);
+    let a = a.as_i32x4();
+    let b = b.as_i32x4();
+    let imm8: i32 = imm8 % 8;
+    let r: i32x4 = match imm8 {
+        0 => simd_shuffle4(a, b, [4, 5, 6, 7]),
+        1 => simd_shuffle4(a, b, [5, 6, 7, 0]),
+        2 => simd_shuffle4(a, b, [6, 7, 0, 1]),
+        3 => simd_shuffle4(a, b, [7, 0, 1, 2]),
+        4 => simd_shuffle4(a, b, [0, 1, 2, 3]),
+        5 => simd_shuffle4(a, b, [1, 2, 3, 4]),
+        6 => simd_shuffle4(a, b, [2, 3, 4, 5]),
+        _ => simd_shuffle4(a, b, [3, 4, 5, 6]),
+    };
+    transmute(simd_select_bitmask(k, r, src.as_i32x4()))
+}
+
+/// Concatenate a and b into a 32-byte immediate result, shift the result right by imm8 32-bit elements, and store the low 16 bytes (4 elements) in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_alignr_epi32&expand=241)
+#[inline]
+#[target_feature(enable = "avx512f,avx512vl")]
+#[cfg_attr(test, assert_instr(valignd, imm8 = 1))]
+#[rustc_args_required_const(3)]
+pub unsafe fn _mm_maskz_alignr_epi32(k: __mmask8, a: __m128i, b: __m128i, imm8: i32) -> __m128i {
+    assert!(imm8 >= 0 && imm8 <= 255);
+    let a = a.as_i32x4();
+    let b = b.as_i32x4();
+    let imm8: i32 = imm8 % 8;
+    let r: i32x4 = match imm8 {
+        0 => simd_shuffle4(a, b, [4, 5, 6, 7]),
+        1 => simd_shuffle4(a, b, [5, 6, 7, 0]),
+        2 => simd_shuffle4(a, b, [6, 7, 0, 1]),
+        3 => simd_shuffle4(a, b, [7, 0, 1, 2]),
+        4 => simd_shuffle4(a, b, [0, 1, 2, 3]),
+        5 => simd_shuffle4(a, b, [1, 2, 3, 4]),
+        6 => simd_shuffle4(a, b, [2, 3, 4, 5]),
+        _ => simd_shuffle4(a, b, [3, 4, 5, 6]),
+    };
+    let zero = _mm_setzero_si128().as_i32x4();
+    transmute(simd_select_bitmask(k, r, zero))
+}
+
/// Concatenate a and b into a 128-byte immediate result, shift the result right by imm8 64-bit elements, and store the low 64 bytes (8 elements) in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_alignr_epi64&expand=254)
@@ -21560,6 +23159,146 @@ pub unsafe fn _mm512_maskz_alignr_epi64(k: __mmask8, a: __m512i, b: __m512i, imm
    transmute(simd_select_bitmask(k, r, zero))
}

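// Editor's aside (not part of the patch): the valignd tables above implement the
// shift as a rotation of the concatenated pair. In simd_shuffle4(a, b, idx),
// indices 0..=3 name the lanes of a and 4..=7 the lanes of b, so a count of 1
// selects [5, 6, 7, 0] = b1, b2, b3, a0, and counts 5..=7 wrap back into b
// (5 selects [1, 2, 3, 4] = a1, a2, a3, b0), mirroring the 256-bit table.
// A minimal check, reusing the values from the tests further down:
//
//     let a = _mm_set_epi32(4, 3, 2, 1); // lanes, low to high: 1, 2, 3, 4
//     let b = _mm_set_epi32(8, 7, 6, 5); // lanes, low to high: 5, 6, 7, 8
//     let r = _mm_alignr_epi32(a, b, 1);
//     // r lanes, low to high: 6, 7, 8, 1, i.e. _mm_set_epi32(1, 8, 7, 6)

+/// Concatenate a and b into a 64-byte immediate result, shift the result right by imm8 64-bit elements, and store the low 32 bytes (4 elements) in dst.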
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_alignr_epi64&expand=251) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(valignq, imm8 = 1))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm256_alignr_epi64(a: __m256i, b: __m256i, imm8: i32) -> __m256i { + assert!(imm8 >= 0 && imm8 <= 255); + let imm8: i32 = imm8 % 8; + let r: i64x4 = match imm8 { + 0 => simd_shuffle4(a, b, [4, 5, 6, 7]), + 1 => simd_shuffle4(a, b, [5, 6, 7, 0]), + 2 => simd_shuffle4(a, b, [6, 7, 0, 1]), + 3 => simd_shuffle4(a, b, [7, 0, 1, 2]), + 4 => simd_shuffle4(a, b, [0, 1, 2, 3]), + 5 => simd_shuffle4(a, b, [1, 2, 3, 4]), + 6 => simd_shuffle4(a, b, [2, 3, 4, 5]), + _ => simd_shuffle4(a, b, [3, 4, 5, 6]), + }; + transmute(r) +} + +/// Concatenate a and b into a 64-byte immediate result, shift the result right by imm8 64-bit elements, and store the low 32 bytes (4 elements) in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_alignr_epi64&expand=252) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(valignq, imm8 = 1))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm256_mask_alignr_epi64( + src: __m256i, + k: __mmask8, + a: __m256i, + b: __m256i, + imm8: i32, +) -> __m256i { + assert!(imm8 >= 0 && imm8 <= 255); + let imm8: i32 = imm8 % 8; + let r: i64x4 = match imm8 { + 0 => simd_shuffle4(a, b, [4, 5, 6, 7]), + 1 => simd_shuffle4(a, b, [5, 6, 7, 0]), + 2 => simd_shuffle4(a, b, [6, 7, 0, 1]), + 3 => simd_shuffle4(a, b, [7, 0, 1, 2]), + 4 => simd_shuffle4(a, b, [0, 1, 2, 3]), + 5 => simd_shuffle4(a, b, [1, 2, 3, 4]), + 6 => simd_shuffle4(a, b, [2, 3, 4, 5]), + _ => simd_shuffle4(a, b, [3, 4, 5, 6]), + }; + transmute(simd_select_bitmask(k, r, src.as_i64x4())) +} + +/// Concatenate a and b into a 64-byte immediate result, shift the result right by imm8 64-bit elements, and store the low 32 bytes (4 elements) in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_alignr_epi64&expand=253) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(valignq, imm8 = 1))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm256_maskz_alignr_epi64(k: __mmask8, a: __m256i, b: __m256i, imm8: i32) -> __m256i { + assert!(imm8 >= 0 && imm8 <= 255); + let imm8: i32 = imm8 % 8; + let r: i64x4 = match imm8 { + 0 => simd_shuffle4(a, b, [4, 5, 6, 7]), + 1 => simd_shuffle4(a, b, [5, 6, 7, 0]), + 2 => simd_shuffle4(a, b, [6, 7, 0, 1]), + 3 => simd_shuffle4(a, b, [7, 0, 1, 2]), + 4 => simd_shuffle4(a, b, [0, 1, 2, 3]), + 5 => simd_shuffle4(a, b, [1, 2, 3, 4]), + 6 => simd_shuffle4(a, b, [2, 3, 4, 5]), + _ => simd_shuffle4(a, b, [3, 4, 5, 6]), + }; + let zero = _mm256_setzero_si256().as_i64x4(); + transmute(simd_select_bitmask(k, r, zero)) +} + +/// Concatenate a and b into a 32-byte immediate result, shift the result right by imm8 64-bit elements, and store the low 16 bytes (2 elements) in dst. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_alignr_epi64&expand=248) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(vpalignr, imm8 = 1))] //should be valignq +#[rustc_args_required_const(2)] +pub unsafe fn _mm_alignr_epi64(a: __m128i, b: __m128i, imm8: i32) -> __m128i { + assert!(imm8 >= 0 && imm8 <= 255); + let imm8: i32 = imm8 % 4; + let r: i64x2 = match imm8 { + 0 => simd_shuffle2(a, b, [2, 3]), + 1 => simd_shuffle2(a, b, [3, 0]), + 2 => simd_shuffle2(a, b, [0, 1]), + _ => simd_shuffle2(a, b, [1, 2]), + }; + transmute(r) +} + +/// Concatenate a and b into a 32-byte immediate result, shift the result right by imm8 64-bit elements, and store the low 16 bytes (2 elements) in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_alignr_epi64&expand=249) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(valignq, imm8 = 1))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm_mask_alignr_epi64( + src: __m128i, + k: __mmask8, + a: __m128i, + b: __m128i, + imm8: i32, +) -> __m128i { + assert!(imm8 >= 0 && imm8 <= 255); + let imm8: i32 = imm8 % 4; + let r: i64x2 = match imm8 { + 0 => simd_shuffle2(a, b, [2, 3]), + 1 => simd_shuffle2(a, b, [3, 0]), + 2 => simd_shuffle2(a, b, [0, 1]), + _ => simd_shuffle2(a, b, [1, 2]), + }; + transmute(simd_select_bitmask(k, r, src.as_i64x2())) +} + +/// Concatenate a and b into a 32-byte immediate result, shift the result right by imm8 64-bit elements, and store the low 16 bytes (2 elements) in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_alignr_epi64&expand=250) +#[inline] +#[target_feature(enable = "avx512f,avx512vl")] +#[cfg_attr(test, assert_instr(valignq, imm8 = 1))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm_maskz_alignr_epi64(k: __mmask8, a: __m128i, b: __m128i, imm8: i32) -> __m128i { + assert!(imm8 >= 0 && imm8 <= 255); + let imm8: i32 = imm8 % 4; + let r: i64x2 = match imm8 { + 0 => simd_shuffle2(a, b, [2, 3]), + 1 => simd_shuffle2(a, b, [3, 0]), + 2 => simd_shuffle2(a, b, [0, 1]), + _ => simd_shuffle2(a, b, [1, 2]), + }; + let zero = _mm_setzero_si128().as_i64x2(); + transmute(simd_select_bitmask(k, r, zero)) +} + /// Compute the bitwise AND of packed 32-bit integers in a and b, and store the results in dst. 
/// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_and_epi32&expand=272) @@ -41989,6 +43728,46 @@ mod tests { assert_eq_m512i(r, e); } + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_shuffle_epi32() { + let a = _mm256_set_epi32(1, 4, 5, 8, 9, 12, 13, 16); + let r = _mm256_mask_shuffle_epi32(a, 0, a, _MM_PERM_AADD); + assert_eq_m256i(r, a); + let r = _mm256_mask_shuffle_epi32(a, 0b11111111, a, _MM_PERM_AADD); + let e = _mm256_set_epi32(8, 8, 1, 1, 16, 16, 9, 9); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_shuffle_epi32() { + let a = _mm256_set_epi32(1, 4, 5, 8, 9, 12, 13, 16); + let r = _mm256_maskz_shuffle_epi32(0, a, _MM_PERM_AADD); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_shuffle_epi32(0b11111111, a, _MM_PERM_AADD); + let e = _mm256_set_epi32(8, 8, 1, 1, 16, 16, 9, 9); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_shuffle_epi32() { + let a = _mm_set_epi32(1, 4, 5, 8); + let r = _mm_mask_shuffle_epi32(a, 0, a, _MM_PERM_AADD); + assert_eq_m128i(r, a); + let r = _mm_mask_shuffle_epi32(a, 0b00001111, a, _MM_PERM_AADD); + let e = _mm_set_epi32(8, 8, 1, 1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_shuffle_epi32() { + let a = _mm_set_epi32(1, 4, 5, 8); + let r = _mm_maskz_shuffle_epi32(0, a, _MM_PERM_AADD); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_shuffle_epi32(0b00001111, a, _MM_PERM_AADD); + let e = _mm_set_epi32(8, 8, 1, 1); + assert_eq_m128i(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_shuffle_ps() { let a = _mm512_setr_ps( @@ -42038,11 +43817,55 @@ mod tests { assert_eq_m512(r, e); } + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_shuffle_ps() { + let a = _mm256_set_ps(1., 4., 5., 8., 9., 12., 13., 16.); + let b = _mm256_set_ps(2., 3., 6., 7., 10., 11., 14., 15.); + let r = _mm256_mask_shuffle_ps(a, 0, a, b, 0x0F); + assert_eq_m256(r, a); + let r = _mm256_mask_shuffle_ps(a, 0b11111111, a, b, 0x0F); + let e = _mm256_set_ps(7., 7., 1., 1., 15., 15., 9., 9.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_shuffle_ps() { + let a = _mm256_set_ps(1., 4., 5., 8., 9., 12., 13., 16.); + let b = _mm256_set_ps(2., 3., 6., 7., 10., 11., 14., 15.); + let r = _mm256_maskz_shuffle_ps(0, a, b, 0x0F); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_shuffle_ps(0b11111111, a, b, 0x0F); + let e = _mm256_set_ps(7., 7., 1., 1., 15., 15., 9., 9.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_shuffle_ps() { + let a = _mm_set_ps(1., 4., 5., 8.); + let b = _mm_set_ps(2., 3., 6., 7.); + let r = _mm_mask_shuffle_ps(a, 0, a, b, 0x0F); + assert_eq_m128(r, a); + let r = _mm_mask_shuffle_ps(a, 0b00001111, a, b, 0x0F); + let e = _mm_set_ps(7., 7., 1., 1.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_shuffle_ps() { + let a = _mm_set_ps(1., 4., 5., 8.); + let b = _mm_set_ps(2., 3., 6., 7.); + let r = _mm_maskz_shuffle_ps(0, a, b, 0x0F); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_maskz_shuffle_ps(0b00001111, a, b, 0x0F); + let e = _mm_set_ps(7., 7., 1., 1.); + assert_eq_m128(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_shuffle_i32x4() { let a = _mm512_setr_epi32(1, 4, 5, 
8, 9, 12, 13, 16, 1, 4, 5, 8, 9, 12, 13, 16); let b = _mm512_setr_epi32(2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15); - let r = _mm512_shuffle_i32x4(a, b, 0b00000000); + let r = _mm512_shuffle_i32x4(a, b, 0b0000); let e = _mm512_setr_epi32(1, 4, 5, 8, 1, 4, 5, 8, 2, 3, 6, 7, 2, 3, 6, 7); assert_eq_m512i(r, e); } @@ -42051,9 +43874,9 @@ mod tests { unsafe fn test_mm512_mask_shuffle_i32x4() { let a = _mm512_setr_epi32(1, 4, 5, 8, 9, 12, 13, 16, 1, 4, 5, 8, 9, 12, 13, 16); let b = _mm512_setr_epi32(2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15); - let r = _mm512_mask_shuffle_i32x4(a, 0, a, b, 0b00000000); + let r = _mm512_mask_shuffle_i32x4(a, 0, a, b, 0b0000); assert_eq_m512i(r, a); - let r = _mm512_mask_shuffle_i32x4(a, 0b11111111_11111111, a, b, 0b00000000); + let r = _mm512_mask_shuffle_i32x4(a, 0b11111111_11111111, a, b, 0b0000); let e = _mm512_setr_epi32(1, 4, 5, 8, 1, 4, 5, 8, 2, 3, 6, 7, 2, 3, 6, 7); assert_eq_m512i(r, e); } @@ -42062,13 +43885,44 @@ mod tests { unsafe fn test_mm512_maskz_shuffle_i32x4() { let a = _mm512_setr_epi32(1, 4, 5, 8, 9, 12, 13, 16, 1, 4, 5, 8, 9, 12, 13, 16); let b = _mm512_setr_epi32(2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15); - let r = _mm512_maskz_shuffle_i32x4(0, a, b, 0b00000000); + let r = _mm512_maskz_shuffle_i32x4(0, a, b, 0b0000); assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_shuffle_i32x4(0b00000000_11111111, a, b, 0b00000000); + let r = _mm512_maskz_shuffle_i32x4(0b00000000_11111111, a, b, 0b0000); let e = _mm512_setr_epi32(1, 4, 5, 8, 1, 4, 5, 8, 0, 0, 0, 0, 0, 0, 0, 0); assert_eq_m512i(r, e); } + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_shuffle_i32x4() { + let a = _mm256_set_epi32(1, 4, 5, 8, 9, 12, 13, 16); + let b = _mm256_set_epi32(2, 3, 6, 7, 10, 11, 14, 15); + let r = _mm256_shuffle_i32x4(a, b, 0b00); + let e = _mm256_set_epi32(10, 11, 14, 15, 9, 12, 13, 16); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_shuffle_i32x4() { + let a = _mm256_set_epi32(1, 4, 5, 8, 9, 12, 13, 16); + let b = _mm256_set_epi32(2, 3, 6, 7, 10, 11, 14, 15); + let r = _mm256_mask_shuffle_i32x4(a, 0, a, b, 0b00); + assert_eq_m256i(r, a); + let r = _mm256_mask_shuffle_i32x4(a, 0b11111111, a, b, 0b00); + let e = _mm256_set_epi32(10, 11, 14, 15, 9, 12, 13, 16); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_shuffle_i32x4() { + let a = _mm256_set_epi32(1, 4, 5, 8, 9, 12, 13, 16); + let b = _mm256_set_epi32(2, 3, 6, 7, 10, 11, 14, 15); + let r = _mm256_maskz_shuffle_i32x4(0, a, b, 0b00); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_shuffle_i32x4(0b11111111, a, b, 0b00); + let e = _mm256_set_epi32(10, 11, 14, 15, 9, 12, 13, 16); + assert_eq_m256i(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_shuffle_f32x4() { let a = _mm512_setr_ps( @@ -42118,6 +43972,37 @@ mod tests { assert_eq_m512(r, e); } + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_shuffle_f32x4() { + let a = _mm256_set_ps(1., 4., 5., 8., 9., 12., 13., 16.); + let b = _mm256_set_ps(2., 3., 6., 7., 10., 11., 14., 15.); + let r = _mm256_shuffle_f32x4(a, b, 0b00); + let e = _mm256_set_ps(10., 11., 14., 15., 9., 12., 13., 16.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_shuffle_f32x4() { + let a = _mm256_set_ps(1., 4., 5., 8., 9., 12., 13., 16.); + let b = _mm256_set_ps(2., 3., 6., 7., 10., 11., 14., 15.); + let 
r = _mm256_mask_shuffle_f32x4(a, 0, a, b, 0b00); + assert_eq_m256(r, a); + let r = _mm256_mask_shuffle_f32x4(a, 0b11111111, a, b, 0b00); + let e = _mm256_set_ps(10., 11., 14., 15., 9., 12., 13., 16.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_shuffle_f32x4() { + let a = _mm256_set_ps(1., 4., 5., 8., 9., 12., 13., 16.); + let b = _mm256_set_ps(2., 3., 6., 7., 10., 11., 14., 15.); + let r = _mm256_maskz_shuffle_f32x4(0, a, b, 0b00); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_shuffle_f32x4(0b11111111, a, b, 0b00); + let e = _mm256_set_ps(10., 11., 14., 15., 9., 12., 13., 16.); + assert_eq_m256(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_extractf32x4_ps() { let a = _mm512_setr_ps( @@ -42521,6 +44406,48 @@ mod tests { assert_eq_m512i(r, e); } + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_broadcastd_epi32() { + let src = _mm256_set1_epi32(20); + let a = _mm_set_epi32(17, 18, 19, 20); + let r = _mm256_mask_broadcastd_epi32(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm256_mask_broadcastd_epi32(src, 0b11111111, a); + let e = _mm256_set1_epi32(20); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_broadcastd_epi32() { + let a = _mm_set_epi32(17, 18, 19, 20); + let r = _mm256_maskz_broadcastd_epi32(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_broadcastd_epi32(0b11111111, a); + let e = _mm256_set1_epi32(20); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_broadcastd_epi32() { + let src = _mm_set1_epi32(20); + let a = _mm_set_epi32(17, 18, 19, 20); + let r = _mm_mask_broadcastd_epi32(src, 0, a); + assert_eq_m128i(r, src); + let r = _mm_mask_broadcastd_epi32(src, 0b00001111, a); + let e = _mm_set1_epi32(20); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_broadcastd_epi32() { + let a = _mm_set_epi32(17, 18, 19, 20); + let r = _mm_maskz_broadcastd_epi32(0, a); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_broadcastd_epi32(0b00001111, a); + let e = _mm_set1_epi32(20); + assert_eq_m128i(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_broadcastss_ps() { let a = _mm_set_ps(17., 18., 19., 20.); @@ -42552,6 +44479,48 @@ mod tests { assert_eq_m512(r, e); } + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_broadcastss_ps() { + let src = _mm256_set1_ps(20.); + let a = _mm_set_ps(17., 18., 19., 20.); + let r = _mm256_mask_broadcastss_ps(src, 0, a); + assert_eq_m256(r, src); + let r = _mm256_mask_broadcastss_ps(src, 0b11111111, a); + let e = _mm256_set1_ps(20.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_broadcastss_ps() { + let a = _mm_set_ps(17., 18., 19., 20.); + let r = _mm256_maskz_broadcastss_ps(0, a); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_broadcastss_ps(0b11111111, a); + let e = _mm256_set1_ps(20.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_broadcastss_ps() { + let src = _mm_set1_ps(20.); + let a = _mm_set_ps(17., 18., 19., 20.); + let r = _mm_mask_broadcastss_ps(src, 0, a); + assert_eq_m128(r, src); + let r = _mm_mask_broadcastss_ps(src, 0b00001111, a); + let e = _mm_set1_ps(20.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn 
test_mm_maskz_broadcastss_ps() { + let a = _mm_set_ps(17., 18., 19., 20.); + let r = _mm_maskz_broadcastss_ps(0, a); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_maskz_broadcastss_ps(0b00001111, a); + let e = _mm_set1_ps(20.); + assert_eq_m128(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_broadcast_i32x4() { let a = _mm_set_epi32(17, 18, 19, 20); @@ -42585,6 +44554,35 @@ mod tests { assert_eq_m512i(r, e); } + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_broadcast_i32x4() { + let a = _mm_set_epi32(17, 18, 19, 20); + let r = _mm256_broadcast_i32x4(a); + let e = _mm256_set_epi32(17, 18, 19, 20, 17, 18, 19, 20); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_broadcast_i32x4() { + let src = _mm256_set1_epi32(20); + let a = _mm_set_epi32(17, 18, 19, 20); + let r = _mm256_mask_broadcast_i32x4(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm256_mask_broadcast_i32x4(src, 0b11111111, a); + let e = _mm256_set_epi32(17, 18, 19, 20, 17, 18, 19, 20); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_broadcast_i32x4() { + let a = _mm_set_epi32(17, 18, 19, 20); + let r = _mm256_maskz_broadcast_i32x4(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_broadcast_i32x4(0b11111111, a); + let e = _mm256_set_epi32(17, 18, 19, 20, 17, 18, 19, 20); + assert_eq_m256i(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_broadcast_f32x4() { let a = _mm_set_ps(17., 18., 19., 20.); @@ -42620,6 +44618,35 @@ mod tests { assert_eq_m512(r, e); } + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_broadcast_f32x4() { + let a = _mm_set_ps(17., 18., 19., 20.); + let r = _mm256_broadcast_f32x4(a); + let e = _mm256_set_ps(17., 18., 19., 20., 17., 18., 19., 20.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_broadcast_f32x4() { + let src = _mm256_set1_ps(20.); + let a = _mm_set_ps(17., 18., 19., 20.); + let r = _mm256_mask_broadcast_f32x4(src, 0, a); + assert_eq_m256(r, src); + let r = _mm256_mask_broadcast_f32x4(src, 0b11111111, a); + let e = _mm256_set_ps(17., 18., 19., 20., 17., 18., 19., 20.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_broadcast_f32x4() { + let a = _mm_set_ps(17., 18., 19., 20.); + let r = _mm256_maskz_broadcast_f32x4(0, a); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_broadcast_f32x4(0b11111111, a); + let e = _mm256_set_ps(17., 18., 19., 20., 17., 18., 19., 20.); + assert_eq_m256(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_mask_blend_epi32() { let a = _mm512_set1_epi32(1); @@ -42629,6 +44656,24 @@ mod tests { assert_eq_m512i(r, e); } + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_blend_epi32() { + let a = _mm256_set1_epi32(1); + let b = _mm256_set1_epi32(2); + let r = _mm256_mask_blend_epi32(0b11111111, a, b); + let e = _mm256_set1_epi32(2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_blend_epi32() { + let a = _mm_set1_epi32(1); + let b = _mm_set1_epi32(2); + let r = _mm_mask_blend_epi32(0b00001111, a, b); + let e = _mm_set1_epi32(2); + assert_eq_m128i(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_mask_blend_ps() { let a = _mm512_set1_ps(1.); @@ -42640,6 +44685,24 @@ mod tests { assert_eq_m512(r, e); } + #[simd_test(enable = 
"avx512f,avx512vl")] + unsafe fn test_mm256_mask_blend_ps() { + let a = _mm256_set1_ps(1.); + let b = _mm256_set1_ps(2.); + let r = _mm256_mask_blend_ps(0b11111111, a, b); + let e = _mm256_set1_ps(2.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_blend_ps() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(2.); + let r = _mm_mask_blend_ps(0b00001111, a, b); + let e = _mm_set1_ps(2.); + assert_eq_m128(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_unpackhi_epi32() { let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); @@ -42677,6 +44740,50 @@ mod tests { assert_eq_m512i(r, e); } + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_unpackhi_epi32() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm256_set_epi32(17, 18, 19, 20, 21, 22, 23, 24); + let r = _mm256_mask_unpackhi_epi32(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_unpackhi_epi32(a, 0b11111111, a, b); + let e = _mm256_set_epi32(17, 1, 18, 2, 21, 5, 22, 6); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_unpackhi_epi32() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm256_set_epi32(17, 18, 19, 20, 21, 22, 23, 24); + let r = _mm256_maskz_unpackhi_epi32(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_unpackhi_epi32(0b11111111, a, b); + let e = _mm256_set_epi32(17, 1, 18, 2, 21, 5, 22, 6); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_unpackhi_epi32() { + let a = _mm_set_epi32(1, 2, 3, 4); + let b = _mm_set_epi32(17, 18, 19, 20); + let r = _mm_mask_unpackhi_epi32(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_unpackhi_epi32(a, 0b00001111, a, b); + let e = _mm_set_epi32(17, 1, 18, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_unpackhi_epi32() { + let a = _mm_set_epi32(1, 2, 3, 4); + let b = _mm_set_epi32(17, 18, 19, 20); + let r = _mm_maskz_unpackhi_epi32(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_unpackhi_epi32(0b00001111, a, b); + let e = _mm_set_epi32(17, 1, 18, 2); + assert_eq_m128i(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_unpackhi_ps() { let a = _mm512_set_ps( @@ -42726,6 +44833,50 @@ mod tests { assert_eq_m512(r, e); } + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_unpackhi_ps() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm256_set_ps(17., 18., 19., 20., 21., 22., 23., 24.); + let r = _mm256_mask_unpackhi_ps(a, 0, a, b); + assert_eq_m256(r, a); + let r = _mm256_mask_unpackhi_ps(a, 0b11111111, a, b); + let e = _mm256_set_ps(17., 1., 18., 2., 21., 5., 22., 6.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_unpackhi_ps() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm256_set_ps(17., 18., 19., 20., 21., 22., 23., 24.); + let r = _mm256_maskz_unpackhi_ps(0, a, b); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_unpackhi_ps(0b11111111, a, b); + let e = _mm256_set_ps(17., 1., 18., 2., 21., 5., 22., 6.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_unpackhi_ps() { + let a = _mm_set_ps(1., 2., 3., 4.); + let b = _mm_set_ps(17., 18., 19., 20.); + let r = _mm_mask_unpackhi_ps(a, 0, a, b); + assert_eq_m128(r, a); 
+ let r = _mm_mask_unpackhi_ps(a, 0b00001111, a, b); + let e = _mm_set_ps(17., 1., 18., 2.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_unpackhi_ps() { + let a = _mm_set_ps(1., 2., 3., 4.); + let b = _mm_set_ps(17., 18., 19., 20.); + let r = _mm_maskz_unpackhi_ps(0, a, b); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_maskz_unpackhi_ps(0b00001111, a, b); + let e = _mm_set_ps(17., 1., 18., 2.); + assert_eq_m128(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_unpacklo_epi32() { let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); @@ -42763,6 +44914,50 @@ mod tests { assert_eq_m512i(r, e); } + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_unpacklo_epi32() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm256_set_epi32(17, 18, 19, 20, 21, 22, 23, 24); + let r = _mm256_mask_unpacklo_epi32(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_unpacklo_epi32(a, 0b11111111, a, b); + let e = _mm256_set_epi32(19, 3, 20, 4, 23, 7, 24, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_unpacklo_epi32() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm256_set_epi32(17, 18, 19, 20, 21, 22, 23, 24); + let r = _mm256_maskz_unpacklo_epi32(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_unpacklo_epi32(0b11111111, a, b); + let e = _mm256_set_epi32(19, 3, 20, 4, 23, 7, 24, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_unpacklo_epi32() { + let a = _mm_set_epi32(1, 2, 3, 4); + let b = _mm_set_epi32(17, 18, 19, 20); + let r = _mm_mask_unpacklo_epi32(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_unpacklo_epi32(a, 0b00001111, a, b); + let e = _mm_set_epi32(19, 3, 20, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_unpacklo_epi32() { + let a = _mm_set_epi32(1, 2, 3, 4); + let b = _mm_set_epi32(17, 18, 19, 20); + let r = _mm_maskz_unpacklo_epi32(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_unpacklo_epi32(0b00001111, a, b); + let e = _mm_set_epi32(19, 3, 20, 4); + assert_eq_m128i(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_unpacklo_ps() { let a = _mm512_set_ps( @@ -42812,6 +45007,50 @@ mod tests { assert_eq_m512(r, e); } + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_unpacklo_ps() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm256_set_ps(17., 18., 19., 20., 21., 22., 23., 24.); + let r = _mm256_mask_unpacklo_ps(a, 0, a, b); + assert_eq_m256(r, a); + let r = _mm256_mask_unpacklo_ps(a, 0b11111111, a, b); + let e = _mm256_set_ps(19., 3., 20., 4., 23., 7., 24., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_unpacklo_ps() { + let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm256_set_ps(17., 18., 19., 20., 21., 22., 23., 24.); + let r = _mm256_maskz_unpacklo_ps(0, a, b); + assert_eq_m256(r, _mm256_setzero_ps()); + let r = _mm256_maskz_unpacklo_ps(0b11111111, a, b); + let e = _mm256_set_ps(19., 3., 20., 4., 23., 7., 24., 8.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_unpacklo_ps() { + let a = _mm_set_ps(1., 2., 3., 4.); + let b = _mm_set_ps(17., 18., 19., 20.); + let r = _mm_mask_unpacklo_ps(a, 0, a, b); + 
assert_eq_m128(r, a); + let r = _mm_mask_unpacklo_ps(a, 0b00001111, a, b); + let e = _mm_set_ps(19., 3., 20., 4.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_unpacklo_ps() { + let a = _mm_set_ps(1., 2., 3., 4.); + let b = _mm_set_ps(17., 18., 19., 20.); + let r = _mm_maskz_unpacklo_ps(0, a, b); + assert_eq_m128(r, _mm_setzero_ps()); + let r = _mm_maskz_unpacklo_ps(0b00001111, a, b); + let e = _mm_set_ps(19., 3., 20., 4.); + assert_eq_m128(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_alignr_epi32() { let a = _mm512_set_epi32(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); @@ -42857,6 +45096,72 @@ mod tests { assert_eq_m512i(r, e); } + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_alignr_epi32() { + let a = _mm256_set_epi32(8, 7, 6, 5, 4, 3, 2, 1); + let b = _mm256_set_epi32(16, 15, 14, 13, 12, 11, 10, 9); + let r = _mm256_alignr_epi32(a, b, 0); + assert_eq_m256i(r, b); + let r = _mm256_alignr_epi32(a, b, 1); + let e = _mm256_set_epi32(1, 16, 15, 14, 13, 12, 11, 10); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_alignr_epi32() { + let a = _mm256_set_epi32(8, 7, 6, 5, 4, 3, 2, 1); + let b = _mm256_set_epi32(16, 15, 14, 13, 12, 11, 10, 9); + let r = _mm256_mask_alignr_epi32(a, 0, a, b, 1); + assert_eq_m256i(r, a); + let r = _mm256_mask_alignr_epi32(a, 0b11111111, a, b, 1); + let e = _mm256_set_epi32(1, 16, 15, 14, 13, 12, 11, 10); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_alignr_epi32() { + let a = _mm256_set_epi32(8, 7, 6, 5, 4, 3, 2, 1); + let b = _mm256_set_epi32(16, 15, 14, 13, 12, 11, 10, 9); + let r = _mm256_maskz_alignr_epi32(0, a, b, 1); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_alignr_epi32(0b11111111, a, b, 1); + let e = _mm256_set_epi32(1, 16, 15, 14, 13, 12, 11, 10); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_alignr_epi32() { + let a = _mm_set_epi32(4, 3, 2, 1); + let b = _mm_set_epi32(8, 7, 6, 5); + let r = _mm_alignr_epi32(a, b, 0); + assert_eq_m128i(r, b); + let r = _mm_alignr_epi32(a, b, 1); + let e = _mm_set_epi32(1, 8, 7, 6); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_alignr_epi32() { + let a = _mm_set_epi32(4, 3, 2, 1); + let b = _mm_set_epi32(8, 7, 6, 5); + let r = _mm_mask_alignr_epi32(a, 0, a, b, 1); + assert_eq_m128i(r, a); + let r = _mm_mask_alignr_epi32(a, 0b00001111, a, b, 1); + let e = _mm_set_epi32(1, 8, 7, 6); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_alignr_epi32() { + let a = _mm_set_epi32(4, 3, 2, 1); + let b = _mm_set_epi32(8, 7, 6, 5); + let r = _mm_maskz_alignr_epi32(0, a, b, 1); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_alignr_epi32(0b00001111, a, b, 1); + let e = _mm_set_epi32(1, 8, 7, 6); + assert_eq_m128i(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_and_epi32() { #[rustfmt::skip] diff --git a/library/stdarch/crates/core_arch/src/x86_64/avx512f.rs b/library/stdarch/crates/core_arch/src/x86_64/avx512f.rs index ce1dbbcd9d78..69b497bca8a5 100644 --- a/library/stdarch/crates/core_arch/src/x86_64/avx512f.rs +++ b/library/stdarch/crates/core_arch/src/x86_64/avx512f.rs @@ -7612,6 +7612,50 @@ mod tests { assert_eq_m512d(r, e); } + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn 
test_mm256_mask_shuffle_pd() { + let a = _mm256_set_pd(1., 4., 5., 8.); + let b = _mm256_set_pd(2., 3., 6., 7.); + let r = _mm256_mask_shuffle_pd(a, 0, a, b, 1 << 0 | 1 << 1 | 1 << 2 | 1 << 3); + assert_eq_m256d(r, a); + let r = _mm256_mask_shuffle_pd(a, 0b00001111, a, b, 1 << 0 | 1 << 1 | 1 << 2 | 1 << 3); + let e = _mm256_set_pd(2., 1., 6., 5.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_shuffle_pd() { + let a = _mm256_set_pd(1., 4., 5., 8.); + let b = _mm256_set_pd(2., 3., 6., 7.); + let r = _mm256_maskz_shuffle_pd(0, a, b, 1 << 0 | 1 << 1 | 1 << 2 | 1 << 3); + assert_eq_m256d(r, _mm256_setzero_pd()); + let r = _mm256_maskz_shuffle_pd(0b00001111, a, b, 1 << 0 | 1 << 1 | 1 << 2 | 1 << 3); + let e = _mm256_set_pd(2., 1., 6., 5.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_shuffle_pd() { + let a = _mm_set_pd(1., 4.); + let b = _mm_set_pd(2., 3.); + let r = _mm_mask_shuffle_pd(a, 0, a, b, 1 << 0 | 1 << 1); + assert_eq_m128d(r, a); + let r = _mm_mask_shuffle_pd(a, 0b00000011, a, b, 1 << 0 | 1 << 1); + let e = _mm_set_pd(2., 1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_shuffle_pd() { + let a = _mm_set_pd(1., 4.); + let b = _mm_set_pd(2., 3.); + let r = _mm_maskz_shuffle_pd(0, a, b, 1 << 0 | 1 << 1); + assert_eq_m128d(r, _mm_setzero_pd()); + let r = _mm_maskz_shuffle_pd(0b00000011, a, b, 1 << 0 | 1 << 1); + let e = _mm_set_pd(2., 1.); + assert_eq_m128d(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_shuffle_i64x2() { let a = _mm512_setr_epi64(1, 4, 5, 8, 9, 12, 13, 16); @@ -7643,6 +7687,37 @@ mod tests { assert_eq_m512i(r, e); } + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_shuffle_i64x2() { + let a = _mm256_set_epi64x(1, 4, 5, 8); + let b = _mm256_set_epi64x(2, 3, 6, 7); + let r = _mm256_shuffle_i64x2(a, b, 0b00); + let e = _mm256_set_epi64x(6, 7, 5, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_shuffle_i64x2() { + let a = _mm256_set_epi64x(1, 4, 5, 8); + let b = _mm256_set_epi64x(2, 3, 6, 7); + let r = _mm256_mask_shuffle_i64x2(a, 0, a, b, 0b00); + assert_eq_m256i(r, a); + let r = _mm256_mask_shuffle_i64x2(a, 0b00001111, a, b, 0b00); + let e = _mm256_set_epi64x(6, 7, 5, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_shuffle_i64x2() { + let a = _mm256_set_epi64x(1, 4, 5, 8); + let b = _mm256_set_epi64x(2, 3, 6, 7); + let r = _mm256_maskz_shuffle_i64x2(0, a, b, 0b00); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_shuffle_i64x2(0b00001111, a, b, 0b00); + let e = _mm256_set_epi64x(6, 7, 5, 8); + assert_eq_m256i(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_shuffle_f64x2() { let a = _mm512_setr_pd(1., 4., 5., 8., 9., 12., 13., 16.); @@ -7674,6 +7749,37 @@ mod tests { assert_eq_m512d(r, e); } + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_shuffle_f64x2() { + let a = _mm256_set_pd(1., 4., 5., 8.); + let b = _mm256_set_pd(2., 3., 6., 7.); + let r = _mm256_shuffle_f64x2(a, b, 0b00); + let e = _mm256_set_pd(6., 7., 5., 8.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_shuffle_f64x2() { + let a = _mm256_set_pd(1., 4., 5., 8.); + let b = _mm256_set_pd(2., 3., 6., 7.); + let r = _mm256_mask_shuffle_f64x2(a, 0, a, b, 0b00); + assert_eq_m256d(r, a); + 
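+        // With every mask bit set the result is the plain shuffle; imm8 = 0b00
+        // puts the low 128-bit half of `a` in the low half of the result and
+        // the low half of `b` in the high half: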
+        let r = _mm256_mask_shuffle_f64x2(a, 0b00001111, a, b, 0b00);
+        let e = _mm256_set_pd(6., 7., 5., 8.);
+        assert_eq_m256d(r, e);
+    }
+
+    #[simd_test(enable = "avx512f,avx512vl")]
+    unsafe fn test_mm256_maskz_shuffle_f64x2() {
+        let a = _mm256_set_pd(1., 4., 5., 8.);
+        let b = _mm256_set_pd(2., 3., 6., 7.);
+        let r = _mm256_maskz_shuffle_f64x2(0, a, b, 0b00);
+        assert_eq_m256d(r, _mm256_setzero_pd());
+        let r = _mm256_maskz_shuffle_f64x2(0b00001111, a, b, 0b00);
+        let e = _mm256_set_pd(6., 7., 5., 8.);
+        assert_eq_m256d(r, e);
+    }
+
     #[simd_test(enable = "avx512f")]
     unsafe fn test_mm512_movedup_pd() {
         let a = _mm512_setr_pd(1., 2., 3., 4., 5., 6., 7., 8.);
@@ -7969,9 +8075,51 @@ mod tests {
         assert_eq_m512i(r, e);
     }

+    #[simd_test(enable = "avx512f,avx512vl")]
+    unsafe fn test_mm256_mask_broadcastq_epi64() {
+        let src = _mm256_set1_epi64x(18);
+        let a = _mm_set_epi64x(17, 18);
+        let r = _mm256_mask_broadcastq_epi64(src, 0, a);
+        assert_eq_m256i(r, src);
+        let r = _mm256_mask_broadcastq_epi64(src, 0b00001111, a);
+        let e = _mm256_set1_epi64x(18);
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "avx512f,avx512vl")]
+    unsafe fn test_mm256_maskz_broadcastq_epi64() {
+        let a = _mm_set_epi64x(17, 18);
+        let r = _mm256_maskz_broadcastq_epi64(0, a);
+        assert_eq_m256i(r, _mm256_setzero_si256());
+        let r = _mm256_maskz_broadcastq_epi64(0b00001111, a);
+        let e = _mm256_set1_epi64x(18);
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "avx512f,avx512vl")]
+    unsafe fn test_mm_mask_broadcastq_epi64() {
+        let src = _mm_set1_epi64x(18);
+        let a = _mm_set_epi64x(17, 18);
+        let r = _mm_mask_broadcastq_epi64(src, 0, a);
+        assert_eq_m128i(r, src);
+        let r = _mm_mask_broadcastq_epi64(src, 0b00000011, a);
+        let e = _mm_set1_epi64x(18);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "avx512f,avx512vl")]
+    unsafe fn test_mm_maskz_broadcastq_epi64() {
+        let a = _mm_set_epi64x(17, 18);
+        let r = _mm_maskz_broadcastq_epi64(0, a);
+        assert_eq_m128i(r, _mm_setzero_si128());
+        let r = _mm_maskz_broadcastq_epi64(0b00000011, a);
+        let e = _mm_set1_epi64x(18);
+        assert_eq_m128i(r, e);
+    }
+
     #[simd_test(enable = "avx512f")]
     unsafe fn test_mm512_broadcastsd_pd() {
-        let a = _mm_setr_pd(17., 18.);
+        let a = _mm_set_pd(17., 18.);
         let r = _mm512_broadcastsd_pd(a);
         let e = _mm512_set1_pd(18.);
         assert_eq_m512d(r, e);
@@ -7980,17 +8128,17 @@ mod tests {
     #[simd_test(enable = "avx512f")]
     unsafe fn test_mm512_mask_broadcastsd_pd() {
         let src = _mm512_set1_pd(18.);
-        let a = _mm_setr_pd(17., 18.);
+        let a = _mm_set_pd(17., 18.);
         let r = _mm512_mask_broadcastsd_pd(src, 0, a);
         assert_eq_m512d(r, src);
-        let r = _mm512_mask_broadcastsd_pd(src, 0b01111111, a);
+        let r = _mm512_mask_broadcastsd_pd(src, 0b11111111, a);
         let e = _mm512_set1_pd(18.);
         assert_eq_m512d(r, e);
     }

     #[simd_test(enable = "avx512f")]
     unsafe fn test_mm512_maskz_broadcastsd_pd() {
-        let a = _mm_setr_pd(17., 18.);
+        let a = _mm_set_pd(17., 18.);
         let r = _mm512_maskz_broadcastsd_pd(0, a);
         assert_eq_m512d(r, _mm512_setzero_pd());
-        let r = _mm512_maskz_broadcastsd_pd(0b00001111, a);
+        let r = _mm512_maskz_broadcastsd_pd(0b11111111, a);
@@ -7998,6 +8146,27 @@ mod tests {
         assert_eq_m512d(r, e);
     }

+    #[simd_test(enable = "avx512f,avx512vl")]
+    unsafe fn test_mm256_mask_broadcastsd_pd() {
+        let src = _mm256_set1_pd(18.);
+        let a = _mm_set_pd(17., 18.);
+        let r = _mm256_mask_broadcastsd_pd(src, 0, a);
+        assert_eq_m256d(r, src);
+        let r = _mm256_mask_broadcastsd_pd(src, 0b00001111, a);
+        let e = _mm256_set1_pd(18.);
+        assert_eq_m256d(r, e);
+    }
+
+    #[simd_test(enable = "avx512f,avx512vl")]
+    unsafe fn 
test_mm256_maskz_broadcastsd_pd() { + let a = _mm_set_pd(17., 18.); + let r = _mm256_maskz_broadcastsd_pd(0, a); + assert_eq_m256d(r, _mm256_setzero_pd()); + let r = _mm256_maskz_broadcastsd_pd(0b00001111, a); + let e = _mm256_set1_pd(18.); + assert_eq_m256d(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_broadcast_i64x4() { let a = _mm256_set_epi64x(17, 18, 19, 20); @@ -8065,6 +8234,24 @@ mod tests { assert_eq_m512i(r, e); } + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_blend_epi64() { + let a = _mm256_set1_epi64x(1); + let b = _mm256_set1_epi64x(2); + let r = _mm256_mask_blend_epi64(0b00001111, a, b); + let e = _mm256_set1_epi64x(2); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_blend_epi64() { + let a = _mm_set1_epi64x(1); + let b = _mm_set1_epi64x(2); + let r = _mm_mask_blend_epi64(0b00000011, a, b); + let e = _mm_set1_epi64x(2); + assert_eq_m128i(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_mask_blend_pd() { let a = _mm512_set1_pd(1.); @@ -8074,6 +8261,24 @@ mod tests { assert_eq_m512d(r, e); } + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_blend_pd() { + let a = _mm256_set1_pd(1.); + let b = _mm256_set1_pd(2.); + let r = _mm256_mask_blend_pd(0b00001111, a, b); + let e = _mm256_set1_pd(2.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_blend_pd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(2.); + let r = _mm_mask_blend_pd(0b00000011, a, b); + let e = _mm_set1_pd(2.); + assert_eq_m128d(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_unpackhi_epi64() { let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); @@ -8105,6 +8310,50 @@ mod tests { assert_eq_m512i(r, e); } + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_unpackhi_epi64() { + let a = _mm256_set_epi64x(1, 2, 3, 4); + let b = _mm256_set_epi64x(17, 18, 19, 20); + let r = _mm256_mask_unpackhi_epi64(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_unpackhi_epi64(a, 0b00001111, a, b); + let e = _mm256_set_epi64x(17, 1, 19, 3); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_unpackhi_epi64() { + let a = _mm256_set_epi64x(1, 2, 3, 4); + let b = _mm256_set_epi64x(17, 18, 19, 20); + let r = _mm256_maskz_unpackhi_epi64(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_unpackhi_epi64(0b00001111, a, b); + let e = _mm256_set_epi64x(17, 1, 19, 3); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_unpackhi_epi64() { + let a = _mm_set_epi64x(1, 2); + let b = _mm_set_epi64x(17, 18); + let r = _mm_mask_unpackhi_epi64(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_unpackhi_epi64(a, 0b00000011, a, b); + let e = _mm_set_epi64x(17, 1); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_unpackhi_epi64() { + let a = _mm_set_epi64x(1, 2); + let b = _mm_set_epi64x(17, 18); + let r = _mm_maskz_unpackhi_epi64(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_unpackhi_epi64(0b00000011, a, b); + let e = _mm_set_epi64x(17, 1); + assert_eq_m128i(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_unpackhi_pd() { let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); @@ -8136,6 +8385,50 @@ mod tests { assert_eq_m512d(r, e); } + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn 
test_mm256_mask_unpackhi_pd() { + let a = _mm256_set_pd(1., 2., 3., 4.); + let b = _mm256_set_pd(17., 18., 19., 20.); + let r = _mm256_mask_unpackhi_pd(a, 0, a, b); + assert_eq_m256d(r, a); + let r = _mm256_mask_unpackhi_pd(a, 0b00001111, a, b); + let e = _mm256_set_pd(17., 1., 19., 3.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_unpackhi_pd() { + let a = _mm256_set_pd(1., 2., 3., 4.); + let b = _mm256_set_pd(17., 18., 19., 20.); + let r = _mm256_maskz_unpackhi_pd(0, a, b); + assert_eq_m256d(r, _mm256_setzero_pd()); + let r = _mm256_maskz_unpackhi_pd(0b00001111, a, b); + let e = _mm256_set_pd(17., 1., 19., 3.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_unpackhi_pd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(17., 18.); + let r = _mm_mask_unpackhi_pd(a, 0, a, b); + assert_eq_m128d(r, a); + let r = _mm_mask_unpackhi_pd(a, 0b00000011, a, b); + let e = _mm_set_pd(17., 1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_unpackhi_pd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(17., 18.); + let r = _mm_maskz_unpackhi_pd(0, a, b); + assert_eq_m128d(r, _mm_setzero_pd()); + let r = _mm_maskz_unpackhi_pd(0b00000011, a, b); + let e = _mm_set_pd(17., 1.); + assert_eq_m128d(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_unpacklo_epi64() { let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); @@ -8167,6 +8460,50 @@ mod tests { assert_eq_m512i(r, e); } + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_unpacklo_epi64() { + let a = _mm256_set_epi64x(1, 2, 3, 4); + let b = _mm256_set_epi64x(17, 18, 19, 20); + let r = _mm256_mask_unpacklo_epi64(a, 0, a, b); + assert_eq_m256i(r, a); + let r = _mm256_mask_unpacklo_epi64(a, 0b00001111, a, b); + let e = _mm256_set_epi64x(18, 2, 20, 4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_unpacklo_epi64() { + let a = _mm256_set_epi64x(1, 2, 3, 4); + let b = _mm256_set_epi64x(17, 18, 19, 20); + let r = _mm256_maskz_unpacklo_epi64(0, a, b); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_unpacklo_epi64(0b00001111, a, b); + let e = _mm256_set_epi64x(18, 2, 20, 4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_unpacklo_epi64() { + let a = _mm_set_epi64x(1, 2); + let b = _mm_set_epi64x(17, 18); + let r = _mm_mask_unpacklo_epi64(a, 0, a, b); + assert_eq_m128i(r, a); + let r = _mm_mask_unpacklo_epi64(a, 0b00000011, a, b); + let e = _mm_set_epi64x(18, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_unpacklo_epi64() { + let a = _mm_set_epi64x(1, 2); + let b = _mm_set_epi64x(17, 18); + let r = _mm_maskz_unpacklo_epi64(0, a, b); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_unpacklo_epi64(0b00000011, a, b); + let e = _mm_set_epi64x(18, 2); + assert_eq_m128i(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_unpacklo_pd() { let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); @@ -8198,6 +8535,50 @@ mod tests { assert_eq_m512d(r, e); } + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_unpacklo_pd() { + let a = _mm256_set_pd(1., 2., 3., 4.); + let b = _mm256_set_pd(17., 18., 19., 20.); + let r = _mm256_mask_unpacklo_pd(a, 0, a, b); + assert_eq_m256d(r, a); + let r = _mm256_mask_unpacklo_pd(a, 0b00001111, a, b); + 
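+        // unpacklo_pd interleaves the low f64 of `a` and `b` within each
+        // 128-bit lane, and the all-ones mask keeps every interleaved element: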
let e = _mm256_set_pd(18., 2., 20., 4.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_unpacklo_pd() { + let a = _mm256_set_pd(1., 2., 3., 4.); + let b = _mm256_set_pd(17., 18., 19., 20.); + let r = _mm256_maskz_unpacklo_pd(0, a, b); + assert_eq_m256d(r, _mm256_setzero_pd()); + let r = _mm256_maskz_unpacklo_pd(0b00001111, a, b); + let e = _mm256_set_pd(18., 2., 20., 4.); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_unpacklo_pd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(17., 18.); + let r = _mm_mask_unpacklo_pd(a, 0, a, b); + assert_eq_m128d(r, a); + let r = _mm_mask_unpacklo_pd(a, 0b00000011, a, b); + let e = _mm_set_pd(18., 2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_unpacklo_pd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(17., 18.); + let r = _mm_maskz_unpacklo_pd(0, a, b); + assert_eq_m128d(r, _mm_setzero_pd()); + let r = _mm_maskz_unpacklo_pd(0b00000011, a, b); + let e = _mm_set_pd(18., 2.); + assert_eq_m128d(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_alignr_epi64() { let a = _mm512_set_epi64(8, 7, 6, 5, 4, 3, 2, 1); @@ -8233,6 +8614,71 @@ mod tests { assert_eq_m512i(r, e); } + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_alignr_epi64() { + let a = _mm256_set_epi64x(4, 3, 2, 1); + let b = _mm256_set_epi64x(8, 7, 6, 5); + let r = _mm256_alignr_epi64(a, b, 0); + let e = _mm256_set_epi64x(8, 7, 6, 5); + assert_eq_m256i(r, e); + let r = _mm256_alignr_epi64(a, b, 6); + let e = _mm256_set_epi64x(6, 5, 4, 3); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_alignr_epi64() { + let a = _mm256_set_epi64x(4, 3, 2, 1); + let b = _mm256_set_epi64x(8, 7, 6, 5); + let r = _mm256_mask_alignr_epi64(a, 0, a, b, 1); + assert_eq_m256i(r, a); + let r = _mm256_mask_alignr_epi64(a, 0b00001111, a, b, 0); + let e = _mm256_set_epi64x(8, 7, 6, 5); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_alignr_epi64() { + let a = _mm256_set_epi64x(4, 3, 2, 1); + let b = _mm256_set_epi64x(8, 7, 6, 5); + let r = _mm256_maskz_alignr_epi64(0, a, b, 1); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm256_maskz_alignr_epi64(0b00001111, a, b, 0); + let e = _mm256_set_epi64x(8, 7, 6, 5); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_alignr_epi64() { + let a = _mm_set_epi64x(2, 1); + let b = _mm_set_epi64x(4, 3); + let r = _mm_alignr_epi64(a, b, 0); + let e = _mm_set_epi64x(4, 3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_alignr_epi64() { + let a = _mm_set_epi64x(2, 1); + let b = _mm_set_epi64x(4, 3); + let r = _mm_mask_alignr_epi64(a, 0, a, b, 1); + assert_eq_m128i(r, a); + let r = _mm_mask_alignr_epi64(a, 0b00000011, a, b, 0); + let e = _mm_set_epi64x(4, 3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_alignr_epi64() { + let a = _mm_set_epi64x(2, 1); + let b = _mm_set_epi64x(4, 3); + let r = _mm_maskz_alignr_epi64(0, a, b, 1); + assert_eq_m128i(r, _mm_setzero_si128()); + let r = _mm_maskz_alignr_epi64(0b00000011, a, b, 0); + let e = _mm_set_epi64x(4, 3); + assert_eq_m128i(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_and_epi64() { let a = _mm512_set_epi64(1 << 0 | 1 << 15, 0, 0, 0, 0, 0, 0, 1 
<< 1 | 1 << 2 | 1 << 3);