Avx (#90)

* avx: _mm256_andnot_pd, _mm256_andnot_ps * avx: _mm256_blendv_pd * avx: _mm256_blend_pd with no assert_instr With assert_instr: too many instructions in the disassembly * avx: _mm256_blendv_ps * avx: _mm256_hadd_pd * avx: _mm256_hadd_ps * avx: _mm256_hsub_pd * avx: _mm256_hsub_ps * avx: _mm256_xor_pd * avx: _mm256_xor_ps * avx: _mm256_cvtepi32_pd * avx: _mm256_cvtepi32_ps * avx: _mm256_cvtpd_ps * avx: _mm256_cvtps_epi32 * avx: _mm256_cvtps_pd * avx: _mm256_cvttpd_epi32 * avx: _mm256_cvtpd_epi32 * avx: replace simd_cast by proper instrunction * avx: _mm256_cvttps_epi32 * avx: _mm256_extractf128_ps, _mm256_undefined_ps * avx: _mm256_extractf128_pd, _mm256_undefined_pd * avx: _mm256_extractf128_si256, _mm256_undefined_si256 * avx: _mm256_extract_epi8 * avx: _mm256_extract_epi16 * avx: _mm256_extract_epi32 * avx: _mm256_extract_epi64 * avx: _mm256_zeroall * avx: _mm256_zeroupper * avx: _mm256_permutevar_ps * avx: _mm_permutevar_ps * avx: replace simd_cast by as_* * avx: _mm256_permute_ps * avx: _mm256_dp_ps * avx: _mm256_shuffle_pd * avx: _mm256_shuffle_pd, wrong instruction generated * implement _mm256_hadd_ps and _mm256_hadd_pd * avx: implement _mm256_hsub_pd and _mm256_hsub_ps * assert_instr: raise the limit up to 30 instructions
2017-10-05 20:42:29 +02:00 · 2017-10-05 20:42:29 +02:00 · ee0f165e8e
commit ee0f165e8e
parent b421e9210c
2 changed files with 791 additions and 1 deletions
--- a/library/stdarch/src/x86/avx.rs
+++ b/library/stdarch/src/x86/avx.rs
@ -3,6 +3,8 @@ use std::mem;
 #[cfg(test)]
 use stdsimd_test::assert_instr;

+use simd_llvm::{simd_cast, simd_shuffle2, simd_shuffle4, simd_shuffle8};
+use v128::{f32x4, f64x2, i32x4, i64x2};
 use v256::*;

 /// Add packed double-precision (64-bit) floating-point elements
@ -68,6 +70,71 @@ pub unsafe fn _mm256_or_ps(a: f32x8, b: f32x8) -> f32x8 {
    mem::transmute(a | b)
 }

+/// Shuffle double-precision (64-bit) floating-point elements within 128-bit
+/// lanes using the control in `imm8`.
+#[inline(always)]
+#[target_feature = "+avx"]
+//#[cfg_attr(test, assert_instr(vshufpd, imm8 = 0x0))] // FIXME
+pub unsafe fn _mm256_shuffle_pd(a: f64x4, b: f64x4, imm8: i32) -> f64x4 {
+    let imm8 = (imm8 & 0xFF) as u8;
+    macro_rules! shuffle4 {
+        ($a:expr, $b:expr, $c:expr, $d:expr) => {
+            simd_shuffle4(a, b, [$a, $b, $c, $d]);
+        }
+    }
+    macro_rules! shuffle3 {
+        ($a:expr, $b: expr, $c: expr) => {
+            match (imm8 >> 3) & 0x1 {
+                0 => shuffle4!($a, $b, $c, 6),
+                _ => shuffle4!($a, $b, $c, 7),
+            }
+        }
+    }
+    macro_rules! shuffle2 {
+        ($a:expr, $b:expr) => {
+            match (imm8 >> 2) & 0x1 {
+                0 => shuffle3!($a, $b, 2),
+                _ => shuffle3!($a, $b, 3),
+            }
+        }
+    }
+    macro_rules! shuffle1 {
+        ($a:expr) => {
+            match (imm8 >> 1) & 0x1 {
+                0 => shuffle2!($a, 4),
+                _ => shuffle2!($a, 5),
+            }
+        }
+    }
+    match (imm8 >> 0) & 0x1 {
+        0 => shuffle1!(0),
+        _ => shuffle1!(1),
+    }
+}
+
+/// Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in `a`
+/// and then AND with `b`.
+#[inline(always)]
+#[target_feature = "+avx"]
+// Should be 'vandnpd' instruction.
+#[cfg_attr(test, assert_instr(vandnps))]
+pub unsafe fn _mm256_andnot_pd(a: f64x4, b: f64x4) -> f64x4 {
+    let a: u64x4 = mem::transmute(a);
+    let b: u64x4 = mem::transmute(b);
+    mem::transmute((!a) & b)
+}
+
+/// Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in `a`
+/// and then AND with `b`.
+#[inline(always)]
+#[target_feature = "+avx"]
+#[cfg_attr(test, assert_instr(vandnps))]
+pub unsafe fn _mm256_andnot_ps(a: f32x8, b: f32x8) -> f32x8 {
+    let a: u32x8 = mem::transmute(a);
+    let b: u32x8 = mem::transmute(b);
+    mem::transmute((!a) & b)
+}
+
 /// Compare packed double-precision (64-bit) floating-point elements 
 /// in `a` and `b`, and return packed maximum values
 #[inline(always)]
@ -274,6 +341,393 @@ pub unsafe fn _mm256_sqrt_pd(a: f64x4) -> f64x4 {
    sqrtpd256(a)
 }

+/// Blend packed double-precision (64-bit) floating-point elements from
+/// `a` and `b` using control mask `imm8`.
+#[inline(always)]
+#[target_feature = "+avx"]
+#[cfg_attr(test, assert_instr(vblendpd, imm8 = 9))]
+pub unsafe fn _mm256_blend_pd(a: f64x4, b: f64x4, imm8: i32) -> f64x4 {
+    let imm8 = (imm8 & 0xFF) as u8;
+    macro_rules! blend4 {
+        ($a:expr, $b:expr, $c:expr, $d:expr) => {
+            simd_shuffle4(a, b, [$a, $b, $c, $d]);
+        }
+    }
+    macro_rules! blend3 {
+        ($a:expr, $b: expr, $c: expr) => {
+            match imm8 & 0x8 {
+                0 => blend4!($a, $b, $c, 3),
+                _ => blend4!($a, $b, $c, 7),
+            }
+        }
+    }
+    macro_rules! blend2 {
+        ($a:expr, $b:expr) => {
+            match imm8 & 0x4 {
+                0 => blend3!($a, $b, 2),
+                _ => blend3!($a, $b, 6),
+            }
+        }
+    }
+    macro_rules! blend1 {
+        ($a:expr) => {
+            match imm8 & 0x2 {
+                0 => blend2!($a, 1),
+                _ => blend2!($a, 5),
+            }
+        }
+    }
+    match imm8 & 0x1 {
+        0 => blend1!(0),
+        _ => blend1!(4),
+    }
+}
+
+/// Blend packed double-precision (64-bit) floating-point elements from
+/// `a` and `b` using `c` as a mask.
+#[inline(always)]
+#[target_feature = "+avx"]
+#[cfg_attr(test, assert_instr(vblendvpd))]
+pub unsafe fn _mm256_blendv_pd(a: f64x4, b: f64x4, c: f64x4) -> f64x4 {
+    vblendvpd(a, b, c)
+}
+
+/// Blend packed single-precision (32-bit) floating-point elements from
+/// `a` and `b` using `c` as a mask.
+#[inline(always)]
+#[target_feature = "+avx"]
+#[cfg_attr(test, assert_instr(vblendvps))]
+pub unsafe fn _mm256_blendv_ps(a: f32x8, b: f32x8, c: f32x8) -> f32x8 {
+    vblendvps(a, b, c)
+}
+
+/// Conditionally multiply the packed single-precision (32-bit) floating-point
+/// elements in `a` and `b` using the high 4 bits in `imm8`,
+/// sum the four products, and conditionally return the sum
+///  using the low 4 bits of `imm8`.
+#[inline(always)]
+#[target_feature = "+avx"]
+#[cfg_attr(test, assert_instr(vdpps, imm8 = 0x0))]
+pub unsafe fn _mm256_dp_ps(a: f32x8, b: f32x8, imm8: i32) -> f32x8 {
+    macro_rules! call {
+        ($imm8:expr) => { vdpps(a, b, $imm8) }
+    }
+    constify_imm8!(imm8, call)
+}
+
+/// Horizontal addition of adjacent pairs in the two packed vectors
+/// of 4 64-bit floating points `a` and `b`.
+/// In the result, sums of elements from `a` are returned in even locations,
+/// while sums of elements from `b` are returned in odd locations.
+#[inline(always)]
+#[target_feature = "+avx"]
+#[cfg_attr(test, assert_instr(vhaddpd))]
+pub unsafe fn _mm256_hadd_pd(a: f64x4, b: f64x4) -> f64x4 {
+    vhaddpd(a, b)
+}
+
+/// Horizontal addition of adjacent pairs in the two packed vectors
+/// of 8 32-bit floating points `a` and `b`.
+/// In the result, sums of elements from `a` are returned in locations of
+/// indices 0, 1, 4, 5; while sums of elements from `b` are locations
+/// 2, 3, 6, 7.
+#[inline(always)]
+#[target_feature = "+avx"]
+#[cfg_attr(test, assert_instr(vhaddps))]
+pub unsafe fn _mm256_hadd_ps(a: f32x8, b: f32x8) -> f32x8 {
+    vhaddps(a, b)
+}
+
+/// Horizontal subtraction of adjacent pairs in the two packed vectors
+/// of 4 64-bit floating points `a` and `b`.
+/// In the result, sums of elements from `a` are returned in even locations,
+/// while sums of elements from `b` are returned in odd locations.
+#[inline(always)]
+#[target_feature = "+avx"]
+#[cfg_attr(test, assert_instr(vhsubpd))]
+pub unsafe fn _mm256_hsub_pd(a: f64x4, b: f64x4) -> f64x4 {
+    vhsubpd(a, b)
+}
+
+/// Horizontal subtraction of adjacent pairs in the two packed vectors
+/// of 8 32-bit floating points `a` and `b`.
+/// In the result, sums of elements from `a` are returned in locations of
+/// indices 0, 1, 4, 5; while sums of elements from `b` are locations
+/// 2, 3, 6, 7.
+#[inline(always)]
+#[target_feature = "+avx"]
+#[cfg_attr(test, assert_instr(vhsubps))]
+pub unsafe fn _mm256_hsub_ps(a: f32x8, b: f32x8) -> f32x8 {
+    vhsubps(a, b)
+}
+
+/// Compute the bitwise XOR of packed double-precision (64-bit) floating-point
+/// elements in `a` and `b`.
+#[inline(always)]
+#[target_feature = "+avx"]
+// FIXME Should be 'vxorpd' instruction.
+#[cfg_attr(test, assert_instr(vxorps))]
+pub unsafe fn _mm256_xor_pd(a: f64x4, b: f64x4) -> f64x4 {
+    let a: u64x4 = mem::transmute(a);
+    let b: u64x4 = mem::transmute(b);
+    mem::transmute(a ^ b)
+}
+
+/// Compute the bitwise XOR of packed single-precision (32-bit) floating-point
+/// elements in `a` and `b`.
+#[inline(always)]
+#[target_feature = "+avx"]
+#[cfg_attr(test, assert_instr(vxorps))]
+pub unsafe fn _mm256_xor_ps(a: f32x8, b: f32x8) -> f32x8 {
+    let a: u32x8 = mem::transmute(a);
+    let b: u32x8 = mem::transmute(b);
+    mem::transmute(a ^ b)
+}
+
+/// Convert packed 32-bit integers in `a` to packed double-precision (64-bit)
+/// floating-point elements.
+#[inline(always)]
+#[target_feature = "+avx"]
+#[cfg_attr(test, assert_instr(vcvtdq2pd))]
+pub unsafe fn _mm256_cvtepi32_pd(a: i32x4) -> f64x4 {
+    simd_cast(a)
+}
+
+/// Convert packed 32-bit integers in `a` to packed single-precision (32-bit)
+/// floating-point elements.
+#[inline(always)]
+#[target_feature = "+avx"]
+#[cfg_attr(test, assert_instr(vcvtdq2ps))]
+pub unsafe fn _mm256_cvtepi32_ps(a: i32x8) -> f32x8 {
+    vcvtdq2ps(a)
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in `a`
+/// to packed single-precision (32-bit) floating-point elements.
+#[inline(always)]
+#[target_feature = "+avx"]
+#[cfg_attr(test, assert_instr(vcvtpd2ps))]
+pub unsafe fn _mm256_cvtpd_ps(a: f64x4) -> f32x4 {
+    vcvtpd2ps(a)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in `a`
+/// to packed 32-bit integers.
+#[inline(always)]
+#[target_feature = "+avx"]
+#[cfg_attr(test, assert_instr(vcvtps2dq))]
+pub unsafe fn _mm256_cvtps_epi32(a: f32x8) -> i32x8 {
+    vcvtps2dq(a)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in `a`
+/// to packed double-precision (64-bit) floating-point elements.
+#[inline(always)]
+#[target_feature = "+avx"]
+#[cfg_attr(test, assert_instr(vcvtps2pd))]
+pub unsafe fn _mm256_cvtps_pd(a: f32x4) -> f64x4 {
+    a.as_f64x4()
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in `a`
+/// to packed 32-bit integers with truncation.
+#[inline(always)]
+#[target_feature = "+avx"]
+#[cfg_attr(test, assert_instr(vcvttpd2dq))]
+pub unsafe fn _mm256_cvttpd_epi32(a: f64x4) -> i32x4 {
+    vcvttpd2dq(a)
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in `a`
+/// to packed 32-bit integers.
+#[inline(always)]
+#[target_feature = "+avx"]
+#[cfg_attr(test, assert_instr(vcvtpd2dq))]
+pub unsafe fn _mm256_cvtpd_epi32(a: f64x4) -> i32x4 {
+    vcvtpd2dq(a)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in `a`
+/// to packed 32-bit integers with truncation.
+#[inline(always)]
+#[target_feature = "+avx"]
+#[cfg_attr(test, assert_instr(vcvttps2dq))]
+pub unsafe fn _mm256_cvttps_epi32(a: f32x8) -> i32x8 {
+    vcvttps2dq(a)
+}
+
+/// Extract 128 bits (composed of 4 packed single-precision (32-bit)
+/// floating-point elements) from `a`, selected with `imm8`.
+#[inline(always)]
+#[target_feature = "+avx"]
+#[cfg_attr(test, assert_instr(vextractf128))]
+pub unsafe fn _mm256_extractf128_ps(a: f32x8, imm8: i32) -> f32x4 {
+    match imm8 & 1 {
+        0 => simd_shuffle4(a, _mm256_undefined_ps(), [0, 1, 2, 3]),
+        _ => simd_shuffle4(a, _mm256_undefined_ps(), [4, 5, 6, 7]),
+    }
+}
+
+/// Extract 128 bits (composed of 2 packed double-precision (64-bit)
+/// floating-point elements) from `a`, selected with `imm8`.
+#[inline(always)]
+#[target_feature = "+avx"]
+#[cfg_attr(test, assert_instr(vextractf128))]
+pub unsafe fn _mm256_extractf128_pd(a: f64x4, imm8: i32) -> f64x2 {
+    match imm8 & 1 {
+        0 => simd_shuffle2(a, _mm256_undefined_pd(), [0, 1]),
+        _ => simd_shuffle2(a, _mm256_undefined_pd(), [2, 3]),
+    }
+}
+
+/// Extract 128 bits (composed of integer data) from `a`, selected with `imm8`.
+#[inline(always)]
+#[target_feature = "+avx"]
+#[cfg_attr(test, assert_instr(vextractf128))]
+pub unsafe fn _mm256_extractf128_si256(a: i64x4, imm8: i32) -> i64x2 {
+    match imm8 & 1 {
+        0 => simd_shuffle2(a, _mm256_undefined_si256(), [0, 1]),
+        _ => simd_shuffle2(a, _mm256_undefined_si256(), [2, 3]),
+    }
+}
+
+/// Extract an 8-bit integer from `a`, selected with `imm8`.
+#[inline(always)]
+#[target_feature = "+avx"]
+pub unsafe fn _mm256_extract_epi8(a: i8x32, imm8: i32) -> i32 {
+    a.extract(imm8 as u32 & 31) as i32
+}
+
+/// Extract a 16-bit integer from `a`, selected with `imm8`.
+#[inline(always)]
+#[target_feature = "+avx"]
+pub unsafe fn _mm256_extract_epi16(a: i16x16, imm8: i32) -> i32 {
+    a.extract(imm8 as u32 & 15) as i32
+}
+
+/// Extract a 32-bit integer from `a`, selected with `imm8`.
+#[inline(always)]
+#[target_feature = "+avx"]
+pub unsafe fn _mm256_extract_epi32(a: i32x8, imm8: i32) -> i32 {
+    a.extract(imm8 as u32 & 7) as i32
+}
+
+/// Extract a 64-bit integer from `a`, selected with `imm8`.
+#[inline(always)]
+#[target_feature = "+avx"]
+pub unsafe fn _mm256_extract_epi64(a: i64x4, imm8: i32) -> i32 {
+    a.extract(imm8 as u32 & 3) as i32
+}
+
+/// Zero the contents of all XMM or YMM registers.
+#[inline(always)]
+#[target_feature = "+avx"]
+#[cfg_attr(test, assert_instr(vzeroall))]
+pub unsafe fn _mm256_zeroall() {
+    vzeroall()
+}
+
+/// Zero the upper 128 bits of all YMM registers;
+/// the lower 128-bits of the registers are unmodified.
+#[inline(always)]
+#[target_feature = "+avx"]
+#[cfg_attr(test, assert_instr(vzeroupper))]
+pub unsafe fn _mm256_zeroupper() {
+    vzeroupper()
+}
+
+/// Shuffle single-precision (32-bit) floating-point elements in `a`
+/// within 128-bit lanes using the control in `b`.
+#[inline(always)]
+#[target_feature = "+avx"]
+#[cfg_attr(test, assert_instr(vpermilps))]
+pub unsafe fn _mm256_permutevar_ps(a: f32x8, b: i32x8) -> f32x8 {
+    vpermilps256(a, b)
+}
+
+/// Shuffle single-precision (32-bit) floating-point elements in `a`
+/// using the control in `b`.
+#[inline(always)]
+#[target_feature = "+avx"]
+#[cfg_attr(test, assert_instr(vpermilps))]
+pub unsafe fn _mm_permutevar_ps(a: f32x4, b: i32x4) -> f32x4 {
+    vpermilps(a, b)
+}
+
+/// Shuffle single-precision (32-bit) floating-point elements in `a`
+/// within 128-bit lanes using the control in `imm8`.
+#[inline(always)]
+#[target_feature = "+avx"]
+#[cfg_attr(test, assert_instr(vpermilps, imm8 = 9))]
+pub unsafe fn _mm256_permute_ps(a: f32x8, imm8: i32) -> f32x8 {
+    let imm8 = (imm8 & 0xFF) as u8;
+    const fn add4(x: u32) -> u32 { x + 4 }
+    macro_rules! shuffle4 {
+        ($a:expr, $b:expr, $c:expr, $d:expr) => {
+            simd_shuffle8(a, _mm256_undefined_ps(), [
+                $a, $b, $c, $d, add4($a), add4($b), add4($c), add4($d)
+            ])
+        }
+    }
+    macro_rules! shuffle3 {
+        ($a:expr, $b:expr, $c:expr) => {
+            match (imm8 >> 6) & 0b11 {
+                0b00 => shuffle4!($a, $b, $c, 0),
+                0b01 => shuffle4!($a, $b, $c, 1),
+                0b10 => shuffle4!($a, $b, $c, 2),
+                _ => shuffle4!($a, $b, $c, 3),
+            }
+        }
+    }
+    macro_rules! shuffle2 {
+        ($a:expr, $b:expr) => {
+            match (imm8 >> 4) & 0b11 {
+                0b00 => shuffle3!($a, $b, 0),
+                0b01 => shuffle3!($a, $b, 1),
+                0b10 => shuffle3!($a, $b, 2),
+                _ => shuffle3!($a, $b, 3),
+            }
+        }
+    }
+    macro_rules! shuffle1 {
+        ($a:expr) => {
+            match (imm8 >> 2) & 0b11 {
+                0b00 => shuffle2!($a, 0),
+                0b01 => shuffle2!($a, 1),
+                0b10 => shuffle2!($a, 2),
+                _ => shuffle2!($a, 3),
+            }
+        }
+    }
+    match (imm8 >> 0) & 0b11 {
+        0b00 => shuffle1!(0),
+        0b01 => shuffle1!(1),
+        0b10 => shuffle1!(2),
+        _ => shuffle1!(3),
+    }
+}
+
+/// Return vector of type `f32x8` with undefined elements.
+#[inline(always)]
+#[target_feature = "+avx"]
+pub unsafe fn _mm256_undefined_ps() -> f32x8 {
+    mem::uninitialized()
+}
+
+/// Return vector of type `f64x4` with undefined elements.
+#[inline(always)]
+#[target_feature = "+avx"]
+pub unsafe fn _mm256_undefined_pd() -> f64x4 {
+    mem::uninitialized()
+}
+
+/// Return vector of type `i64x4` with undefined elements.
+#[inline(always)]
+#[target_feature = "+avx"]
+pub unsafe fn _mm256_undefined_si256() -> i64x4 {
+    mem::uninitialized()
+}
+
 /// LLVM intrinsics used in the above functions
 #[allow(improper_ctypes)]
 extern "C" {
@ -297,12 +751,47 @@ extern "C" {
    fn sqrtpd256(a: f64x4) -> f64x4;
    #[link_name = "llvm.x86.avx.sqrt.ps.256"]
    fn sqrtps256(a: f32x8) -> f32x8;
+    #[link_name = "llvm.x86.avx.blendv.pd.256"]
+    fn vblendvpd(a: f64x4, b: f64x4, c: f64x4) -> f64x4;
+    #[link_name = "llvm.x86.avx.blendv.ps.256"]
+    fn vblendvps(a: f32x8, b: f32x8, c: f32x8) -> f32x8;
+    #[link_name = "llvm.x86.avx.dp.ps.256"]
+    fn vdpps(a: f32x8, b: f32x8, imm8: i32) -> f32x8;
+    #[link_name = "llvm.x86.avx.hadd.pd.256"]
+    fn vhaddpd(a: f64x4, b: f64x4) -> f64x4;
+    #[link_name = "llvm.x86.avx.hadd.ps.256"]
+    fn vhaddps(a: f32x8, b: f32x8) -> f32x8;
+    #[link_name = "llvm.x86.avx.hsub.pd.256"]
+    fn vhsubpd(a: f64x4, b: f64x4) -> f64x4;
+    #[link_name = "llvm.x86.avx.hsub.ps.256"]
+    fn vhsubps(a: f32x8, b: f32x8) -> f32x8;
+    #[link_name = "llvm.x86.avx.cvtdq2.ps.256"]
+    fn vcvtdq2ps(a: i32x8) -> f32x8;
+    #[link_name = "llvm.x86.avx.cvt.pd2.ps.256"]
+    fn vcvtpd2ps(a: f64x4) -> f32x4;
+    #[link_name = "llvm.x86.avx.cvt.ps2dq.256"]
+    fn vcvtps2dq(a: f32x8) -> i32x8;
+    #[link_name = "llvm.x86.avx.cvtt.pd2dq.256"]
+    fn vcvttpd2dq(a: f64x4) -> i32x4;
+    #[link_name = "llvm.x86.avx.cvt.pd2dq.256"]
+    fn vcvtpd2dq(a: f64x4) -> i32x4;
+    #[link_name = "llvm.x86.avx.cvtt.ps2dq.256"]
+    fn vcvttps2dq(a: f32x8) -> i32x8;
+    #[link_name = "llvm.x86.avx.vzeroall"]
+    fn vzeroall();
+    #[link_name = "llvm.x86.avx.vzeroupper"]
+    fn vzeroupper();
+    #[link_name = "llvm.x86.avx.vpermilvar.ps.256"]
+    fn vpermilps256(a: f32x8, b: i32x8) -> f32x8;
+    #[link_name = "llvm.x86.avx.vpermilvar.ps"]
+    fn vpermilps(a: f32x4, b: i32x4) -> f32x4;
 }

 #[cfg(test)]
 mod tests {
    use stdsimd_test::simd_test;

+    use v128::{f32x4, f64x2, i32x4, i64x2};
    use v256::*;
    use x86::avx;

@ -360,6 +849,31 @@ mod tests {
        assert_eq!(r, e);
    }

+    #[simd_test = "avx"]
+    unsafe fn _mm256_shuffle_pd() {
+        let a = f64x4::new(1.0, 4.0, 5.0, 8.0);
+        let b = f64x4::new(2.0, 3.0, 6.0, 7.0);
+        let r = avx::_mm256_shuffle_pd(a, b, 0xF);
+        let e = f64x4::new(4.0, 3.0, 8.0, 7.0);
+        assert_eq!(r, e);
+    }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm256_andnot_pd() {
+        let a = f64x4::splat(0.0);
+        let b = f64x4::splat(0.6);
+        let r = avx::_mm256_andnot_pd(a, b);
+        assert_eq!(r, b);
+    }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm256_andnot_ps() {
+        let a = f32x8::splat(0.0);
+        let b = f32x8::splat(0.6);
+        let r = avx::_mm256_andnot_ps(a, b);
+        assert_eq!(r, b);
+    }
+
    #[simd_test = "avx"]
    unsafe fn _mm256_max_pd() {
        let a = f64x4::new(1.0, 4.0, 5.0, 8.0);
@ -543,4 +1057,280 @@ mod tests {
        let e = f64x4::new(1.0, 3.0, 8.0, 5.0);
        assert_eq!(r, e);
    }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm256_blend_pd() {
+        let a = f64x4::new(4.0, 9.0, 16.0, 25.0);
+        let b = f64x4::new(4.0, 3.0, 2.0, 5.0);
+        let r = avx::_mm256_blend_pd(a, b, 0x0);
+        assert_eq!(r, f64x4::new(4.0, 9.0, 16.0, 25.0));
+        let r = avx::_mm256_blend_pd(a, b, 0x3);
+        assert_eq!(r, f64x4::new(4.0, 3.0, 16.0, 25.0));
+        let r = avx::_mm256_blend_pd(a, b, 0xF);
+        assert_eq!(r, f64x4::new(4.0, 3.0, 2.0, 5.0));
+    }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm256_blendv_pd() {
+        let a = f64x4::new(4.0, 9.0, 16.0, 25.0);
+        let b = f64x4::new(4.0, 3.0, 2.0, 5.0);
+        let c = f64x4::new(0.0, 0.0, !0 as f64, !0 as f64);
+        let r = avx::_mm256_blendv_pd(a, b, c);
+        let e = f64x4::new(4.0, 9.0, 2.0, 5.0);
+        assert_eq!(r, e);
+    }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm256_blendv_ps() {
+        let a = f32x8::new(4.0, 9.0, 16.0, 25.0, 4.0, 9.0, 16.0, 25.0);
+        let b = f32x8::new(4.0, 3.0, 2.0, 5.0, 8.0, 9.0, 64.0, 50.0);
+        let c = f32x8::new(0.0, 0.0, 0.0, 0.0, !0 as f32, !0 as f32, !0 as f32, !0 as f32);
+        let r = avx::_mm256_blendv_ps(a, b, c);
+        let e = f32x8::new(4.0, 9.0, 16.0, 25.0, 8.0, 9.0, 64.0, 50.0);
+        assert_eq!(r, e);
+    }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm256_dp_ps() {
+        let a = f32x8::new(4.0, 9.0, 16.0, 25.0, 4.0, 9.0, 16.0, 25.0);
+        let b = f32x8::new(4.0, 3.0, 2.0, 5.0, 8.0, 9.0, 64.0, 50.0);
+        let r = avx::_mm256_dp_ps(a, b, 0xFF);
+        let e = f32x8::new(200.0, 200.0, 200.0, 200.0, 2387.0, 2387.0, 2387.0, 2387.0);
+        assert_eq!(r, e);
+    }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm256_hadd_pd() {
+        let a = f64x4::new(4.0, 9.0, 16.0, 25.0);
+        let b = f64x4::new(4.0, 3.0, 2.0, 5.0);
+        let r = avx::_mm256_hadd_pd(a, b);
+        let e = f64x4::new(13.0, 7.0, 41.0, 7.0);
+        assert_eq!(r, e);
+
+        let a = f64x4::new(1.0, 2.0, 3.0, 4.0);
+        let b = f64x4::new(5.0, 6.0, 7.0, 8.0);
+        let r = avx::_mm256_hadd_pd(a, b);
+        let e = f64x4::new(3.0, 11.0, 7.0, 15.0);
+        assert_eq!(r, e);
+    }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm256_hadd_ps() {
+        let a = f32x8::new(4.0, 9.0, 16.0, 25.0, 4.0, 9.0, 16.0, 25.0);
+        let b = f32x8::new(4.0, 3.0, 2.0, 5.0, 8.0, 9.0, 64.0, 50.0);
+        let r = avx::_mm256_hadd_ps(a, b);
+        let e = f32x8::new(13.0, 41.0, 7.0, 7.0, 13.0, 41.0, 17.0, 114.0);
+        assert_eq!(r, e);
+
+        let a = f32x8::new(1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0);
+        let b = f32x8::new(5.0, 6.0, 7.0, 8.0, 5.0, 6.0, 7.0, 8.0);
+        let r = avx::_mm256_hadd_ps(a, b);
+        let e = f32x8::new(3.0, 7.0, 11.0, 15.0, 3.0, 7.0, 11.0, 15.0);
+        assert_eq!(r, e);
+    }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm256_hsub_pd() {
+        let a = f64x4::new(4.0, 9.0, 16.0, 25.0);
+        let b = f64x4::new(4.0, 3.0, 2.0, 5.0);
+        let r = avx::_mm256_hsub_pd(a, b);
+        let e = f64x4::new(-5.0, 1.0, -9.0, -3.0);
+        assert_eq!(r, e);
+
+        let a = f64x4::new(1.0, 2.0, 3.0, 4.0);
+        let b = f64x4::new(5.0, 6.0, 7.0, 8.0);
+        let r = avx::_mm256_hsub_pd(a, b);
+        let e = f64x4::new(-1., -1., -1., -1.);
+        assert_eq!(r, e);
+    }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm256_hsub_ps() {
+        let a = f32x8::new(4.0, 9.0, 16.0, 25.0, 4.0, 9.0, 16.0, 25.0);
+        let b = f32x8::new(4.0, 3.0, 2.0, 5.0, 8.0, 9.0, 64.0, 50.0);
+        let r = avx::_mm256_hsub_ps(a, b);
+        let e = f32x8::new(-5.0, -9.0, 1.0, -3.0, -5.0, -9.0, -1.0, 14.0);
+        assert_eq!(r, e);
+
+        let a = f32x8::new(1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0);
+        let b = f32x8::new(5.0, 6.0, 7.0, 8.0, 5.0, 6.0, 7.0, 8.0);
+        let r = avx::_mm256_hsub_ps(a, b);
+        let e = f32x8::new(-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0);
+        assert_eq!(r, e);
+    }
+
+
+    #[simd_test = "avx"]
+    unsafe fn _mm256_xor_pd() {
+        let a = f64x4::new(4.0, 9.0, 16.0, 25.0);
+        let b = f64x4::splat(0.0);
+        let r = avx::_mm256_xor_pd(a, b);
+        assert_eq!(r, a);
+    }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm256_xor_ps() {
+        let a = f32x8::new(4.0, 9.0, 16.0, 25.0, 4.0, 9.0, 16.0, 25.0);
+        let b = f32x8::splat(0.0);
+        let r = avx::_mm256_xor_ps(a, b);
+        assert_eq!(r, a);
+    }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm256_cvtepi32_pd() {
+        let a = i32x4::new(4, 9, 16, 25);
+        let r = avx::_mm256_cvtepi32_pd(a);
+        let e = f64x4::new(4.0, 9.0, 16.0, 25.0);
+        assert_eq!(r, e);
+    }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm256_cvtepi32_ps() {
+        let a = i32x8::new(4, 9, 16, 25, 4, 9, 16, 25);
+        let r = avx::_mm256_cvtepi32_ps(a);
+        let e = f32x8::new(4.0, 9.0, 16.0, 25.0, 4.0, 9.0, 16.0, 25.0);
+        assert_eq!(r, e);
+    }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm256_cvtpd_ps() {
+        let a = f64x4::new(4.0, 9.0, 16.0, 25.0);
+        let r = avx::_mm256_cvtpd_ps(a);
+        let e = f32x4::new(4.0, 9.0, 16.0, 25.0);
+        assert_eq!(r, e);
+    }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm256_cvtps_epi32() {
+        let a = f32x8::new(4.0, 9.0, 16.0, 25.0, 4.0, 9.0, 16.0, 25.0);
+        let r = avx::_mm256_cvtps_epi32(a);
+        let e = i32x8::new(4, 9, 16, 25, 4, 9, 16, 25);
+        assert_eq!(r, e);
+    }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm256_cvtps_pd() {
+        let a = f32x4::new(4.0, 9.0, 16.0, 25.0);
+        let r = avx::_mm256_cvtps_pd(a);
+        let e = f64x4::new(4.0, 9.0, 16.0, 25.0);
+        assert_eq!(r, e);
+    }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm256_cvttpd_epi32() {
+        let a = f64x4::new(4.0, 9.0, 16.0, 25.0);
+        let r = avx::_mm256_cvttpd_epi32(a);
+        let e = i32x4::new(4, 9, 16, 25);
+        assert_eq!(r, e);
+    }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm256_cvtpd_epi32() {
+        let a = f64x4::new(4.0, 9.0, 16.0, 25.0);
+        let r = avx::_mm256_cvtpd_epi32(a);
+        let e = i32x4::new(4, 9, 16, 25);
+        assert_eq!(r, e);
+    }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm256_cvttps_epi32() {
+        let a = f32x8::new(4.0, 9.0, 16.0, 25.0, 4.0, 9.0, 16.0, 25.0);
+        let r = avx::_mm256_cvttps_epi32(a);
+        let e = i32x8::new(4, 9, 16, 25, 4, 9, 16, 25);
+        assert_eq!(r, e);
+    }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm256_extractf128_ps() {
+        let a = f32x8::new(4.0, 3.0, 2.0, 5.0, 8.0, 9.0, 64.0, 50.0);
+        let r = avx::_mm256_extractf128_ps(a, 0);
+        let e = f32x4::new(4.0, 3.0, 2.0, 5.0);
+        assert_eq!(r, e);
+    }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm256_extractf128_pd() {
+        let a = f64x4::new(4.0, 3.0, 2.0, 5.0);
+        let r = avx::_mm256_extractf128_pd(a, 0);
+        let e = f64x2::new(4.0, 3.0);
+        assert_eq!(r, e);
+    }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm256_extractf128_si256() {
+        let a = i64x4::new(4, 3, 2, 5);
+        let r = avx::_mm256_extractf128_si256(a, 0);
+        let e = i64x2::new(4, 3);
+        assert_eq!(r, e);
+    }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm256_extract_epi8() {
+        let a = i8x32::new(
+            1, 2, 3, 4, 5, 6, 7, 8,
+            9, 10, 11, 12, 13, 14, 15, 16,
+            17, 18, 19, 20, 21, 22, 23, 24,
+            25, 26, 27, 28, 29, 30, 31, 32);
+        let r = avx::_mm256_extract_epi8(a, 0);
+        assert_eq!(r, 1);
+    }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm256_extract_epi16() {
+        let a = i16x16::new(
+            0, 1, 2, 3, 4, 5, 6, 7,
+            8, 9, 10, 11, 12, 13, 14, 15);
+        let r = avx::_mm256_extract_epi16(a, 0);
+        assert_eq!(r, 0);
+    }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm256_extract_epi32() {
+        let a = i32x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+        let r = avx::_mm256_extract_epi32(a, 0);
+        assert_eq!(r, 1);
+    }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm256_extract_epi64() {
+        let a = i64x4::new(0, 1, 2, 3);
+        let r = avx::_mm256_extract_epi64(a, 3);
+        assert_eq!(r, 3);
+    }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm256_zeroall() {
+        avx::_mm256_zeroall();
+    }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm256_zeroupper() {
+        avx::_mm256_zeroupper();
+    }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm256_permutevar_ps() {
+        let a = f32x8::new(4.0, 3.0, 2.0, 5.0, 8.0, 9.0, 64.0, 50.0);
+        let b = i32x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+        let r = avx::_mm256_permutevar_ps(a, b);
+        let e = f32x8::new(3.0, 2.0, 5.0, 4.0, 9.0, 64.0, 50.0, 8.0);
+        assert_eq!(r, e);
+    }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm_permutevar_ps() {
+        let a = f32x4::new(4.0, 3.0, 2.0, 5.0);
+        let b = i32x4::new(1, 2, 3, 4);
+        let r = avx::_mm_permutevar_ps(a, b);
+        let e = f32x4::new(3.0, 2.0, 5.0, 4.0);
+        assert_eq!(r, e);
+    }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm256_permute_ps() {
+        let a = f32x8::new(4.0, 3.0, 2.0, 5.0, 8.0, 9.0, 64.0, 50.0);
+        let r = avx::_mm256_permute_ps(a, 0x1b);
+        let e = f32x8::new(5.0, 2.0, 3.0, 4.0, 50.0, 64.0, 9.0, 8.0);
+        assert_eq!(r, e);
+    }
 }
--- a/library/stdarch/stdsimd-test/src/lib.rs
+++ b/library/stdarch/stdsimd-test/src/lib.rs
@ -271,7 +271,7 @@ pub fn assert(fnptr: usize, fnname: &str, expected: &str) {
        }
    }

-    let probably_only_one_instruction = function.instrs.len() < 20;
+    let probably_only_one_instruction = function.instrs.len() < 30;

    if found && probably_only_one_instruction {
        return