Avx (#105)

* avx: _mm_permute_ps and sse: _mm_undefined_ps * avx: _mm256_permutevar_pdi, _mm_permutevar_pd * avx: _mm256_permute_pd * avx: _mm256_shuffle_pd fixed * avx: _mm_permute_pd, sse2: _mm_undefined_pd * avx: _mm256_permute2f128_ps * avx: _mm256_permute2f128_pd * avx: _mm256_permute2f128_si256 * avx: _mm256_broadcast_ss * avx: _mm_broadcast_ss * avx: _mm256_broadcast_sd * avx: _mm256_broadcast_ps * avx: _mm256_broadcast_pd * avx: _mm_cmp_pd * avx: _mm256_cmp_pd * avx: _mm_cmp_ps * avx: _mm256_cmp_ps * avx: _mm_cmp_sd * avx: _mm_cmp_ss * avx: _mm256_insertf128_pd, _mm256_castpd128_pd256 * avx: _mm256_insertf128_si256, _mm256_castsi128_si256 * avx: _mm256_insertf128_ps, _mm256_castps128_ps256 * avx: _mm256_insert_epi8 * avx: _mm256_insert_epi16 * avx: _mm256_insert_epi32 * avx: _mm256_insert_epi64 * Try to fix i586 build * Fix missing inline and target_feature * sse: fix _mm_undefined_ps
2017-10-09 23:05:36 +02:00 · 2017-10-09 23:05:36 +02:00 · 7c88f7c49b
commit 7c88f7c49b
parent 807ec089b7
4 changed files with 784 additions and 1 deletions
--- a/library/stdarch/src/x86/avx.rs
+++ b/library/stdarch/src/x86/avx.rs
@ -74,7 +74,7 @@ pub unsafe fn _mm256_or_ps(a: f32x8, b: f32x8) -> f32x8 {
 /// lanes using the control in `imm8`.
 #[inline(always)]
 #[target_feature = "+avx"]
-//#[cfg_attr(test, assert_instr(vshufpd, imm8 = 0x0))] // FIXME
+#[cfg_attr(test, assert_instr(vshufpd, imm8 = 0x1))]
 pub unsafe fn _mm256_shuffle_pd(a: f64x4, b: f64x4, imm8: i32) -> f64x4 {
    let imm8 = (imm8 & 0xFF) as u8;
    macro_rules! shuffle4 {
@ -484,6 +484,152 @@ pub unsafe fn _mm256_xor_ps(a: f32x8, b: f32x8) -> f32x8 {
    mem::transmute(a ^ b)
 }

+// Equal (ordered, non-signaling)
+pub const _CMP_EQ_OQ: u8 = 0x00;
+// Less-than (ordered, signaling)
+pub const _CMP_LT_OS: u8 = 0x01;
+// Less-than-or-equal (ordered, signaling)
+pub const _CMP_LE_OS: u8 = 0x02;
+// Unordered (non-signaling)
+pub const _CMP_UNORD_Q: u8 = 0x03;
+// Not-equal (unordered, non-signaling)
+pub const _CMP_NEQ_UQ: u8 = 0x04;
+// Not-less-than (unordered, signaling)
+pub const _CMP_NLT_US: u8 = 0x05;
+// Not-less-than-or-equal (unordered, signaling)
+pub const _CMP_NLE_US: u8 = 0x06;
+// Ordered (non-signaling)
+pub const _CMP_ORD_Q: u8 = 0x07;
+// Equal (unordered, non-signaling)
+pub const _CMP_EQ_UQ: u8 = 0x08;
+// Not-greater-than-or-equal (unordered, signaling)
+pub const _CMP_NGE_US: u8 = 0x09;
+// Not-greater-than (unordered, signaling)
+pub const _CMP_NGT_US: u8 = 0x0a;
+// False (ordered, non-signaling)
+pub const _CMP_FALSE_OQ: u8 = 0x0b;
+// Not-equal (ordered, non-signaling)
+pub const _CMP_NEQ_OQ: u8 = 0x0c;
+// Greater-than-or-equal (ordered, signaling)
+pub const _CMP_GE_OS: u8 = 0x0d;
+// Greater-than (ordered, signaling)
+pub const _CMP_GT_OS: u8 = 0x0e;
+// True (unordered, non-signaling)
+pub const _CMP_TRUE_UQ: u8 = 0x0f;
+// Equal (ordered, signaling)
+pub const _CMP_EQ_OS: u8 = 0x10;
+// Less-than (ordered, non-signaling)
+pub const _CMP_LT_OQ: u8 = 0x11;
+// Less-than-or-equal (ordered, non-signaling)
+pub const _CMP_LE_OQ: u8 = 0x12;
+// Unordered (signaling)
+pub const _CMP_UNORD_S: u8 = 0x13;
+// Not-equal (unordered, signaling)
+pub const _CMP_NEQ_US: u8 = 0x14;
+// Not-less-than (unordered, non-signaling)
+pub const _CMP_NLT_UQ: u8 = 0x15;
+// Not-less-than-or-equal (unordered, non-signaling)
+pub const _CMP_NLE_UQ: u8 = 0x16;
+// Ordered (signaling)
+pub const _CMP_ORD_S: u8 = 0x17;
+// Equal (unordered, signaling)
+pub const _CMP_EQ_US: u8 = 0x18;
+// Not-greater-than-or-equal (unordered, non-signaling)
+pub const _CMP_NGE_UQ: u8 = 0x19;
+// Not-greater-than (unordered, non-signaling)
+pub const _CMP_NGT_UQ: u8 = 0x1a;
+// False (ordered, signaling)
+pub const _CMP_FALSE_OS: u8 = 0x1b;
+// Not-equal (ordered, signaling)
+pub const _CMP_NEQ_OS: u8 = 0x1c;
+// Greater-than-or-equal (ordered, non-signaling)
+pub const _CMP_GE_OQ: u8 = 0x1d;
+// Greater-than (ordered, non-signaling)
+pub const _CMP_GT_OQ: u8 = 0x1e;
+// True (unordered, signaling)
+pub const _CMP_TRUE_US: u8 = 0x1f;
+
+/// Compare packed double-precision (64-bit) floating-point
+/// elements in `a` and `b` based on the comparison operand
+/// specified by `imm8`.
+#[inline(always)]
+#[target_feature = "+avx,+sse2"]
+#[cfg_attr(test, assert_instr(vcmpeqpd, imm8 = 0))] // TODO Validate vcmppd
+pub unsafe fn _mm_cmp_pd(a: f64x2, b: f64x2, imm8: u8) -> f64x2 {
+    macro_rules! call {
+        ($imm8:expr) => { vcmppd(a, b, $imm8) }
+    }
+    constify_imm6!(imm8, call)
+}
+
+/// Compare packed double-precision (64-bit) floating-point
+/// elements in `a` and `b` based on the comparison operand
+/// specified by `imm8`.
+#[inline(always)]
+#[target_feature = "+avx"]
+#[cfg_attr(test, assert_instr(vcmpeqpd, imm8 = 0))] // TODO Validate vcmppd
+pub unsafe fn _mm256_cmp_pd(a: f64x4, b: f64x4, imm8: u8) -> f64x4 {
+    macro_rules! call {
+        ($imm8:expr) => { vcmppd256(a, b, $imm8) }
+    }
+    constify_imm6!(imm8, call)
+}
+
+/// Compare packed single-precision (32-bit) floating-point
+/// elements in `a` and `b` based on the comparison operand
+/// specified by `imm8`.
+#[inline(always)]
+#[target_feature = "+avx,+sse"]
+#[cfg_attr(test, assert_instr(vcmpeqps, imm8 = 0))] // TODO Validate vcmpps
+pub unsafe fn _mm_cmp_ps(a: f32x4, b: f32x4, imm8: u8) -> f32x4 {
+    macro_rules! call {
+        ($imm8:expr) => { vcmpps(a, b, $imm8) }
+    }
+    constify_imm6!(imm8, call)
+}
+
+/// Compare packed single-precision (32-bit) floating-point
+/// elements in `a` and `b` based on the comparison operand
+/// specified by `imm8`.
+#[inline(always)]
+#[target_feature = "+avx"]
+#[cfg_attr(test, assert_instr(vcmpeqps, imm8 = 0))] // TODO Validate vcmpps
+pub unsafe fn _mm256_cmp_ps(a: f32x8, b: f32x8, imm8: u8) -> f32x8 {
+    macro_rules! call {
+        ($imm8:expr) => { vcmpps256(a, b, $imm8) }
+    }
+    constify_imm6!(imm8, call)
+}
+
+/// Compare the lower double-precision (64-bit) floating-point element in
+/// `a` and `b` based on the comparison operand specified by `imm8`,
+/// store the result in the lower element of returned vector,
+/// and copy the upper element from `a` to the upper element of returned vector.
+#[inline(always)]
+#[target_feature = "+avx,+sse2"]
+#[cfg_attr(test, assert_instr(vcmpeqsd, imm8 = 0))] // TODO Validate vcmpsd
+pub unsafe fn _mm_cmp_sd(a: f64x2, b: f64x2, imm8: u8) -> f64x2 {
+    macro_rules! call {
+        ($imm8:expr) => { vcmpsd(a, b, $imm8) }
+    }
+    constify_imm6!(imm8, call)
+}
+
+/// Compare the lower single-precision (32-bit) floating-point element in
+/// `a` and `b` based on the comparison operand specified by `imm8`,
+/// store the result in the lower element of returned vector,
+/// and copy the upper 3 packed elements from `a` to the upper elements of
+/// returned vector.
+#[inline(always)]
+#[target_feature = "+avx,+sse"]
+#[cfg_attr(test, assert_instr(vcmpeqss, imm8 = 0))] // TODO Validate vcmpss
+pub unsafe fn _mm_cmp_ss(a: f32x4, b: f32x4, imm8: u8) -> f32x4 {
+    macro_rules! call {
+        ($imm8:expr) => { vcmpss(a, b, $imm8) }
+    }
+    constify_imm6!(imm8, call)
+}
+
 /// Convert packed 32-bit integers in `a` to packed double-precision (64-bit)
 /// floating-point elements.
 #[inline(always)]
@ -707,6 +853,328 @@ pub unsafe fn _mm256_permute_ps(a: f32x8, imm8: i32) -> f32x8 {
    }
 }

+/// Shuffle single-precision (32-bit) floating-point elements in `a` 
+/// using the control in `imm8`.
+#[inline(always)]
+#[target_feature = "+avx,+sse"]
+#[cfg_attr(test, assert_instr(vpermilps, imm8 = 9))]
+pub unsafe fn _mm_permute_ps(a: f32x4, imm8: i32) -> f32x4 {
+    use x86::sse::_mm_undefined_ps;
+
+    let imm8 = (imm8 & 0xFF) as u8;
+    macro_rules! shuffle4 {
+        ($a:expr, $b:expr, $c:expr, $d:expr) => {
+            simd_shuffle4(a, _mm_undefined_ps(), [
+                $a, $b, $c, $d
+            ])
+        }
+    }
+    macro_rules! shuffle3 {
+        ($a:expr, $b:expr, $c:expr) => {
+            match (imm8 >> 6) & 0b11 {
+                0b00 => shuffle4!($a, $b, $c, 0),
+                0b01 => shuffle4!($a, $b, $c, 1),
+                0b10 => shuffle4!($a, $b, $c, 2),
+                _ => shuffle4!($a, $b, $c, 3),
+            }
+        }
+    }
+    macro_rules! shuffle2 {
+        ($a:expr, $b:expr) => {
+            match (imm8 >> 4) & 0b11 {
+                0b00 => shuffle3!($a, $b, 0),
+                0b01 => shuffle3!($a, $b, 1),
+                0b10 => shuffle3!($a, $b, 2),
+                _ => shuffle3!($a, $b, 3),
+            }
+        }
+    }
+    macro_rules! shuffle1 {
+        ($a:expr) => {
+            match (imm8 >> 2) & 0b11 {
+                0b00 => shuffle2!($a, 0),
+                0b01 => shuffle2!($a, 1),
+                0b10 => shuffle2!($a, 2),
+                _ => shuffle2!($a, 3),
+            }
+        }
+    }
+    match (imm8 >> 0) & 0b11 {
+        0b00 => shuffle1!(0),
+        0b01 => shuffle1!(1),
+        0b10 => shuffle1!(2),
+        _ => shuffle1!(3),
+    }
+}
+
+#[inline(always)]
+#[target_feature = "+avx"]
+#[cfg_attr(test, assert_instr(vpermilpd))]
+pub unsafe fn _mm256_permutevar_pd(a: f64x4, b: i64x4) -> f64x4 {
+    vpermilpd256(a, b)
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements in `a`
+/// using the control in `b`.
+#[inline(always)]
+#[target_feature = "+avx"]
+#[cfg_attr(test, assert_instr(vpermilpd))]
+pub unsafe fn _mm_permutevar_pd(a: f64x2, b: i64x2) -> f64x2 {
+    vpermilpd(a, b)
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements in `a`
+/// within 128-bit lanes using the control in `imm8`.
+#[inline(always)]
+#[target_feature = "+avx"]
+#[cfg_attr(test, assert_instr(vpermilpd, imm8 = 0x1))]
+pub unsafe fn _mm256_permute_pd(a: f64x4, imm8: i32) -> f64x4 {
+    let imm8 = (imm8 & 0xFF) as u8;
+    macro_rules! shuffle4 {
+        ($a:expr, $b:expr, $c:expr, $d:expr) => {
+            simd_shuffle4(a, _mm256_undefined_pd(), [$a, $b, $c, $d]);
+        }
+    }
+    macro_rules! shuffle3 {
+        ($a:expr, $b: expr, $c: expr) => {
+            match (imm8 >> 3) & 0x1 {
+                0 => shuffle4!($a, $b, $c, 2),
+                _ => shuffle4!($a, $b, $c, 3),
+            }
+        }
+    }
+    macro_rules! shuffle2 {
+        ($a:expr, $b:expr) => {
+            match (imm8 >> 2) & 0x1 {
+                0 => shuffle3!($a, $b, 2),
+                _ => shuffle3!($a, $b, 3),
+            }
+        }
+    }
+    macro_rules! shuffle1 {
+        ($a:expr) => {
+            match (imm8 >> 1) & 0x1 {
+                0 => shuffle2!($a, 0),
+                _ => shuffle2!($a, 1),
+            }
+        }
+    }
+    match (imm8 >> 0) & 0x1 {
+        0 => shuffle1!(0),
+        _ => shuffle1!(1),
+    }
+}
+
+/// Shuffle double-precision (64-bit) floating-point elements in `a`
+/// using the control in `imm8`.
+#[inline(always)]
+#[target_feature = "+avx,+sse2"]
+#[cfg_attr(test, assert_instr(vpermilpd, imm8 = 0x1))]
+pub unsafe fn _mm_permute_pd(a: f64x2, imm8: i32) -> f64x2 {
+    use x86::sse2::_mm_undefined_pd;
+
+    let imm8 = (imm8 & 0xFF) as u8;
+    macro_rules! shuffle2 {
+        ($a:expr, $b:expr) => {
+            simd_shuffle2(a, _mm_undefined_pd(), [$a, $b]);
+        }
+    }
+    macro_rules! shuffle1 {
+        ($a:expr) => {
+            match (imm8 >> 1) & 0x1 {
+                0 => shuffle2!($a, 0),
+                _ => shuffle2!($a, 1),
+            }
+        }
+    }
+    match (imm8 >> 0) & 0x1 {
+        0 => shuffle1!(0),
+        _ => shuffle1!(1),
+    }
+}
+
+/// Shuffle 256-bits (composed of 8 packed single-precision (32-bit)
+/// floating-point elements) selected by `imm8` from `a` and `b`.
+#[inline(always)]
+#[target_feature = "+avx"]
+#[cfg_attr(test, assert_instr(vperm2f128, imm8 = 0x5))]
+pub unsafe fn _mm256_permute2f128_ps(a: f32x8, b: f32x8, imm8: i8) -> f32x8 {
+    macro_rules! call {
+        ($imm8:expr) => { vperm2f128ps256(a, b, $imm8) }
+    }
+    constify_imm8!(imm8, call)
+}
+
+/// Shuffle 256-bits (composed of 4 packed double-precision (64-bit)
+/// floating-point elements) selected by `imm8` from `a` and `b`.
+#[inline(always)]
+#[target_feature = "+avx"]
+#[cfg_attr(test, assert_instr(vperm2f128, imm8 = 0x31))]
+pub unsafe fn _mm256_permute2f128_pd(a: f64x4, b: f64x4, imm8: i8) -> f64x4 {
+    macro_rules! call {
+        ($imm8:expr) => { vperm2f128pd256(a, b, $imm8) }
+    }
+    constify_imm8!(imm8, call)
+}
+
+/// Shuffle 258-bits (composed of integer data) selected by `imm8`
+/// from `a` and `b`.
+#[inline(always)]
+#[target_feature = "+avx"]
+#[cfg_attr(test, assert_instr(vperm2f128, imm8 = 0x31))]
+pub unsafe fn _mm256_permute2f128_si256(a: i32x8, b: i32x8, imm8: i8) -> i32x8 {
+    macro_rules! call {
+        ($imm8:expr) => { vperm2f128si256(a, b, $imm8) }
+    }
+    constify_imm8!(imm8, call)
+}
+
+/// Broadcast a single-precision (32-bit) floating-point element from memory
+/// to all elements of the returned vector.
+#[inline(always)]
+#[target_feature = "+avx"]
+#[cfg_attr(test, assert_instr(vbroadcastss))]
+pub unsafe fn _mm256_broadcast_ss(f: &f32) -> f32x8 {
+    f32x8::splat(*f)
+}
+
+/// Broadcast a single-precision (32-bit) floating-point element from memory
+/// to all elements of the returned vector.
+#[inline(always)]
+#[target_feature = "+avx"]
+#[cfg_attr(test, assert_instr(vbroadcastss))]
+pub unsafe fn _mm_broadcast_ss(f: &f32) -> f32x4 {
+    f32x4::splat(*f)
+}
+
+/// Broadcast a double-precision (64-bit) floating-point element from memory
+/// to all elements of the returned vector.
+#[inline(always)]
+#[target_feature = "+avx"]
+#[cfg_attr(test, assert_instr(vbroadcastsd))]
+pub unsafe fn _mm256_broadcast_sd(f: &f64) -> f64x4 {
+    f64x4::splat(*f)
+}
+
+/// Broadcast 128 bits from memory (composed of 4 packed single-precision
+/// (32-bit) floating-point elements) to all elements of the returned vector.
+#[inline(always)]
+#[target_feature = "+avx"]
+#[cfg_attr(test, assert_instr(vbroadcastf128))]
+pub unsafe fn _mm256_broadcast_ps(a: &f32x4) -> f32x8 {
+    vbroadcastf128ps256(a)
+}
+
+/// Broadcast 128 bits from memory (composed of 2 packed double-precision
+/// (64-bit) floating-point elements) to all elements of the returned vector.
+#[inline(always)]
+#[target_feature = "+avx"]
+#[cfg_attr(test, assert_instr(vbroadcastf128))]
+pub unsafe fn _mm256_broadcast_pd(a: &f64x2) -> f64x4 {
+    vbroadcastf128pd256(a)
+}
+
+/// Copy `a` to result, then insert 128 bits (composed of 4 packed
+/// single-precision (32-bit) floating-point elements) from `b` into result
+/// at the location specified by `imm8`.
+#[inline(always)]
+#[target_feature = "+avx"]
+#[cfg_attr(test, assert_instr(vinsertf128, imm8 = 1))]
+pub unsafe fn _mm256_insertf128_ps(a: f32x8, b: f32x4, imm8: i32) -> f32x8 {
+    match imm8 & 1 {
+        0 => simd_shuffle8(a, _mm256_castps128_ps256(b), [8, 9, 10, 11, 4, 5, 6, 7]),
+        _ => simd_shuffle8(a, _mm256_castps128_ps256(b), [0, 1, 2, 3, 8, 9, 10, 11]),
+    }
+}
+
+/// Copy `a` to result, then insert 128 bits (composed of 2 packed
+/// double-precision (64-bit) floating-point elements) from `b` into result
+/// at the location specified by `imm8`.
+#[inline(always)]
+#[target_feature = "+avx"]
+#[cfg_attr(test, assert_instr(vinsertf128, imm8 = 1))]
+pub unsafe fn _mm256_insertf128_pd(a: f64x4, b: f64x2, imm8: i32) -> f64x4 {
+    match imm8 & 1 {
+        0 => simd_shuffle4(a, _mm256_castpd128_pd256(b), [4, 5, 2, 3]),
+        _ => simd_shuffle4(a, _mm256_castpd128_pd256(b), [0, 1, 4, 5]),
+    }
+}
+
+/// Copy `a` to result, then insert 128 bits from `b` into result
+/// at the location specified by `imm8`.
+#[inline(always)]
+#[target_feature = "+avx"]
+#[cfg_attr(test, assert_instr(vinsertf128, imm8 = 1))]
+pub unsafe fn _mm256_insertf128_si256(a: i64x4, b: i64x2, imm8: i32) -> i64x4 {
+    match imm8 & 1 {
+        0 => simd_shuffle4(a, _mm256_castsi128_si256(b), [4, 5, 2, 3]),
+        _ => simd_shuffle4(a, _mm256_castsi128_si256(b), [0, 1, 4, 5]),
+    }
+}
+
+/// Copy `a` to result, and insert the 8-bit integer `i` into result
+/// at the location specified by `index`.
+#[inline(always)]
+#[target_feature = "+avx"]
+pub unsafe fn _mm256_insert_epi8(a: i8x32, i: i8, index: i32) -> i8x32 {
+    let c = a;
+    c.replace(index as u32 & 31, i)
+}
+
+/// Copy `a` to result, and insert the 16-bit integer `i` into result
+/// at the location specified by `index`.
+#[inline(always)]
+#[target_feature = "+avx"]
+pub unsafe fn _mm256_insert_epi16(a: i16x16, i: i16, index: i32) -> i16x16 {
+    let c = a;
+    c.replace(index as u32 & 15, i)
+}
+
+/// Copy `a` to result, and insert the 32-bit integer `i` into result
+/// at the location specified by `index`.
+#[inline(always)]
+#[target_feature = "+avx"]
+pub unsafe fn _mm256_insert_epi32(a: i32x8, i: i32, index: i32) -> i32x8 {
+    let c = a;
+    c.replace(index as u32 & 7, i)
+}
+
+/// Copy `a` to result, and insert the 64-bit integer `i` into result
+/// at the location specified by `index`.
+#[inline(always)]
+#[target_feature = "+avx"]
+pub unsafe fn _mm256_insert_epi64(a: i64x4, i: i64, index: i32) -> i64x4 {
+    let c = a;
+    c.replace(index as u32 & 3, i)
+}
+
+/// Casts vector of type __m128 to type __m256;
+/// the upper 128 bits of the result are undefined.
+#[inline(always)]
+#[target_feature = "+avx"]
+pub unsafe fn _mm256_castps128_ps256(a: f32x4) -> f32x8 {
+    // FIXME simd_shuffle8(a, a, [0, 1, 2, 3, -1, -1, -1, -1])
+    simd_shuffle8(a, a, [0, 1, 2, 3, 0, 0, 0, 0])
+}
+
+/// Casts vector of type __m128d to type __m256d;
+/// the upper 128 bits of the result are undefined.
+#[inline(always)]
+#[target_feature = "+avx"]
+pub unsafe fn _mm256_castpd128_pd256(a: f64x2) -> f64x4 {
+    // FIXME simd_shuffle4(a, a, [0, 1, -1, -1])
+    simd_shuffle4(a, a, [0, 1, 0, 0])
+}
+
+/// Casts vector of type __m128i to type __m256i;
+/// the upper 128 bits of the result are undefined.
+#[inline(always)]
+#[target_feature = "+avx"]
+pub unsafe fn _mm256_castsi128_si256(a: i64x2) -> i64x4 {
+    // FIXME simd_shuffle4(a, a, [0, 1, -1, -1])
+    simd_shuffle4(a, a, [0, 1, 0, 0])
+}
+
 /// Return vector of type `f32x8` with undefined elements.
 #[inline(always)]
 #[target_feature = "+avx"]
@ -765,6 +1233,18 @@ extern "C" {
    fn vhsubpd(a: f64x4, b: f64x4) -> f64x4;
    #[link_name = "llvm.x86.avx.hsub.ps.256"]
    fn vhsubps(a: f32x8, b: f32x8) -> f32x8;
+    #[link_name = "llvm.x86.sse2.cmp.pd"]
+    fn vcmppd(a: f64x2, b: f64x2, imm8: u8) -> f64x2;
+    #[link_name = "llvm.x86.avx.cmp.pd.256"]
+    fn vcmppd256(a: f64x4, b: f64x4, imm8: u8) -> f64x4;
+    #[link_name = "llvm.x86.sse.cmp.ps"]
+    fn vcmpps(a: f32x4, b: f32x4, imm8: u8) -> f32x4;
+    #[link_name = "llvm.x86.avx.cmp.ps.256"]
+    fn vcmpps256(a: f32x8, b: f32x8, imm8: u8) -> f32x8;
+    #[link_name = "llvm.x86.sse2.cmp.sd"]
+    fn vcmpsd(a: f64x2, b: f64x2, imm8: u8) -> f64x2;
+    #[link_name = "llvm.x86.sse.cmp.ss"]
+    fn vcmpss(a: f32x4, b: f32x4, imm8: u8) -> f32x4;
    #[link_name = "llvm.x86.avx.cvtdq2.ps.256"]
    fn vcvtdq2ps(a: i32x8) -> f32x8;
    #[link_name = "llvm.x86.avx.cvt.pd2.ps.256"]
@ -785,6 +1265,20 @@ extern "C" {
    fn vpermilps256(a: f32x8, b: i32x8) -> f32x8;
    #[link_name = "llvm.x86.avx.vpermilvar.ps"]
    fn vpermilps(a: f32x4, b: i32x4) -> f32x4;
+    #[link_name = "llvm.x86.avx.vpermilvar.pd.256"]
+    fn vpermilpd256(a: f64x4, b: i64x4) -> f64x4;
+    #[link_name = "llvm.x86.avx.vpermilvar.pd"]
+    fn vpermilpd(a: f64x2, b: i64x2) -> f64x2;
+    #[link_name = "llvm.x86.avx.vperm2f128.ps.256"]
+    fn vperm2f128ps256(a: f32x8, b: f32x8, imm8: i8) -> f32x8;
+    #[link_name = "llvm.x86.avx.vperm2f128.pd.256"]
+    fn vperm2f128pd256(a: f64x4, b: f64x4, imm8: i8) -> f64x4;
+    #[link_name = "llvm.x86.avx.vperm2f128.si.256"]
+    fn vperm2f128si256(a: i32x8, b: i32x8, imm8: i8) -> i32x8;
+    #[link_name = "llvm.x86.avx.vbroadcastf128.ps.256"]
+    fn vbroadcastf128ps256(a: &f32x4) -> f32x8;
+    #[link_name = "llvm.x86.avx.vbroadcastf128.pd.256"]
+    fn vbroadcastf128pd256(a: &f64x2) -> f64x4;
 }

 #[cfg(test)]
@ -1176,6 +1670,64 @@ mod tests {
        assert_eq!(r, a);
    }

+    #[simd_test = "avx"]
+    unsafe fn _mm_cmp_pd() {
+        let a = f64x2::new(4.0, 9.0);
+        let b = f64x2::new(4.0, 3.0);
+        let r = avx::_mm_cmp_pd(a, b, avx::_CMP_GE_OS);
+        assert!(r.extract(0).is_nan());
+        assert!(r.extract(1).is_nan());
+    }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm256_cmp_pd() {
+        let a = f64x4::new(1.0, 2.0, 3.0, 4.0);
+        let b = f64x4::new(5.0, 6.0, 7.0, 8.0);
+        let r = avx::_mm256_cmp_pd(a, b, avx::_CMP_GE_OS);
+        let e = f64x4::splat(0.0);
+        assert_eq!(r, e);
+    }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm_cmp_ps() {
+        let a = f32x4::new(4.0, 3.0, 2.0, 5.0);
+        let b = f32x4::new(4.0, 9.0, 16.0, 25.0);
+        let r = avx::_mm_cmp_ps(a, b, avx::_CMP_GE_OS);
+        assert!(r.extract(0).is_nan());
+        assert_eq!(r.extract(1), 0.0);
+        assert_eq!(r.extract(2), 0.0);
+        assert_eq!(r.extract(3), 0.0);
+    }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm256_cmp_ps() {
+        let a = f32x8::new(1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0);
+        let b = f32x8::new(5.0, 6.0, 7.0, 8.0, 5.0, 6.0, 7.0, 8.0);
+        let r = avx::_mm256_cmp_ps(a, b, avx::_CMP_GE_OS);
+        let e = f32x8::splat(0.0);
+        assert_eq!(r, e);
+    }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm_cmp_sd() {
+        let a = f64x2::new(4.0, 9.0);
+        let b = f64x2::new(4.0, 3.0);
+        let r = avx::_mm_cmp_sd(a, b, avx::_CMP_GE_OS);
+        assert!(r.extract(0).is_nan());
+        assert_eq!(r.extract(1), 9.0);
+    }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm_cmp_ss() {
+        let a = f32x4::new(4.0, 3.0, 2.0, 5.0);
+        let b = f32x4::new(4.0, 9.0, 16.0, 25.0);
+        let r = avx::_mm_cmp_ss(a, b, avx::_CMP_GE_OS);
+        assert!(r.extract(0).is_nan());
+        assert_eq!(r.extract(1), 3.0);
+        assert_eq!(r.extract(2), 2.0);
+        assert_eq!(r.extract(3), 5.0);
+    }
+
    #[simd_test = "avx"]
    unsafe fn _mm256_cvtepi32_pd() {
        let a = i32x4::new(4, 9, 16, 25);
@ -1333,4 +1885,181 @@ mod tests {
        let e = f32x8::new(5.0, 2.0, 3.0, 4.0, 50.0, 64.0, 9.0, 8.0);
        assert_eq!(r, e);
    }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm_permute_ps() {
+        let a = f32x4::new(4.0, 3.0, 2.0, 5.0);
+        let r = avx::_mm_permute_ps(a, 0x1b);
+        let e = f32x4::new(5.0, 2.0, 3.0, 4.0);
+        assert_eq!(r, e);
+    }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm256_permutevar_pd() {
+        let a = f64x4::new(4.0, 3.0, 2.0, 5.0);
+        let b = i64x4::new(1, 2, 3, 4);
+        let r = avx::_mm256_permutevar_pd(a, b);
+        let e = f64x4::new(4.0, 3.0, 5.0, 2.0);
+        assert_eq!(r, e);
+    }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm_permutevar_pd() {
+        let a = f64x2::new(4.0, 3.0);
+        let b = i64x2::new(3, 0);
+        let r = avx::_mm_permutevar_pd(a, b);
+        let e = f64x2::new(3.0, 4.0);
+        assert_eq!(r, e);
+    }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm256_permute_pd() {
+        let a = f64x4::new(4.0, 3.0, 2.0, 5.0);
+        let r = avx::_mm256_permute_pd(a, 5);
+        let e = f64x4::new(3.0, 4.0, 5.0, 2.0);
+        assert_eq!(r, e);
+    }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm_permute_pd() {
+        let a = f64x2::new(4.0, 3.0);
+        let r = avx::_mm_permute_pd(a, 1);
+        let e = f64x2::new(3.0, 4.0);
+        assert_eq!(r, e);
+    }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm256_permute2f128_ps() {
+        let a = f32x8::new(1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0);
+        let b = f32x8::new(5.0, 6.0, 7.0, 8.0, 5.0, 6.0, 7.0, 8.0);
+        let r = avx::_mm256_permute2f128_ps(a, b, 0x13);
+        let e = f32x8::new(5.0, 6.0, 7.0, 8.0, 1.0, 2.0, 3.0, 4.0);
+        assert_eq!(r, e);
+    }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm256_permute2f128_pd() {
+        let a = f64x4::new(1.0, 2.0, 3.0, 4.0);
+        let b = f64x4::new(5.0, 6.0, 7.0, 8.0);
+        let r = avx::_mm256_permute2f128_pd(a, b, 0x31);
+        let e = f64x4::new(3.0, 4.0, 7.0, 8.0);
+        assert_eq!(r, e);
+    }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm256_permute2f128_si256() {
+        let a = i32x8::new(1, 2, 3, 4, 1, 2, 3, 4);
+        let b = i32x8::new(5, 6, 7, 8, 5, 6, 7, 8);
+        let r = avx::_mm256_permute2f128_si256(a, b, 0x20);
+        let e = i32x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+        assert_eq!(r, e);
+    }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm256_broadcast_ss() {
+        let r = avx::_mm256_broadcast_ss(&3.0);
+        let e = f32x8::splat(3.0);
+        assert_eq!(r, e);
+    }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm_broadcast_ss() {
+        let r = avx::_mm_broadcast_ss(&3.0);
+        let e = f32x4::splat(3.0);
+        assert_eq!(r, e);
+    }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm256_broadcast_sd() {
+        let r = avx::_mm256_broadcast_sd(&3.0);
+        let e = f64x4::splat(3.0);
+        assert_eq!(r, e);
+    }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm256_broadcast_ps() {
+        let a = f32x4::new(4.0, 3.0, 2.0, 5.0);
+        let r = avx::_mm256_broadcast_ps(&a);
+        let e = f32x8::new(4.0, 3.0, 2.0, 5.0, 4.0, 3.0, 2.0, 5.0);
+        assert_eq!(r, e);
+    }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm256_broadcast_pd() {
+        let a = f64x2::new(4.0, 3.0);
+        let r = avx::_mm256_broadcast_pd(&a);
+        let e = f64x4::new(4.0, 3.0, 4.0, 3.0);
+        assert_eq!(r, e);
+    }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm256_insertf128_ps() {
+        let a = f32x8::new(4.0, 3.0, 2.0, 5.0, 8.0, 9.0, 64.0, 50.0);
+        let b = f32x4::new(4.0, 9.0, 16.0, 25.0);
+        let r = avx::_mm256_insertf128_ps(a, b, 0);
+        let e = f32x8::new(4.0, 9.0, 16.0, 25.0, 8.0, 9.0, 64.0, 50.0);
+        assert_eq!(r, e);
+    }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm256_insertf128_pd() {
+        let a = f64x4::new(1.0, 2.0, 3.0, 4.0);
+        let b = f64x2::new(5.0, 6.0);
+        let r = avx::_mm256_insertf128_pd(a, b, 0);
+        let e = f64x4::new(5.0, 6.0, 3.0, 4.0);
+        assert_eq!(r, e);
+    }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm256_insertf128_si256() {
+        let a = i64x4::new(1, 2, 3, 4);
+        let b = i64x2::new(5, 6);
+        let r = avx::_mm256_insertf128_si256(a, b, 0);
+        let e = i64x4::new(5, 6, 3, 4);
+        assert_eq!(r, e);
+    }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm256_insert_epi8() {
+        let a = i8x32::new(
+            1, 2, 3, 4, 5, 6, 7, 8,
+            9, 10, 11, 12, 13, 14, 15, 16,
+            17, 18, 19, 20, 21, 22, 23, 24,
+            25, 26, 27, 28, 29, 30, 31, 32);
+        let r = avx::_mm256_insert_epi8(a, 0, 31);
+        let e = i8x32::new(
+            1, 2, 3, 4, 5, 6, 7, 8,
+            9, 10, 11, 12, 13, 14, 15, 16,
+            17, 18, 19, 20, 21, 22, 23, 24,
+            25, 26, 27, 28, 29, 30, 31, 0);
+        assert_eq!(r, e);
+    }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm256_insert_epi16() {
+        let a = i16x16::new(
+            0, 1, 2, 3, 4, 5, 6, 7,
+            8, 9, 10, 11, 12, 13, 14, 15);
+        let r = avx::_mm256_insert_epi16(a, 0, 15);
+        let e = i16x16::new(
+            0, 1, 2, 3, 4, 5, 6, 7,
+            8, 9, 10, 11, 12, 13, 14, 0);
+        assert_eq!(r, e);
+    }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm256_insert_epi32() {
+        let a = i32x8::new(1, 2, 3, 4, 5, 6, 7, 8);
+        let r = avx::_mm256_insert_epi32(a, 0, 7);
+        let e = i32x8::new(1, 2, 3, 4, 5, 6, 7, 0);
+        assert_eq!(r, e);
+    }
+
+    #[simd_test = "avx"]
+    unsafe fn _mm256_insert_epi64() {
+        let a = i64x4::new(1, 2, 3, 4);
+        let r = avx::_mm256_insert_epi64(a, 0, 3);
+        let e = i64x4::new(1, 2, 3, 0);
+        assert_eq!(r, e);
+    }
 }
--- a/library/stdarch/src/x86/macros.rs
+++ b/library/stdarch/src/x86/macros.rs
@ -261,3 +261,43 @@ macro_rules! constify_imm8 {
        }
    }
 }
+
+macro_rules! constify_imm6 {
+    ($imm8:expr, $expand:ident) => {
+        #[allow(overflowing_literals)]
+        match $imm8 & 0b1_1111 {
+            0 => $expand!(0),
+            1 => $expand!(1),
+            2 => $expand!(2),
+            3 => $expand!(3),
+            4 => $expand!(4),
+            5 => $expand!(5),
+            6 => $expand!(6),
+            7 => $expand!(7),
+            8 => $expand!(8),
+            9 => $expand!(9),
+            10 => $expand!(10),
+            11 => $expand!(11),
+            12 => $expand!(12),
+            13 => $expand!(13),
+            14 => $expand!(14),
+            15 => $expand!(15),
+            16 => $expand!(16),
+            17 => $expand!(17),
+            18 => $expand!(18),
+            19 => $expand!(19),
+            20 => $expand!(20),
+            21 => $expand!(21),
+            22 => $expand!(22),
+            23 => $expand!(23),
+            24 => $expand!(24),
+            25 => $expand!(25),
+            26 => $expand!(26),
+            27 => $expand!(27),
+            28 => $expand!(28),
+            29 => $expand!(29),
+            30 => $expand!(30),
+            _ => $expand!(31),
+        }
+    }
+}
--- a/library/stdarch/src/x86/sse.rs
+++ b/library/stdarch/src/x86/sse.rs
@ -868,6 +868,13 @@ pub unsafe fn _mm_prefetch(p: *const c_void, strategy: i8) {
    pref!(strategy)
 }

+/// Return vector of type __m128 with undefined elements.
+#[inline(always)]
+#[target_feature = "+sse"]
+pub unsafe fn _mm_undefined_ps() -> f32x4 {
+    f32x4::splat(mem::uninitialized())
+}
+
 #[allow(improper_ctypes)]
 extern {
    #[link_name = "llvm.x86.sse.add.ss"]
--- a/library/stdarch/src/x86/sse2.rs
+++ b/library/stdarch/src/x86/sse2.rs
@ -1827,6 +1827,13 @@ pub unsafe fn _mm_load1_pd(mem_addr: *const f64) -> f64x2 {
    f64x2::new(d, d)
 }

+/// Return vector of type __m128d with undefined elements.
+#[inline(always)]
+#[target_feature = "+sse2"]
+pub unsafe fn _mm_undefined_pd() -> f64x2 {
+    f64x2::splat(mem::uninitialized())
+}
+
 #[allow(improper_ctypes)]
 extern {
    #[link_name = "llvm.x86.sse2.pause"]