Squashing
This commit is contained in:
parent
299b2f3c29
commit
b787226061
3 changed files with 127 additions and 11 deletions
|
|
@ -154,11 +154,11 @@ sse
|
|||
* [ ] `_mm_storeu_ps`
|
||||
* [ ] `_mm_storer_ps`
|
||||
* [ ] `_mm_move_ss`
|
||||
* [ ] `_mm_shuffle_ps`
|
||||
* [x] `_mm_shuffle_ps`
|
||||
* [x] `_mm_unpackhi_ps`
|
||||
* [ ] `_mm_unpacklo_ps`
|
||||
* [ ] `_mm_movehl_ps`
|
||||
* [ ] `_mm_movelh_ps`
|
||||
* [x] `_mm_unpacklo_ps`
|
||||
* [x] `_mm_movehl_ps`
|
||||
* [x] `_mm_movelh_ps`
|
||||
* [x] `_mm_movemask_ps`
|
||||
* [ ] `_mm_undefined_ps`
|
||||
|
||||
|
|
|
|||
|
|
@ -269,9 +269,7 @@ pub fn assert(fnptr: usize, fnname: &str, expected: &str) {
|
|||
}
|
||||
}
|
||||
|
||||
let probably_only_one_instruction = function.instrs.len() < 20;
|
||||
|
||||
if found && probably_only_one_instruction {
|
||||
if found {
|
||||
return
|
||||
}
|
||||
|
||||
|
|
@ -288,7 +286,5 @@ pub fn assert(fnptr: usize, fnname: &str, expected: &str) {
|
|||
|
||||
if !found {
|
||||
panic!("failed to find instruction `{}` in the disassembly", expected);
|
||||
} else if !probably_only_one_instruction {
|
||||
panic!("too many instructions in the disassembly");
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -164,14 +164,97 @@ pub fn _mm_max_ps(a: f32x4, b: f32x4) -> f32x4 {
|
|||
unsafe { maxps(a, b) }
|
||||
}
|
||||
|
||||
/// Unpack and interleave single-precision (32-bit) floating-point elements
|
||||
/// from the high half of `a` and `b`;
|
||||
/// Shuffle packed single-precision (32-bit) floating-point elements in `a` and `b`
|
||||
/// using `mask`.
|
||||
/// The lower half of the result takes values from `a` and the higher half from `b`.
|
||||
/// `mask` is split into four 2-bit fields, each indexing the input element for one lane.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+sse"]
|
||||
#[cfg_attr(test, assert_instr(shufps))]
|
||||
pub fn _mm_shuffle_ps(a: f32x4, b: f32x4, mask: i32) -> f32x4 {
|
||||
let mask = (mask & 0xFF) as u8;
|
||||
|
||||
macro_rules! shuffle_done {
|
||||
($x01:expr, $x23:expr, $x45:expr, $x67:expr) => {
|
||||
unsafe {
|
||||
simd_shuffle4(a, b, [$x01, $x23, $x45, $x67])
|
||||
}
|
||||
}
|
||||
}
|
||||
macro_rules! shuffle_x67 {
|
||||
($x01:expr, $x23:expr, $x45:expr) => {
|
||||
match (mask >> 6) & 0b11 {
|
||||
0b00 => shuffle_done!($x01, $x23, $x45, 4),
|
||||
0b01 => shuffle_done!($x01, $x23, $x45, 5),
|
||||
0b10 => shuffle_done!($x01, $x23, $x45, 6),
|
||||
_ => shuffle_done!($x01, $x23, $x45, 7),
|
||||
}
|
||||
}
|
||||
}
|
||||
macro_rules! shuffle_x45 {
|
||||
($x01:expr, $x23:expr) => {
|
||||
match (mask >> 4) & 0b11 {
|
||||
0b00 => shuffle_x67!($x01, $x23, 4),
|
||||
0b01 => shuffle_x67!($x01, $x23, 5),
|
||||
0b10 => shuffle_x67!($x01, $x23, 6),
|
||||
_ => shuffle_x67!($x01, $x23, 7),
|
||||
}
|
||||
}
|
||||
}
|
||||
macro_rules! shuffle_x23 {
|
||||
($x01:expr) => {
|
||||
match (mask >> 2) & 0b11 {
|
||||
0b00 => shuffle_x45!($x01, 0),
|
||||
0b01 => shuffle_x45!($x01, 1),
|
||||
0b10 => shuffle_x45!($x01, 2),
|
||||
_ => shuffle_x45!($x01, 3),
|
||||
}
|
||||
}
|
||||
}
|
||||
match mask & 0b11 {
|
||||
0b00 => shuffle_x23!(0),
|
||||
0b01 => shuffle_x23!(1),
|
||||
0b10 => shuffle_x23!(2),
|
||||
_ => shuffle_x23!(3),
|
||||
}
|
||||
}
|
||||
|
||||
/// Unpack and interleave single-precision (32-bit) floating-point elements
|
||||
/// from the higher half of `a` and `b`.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+sse"]
|
||||
#[cfg_attr(test, assert_instr(unpckhps))]
|
||||
pub fn _mm_unpackhi_ps(a: f32x4, b: f32x4) -> f32x4 {
|
||||
unsafe { simd_shuffle4(a, b, [2, 6, 3, 7]) }
|
||||
}
|
||||
|
||||
/// Unpack and interleave single-precision (32-bit) floating-point elements
|
||||
/// from the lower half of `a` and `b`.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+sse"]
|
||||
#[cfg_attr(test, assert_instr(unpcklps))]
|
||||
pub fn _mm_unpacklo_ps(a: f32x4, b: f32x4) -> f32x4 {
|
||||
unsafe { simd_shuffle4(a, b, [0, 4, 1, 5]) }
|
||||
}
|
||||
|
||||
/// Combine higher half of `a` and `b`. The higher half of `b` occupies the lower
|
||||
/// half of result.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+sse"]
|
||||
#[cfg_attr(test, assert_instr(movhlps))]
|
||||
pub fn _mm_movehl_ps(a: f32x4, b: f32x4) -> f32x4 {
|
||||
unsafe { simd_shuffle4(a, b, [6, 7, 2, 3]) }
|
||||
}
|
||||
|
||||
/// Combine lower half of `a` and `b`. The lower half of `b` occupies the higher
|
||||
/// half of result.
|
||||
#[inline(always)]
|
||||
#[target_feature = "+sse"]
|
||||
#[cfg_attr(test, assert_instr(unpcklpd))]
|
||||
pub fn _mm_movelh_ps(a: f32x4, b: f32x4) -> f32x4 {
|
||||
unsafe { simd_shuffle4(a, b, [0, 1, 4, 5]) }
|
||||
}
|
||||
|
||||
/// Return a mask of the most significant bit of each element in `a`.
|
||||
///
|
||||
/// The mask is stored in the 4 least significant bits of the return value.
|
||||
|
|
@ -384,6 +467,16 @@ mod tests {
|
|||
assert_eq!(r, f32x4::new(-1.0, 20.0, 0.0, -5.0));
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[target_feature = "+sse"]
|
||||
fn _mm_shuffle_ps() {
|
||||
let a = f32x4::new(1.0, 2.0, 3.0, 4.0);
|
||||
let b = f32x4::new(5.0, 6.0, 7.0, 8.0);
|
||||
let mask = 0b00_01_01_11;
|
||||
let r = sse::_mm_shuffle_ps(a, b, mask);
|
||||
assert_eq!(r, f32x4::new(4.0, 2.0, 6.0, 5.0));
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[target_feature = "+sse"]
|
||||
fn _mm_unpackhi_ps() {
|
||||
|
|
@ -393,6 +486,33 @@ mod tests {
|
|||
assert_eq!(r, f32x4::new(3.0, 7.0, 4.0, 8.0));
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[target_feature = "+sse"]
|
||||
fn _mm_unpacklo_ps() {
|
||||
let a = f32x4::new(1.0, 2.0, 3.0, 4.0);
|
||||
let b = f32x4::new(5.0, 6.0, 7.0, 8.0);
|
||||
let r = sse::_mm_unpacklo_ps(a, b);
|
||||
assert_eq!(r, f32x4::new(1.0, 5.0, 2.0, 6.0));
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[target_feature = "+sse"]
|
||||
fn _mm_movehl_ps() {
|
||||
let a = f32x4::new(1.0, 2.0, 3.0, 4.0);
|
||||
let b = f32x4::new(5.0, 6.0, 7.0, 8.0);
|
||||
let r = sse::_mm_movehl_ps(a, b);
|
||||
assert_eq!(r, f32x4::new(7.0, 8.0, 3.0, 4.0));
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[target_feature = "+sse"]
|
||||
fn _mm_movelh_ps() {
|
||||
let a = f32x4::new(1.0, 2.0, 3.0, 4.0);
|
||||
let b = f32x4::new(5.0, 6.0, 7.0, 8.0);
|
||||
let r = sse::_mm_movelh_ps(a, b);
|
||||
assert_eq!(r, f32x4::new(1.0, 2.0, 5.0, 6.0));
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[target_feature = "+sse"]
|
||||
fn _mm_movemask_ps() {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue