This commit is contained in:
Andrew Gallant 2016-11-27 01:06:39 -05:00
parent c709196f7a
commit 12121fc2bb
8 changed files with 867 additions and 81 deletions

View file

@ -12,3 +12,4 @@ license = "MIT"
[profile.release]
debug = true
opt-level = 3

View file

@ -1,28 +1,24 @@
extern crate stdsimd;
use std::env;
use std::io::Write;
use stdsimd as s;
fn main() {
let arg1: f64 = env::args().nth(1).unwrap().parse().unwrap();
let arg2: f64 = env::args().nth(2).unwrap().parse().unwrap();
let arg3: f64 = env::args().nth(3).unwrap().parse().unwrap();
let arg4: f64 = env::args().nth(4).unwrap().parse().unwrap();
let arg1: u8 = env::args().nth(1).unwrap().parse().unwrap();
let arg2: u8 = env::args().nth(2).unwrap().parse().unwrap();
let arg3: u8 = env::args().nth(3).unwrap().parse().unwrap();
let arg4: u8 = env::args().nth(4).unwrap().parse().unwrap();
unsafe {
let a1 = s::_mm_load_pd(&(arg1, arg2) as *const _ as *const f64);
let b1 = s::_mm_load_pd(&(arg3, arg4) as *const _ as *const f64);
// println!("{:?}, {:?}", a, b);
let r1 = s::_mm_add_sd(a1, b1);
// println!("{:?}", r1);
let mut r2: (f64, f64) = (0.0, 0.0);
s::_mm_store_pd(&mut r2 as *mut _ as *mut f64, r1);
if r2 == (4.0, 2.0) {
::std::io::stdout().write_all(b"yes\n").unwrap();
} else {
::std::io::stdout().write_all(b"NO\n").unwrap();
}
// println!("{:?}", r2);
s::_mm_lfence();
s::_mm_pause();
let a = s::u8x16::new(
arg1, arg1, arg1, arg1, arg1, arg1, arg1, arg1,
arg2, arg2, arg2, arg2, arg2, arg2, arg2, arg2);
let b = s::u8x16::new(
arg3, arg3, arg3, arg3, arg3, arg3, arg3, arg3,
arg4, arg4, arg4, arg4, arg4, arg4, arg4, arg4);
let r = s::_mm_sad_epu8(a.as_m128i(), b.as_m128i());
println!("{:?}", s::u64x2::from(r));
}
}

View file

@ -1,36 +1,14 @@
#![allow(dead_code)]
#![feature(platform_intrinsics, repr_simd)]
#![feature(link_llvm_intrinsics, platform_intrinsics, repr_simd, simd_ffi)]
// pub use v128::{__m128, __m128d, __m128i};
pub use v128::*;
pub use v64::__m64;
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
pub use x86::*;
mod simd;
mod v128;
mod v64;
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
mod x86;
extern "platform-intrinsic" {
fn simd_eq<T, U>(x: T, y: T) -> U;
fn simd_ne<T, U>(x: T, y: T) -> U;
fn simd_lt<T, U>(x: T, y: T) -> U;
fn simd_le<T, U>(x: T, y: T) -> U;
fn simd_gt<T, U>(x: T, y: T) -> U;
fn simd_ge<T, U>(x: T, y: T) -> U;
fn simd_shuffle2<T, U>(x: T, y: T, idx: [u32; 2]) -> U;
fn simd_shuffle4<T, U>(x: T, y: T, idx: [u32; 4]) -> U;
fn simd_shuffle8<T, U>(x: T, y: T, idx: [u32; 8]) -> U;
fn simd_shuffle16<T, U>(x: T, y: T, idx: [u32; 16]) -> U;
fn simd_insert<T, U>(x: T, idx: u32, val: U) -> T;
fn simd_extract<T, U>(x: T, idx: u32) -> U;
fn simd_cast<T, U>(x: T) -> U;
fn simd_add<T>(x: T, y: T) -> T;
fn simd_sub<T>(x: T, y: T) -> T;
fn simd_mul<T>(x: T, y: T) -> T;
fn simd_div<T>(x: T, y: T) -> T;
fn simd_shl<T>(x: T, y: T) -> T;
fn simd_shr<T>(x: T, y: T) -> T;
fn simd_and<T>(x: T, y: T) -> T;
fn simd_or<T>(x: T, y: T) -> T;
fn simd_xor<T>(x: T, y: T) -> T;
}

View file

@ -0,0 +1,28 @@
// Raw rustc "platform intrinsics": generic SIMD operations the compiler
// lowers directly to LLVM vector instructions. These are re-exported `pub`
// so the sibling `v128`/`v64`/`x86` modules can build safe-ish wrappers on
// top of them. Each is generic over the vector type; the compiler checks
// lane counts/types at monomorphization time.
extern "platform-intrinsic" {
// Lane-wise comparisons; `U` is the corresponding boolean vector type.
pub fn simd_eq<T, U>(x: T, y: T) -> U;
pub fn simd_ne<T, U>(x: T, y: T) -> U;
pub fn simd_lt<T, U>(x: T, y: T) -> U;
pub fn simd_le<T, U>(x: T, y: T) -> U;
pub fn simd_gt<T, U>(x: T, y: T) -> U;
pub fn simd_ge<T, U>(x: T, y: T) -> U;
// Shuffles: `idx` selects lanes from the concatenation of `x` and `y`;
// the array length fixes the number of result lanes.
pub fn simd_shuffle2<T, U>(x: T, y: T, idx: [u32; 2]) -> U;
pub fn simd_shuffle4<T, U>(x: T, y: T, idx: [u32; 4]) -> U;
pub fn simd_shuffle8<T, U>(x: T, y: T, idx: [u32; 8]) -> U;
pub fn simd_shuffle16<T, U>(x: T, y: T, idx: [u32; 16]) -> U;
// Single-lane access; `idx` must be in bounds (callers assert in debug).
pub fn simd_insert<T, U>(x: T, idx: u32, val: U) -> T;
pub fn simd_extract<T, U>(x: T, idx: u32) -> U;
// Lane-wise numeric cast between vector types of equal lane count.
pub fn simd_cast<T, U>(x: T) -> U;
// Lane-wise arithmetic / bitwise operations.
pub fn simd_add<T>(x: T, y: T) -> T;
pub fn simd_sub<T>(x: T, y: T) -> T;
pub fn simd_mul<T>(x: T, y: T) -> T;
pub fn simd_div<T>(x: T, y: T) -> T;
pub fn simd_shl<T>(x: T, y: T) -> T;
pub fn simd_shr<T>(x: T, y: T) -> T;
pub fn simd_and<T>(x: T, y: T) -> T;
pub fn simd_or<T>(x: T, y: T) -> T;
pub fn simd_xor<T>(x: T, y: T) -> T;
}

160
library/stdarch/src/v128.rs Normal file
View file

@ -0,0 +1,160 @@
use std::mem::transmute;
use simd::*;
macro_rules! define_ty {
($name:ident, $($elty:ident),+) => {
#[repr(simd)]
#[derive(Clone, Copy, Debug)]
#[allow(non_camel_case_types)]
pub struct $name($($elty),*);
}
}
macro_rules! define_ty_internal {
($name:ident, $($elty:ident),+) => {
#[repr(simd)]
#[derive(Clone, Copy, Debug, PartialEq)]
#[allow(non_camel_case_types)]
pub struct $name($($elty),*);
}
}
macro_rules! define_impl {
($name:ident, $boolname:ident, $elemty:ident, $nelems:expr,
$($elname:ident),+) => {
impl From<__m128> for $name {
#[inline]
fn from(v: __m128) -> $name { unsafe { transmute(v) } }
}
impl From<__m128i> for $name {
#[inline]
fn from(v: __m128i) -> $name { unsafe { transmute(v) } }
}
impl From<__m128d> for $name {
#[inline]
fn from(v: __m128d) -> $name { unsafe { transmute(v) } }
}
impl $name {
#[inline]
pub fn new($($elname: $elemty),*) -> $name {
$name($($elname),*)
}
#[inline]
pub fn splat(value: $elemty) -> $name {
$name($({
#[allow(non_camel_case_types, dead_code)]
struct $elname;
value
}),*)
}
#[inline]
pub fn eq(self, other: $name) -> $boolname {
unsafe { simd_eq(self, other) }
}
#[inline]
pub fn ne(self, other: $name) -> $boolname {
unsafe { simd_ne(self, other) }
}
#[inline]
pub fn lt(self, other: $name) -> $boolname {
unsafe { simd_lt(self, other) }
}
#[inline]
pub fn le(self, other: $name) -> $boolname {
unsafe { simd_le(self, other) }
}
#[inline]
pub fn gt(self, other: $name) -> $boolname {
unsafe { simd_gt(self, other) }
}
#[inline]
pub fn ge(self, other: $name) -> $boolname {
unsafe { simd_ge(self, other) }
}
#[inline]
pub unsafe fn extract(self, idx: u32) -> $elemty {
debug_assert!(idx < $nelems);
simd_extract(self, idx)
}
#[inline]
pub unsafe fn insert(self, idx: u32, val: $elemty) -> $name {
debug_assert!(idx < $nelems);
simd_insert(self, idx, val)
}
#[inline]
pub fn as_m128(self) -> __m128 { unsafe { transmute(self) } }
#[inline]
pub fn as_m128d(self) -> __m128d { unsafe { transmute(self) } }
#[inline]
pub fn as_m128i(self) -> __m128i { unsafe { transmute(self) } }
#[inline]
pub fn as_f32x4(self) -> f32x4 { unsafe { transmute(self) } }
#[inline]
pub fn as_f64x2(self) -> f64x2 { unsafe { transmute(self) } }
#[inline]
pub fn as_u8x16(self) -> u8x16 { unsafe { transmute(self) } }
}
}
}
define_ty! { __m128, f32, f32, f32, f32 }
define_ty! { __m128d, f64, f64 }
define_ty! { __m128i, u64, u64 }
define_ty_internal! { boolu64x2, u64, u64 }
define_ty_internal! { boolu32x4, u32, u32, u32, u32 }
define_ty_internal! { boolu16x8, u16, u16, u16, u16, u16, u16, u16, u16 }
define_ty_internal! {
boolu8x16, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8
}
define_ty_internal! { f64x2, f64, f64 }
define_impl! { f64x2, boolu64x2, f64, 2, x0, x1 }
define_ty_internal! { f32x4, f32, f32, f32, f32 }
// f32x4 has 4 lanes; the lane-count argument feeds the
// `debug_assert!(idx < $nelems)` in `extract`/`insert`, so passing 2 here
// (as originally written) would spuriously reject lanes 2 and 3 in debug
// builds. Every other invocation passes the true lane count.
define_impl! { f32x4, boolu32x4, f32, 4, x0, x1, x2, x3 }
define_ty_internal! { u64x2, u64, u64 }
define_impl! { u64x2, boolu64x2, u64, 2, x0, x1 }
define_ty_internal! { u32x4, u32, u32, u32, u32 }
define_impl! { u32x4, boolu32x4, u32, 4, x0, x1, x2, x3 }
define_ty_internal! { i32x4, i32, i32, i32, i32 }
define_impl! { i32x4, boolu32x4, i32, 4, x0, x1, x2, x3 }
define_ty_internal! { u16x8, u16, u16, u16, u16, u16, u16, u16, u16 }
define_impl! { u16x8, boolu16x8, u16, 8, x0, x1, x2, x3, x4, x5, x6, x7 }
define_ty_internal! { i16x8, i16, i16, i16, i16, i16, i16, i16, i16 }
define_impl! { i16x8, boolu16x8, i16, 8, x0, x1, x2, x3, x4, x5, x6, x7 }
define_ty_internal! {
u8x16, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8
}
define_impl! {
u8x16, boolu8x16, u8, 16,
x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15
}
define_ty_internal! {
i8x16, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8
}
define_impl! {
i8x16, boolu8x16, i8, 16,
x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15
}

105
library/stdarch/src/v64.rs Normal file
View file

@ -0,0 +1,105 @@
use std::mem::transmute;
use simd::*;
macro_rules! define_ty {
($name:ident, $($elty:ident),+) => {
#[repr(simd)]
#[derive(Clone, Copy, Debug)]
#[allow(non_camel_case_types)]
pub struct $name($($elty),*);
}
}
macro_rules! define_ty_internal {
($name:ident, $($elty:ident),+) => {
#[repr(simd)]
#[derive(Clone, Copy, Debug, PartialEq)]
#[allow(non_camel_case_types)]
pub struct $name($($elty),*);
}
}
macro_rules! define_impl {
($name:ident, $boolname:ident, $elemty:ident, $nelems:expr,
$($elname:ident),+) => {
impl From<__m64> for $name {
#[inline]
fn from(v: __m64) -> $name { unsafe { transmute(v) } }
}
impl $name {
#[inline]
pub fn new($($elname: $elemty),*) -> $name {
$name($($elname),*)
}
#[inline]
pub fn splat(value: $elemty) -> $name {
$name($({
#[allow(non_camel_case_types, dead_code)]
struct $elname;
value
}),*)
}
#[inline]
pub fn eq(self, other: $name) -> $boolname {
unsafe { simd_eq(self, other) }
}
#[inline]
pub fn ne(self, other: $name) -> $boolname {
unsafe { simd_ne(self, other) }
}
#[inline]
pub fn lt(self, other: $name) -> $boolname {
unsafe { simd_lt(self, other) }
}
#[inline]
pub fn le(self, other: $name) -> $boolname {
unsafe { simd_le(self, other) }
}
#[inline]
pub fn gt(self, other: $name) -> $boolname {
unsafe { simd_gt(self, other) }
}
#[inline]
pub fn ge(self, other: $name) -> $boolname {
unsafe { simd_ge(self, other) }
}
#[inline]
pub unsafe fn extract(self, idx: u32) -> $elemty {
debug_assert!(idx < $nelems);
simd_extract(self, idx)
}
#[inline]
pub unsafe fn insert(self, idx: u32, val: $elemty) -> $name {
debug_assert!(idx < $nelems);
simd_insert(self, idx, val)
}
#[inline]
pub fn as_m64(self) -> __m64 { unsafe { transmute(self) } }
#[inline]
pub fn as_u64(self) -> u64 { unsafe { transmute(self) } }
}
}
}
define_ty! { __m64, u64 }
define_ty_internal! { boolu64x1, u64 }
define_ty_internal! { boolu32x2, u32, u32 }
define_ty_internal! { u64x1, u64 }
define_impl! { u64x1, boolu64x1, u64, 1, x0 }
define_ty_internal! { u32x2, u32, u32 }
define_impl! { u32x2, boolu32x2, u32, 2, x0, x1 }

View file

@ -1,29 +1,5 @@
pub use self::sse::*;
pub use self::sse2::*;
mod sse;
mod sse2;
#[repr(simd)]
#[derive(Clone, Copy, Debug)]
#[allow(non_camel_case_types)]
pub struct __m128(f32, f32, f32, f32);
#[repr(simd)]
#[derive(Clone, Copy, Debug)]
#[allow(non_camel_case_types)]
pub struct __m128d(f64, f64);
#[repr(simd)]
#[derive(Clone, Copy, Debug)]
#[allow(non_camel_case_types)]
pub struct __m128i(u64, u64);
#[repr(simd)]
#[derive(Clone, Copy, Debug)]
#[allow(non_camel_case_types)]
pub struct f64x2(f64, f64);
#[repr(simd)]
#[derive(Clone, Copy, Debug)]
#[allow(non_camel_case_types)]
struct u8x16(u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8);

View file

@ -1,26 +1,568 @@
use std::mem::transmute;
use std::os::raw::c_void;
use super::{__m128d, __m128i, f64x2, u8x16};
use {simd_add, simd_extract, simd_insert};
use simd::*;
use v128::*;
use v64::*;
/// Provide a hint to the processor that the code sequence is a spin-wait loop.
///
/// This can help improve the performance and power consumption of spin-wait
/// loops.
#[inline]
pub unsafe fn _mm_pause() {
pause()
}
/// Invalidate and flush the cache line that contains `p` from all levels of
/// the cache hierarchy.
#[inline]
pub unsafe fn _mm_clflush(p: *mut c_void) {
clflush(p)
}
/// Perform a serializing operation on all load-from-memory instructions
/// that were issued prior to this instruction.
///
/// Guarantees that every load instruction that precedes, in program order, is
/// globally visible before any load instruction which follows the fence in
/// program order.
#[inline]
pub unsafe fn _mm_lfence() {
lfence()
}
/// Perform a serializing operation on all load-from-memory and store-to-memory
/// instructions that were issued prior to this instruction.
///
/// Guarantees that every memory access that precedes, in program order, the
/// memory fence instruction is globally visible before any memory instruction
/// which follows the fence in program order.
#[inline]
pub unsafe fn _mm_mfence() {
mfence()
}
/// Add packed 8-bit integers in "a" and "b", and return the results.
#[inline]
pub unsafe fn _mm_add_epi8(a: __m128i, b: __m128i) -> __m128i {
transmute(simd_add::<u8x16>(transmute(a), transmute(b)))
simd_add(u8x16::from(a), u8x16::from(b)).as_m128i()
}
/// Add packed 16-bit integers in "a" and "b", and return the results.
#[inline]
pub unsafe fn _mm_add_epi16(a: __m128i, b: __m128i) -> __m128i {
simd_add(u16x8::from(a), u16x8::from(b)).as_m128i()
}
/// Add packed 32-bit integers in "a" and "b", and return the results.
#[inline]
pub unsafe fn _mm_add_epi32(a: __m128i, b: __m128i) -> __m128i {
simd_add(u32x4::from(a), u32x4::from(b)).as_m128i()
}
/// Add 64-bit integers "a" and "b", and return the results.
#[inline]
unsafe fn _mm_add_si64(_a: __m64, _b: __m64) -> __m64 {
unimplemented!()
}
/// Add packed 64-bit integers in "a" and "b", and return the results.
#[inline]
pub unsafe fn _mm_add_epi64(a: __m128i, b: __m128i) -> __m128i {
simd_add(u64x2::from(a), u64x2::from(b)).as_m128i()
}
/// Add packed 8-bit integers in "a" and "b" using saturation, and return the
/// results.
#[inline]
pub unsafe fn _mm_adds_epi8(a: __m128i, b: __m128i) -> __m128i {
paddsb(i8x16::from(a), i8x16::from(b)).as_m128i()
}
/// Add packed 16-bit integers in "a" and "b" using saturation, and return the
/// results.
#[inline]
pub unsafe fn _mm_adds_epi16(a: __m128i, b: __m128i) -> __m128i {
paddsw(i16x8::from(a), i16x8::from(b)).as_m128i()
}
/// Add packed unsigned 8-bit integers in "a" and "b" using saturation, and
/// return the results.
#[inline]
pub unsafe fn _mm_adds_epu8(a: __m128i, b: __m128i) -> __m128i {
paddsub(u8x16::from(a), u8x16::from(b)).as_m128i()
}
/// Add packed unsigned 16-bit integers in "a" and "b" using saturation, and
/// return the results.
#[inline]
pub unsafe fn _mm_adds_epu16(a: __m128i, b: __m128i) -> __m128i {
paddsuw(u16x8::from(a), u16x8::from(b)).as_m128i()
}
/// Average packed unsigned 8-bit integers in "a" and "b", and return the
/// results.
#[inline]
pub unsafe fn _mm_avg_epu8(a: __m128i, b: __m128i) -> __m128i {
pavgb(u8x16::from(a), u8x16::from(b)).as_m128i()
}
/// Average packed unsigned 16-bit integers in "a" and "b", and return the
/// results.
#[inline]
pub unsafe fn _mm_avg_epu16(a: __m128i, b: __m128i) -> __m128i {
pavgw(u16x8::from(a), u16x8::from(b)).as_m128i()
}
/// Multiply packed signed 16-bit integers in "a" and "b", producing
/// intermediate signed 32-bit integers.
///
/// Horizontally add adjacent pairs of intermediate 32-bit integers, and pack
/// the results in "dst".
#[inline]
pub unsafe fn _mm_madd_epi16(a: __m128i, b: __m128i) -> __m128i {
pmaddwd(i16x8::from(a), i16x8::from(b)).as_m128i()
}
/// Compare packed 16-bit integers in `a` and `b`, and return the packed
/// maximum values.
#[inline]
pub unsafe fn _mm_max_epi16(a: __m128i, b: __m128i) -> __m128i {
pmaxsw(i16x8::from(a), i16x8::from(b)).as_m128i()
}
/// Compare packed unsigned 8-bit integers in `a` and `b`, and return the
/// packed maximum values.
#[inline]
pub unsafe fn _mm_max_epu8(a: __m128i, b: __m128i) -> __m128i {
pmaxub(u8x16::from(a), u8x16::from(b)).as_m128i()
}
/// Compare packed 16-bit integers in `a` and `b`, and return the packed
/// minimum values.
#[inline]
pub unsafe fn _mm_min_epi16(a: __m128i, b: __m128i) -> __m128i {
pminsw(i16x8::from(a), i16x8::from(b)).as_m128i()
}
/// Compare packed unsigned 8-bit integers in `a` and `b`, and return the
/// packed minimum values.
#[inline]
pub unsafe fn _mm_min_epu8(a: __m128i, b: __m128i) -> __m128i {
pminub(u8x16::from(a), u8x16::from(b)).as_m128i()
}
/// Multiply the packed 16-bit integers in `a` and `b`.
///
/// The multiplication produces intermediate 32-bit integers, and returns the
/// high 16 bits of the intermediate integers.
#[inline]
pub unsafe fn _mm_mulhi_epi16(a: __m128i, b: __m128i) -> __m128i {
pmulhw(i16x8::from(a), i16x8::from(b)).as_m128i()
}
/// Multiply the packed unsigned 16-bit integers in `a` and `b`.
///
/// The multiplication produces intermediate 32-bit integers, and returns the
/// high 16 bits of the intermediate integers.
#[inline]
pub unsafe fn _mm_mulhi_epu16(a: __m128i, b: __m128i) -> __m128i {
pmulhuw(u16x8::from(a), u16x8::from(b)).as_m128i()
}
/// Multiply the packed 16-bit integers in `a` and `b`.
///
/// The multiplication produces intermediate 32-bit integers, and returns the
/// low 16 bits of the intermediate integers.
#[inline]
pub unsafe fn _mm_mullo_epi16(a: __m128i, b: __m128i) -> __m128i {
simd_mul(i16x8::from(a), i16x8::from(b)).as_m128i()
}
/// Multiply the low unsigned 32-bit integers from `a` and `b`.
///
/// Return the unsigned 64-bit result.
#[inline]
unsafe fn _mm_mul_su32(_a: __m64, _b: __m64) -> __m64 {
unimplemented!()
}
/// Multiply the low unsigned 32-bit integers from each packed 64-bit element
/// in `a` and `b`.
///
/// Return the unsigned 64-bit results.
#[inline]
pub unsafe fn _mm_mul_epu32(a: __m128i, b: __m128i) -> __m128i {
pmuludq(u32x4::from(a), u32x4::from(b)).as_m128i()
}
/// Sum the absolute differences of packed unsigned 8-bit integers.
///
/// Compute the absolute differences of packed unsigned 8-bit integers in `a`
/// and `b`, then horizontally sum each consecutive 8 differences to produce
/// two unsigned 16-bit integers, and pack these unsigned 16-bit integers in
/// the low 16 bits of 64-bit elements returned.
#[inline]
pub unsafe fn _mm_sad_epu8(a: __m128i, b: __m128i) -> __m128i {
psadbw(u8x16::from(a), u8x16::from(b)).as_m128i()
}
#[inline]
pub unsafe fn _mm_add_sd(a: __m128d, b: __m128d) -> __m128d {
let alow = simd_extract::<f64x2, f64>(transmute(a), 0);
let blow = simd_extract::<f64x2, f64>(transmute(b), 0);
transmute(simd_insert::<f64x2, f64>(transmute(a), 0, alow + blow))
let (a, b) = (f64x2::from(a), f64x2::from(b));
a.insert(0, a.extract(0) + b.extract(0)).as_m128d()
}
#[inline]
pub unsafe fn _mm_add_pd(a: __m128d, b: __m128d) -> __m128d {
transmute(simd_add::<f64x2>(transmute(a), transmute(b)))
simd_add(f64x2::from(a), f64x2::from(b)).as_m128d()
}
#[inline]
pub unsafe fn _mm_load_pd(mem_addr: *const f64) -> __m128d {
*(mem_addr as *const __m128d)
}
#[inline]
pub unsafe fn _mm_store_pd(mem_addr: *mut f64, a: __m128d) {
*(mem_addr as *mut __m128d) = a;
}
#[allow(improper_ctypes)]
extern {
#[link_name = "llvm.x86.sse2.pause"]
pub fn pause();
#[link_name = "llvm.x86.sse2.clflush"]
pub fn clflush(p: *mut c_void);
#[link_name = "llvm.x86.sse2.lfence"]
pub fn lfence();
#[link_name = "llvm.x86.sse2.mfence"]
pub fn mfence();
#[link_name = "llvm.x86.sse2.padds.b"]
pub fn paddsb(a: i8x16, b: i8x16) -> i8x16;
#[link_name = "llvm.x86.sse2.padds.w"]
pub fn paddsw(a: i16x8, b: i16x8) -> i16x8;
#[link_name = "llvm.x86.sse2.paddus.b"]
pub fn paddsub(a: u8x16, b: u8x16) -> u8x16;
#[link_name = "llvm.x86.sse2.paddus.w"]
pub fn paddsuw(a: u16x8, b: u16x8) -> u16x8;
#[link_name = "llvm.x86.sse2.pavg.b"]
pub fn pavgb(a: u8x16, b: u8x16) -> u8x16;
#[link_name = "llvm.x86.sse2.pavg.w"]
pub fn pavgw(a: u16x8, b: u16x8) -> u16x8;
#[link_name = "llvm.x86.sse2.pmadd.wd"]
pub fn pmaddwd(a: i16x8, b: i16x8) -> i32x4;
#[link_name = "llvm.x86.sse2.pmaxs.w"]
pub fn pmaxsw(a: i16x8, b: i16x8) -> i16x8;
#[link_name = "llvm.x86.sse2.pmaxu.b"]
pub fn pmaxub(a: u8x16, b: u8x16) -> u8x16;
#[link_name = "llvm.x86.sse2.pmins.w"]
pub fn pminsw(a: i16x8, b: i16x8) -> i16x8;
#[link_name = "llvm.x86.sse2.pminu.b"]
pub fn pminub(a: u8x16, b: u8x16) -> u8x16;
#[link_name = "llvm.x86.sse2.pmulh.w"]
pub fn pmulhw(a: i16x8, b: i16x8) -> i16x8;
#[link_name = "llvm.x86.sse2.pmulhu.w"]
pub fn pmulhuw(a: u16x8, b: u16x8) -> u16x8;
#[link_name = "llvm.x86.sse2.pmulu.dq"]
pub fn pmuludq(a: u32x4, b: u32x4) -> u64x2;
#[link_name = "llvm.x86.sse2.psad.bw"]
pub fn psadbw(a: u8x16, b: u8x16) -> u64x2;
}
#[cfg(test)]
mod tests {
use std::os::raw::c_void;
use v128::*;
use v64::*;
use x86::sse2 as sse2;
#[test]
fn _mm_pause() {
unsafe { sse2::_mm_pause() }
}
#[test]
fn _mm_clflush() {
let x = 0;
unsafe { sse2::_mm_clflush(&x as *const _ as *mut c_void) }
}
#[test]
fn _mm_lfence() {
unsafe { sse2::_mm_lfence() }
}
#[test]
fn _mm_mfence() {
unsafe { sse2::_mm_mfence() }
}
#[test]
fn _mm_add_epi8() {
let a = u8x16::new(
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
let b = u8x16::new(
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
let r = unsafe { sse2::_mm_add_epi8(a.as_m128i(), b.as_m128i()) };
let e = u8x16::new(
16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46);
assert_eq!(u8x16::from(r), e);
}
#[test]
fn _mm_adds_epi8_overflow() {
let a = u8x16::splat(0xFF);
let b = u8x16::splat(1);
let r = unsafe { sse2::_mm_adds_epi8(a.as_m128i(), b.as_m128i()) };
assert_eq!(u8x16::from(r), u8x16::splat(0));
}
#[test]
fn _mm_add_epi16() {
let a = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
let b = u16x8::new(8, 9, 10, 11, 12, 13, 14, 15);
let r = unsafe { sse2::_mm_add_epi16(a.as_m128i(), b.as_m128i()) };
let e = u16x8::new(8, 10, 12, 14, 16, 18, 20, 22);
assert_eq!(u16x8::from(r), e);
}
#[test]
fn _mm_add_epi32() {
let a = u32x4::new(0, 1, 2, 3);
let b = u32x4::new(4, 5, 6, 7);
let r = unsafe { sse2::_mm_add_epi32(a.as_m128i(), b.as_m128i()) };
let e = u32x4::new(4, 6, 8, 10);
assert_eq!(u32x4::from(r), e);
}
#[test]
#[ignore]
fn _mm_add_si64() {
let (a, b) = (u64x1::new(1), u64x1::new(2));
let r = unsafe { sse2::_mm_add_si64(a.as_m64(), b.as_m64()) };
let e = u64x1::new(3);
assert_eq!(u64x1::from(r), e);
}
#[test]
fn _mm_add_epi64() {
let a = u64x2::new(0, 1);
let b = u64x2::new(2, 3);
let r = unsafe { sse2::_mm_add_epi64(a.as_m128i(), b.as_m128i()) };
let e = u64x2::new(2, 4);
assert_eq!(u64x2::from(r), e);
}
#[test]
fn _mm_adds_epi8() {
let a = i8x16::new(
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
let b = i8x16::new(
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
let r = unsafe { sse2::_mm_adds_epi8(a.as_m128i(), b.as_m128i()) };
let e = i8x16::new(
16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46);
assert_eq!(i8x16::from(r), e);
}
#[test]
fn _mm_adds_epi8_saturate_positive() {
let a = i8x16::splat(0x7F);
let b = i8x16::splat(1);
let r = unsafe { sse2::_mm_adds_epi8(a.as_m128i(), b.as_m128i()) };
assert_eq!(i8x16::from(r), a);
}
#[test]
fn _mm_adds_epi8_saturate_negative() {
let a = i8x16::splat(-0x80);
let b = i8x16::splat(-1);
let r = unsafe { sse2::_mm_adds_epi8(a.as_m128i(), b.as_m128i()) };
assert_eq!(i8x16::from(r), a);
}
#[test]
fn _mm_adds_epi16() {
let a = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
let b = i16x8::new(8, 9, 10, 11, 12, 13, 14, 15);
let r = unsafe { sse2::_mm_adds_epi16(a.as_m128i(), b.as_m128i()) };
let e = i16x8::new(8, 10, 12, 14, 16, 18, 20, 22);
assert_eq!(i16x8::from(r), e);
}
#[test]
fn _mm_adds_epi16_saturate_positive() {
let a = i16x8::splat(0x7FFF);
let b = i16x8::splat(1);
let r = unsafe { sse2::_mm_adds_epi16(a.as_m128i(), b.as_m128i()) };
assert_eq!(i16x8::from(r), a);
}
#[test]
fn _mm_adds_epi16_saturate_negative() {
let a = i16x8::splat(-0x8000);
let b = i16x8::splat(-1);
let r = unsafe { sse2::_mm_adds_epi16(a.as_m128i(), b.as_m128i()) };
assert_eq!(i16x8::from(r), a);
}
#[test]
fn _mm_adds_epu8() {
let a = u8x16::new(
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
let b = u8x16::new(
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
let r = unsafe { sse2::_mm_adds_epu8(a.as_m128i(), b.as_m128i()) };
let e = u8x16::new(
16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46);
assert_eq!(u8x16::from(r), e);
}
#[test]
fn _mm_adds_epu8_saturate() {
let a = u8x16::splat(0xFF);
let b = u8x16::splat(1);
let r = unsafe { sse2::_mm_adds_epu8(a.as_m128i(), b.as_m128i()) };
assert_eq!(u8x16::from(r), a);
}
#[test]
fn _mm_adds_epu16() {
let a = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
let b = u16x8::new(8, 9, 10, 11, 12, 13, 14, 15);
let r = unsafe { sse2::_mm_adds_epu16(a.as_m128i(), b.as_m128i()) };
let e = u16x8::new(8, 10, 12, 14, 16, 18, 20, 22);
assert_eq!(u16x8::from(r), e);
}
#[test]
fn _mm_adds_epu16_saturate() {
let a = u16x8::splat(0xFFFF);
let b = u16x8::splat(1);
let r = unsafe { sse2::_mm_adds_epu16(a.as_m128i(), b.as_m128i()) };
assert_eq!(u16x8::from(r), a);
}
#[test]
fn _mm_avg_epu8() {
let (a, b) = (u8x16::splat(3), u8x16::splat(9));
let r = unsafe { sse2::_mm_avg_epu8(a.as_m128i(), b.as_m128i()) };
assert_eq!(u8x16::from(r), u8x16::splat(6));
}
#[test]
fn _mm_avg_epu16() {
    // Was a copy-paste bug: it called `sse2::_mm_avg_epu8` (the 8-bit
    // variant) and only passed by accident of byte layout — each u16 lane
    // of splat(3)/splat(9) is the byte pair (3, 0)/(9, 0), whose byte-wise
    // rounded average is (6, 0), i.e. u16 value 6. Call the 16-bit
    // intrinsic under test so PAVGW is actually exercised.
    let (a, b) = (u16x8::splat(3), u16x8::splat(9));
    let r = unsafe { sse2::_mm_avg_epu16(a.as_m128i(), b.as_m128i()) };
    assert_eq!(u16x8::from(r), u16x8::splat(6));
}
#[test]
fn _mm_madd_epi16() {
let a = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
let b = i16x8::new(9, 10, 11, 12, 13, 14, 15, 16);
let r = unsafe { sse2::_mm_madd_epi16(a.as_m128i(), b.as_m128i()) };
let e = i32x4::new(29, 81, 149, 233);
assert_eq!(i32x4::from(r), e);
}
#[test]
fn _mm_max_epi16() {
let a = i16x8::splat(1);
let b = i16x8::splat(-1);
let r = unsafe { sse2::_mm_max_epi16(a.as_m128i(), b.as_m128i()) };
assert_eq!(i16x8::from(r), a);
}
#[test]
fn _mm_max_epu8() {
let a = u8x16::splat(1);
let b = u8x16::splat(255);
let r = unsafe { sse2::_mm_max_epu8(a.as_m128i(), b.as_m128i()) };
assert_eq!(u8x16::from(r), b);
}
#[test]
fn _mm_min_epi16() {
let a = i16x8::splat(1);
let b = i16x8::splat(-1);
let r = unsafe { sse2::_mm_min_epi16(a.as_m128i(), b.as_m128i()) };
assert_eq!(i16x8::from(r), b);
}
#[test]
fn _mm_min_epu8() {
let a = u8x16::splat(1);
let b = u8x16::splat(255);
let r = unsafe { sse2::_mm_min_epu8(a.as_m128i(), b.as_m128i()) };
assert_eq!(u8x16::from(r), a);
}
#[test]
fn _mm_mulhi_epi16() {
let (a, b) = (i16x8::splat(1000), i16x8::splat(-1001));
let r = unsafe { sse2::_mm_mulhi_epi16(a.as_m128i(), b.as_m128i()) };
assert_eq!(i16x8::from(r), i16x8::splat(-16));
}
#[test]
fn _mm_mulhi_epu16() {
let (a, b) = (u16x8::splat(1000), u16x8::splat(1001));
let r = unsafe { sse2::_mm_mulhi_epu16(a.as_m128i(), b.as_m128i()) };
assert_eq!(u16x8::from(r), u16x8::splat(15));
}
#[test]
fn _mm_mullo_epi16() {
let (a, b) = (i16x8::splat(1000), i16x8::splat(-1001));
let r = unsafe { sse2::_mm_mullo_epi16(a.as_m128i(), b.as_m128i()) };
assert_eq!(i16x8::from(r), i16x8::splat(-17960));
}
#[test]
#[ignore]
fn _mm_mul_su32() {
let a = u32x2::new(1_000_000_000, 3);
let b = u32x2::new(1_000_000_000, 4);
let r = unsafe { sse2::_mm_mul_su32(a.as_m64(), b.as_m64()) };
let e = u64x1::new(1_000_000_000 * 1_000_000_000);
assert_eq!(u64x1::from(r), e);
}
#[test]
fn _mm_mul_epu32() {
let a = u64x2::new(1_000_000_000, 1 << 34);
let b = u64x2::new(1_000_000_000, 1 << 35);
let r = unsafe { sse2::_mm_mul_epu32(a.as_m128i(), b.as_m128i()) };
let e = u64x2::new(1_000_000_000 * 1_000_000_000, 0);
assert_eq!(u64x2::from(r), e);
}
#[test]
fn _mm_sad_epu8() {
let a = u8x16::new(
255, 254, 253, 252, 1, 2, 3, 4,
155, 154, 153, 152, 1, 2, 3, 4);
let b = u8x16::new(
0, 0, 0, 0, 2, 1, 2, 1,
1, 1, 1, 1, 1, 2, 1, 2);
let r = unsafe { sse2::_mm_sad_epu8(a.as_m128i(), b.as_m128i()) };
let e = u64x2::new(1020, 614);
assert_eq!(u64x2::from(r), e);
}
}