Use REP MOVSB/STOSB when the ERMSB feature is present (#392)
* Reorganize mem functions

  This reduces the amount of platform-specific code.

  Signed-off-by: Joe Richey <joerichey@google.com>

* Use ERMSB implementations if the feature is set

  Signed-off-by: Joe Richey <joerichey@google.com>

* Add non-aligned benchmarks

  Signed-off-by: Joe Richey <joerichey@google.com>
This commit is contained in:
parent
a97fe5f1d3
commit
53daa3c593
5 changed files with 148 additions and 88 deletions
29
library/compiler-builtins/src/mem/impls.rs
Normal file
29
library/compiler-builtins/src/mem/impls.rs
Normal file
|
|
@ -0,0 +1,29 @@
|
|||
use super::c_int;
|
||||
|
||||
/// Copy `n` bytes from `src` to `dest`, walking forward from index 0.
///
/// A forward copy is only correct for overlapping regions when `dest`
/// does not lie inside `(src, src + n)`; callers must guarantee that.
#[inline(always)]
pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, n: usize) {
    // One byte per iteration, lowest address first.
    for idx in 0..n {
        *dest.add(idx) = *src.add(idx);
    }
}
|
||||
|
||||
/// Copy `n` bytes from `src` to `dest`, walking backward from the end.
///
/// Copying highest address first makes the copy correct when `dest`
/// overlaps the tail of `src` (i.e. `dest` above `src`).
#[inline(always)]
pub unsafe fn copy_backward(dest: *mut u8, src: *const u8, n: usize) {
    // Count down so each source byte is read before it can be clobbered.
    let mut remaining = n;
    while remaining > 0 {
        remaining -= 1;
        *dest.add(remaining) = *src.add(remaining);
    }
}
|
||||
|
||||
/// Fill `n` bytes starting at `s` with the byte value `c`.
#[inline(always)]
pub unsafe fn set_bytes(s: *mut u8, c: u8, n: usize) {
    // Simple forward fill, one byte per iteration.
    for idx in 0..n {
        *s.add(idx) = c;
    }
}
|
||||
|
|
@ -1,41 +0,0 @@
|
|||
use super::c_int;
|
||||
|
||||
/// C `memcpy`: copy `n` bytes from `src` to `dest` and return `dest`.
///
/// The regions must not overlap (overlap is `memmove`'s contract), so a
/// plain forward byte copy is always valid here.
#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
pub unsafe extern "C" fn memcpy(dest: *mut u8, src: *const u8, n: usize) -> *mut u8 {
    for idx in 0..n {
        *dest.add(idx) = *src.add(idx);
    }
    // C requires the destination pointer to be returned unchanged.
    dest
}
|
||||
|
||||
/// C `memmove`: copy `n` bytes between possibly-overlapping regions,
/// returning `dest`.
///
/// The copy direction is chosen so that every source byte is read before
/// the copy can overwrite it.
#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
pub unsafe extern "C" fn memmove(dest: *mut u8, src: *const u8, n: usize) -> *mut u8 {
    if src < dest as *const u8 {
        // Destination starts above the source: walk down from the end so
        // overlapping tail bytes are read before they are clobbered.
        let mut idx = n;
        while idx > 0 {
            idx -= 1;
            *dest.add(idx) = *src.add(idx);
        }
    } else {
        // Destination at or below the source: a forward copy is safe.
        for idx in 0..n {
            *dest.add(idx) = *src.add(idx);
        }
    }
    dest
}
|
||||
|
||||
/// C `memset`: fill `n` bytes at `s` with the low byte of `c`, returning `s`.
#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
pub unsafe extern "C" fn memset(s: *mut u8, c: c_int, n: usize) -> *mut u8 {
    // C passes the fill value as an int; only the low 8 bits are used.
    let byte = c as u8;
    for idx in 0..n {
        *s.add(idx) = byte;
    }
    s
}
|
||||
|
|
@ -11,8 +11,32 @@ use core::ops::{BitOr, Shl};
|
|||
|
||||
// memcpy/memmove/memset have optimized implementations on some architectures
|
||||
#[cfg_attr(all(feature = "asm", target_arch = "x86_64"), path = "x86_64.rs")]
|
||||
mod memcpy;
|
||||
pub use self::memcpy::*;
|
||||
mod impls;
|
||||
|
||||
/// C `memcpy`: copy `n` bytes from `src` to `dest`, returning `dest`.
///
/// Regions must not overlap (overlap is `memmove`'s contract), so the
/// forward-copy helper is always valid here.
#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
pub unsafe extern "C" fn memcpy(dest: *mut u8, src: *const u8, n: usize) -> *mut u8 {
    impls::copy_forward(dest, src, n);
    // C requires the destination pointer to be returned unchanged.
    dest
}
|
||||
|
||||
/// C `memmove`: copy `n` bytes between possibly-overlapping regions,
/// returning `dest`.
#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
pub unsafe extern "C" fn memmove(dest: *mut u8, src: *const u8, n: usize) -> *mut u8 {
    // One unsigned comparison picks the direction: delta = dest - src
    // (mod 2^word). If src is above dest the subtraction wraps to a huge
    // value, which is also >= n, so both safe-forward cases share a branch.
    let delta = (dest as usize).wrapping_sub(src as usize);
    if delta >= n {
        // We can copy forwards because either dest is far enough ahead of src,
        // or src is ahead of dest (and delta overflowed).
        impls::copy_forward(dest, src, n);
    } else {
        // dest overlaps the tail of src (0 < delta < n): copy from the end.
        impls::copy_backward(dest, src, n);
    }
    dest
}
|
||||
|
||||
/// C `memset`: fill `n` bytes at `s` with the low byte of `c`, returning `s`.
#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
pub unsafe extern "C" fn memset(s: *mut u8, c: c_int, n: usize) -> *mut u8 {
    // C passes the fill value as an int; only the low 8 bits are used.
    impls::set_bytes(s, c as u8, n);
    s
}
|
||||
|
||||
#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
|
||||
pub unsafe extern "C" fn memcmp(s1: *const u8, s2: *const u8, n: usize) -> i32 {
|
||||
|
|
|
|||
|
|
@ -1,5 +1,3 @@
|
|||
use super::c_int;
|
||||
|
||||
// On most modern Intel and AMD processors, "rep movsq" and "rep stosq" have
|
||||
// been enhanced to perform better than a simple qword loop, making them ideal
|
||||
// for implementing memcpy/memset. Note that "rep cmps" has received no such
|
||||
|
|
@ -13,11 +11,26 @@ use super::c_int;
|
|||
// - FSRM - Fast Short REP MOV (Ice Lake and later)
|
||||
// - Fast Zero-Length MOVSB (On no current hardware)
|
||||
// - Fast Short STOSB (On no current hardware)
|
||||
// However, to avoid run-time feature detection, we don't use these byte-based
|
||||
// instructions for most of the copying, preferring the qword variants.
|
||||
//
|
||||
// To simplify things, we switch to using the byte-based variants if the "ermsb"
|
||||
// feature is present at compile-time. We don't bother detecting other features.
|
||||
// Note that ERMSB does not enhance the backwards (DF=1) "rep movsb".
|
||||
|
||||
#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
|
||||
pub unsafe extern "C" fn memcpy(dest: *mut u8, src: *const u8, count: usize) -> *mut u8 {
|
||||
/// Forward copy of `count` bytes using `rep movsb`, which is fast when
/// compiled with the ERMSB target feature (see module comments above).
///
/// NOTE(review): relies on DF = 0 so the string op moves upward —
/// presumably guaranteed by the ABI at function entry; confirm.
#[inline(always)]
#[cfg(target_feature = "ermsb")]
pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) {
    asm!(
        "rep movsb [rdi], [rsi]",
        // The string op counts rcx down to 0 and advances rdi/rsi past
        // the copied range, so all three registers are clobbered.
        inout("rcx") count => _,
        inout("rdi") dest => _,
        inout("rsi") src => _,
        // movsb writes no flags, so flag state may stay live across it.
        options(nostack, preserves_flags)
    );
}
|
||||
|
||||
#[inline(always)]
|
||||
#[cfg(not(target_feature = "ermsb"))]
|
||||
pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) {
|
||||
let qword_count = count >> 3;
|
||||
let byte_count = count & 0b111;
|
||||
asm!(
|
||||
|
|
@ -30,18 +43,10 @@ pub unsafe extern "C" fn memcpy(dest: *mut u8, src: *const u8, count: usize) ->
|
|||
inout("rsi") src => _,
|
||||
options(nostack, preserves_flags)
|
||||
);
|
||||
dest
|
||||
}
|
||||
|
||||
#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
|
||||
pub unsafe extern "C" fn memmove(dest: *mut u8, src: *const u8, count: usize) -> *mut u8 {
|
||||
let delta = (dest as usize).wrapping_sub(src as usize);
|
||||
if delta >= count {
|
||||
// We can copy forwards because either dest is far enough ahead of src,
|
||||
// or src is ahead of dest (and delta overflowed).
|
||||
return self::memcpy(dest, src, count);
|
||||
}
|
||||
// copy backwards
|
||||
#[inline(always)]
|
||||
pub unsafe fn copy_backward(dest: *mut u8, src: *const u8, count: usize) {
|
||||
let qword_count = count >> 3;
|
||||
let byte_count = count & 0b111;
|
||||
asm!(
|
||||
|
|
@ -58,11 +63,23 @@ pub unsafe extern "C" fn memmove(dest: *mut u8, src: *const u8, count: usize) ->
|
|||
inout("rsi") src.offset(count as isize).wrapping_sub(8) => _,
|
||||
options(nostack)
|
||||
);
|
||||
dest
|
||||
}
|
||||
|
||||
#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
|
||||
pub unsafe extern "C" fn memset(dest: *mut u8, c: c_int, count: usize) -> *mut u8 {
|
||||
/// Fill `count` bytes at `dest` with `c` using `rep stosb`, which is fast
/// when compiled with the ERMSB target feature (see module comments above).
///
/// NOTE(review): relies on DF = 0 so the store moves upward — presumably
/// guaranteed by the ABI at function entry; confirm.
#[inline(always)]
#[cfg(target_feature = "ermsb")]
pub unsafe fn set_bytes(dest: *mut u8, c: u8, count: usize) {
    asm!(
        // Stores al into [rdi], rcx times, advancing rdi each iteration.
        "rep stosb [rdi], al",
        inout("rcx") count => _,
        inout("rdi") dest => _,
        // The fill byte travels in al; its final value is unspecified,
        // so it is declared clobbered.
        inout("al") c => _,
        // stosb writes no flags.
        options(nostack, preserves_flags)
    )
}
|
||||
|
||||
#[inline(always)]
|
||||
#[cfg(not(target_feature = "ermsb"))]
|
||||
pub unsafe fn set_bytes(dest: *mut u8, c: u8, count: usize) {
|
||||
let qword_count = count >> 3;
|
||||
let byte_count = count & 0b111;
|
||||
asm!(
|
||||
|
|
@ -72,8 +89,7 @@ pub unsafe extern "C" fn memset(dest: *mut u8, c: c_int, count: usize) -> *mut u
|
|||
byte_count = in(reg) byte_count,
|
||||
inout("rcx") qword_count => _,
|
||||
inout("rdi") dest => _,
|
||||
in("rax") (c as u8 as u64) * 0x0101010101010101,
|
||||
in("rax") (c as u64) * 0x0101010101010101,
|
||||
options(nostack, preserves_flags)
|
||||
);
|
||||
dest
|
||||
}
|
||||
|
|
|
|||
|
|
@ -6,33 +6,33 @@ use test::{black_box, Bencher};
|
|||
extern crate compiler_builtins;
|
||||
use compiler_builtins::mem::{memcmp, memcpy, memmove, memset};
|
||||
|
||||
fn memcpy_builtin(b: &mut Bencher, n: usize) {
|
||||
let v1 = vec![1u8; n];
|
||||
let mut v2 = vec![0u8; n];
|
||||
fn memcpy_builtin(b: &mut Bencher, n: usize, offset: usize) {
|
||||
let v1 = vec![1u8; n + offset];
|
||||
let mut v2 = vec![0u8; n + offset];
|
||||
b.bytes = n as u64;
|
||||
b.iter(|| {
|
||||
let src: &[u8] = black_box(&v1);
|
||||
let dst: &mut [u8] = black_box(&mut v2);
|
||||
let src: &[u8] = black_box(&v1[offset..]);
|
||||
let dst: &mut [u8] = black_box(&mut v2[offset..]);
|
||||
dst.copy_from_slice(src);
|
||||
})
|
||||
}
|
||||
|
||||
fn memcpy_rust(b: &mut Bencher, n: usize) {
|
||||
let v1 = vec![1u8; n];
|
||||
let mut v2 = vec![0u8; n];
|
||||
fn memcpy_rust(b: &mut Bencher, n: usize, offset: usize) {
|
||||
let v1 = vec![1u8; n + offset];
|
||||
let mut v2 = vec![0u8; n + offset];
|
||||
b.bytes = n as u64;
|
||||
b.iter(|| {
|
||||
let src: &[u8] = black_box(&v1);
|
||||
let dst: &mut [u8] = black_box(&mut v2);
|
||||
let src: &[u8] = black_box(&v1[offset..]);
|
||||
let dst: &mut [u8] = black_box(&mut v2[offset..]);
|
||||
unsafe { memcpy(dst.as_mut_ptr(), src.as_ptr(), n) }
|
||||
})
|
||||
}
|
||||
|
||||
fn memset_builtin(b: &mut Bencher, n: usize) {
|
||||
let mut v1 = vec![0u8; n];
|
||||
fn memset_builtin(b: &mut Bencher, n: usize, offset: usize) {
|
||||
let mut v1 = vec![0u8; n + offset];
|
||||
b.bytes = n as u64;
|
||||
b.iter(|| {
|
||||
let dst: &mut [u8] = black_box(&mut v1);
|
||||
let dst: &mut [u8] = black_box(&mut v1[offset..]);
|
||||
let val: u8 = black_box(27);
|
||||
for b in dst {
|
||||
*b = val;
|
||||
|
|
@ -40,11 +40,11 @@ fn memset_builtin(b: &mut Bencher, n: usize) {
|
|||
})
|
||||
}
|
||||
|
||||
fn memset_rust(b: &mut Bencher, n: usize) {
|
||||
let mut v1 = vec![0u8; n];
|
||||
fn memset_rust(b: &mut Bencher, n: usize, offset: usize) {
|
||||
let mut v1 = vec![0u8; n + offset];
|
||||
b.bytes = n as u64;
|
||||
b.iter(|| {
|
||||
let dst: &mut [u8] = black_box(&mut v1);
|
||||
let dst: &mut [u8] = black_box(&mut v1[offset..]);
|
||||
let val = black_box(27);
|
||||
unsafe { memset(dst.as_mut_ptr(), val, n) }
|
||||
})
|
||||
|
|
@ -95,36 +95,68 @@ fn memmove_rust(b: &mut Bencher, n: usize) {
|
|||
|
||||
#[bench]
|
||||
fn memcpy_builtin_4096(b: &mut Bencher) {
|
||||
memcpy_builtin(b, 4096)
|
||||
memcpy_builtin(b, 4096, 0)
|
||||
}
|
||||
#[bench]
|
||||
fn memcpy_rust_4096(b: &mut Bencher) {
|
||||
memcpy_rust(b, 4096)
|
||||
memcpy_rust(b, 4096, 0)
|
||||
}
|
||||
#[bench]
|
||||
fn memcpy_builtin_1048576(b: &mut Bencher) {
|
||||
memcpy_builtin(b, 1048576)
|
||||
memcpy_builtin(b, 1048576, 0)
|
||||
}
|
||||
#[bench]
|
||||
fn memcpy_rust_1048576(b: &mut Bencher) {
|
||||
memcpy_rust(b, 1048576)
|
||||
memcpy_rust(b, 1048576, 0)
|
||||
}
|
||||
// Misaligned memcpy variants: slicing at a 65-byte offset — presumably
// chosen to defeat 64-byte (cache-line) and word alignment of the
// allocation, exercising the unaligned head/tail paths; confirm against
// the allocator's actual alignment.
#[bench]
fn memcpy_builtin_4096_offset(b: &mut Bencher) {
    memcpy_builtin(b, 4096, 65)
}
#[bench]
fn memcpy_rust_4096_offset(b: &mut Bencher) {
    memcpy_rust(b, 4096, 65)
}
#[bench]
fn memcpy_builtin_1048576_offset(b: &mut Bencher) {
    memcpy_builtin(b, 1048576, 65)
}
#[bench]
fn memcpy_rust_1048576_offset(b: &mut Bencher) {
    memcpy_rust(b, 1048576, 65)
}
|
||||
|
||||
#[bench]
|
||||
fn memset_builtin_4096(b: &mut Bencher) {
|
||||
memset_builtin(b, 4096)
|
||||
memset_builtin(b, 4096, 0)
|
||||
}
|
||||
#[bench]
|
||||
fn memset_rust_4096(b: &mut Bencher) {
|
||||
memset_rust(b, 4096)
|
||||
memset_rust(b, 4096, 0)
|
||||
}
|
||||
#[bench]
|
||||
fn memset_builtin_1048576(b: &mut Bencher) {
|
||||
memset_builtin(b, 1048576)
|
||||
memset_builtin(b, 1048576, 0)
|
||||
}
|
||||
#[bench]
|
||||
fn memset_rust_1048576(b: &mut Bencher) {
|
||||
memset_rust(b, 1048576)
|
||||
memset_rust(b, 1048576, 0)
|
||||
}
|
||||
// Misaligned memset variants: slicing at a 65-byte offset — presumably
// chosen to defeat 64-byte (cache-line) and word alignment of the
// allocation, exercising the unaligned head/tail paths; confirm against
// the allocator's actual alignment.
#[bench]
fn memset_builtin_4096_offset(b: &mut Bencher) {
    memset_builtin(b, 4096, 65)
}
#[bench]
fn memset_rust_4096_offset(b: &mut Bencher) {
    memset_rust(b, 4096, 65)
}
#[bench]
fn memset_builtin_1048576_offset(b: &mut Bencher) {
    memset_builtin(b, 1048576, 65)
}
#[bench]
fn memset_rust_1048576_offset(b: &mut Bencher) {
    memset_rust(b, 1048576, 65)
}
|
||||
|
||||
#[bench]
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue