Use REP MOVSB/STOSB when the ERMSB feature is present (#392)

* Reorganize mem functions

This reduces the amount of platform-specific code

Signed-off-by: Joe Richey <joerichey@google.com>

* Use ERMSB implementations if the feature is set

Signed-off-by: Joe Richey <joerichey@google.com>

* Add non-aligned benchmarks

Signed-off-by: Joe Richey <joerichey@google.com>
This commit is contained in:
Joseph Richey 2020-11-03 06:57:08 -08:00 committed by GitHub
parent a97fe5f1d3
commit 53daa3c593
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 148 additions and 88 deletions

View file

@ -0,0 +1,29 @@
use super::c_int;
/// Copies `n` bytes from `src` to `dest`, walking from the first byte to the
/// last. Correct for non-overlapping regions, and for overlapping regions
/// where `dest` lies below `src` (a forward copy never reads a byte it has
/// already written in that case).
///
/// # Safety
/// `src` must be valid for `n` bytes of reads and `dest` for `n` bytes of
/// writes.
#[inline(always)]
pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, n: usize) {
    for i in 0..n {
        *dest.add(i) = *src.add(i);
    }
}
/// Copies `n` bytes from `src` to `dest`, walking from the last byte down to
/// the first, so overlapping moves with `dest` above `src` remain correct.
///
/// # Safety
/// `src` must be valid for `n` bytes of reads and `dest` for `n` bytes of
/// writes.
#[inline(always)]
pub unsafe fn copy_backward(dest: *mut u8, src: *const u8, n: usize) {
    for i in (0..n).rev() {
        *dest.add(i) = *src.add(i);
    }
}
/// Writes the byte `c` into each of the first `n` bytes at `s`.
///
/// # Safety
/// `s` must be valid for `n` bytes of writes.
#[inline(always)]
pub unsafe fn set_bytes(s: *mut u8, c: u8, n: usize) {
    for i in 0..n {
        *s.add(i) = c;
    }
}

View file

@ -1,41 +0,0 @@
use super::c_int;
/// C-ABI `memcpy`: forward byte-by-byte copy of `n` bytes from `src` to
/// `dest`, returning `dest` as the C standard requires.
///
/// # Safety
/// The regions must not overlap; `src` must be valid for `n` bytes of reads
/// and `dest` for `n` bytes of writes.
#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
pub unsafe extern "C" fn memcpy(dest: *mut u8, src: *const u8, n: usize) -> *mut u8 {
    for i in 0..n {
        *dest.add(i) = *src.add(i);
    }
    dest
}
/// C-ABI `memmove`: copies `n` bytes between possibly-overlapping regions
/// and returns `dest`.
///
/// The copy direction is chosen so that no source byte is overwritten before
/// it has been read: backwards when `dest` starts above `src`, forwards
/// otherwise.
///
/// # Safety
/// `src` must be valid for `n` bytes of reads and `dest` for `n` bytes of
/// writes.
#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
pub unsafe extern "C" fn memmove(dest: *mut u8, src: *const u8, n: usize) -> *mut u8 {
    if src < dest as *const u8 {
        // dest overlaps the tail of src: walk from the end.
        for i in (0..n).rev() {
            *dest.add(i) = *src.add(i);
        }
    } else {
        // dest at or below src: a plain forward walk is safe.
        for i in 0..n {
            *dest.add(i) = *src.add(i);
        }
    }
    dest
}
/// C-ABI `memset`: fills the first `n` bytes at `s` with `c` truncated to its
/// low byte (C semantics for the `int` fill argument), returning `s`.
///
/// # Safety
/// `s` must be valid for `n` bytes of writes.
#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
pub unsafe extern "C" fn memset(s: *mut u8, c: c_int, n: usize) -> *mut u8 {
    let byte = c as u8;
    for i in 0..n {
        *s.add(i) = byte;
    }
    s
}

View file

@ -11,8 +11,32 @@ use core::ops::{BitOr, Shl};
// memcpy/memmove/memset have optimized implementations on some architectures
#[cfg_attr(all(feature = "asm", target_arch = "x86_64"), path = "x86_64.rs")]
mod memcpy;
pub use self::memcpy::*;
mod impls;
/// C-ABI `memcpy`: copies `n` non-overlapping bytes from `src` to `dest` and
/// returns `dest`, as the C standard requires.
///
/// Delegates to `impls::copy_forward`; `mod impls` may resolve to an
/// arch-specific file via the `cfg_attr(path = ...)` mechanism used in this
/// module — confirm which file applies for the current target.
#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
pub unsafe extern "C" fn memcpy(dest: *mut u8, src: *const u8, n: usize) -> *mut u8 {
impls::copy_forward(dest, src, n);
dest
}
/// C-ABI `memmove`: copies `n` bytes between possibly-overlapping regions and
/// returns `dest`.
///
/// Direction selection uses a single wrapping subtraction instead of two
/// pointer comparisons: `delta = dest - src (mod 2^word)`. If `delta >= n`,
/// a forward copy cannot read a byte it has already written — either `dest`
/// is at least `n` bytes ahead of `src`, or `src > dest` and the subtraction
/// wrapped to a huge value. Only when `dest` lands inside `(src, src + n)`
/// do we fall back to the backward copy.
#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
pub unsafe extern "C" fn memmove(dest: *mut u8, src: *const u8, n: usize) -> *mut u8 {
let delta = (dest as usize).wrapping_sub(src as usize);
if delta >= n {
// We can copy forwards because either dest is far enough ahead of src,
// or src is ahead of dest (and delta overflowed).
impls::copy_forward(dest, src, n);
} else {
impls::copy_backward(dest, src, n);
}
dest
}
/// C-ABI `memset`: fills the first `n` bytes at `s` with `c` and returns `s`.
///
/// `c` arrives as a C `int` but is truncated to its low byte (`c as u8`)
/// before the fill, matching the C standard's `memset` semantics.
#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
pub unsafe extern "C" fn memset(s: *mut u8, c: c_int, n: usize) -> *mut u8 {
impls::set_bytes(s, c as u8, n);
s
}
#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
pub unsafe extern "C" fn memcmp(s1: *const u8, s2: *const u8, n: usize) -> i32 {

View file

@ -1,5 +1,3 @@
use super::c_int;
// On most modern Intel and AMD processors, "rep movsq" and "rep stosq" have
// been enhanced to perform better than a simple qword loop, making them ideal
// for implementing memcpy/memset. Note that "rep cmps" has received no such
@ -13,11 +11,26 @@ use super::c_int;
// - FSRM - Fast Short REP MOV (Ice Lake and later)
// - Fast Zero-Length MOVSB (On no current hardware)
// - Fast Short STOSB (On no current hardware)
// However, to avoid run-time feature detection, we don't use these byte-based
// instructions for most of the copying, preferring the qword variants.
//
// To simplify things, we switch to using the byte-based variants if the "ermsb"
// feature is present at compile-time. We don't bother detecting other features.
// Note that ERMSB does not enhance the backwards (DF=1) "rep movsb".
#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
pub unsafe extern "C" fn memcpy(dest: *mut u8, src: *const u8, count: usize) -> *mut u8 {
// ERMSB path: when the "ermsb" target feature is set at compile time, a plain
// byte-wise `rep movsb` is used for the whole forward copy (see the comment
// block above on Enhanced REP MOVSB/STOSB).
#[inline(always)]
#[cfg(target_feature = "ermsb")]
pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) {
// NOTE(review): this relies on the direction flag (DF) being clear on entry
// so `movsb` advances forwards — presumably guaranteed by the calling
// convention; confirm.
asm!(
"rep movsb [rdi], [rsi]",
// rcx = remaining byte count, rdi/rsi = destination/source cursors; the
// instruction updates all three in place, hence the `=> _` clobbers.
inout("rcx") count => _,
inout("rdi") dest => _,
inout("rsi") src => _,
options(nostack, preserves_flags)
);
}
#[inline(always)]
#[cfg(not(target_feature = "ermsb"))]
pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) {
let qword_count = count >> 3;
let byte_count = count & 0b111;
asm!(
@ -30,18 +43,10 @@ pub unsafe extern "C" fn memcpy(dest: *mut u8, src: *const u8, count: usize) ->
inout("rsi") src => _,
options(nostack, preserves_flags)
);
dest
}
#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
pub unsafe extern "C" fn memmove(dest: *mut u8, src: *const u8, count: usize) -> *mut u8 {
let delta = (dest as usize).wrapping_sub(src as usize);
if delta >= count {
// We can copy forwards because either dest is far enough ahead of src,
// or src is ahead of dest (and delta overflowed).
return self::memcpy(dest, src, count);
}
// copy backwards
#[inline(always)]
pub unsafe fn copy_backward(dest: *mut u8, src: *const u8, count: usize) {
let qword_count = count >> 3;
let byte_count = count & 0b111;
asm!(
@ -58,11 +63,23 @@ pub unsafe extern "C" fn memmove(dest: *mut u8, src: *const u8, count: usize) ->
inout("rsi") src.offset(count as isize).wrapping_sub(8) => _,
options(nostack)
);
dest
}
#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
pub unsafe extern "C" fn memset(dest: *mut u8, c: c_int, count: usize) -> *mut u8 {
// ERMSB path: byte-wise `rep stosb` stores the fill byte held in `al` into
// `count` consecutive bytes starting at `rdi`.
#[inline(always)]
#[cfg(target_feature = "ermsb")]
pub unsafe fn set_bytes(dest: *mut u8, c: u8, count: usize) {
// NOTE(review): assumes the direction flag (DF) is clear on entry (forward
// stores) — presumably guaranteed by the calling convention; confirm.
asm!(
"rep stosb [rdi], al",
// rcx = remaining count and rdi = destination cursor are both updated by
// the instruction, hence the `=> _` clobbers; al supplies the fill byte.
inout("rcx") count => _,
inout("rdi") dest => _,
inout("al") c => _,
options(nostack, preserves_flags)
)
}
#[inline(always)]
#[cfg(not(target_feature = "ermsb"))]
pub unsafe fn set_bytes(dest: *mut u8, c: u8, count: usize) {
let qword_count = count >> 3;
let byte_count = count & 0b111;
asm!(
@ -72,8 +89,7 @@ pub unsafe extern "C" fn memset(dest: *mut u8, c: c_int, count: usize) -> *mut u
byte_count = in(reg) byte_count,
inout("rcx") qword_count => _,
inout("rdi") dest => _,
in("rax") (c as u8 as u64) * 0x0101010101010101,
in("rax") (c as u64) * 0x0101010101010101,
options(nostack, preserves_flags)
);
dest
}

View file

@ -6,33 +6,33 @@ use test::{black_box, Bencher};
extern crate compiler_builtins;
use compiler_builtins::mem::{memcmp, memcpy, memmove, memset};
fn memcpy_builtin(b: &mut Bencher, n: usize) {
let v1 = vec![1u8; n];
let mut v2 = vec![0u8; n];
fn memcpy_builtin(b: &mut Bencher, n: usize, offset: usize) {
let v1 = vec![1u8; n + offset];
let mut v2 = vec![0u8; n + offset];
b.bytes = n as u64;
b.iter(|| {
let src: &[u8] = black_box(&v1);
let dst: &mut [u8] = black_box(&mut v2);
let src: &[u8] = black_box(&v1[offset..]);
let dst: &mut [u8] = black_box(&mut v2[offset..]);
dst.copy_from_slice(src);
})
}
fn memcpy_rust(b: &mut Bencher, n: usize) {
let v1 = vec![1u8; n];
let mut v2 = vec![0u8; n];
fn memcpy_rust(b: &mut Bencher, n: usize, offset: usize) {
let v1 = vec![1u8; n + offset];
let mut v2 = vec![0u8; n + offset];
b.bytes = n as u64;
b.iter(|| {
let src: &[u8] = black_box(&v1);
let dst: &mut [u8] = black_box(&mut v2);
let src: &[u8] = black_box(&v1[offset..]);
let dst: &mut [u8] = black_box(&mut v2[offset..]);
unsafe { memcpy(dst.as_mut_ptr(), src.as_ptr(), n) }
})
}
fn memset_builtin(b: &mut Bencher, n: usize) {
let mut v1 = vec![0u8; n];
fn memset_builtin(b: &mut Bencher, n: usize, offset: usize) {
let mut v1 = vec![0u8; n + offset];
b.bytes = n as u64;
b.iter(|| {
let dst: &mut [u8] = black_box(&mut v1);
let dst: &mut [u8] = black_box(&mut v1[offset..]);
let val: u8 = black_box(27);
for b in dst {
*b = val;
@ -40,11 +40,11 @@ fn memset_builtin(b: &mut Bencher, n: usize) {
})
}
fn memset_rust(b: &mut Bencher, n: usize) {
let mut v1 = vec![0u8; n];
fn memset_rust(b: &mut Bencher, n: usize, offset: usize) {
let mut v1 = vec![0u8; n + offset];
b.bytes = n as u64;
b.iter(|| {
let dst: &mut [u8] = black_box(&mut v1);
let dst: &mut [u8] = black_box(&mut v1[offset..]);
let val = black_box(27);
unsafe { memset(dst.as_mut_ptr(), val, n) }
})
@ -95,36 +95,68 @@ fn memmove_rust(b: &mut Bencher, n: usize) {
#[bench]
fn memcpy_builtin_4096(b: &mut Bencher) {
memcpy_builtin(b, 4096)
memcpy_builtin(b, 4096, 0)
}
#[bench]
fn memcpy_rust_4096(b: &mut Bencher) {
memcpy_rust(b, 4096)
memcpy_rust(b, 4096, 0)
}
#[bench]
fn memcpy_builtin_1048576(b: &mut Bencher) {
memcpy_builtin(b, 1048576)
memcpy_builtin(b, 1048576, 0)
}
#[bench]
fn memcpy_rust_1048576(b: &mut Bencher) {
memcpy_rust(b, 1048576)
memcpy_rust(b, 1048576, 0)
}
// Misaligned memcpy variants: the 65-byte offset (64 + 1) pushes the buffers
// off any 8- or 64-byte boundary — presumably chosen to exercise the
// unaligned head/tail paths, per the "Add non-aligned benchmarks" note in
// the commit message.
#[bench]
fn memcpy_builtin_4096_offset(b: &mut Bencher) {
memcpy_builtin(b, 4096, 65)
}
#[bench]
fn memcpy_rust_4096_offset(b: &mut Bencher) {
memcpy_rust(b, 4096, 65)
}
#[bench]
fn memcpy_builtin_1048576_offset(b: &mut Bencher) {
memcpy_builtin(b, 1048576, 65)
}
#[bench]
fn memcpy_rust_1048576_offset(b: &mut Bencher) {
memcpy_rust(b, 1048576, 65)
}
#[bench]
fn memset_builtin_4096(b: &mut Bencher) {
memset_builtin(b, 4096)
memset_builtin(b, 4096, 0)
}
#[bench]
fn memset_rust_4096(b: &mut Bencher) {
memset_rust(b, 4096)
memset_rust(b, 4096, 0)
}
#[bench]
fn memset_builtin_1048576(b: &mut Bencher) {
memset_builtin(b, 1048576)
memset_builtin(b, 1048576, 0)
}
#[bench]
fn memset_rust_1048576(b: &mut Bencher) {
memset_rust(b, 1048576)
memset_rust(b, 1048576, 0)
}
// Misaligned memset variants: the 65-byte offset (64 + 1) pushes the buffer
// off any 8- or 64-byte boundary — presumably chosen to exercise the
// unaligned head/tail paths, per the "Add non-aligned benchmarks" note in
// the commit message.
#[bench]
fn memset_builtin_4096_offset(b: &mut Bencher) {
memset_builtin(b, 4096, 65)
}
#[bench]
fn memset_rust_4096_offset(b: &mut Bencher) {
memset_rust(b, 4096, 65)
}
#[bench]
fn memset_builtin_1048576_offset(b: &mut Bencher) {
memset_builtin(b, 1048576, 65)
}
#[bench]
fn memset_rust_1048576_offset(b: &mut Bencher) {
memset_rust(b, 1048576, 65)
}
#[bench]