Implement faster memcmp for x86_64

On x86_64, unaligned word loads within a single cache line are as fast
as aligned loads. Even when a read crosses a cache-line or page
boundary, one unaligned word read is still as fast as the multiple byte
reads it replaces.

Also add a couple more tests & benchmarks.
David Hoppenbrouwers 2022-03-01 09:50:55 +01:00
parent 735ad07501
commit 4dbd8387f9
5 changed files with 193 additions and 15 deletions
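
The core trick, as a minimal standalone sketch in safe Rust (the name
compare_words and this whole function are illustrative only, not the
code added in the diff below): compare one word per step with unaligned
loads, and drop to byte-wise comparison only to locate the exact
differing byte.

fn compare_words(a: &[u8], b: &[u8]) -> i32 {
    debug_assert_eq!(a.len(), b.len());
    let n = a.len();
    let mut i = 0;
    // Compare 8 bytes per step; from_ne_bytes on a byte slice is the safe
    // equivalent of an unaligned u64 load.
    while i + 8 <= n {
        let wa = u64::from_ne_bytes(a[i..i + 8].try_into().unwrap());
        let wb = u64::from_ne_bytes(b[i..i + 8].try_into().unwrap());
        if wa != wb {
            break; // some byte in a[i..i + 8] differs; locate it below
        }
        i += 8;
    }
    // Byte-wise scan covers the mismatching word (if any) and the tail.
    while i < n {
        if a[i] != b[i] {
            return i32::from(a[i]) - i32::from(b[i]);
        }
        i += 1;
    }
    0
}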


@@ -265,3 +265,17 @@ pub unsafe fn set_bytes(mut s: *mut u8, c: u8, mut n: usize) {
    }
    set_bytes_bytes(s, c, n);
}

#[inline(always)]
pub unsafe fn compare_bytes(s1: *const u8, s2: *const u8, n: usize) -> i32 {
    let mut i = 0;
    while i < n {
        let a = *s1.add(i);
        let b = *s2.add(i);
        if a != b {
            return a as i32 - b as i32;
        }
        i += 1;
    }
    0
}


@@ -51,16 +51,7 @@ intrinsics! {
    #[mem_builtin]
    #[cfg_attr(not(all(target_os = "windows", target_env = "gnu")), linkage = "weak")]
    pub unsafe extern "C" fn memcmp(s1: *const u8, s2: *const u8, n: usize) -> i32 {
        let mut i = 0;
        while i < n {
            let a = *s1.add(i);
            let b = *s2.add(i);
            if a != b {
                return a as i32 - b as i32;
            }
            i += 1;
        }
        0
        impls::compare_bytes(s1, s2, n)
    }

    #[mem_builtin]


@@ -16,6 +16,8 @@
// feature is present at compile-time. We don't bother detecting other features.
// Note that ERMSB does not enhance the backwards (DF=1) "rep movsb".

use core::mem;

#[inline(always)]
#[cfg(target_feature = "ermsb")]
pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) {
@@ -98,3 +100,42 @@ pub unsafe fn set_bytes(dest: *mut u8, c: u8, count: usize) {
        options(att_syntax, nostack, preserves_flags)
    );
}

#[inline(always)]
pub unsafe fn compare_bytes(a: *const u8, b: *const u8, n: usize) -> i32 {
    // Compare `n / size_of::<T>()` chunks of `T` with unaligned loads; on the
    // first mismatch, or for the remaining tail, defer to the next-smaller
    // comparison function `f`.
    unsafe fn cmp<T, U, F>(mut a: *const T, mut b: *const T, n: usize, f: F) -> i32
    where
        T: Clone + Copy + Eq,
        U: Clone + Copy + Eq,
        F: FnOnce(*const U, *const U, usize) -> i32,
    {
        for _ in 0..n / mem::size_of::<T>() {
            if a.read_unaligned() != b.read_unaligned() {
                return f(a.cast(), b.cast(), mem::size_of::<T>());
            }
            a = a.add(1);
            b = b.add(1);
        }
        f(a.cast(), b.cast(), n % mem::size_of::<T>())
    }
    // Byte-wise comparison: the final arbiter that produces the signed result.
    let c1 = |mut a: *const u8, mut b: *const u8, n| {
        for _ in 0..n {
            if a.read() != b.read() {
                return i32::from(a.read()) - i32::from(b.read());
            }
            a = a.add(1);
            b = b.add(1);
        }
        0
    };
    // Each level halves the chunk width, from 32-byte chunks down to bytes.
    let c2 = |a: *const u16, b, n| cmp(a, b, n, c1);
    let c4 = |a: *const u32, b, n| cmp(a, b, n, c2);
    let c8 = |a: *const u64, b, n| cmp(a, b, n, c4);
    let c16 = |a: *const u128, b, n| cmp(a, b, n, c8);
    let c32 = |a: *const [u128; 2], b, n| cmp(a, b, n, c16);
    c32(a.cast(), b.cast(), n)
}
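
For a sense of how this chain divides the work, here is a worked trace
for n = 1000 (illustration only, not code from the diff):

// c32 compares 1000 / 32 = 31 chunks of 32 bytes with unaligned reads,
// then passes the 1000 % 32 = 8 remaining bytes down to c16.
// c16 sees 8 / 16 = 0 chunks and passes all 8 bytes on to c8.
// c8 compares one u64, leaving 8 % 8 = 0 bytes for c4, c2, and c1,
// which all receive n = 0 and return 0.
// On a mismatch, the same descent re-compares just the differing chunk
// at ever smaller widths until c1 pinpoints the byte and returns its
// signed difference.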


@@ -96,6 +96,18 @@ fn memcmp_builtin(b: &mut Bencher, n: usize) {
    })
}

fn memcmp_builtin_unaligned(b: &mut Bencher, n: usize) {
    let v1 = AlignedVec::new(0, n);
    let mut v2 = AlignedVec::new(0, n);
    v2[n - 1] = 1;
    b.bytes = n as u64;
    b.iter(|| {
        let s1: &[u8] = black_box(&v1[0..]);
        let s2: &[u8] = black_box(&v2[1..]);
        s1.cmp(s2)
    })
}

fn memcmp_rust(b: &mut Bencher, n: usize) {
    let v1 = AlignedVec::new(0, n);
    let mut v2 = AlignedVec::new(0, n);
@@ -108,6 +120,18 @@ fn memcmp_rust(b: &mut Bencher, n: usize) {
    })
}

fn memcmp_rust_unaligned(b: &mut Bencher, n: usize) {
    let v1 = AlignedVec::new(0, n);
    let mut v2 = AlignedVec::new(0, n);
    v2[n - 1] = 1;
    b.bytes = n as u64;
    b.iter(|| {
        let s1: &[u8] = black_box(&v1[0..]);
        let s2: &[u8] = black_box(&v2[1..]);
        unsafe { memcmp(s1.as_ptr(), s2.as_ptr(), n - 1) }
    })
}

fn memmove_builtin(b: &mut Bencher, n: usize, offset: usize) {
    let mut v = AlignedVec::new(0, n + n / 2 + offset);
    b.bytes = n as u64;
@@ -209,6 +233,38 @@ fn memset_rust_1048576_offset(b: &mut Bencher) {
    memset_rust(b, 1048576, 65)
}
#[bench]
fn memcmp_builtin_8(b: &mut Bencher) {
    memcmp_builtin(b, 8)
}
#[bench]
fn memcmp_rust_8(b: &mut Bencher) {
    memcmp_rust(b, 8)
}
#[bench]
fn memcmp_builtin_16(b: &mut Bencher) {
    memcmp_builtin(b, 16)
}
#[bench]
fn memcmp_rust_16(b: &mut Bencher) {
    memcmp_rust(b, 16)
}
#[bench]
fn memcmp_builtin_32(b: &mut Bencher) {
    memcmp_builtin(b, 32)
}
#[bench]
fn memcmp_rust_32(b: &mut Bencher) {
    memcmp_rust(b, 32)
}
#[bench]
fn memcmp_builtin_64(b: &mut Bencher) {
    memcmp_builtin(b, 64)
}
#[bench]
fn memcmp_rust_64(b: &mut Bencher) {
    memcmp_rust(b, 64)
}
#[bench]
fn memcmp_builtin_4096(b: &mut Bencher) {
    memcmp_builtin(b, 4096)
@@ -225,6 +281,54 @@ fn memcmp_builtin_1048576(b: &mut Bencher) {
fn memcmp_rust_1048576(b: &mut Bencher) {
    memcmp_rust(b, 1048576)
}
#[bench]
fn memcmp_builtin_unaligned_7(b: &mut Bencher) {
    memcmp_builtin_unaligned(b, 8)
}
#[bench]
fn memcmp_rust_unaligned_7(b: &mut Bencher) {
    memcmp_rust_unaligned(b, 8)
}
#[bench]
fn memcmp_builtin_unaligned_15(b: &mut Bencher) {
    memcmp_builtin_unaligned(b, 16)
}
#[bench]
fn memcmp_rust_unaligned_15(b: &mut Bencher) {
    memcmp_rust_unaligned(b, 16)
}
#[bench]
fn memcmp_builtin_unaligned_31(b: &mut Bencher) {
    memcmp_builtin_unaligned(b, 32)
}
#[bench]
fn memcmp_rust_unaligned_31(b: &mut Bencher) {
    memcmp_rust_unaligned(b, 32)
}
#[bench]
fn memcmp_builtin_unaligned_63(b: &mut Bencher) {
    memcmp_builtin_unaligned(b, 64)
}
#[bench]
fn memcmp_rust_unaligned_63(b: &mut Bencher) {
    memcmp_rust_unaligned(b, 64)
}
#[bench]
fn memcmp_builtin_unaligned_4095(b: &mut Bencher) {
    memcmp_builtin_unaligned(b, 4096)
}
#[bench]
fn memcmp_rust_unaligned_4095(b: &mut Bencher) {
    memcmp_rust_unaligned(b, 4096)
}
#[bench]
fn memcmp_builtin_unaligned_1048575(b: &mut Bencher) {
    memcmp_builtin_unaligned(b, 1048576)
}
#[bench]
fn memcmp_rust_unaligned_1048575(b: &mut Bencher) {
    memcmp_rust_unaligned(b, 1048576)
}
#[bench]
fn memmove_builtin_4096(b: &mut Bencher) {


@@ -116,11 +116,13 @@ fn memset_nonzero() {
#[test]
fn memcmp_eq() {
    let arr1: [u8; 8] = [0, 1, 2, 3, 4, 5, 6, 7];
    let arr2: [u8; 8] = [0, 1, 2, 3, 4, 5, 6, 7];
    unsafe {
        assert_eq!(memcmp(arr1.as_ptr(), arr2.as_ptr(), 8), 0);
        assert_eq!(memcmp(arr1.as_ptr(), arr2.as_ptr(), 3), 0);
    let arr1: [u8; 32] = [
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
        24, 25, 26, 27, 28, 29, 30, 31,
    ];
    let arr2: [u8; 32] = [
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
        24, 25, 26, 27, 28, 29, 30, 31,
    ];
    for i in 0..32 {
        unsafe {
            assert_eq!(memcmp(arr1.as_ptr(), arr2.as_ptr(), i), 0);
            assert_eq!(memcmp(arr2.as_ptr(), arr1.as_ptr(), i), 0);
        }
    }
}
@@ -134,6 +136,32 @@ fn memcmp_ne() {
    }
}

#[test]
fn memcmp_ne_16() {
    let arr1: [u8; 16] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
    let arr2: [u8; 16] = [0, 1, 2, 3, 4, 5, 7, 7, 8, 9, 10, 11, 12, 13, 14, 15];
    unsafe {
        assert!(memcmp(arr1.as_ptr(), arr2.as_ptr(), 16) < 0);
        assert!(memcmp(arr2.as_ptr(), arr1.as_ptr(), 16) > 0);
    }
}

#[test]
fn memcmp_ne_32() {
    let arr1: [u8; 32] = [
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0,
    ];
    let arr2: [u8; 32] = [
        0, 1, 2, 3, 4, 5, 7, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0,
    ];
    unsafe {
        assert!(memcmp(arr1.as_ptr(), arr2.as_ptr(), 32) < 0);
        assert!(memcmp(arr2.as_ptr(), arr1.as_ptr(), 32) > 0);
    }
}
#[derive(Clone, Copy)]
struct AlignedStorage<const N: usize>([u8; N], [usize; 0]);
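
The new tests all use aligned arrays, so the unaligned read path is
exercised only by the benchmarks above. A correctness test for
misaligned inputs (a hypothetical sketch, not part of this commit)
might look like:

#[test]
fn memcmp_unaligned() {
    // Hypothetical test: offset both pointers by one byte so every word
    // read inside compare_bytes is misaligned.
    let mut arr1 = [0u8; 33];
    let mut arr2 = [0u8; 33];
    for i in 0..33 {
        arr1[i] = i as u8;
        arr2[i] = i as u8;
    }
    arr2[32] = 99;
    unsafe {
        // bytes 1..=31 match in both buffers
        assert_eq!(memcmp(arr1.as_ptr().add(1), arr2.as_ptr().add(1), 31), 0);
        // byte 32 differs: 32 vs 99, so arr1 compares less than arr2
        assert!(memcmp(arr1.as_ptr().add(1), arr2.as_ptr().add(1), 32) < 0);
    }
}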