From 4dbd8387f9bc3e5c30e7c3060db0c1c25cd49f89 Mon Sep 17 00:00:00 2001 From: David Hoppenbrouwers Date: Tue, 1 Mar 2022 09:50:55 +0100 Subject: [PATCH 1/8] Implement faster memcmp for x86_64 x86_64 can load unaligned words in a single cache line as fast as aligned words. Even when crossing cache or page boundaries it is just as fast to do an unaligned word read instead of multiple byte reads. Also add a couple more tests & benchmarks. --- library/compiler-builtins/src/mem/impls.rs | 14 +++ library/compiler-builtins/src/mem/mod.rs | 11 +- library/compiler-builtins/src/mem/x86_64.rs | 41 +++++++ .../testcrate/benches/mem.rs | 104 ++++++++++++++++++ .../compiler-builtins/testcrate/tests/mem.rs | 38 ++++++- 5 files changed, 193 insertions(+), 15 deletions(-) diff --git a/library/compiler-builtins/src/mem/impls.rs b/library/compiler-builtins/src/mem/impls.rs index 8151324254a0..f31366d765c1 100644 --- a/library/compiler-builtins/src/mem/impls.rs +++ b/library/compiler-builtins/src/mem/impls.rs @@ -265,3 +265,17 @@ pub unsafe fn set_bytes(mut s: *mut u8, c: u8, mut n: usize) { } set_bytes_bytes(s, c, n); } + +#[inline(always)] +pub unsafe fn compare_bytes(s1: *const u8, s2: *const u8, n: usize) { + let mut i = 0; + while i < n { + let a = *s1.add(i); + let b = *s2.add(i); + if a != b { + return a as i32 - b as i32; + } + i += 1; + } + 0 +} diff --git a/library/compiler-builtins/src/mem/mod.rs b/library/compiler-builtins/src/mem/mod.rs index a551138612bb..c5b0ddc16ec3 100644 --- a/library/compiler-builtins/src/mem/mod.rs +++ b/library/compiler-builtins/src/mem/mod.rs @@ -51,16 +51,7 @@ intrinsics! 
{ #[mem_builtin] #[cfg_attr(not(all(target_os = "windows", target_env = "gnu")), linkage = "weak")] pub unsafe extern "C" fn memcmp(s1: *const u8, s2: *const u8, n: usize) -> i32 { - let mut i = 0; - while i < n { - let a = *s1.add(i); - let b = *s2.add(i); - if a != b { - return a as i32 - b as i32; - } - i += 1; - } - 0 + impls::compare_bytes(s1, s2, n) } #[mem_builtin] diff --git a/library/compiler-builtins/src/mem/x86_64.rs b/library/compiler-builtins/src/mem/x86_64.rs index a7ab6f605bdc..0bfacf713c57 100644 --- a/library/compiler-builtins/src/mem/x86_64.rs +++ b/library/compiler-builtins/src/mem/x86_64.rs @@ -16,6 +16,8 @@ // feature is present at compile-time. We don't bother detecting other features. // Note that ERMSB does not enhance the backwards (DF=1) "rep movsb". +use core::mem; + #[inline(always)] #[cfg(target_feature = "ermsb")] pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) { @@ -98,3 +100,42 @@ pub unsafe fn set_bytes(dest: *mut u8, c: u8, count: usize) { options(att_syntax, nostack, preserves_flags) ); } + +#[inline(always)] +pub unsafe fn compare_bytes( + a: *const u8, + b: *const u8, + n: usize, +) -> i32 { + unsafe fn cmp<T, U, F>(mut a: *const T, mut b: *const T, n: usize, f: F) -> i32 + where + T: Clone + Copy + Eq, + U: Clone + Copy + Eq, + F: FnOnce(*const U, *const U, usize) -> i32, + { + for _ in 0..n / mem::size_of::<T>() { + if a.read_unaligned() != b.read_unaligned() { + return f(a.cast(), b.cast(), mem::size_of::<T>()); + } + a = a.add(1); + b = b.add(1); + } + f(a.cast(), b.cast(), n % mem::size_of::<T>()) + } + let c1 = |mut a: *const u8, mut b: *const u8, n| { + for _ in 0..n { + if a.read() != b.read() { + return i32::from(a.read()) - i32::from(b.read()); + } + a = a.add(1); + b = b.add(1); + } + 0 + }; + let c2 = |a: *const u16, b, n| cmp(a, b, n, c1); + let c4 = |a: *const u32, b, n| cmp(a, b, n, c2); + let c8 = |a: *const u64, b, n| cmp(a, b, n, c4); + let c16 = |a: *const u128, b, n| cmp(a, b, n, c8); + let c32 = |a: 
*const [u128; 2], b, n| cmp(a, b, n, c16); + c32(a.cast(), b.cast(), n) +} diff --git a/library/compiler-builtins/testcrate/benches/mem.rs b/library/compiler-builtins/testcrate/benches/mem.rs index b6883a93b248..98a040958c4d 100644 --- a/library/compiler-builtins/testcrate/benches/mem.rs +++ b/library/compiler-builtins/testcrate/benches/mem.rs @@ -96,6 +96,18 @@ fn memcmp_builtin(b: &mut Bencher, n: usize) { }) } +fn memcmp_builtin_unaligned(b: &mut Bencher, n: usize) { + let v1 = AlignedVec::new(0, n); + let mut v2 = AlignedVec::new(0, n); + v2[n - 1] = 1; + b.bytes = n as u64; + b.iter(|| { + let s1: &[u8] = black_box(&v1[0..]); + let s2: &[u8] = black_box(&v2[1..]); + s1.cmp(s2) + }) +} + fn memcmp_rust(b: &mut Bencher, n: usize) { let v1 = AlignedVec::new(0, n); let mut v2 = AlignedVec::new(0, n); @@ -108,6 +120,18 @@ fn memcmp_rust(b: &mut Bencher, n: usize) { }) } +fn memcmp_rust_unaligned(b: &mut Bencher, n: usize) { + let v1 = AlignedVec::new(0, n); + let mut v2 = AlignedVec::new(0, n); + v2[n - 1] = 1; + b.bytes = n as u64; + b.iter(|| { + let s1: &[u8] = black_box(&v1[0..]); + let s2: &[u8] = black_box(&v2[1..]); + unsafe { memcmp(s1.as_ptr(), s2.as_ptr(), n - 1) } + }) +} + fn memmove_builtin(b: &mut Bencher, n: usize, offset: usize) { let mut v = AlignedVec::new(0, n + n / 2 + offset); b.bytes = n as u64; @@ -209,6 +233,38 @@ fn memset_rust_1048576_offset(b: &mut Bencher) { memset_rust(b, 1048576, 65) } +#[bench] +fn memcmp_builtin_8(b: &mut Bencher) { + memcmp_builtin(b, 8) +} +#[bench] +fn memcmp_rust_8(b: &mut Bencher) { + memcmp_rust(b, 8) +} +#[bench] +fn memcmp_builtin_16(b: &mut Bencher) { + memcmp_builtin(b, 16) +} +#[bench] +fn memcmp_rust_16(b: &mut Bencher) { + memcmp_rust(b, 16) +} +#[bench] +fn memcmp_builtin_32(b: &mut Bencher) { + memcmp_builtin(b, 32) +} +#[bench] +fn memcmp_rust_32(b: &mut Bencher) { + memcmp_rust(b, 32) +} +#[bench] +fn memcmp_builtin_64(b: &mut Bencher) { + memcmp_builtin(b, 64) +} +#[bench] +fn memcmp_rust_64(b: &mut 
Bencher) { + memcmp_rust(b, 64) +} #[bench] fn memcmp_builtin_4096(b: &mut Bencher) { memcmp_builtin(b, 4096) @@ -225,6 +281,54 @@ fn memcmp_builtin_1048576(b: &mut Bencher) { fn memcmp_rust_1048576(b: &mut Bencher) { memcmp_rust(b, 1048576) } +#[bench] +fn memcmp_builtin_unaligned_7(b: &mut Bencher) { + memcmp_builtin_unaligned(b, 8) +} +#[bench] +fn memcmp_rust_unaligned_7(b: &mut Bencher) { + memcmp_rust_unaligned(b, 8) +} +#[bench] +fn memcmp_builtin_unaligned_15(b: &mut Bencher) { + memcmp_builtin_unaligned(b, 16) +} +#[bench] +fn memcmp_rust_unaligned_15(b: &mut Bencher) { + memcmp_rust_unaligned(b, 16) +} +#[bench] +fn memcmp_builtin_unaligned_31(b: &mut Bencher) { + memcmp_builtin_unaligned(b, 32) +} +#[bench] +fn memcmp_rust_unaligned_31(b: &mut Bencher) { + memcmp_rust_unaligned(b, 32) +} +#[bench] +fn memcmp_builtin_unaligned_63(b: &mut Bencher) { + memcmp_builtin_unaligned(b, 64) +} +#[bench] +fn memcmp_rust_unaligned_63(b: &mut Bencher) { + memcmp_rust_unaligned(b, 64) +} +#[bench] +fn memcmp_builtin_unaligned_4095(b: &mut Bencher) { + memcmp_builtin_unaligned(b, 4096) +} +#[bench] +fn memcmp_rust_unaligned_4095(b: &mut Bencher) { + memcmp_rust_unaligned(b, 4096) +} +#[bench] +fn memcmp_builtin_unaligned_1048575(b: &mut Bencher) { + memcmp_builtin_unaligned(b, 1048576) +} +#[bench] +fn memcmp_rust_unaligned_1048575(b: &mut Bencher) { + memcmp_rust_unaligned(b, 1048576) +} #[bench] fn memmove_builtin_4096(b: &mut Bencher) { diff --git a/library/compiler-builtins/testcrate/tests/mem.rs b/library/compiler-builtins/testcrate/tests/mem.rs index 3f20e72a04c8..69a63e71df59 100644 --- a/library/compiler-builtins/testcrate/tests/mem.rs +++ b/library/compiler-builtins/testcrate/tests/mem.rs @@ -116,11 +116,13 @@ fn memset_nonzero() { #[test] fn memcmp_eq() { - let arr1: [u8; 8] = [0, 1, 2, 3, 4, 5, 6, 7]; - let arr2: [u8; 8] = [0, 1, 2, 3, 4, 5, 6, 7]; - unsafe { - assert_eq!(memcmp(arr1.as_ptr(), arr2.as_ptr(), 8), 0); - assert_eq!(memcmp(arr1.as_ptr(), 
arr2.as_ptr(), 3), 0); + let arr1: [u8; 32] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]; + let arr2: [u8; 32] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]; + for i in 0..32 { + unsafe { + assert_eq!(memcmp(arr1.as_ptr(), arr2.as_ptr(), i), 0); + assert_eq!(memcmp(arr2.as_ptr(), arr1.as_ptr(), i), 0); + } } } @@ -134,6 +136,32 @@ fn memcmp_ne() { } } +#[test] +fn memcmp_ne_16() { + let arr1: [u8; 16] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; + let arr2: [u8; 16] = [0, 1, 2, 3, 4, 5, 7, 7, 8, 9, 10, 11, 12, 13, 14, 15]; + unsafe { + assert!(memcmp(arr1.as_ptr(), arr2.as_ptr(), 16) < 0); + assert!(memcmp(arr2.as_ptr(), arr1.as_ptr(), 16) > 0); + } +} + +#[test] +fn memcmp_ne_32() { + let arr1: [u8; 32] = [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, + ]; + let arr2: [u8; 32] = [ + 0, 1, 2, 3, 4, 5, 7, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, + ]; + unsafe { + assert!(memcmp(arr1.as_ptr(), arr2.as_ptr(), 32) < 0); + assert!(memcmp(arr2.as_ptr(), arr1.as_ptr(), 32) > 0); + } +} + #[derive(Clone, Copy)] struct AlignedStorage([u8; N], [usize; 0]); From d6650678de2239397ae8ab1a6af443a2b2ecfcef Mon Sep 17 00:00:00 2001 From: David Hoppenbrouwers Date: Fri, 27 May 2022 22:37:54 +0200 Subject: [PATCH 2/8] Fix formatting --- library/compiler-builtins/testcrate/tests/mem.rs | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/library/compiler-builtins/testcrate/tests/mem.rs b/library/compiler-builtins/testcrate/tests/mem.rs index 69a63e71df59..8385dfc2cec9 100644 --- a/library/compiler-builtins/testcrate/tests/mem.rs +++ b/library/compiler-builtins/testcrate/tests/mem.rs @@ -116,8 +116,14 @@ fn memset_nonzero() { #[test] fn memcmp_eq() { - let arr1: [u8; 32] = [0, 1, 2, 3, 4, 5, 6, 7, 
8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]; - let arr2: [u8; 32] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]; + let arr1: [u8; 32] = [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, + ]; + let arr2: [u8; 32] = [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, + ]; for i in 0..32 { unsafe { assert_eq!(memcmp(arr1.as_ptr(), arr2.as_ptr(), i), 0); From e7a8932e3b6173d98b0925ee5011d8601d023a3b Mon Sep 17 00:00:00 2001 From: David Hoppenbrouwers Date: Sat, 28 May 2022 00:09:02 +0200 Subject: [PATCH 3/8] Fix CI, better memcmp tests --- library/compiler-builtins/src/mem/impls.rs | 2 +- library/compiler-builtins/src/mem/x86_64.rs | 6 +- .../compiler-builtins/testcrate/tests/mem.rs | 55 +++++-------------- 3 files changed, 15 insertions(+), 48 deletions(-) diff --git a/library/compiler-builtins/src/mem/impls.rs b/library/compiler-builtins/src/mem/impls.rs index f31366d765c1..72003a5c472b 100644 --- a/library/compiler-builtins/src/mem/impls.rs +++ b/library/compiler-builtins/src/mem/impls.rs @@ -267,7 +267,7 @@ pub unsafe fn set_bytes(mut s: *mut u8, c: u8, mut n: usize) { } #[inline(always)] -pub unsafe fn compare_bytes(s1: *const u8, s2: *const u8, n: usize) { +pub unsafe fn compare_bytes(s1: *const u8, s2: *const u8, n: usize) -> i32 { let mut i = 0; while i < n { let a = *s1.add(i); diff --git a/library/compiler-builtins/src/mem/x86_64.rs b/library/compiler-builtins/src/mem/x86_64.rs index 0bfacf713c57..fc89aa768d33 100644 --- a/library/compiler-builtins/src/mem/x86_64.rs +++ b/library/compiler-builtins/src/mem/x86_64.rs @@ -102,11 +102,7 @@ pub unsafe fn set_bytes(dest: *mut u8, c: u8, count: usize) { } #[inline(always)] -pub unsafe fn compare_bytes( - a: *const u8, - b: *const u8, - n: usize, -) 
-> i32 { +pub unsafe fn compare_bytes(a: *const u8, b: *const u8, n: usize) -> i32 { unsafe fn cmp(mut a: *const T, mut b: *const T, n: usize, f: F) -> i32 where T: Clone + Copy + Eq, diff --git a/library/compiler-builtins/testcrate/tests/mem.rs b/library/compiler-builtins/testcrate/tests/mem.rs index 8385dfc2cec9..48ac95adc17f 100644 --- a/library/compiler-builtins/testcrate/tests/mem.rs +++ b/library/compiler-builtins/testcrate/tests/mem.rs @@ -116,55 +116,26 @@ fn memset_nonzero() { #[test] fn memcmp_eq() { - let arr1: [u8; 32] = [ - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, - 25, 26, 27, 28, 29, 30, 31, - ]; - let arr2: [u8; 32] = [ - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, - 25, 26, 27, 28, 29, 30, 31, - ]; - for i in 0..32 { + let arr1 @ arr2 = gen_arr::<256>(); + for i in 0..256 { unsafe { - assert_eq!(memcmp(arr1.as_ptr(), arr2.as_ptr(), i), 0); - assert_eq!(memcmp(arr2.as_ptr(), arr1.as_ptr(), i), 0); + assert_eq!(memcmp(arr1.0.as_ptr(), arr2.0.as_ptr(), i), 0); + assert_eq!(memcmp(arr2.0.as_ptr(), arr1.0.as_ptr(), i), 0); } } } #[test] fn memcmp_ne() { - let arr1: [u8; 8] = [0, 1, 2, 3, 4, 5, 6, 7]; - let arr2: [u8; 8] = [0, 1, 2, 3, 4, 5, 7, 7]; - unsafe { - assert!(memcmp(arr1.as_ptr(), arr2.as_ptr(), 8) < 0); - assert!(memcmp(arr2.as_ptr(), arr1.as_ptr(), 8) > 0); - } -} - -#[test] -fn memcmp_ne_16() { - let arr1: [u8; 16] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; - let arr2: [u8; 16] = [0, 1, 2, 3, 4, 5, 7, 7, 8, 9, 10, 11, 12, 13, 14, 15]; - unsafe { - assert!(memcmp(arr1.as_ptr(), arr2.as_ptr(), 16) < 0); - assert!(memcmp(arr2.as_ptr(), arr1.as_ptr(), 16) > 0); - } -} - -#[test] -fn memcmp_ne_32() { - let arr1: [u8; 32] = [ - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, - ]; - let arr2: [u8; 32] = [ - 0, 1, 2, 3, 4, 5, 7, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, - 0, 0, 0, 0, - ]; - unsafe { - assert!(memcmp(arr1.as_ptr(), arr2.as_ptr(), 32) < 0); - assert!(memcmp(arr2.as_ptr(), arr1.as_ptr(), 32) > 0); + let arr1 @ arr2 = gen_arr::<256>(); + for i in 0..256 { + let mut diff_arr = arr1; + diff_arr.0[i] = 127; + let expect = diff_arr.0[i].cmp(&arr2.0[i]); + for k in i + 1..256 { + let result = unsafe { memcmp(diff_arr.0.as_ptr(), arr2.0.as_ptr(), k) }; + assert_eq!(expect, result.cmp(&0)); + } } } From 2071d05a19980737bbb15948c5d914cae7ba23de Mon Sep 17 00:00:00 2001 From: David Hoppenbrouwers Date: Sat, 28 May 2022 00:50:05 +0200 Subject: [PATCH 4/8] Always inline compare_bytes::cmp --- library/compiler-builtins/src/mem/x86_64.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/library/compiler-builtins/src/mem/x86_64.rs b/library/compiler-builtins/src/mem/x86_64.rs index fc89aa768d33..65b61224dc22 100644 --- a/library/compiler-builtins/src/mem/x86_64.rs +++ b/library/compiler-builtins/src/mem/x86_64.rs @@ -103,6 +103,7 @@ pub unsafe fn set_bytes(dest: *mut u8, c: u8, count: usize) { #[inline(always)] pub unsafe fn compare_bytes(a: *const u8, b: *const u8, n: usize) -> i32 { + #[inline(always)] unsafe fn cmp(mut a: *const T, mut b: *const T, n: usize, f: F) -> i32 where T: Clone + Copy + Eq, From 217748f91bbe20959262a46246edf2d0d3c9a7bd Mon Sep 17 00:00:00 2001 From: David Hoppenbrouwers Date: Sat, 28 May 2022 01:23:50 +0200 Subject: [PATCH 5/8] Fix panic not being optimized out. I don't know why it isn't being optimized out though, which worries me. 
--- library/compiler-builtins/src/lib.rs | 1 + library/compiler-builtins/src/mem/x86_64.rs | 10 ++++++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/library/compiler-builtins/src/lib.rs b/library/compiler-builtins/src/lib.rs index 009923d27e5b..acac040be332 100644 --- a/library/compiler-builtins/src/lib.rs +++ b/library/compiler-builtins/src/lib.rs @@ -6,6 +6,7 @@ #![feature(compiler_builtins)] #![feature(core_ffi_c)] #![feature(core_intrinsics)] +#![feature(inline_const)] #![feature(lang_items)] #![feature(linkage)] #![feature(naked_functions)] diff --git a/library/compiler-builtins/src/mem/x86_64.rs b/library/compiler-builtins/src/mem/x86_64.rs index 65b61224dc22..6eecd5a515e6 100644 --- a/library/compiler-builtins/src/mem/x86_64.rs +++ b/library/compiler-builtins/src/mem/x86_64.rs @@ -110,14 +110,20 @@ pub unsafe fn compare_bytes(a: *const u8, b: *const u8, n: usize) -> i32 { U: Clone + Copy + Eq, F: FnOnce(*const U, *const U, usize) -> i32, { - for _ in 0..n / mem::size_of::<T>() { + // Just to be sure we're actually working with powers of two... + let _ = const { 1 - mem::size_of::<T>().count_ones() }; // <= 1 + let _ = const { mem::size_of::<T>().count_ones() - 1 }; // >= 1 + // This should be equivalent to division with power-of-two sizes, except the former + // somehow still leaves a call to panic because ?? 
+ for _ in 0..n >> mem::size_of::<T>().trailing_zeros() { if a.read_unaligned() != b.read_unaligned() { return f(a.cast(), b.cast(), mem::size_of::<T>()); } a = a.add(1); b = b.add(1); } - f(a.cast(), b.cast(), n % mem::size_of::<T>()) + // Ditto + f(a.cast(), b.cast(), n & (mem::size_of::<T>() - 1)) } let c1 = |mut a: *const u8, mut b: *const u8, n| { for _ in 0..n { From 95d2cd550288c798e14be620229de936c54cc0c3 Mon Sep 17 00:00:00 2001 From: David Hoppenbrouwers Date: Sat, 28 May 2022 08:16:46 +0200 Subject: [PATCH 6/8] Fix rustfmt silliness --- library/compiler-builtins/src/mem/x86_64.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/library/compiler-builtins/src/mem/x86_64.rs b/library/compiler-builtins/src/mem/x86_64.rs index 6eecd5a515e6..66b51fedf8a9 100644 --- a/library/compiler-builtins/src/mem/x86_64.rs +++ b/library/compiler-builtins/src/mem/x86_64.rs @@ -113,6 +113,7 @@ pub unsafe fn compare_bytes(a: *const u8, b: *const u8, n: usize) -> i32 { // Just to be sure we're actually working with powers of two... let _ = const { 1 - mem::size_of::<T>().count_ones() }; // <= 1 let _ = const { mem::size_of::<T>().count_ones() - 1 }; // >= 1 + // This should be equivalent to division with power-of-two sizes, except the former // somehow still leaves a call to panic because ?? for _ in 0..n >> mem::size_of::<T>().trailing_zeros() { From b94e93ead806cc90f9831ca6a150318edcea35fe Mon Sep 17 00:00:00 2001 From: David Hoppenbrouwers Date: Sat, 28 May 2022 22:46:16 +0200 Subject: [PATCH 7/8] Slightly optimize main (32b) memcmp loop It only seems to save a single instruction at first sight yet the effects are significant. 
--- library/compiler-builtins/src/mem/x86_64.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/library/compiler-builtins/src/mem/x86_64.rs b/library/compiler-builtins/src/mem/x86_64.rs index 66b51fedf8a9..2b4875697ae5 100644 --- a/library/compiler-builtins/src/mem/x86_64.rs +++ b/library/compiler-builtins/src/mem/x86_64.rs @@ -116,7 +116,8 @@ pub unsafe fn compare_bytes(a: *const u8, b: *const u8, n: usize) -> i32 { // This should be equivalent to division with power-of-two sizes, except the former // somehow still leaves a call to panic because ?? - for _ in 0..n >> mem::size_of::<T>().trailing_zeros() { + let end = a.add(n >> mem::size_of::<T>().trailing_zeros()); + while a != end { if a.read_unaligned() != b.read_unaligned() { return f(a.cast(), b.cast(), mem::size_of::<T>()); } From cb63d7d500f03b19618fd166e1cd9afd91612fa6 Mon Sep 17 00:00:00 2001 From: David Hoppenbrouwers Date: Tue, 31 May 2022 08:20:30 +0200 Subject: [PATCH 8/8] Use unchecked_div/rem --- library/compiler-builtins/src/mem/x86_64.rs | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/library/compiler-builtins/src/mem/x86_64.rs b/library/compiler-builtins/src/mem/x86_64.rs index 2b4875697ae5..4d2f6e5ee329 100644 --- a/library/compiler-builtins/src/mem/x86_64.rs +++ b/library/compiler-builtins/src/mem/x86_64.rs @@ -16,6 +16,7 @@ // feature is present at compile-time. We don't bother detecting other features. // Note that ERMSB does not enhance the backwards (DF=1) "rep movsb". +use core::intrinsics; use core::mem; #[inline(always)] @@ -110,13 +111,10 @@ pub unsafe fn compare_bytes(a: *const u8, b: *const u8, n: usize) -> i32 { U: Clone + Copy + Eq, F: FnOnce(*const U, *const U, usize) -> i32, { - // Just to be sure we're actually working with powers of two... - let _ = const { 1 - mem::size_of::<T>().count_ones() }; // <= 1 - let _ = const { mem::size_of::<T>().count_ones() - 1 }; // >= 1 + // Ensure T is not a ZST. 
+ const { assert!(mem::size_of::<T>() != 0) }; - // This should be equivalent to division with power-of-two sizes, except the former - // somehow still leaves a call to panic because ?? - let end = a.add(n >> mem::size_of::<T>().trailing_zeros()); + let end = a.add(intrinsics::unchecked_div(n, mem::size_of::<T>())); while a != end { if a.read_unaligned() != b.read_unaligned() { return f(a.cast(), b.cast(), mem::size_of::<T>()); @@ -124,8 +122,11 @@ pub unsafe fn compare_bytes(a: *const u8, b: *const u8, n: usize) -> i32 { a = a.add(1); b = b.add(1); } - // Ditto - f(a.cast(), b.cast(), n & (mem::size_of::<T>() - 1)) + f( + a.cast(), + b.cast(), + intrinsics::unchecked_rem(n, mem::size_of::<T>()), + ) } let c1 = |mut a: *const u8, mut b: *const u8, n| { for _ in 0..n {