From a977b01090ce83dd2cfc900b77ceaeb6eb61f795 Mon Sep 17 00:00:00 2001 From: David Hoppenbrouwers Date: Sat, 2 Jul 2022 23:54:30 +0200 Subject: [PATCH] Align destination in mem* instructions. While misaligned reads are generally fast, misaligned writes aren't and can have severe penalties. --- library/compiler-builtins/src/mem/x86_64.rs | 130 ++++++++++++++------ 1 file changed, 94 insertions(+), 36 deletions(-) diff --git a/library/compiler-builtins/src/mem/x86_64.rs b/library/compiler-builtins/src/mem/x86_64.rs index 3b372d10d1a9..68ef17f1e867 100644 --- a/library/compiler-builtins/src/mem/x86_64.rs +++ b/library/compiler-builtins/src/mem/x86_64.rs @@ -16,6 +16,7 @@ // feature is present at compile-time. We don't bother detecting other features. // Note that ERMSB does not enhance the backwards (DF=1) "rep movsb". +use core::arch::asm; use core::intrinsics; use core::mem; @@ -34,40 +35,61 @@ pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) { #[inline(always)] #[cfg(not(target_feature = "ermsb"))] -pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) { - let qword_count = count >> 3; - let byte_count = count & 0b111; - // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust. - core::arch::asm!( - "repe movsq (%rsi), (%rdi)", - "mov {byte_count:e}, %ecx", - "repe movsb (%rsi), (%rdi)", - byte_count = in(reg) byte_count, +pub unsafe fn copy_forward(mut dest: *mut u8, mut src: *const u8, count: usize) { + let (pre_byte_count, qword_count, byte_count) = rep_param(dest, count); + // Separating the blocks gives the compiler more freedom to reorder instructions. + // It also allows us to trivially skip the rep movsb, which is faster when memcpying + // aligned data. 
+    if pre_byte_count > 0 {
+        asm!(
+            "rep movsb",
+            inout("ecx") pre_byte_count => _,
+            inout("rdi") dest => dest,
+            inout("rsi") src => src,
+            options(nostack, preserves_flags)
+        );
+    }
+    asm!(
+        "rep movsq",
         inout("rcx") qword_count => _,
-        inout("rdi") dest => _,
-        inout("rsi") src => _,
-        options(att_syntax, nostack, preserves_flags)
+        inout("rdi") dest => dest,
+        inout("rsi") src => src,
+        options(nostack, preserves_flags)
     );
+    if byte_count > 0 {
+        asm!(
+            "rep movsb",
+            inout("ecx") byte_count => _,
+            inout("rdi") dest => _,
+            inout("rsi") src => _,
+            options(nostack, preserves_flags)
+        );
+    }
 }
 
 #[inline(always)]
 pub unsafe fn copy_backward(dest: *mut u8, src: *const u8, count: usize) {
-    let qword_count = count >> 3;
-    let byte_count = count & 0b111;
-    // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
-    core::arch::asm!(
+    let (pre_byte_count, qword_count, byte_count) = rep_param_rev(dest, count);
+    // We can't separate this block due to std/cld
+    asm!(
         "std",
-        "repe movsq (%rsi), (%rdi)",
-        "movl {byte_count:e}, %ecx",
-        "addq $7, %rdi",
-        "addq $7, %rsi",
-        "repe movsb (%rsi), (%rdi)",
+        "rep movsb",
+        "sub rsi, 7",
+        "sub rdi, 7",
+        "mov rcx, {qword_count}",
+        "rep movsq",
+        "add rsi, 7",
+        "add rdi, 7",
+        "mov ecx, {byte_count:e}",
+        "rep movsb",
         "cld",
         byte_count = in(reg) byte_count,
-        inout("rcx") qword_count => _,
-        inout("rdi") dest.add(count).wrapping_sub(8) => _,
-        inout("rsi") src.add(count).wrapping_sub(8) => _,
-        options(att_syntax, nostack)
+        qword_count = in(reg) qword_count,
+        inout("ecx") pre_byte_count => _,
+        inout("rdi") dest.add(count - 1) => _,
+        inout("rsi") src.add(count - 1) => _,
+        // We modify the direction flag (std/cld), but restore it before returning
+        options(nostack, preserves_flags)
     );
 }
 
@@ -86,20 +108,36 @@ pub unsafe fn set_bytes(dest: *mut u8, c: u8, count: usize) {
 #[inline(always)]
 #[cfg(not(target_feature = "ermsb"))]
-pub unsafe fn set_bytes(dest: *mut u8, c: u8, count: usize) {
-    let qword_count = count >> 3;
-    
let byte_count = count & 0b111;
-    // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
-    core::arch::asm!(
-        "repe stosq %rax, (%rdi)",
-        "mov {byte_count:e}, %ecx",
-        "repe stosb %al, (%rdi)",
-        byte_count = in(reg) byte_count,
+pub unsafe fn set_bytes(mut dest: *mut u8, c: u8, count: usize) {
+    let (pre_byte_count, qword_count, byte_count) = rep_param(dest, count);
+    // Separating the blocks gives the compiler more freedom to reorder instructions.
+    // It also allows us to trivially skip the rep stosb, which is faster when memsetting
+    // aligned data.
+    if pre_byte_count > 0 {
+        asm!(
+            "rep stosb",
+            inout("ecx") pre_byte_count => _,
+            inout("rdi") dest => dest,
+            in("al") c,
+            options(nostack, preserves_flags)
+        );
+    }
+    asm!(
+        "rep stosq",
         inout("rcx") qword_count => _,
-        inout("rdi") dest => _,
+        inout("rdi") dest => dest,
         in("rax") (c as u64) * 0x0101010101010101,
-        options(att_syntax, nostack, preserves_flags)
+        options(nostack, preserves_flags)
     );
+    if byte_count > 0 {
+        asm!(
+            "rep stosb",
+            inout("ecx") byte_count => _,
+            inout("rdi") dest => _,
+            in("al") c,
+            options(nostack, preserves_flags)
+        );
+    }
 }
 
 #[inline(always)]
@@ -156,3 +194,23 @@ pub unsafe fn compare_bytes(a: *const u8, b: *const u8, n: usize) -> i32 {
         c16(a.cast(), b.cast(), n)
     }
 }
+
+/// Determine optimal parameters for a `rep` instruction.
+fn rep_param(dest: *mut u8, mut count: usize) -> (usize, usize, usize) {
+    // Unaligned writes are still slow on modern processors, so align the destination address.
+    let pre_byte_count = ((8 - (dest as usize & 0b111)) & 0b111).min(count);
+    count -= pre_byte_count;
+    let qword_count = count >> 3;
+    let byte_count = count & 0b111;
+    (pre_byte_count, qword_count, byte_count)
+}
+
+/// Determine optimal parameters for a reverse `rep` instruction (i.e. direction bit is set). 
+fn rep_param_rev(dest: *mut u8, mut count: usize) -> (usize, usize, usize) {
+    // Unaligned writes are still slow on modern processors, so align the *end* of the destination region (this copy runs backwards).
+    let pre_byte_count = ((dest as usize + count) & 0b111).min(count);
+    count -= pre_byte_count;
+    let qword_count = count >> 3;
+    let byte_count = count & 0b111;
+    (pre_byte_count, qword_count, byte_count)
+}