Merge pull request #474 from Demindiro/x86_64-mem-align-dest
This commit is contained in:
commit
c22f0d54c4
1 changed files with 71 additions and 33 deletions
|
|
@ -16,6 +16,7 @@
|
|||
// feature is present at compile-time. We don't bother detecting other features.
|
||||
// Note that ERMSB does not enhance the backwards (DF=1) "rep movsb".
|
||||
|
||||
use core::arch::asm;
|
||||
use core::intrinsics;
|
||||
use core::mem;
|
||||
|
||||
|
|
@ -34,16 +35,26 @@ pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) {
|
|||
|
||||
#[inline(always)]
|
||||
#[cfg(not(target_feature = "ermsb"))]
|
||||
pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) {
|
||||
let qword_count = count >> 3;
|
||||
let byte_count = count & 0b111;
|
||||
// FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
|
||||
core::arch::asm!(
|
||||
"repe movsq (%rsi), (%rdi)",
|
||||
"mov {byte_count:e}, %ecx",
|
||||
"repe movsb (%rsi), (%rdi)",
|
||||
byte_count = in(reg) byte_count,
|
||||
pub unsafe fn copy_forward(mut dest: *mut u8, mut src: *const u8, count: usize) {
|
||||
let (pre_byte_count, qword_count, byte_count) = rep_param(dest, count);
|
||||
// Separating the blocks gives the compiler more freedom to reorder instructions.
|
||||
asm!(
|
||||
"rep movsb",
|
||||
inout("ecx") pre_byte_count => _,
|
||||
inout("rdi") dest => dest,
|
||||
inout("rsi") src => src,
|
||||
options(att_syntax, nostack, preserves_flags)
|
||||
);
|
||||
asm!(
|
||||
"rep movsq",
|
||||
inout("rcx") qword_count => _,
|
||||
inout("rdi") dest => dest,
|
||||
inout("rsi") src => src,
|
||||
options(att_syntax, nostack, preserves_flags)
|
||||
);
|
||||
asm!(
|
||||
"rep movsb",
|
||||
inout("ecx") byte_count => _,
|
||||
inout("rdi") dest => _,
|
||||
inout("rsi") src => _,
|
||||
options(att_syntax, nostack, preserves_flags)
|
||||
|
|
@ -52,22 +63,28 @@ pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) {
|
|||
|
||||
#[inline(always)]
|
||||
pub unsafe fn copy_backward(dest: *mut u8, src: *const u8, count: usize) {
|
||||
let qword_count = count >> 3;
|
||||
let byte_count = count & 0b111;
|
||||
// FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
|
||||
core::arch::asm!(
|
||||
let (pre_byte_count, qword_count, byte_count) = rep_param(dest, count);
|
||||
// We can't separate this block due to std/cld
|
||||
asm!(
|
||||
"std",
|
||||
"repe movsq (%rsi), (%rdi)",
|
||||
"movl {byte_count:e}, %ecx",
|
||||
"addq $7, %rdi",
|
||||
"addq $7, %rsi",
|
||||
"repe movsb (%rsi), (%rdi)",
|
||||
"rep movsb",
|
||||
"sub $7, %rsi",
|
||||
"sub $7, %rdi",
|
||||
"mov {qword_count}, %rcx",
|
||||
"rep movsq",
|
||||
"test {pre_byte_count:e}, {pre_byte_count:e}",
|
||||
"add $7, %rsi",
|
||||
"add $7, %rdi",
|
||||
"mov {pre_byte_count:e}, %ecx",
|
||||
"rep movsb",
|
||||
"cld",
|
||||
byte_count = in(reg) byte_count,
|
||||
inout("rcx") qword_count => _,
|
||||
inout("rdi") dest.add(count).wrapping_sub(8) => _,
|
||||
inout("rsi") src.add(count).wrapping_sub(8) => _,
|
||||
options(att_syntax, nostack)
|
||||
pre_byte_count = in(reg) pre_byte_count,
|
||||
qword_count = in(reg) qword_count,
|
||||
inout("ecx") byte_count => _,
|
||||
inout("rdi") dest.add(count - 1) => _,
|
||||
inout("rsi") src.add(count - 1) => _,
|
||||
// We modify the direction flag (std), but we restore it afterwards (cld)
|
||||
options(att_syntax, nostack, preserves_flags)
|
||||
);
|
||||
}
|
||||
|
||||
|
|
@ -86,18 +103,29 @@ pub unsafe fn set_bytes(dest: *mut u8, c: u8, count: usize) {
|
|||
|
||||
#[inline(always)]
|
||||
#[cfg(not(target_feature = "ermsb"))]
|
||||
pub unsafe fn set_bytes(dest: *mut u8, c: u8, count: usize) {
|
||||
let qword_count = count >> 3;
|
||||
let byte_count = count & 0b111;
|
||||
// FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
|
||||
core::arch::asm!(
|
||||
"repe stosq %rax, (%rdi)",
|
||||
"mov {byte_count:e}, %ecx",
|
||||
"repe stosb %al, (%rdi)",
|
||||
byte_count = in(reg) byte_count,
|
||||
pub unsafe fn set_bytes(mut dest: *mut u8, c: u8, count: usize) {
|
||||
let c = c as u64 * 0x0101_0101_0101_0101;
|
||||
let (pre_byte_count, qword_count, byte_count) = rep_param(dest, count);
|
||||
// Separating the blocks gives the compiler more freedom to reorder instructions.
|
||||
asm!(
|
||||
"rep stosb",
|
||||
inout("ecx") pre_byte_count => _,
|
||||
inout("rdi") dest => dest,
|
||||
in("rax") c,
|
||||
options(att_syntax, nostack, preserves_flags)
|
||||
);
|
||||
asm!(
|
||||
"rep stosq",
|
||||
inout("rcx") qword_count => _,
|
||||
inout("rdi") dest => dest,
|
||||
in("rax") c,
|
||||
options(att_syntax, nostack, preserves_flags)
|
||||
);
|
||||
asm!(
|
||||
"rep stosb",
|
||||
inout("ecx") byte_count => _,
|
||||
inout("rdi") dest => _,
|
||||
in("rax") (c as u64) * 0x0101010101010101,
|
||||
in("rax") c,
|
||||
options(att_syntax, nostack, preserves_flags)
|
||||
);
|
||||
}
|
||||
|
|
@ -156,3 +184,13 @@ pub unsafe fn compare_bytes(a: *const u8, b: *const u8, n: usize) -> i32 {
|
|||
c16(a.cast(), b.cast(), n)
|
||||
}
|
||||
}
|
||||
|
||||
/// Determine optimal parameters for a `rep` instruction.
///
/// Splits `count` bytes starting at `dest` into three consecutive parts and
/// returns them as `(pre_byte_count, qword_count, byte_count)`:
///
/// * `pre_byte_count` - leading bytes up to the next 8-byte boundary of `dest`
///   (0 if `dest` is already aligned), never more than `count`.
/// * `qword_count` - number of whole 8-byte words in what remains.
/// * `byte_count` - trailing bytes left over after the whole words (0..=7).
///
/// The parts always add up: `pre_byte_count + 8 * qword_count + byte_count == count`.
|
||||
fn rep_param(dest: *mut u8, mut count: usize) -> (usize, usize, usize) {
|
||||
// Unaligned writes are still slow on modern processors, so align the destination address.
// `8 - (dest & 7)` is the distance to the next 8-byte boundary; the outer `& 0b111`
// turns the result 8 (already aligned) into 0. `.min(count)` caps the prefix for short
// buffers, which also keeps the subtraction below from underflowing.
|
||||
let pre_byte_count = ((8 - (dest as usize & 0b111)) & 0b111).min(count);
|
||||
// Everything after the alignment prefix is split into 8-byte words plus a tail.
count -= pre_byte_count;
|
||||
// Whole 8-byte words in the remainder (count / 8).
let qword_count = count >> 3;
|
||||
// Leftover bytes after the whole words (count % 8).
let byte_count = count & 0b111;
|
||||
(pre_byte_count, qword_count, byte_count)
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue