Add binary-size optimized variants for stable and unstable sort as well as select_nth_unstable

- Stable sort uses a simple merge-sort that re-uses the existing - rather gnarly - merge function. - Unstable sort jumps directly to the branchless heapsort fallback. - select_nth_unstable jumps directly to the median_of_medians fallback, which is augmented with a custom tiny smallsort and partition impl. Some code is duplicated, but de-duplication would bring its own problems. For example, `swap_if_less` is critical for performance: if the sorting networks don't inline it, perf drops drastically. However, `#[inline(always)]` is also a poor fit; if the provided comparison function is huge, it gives the compiler an out to only instantiate `swap_if_less` once and call it. Another aspect that would suffer when making `swap_if_less` pub is having to cfg out dozens of functions in the smallsort module.
2024-08-25 19:14:00 +02:00 · 2024-08-25 19:14:00 +02:00 · 13d7b546da
commit 13d7b546da
parent c8b14ba7b6
7 changed files with 284 additions and 33 deletions
--- a/library/core/src/slice/sort/mod.rs
+++ b/library/core/src/slice/sort/mod.rs
@ -5,4 +5,5 @@ pub mod stable;
 pub mod unstable;

 pub(crate) mod select;
+#[cfg(not(feature = "optimize_for_size"))]
 pub(crate) mod shared;
--- a/library/core/src/slice/sort/select.rs
+++ b/library/core/src/slice/sort/select.rs
@ -6,9 +6,13 @@
 //! for pivot selection. Using this as a fallback ensures O(n) worst case running time with
 //! better performance than one would get using heapsort as fallback.

+use crate::intrinsics;
 use crate::mem::{self, SizedTypeProperties};
+#[cfg(not(feature = "optimize_for_size"))]
 use crate::slice::sort::shared::pivot::choose_pivot;
+#[cfg(not(feature = "optimize_for_size"))]
 use crate::slice::sort::shared::smallsort::insertion_sort_shift_left;
+#[cfg(not(feature = "optimize_for_size"))]
 use crate::slice::sort::unstable::quicksort::partition;

 /// Reorders the slice such that the element at `index` is at its final sorted position.
@ -40,7 +44,15 @@ where
        let min_idx = min_index(v, &mut is_less).unwrap();
        v.swap(min_idx, index);
    } else {
-        partition_at_index_loop(v, index, None, &mut is_less);
+        #[cfg(not(feature = "optimize_for_size"))]
+        {
+            partition_at_index_loop(v, index, None, &mut is_less);
+        }
+
+        #[cfg(feature = "optimize_for_size")]
+        {
+            median_of_medians(v, &mut is_less, index);
+        }
    }

    let (left, right) = v.split_at_mut(index);
@ -53,6 +65,7 @@ where
 // most once, it doesn't make sense to use something more sophisticated than insertion-sort.
 const INSERTION_SORT_THRESHOLD: usize = 16;

+#[cfg(not(feature = "optimize_for_size"))]
 fn partition_at_index_loop<'a, T, F>(
    mut v: &'a mut [T],
    mut index: usize,
@ -167,8 +180,17 @@ fn median_of_medians<T, F: FnMut(&T, &T) -> bool>(mut v: &mut [T], is_less: &mut
    loop {
        if v.len() <= INSERTION_SORT_THRESHOLD {
            if v.len() >= 2 {
-                insertion_sort_shift_left(v, 1, is_less);
+                #[cfg(not(feature = "optimize_for_size"))]
+                {
+                    insertion_sort_shift_left(v, 1, is_less);
+                }
+
+                #[cfg(feature = "optimize_for_size")]
+                {
+                    bubble_sort(v, is_less);
+                }
            }
+
            return;
        }

@ -230,7 +252,15 @@ fn median_of_ninthers<T, F: FnMut(&T, &T) -> bool>(v: &mut [T], is_less: &mut F)

    median_of_medians(&mut v[lo..lo + frac], is_less, pivot);

-    partition(v, lo + pivot, is_less)
+    #[cfg(not(feature = "optimize_for_size"))]
+    {
+        partition(v, lo + pivot, is_less)
+    }
+
+    #[cfg(feature = "optimize_for_size")]
+    {
+        partition_size_opt(v, lo + pivot, is_less)
+    }
 }

 /// Moves around the 9 elements at the indices a..i, such that
@ -298,3 +328,92 @@ fn median_idx<T, F: FnMut(&T, &T) -> bool>(
    }
    b
 }
+
+// It's possible to re-use the insertion sort in the smallsort module, but with optimize_for_size it
+// would clutter that module with cfg statements and make it generally harder to read and develop.
+// So to decouple things and simplify it, we use an even smaller bubble sort.
+#[cfg(feature = "optimize_for_size")]
+fn bubble_sort<T, F: FnMut(&T, &T) -> bool>(v: &mut [T], is_less: &mut F) {
+    let mut n = v.len();
+    let mut did_swap = true;
+
+    while did_swap && n > 1 {
+        did_swap = false;
+        for i in 1..n {
+            // SAFETY: `1 <= i < n <= v.len()`, so `i` and `i - 1` are always in-bounds.
+            unsafe {
+                if is_less(v.get_unchecked(i), v.get_unchecked(i - 1)) {
+                    v.swap_unchecked(i - 1, i);
+                    did_swap = true;
+                }
+            }
+        }
+        n -= 1;
+    }
+}
+
+#[cfg(feature = "optimize_for_size")]
+fn partition_size_opt<T, F>(v: &mut [T], pivot: usize, is_less: &mut F) -> usize
+where
+    F: FnMut(&T, &T) -> bool,
+{
+    let len = v.len();
+
+    // Nothing to partition; this also proves `len > 0` to the compiler for panic-free code-gen.
+    if len == 0 {
+        return 0;
+    }
+
+    if pivot >= len {
+        intrinsics::abort();
+    }
+
+    // SAFETY: `pivot < len` was checked above, so both `0` and `pivot` are in-bounds.
+    unsafe {
+        // Place the pivot at the beginning of slice.
+        v.swap_unchecked(0, pivot);
+    }
+    let (pivot, v_without_pivot) = v.split_at_mut(1);
+
+    // Assuming that Rust generates noalias LLVM IR we can be sure that a partition function
+    // signature of the form `(v: &mut [T], pivot: &T)` guarantees that pivot and v can't alias.
+    // Having this guarantee is crucial for optimizations. It's possible to copy the pivot value
+    // into a stack value, but this creates issues for types with interior mutability mandating
+    // a drop guard.
+    let pivot = &mut pivot[0];
+
+    let num_lt = partition_lomuto_branchless_simple(v_without_pivot, pivot, is_less);
+
+    if num_lt >= len {
+        intrinsics::abort();
+    }
+
+    // SAFETY: `num_lt < len` was checked above, so both `0` and `num_lt` are in-bounds.
+    unsafe {
+        // Place the pivot between the two partitions.
+        v.swap_unchecked(0, num_lt);
+    }
+
+    num_lt
+}
+
+#[cfg(feature = "optimize_for_size")]
+fn partition_lomuto_branchless_simple<T, F: FnMut(&T, &T) -> bool>(
+    v: &mut [T],
+    pivot: &T,
+    is_less: &mut F,
+) -> usize {
+    let mut left = 0;
+
+    for right in 0..v.len() {
+        // SAFETY: `right < v.len()` by the loop bounds, and `left` grows by at most 1 per
+        // iteration, so `left <= right` holds and both indices are in-bounds.
+        unsafe {
+            let right_is_lt = is_less(v.get_unchecked(right), pivot);
+            v.swap_unchecked(left, right);
+            left += right_is_lt as usize;
+        }
+    }
+
+    left
+}
--- a/library/core/src/slice/sort/shared/smallsort.rs
+++ b/library/core/src/slice/sort/shared/smallsort.rs
@ -378,7 +378,7 @@ where

 /// Swap two values in the slice pointed to by `v_base` at the position `a_pos` and `b_pos` if the
 /// value at position `b_pos` is less than the one at position `a_pos`.
-pub unsafe fn swap_if_less<T, F>(v_base: *mut T, a_pos: usize, b_pos: usize, is_less: &mut F)
+unsafe fn swap_if_less<T, F>(v_base: *mut T, a_pos: usize, b_pos: usize, is_less: &mut F)
 where
    F: FnMut(&T, &T) -> bool,
 {
--- a/library/core/src/slice/sort/stable/mod.rs
+++ b/library/core/src/slice/sort/stable/mod.rs
@ -1,15 +1,24 @@
 //! This module contains the entry points for `slice::sort`.

+#[cfg(not(feature = "optimize_for_size"))]
+use crate::cmp;
+use crate::intrinsics;
 use crate::mem::{self, MaybeUninit, SizedTypeProperties};
+#[cfg(not(feature = "optimize_for_size"))]
 use crate::slice::sort::shared::smallsort::{
    insertion_sort_shift_left, StableSmallSortTypeImpl, SMALL_SORT_GENERAL_SCRATCH_LEN,
 };
-use crate::{cmp, intrinsics};

-pub(crate) mod drift;
 pub(crate) mod merge;
+
+#[cfg(not(feature = "optimize_for_size"))]
+pub(crate) mod drift;
+#[cfg(not(feature = "optimize_for_size"))]
 pub(crate) mod quicksort;

+#[cfg(feature = "optimize_for_size")]
+pub(crate) mod tiny;
+
 /// Stable sort called driftsort by Orson Peters and Lukas Bergdoll.
 /// Design document:
 /// <https://github.com/Voultapher/sort-research-rs/blob/main/writeup/driftsort_introduction/text.md>
@ -30,25 +39,48 @@ pub fn sort<T, F: FnMut(&T, &T) -> bool, BufT: BufGuard<T>>(v: &mut [T], is_less
        return;
    }

-    // More advanced sorting methods than insertion sort are faster if called in
-    // a hot loop for small inputs, but for general-purpose code the small
-    // binary size of insertion sort is more important. The instruction cache in
-    // modern processors is very valuable, and for a single sort call in general
-    // purpose code any gains from an advanced method are cancelled by i-cache
-    // misses during the sort, and thrashing the i-cache for surrounding code.
-    const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20;
-    if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) {
-        insertion_sort_shift_left(v, 1, is_less);
-        return;
+    #[cfg(not(feature = "optimize_for_size"))]
+    {
+        // More advanced sorting methods than insertion sort are faster if called in
+        // a hot loop for small inputs, but for general-purpose code the small
+        // binary size of insertion sort is more important. The instruction cache in
+        // modern processors is very valuable, and for a single sort call in general
+        // purpose code any gains from an advanced method are cancelled by i-cache
+        // misses during the sort, and thrashing the i-cache for surrounding code.
+        const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20;
+        if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) {
+            insertion_sort_shift_left(v, 1, is_less);
+            return;
+        }
+
+        driftsort_main::<T, F, BufT>(v, is_less);
    }

-    driftsort_main::<T, F, BufT>(v, is_less);
+    #[cfg(feature = "optimize_for_size")]
+    {
+        let alloc_len = len / 2;
+
+        // For small inputs 4KiB of stack storage suffices, which allows us to avoid
+        // calling the (de-)allocator. Benchmarks showed this was quite beneficial.
+        let mut stack_buf = AlignedStorage::<T, 4096>::new();
+        let stack_scratch = stack_buf.as_uninit_slice_mut();
+        let mut heap_buf;
+        let scratch = if stack_scratch.len() >= alloc_len {
+            stack_scratch
+        } else {
+            heap_buf = BufT::with_capacity(alloc_len);
+            heap_buf.as_uninit_slice_mut()
+        };
+
+        tiny::mergesort(v, scratch, is_less);
+    }
 }

 /// See [`sort`]
 ///
 /// Deliberately don't inline the main sorting routine entrypoint to ensure the
 /// inlined insertion sort i-cache footprint remains minimal.
+#[cfg(not(feature = "optimize_for_size"))]
 #[inline(never)]
 fn driftsort_main<T, F: FnMut(&T, &T) -> bool, BufT: BufGuard<T>>(v: &mut [T], is_less: &mut F) {
    // By allocating n elements of memory we can ensure the entire input can
--- a/library/core/src/slice/sort/stable/tiny.rs
+++ b/library/core/src/slice/sort/stable/tiny.rs
@ -0,0 +1,75 @@
+//! Binary-size optimized mergesort inspired by <https://github.com/voultapher/tiny-sort-rs>.
+
+use crate::mem::{ManuallyDrop, MaybeUninit};
+use crate::ptr;
+use crate::slice::sort::stable::merge;
+
+/// Tiny recursive top-down merge sort optimized for binary size. It has no adaptiveness whatsoever,
+/// no run detection, etc.
+#[inline(always)]
+pub fn mergesort<T, F: FnMut(&T, &T) -> bool>(
+    v: &mut [T],
+    scratch: &mut [MaybeUninit<T>],
+    is_less: &mut F,
+) {
+    let len = v.len();
+
+    if len > 2 {
+        let mid = len / 2;
+
+        // SAFETY: `mid == len / 2 <= len`, so both `..mid` and `mid..` are in-bounds.
+        unsafe {
+            // Sort the left half recursively.
+            mergesort(v.get_unchecked_mut(..mid), scratch, is_less);
+            // Sort the right half recursively.
+            mergesort(v.get_unchecked_mut(mid..), scratch, is_less);
+        }
+
+        merge::merge(v, scratch, mid, is_less);
+    } else if len == 2 {
+        // Branchless swap the two elements. This reduces the recursion depth and improves
+        // perf significantly at a small binary-size cost. Trades ~10% perf boost for integers
+        // for ~50 bytes in the binary.
+
+        // SAFETY: We checked the len, the pointers we create are valid and don't overlap.
+        unsafe {
+            swap_if_less(v.as_mut_ptr(), 0, 1, is_less);
+        }
+    }
+}
+
+/// Swap two values in the slice pointed to by `v_base` at the position `a_pos` and `b_pos` if the
+/// value at position `b_pos` is less than the one at position `a_pos`.
+unsafe fn swap_if_less<T, F>(v_base: *mut T, a_pos: usize, b_pos: usize, is_less: &mut F)
+where
+    F: FnMut(&T, &T) -> bool,
+{
+    // SAFETY: the caller must guarantee that `a_pos` and `b_pos` each added to `v_base` yield
+    // valid pointers into `v_base`, and are properly aligned, and part of the same allocation.
+    unsafe {
+        let v_a = v_base.add(a_pos);
+        let v_b = v_base.add(b_pos);
+
+        // PANIC SAFETY: if is_less panics, no scratch memory was created and the slice should still be
+        // in a well defined state, without duplicates.
+
+        // Important to only swap if it is more and not if it is equal. is_less should return false for
+        // equal, so we don't swap.
+        let should_swap = is_less(&*v_b, &*v_a);
+
+        // This is a branchless version of swap if.
+        // The equivalent code with a branch would be:
+        //
+        // if should_swap {
+        //     ptr::swap(v_a, v_b);
+        // }
+
+        // The goal is to generate cmov instructions here.
+        let left_swap = if should_swap { v_b } else { v_a };
+        let right_swap = if should_swap { v_a } else { v_b };
+
+        let right_swap_tmp = ManuallyDrop::new(ptr::read(right_swap));
+        ptr::copy(left_swap, v_a, 1);
+        ptr::copy_nonoverlapping(&*right_swap_tmp, v_b, 1);
+    }
+}
--- a/library/core/src/slice/sort/unstable/mod.rs
+++ b/library/core/src/slice/sort/unstable/mod.rs
@ -2,10 +2,13 @@

 use crate::intrinsics;
 use crate::mem::SizedTypeProperties;
+#[cfg(not(feature = "optimize_for_size"))]
 use crate::slice::sort::shared::find_existing_run;
+#[cfg(not(feature = "optimize_for_size"))]
 use crate::slice::sort::shared::smallsort::insertion_sort_shift_left;

 pub(crate) mod heapsort;
+#[cfg(not(feature = "optimize_for_size"))]
 pub(crate) mod quicksort;

 /// Unstable sort called ipnsort by Lukas Bergdoll and Orson Peters.
@ -28,25 +31,37 @@ pub fn sort<T, F: FnMut(&T, &T) -> bool>(v: &mut [T], is_less: &mut F) {
        return;
    }

-    // More advanced sorting methods than insertion sort are faster if called in
-    // a hot loop for small inputs, but for general-purpose code the small
-    // binary size of insertion sort is more important. The instruction cache in
-    // modern processors is very valuable, and for a single sort call in general
-    // purpose code any gains from an advanced method are cancelled by i-cache
-    // misses during the sort, and thrashing the i-cache for surrounding code.
-    const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20;
-    if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) {
-        insertion_sort_shift_left(v, 1, is_less);
-        return;
+    #[cfg(not(feature = "optimize_for_size"))]
+    {
+        // More advanced sorting methods than insertion sort are faster if called in
+        // a hot loop for small inputs, but for general-purpose code the small
+        // binary size of insertion sort is more important. The instruction cache in
+        // modern processors is very valuable, and for a single sort call in general
+        // purpose code any gains from an advanced method are cancelled by i-cache
+        // misses during the sort, and thrashing the i-cache for surrounding code.
+        const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20;
+        if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) {
+            insertion_sort_shift_left(v, 1, is_less);
+            return;
+        }
+
+        ipnsort(v, is_less);
    }

-    ipnsort(v, is_less);
+    #[cfg(feature = "optimize_for_size")]
+    {
+        // SAFETY: We checked that `len >= 2`.
+        unsafe {
+            heapsort::heapsort(v, is_less);
+        }
+    }
 }

 /// See [`sort`]
 ///
 /// Deliberately don't inline the main sorting routine entrypoint to ensure the
 /// inlined insertion sort i-cache footprint remains minimal.
+#[cfg(not(feature = "optimize_for_size"))]
 #[inline(never)]
 fn ipnsort<T, F>(v: &mut [T], is_less: &mut F)
 where
--- a/library/core/src/slice/sort/unstable/quicksort.rs
+++ b/library/core/src/slice/sort/unstable/quicksort.rs
@ -98,13 +98,15 @@ where
        return 0;
    }

-    // Allows for panic-free code-gen by proving this property to the compiler.
    if pivot >= len {
        intrinsics::abort();
    }

-    // Place the pivot at the beginning of slice.
-    v.swap(0, pivot);
+    // SAFETY: We checked that `pivot` is in-bounds.
+    unsafe {
+        // Place the pivot at the beginning of slice.
+        v.swap_unchecked(0, pivot);
+    }
    let (pivot, v_without_pivot) = v.split_at_mut(1);

    // Assuming that Rust generates noalias LLVM IR we can be sure that a partition function
@ -118,8 +120,15 @@ where
    // compile-time by only instantiating the code that is needed. Idea by Frank Steffahn.
    let num_lt = (const { inst_partition::<T, F>() })(v_without_pivot, pivot, is_less);

-    // Place the pivot between the two partitions.
-    v.swap(0, num_lt);
+    if num_lt >= len {
+        intrinsics::abort();
+    }
+
+    // SAFETY: We checked that `num_lt` is in-bounds.
+    unsafe {
+        // Place the pivot between the two partitions.
+        v.swap_unchecked(0, num_lt);
+    }

    num_lt
 }