From f1ea540e9024db9b5d2e6a6e92431875feb345b3 Mon Sep 17 00:00:00 2001 From: Erick Tryzelaar Date: Sat, 5 Jul 2014 23:11:18 -0700 Subject: [PATCH] collections: Optimize Vec when cloning from a slice llvm is currently not able to conver `Vec::extend` into a memcpy for `Copy` types, which results in methods like `Vec::push_all` to run twice as slow as it should be running. This patch takes the unsafe `Vec::clone` optimization to speed up all the operations that are cloning a slice into a `Vec`. before: test vec::tests::bench_clone_from_0000_0000 ... bench: 12 ns/iter (+/- 2) test vec::tests::bench_clone_from_0000_0010 ... bench: 125 ns/iter (+/- 4) = 80 MB/s test vec::tests::bench_clone_from_0000_0100 ... bench: 360 ns/iter (+/- 33) = 277 MB/s test vec::tests::bench_clone_from_0000_1000 ... bench: 2601 ns/iter (+/- 175) = 384 MB/s test vec::tests::bench_clone_from_0010_0000 ... bench: 12 ns/iter (+/- 2) test vec::tests::bench_clone_from_0010_0010 ... bench: 125 ns/iter (+/- 10) = 80 MB/s test vec::tests::bench_clone_from_0010_0100 ... bench: 361 ns/iter (+/- 28) = 277 MB/s test vec::tests::bench_clone_from_0100_0010 ... bench: 131 ns/iter (+/- 13) = 76 MB/s test vec::tests::bench_clone_from_0100_0100 ... bench: 360 ns/iter (+/- 9) = 277 MB/s test vec::tests::bench_clone_from_0100_1000 ... bench: 2575 ns/iter (+/- 168) = 388 MB/s test vec::tests::bench_clone_from_1000_0100 ... bench: 356 ns/iter (+/- 20) = 280 MB/s test vec::tests::bench_clone_from_1000_1000 ... bench: 2605 ns/iter (+/- 167) = 383 MB/s test vec::tests::bench_from_slice_0000 ... bench: 11 ns/iter (+/- 0) test vec::tests::bench_from_slice_0010 ... bench: 115 ns/iter (+/- 5) = 86 MB/s test vec::tests::bench_from_slice_0100 ... bench: 309 ns/iter (+/- 170) = 323 MB/s test vec::tests::bench_from_slice_1000 ... bench: 2065 ns/iter (+/- 198) = 484 MB/s test vec::tests::bench_push_all_0000_0000 ... bench: 7 ns/iter (+/- 0) test vec::tests::bench_push_all_0000_0010 ... bench: 79 ns/iter (+/- 7) = 126 MB/s test vec::tests::bench_push_all_0000_0100 ... bench: 342 ns/iter (+/- 18) = 292 MB/s test vec::tests::bench_push_all_0000_1000 ... bench: 2873 ns/iter (+/- 75) = 348 MB/s test vec::tests::bench_push_all_0010_0010 ... bench: 154 ns/iter (+/- 8) = 64 MB/s test vec::tests::bench_push_all_0100_0100 ... bench: 518 ns/iter (+/- 18) = 193 MB/s test vec::tests::bench_push_all_1000_1000 ... bench: 4490 ns/iter (+/- 223) = 222 MB/s after: test vec::tests::bench_clone_from_0000_0000 ... bench: 12 ns/iter (+/- 1) test vec::tests::bench_clone_from_0000_0010 ... bench: 123 ns/iter (+/- 5) = 81 MB/s test vec::tests::bench_clone_from_0000_0100 ... bench: 367 ns/iter (+/- 23) = 272 MB/s test vec::tests::bench_clone_from_0000_1000 ... bench: 2618 ns/iter (+/- 252) = 381 MB/s test vec::tests::bench_clone_from_0010_0000 ... bench: 12 ns/iter (+/- 1) test vec::tests::bench_clone_from_0010_0010 ... bench: 124 ns/iter (+/- 7) = 80 MB/s test vec::tests::bench_clone_from_0010_0100 ... bench: 369 ns/iter (+/- 34) = 271 MB/s test vec::tests::bench_clone_from_0100_0010 ... bench: 123 ns/iter (+/- 6) = 81 MB/s test vec::tests::bench_clone_from_0100_0100 ... bench: 371 ns/iter (+/- 25) = 269 MB/s test vec::tests::bench_clone_from_0100_1000 ... bench: 2713 ns/iter (+/- 532) = 368 MB/s test vec::tests::bench_clone_from_1000_0100 ... bench: 369 ns/iter (+/- 14) = 271 MB/s test vec::tests::bench_clone_from_1000_1000 ... bench: 2611 ns/iter (+/- 194) = 382 MB/s test vec::tests::bench_from_slice_0000 ... bench: 7 ns/iter (+/- 0) test vec::tests::bench_from_slice_0010 ... bench: 108 ns/iter (+/- 4) = 92 MB/s test vec::tests::bench_from_slice_0100 ... bench: 235 ns/iter (+/- 24) = 425 MB/s test vec::tests::bench_from_slice_1000 ... bench: 1318 ns/iter (+/- 96) = 758 MB/s test vec::tests::bench_push_all_0000_0000 ... bench: 7 ns/iter (+/- 0) test vec::tests::bench_push_all_0000_0010 ... bench: 70 ns/iter (+/- 4) = 142 MB/s test vec::tests::bench_push_all_0000_0100 ... bench: 176 ns/iter (+/- 16) = 568 MB/s test vec::tests::bench_push_all_0000_1000 ... bench: 1125 ns/iter (+/- 94) = 888 MB/s test vec::tests::bench_push_all_0010_0010 ... bench: 159 ns/iter (+/- 15) = 62 MB/s test vec::tests::bench_push_all_0100_0100 ... bench: 363 ns/iter (+/- 12) = 275 MB/s test vec::tests::bench_push_all_1000_1000 ... bench: 2860 ns/iter (+/- 415) = 349 MB/s --- src/libcollections/vec.rs | 75 +++++++++++++++++++++++---------------- 1 file changed, 44 insertions(+), 31 deletions(-) diff --git a/src/libcollections/vec.rs b/src/libcollections/vec.rs index e10fc66ea856..4dd283dbefa0 100644 --- a/src/libcollections/vec.rs +++ b/src/libcollections/vec.rs @@ -197,7 +197,9 @@ impl Vec { /// ``` #[inline] pub fn from_slice(values: &[T]) -> Vec { - values.iter().map(|x| x.clone()).collect() + let mut vector = Vec::with_capacity(values.len()); + vector.push_all(values); + vector } /// Constructs a `Vec` with copies of a value. @@ -238,7 +240,10 @@ impl Vec { /// ``` #[inline] pub fn push_all(&mut self, other: &[T]) { - self.extend(other.iter().map(|e| e.clone())); + unsafe { + self.reserve_additional(other.len()); + unsafe_push_all_clone(self, other) + } } /// Grows the `Vec` in-place. @@ -318,41 +323,31 @@ impl Vec { #[unstable] impl Clone for Vec { fn clone(&self) -> Vec { - let len = self.len; - let mut vector = Vec::with_capacity(len); - // Unsafe code so this can be optimised to a memcpy (or something - // similarly fast) when T is Copy. LLVM is easily confused, so any - // extra operations during the loop can prevent this optimisation - { - let this_slice = self.as_slice(); - while vector.len < len { - unsafe { - let len = vector.len; - ptr::write( - vector.as_mut_slice().unsafe_mut_ref(len), - this_slice.unsafe_ref(len).clone()); - } - vector.len += 1; - } + unsafe { + let mut vector = Vec::with_capacity(self.len); + unsafe_push_all_clone(&mut vector, self.as_slice()); + vector } - vector } fn clone_from(&mut self, other: &Vec) { - // drop anything in self that will not be overwritten - if self.len() > other.len() { - self.truncate(other.len()) - } + unsafe { + // drop anything in self that will not be overwritten + if self.len() > other.len() { + self.truncate(other.len()) + } - // reuse the contained values' allocations/resources. - for (place, thing) in self.mut_iter().zip(other.iter()) { - place.clone_from(thing) - } + // reuse the contained values' allocations/resources. + for (place, thing) in self.mut_iter().zip(other.iter()) { + place.clone_from(thing) + } - // self.len <= other.len due to the truncate above, so the - // slice here is always in-bounds. - let len = self.len(); - self.extend(other.slice_from(len).iter().map(|x| x.clone())); + // self.len <= other.len due to the truncate above, so the + // slice here is always in-bounds. + let slice = other.slice_from(self.len()); + self.reserve_additional(slice.len()); + unsafe_push_all_clone(self, slice) + } } } @@ -1555,6 +1550,24 @@ pub mod raw { } } +// Unsafe code so this can be optimised to a memcpy (or something similarly +// fast) when T is Copy. LLVM is easily confused, so any extra operations +// during the loop can prevent this optimisation. +// +// WARNING: You must preallocate space on the vector before you call this +// method. +#[inline(always)] +unsafe fn unsafe_push_all_clone(dst: &mut Vec, src: &[T]) { + let mut dst_len = dst.len(); + + for i in range(0, src.len()) { + ptr::write( + dst.as_mut_slice().unsafe_mut_ref(dst_len), + src.unsafe_ref(i).clone()); + dst_len += 1; + dst.set_len(dst_len); + } +} #[cfg(test)] mod tests {