Skip to content

Commit a419dd1

Browse files
authored
Rollup merge of #39538 - stjepang:slightly-optimize-sort, r=alexcrichton
Slightly optimize slice::sort First, get rid of some bound checks. Second, instead of comparing by ternary `compare` function, use a binary function testing whether an element is less than some other element. This apparently makes it easier for the compiler to reason about the code. I've noticed the same effect with [pdqsort](https://github.com/stjepang/pdqsort) crate. Benchmark: ``` name before ns/iter after ns/iter diff ns/iter diff % slice::bench::sort_large_ascending 8,969 (8919 MB/s) 7,410 (10796 MB/s) -1,559 -17.38% slice::bench::sort_large_big_ascending 355,640 (3599 MB/s) 359,137 (3564 MB/s) 3,497 0.98% slice::bench::sort_large_big_descending 427,112 (2996 MB/s) 424,721 (3013 MB/s) -2,391 -0.56% slice::bench::sort_large_big_random 2,207,799 (579 MB/s) 2,138,804 (598 MB/s) -68,995 -3.13% slice::bench::sort_large_descending 13,694 (5841 MB/s) 13,514 (5919 MB/s) -180 -1.31% slice::bench::sort_large_mostly_ascending 239,697 (333 MB/s) 203,542 (393 MB/s) -36,155 -15.08% slice::bench::sort_large_mostly_descending 270,102 (296 MB/s) 234,263 (341 MB/s) -35,839 -13.27% slice::bench::sort_large_random 513,406 (155 MB/s) 470,084 (170 MB/s) -43,322 -8.44% slice::bench::sort_large_random_expensive 23,650,321 (3 MB/s) 23,675,098 (3 MB/s) 24,777 0.10% slice::bench::sort_medium_ascending 143 (5594 MB/s) 132 (6060 MB/s) -11 -7.69% slice::bench::sort_medium_descending 197 (4060 MB/s) 188 (4255 MB/s) -9 -4.57% slice::bench::sort_medium_random 3,358 (238 MB/s) 3,271 (244 MB/s) -87 -2.59% slice::bench::sort_small_ascending 32 (2500 MB/s) 32 (2500 MB/s) 0 0.00% slice::bench::sort_small_big_ascending 97 (13195 MB/s) 97 (13195 MB/s) 0 0.00% slice::bench::sort_small_big_descending 247 (5182 MB/s) 249 (5140 MB/s) 2 0.81% slice::bench::sort_small_big_random 502 (2549 MB/s) 498 (2570 MB/s) -4 -0.80% slice::bench::sort_small_descending 55 (1454 MB/s) 61 (1311 MB/s) 6 10.91% slice::bench::sort_small_random 358 (223 MB/s) 356 (224 MB/s) -2 -0.56% ```
2 parents 70cc1d3 + fa457bf commit a419dd1

File tree

2 files changed

+39
-38
lines changed

2 files changed

+39
-38
lines changed

src/libcollections/slice.rs

+36-32
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@
9898
#![cfg_attr(test, allow(unused_imports, dead_code))]
9999

100100
use alloc::boxed::Box;
101-
use core::cmp::Ordering::{self, Greater};
101+
use core::cmp::Ordering::{self, Less};
102102
use core::mem::size_of;
103103
use core::mem;
104104
use core::ptr;
@@ -1089,7 +1089,7 @@ impl<T> [T] {
10891089
pub fn sort(&mut self)
10901090
where T: Ord
10911091
{
1092-
self.sort_by(|a, b| a.cmp(b))
1092+
merge_sort(self, |a, b| a.lt(b));
10931093
}
10941094

10951095
/// Sorts the slice using `f` to extract a key to compare elements by.
@@ -1119,7 +1119,7 @@ impl<T> [T] {
11191119
pub fn sort_by_key<B, F>(&mut self, mut f: F)
11201120
where F: FnMut(&T) -> B, B: Ord
11211121
{
1122-
self.sort_by(|a, b| f(a).cmp(&f(b)))
1122+
merge_sort(self, |a, b| f(a).lt(&f(b)));
11231123
}
11241124

11251125
/// Sorts the slice using `compare` to compare elements.
@@ -1149,10 +1149,10 @@ impl<T> [T] {
11491149
/// ```
11501150
#[stable(feature = "rust1", since = "1.0.0")]
11511151
#[inline]
1152-
pub fn sort_by<F>(&mut self, compare: F)
1152+
pub fn sort_by<F>(&mut self, mut compare: F)
11531153
where F: FnMut(&T, &T) -> Ordering
11541154
{
1155-
merge_sort(self, compare)
1155+
merge_sort(self, |a, b| compare(a, b) == Less);
11561156
}
11571157

11581158
/// Copies the elements from `src` into `self`.
@@ -1355,10 +1355,10 @@ impl<T: Clone> ToOwned for [T] {
13551355
/// Inserts `v[0]` into pre-sorted sequence `v[1..]` so that whole `v[..]` becomes sorted.
13561356
///
13571357
/// This is the integral subroutine of insertion sort.
1358-
fn insert_head<T, F>(v: &mut [T], compare: &mut F)
1359-
where F: FnMut(&T, &T) -> Ordering
1358+
fn insert_head<T, F>(v: &mut [T], is_less: &mut F)
1359+
where F: FnMut(&T, &T) -> bool
13601360
{
1361-
if v.len() >= 2 && compare(&v[0], &v[1]) == Greater {
1361+
if v.len() >= 2 && is_less(&v[1], &v[0]) {
13621362
unsafe {
13631363
// There are three ways to implement insertion here:
13641364
//
@@ -1381,12 +1381,12 @@ fn insert_head<T, F>(v: &mut [T], compare: &mut F)
13811381

13821382
// Intermediate state of the insertion process is always tracked by `hole`, which
13831383
// serves two purposes:
1384-
// 1. Protects integrity of `v` from panics in `compare`.
1384+
// 1. Protects integrity of `v` from panics in `is_less`.
13851385
// 2. Fills the remaining hole in `v` in the end.
13861386
//
13871387
// Panic safety:
13881388
//
1389-
// If `compare` panics at any point during the process, `hole` will get dropped and
1389+
// If `is_less` panics at any point during the process, `hole` will get dropped and
13901390
// fill the hole in `v` with `tmp`, thus ensuring that `v` still holds every object it
13911391
// initially held exactly once.
13921392
let mut hole = InsertionHole {
@@ -1396,7 +1396,7 @@ fn insert_head<T, F>(v: &mut [T], compare: &mut F)
13961396
ptr::copy_nonoverlapping(&v[1], &mut v[0], 1);
13971397

13981398
for i in 2..v.len() {
1399-
if compare(&tmp.value, &v[i]) != Greater {
1399+
if !is_less(&v[i], &tmp.value) {
14001400
break;
14011401
}
14021402
ptr::copy_nonoverlapping(&v[i], &mut v[i - 1], 1);
@@ -1432,8 +1432,8 @@ fn insert_head<T, F>(v: &mut [T], compare: &mut F)
14321432
///
14331433
/// The two slices must be non-empty and `mid` must be in bounds. Buffer `buf` must be long enough
14341434
/// to hold a copy of the shorter slice. Also, `T` must not be a zero-sized type.
1435-
unsafe fn merge<T, F>(v: &mut [T], mid: usize, buf: *mut T, compare: &mut F)
1436-
where F: FnMut(&T, &T) -> Ordering
1435+
unsafe fn merge<T, F>(v: &mut [T], mid: usize, buf: *mut T, is_less: &mut F)
1436+
where F: FnMut(&T, &T) -> bool
14371437
{
14381438
let len = v.len();
14391439
let v = v.as_mut_ptr();
@@ -1449,12 +1449,12 @@ unsafe fn merge<T, F>(v: &mut [T], mid: usize, buf: *mut T, compare: &mut F)
14491449
// hole in `v`.
14501450
//
14511451
// Intermediate state of the process is always tracked by `hole`, which serves two purposes:
1452-
// 1. Protects integrity of `v` from panics in `compare`.
1452+
// 1. Protects integrity of `v` from panics in `is_less`.
14531453
// 2. Fills the remaining hole in `v` if the longer run gets consumed first.
14541454
//
14551455
// Panic safety:
14561456
//
1457-
// If `compare` panics at any point during the process, `hole` will get dropped and fill the
1457+
// If `is_less` panics at any point during the process, `hole` will get dropped and fill the
14581458
// hole in `v` with the unconsumed range in `buf`, thus ensuring that `v` still holds every
14591459
// object it initially held exactly once.
14601460
let mut hole;
@@ -1476,7 +1476,7 @@ unsafe fn merge<T, F>(v: &mut [T], mid: usize, buf: *mut T, compare: &mut F)
14761476
while *left < hole.end && right < v_end {
14771477
// Consume the lesser side.
14781478
// If equal, prefer the left run to maintain stability.
1479-
let to_copy = if compare(&**left, &*right) == Greater {
1479+
let to_copy = if is_less(&*right, &**left) {
14801480
get_and_increment(&mut right)
14811481
} else {
14821482
get_and_increment(left)
@@ -1500,7 +1500,7 @@ unsafe fn merge<T, F>(v: &mut [T], mid: usize, buf: *mut T, compare: &mut F)
15001500
while v < *left && buf < *right {
15011501
// Consume the greater side.
15021502
// If equal, prefer the right run to maintain stability.
1503-
let to_copy = if compare(&*left.offset(-1), &*right.offset(-1)) == Greater {
1503+
let to_copy = if is_less(&*right.offset(-1), &*left.offset(-1)) {
15041504
decrement_and_get(left)
15051505
} else {
15061506
decrement_and_get(right)
@@ -1550,8 +1550,8 @@ unsafe fn merge<T, F>(v: &mut [T], mid: usize, buf: *mut T, compare: &mut F)
15501550
/// 2. for every `i` in `2..runs.len()`: `runs[i - 2].len > runs[i - 1].len + runs[i].len`
15511551
///
15521552
/// The invariants ensure that the total running time is `O(n log n)` worst-case.
1553-
fn merge_sort<T, F>(v: &mut [T], mut compare: F)
1554-
where F: FnMut(&T, &T) -> Ordering
1553+
fn merge_sort<T, F>(v: &mut [T], mut is_less: F)
1554+
where F: FnMut(&T, &T) -> bool
15551555
{
15561556
// Sorting has no meaningful behavior on zero-sized types.
15571557
if size_of::<T>() == 0 {
@@ -1565,7 +1565,7 @@ fn merge_sort<T, F>(v: &mut [T], mut compare: F)
15651565
//
15661566
// Short runs are extended using insertion sort to span at least `min_run` elements, in order
15671567
// to improve performance.
1568-
let (max_insertion, min_run) = if size_of::<T>() <= 16 {
1568+
let (max_insertion, min_run) = if size_of::<T>() <= 2 * mem::size_of::<usize>() {
15691569
(64, 32)
15701570
} else {
15711571
(32, 16)
@@ -1577,15 +1577,15 @@ fn merge_sort<T, F>(v: &mut [T], mut compare: F)
15771577
if len <= max_insertion {
15781578
if len >= 2 {
15791579
for i in (0..len-1).rev() {
1580-
insert_head(&mut v[i..], &mut compare);
1580+
insert_head(&mut v[i..], &mut is_less);
15811581
}
15821582
}
15831583
return;
15841584
}
15851585

15861586
// Allocate a buffer to use as scratch memory. We keep the length 0 so we can keep in it
15871587
// shallow copies of the contents of `v` without risking the dtors running on copies if
1588-
// `compare` panics. When merging two sorted runs, this buffer holds a copy of the shorter run,
1588+
// `is_less` panics. When merging two sorted runs, this buffer holds a copy of the shorter run,
15891589
// which will always have length at most `len / 2`.
15901590
let mut buf = Vec::with_capacity(len / 2);
15911591

@@ -1600,14 +1600,18 @@ fn merge_sort<T, F>(v: &mut [T], mut compare: F)
16001600
let mut start = end - 1;
16011601
if start > 0 {
16021602
start -= 1;
1603-
if compare(&v[start], &v[start + 1]) == Greater {
1604-
while start > 0 && compare(&v[start - 1], &v[start]) == Greater {
1605-
start -= 1;
1606-
}
1607-
v[start..end].reverse();
1608-
} else {
1609-
while start > 0 && compare(&v[start - 1], &v[start]) != Greater {
1610-
start -= 1;
1603+
unsafe {
1604+
if is_less(v.get_unchecked(start + 1), v.get_unchecked(start)) {
1605+
while start > 0 && is_less(v.get_unchecked(start),
1606+
v.get_unchecked(start - 1)) {
1607+
start -= 1;
1608+
}
1609+
v[start..end].reverse();
1610+
} else {
1611+
while start > 0 && !is_less(v.get_unchecked(start),
1612+
v.get_unchecked(start - 1)) {
1613+
start -= 1;
1614+
}
16111615
}
16121616
}
16131617
}
@@ -1616,7 +1620,7 @@ fn merge_sort<T, F>(v: &mut [T], mut compare: F)
16161620
// merge sort on short sequences, so this significantly improves performance.
16171621
while start > 0 && end - start < min_run {
16181622
start -= 1;
1619-
insert_head(&mut v[start..end], &mut compare);
1623+
insert_head(&mut v[start..end], &mut is_less);
16201624
}
16211625

16221626
// Push this run onto the stack.
@@ -1632,7 +1636,7 @@ fn merge_sort<T, F>(v: &mut [T], mut compare: F)
16321636
let right = runs[r];
16331637
unsafe {
16341638
merge(&mut v[left.start .. right.start + right.len], left.len, buf.as_mut_ptr(),
1635-
&mut compare);
1639+
&mut is_less);
16361640
}
16371641
runs[r] = Run {
16381642
start: left.start,

src/libcollectionstest/slice.rs

+3-6
Original file line numberDiff line numberDiff line change
@@ -1429,18 +1429,15 @@ mod bench {
14291429
fn sort_large_random_expensive(b: &mut Bencher) {
14301430
let len = 10000;
14311431
b.iter(|| {
1432+
let mut v = gen_random(len);
14321433
let mut count = 0;
1433-
let cmp = move |a: &u64, b: &u64| {
1434+
v.sort_by(|a: &u64, b: &u64| {
14341435
count += 1;
14351436
if count % 1_000_000_000 == 0 {
14361437
panic!("should not happen");
14371438
}
14381439
(*a as f64).cos().partial_cmp(&(*b as f64).cos()).unwrap()
1439-
};
1440-
1441-
let mut v = gen_random(len);
1442-
v.sort_by(cmp);
1443-
1440+
});
14441441
black_box(count);
14451442
});
14461443
b.bytes = len as u64 * mem::size_of::<u64>() as u64;

0 commit comments

Comments
 (0)