Skip to content

Instantly share code, notes, and snippets.

@nagisa
Last active February 2, 2020 00:22
Show Gist options
  • Save nagisa/c7b51916adc7641c853e67aec21d6407 to your computer and use it in GitHub Desktop.
// bugs : sysret_ss_attrs null_seg spectre_v1 spectre_v2 spec_store_bypass
args/align_offset_v0/<24>(8, 16)
time: [4.4721 ns 4.4764 ns 4.4834 ns]
args/align_offset_v4/<24>(8, 16)
time: [4.1772 ns 4.1899 ns 4.2122 ns]
args/align_offset_v0/<24>(8, const 16)
time: [4.3952 ns 4.3966 ns 4.3985 ns]
args/align_offset_v4/<24>(8, const 16)
time: [1.4238 ns 1.4243 ns 1.4250 ns]
args/align_offset_v0/<5>(3, const 16)
time: [4.7113 ns 4.7140 ns 4.7174 ns]
args/align_offset_v4/<5>(3, const 16)
time: [1.1592 ns 1.1613 ns 1.1640 ns]
args/align_offset_v0/<24>(8, 512)
time: [7.0166 ns 7.0270 ns 7.0407 ns]
args/align_offset_v4/<24>(8, 512)
time: [4.7613 ns 4.7737 ns 4.7936 ns]
args/align_offset_v0/<24>(8, const 512)
time: [5.3369 ns 5.3449 ns 5.3557 ns]
args/align_offset_v4/<24>(8, const 512)
time: [2.1115 ns 2.1139 ns 2.1171 ns]
args/align_offset_v0/<5>(3, const 512)
time: [5.3354 ns 5.3399 ns 5.3486 ns]
args/align_offset_v4/<5>(3, const 512)
time: [2.4223 ns 2.4255 ns 2.4288 ns]
args/align_offset_v0/<24>(8, 4096)
time: [7.1637 ns 7.1658 ns 7.1682 ns]
args/align_offset_v4/<24>(8, 4096)
time: [5.8294 ns 5.8350 ns 5.8427 ns]
args/align_offset_v0/<24>(8, const 4096)
time: [5.9679 ns 5.9697 ns 5.9720 ns]
args/align_offset_v4/<24>(8, const 4096)
time: [2.8193 ns 2.8249 ns 2.8320 ns]
args/align_offset_v0/<5>(3, const 4096)
time: [6.0019 ns 6.0035 ns 6.0058 ns]
args/align_offset_v4/<5>(3, const 4096)
time: [2.4268 ns 2.4317 ns 2.4377 ns]
args/align_offset_v0/<24>(8, 1048576)
time: [8.8583 ns 8.8927 ns 8.9543 ns]
args/align_offset_v4/<24>(8, 1048576)
time: [6.8852 ns 6.8899 ns 6.8962 ns]
args/align_offset_v0/<24>(8, const 1048576)
time: [7.2200 ns 7.2244 ns 7.2300 ns]
args/align_offset_v4/<24>(8, const 1048576)
time: [3.6942 ns 3.7008 ns 3.7116 ns]
args/align_offset_v0/<5>(3, const 1048576)
time: [7.2475 ns 7.2904 ns 7.3544 ns]
args/align_offset_v4/<5>(3, const 1048576)
time: [3.3883 ns 3.3974 ns 3.4088 ns]
args/align_offset_v0/<24>(8, 16)
time: [17.922 ns 17.923 ns 17.925 ns]
args/align_offset_v4/<24>(8, 16)
time: [12.920 ns 12.921 ns 12.921 ns]
args/align_offset_v0/<24>(8, const 16)
time: [11.253 ns 11.253 ns 11.253 ns]
args/align_offset_v4/<24>(8, const 16)
time: [6.6690 ns 6.6701 ns 6.6715 ns]
args/align_offset_v0/<5>(3, const 16)
time: [12.095 ns 12.100 ns 12.106 ns]
args/align_offset_v4/<5>(3, const 16)
time: [5.4183 ns 5.4184 ns 5.4186 ns]
args/align_offset_v0/<24>(8, 512)
time: [32.091 ns 32.092 ns 32.092 ns]
args/align_offset_v4/<24>(8, 512)
time: [17.088 ns 17.088 ns 17.089 ns]
args/align_offset_v0/<24>(8, const 512)
time: [25.007 ns 25.009 ns 25.011 ns]
args/align_offset_v4/<24>(8, const 512)
time: [10.420 ns 10.422 ns 10.425 ns]
args/align_offset_v0/<5>(3, const 512)
time: [26.259 ns 26.264 ns 26.270 ns]
args/align_offset_v4/<5>(3, const 512)
time: [14.170 ns 14.172 ns 14.176 ns]
args/align_offset_v0/<24>(8, 4096)
time: [35.426 ns 35.427 ns 35.428 ns]
args/align_offset_v4/<24>(8, 4096)
time: [21.672 ns 21.672 ns 21.672 ns]
args/align_offset_v0/<24>(8, const 4096)
time: [28.342 ns 28.344 ns 28.349 ns]
args/align_offset_v4/<24>(8, const 4096)
time: [15.004 ns 15.006 ns 15.009 ns]
args/align_offset_v0/<5>(3, const 4096)
time: [27.924 ns 27.926 ns 27.930 ns]
args/align_offset_v4/<5>(3, const 4096)
time: [14.170 ns 14.170 ns 14.171 ns]
args/align_offset_v0/<24>(8, 1048576)
time: [47.095 ns 47.099 ns 47.106 ns]
args/align_offset_v4/<24>(8, 1048576)
time: [26.257 ns 26.259 ns 26.263 ns]
args/align_offset_v0/<24>(8, const 1048576)
time: [40.011 ns 40.012 ns 40.012 ns]
args/align_offset_v4/<24>(8, const 1048576)
time: [19.595 ns 19.601 ns 19.609 ns]
args/align_offset_v0/<5>(3, const 1048576)
time: [39.599 ns 39.604 ns 39.612 ns]
args/align_offset_v4/<5>(3, const 1048576)
time: [18.755 ns 18.755 ns 18.756 ns]
diff --git a/benches/align_offset.rs b/benches/align_offset.rs
index fe0e07b..bfe04b2 100644
--- a/benches/align_offset.rs
+++ b/benches/align_offset.rs
@@ -1,51 +1,66 @@
-use bench_align_offset::ALIGN_OFFSET_FNS;
use criterion::{black_box as bb, criterion_group, criterion_main, BenchmarkId, Criterion};
fn bench_align_offset(c: &mut Criterion) {
let mut group = c.benchmark_group("args");
- for (p, stride) in [(8usize, 24usize)].iter().copied() {
- for align in [16usize, 128, 256, 512, 2048, 4096, 1 << 17, 1 << 20]
- .iter()
- .copied()
- {
- for i in 0..ALIGN_OFFSET_FNS.len() {
- group.bench_function(
- BenchmarkId::new(
- format!("align_offset_v{}", i),
- format!("({}, {}, {}", p, stride, align),
- ),
- |b| b.iter(|| unsafe { ALIGN_OFFSET_FNS[i](bb(p), bb(stride), bb(align)) }),
- );
- }
- for i in 0..ALIGN_OFFSET_FNS.len() {
- group.bench_function(
- BenchmarkId::new(
- format!("align_offset_v{}", i),
- format!("({}, {}, {}*", p, stride, align),
- ),
- |b| b.iter(|| unsafe { ALIGN_OFFSET_FNS[i](bb(p), bb(stride), align) }),
- );
- }
- for i in 0..ALIGN_OFFSET_FNS.len() {
- group.bench_function(
- BenchmarkId::new(
- format!("align_offset_v{}", i),
- format!("({}, {}*, {}", p, stride, align),
- ),
- |b| b.iter(|| unsafe { ALIGN_OFFSET_FNS[i](bb(p), stride, bb(align)) }),
- );
- }
- for i in 0..ALIGN_OFFSET_FNS.len() {
- group.bench_function(
- BenchmarkId::new(
- format!("align_offset_v{}", i),
- format!("({}, {}*, {}*", p, stride, align),
- ),
- |b| b.iter(|| unsafe { ALIGN_OFFSET_FNS[i](bb(p), stride, align) }),
- );
- }
- }
+
+ for align in [16usize, 512, 4096, 1 << 20].iter().copied() {
+ group.bench_function(
+ BenchmarkId::new(
+ "align_offset_v0",
+ format!("<24>(8, {})", align),
+ ),
+ |b| b.iter(|| unsafe {
+ bench_align_offset::align_offset_v0::<24>(bb(8), bb(align))
+ }),
+ );
+ group.bench_function(
+ BenchmarkId::new(
+ "align_offset_v4",
+ format!("<24>(8, {})", align),
+ ),
+ |b| b.iter(|| unsafe {
+ bench_align_offset::align_offset_v4::<24>(bb(8), bb(align))
+ }),
+ );
+ group.bench_function(
+ BenchmarkId::new(
+ "align_offset_v0",
+ format!("<24>(8, const {})", align),
+ ),
+ |b| b.iter(|| unsafe {
+ bench_align_offset::align_offset_v0::<24>(bb(8), align)
+ }),
+ );
+ group.bench_function(
+ BenchmarkId::new(
+ "align_offset_v4",
+ format!("<24>(8, const {})", align),
+ ),
+ |b| b.iter(|| unsafe {
+ bench_align_offset::align_offset_v4::<24>(bb(8), align)
+ }),
+ );
+
+ group.bench_function(
+ BenchmarkId::new(
+ "align_offset_v0",
+ format!("<5>(3, const {})", align),
+ ),
+ |b| b.iter(|| unsafe {
+ bench_align_offset::align_offset_v0::<5>(bb(3), align)
+ }),
+ );
+ group.bench_function(
+ BenchmarkId::new(
+ "align_offset_v4",
+ format!("<5>(3, const {})", align),
+ ),
+ |b| b.iter(|| unsafe {
+ bench_align_offset::align_offset_v4::<5>(bb(3), align)
+ }),
+ );
}
+
group.finish();
}
diff --git a/src/lib.rs b/src/lib.rs
index 4336551..1483041 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,63 +1,8 @@
#![feature(core_intrinsics)]
+#![feature(const_generics)]
mod v0;
-mod v1;
-mod v2;
-mod v3;
mod v4;
pub use v0::align_offset as align_offset_v0;
-pub use v1::align_offset as align_offset_v1;
-pub use v2::align_offset as align_offset_v2;
-pub use v3::align_offset as align_offset_v3;
pub use v4::align_offset as align_offset_v4;
-
-pub const ALIGN_OFFSET_FNS: [unsafe fn(usize, usize, usize) -> usize; 5] = [
- align_offset_v0,
- align_offset_v1,
- align_offset_v2,
- align_offset_v3,
- align_offset_v4,
-];
-
-#[test]
-fn align_offset_weird_strides() {
- unsafe fn test_weird_stride(ptr: usize, stride: usize, align: usize) -> bool {
- let mut expected = usize::max_value();
- // Naive but definitely correct way to find the *first* aligned element of stride::<T>.
- for el in 0..align {
- if (ptr + el * stride) % align == 0 {
- expected = el;
- break;
- }
- }
- let mut ret = false;
- for i in 0..ALIGN_OFFSET_FNS.len() {
- let got = ALIGN_OFFSET_FNS[i](ptr, stride, align);
- if got != expected {
- eprintln!(
- "align_offset_v{}: aligning {:x} (with stride of {}) to {}, expected {}, got {}",
- i, ptr, stride, align, expected, got
- );
- ret |= true;
- }
- }
- return ret;
- }
-
- // For pointers of stride != 1, we verify the algorithm against the naivest possible
- // implementation
- let mut align = 1;
- let mut x = false;
- while align < 1024 {
- for ptr in 1usize..4 * align {
- for stride in 3..11 {
- unsafe {
- x |= test_weird_stride(ptr, stride, align);
- }
- }
- }
- align = (align + 1).next_power_of_two();
- }
- assert!(!x);
-}
diff --git a/src/v0.rs b/src/v0.rs
index 366f722..b5be01c 100644
--- a/src/v0.rs
+++ b/src/v0.rs
@@ -1,6 +1,6 @@
use core::intrinsics;
-pub unsafe fn align_offset(p: usize, stride: usize, a: usize) -> usize {
+pub unsafe fn align_offset<const STRIDE: usize>(p: usize, a: usize) -> usize {
/// Calculate multiplicative modular inverse of `x` modulo `m`.
///
/// This implementation is tailored for align_offset and has following preconditions:
@@ -51,6 +51,7 @@ pub unsafe fn align_offset(p: usize, stride: usize, a: usize) -> usize {
let a_minus_one = a.wrapping_sub(1);
let pmoda = p & a_minus_one;
+ let stride = STRIDE;
if pmoda == 0 {
// Already aligned. Yay!
diff --git a/src/v4.rs b/src/v4.rs
index ddf8846..9537351 100644
--- a/src/v4.rs
+++ b/src/v4.rs
@@ -1,6 +1,6 @@
use core::intrinsics;
-pub unsafe fn align_offset(p: usize, stride: usize, a: usize) -> usize {
+pub unsafe fn align_offset<const STRIDE: usize>(p: usize, a: usize) -> usize {
/// Calculate multiplicative modular inverse of `x` modulo `m`, where
/// `m = 2^mpow` and `mask = m - 1`.
///
@@ -61,6 +61,7 @@ pub unsafe fn align_offset(p: usize, stride: usize, a: usize) -> usize {
let a_minus_one = a.wrapping_sub(1);
let pmoda = p & a_minus_one;
+ let stride = STRIDE;
if pmoda == 0 {
// Already aligned. Yay!
// bugs : cpu_meltdown spectre_v1 spectre_v2 spec_store_bypass l1tf mds swapgs itlb_multihit
args/align_offset_v0/<24>(8, 16)
time: [15.065 ns 15.122 ns 15.184 ns]
args/align_offset_v4/<24>(8, 16)
time: [4.3836 ns 4.4066 ns 4.4326 ns]
args/align_offset_v0/<24>(8, const 16)
time: [9.5868 ns 9.6397 ns 9.7000 ns]
args/align_offset_v4/<24>(8, const 16)
time: [1.4840 ns 1.4903 ns 1.4979 ns]
args/align_offset_v0/<5>(3, const 16)
time: [8.9378 ns 8.9565 ns 8.9765 ns]
args/align_offset_v4/<5>(3, const 16)
time: [1.0090 ns 1.0140 ns 1.0200 ns]
args/align_offset_v0/<24>(8, 512)
time: [14.592 ns 14.626 ns 14.662 ns]
args/align_offset_v4/<24>(8, 512)
time: [5.0840 ns 5.1075 ns 5.1337 ns]
args/align_offset_v0/<24>(8, const 512)
time: [11.420 ns 11.471 ns 11.528 ns]
args/align_offset_v4/<24>(8, const 512)
time: [2.7074 ns 2.7409 ns 2.7790 ns]
args/align_offset_v0/<5>(3, const 512)
time: [10.989 ns 11.032 ns 11.080 ns]
args/align_offset_v4/<5>(3, const 512)
time: [2.9416 ns 2.9660 ns 2.9954 ns]
args/align_offset_v0/<24>(8, 4096)
time: [14.995 ns 15.065 ns 15.146 ns]
args/align_offset_v4/<24>(8, 4096)
time: [6.2128 ns 6.2481 ns 6.2901 ns]
args/align_offset_v0/<24>(8, const 4096)
time: [11.594 ns 11.652 ns 11.713 ns]
args/align_offset_v4/<24>(8, const 4096)
time: [3.2236 ns 3.2344 ns 3.2465 ns]
args/align_offset_v0/<5>(3, const 4096)
time: [10.891 ns 10.929 ns 10.972 ns]
args/align_offset_v4/<5>(3, const 4096)
time: [2.9214 ns 2.9529 ns 2.9915 ns]
args/align_offset_v0/<24>(8, 1048576)
time: [15.623 ns 15.680 ns 15.743 ns]
args/align_offset_v4/<24>(8, 1048576)
time: [7.2406 ns 7.2695 ns 7.3028 ns]
args/align_offset_v0/<24>(8, const 1048576)
time: [12.175 ns 12.208 ns 12.243 ns]
args/align_offset_v4/<24>(8, const 1048576)
time: [4.1520 ns 4.1666 ns 4.1836 ns]
args/align_offset_v0/<5>(3, const 1048576)
time: [11.804 ns 11.838 ns 11.877 ns]
args/align_offset_v4/<5>(3, const 1048576)
time: [3.5327 ns 3.5546 ns 3.5823 ns]
// bugs : cpu_meltdown spectre_v1 spectre_v2 spec_store_bypass l1tf
args/align_offset_v0/<24>(8, 16)
time: [9.7291 ns 9.7618 ns 9.8019 ns]
args/align_offset_v4/<24>(8, 16)
time: [3.4615 ns 3.4628 ns 3.4643 ns]
args/align_offset_v0/<24>(8, const 16)
time: [7.7940 ns 7.7954 ns 7.7972 ns]
args/align_offset_v4/<24>(8, const 16)
time: [1.2188 ns 1.2196 ns 1.2205 ns]
args/align_offset_v0/<5>(3, const 16)
time: [7.1460 ns 7.1520 ns 7.1585 ns]
args/align_offset_v4/<5>(3, const 16)
time: [818.61 ps 819.82 ps 821.17 ps]
args/align_offset_v0/<24>(8, 512)
time: [12.724 ns 12.754 ns 12.786 ns]
args/align_offset_v4/<24>(8, 512)
time: [4.1613 ns 4.1763 ns 4.1928 ns]
args/align_offset_v0/<24>(8, const 512)
time: [9.3090 ns 9.3203 ns 9.3321 ns]
args/align_offset_v4/<24>(8, const 512)
time: [2.1038 ns 2.1087 ns 2.1147 ns]
args/align_offset_v0/<5>(3, const 512)
time: [8.5298 ns 8.5336 ns 8.5393 ns]
args/align_offset_v4/<5>(3, const 512)
time: [7.5177 ns 7.5536 ns 7.5956 ns]
args/align_offset_v0/<24>(8, 4096)
time: [12.819 ns 12.875 ns 12.944 ns]
args/align_offset_v4/<24>(8, 4096)
time: [5.0213 ns 5.0271 ns 5.0338 ns]
args/align_offset_v0/<24>(8, const 4096)
time: [9.3860 ns 9.4130 ns 9.4412 ns]
args/align_offset_v4/<24>(8, const 4096)
time: [2.7312 ns 2.7368 ns 2.7429 ns]
args/align_offset_v0/<5>(3, const 4096)
time: [8.6224 ns 8.6483 ns 8.6758 ns]
args/align_offset_v4/<5>(3, const 4096)
time: [7.5934 ns 7.6248 ns 7.6569 ns]
args/align_offset_v0/<24>(8, 1048576)
time: [13.585 ns 13.643 ns 13.700 ns]
args/align_offset_v4/<24>(8, 1048576)
time: [5.9430 ns 5.9442 ns 5.9455 ns]
args/align_offset_v0/<24>(8, const 1048576)
time: [10.178 ns 10.204 ns 10.242 ns]
args/align_offset_v4/<24>(8, const 1048576)
time: [3.3155 ns 3.3164 ns 3.3175 ns]
args/align_offset_v0/<5>(3, const 1048576)
time: [9.4072 ns 9.4686 ns 9.5528 ns]
args/align_offset_v4/<5>(3, const 1048576)
time: [8.4526 ns 8.5024 ns 8.5527 ns]
args/align_offset_v0/<24>(8, 16)
time: [6.3633 ns 6.3650 ns 6.3666 ns]
args/align_offset_v4/<24>(8, 16)
time: [6.0332 ns 6.0347 ns 6.0363 ns]
args/align_offset_v0/<24>(8, const 16)
time: [4.1927 ns 4.1952 ns 4.1992 ns]
args/align_offset_v4/<24>(8, const 16)
time: [4.0846 ns 4.0876 ns 4.0928 ns]
args/align_offset_v0/<5>(3, const 16)
time: [4.1727 ns 4.1759 ns 4.1815 ns]
args/align_offset_v4/<5>(3, const 16)
time: [3.9395 ns 3.9405 ns 3.9414 ns]
args/align_offset_v0/<24>(8, 512)
time: [8.0810 ns 8.0861 ns 8.0954 ns]
args/align_offset_v4/<24>(8, 512)
time: [6.5192 ns 6.5225 ns 6.5260 ns]
args/align_offset_v0/<24>(8, const 512)
time: [4.4775 ns 4.4785 ns 4.4795 ns]
args/align_offset_v4/<24>(8, const 512)
time: [3.8124 ns 3.8132 ns 3.8140 ns]
args/align_offset_v0/<5>(3, const 512)
time: [4.3254 ns 4.3269 ns 4.3291 ns]
args/align_offset_v4/<5>(3, const 512)
time: [3.9977 ns 3.9991 ns 4.0005 ns]
args/align_offset_v0/<24>(8, 4096)
time: [8.0912 ns 8.0933 ns 8.0954 ns]
args/align_offset_v4/<24>(8, 4096)
time: [7.2076 ns 7.2381 ns 7.2699 ns]
args/align_offset_v0/<24>(8, const 4096)
time: [4.4752 ns 4.4757 ns 4.4763 ns]
args/align_offset_v4/<24>(8, const 4096)
time: [4.0250 ns 4.0261 ns 4.0273 ns]
args/align_offset_v0/<5>(3, const 4096)
time: [4.3262 ns 4.3271 ns 4.3281 ns]
args/align_offset_v4/<5>(3, const 4096)
time: [3.9899 ns 3.9905 ns 3.9912 ns]
args/align_offset_v0/<24>(8, 1048576)
time: [8.7230 ns 8.7249 ns 8.7268 ns]
args/align_offset_v4/<24>(8, 1048576)
time: [6.9903 ns 6.9930 ns 6.9961 ns]
args/align_offset_v0/<24>(8, const 1048576)
time: [5.1931 ns 5.1946 ns 5.1975 ns]
args/align_offset_v4/<24>(8, const 1048576)
time: [4.3535 ns 4.3546 ns 4.3558 ns]
args/align_offset_v0/<5>(3, const 1048576)
time: [5.3515 ns 5.3525 ns 5.3537 ns]
args/align_offset_v4/<5>(3, const 1048576)
time: [4.3250 ns 4.3266 ns 4.3281 ns]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment