Auto merge of #83093 - the8472:smaller-instant-hammer, r=Amanieu

where available use AtomicU{64,128} instead of mutex for Instant backsliding protection This decreases the overhead of backsliding protection on x86 systems with unreliable TSC, e.g. windows. And on aarch64 systems where 128bit atomics are available. The following benchmarks were taken on x86_64 linux though by overriding `actually_monotonic()`, the numbers may look different on other platforms ``` # actually_monotonic() == true test time::tests::instant_contention_01_threads ... bench: 44 ns/iter (+/- 0) test time::tests::instant_contention_02_threads ... bench: 44 ns/iter (+/- 0) test time::tests::instant_contention_04_threads ... bench: 44 ns/iter (+/- 0) test time::tests::instant_contention_08_threads ... bench: 44 ns/iter (+/- 0) test time::tests::instant_contention_16_threads ... bench: 44 ns/iter (+/- 0) # 1x AtomicU64 test time::tests::instant_contention_01_threads ... bench: 65 ns/iter (+/- 0) test time::tests::instant_contention_02_threads ... bench: 157 ns/iter (+/- 20) test time::tests::instant_contention_04_threads ... bench: 281 ns/iter (+/- 53) test time::tests::instant_contention_08_threads ... bench: 555 ns/iter (+/- 77) test time::tests::instant_contention_16_threads ... bench: 883 ns/iter (+/- 107) # mutex test time::tests::instant_contention_01_threads ... bench: 60 ns/iter (+/- 2) test time::tests::instant_contention_02_threads ... bench: 770 ns/iter (+/- 231) test time::tests::instant_contention_04_threads ... bench: 1,347 ns/iter (+/- 45) test time::tests::instant_contention_08_threads ... bench: 2,693 ns/iter (+/- 114) test time::tests::instant_contention_16_threads ... bench: 5,244 ns/iter (+/- 487) ``` Since I don't have an arm machine with 128bit atomics I wasn't able to benchmark the AtomicU128 implementation.
2021-08-20 19:06:46 +00:00 · 2021-08-20 19:06:46 +00:00 · a0035916e0
parent 914a1e2c51 cd82b4246e
commit a0035916e0
3 changed files with 215 additions and 12 deletions
--- a/library/std/src/time.rs
+++ b/library/std/src/time.rs
@ -12,15 +12,14 @@

 #![stable(feature = "time", since = "1.3.0")]

+mod monotonic;
 #[cfg(test)]
 mod tests;

-use crate::cmp;
 use crate::error::Error;
 use crate::fmt;
 use crate::ops::{Add, AddAssign, Sub, SubAssign};
 use crate::sys::time;
-use crate::sys_common::mutex::StaticMutex;
 use crate::sys_common::FromInner;

 #[stable(feature = "time", since = "1.3.0")]
@ -249,14 +248,7 @@ impl Instant {
            return Instant(os_now);
        }

-        static LOCK: StaticMutex = StaticMutex::new();
-        static mut LAST_NOW: time::Instant = time::Instant::zero();
-        unsafe {
-            let _lock = LOCK.lock();
-            let now = cmp::max(LAST_NOW, os_now);
-            LAST_NOW = now;
-            Instant(now)
-        }
+        Instant(monotonic::monotonize(os_now))
    }

    /// Returns the amount of time elapsed from another instant to this one.
--- a/library/std/src/time/monotonic.rs
+++ b/library/std/src/time/monotonic.rs
@ -0,0 +1,115 @@
+use crate::sys::time;
+
+#[inline]
+pub(super) fn monotonize(raw: time::Instant) -> time::Instant {
+    inner::monotonize(raw)
+}
+
+#[cfg(all(target_has_atomic = "64", not(target_has_atomic = "128")))]
+pub mod inner {
+    use crate::sync::atomic::AtomicU64;
+    use crate::sync::atomic::Ordering::*;
+    use crate::sys::time;
+    use crate::time::Duration;
+
+    pub(in crate::time) const ZERO: time::Instant = time::Instant::zero();
+
+    // bits 30 and 31 are never used since the nanoseconds part never exceeds 10^9
+    const UNINITIALIZED: u64 = 0b11 << 30;
+    static MONO: AtomicU64 = AtomicU64::new(UNINITIALIZED);
+
+    #[inline]
+    pub(super) fn monotonize(raw: time::Instant) -> time::Instant {
+        monotonize_impl(&MONO, raw)
+    }
+
+    #[inline]
+    pub(in crate::time) fn monotonize_impl(mono: &AtomicU64, raw: time::Instant) -> time::Instant {
+        let delta = raw.checked_sub_instant(&ZERO).unwrap();
+        let secs = delta.as_secs();
+        // occupies no more than 30 bits (10^9 seconds)
+        let nanos = delta.subsec_nanos() as u64;
+
+        // This wraps around every 136 years (2^32 seconds).
+        // To detect backsliding we use wrapping arithmetic and declare forward steps smaller
+        // than 2^31 seconds as expected and everything else as a backslide which will be
+        // monotonized.
+        // This could be a problem for programs that call instants at intervals greater
+        // than 68 years. Interstellar probes may want to ensure that actually_monotonic() is true.
+        let packed = (secs << 32) | nanos;
+        let old = mono.load(Relaxed);
+
+        if old == UNINITIALIZED || packed.wrapping_sub(old) < u64::MAX / 2 {
+            mono.store(packed, Relaxed);
+            raw
+        } else {
+            // Backslide occurred. We reconstruct monotonized time from the upper 32 bit of the
+            // passed in value and the 64bits loaded from the atomic
+            let seconds_lower = old >> 32;
+            let mut seconds_upper = secs & 0xffff_ffff_0000_0000;
+            if secs & 0xffff_ffff > seconds_lower {
+                // Backslide caused the lower 32bit of the seconds part to wrap.
+                // This must be the case because the seconds part is larger even though
+                // we are in the backslide branch, i.e. the seconds count should be smaller or equal.
+                //
+                // We assume that backslides are smaller than 2^32 seconds
+                // which means we need to add 1 to the upper half to restore it.
+                //
+                // Example:
+                // most recent observed time: 0xA1_0000_0000_0000_0000u128
+                // bits stored in AtomicU64:     0x0000_0000_0000_0000u64
+                // backslide by 1s
+                // caller time is             0xA0_ffff_ffff_0000_0000u128
+                // -> we can fix up the upper half time by adding 1 << 32
+                seconds_upper = seconds_upper.wrapping_add(0x1_0000_0000);
+            }
+            let secs = seconds_upper | seconds_lower;
+            let nanos = old as u32;
+            ZERO.checked_add_duration(&Duration::new(secs, nanos)).unwrap()
+        }
+    }
+}
+
+#[cfg(target_has_atomic = "128")]
+pub mod inner {
+    use crate::sync::atomic::AtomicU128;
+    use crate::sync::atomic::Ordering::*;
+    use crate::sys::time;
+    use crate::time::Duration;
+
+    const ZERO: time::Instant = time::Instant::zero();
+    static MONO: AtomicU128 = AtomicU128::new(0);
+
+    #[inline]
+    pub(super) fn monotonize(raw: time::Instant) -> time::Instant {
+        let delta = raw.checked_sub_instant(&ZERO).unwrap();
+        // Split into seconds and nanos since Duration doesn't have a
+        // constructor that takes an u128
+        let secs = delta.as_secs() as u128;
+        let nanos = delta.subsec_nanos() as u128;
+        let timestamp: u128 = secs << 64 | nanos;
+        let timestamp = MONO.fetch_max(timestamp, Relaxed).max(timestamp);
+        let secs = (timestamp >> 64) as u64;
+        let nanos = timestamp as u32;
+        ZERO.checked_add_duration(&Duration::new(secs, nanos)).unwrap()
+    }
+}
+
+#[cfg(not(any(target_has_atomic = "64", target_has_atomic = "128")))]
+pub mod inner {
+    use crate::cmp;
+    use crate::sys::time;
+    use crate::sys_common::mutex::StaticMutex;
+
+    #[inline]
+    pub(super) fn monotonize(os_now: time::Instant) -> time::Instant {
+        static LOCK: StaticMutex = StaticMutex::new();
+        static mut LAST_NOW: time::Instant = time::Instant::zero();
+        unsafe {
+            let _lock = LOCK.lock();
+            let now = cmp::max(LAST_NOW, os_now);
+            LAST_NOW = now;
+            now
+        }
+    }
+}
--- a/library/std/src/time/tests.rs
+++ b/library/std/src/time/tests.rs
@ -1,4 +1,6 @@
 use super::{Duration, Instant, SystemTime, UNIX_EPOCH};
+#[cfg(not(target_arch = "wasm32"))]
+use test::{black_box, Bencher};

 macro_rules! assert_almost_eq {
    ($a:expr, $b:expr) => {{
@ -13,8 +15,34 @@ macro_rules! assert_almost_eq {
 #[test]
 fn instant_monotonic() {
    let a = Instant::now();
-    let b = Instant::now();
-    assert!(b >= a);
+    loop {
+        let b = Instant::now();
+        assert!(b >= a);
+        if b > a {
+            break;
+        }
+    }
+}
+
+#[test]
+#[cfg(not(target_arch = "wasm32"))]
+fn instant_monotonic_concurrent() -> crate::thread::Result<()> {
+    let threads: Vec<_> = (0..8)
+        .map(|_| {
+            crate::thread::spawn(|| {
+                let mut old = Instant::now();
+                for _ in 0..5_000_000 {
+                    let new = Instant::now();
+                    assert!(new >= old);
+                    old = new;
+                }
+            })
+        })
+        .collect();
+    for t in threads {
+        t.join()?;
+    }
+    Ok(())
 }

 #[test]
@ -163,3 +191,71 @@ fn since_epoch() {
    let hundred_twenty_years = thirty_years * 4;
    assert!(a < hundred_twenty_years);
 }
+
+#[cfg(all(target_has_atomic = "64", not(target_has_atomic = "128")))]
+#[test]
+fn monotonizer_wrapping_backslide() {
+    use super::monotonic::inner::{monotonize_impl, ZERO};
+    use core::sync::atomic::AtomicU64;
+
+    let reference = AtomicU64::new(0);
+
+    let time = match ZERO.checked_add_duration(&Duration::from_secs(0xffff_ffff)) {
+        Some(time) => time,
+        None => {
+            // platform cannot represent u32::MAX seconds so it won't have to deal with this kind
+            // of overflow either
+            return;
+        }
+    };
+
+    let monotonized = monotonize_impl(&reference, time);
+    let expected = ZERO.checked_add_duration(&Duration::from_secs(1 << 32)).unwrap();
+    assert_eq!(
+        monotonized, expected,
+        "64bit monotonizer should handle overflows in the seconds part"
+    );
+}
+
+macro_rules! bench_instant_threaded {
+    ($bench_name:ident, $thread_count:expr) => {
+        #[bench]
+        #[cfg(not(target_arch = "wasm32"))]
+        fn $bench_name(b: &mut Bencher) -> crate::thread::Result<()> {
+            use crate::sync::atomic::{AtomicBool, Ordering};
+            use crate::sync::Arc;
+
+            let running = Arc::new(AtomicBool::new(true));
+
+            let threads: Vec<_> = (0..$thread_count)
+                .map(|_| {
+                    let flag = Arc::clone(&running);
+                    crate::thread::spawn(move || {
+                        while flag.load(Ordering::Relaxed) {
+                            black_box(Instant::now());
+                        }
+                    })
+                })
+                .collect();
+
+            b.iter(|| {
+                let a = Instant::now();
+                let b = Instant::now();
+                assert!(b >= a);
+            });
+
+            running.store(false, Ordering::Relaxed);
+
+            for t in threads {
+                t.join()?;
+            }
+            Ok(())
+        }
+    };
+}
+
+bench_instant_threaded!(instant_contention_01_threads, 0);
+bench_instant_threaded!(instant_contention_02_threads, 1);
+bench_instant_threaded!(instant_contention_04_threads, 3);
+bench_instant_threaded!(instant_contention_08_threads, 7);
+bench_instant_threaded!(instant_contention_16_threads, 15);