diff --git a/src/bin/miri.rs b/src/bin/miri.rs index ef1429a35020..1117b69116a5 100644 --- a/src/bin/miri.rs +++ b/src/bin/miri.rs @@ -195,6 +195,9 @@ fn main() { "-Zmiri-disable-stacked-borrows" => { miri_config.stacked_borrows = false; } + "-Zmiri-disable-data-race-detector" => { + miri_config.data_race_detector = false; + } "-Zmiri-disable-alignment-check" => { miri_config.check_alignment = miri::AlignmentCheck::None; } diff --git a/src/data_race.rs b/src/data_race.rs index 57f09146d6f8..822ceab8fa04 100644 --- a/src/data_race.rs +++ b/src/data_race.rs @@ -1,16 +1,36 @@ -//! Implementation of a data-race detector -//! uses Lamport Timestamps / Vector-clocks -//! base on the Dyamic Race Detection for C++: -//! - https://www.doc.ic.ac.uk/~afd/homepages/papers/pdfs/2017/POPL.pdf -//! to extend data-race detection to work correctly with fences -//! and RMW operations +//! Implementation of a data-race detector using Lamport Timestamps / Vector-clocks +//! based on the Dyamic Race Detection for C++: +//! https://www.doc.ic.ac.uk/~afd/homepages/papers/pdfs/2017/POPL.pdf +//! which does not report false-positives when fences are used, and gives better +//! accuracy in presence of read-modify-write operations. +//! //! This does not explore weak memory orders and so can still miss data-races -//! but should not report false-positives +//! but should not report false-positives +//! //! Data-race definiton from(https://en.cppreference.com/w/cpp/language/memory_model#Threads_and_data_races): -//! - if a memory location is accessed by twice is a data-race unless: -//! - both operations execute on the same thread/signal-handler -//! - both conflicting operations are atomic operations (1 atomic and 1 non-atomic race) -//! - 1 of the operations happens-before the other operation (see link for definition) +//! a data race occurs between two memory accesses if they are on different threads, at least one operation +//! 
is non-atomic, at least one operation is a write and neither access happens-before the other. Read the link +//! for full definition. +//! +//! This re-uses vector indexes for threads that are known to be unable to report data-races, this is valid +//! because it only re-uses vector indexes once all currently-active (not-terminated) threads have an internal +//! vector clock that happens-after the join operation of the candidate thread. Threads that have not been joined +//! on are not considered. Since the thread's vector clock will only increase and a data-race implies that +//! there is some index x where clock[x] > thread_clock, when this is true clock[candidate-idx] > thread_clock +//! can never hold and hence a data-race can never be reported in that vector index again. +//! This means that the thread-index can be safely re-used, starting on the next timestamp for the newly created +//! thread. +//! +//! The sequentially consistant ordering corresponds to the ordering that the threads +//! are currently scheduled, this means that the data-race detector has no additional +//! logic for sequentially consistent accesses at the moment since they are indistinguishable +//! from acquire/release operations. If weak memory orderings are explored then this +//! may need to change or be updated accordingly. +//! +//! FIXME: +//! currently we have our own local copy of the currently active thread index and names, this is due +//! in part to the inability to access the current location of threads.active_thread inside the AllocExtra +//! read, write and deallocate functions and should be cleaned up in the future. 
use std::{ fmt::Debug, rc::Rc, @@ -19,23 +39,23 @@ use std::{ use rustc_index::vec::{Idx, IndexVec}; use rustc_target::abi::Size; -use rustc_middle::ty::layout::TyAndLayout; +use rustc_middle::{mir, ty::layout::TyAndLayout}; use rustc_data_structures::fx::{FxHashSet, FxHashMap}; use crate::{ MiriEvalContext, MiriEvalContextExt, ThreadId, Tag, RangeMap, InterpResult, Pointer, ScalarMaybeUninit, - MPlaceTy, OpTy, MemPlaceMeta, - VClock, VSmallClockSet, VectorIdx, VTimestamp + MPlaceTy, OpTy, MemPlaceMeta, ImmTy, Immediate, + VClock, VSmallClockMap, VectorIdx, VTimestamp }; pub type AllocExtra = VClockAlloc; pub type MemoryExtra = Rc; -/// Valid atomic read-write operations, alias of atomic::Ordering (not non-exhaustive) +/// Valid atomic read-write operations, alias of atomic::Ordering (not non-exhaustive). #[derive(Copy, Clone, PartialEq, Eq, Debug)] -pub enum AtomicRWOp { +pub enum AtomicRwOp { Relaxed, Acquire, Release, @@ -43,7 +63,7 @@ pub enum AtomicRWOp { SeqCst, } -/// Valid atomic read operations, subset of atomic::Ordering +/// Valid atomic read operations, subset of atomic::Ordering. #[derive(Copy, Clone, PartialEq, Eq, Debug)] pub enum AtomicReadOp { Relaxed, @@ -51,7 +71,7 @@ pub enum AtomicReadOp { SeqCst, } -/// Valid atomic write operations, subset of atomic::Ordering +/// Valid atomic write operations, subset of atomic::Ordering. #[derive(Copy, Clone, PartialEq, Eq, Debug)] pub enum AtomicWriteOp { Relaxed, @@ -60,7 +80,7 @@ pub enum AtomicWriteOp { } -/// Valid atomic fence operations, subset of atomic::Ordering +/// Valid atomic fence operations, subset of atomic::Ordering. 
#[derive(Copy, Clone, PartialEq, Eq, Debug)] pub enum AtomicFenceOp { Acquire, @@ -69,315 +89,124 @@ pub enum AtomicFenceOp { SeqCst, } -/// Evaluation context extensions -impl<'mir, 'tcx: 'mir> EvalContextExt<'mir, 'tcx> for MiriEvalContext<'mir, 'tcx> {} -pub trait EvalContextExt<'mir, 'tcx: 'mir>: MiriEvalContextExt<'mir, 'tcx> { - - // Temporarily allow data-races to occur, this should only be - // used if either one of the appropiate `validate_atomic` functions - // will be called to treat a memory access as atomic or if the memory - // being accessed should be treated as internal state, that cannot be - // accessed by the interpreted program. - #[inline] - fn allow_data_races_ref(&self, op: impl FnOnce(&MiriEvalContext<'mir, 'tcx>) -> R) -> R { - let this = self.eval_context_ref(); - let data_race = &*this.memory.extra.data_race; - let old = data_race.multi_threaded.replace(false); - let result = op(this); - data_race.multi_threaded.set(old); - result - } - - /// Same as `allow_data_races_ref`, this temporarily disables any data-race detection and - /// so should only be used for atomic operations or internal state that the program cannot - /// access - #[inline] - fn allow_data_races_mut(&mut self, op: impl FnOnce(&mut MiriEvalContext<'mir, 'tcx>) -> R) -> R { - let this = self.eval_context_mut(); - let data_race = &*this.memory.extra.data_race; - let old = data_race.multi_threaded.replace(false); - let result = op(this); - let data_race = &*this.memory.extra.data_race; - data_race.multi_threaded.set(old); - result - } - fn read_scalar_at_offset_atomic( - &self, - op: OpTy<'tcx, Tag>, - offset: u64, - layout: TyAndLayout<'tcx>, - atomic: AtomicReadOp - ) -> InterpResult<'tcx, ScalarMaybeUninit> { - let this = self.eval_context_ref(); - let op_place = this.deref_operand(op)?; - let offset = Size::from_bytes(offset); - // Ensure that the following read at an offset is within bounds - assert!(op_place.layout.size >= offset + layout.size); - let value_place = 
op_place.offset(offset, MemPlaceMeta::None, layout, this)?; - this.read_scalar_atomic(value_place, atomic) - } - fn write_scalar_at_offset_atomic( - &mut self, - op: OpTy<'tcx, Tag>, - offset: u64, - value: impl Into>, - layout: TyAndLayout<'tcx>, - atomic: AtomicWriteOp - ) -> InterpResult<'tcx> { - let this = self.eval_context_mut(); - let op_place = this.deref_operand(op)?; - let offset = Size::from_bytes(offset); - // Ensure that the following read at an offset is within bounds - assert!(op_place.layout.size >= offset + layout.size); - let value_place = op_place.offset(offset, MemPlaceMeta::None, layout, this)?; - this.write_scalar_atomic(value.into(), value_place, atomic) - } - fn read_scalar_atomic( - &self, place: MPlaceTy<'tcx, Tag>, atomic: AtomicReadOp - ) -> InterpResult<'tcx, ScalarMaybeUninit> { - let scalar = self.allow_data_races_ref(move |this| { - this.read_scalar(place.into()) - })?; - self.validate_atomic_load(place, atomic)?; - Ok(scalar) - } - fn write_scalar_atomic( - &mut self, val: ScalarMaybeUninit, dest: MPlaceTy<'tcx, Tag>, - atomic: AtomicWriteOp - ) -> InterpResult<'tcx> { - self.allow_data_races_mut(move |this| { - this.write_scalar(val, dest.into()) - })?; - self.validate_atomic_store(dest, atomic) - } - - /// Update the data-race detector for an atomic read occuring at the - /// associated memory-place and on the current thread - fn validate_atomic_load( - &self, place: MPlaceTy<'tcx, Tag>, atomic: AtomicReadOp - ) -> InterpResult<'tcx> { - let this = self.eval_context_ref(); - this.validate_atomic_op( - place, atomic, "Atomic Load", - move |memory, clocks, index, atomic| { - if atomic == AtomicReadOp::Relaxed { - memory.load_relaxed(&mut *clocks, index) - }else{ - memory.acquire(&mut *clocks, index) - } - } - ) - } +/// The current set of vector clocks describing the state +/// of a thread, contains the happens-before clock and +/// additional metadata to model atomic fence operations. 
+#[derive(Clone, Default, Debug)] +struct ThreadClockSet { - /// Update the data-race detector for an atomic write occuring at the - /// associated memory-place and on the current thread - fn validate_atomic_store( - &mut self, place: MPlaceTy<'tcx, Tag>, atomic: AtomicWriteOp - ) -> InterpResult<'tcx> { - let this = self.eval_context_ref(); - this.validate_atomic_op( - place, atomic, "Atomic Store", - move |memory, clocks, index, atomic| { - if atomic == AtomicWriteOp::Relaxed { - memory.store_relaxed(clocks, index) - }else{ - memory.release(clocks, index) - } - } - ) - } - - /// Update the data-race detector for an atomic read-modify-write occuring - /// at the associated memory place and on the current thread - fn validate_atomic_rmw( - &mut self, place: MPlaceTy<'tcx, Tag>, atomic: AtomicRWOp - ) -> InterpResult<'tcx> { - use AtomicRWOp::*; - let acquire = matches!(atomic, Acquire | AcqRel | SeqCst); - let release = matches!(atomic, Release | AcqRel | SeqCst); - let this = self.eval_context_ref(); - this.validate_atomic_op( - place, atomic, "Atomic RMW", - move |memory, clocks, index, _| { - if acquire { - memory.acquire(clocks, index)?; - }else{ - memory.load_relaxed(clocks, index)?; - } - if release { - memory.rmw_release(clocks, index) - }else{ - memory.rmw_relaxed(clocks, index) - } - } - ) - } - - /// Update the data-race detector for an atomic fence on the current thread - fn validate_atomic_fence(&mut self, atomic: AtomicFenceOp) -> InterpResult<'tcx> { - let this = self.eval_context_mut(); - let data_race = &*this.memory.extra.data_race; - data_race.maybe_perform_sync_operation(move |index, mut clocks| { - log::trace!("Atomic fence on {:?} with ordering {:?}", index, atomic); - // Apply data-race detection for the current fences - // this treats AcqRel and SeqCst as the same as a acquire - // and release fence applied in the same timestamp. 
- if atomic != AtomicFenceOp::Release { - // Either Acquire | AcqRel | SeqCst - clocks.apply_acquire_fence(); - } - if atomic != AtomicFenceOp::Acquire { - // Either Release | AcqRel | SeqCst - clocks.apply_release_fence(); - } - Ok(()) - }) - } -} - -impl<'mir, 'tcx: 'mir> EvalContextPrivExt<'mir, 'tcx> for MiriEvalContext<'mir, 'tcx> {} -trait EvalContextPrivExt<'mir, 'tcx: 'mir>: MiriEvalContextExt<'mir, 'tcx> { - - /// Generic atomic operation implementation, - /// this accesses memory via get_raw instead of - /// get_raw_mut, due to issues calling get_raw_mut - /// for atomic loads from read-only memory - /// FIXME: is this valid, or should get_raw_mut be used for - /// atomic-stores/atomic-rmw? - fn validate_atomic_op( - &self, place: MPlaceTy<'tcx, Tag>, - atomic: A, description: &str, - mut op: impl FnMut( - &mut MemoryCellClocks, &mut ThreadClockSet, VectorIdx, A - ) -> Result<(), DataRace> - ) -> InterpResult<'tcx> { - let this = self.eval_context_ref(); - let data_race = &*this.memory.extra.data_race; - if data_race.multi_threaded.get() { - - // Load an log the atomic operation - let place_ptr = place.ptr.assert_ptr(); - let size = place.layout.size; - let alloc_meta = &this.memory.get_raw(place_ptr.alloc_id)?.extra.data_race; - log::trace!( - "Atomic op({}) with ordering {:?} on memory({:?}, offset={}, size={})", - description, &atomic, place_ptr.alloc_id, place_ptr.offset.bytes(), size.bytes() - ); - - // Perform the atomic operation - let data_race = &alloc_meta.global; - data_race.maybe_perform_sync_operation(|index, mut clocks| { - for (_,range) in alloc_meta.alloc_ranges.borrow_mut().iter_mut(place_ptr.offset, size) { - if let Err(DataRace) = op(range, &mut *clocks, index, atomic) { - mem::drop(clocks); - return VClockAlloc::report_data_race( - &alloc_meta.global, range, description, true, - place_ptr, size - ); - } - } - Ok(()) - })?; - - // Log changes to atomic memory - if log::log_enabled!(log::Level::Trace) { - for (_,range) in 
alloc_meta.alloc_ranges.borrow().iter(place_ptr.offset, size) { - log::trace!( - "Updated atomic memory({:?}, offset={}, size={}) to {:#?}", - place.ptr.assert_ptr().alloc_id, place_ptr.offset.bytes(), size.bytes(), - range.atomic_ops - ); - } - } - } - Ok(()) - } - -} - -/// Handle for locks to express their -/// acquire-release semantics -#[derive(Clone, Debug, Default)] -pub struct DataRaceLockHandle { - - /// Internal acquire-release clock - /// to express the acquire release sync - /// found in concurrency primitives + /// The increasing clock representing timestamps + /// that happen-before this thread. clock: VClock, + + /// The set of timestamps that will happen-before this + /// thread once it performs an acquire fence. + fence_acquire: VClock, + + /// The last timesamp of happens-before relations that + /// have been released by this thread by a fence. + fence_release: VClock, } -impl DataRaceLockHandle { - pub fn set_values(&mut self, other: &Self) { - self.clock.clone_from(&other.clock) + + +impl ThreadClockSet { + + /// Apply the effects of a release fence to this + /// set of thread vector clocks. + #[inline] + fn apply_release_fence(&mut self) { + self.fence_release.clone_from(&self.clock); } - pub fn reset(&mut self) { - self.clock.set_zero_vector(); + + /// Apply the effects of a acquire fence to this + /// set of thread vector clocks. + #[inline] + fn apply_acquire_fence(&mut self) { + self.clock.join(&self.fence_acquire); + } + + /// Increment the happens-before clock at a + /// known index. + #[inline] + fn increment_clock(&mut self, index: VectorIdx) { + self.clock.increment_index(index); + } + + /// Join the happens-before clock with that of + /// another thread, used to model thread join + /// operations. + fn join_with(&mut self, other: &ThreadClockSet) { + self.clock.join(&other.clock); } } /// Error returned by finding a data race -/// should be elaborated upon +/// should be elaborated upon. 
#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Debug)] pub struct DataRace; /// Externally stored memory cell clocks -/// explicitly to reduce memory usage for the -/// common case where no atomic operations -/// exists on the memory cell +/// explicitly to reduce memory usage for the +/// common case where no atomic operations +/// exists on the memory cell. #[derive(Clone, PartialEq, Eq, Default, Debug)] struct AtomicMemoryCellClocks { - /// The clock-vector for the set of atomic read operations - /// used for detecting data-races with non-atomic write - /// operations + /// The clock-vector of the timestamp of the last atomic + /// read operation performed by each thread. + /// This detects potential data-races between atomic read + /// and non-atomic write operations. read_vector: VClock, - /// The clock-vector for the set of atomic write operations - /// used for detecting data-races with non-atomic read or - /// write operations + /// The clock-vector of the timestamp of the last atomic + /// write operation performed by each thread. + /// This detects potential data-races between atomic write + /// and non-atomic read or write operations. write_vector: VClock, /// Synchronization vector for acquire-release semantics - /// contains the vector of timestamps that will - /// happen-before a thread if an acquire-load is - /// performed on the data + /// contains the vector of timestamps that will + /// happen-before a thread if an acquire-load is + /// performed on the data. sync_vector: VClock, /// The Hash-Map of all threads for which a release - /// sequence exists in the memory cell, required - /// since read-modify-write operations do not - /// invalidate existing release sequences - release_sequences: VSmallClockSet, + /// sequence exists in the memory cell, required + /// since read-modify-write operations do not + /// invalidate existing release sequences. + /// See page 6 of linked paper. 
+ release_sequences: VSmallClockMap, } /// Memory Cell vector clock metadata -/// for data-race detection +/// for data-race detection. #[derive(Clone, PartialEq, Eq, Debug)] struct MemoryCellClocks { - /// The vector-clock of the last write, only one value is stored - /// since all previous writes happened-before the current write + /// The vector-clock timestamp of the last write + /// corresponding to the writing threads timestamp. write: VTimestamp, - /// The identifier of the thread that performed the last write - /// operation + /// The identifier of the vector index, corresponding to a thread + /// that performed the last write operation. write_index: VectorIdx, - /// The vector-clock of the set of previous reads - /// each index is set to the timestamp that the associated - /// thread last read this value. + /// The vector-clock of the timestamp of the last read operation + /// performed by a thread since the last write operation occured. read: VClock, - /// Atomic acquire & release sequence tracking clocks - /// for non-atomic memory in the common case this - /// value is set to None + /// Atomic acquire & release sequence tracking clocks. + /// For non-atomic memory in the common case this + /// value is set to None. atomic_ops: Option>, } + /// Create a default memory cell clocks instance -/// for uninitialized memory +/// for uninitialized memory. impl Default for MemoryCellClocks { fn default() -> Self { MemoryCellClocks { @@ -389,9 +218,10 @@ impl Default for MemoryCellClocks { } } + impl MemoryCellClocks { - /// Load the internal atomic memory cells if they exist + /// Load the internal atomic memory cells if they exist. #[inline] fn atomic(&self) -> Option<&AtomicMemoryCellClocks> { match &self.atomic_ops { @@ -401,25 +231,26 @@ impl MemoryCellClocks { } /// Load or create the internal atomic memory metadata - /// if it does not exist + /// if it does not exist. 
#[inline] fn atomic_mut(&mut self) -> &mut AtomicMemoryCellClocks { self.atomic_ops.get_or_insert_with(Default::default) } /// Update memory cell data-race tracking for atomic - /// load acquire semantics, is a no-op if this memory was - /// not used previously as atomic memory - fn acquire(&mut self, clocks: &mut ThreadClockSet, index: VectorIdx) -> Result<(), DataRace> { + /// load acquire semantics, is a no-op if this memory was + /// not used previously as atomic memory. + fn load_acquire(&mut self, clocks: &mut ThreadClockSet, index: VectorIdx) -> Result<(), DataRace> { self.atomic_read_detect(clocks, index)?; if let Some(atomic) = self.atomic() { clocks.clock.join(&atomic.sync_vector); } Ok(()) } + /// Update memory cell data-race tracking for atomic - /// load relaxed semantics, is a no-op if this memory was - /// not used previously as atomic memory + /// load relaxed semantics, is a no-op if this memory was + /// not used previously as atomic memory. fn load_relaxed(&mut self, clocks: &mut ThreadClockSet, index: VectorIdx) -> Result<(), DataRace> { self.atomic_read_detect(clocks, index)?; if let Some(atomic) = self.atomic() { @@ -430,8 +261,8 @@ impl MemoryCellClocks { /// Update the memory cell data-race tracking for atomic - /// store release semantics - fn release(&mut self, clocks: &ThreadClockSet, index: VectorIdx) -> Result<(), DataRace> { + /// store release semantics. + fn store_release(&mut self, clocks: &ThreadClockSet, index: VectorIdx) -> Result<(), DataRace> { self.atomic_write_detect(clocks, index)?; let atomic = self.atomic_mut(); atomic.sync_vector.clone_from(&clocks.clock); @@ -439,8 +270,9 @@ impl MemoryCellClocks { atomic.release_sequences.insert(index, &clocks.clock); Ok(()) } + /// Update the memory cell data-race tracking for atomic - /// store relaxed semantics + /// store relaxed semantics. 
fn store_relaxed(&mut self, clocks: &ThreadClockSet, index: VectorIdx) -> Result<(), DataRace> { self.atomic_write_detect(clocks, index)?; let atomic = self.atomic_mut(); @@ -451,8 +283,9 @@ impl MemoryCellClocks { atomic.release_sequences.retain_index(index); Ok(()) } + /// Update the memory cell data-race tracking for atomic - /// store release semantics for RMW operations + /// store release semantics for RMW operations. fn rmw_release(&mut self, clocks: &ThreadClockSet, index: VectorIdx) -> Result<(), DataRace> { self.atomic_write_detect(clocks, index)?; let atomic = self.atomic_mut(); @@ -460,8 +293,9 @@ impl MemoryCellClocks { atomic.release_sequences.insert(index, &clocks.clock); Ok(()) } + /// Update the memory cell data-race tracking for atomic - /// store relaxed semantics for RMW operations + /// store relaxed semantics for RMW operations. fn rmw_relaxed(&mut self, clocks: &ThreadClockSet, index: VectorIdx) -> Result<(), DataRace> { self.atomic_write_detect(clocks, index)?; let atomic = self.atomic_mut(); @@ -470,60 +304,60 @@ impl MemoryCellClocks { } /// Detect data-races with an atomic read, caused by a non-atomic write that does - /// not happen-before the atomic-read + /// not happen-before the atomic-read. fn atomic_read_detect(&mut self, clocks: &ThreadClockSet, index: VectorIdx) -> Result<(), DataRace> { log::trace!("Atomic read with vectors: {:#?} :: {:#?}", self, clocks); if self.write <= clocks.clock[self.write_index] { let atomic = self.atomic_mut(); atomic.read_vector.set_at_index(&clocks.clock, index); Ok(()) - }else{ + } else { Err(DataRace) } } /// Detect data-races with an atomic write, either with a non-atomic read or with - /// a non-atomic write: + /// a non-atomic write. 
fn atomic_write_detect(&mut self, clocks: &ThreadClockSet, index: VectorIdx) -> Result<(), DataRace> { log::trace!("Atomic write with vectors: {:#?} :: {:#?}", self, clocks); if self.write <= clocks.clock[self.write_index] && self.read <= clocks.clock { let atomic = self.atomic_mut(); atomic.write_vector.set_at_index(&clocks.clock, index); Ok(()) - }else{ + } else { Err(DataRace) } } /// Detect races for non-atomic read operations at the current memory cell - /// returns true if a data-race is detected + /// returns true if a data-race is detected. fn read_race_detect(&mut self, clocks: &ThreadClockSet, index: VectorIdx) -> Result<(), DataRace> { log::trace!("Unsynchronized read with vectors: {:#?} :: {:#?}", self, clocks); if self.write <= clocks.clock[self.write_index] { let race_free = if let Some(atomic) = self.atomic() { atomic.write_vector <= clocks.clock - }else{ + } else { true }; if race_free { self.read.set_at_index(&clocks.clock, index); Ok(()) - }else{ + } else { Err(DataRace) } - }else{ + } else { Err(DataRace) } } /// Detect races for non-atomic write operations at the current memory cell - /// returns true if a data-race is detected + /// returns true if a data-race is detected. fn write_race_detect(&mut self, clocks: &ThreadClockSet, index: VectorIdx) -> Result<(), DataRace> { log::trace!("Unsynchronized write with vectors: {:#?} :: {:#?}", self, clocks); if self.write <= clocks.clock[self.write_index] && self.read <= clocks.clock { let race_free = if let Some(atomic) = self.atomic() { atomic.write_vector <= clocks.clock && atomic.read_vector <= clocks.clock - }else{ + } else { true }; if race_free { @@ -531,30 +365,269 @@ impl MemoryCellClocks { self.write_index = index; self.read.set_zero_vector(); Ok(()) - }else{ + } else { Err(DataRace) } - }else{ + } else { Err(DataRace) } } } -/// Vector clock metadata for a logical memory allocation + +/// Evaluation context extensions. 
+impl<'mir, 'tcx: 'mir> EvalContextExt<'mir, 'tcx> for MiriEvalContext<'mir, 'tcx> {} +pub trait EvalContextExt<'mir, 'tcx: 'mir>: MiriEvalContextExt<'mir, 'tcx> { + + /// Atomic variant of read_scalar_at_offset. + fn read_scalar_at_offset_atomic( + &self, + op: OpTy<'tcx, Tag>, + offset: u64, + layout: TyAndLayout<'tcx>, + atomic: AtomicReadOp + ) -> InterpResult<'tcx, ScalarMaybeUninit> { + let this = self.eval_context_ref(); + let op_place = this.deref_operand(op)?; + let offset = Size::from_bytes(offset); + + // Ensure that the following read at an offset is within bounds. + assert!(op_place.layout.size >= offset + layout.size); + let value_place = op_place.offset(offset, MemPlaceMeta::None, layout, this)?; + this.read_scalar_atomic(value_place, atomic) + } + + /// Atomic variant of write_scalar_at_offset. + fn write_scalar_at_offset_atomic( + &mut self, + op: OpTy<'tcx, Tag>, + offset: u64, + value: impl Into>, + layout: TyAndLayout<'tcx>, + atomic: AtomicWriteOp + ) -> InterpResult<'tcx> { + let this = self.eval_context_mut(); + let op_place = this.deref_operand(op)?; + let offset = Size::from_bytes(offset); + + // Ensure that the following read at an offset is within bounds. + assert!(op_place.layout.size >= offset + layout.size); + let value_place = op_place.offset(offset, MemPlaceMeta::None, layout, this)?; + this.write_scalar_atomic(value.into(), value_place, atomic) + } + + /// Perform an atomic read operation at the memory location. + fn read_scalar_atomic( + &self, place: MPlaceTy<'tcx, Tag>, atomic: AtomicReadOp + ) -> InterpResult<'tcx, ScalarMaybeUninit> { + let this = self.eval_context_ref(); + let scalar = this.allow_data_races_ref(move |this| { + this.read_scalar(place.into()) + })?; + self.validate_atomic_load(place, atomic)?; + Ok(scalar) + } + + /// Perform an atomic write operation at the memory location. 
+ fn write_scalar_atomic( + &mut self, val: ScalarMaybeUninit, dest: MPlaceTy<'tcx, Tag>, + atomic: AtomicWriteOp + ) -> InterpResult<'tcx> { + let this = self.eval_context_mut(); + this.allow_data_races_mut(move |this| { + this.write_scalar(val, dest.into()) + })?; + self.validate_atomic_store(dest, atomic) + } + + /// Perform a atomic operation on a memory location. + fn atomic_op_immediate( + &mut self, + place: MPlaceTy<'tcx, Tag>, rhs: ImmTy<'tcx, Tag>, + op: mir::BinOp, neg: bool, atomic: AtomicRwOp + ) -> InterpResult<'tcx, ImmTy<'tcx, Tag>> { + let this = self.eval_context_mut(); + + let old = this.allow_data_races_mut(|this| { + this.read_immediate(place. into()) + })?; + + // Atomics wrap around on overflow. + let val = this.binary_op(op, old, rhs)?; + let val = if neg { this.unary_op(mir::UnOp::Not, val)? } else { val }; + this.allow_data_races_mut(|this| { + this.write_immediate(*val, place.into()) + })?; + + this.validate_atomic_rmw(place, atomic)?; + Ok(old) + } + + /// Perform an atomic exchange with a memory place and a new + /// scalar value, the old value is returned. + fn atomic_exchange_scalar( + &mut self, + place: MPlaceTy<'tcx, Tag>, new: ScalarMaybeUninit, + atomic: AtomicRwOp + ) -> InterpResult<'tcx, ScalarMaybeUninit> { + let this = self.eval_context_mut(); + + let old = this.allow_data_races_mut(|this| { + this.read_scalar(place.into()) + })?; + this.allow_data_races_mut(|this| { + this.write_scalar(new, place.into()) + })?; + this.validate_atomic_rmw(place, atomic)?; + Ok(old) + } + + /// Perform an atomic compare and exchange at a given memory location + /// on success an atomic RMW operation is performed and on failure + /// only an atomic read occurs. 
+ fn atomic_compare_exchange_scalar( + &mut self, place: MPlaceTy<'tcx, Tag>, + expect_old: ImmTy<'tcx, Tag>, new: ScalarMaybeUninit, + success: AtomicRwOp, fail: AtomicReadOp + ) -> InterpResult<'tcx, Immediate> { + let this = self.eval_context_mut(); + + // Failure ordering cannot be stronger than success ordering, therefore first attempt + // to read with the failure ordering and if successfull then try again with the success + // read ordering and write in the success case. + // Read as immediate for the sake of `binary_op()` + let old = this.allow_data_races_mut(|this| { + this.read_immediate(place.into()) + })?; + + // `binary_op` will bail if either of them is not a scalar. + let eq = this.overflowing_binary_op(mir::BinOp::Eq, old, expect_old)?.0; + let res = Immediate::ScalarPair(old.to_scalar_or_uninit(), eq.into()); + + // Update ptr depending on comparison. + // if successful, perform a full rw-atomic validation + // otherwise treat this as an atomic load with the fail ordering. + if eq.to_bool()? { + this.allow_data_races_mut(|this| { + this.write_scalar(new, place.into()) + })?; + this.validate_atomic_rmw(place, success)?; + } else { + this.validate_atomic_load(place, fail)?; + } + + // Return the old value. + Ok(res) + } + + + /// Update the data-race detector for an atomic read occuring at the + /// associated memory-place and on the current thread. + fn validate_atomic_load( + &self, place: MPlaceTy<'tcx, Tag>, atomic: AtomicReadOp + ) -> InterpResult<'tcx> { + let this = self.eval_context_ref(); + this.validate_atomic_op( + place, atomic, "Atomic Load", + move |memory, clocks, index, atomic| { + if atomic == AtomicReadOp::Relaxed { + memory.load_relaxed(&mut *clocks, index) + } else { + memory.load_acquire(&mut *clocks, index) + } + } + ) + } + + /// Update the data-race detector for an atomic write occuring at the + /// associated memory-place and on the current thread. 
+ fn validate_atomic_store( + &mut self, place: MPlaceTy<'tcx, Tag>, atomic: AtomicWriteOp + ) -> InterpResult<'tcx> { + let this = self.eval_context_ref(); + this.validate_atomic_op( + place, atomic, "Atomic Store", + move |memory, clocks, index, atomic| { + if atomic == AtomicWriteOp::Relaxed { + memory.store_relaxed(clocks, index) + } else { + memory.store_release(clocks, index) + } + } + ) + } + + /// Update the data-race detector for an atomic read-modify-write occuring + /// at the associated memory place and on the current thread. + fn validate_atomic_rmw( + &mut self, place: MPlaceTy<'tcx, Tag>, atomic: AtomicRwOp + ) -> InterpResult<'tcx> { + use AtomicRwOp::*; + let acquire = matches!(atomic, Acquire | AcqRel | SeqCst); + let release = matches!(atomic, Release | AcqRel | SeqCst); + let this = self.eval_context_ref(); + this.validate_atomic_op( + place, atomic, "Atomic RMW", + move |memory, clocks, index, _| { + if acquire { + memory.load_acquire(clocks, index)?; + } else { + memory.load_relaxed(clocks, index)?; + } + if release { + memory.rmw_release(clocks, index) + } else { + memory.rmw_relaxed(clocks, index) + } + } + ) + } + + /// Update the data-race detector for an atomic fence on the current thread. + fn validate_atomic_fence(&mut self, atomic: AtomicFenceOp) -> InterpResult<'tcx> { + let this = self.eval_context_mut(); + if let Some(data_race) = &this.memory.extra.data_race { + data_race.maybe_perform_sync_operation(move |index, mut clocks| { + log::trace!("Atomic fence on {:?} with ordering {:?}", index, atomic); + + // Apply data-race detection for the current fences + // this treats AcqRel and SeqCst as the same as a acquire + // and release fence applied in the same timestamp. 
+ if atomic != AtomicFenceOp::Release { + // Either Acquire | AcqRel | SeqCst + clocks.apply_acquire_fence(); + } + if atomic != AtomicFenceOp::Acquire { + // Either Release | AcqRel | SeqCst + clocks.apply_release_fence(); + } + Ok(()) + }) + } else { + Ok(()) + } + } +} + + + +/// Vector clock metadata for a logical memory allocation. #[derive(Debug, Clone)] pub struct VClockAlloc { - /// Range of Vector clocks, mapping to the vector-clock - /// index of the last write to the bytes in this allocation + /// Range of Vector clocks, this gives each byte a potentially + /// unqiue set of vector clocks, but merges identical information + /// together for improved efficiency. alloc_ranges: RefCell>, - // Pointer to global state + // Pointer to global state. global: MemoryExtra, } + impl VClockAlloc { - /// Create a new data-race allocation detector + /// Create a new data-race allocation detector. pub fn new_allocation(global: &MemoryExtra, len: Size) -> VClockAlloc { VClockAlloc { global: Rc::clone(global), @@ -565,7 +638,7 @@ impl VClockAlloc { } // Find an index, if one exists where the value - // in `l` is greater than the value in `r` + // in `l` is greater than the value in `r`. fn find_gt_index(l: &VClock, r: &VClock) -> Option { let l_slice = l.as_slice(); let r_slice = r.as_slice(); @@ -575,27 +648,28 @@ impl VClockAlloc { if l > r { Some(idx) } else { None } }).or_else(|| { if l_slice.len() > r_slice.len() { + // By invariant, if l_slice is longer - // then one element must be larger + // then one element must be larger. // This just validates that this is true - // and reports earlier elements first + // and reports earlier elements first. 
let l_remainder_slice = &l_slice[r_slice.len()..]; let idx = l_remainder_slice.iter().enumerate() .find_map(|(idx, &r)| { if r == 0 { None } else { Some(idx) } }).expect("Invalid VClock Invariant"); Some(idx) - }else{ + } else { None } }).map(|idx| VectorIdx::new(idx)) } - /// Report a data-race found in the program - /// this finds the two racing threads and the type - /// of data-race that occured, this will also - /// return info about the memory location the data-race - /// occured in + /// Report a data-race found in the program. + /// This finds the two racing threads and the type + /// of data-race that occurred. This will also + /// return info about the memory location the data-race + /// occurred in. #[cold] #[inline(never)] fn report_data_race<'tcx>( @@ -608,39 +682,40 @@ impl VClockAlloc { let ( other_action, other_thread, other_clock ) = if range.write > current_clocks.clock[range.write_index] { + // Convert the write action into the vector clock it - // represents for diagnostic purposes + // represents for diagnostic purposes. 
write_clock = VClock::new_with_index(range.write_index, range.write); ("WRITE", range.write_index, &write_clock) - }else if let Some(idx) = Self::find_gt_index( + } else if let Some(idx) = Self::find_gt_index( &range.read, &current_clocks.clock ){ ("READ", idx, &range.read) - }else if !is_atomic { + } else if !is_atomic { if let Some(atomic) = range.atomic() { if let Some(idx) = Self::find_gt_index( &atomic.write_vector, &current_clocks.clock ) { ("ATOMIC_STORE", idx, &atomic.write_vector) - }else if let Some(idx) = Self::find_gt_index( + } else if let Some(idx) = Self::find_gt_index( &atomic.read_vector, &current_clocks.clock ) { ("ATOMIC_LOAD", idx, &atomic.read_vector) - }else{ - unreachable!("Failed to find report data-race for non-atomic operation: no race found") + } else { + unreachable!("Failed to report data-race for non-atomic operation: no race found") } - }else{ + } else { unreachable!("Failed to report data-race for non-atomic operation: no atomic component") } - }else{ + } else { unreachable!("Failed to report data-race for atomic operation") }; - // Load elaborated thread information about the racing thread actions + // Load elaborated thread information about the racing thread actions. let current_thread_info = global.print_thread_metadata(current_index); let other_thread_info = global.print_thread_metadata(other_thread); - // Throw the data-race detection + // Throw the data-race detection. 
throw_ub_format!( "Data race detected between {} on {} and {} on {}, memory({:?},offset={},size={})\ \n\t\t -current vector clock = {:?}\ @@ -654,23 +729,25 @@ impl VClockAlloc { } /// Detect data-races for an unsychronized read operation, will not perform - /// data-race threads if `multi-threaded` is false, either due to no threads - /// being created or if it is temporarily disabled during a racy read or write - /// operation + /// data-race detection if `multi-threaded` is false, either due to no threads + /// being created or if it is temporarily disabled during a racy read or write + /// operation for which data-race detection is handled separately, for example + /// atomic read operations. pub fn read<'tcx>(&self, pointer: Pointer, len: Size) -> InterpResult<'tcx> { if self.global.multi_threaded.get() { let (index, clocks) = self.global.current_thread_state(); let mut alloc_ranges = self.alloc_ranges.borrow_mut(); for (_,range) in alloc_ranges.iter_mut(pointer.offset, len) { if let Err(DataRace) = range.read_race_detect(&*clocks, index) { - // Report data-race + + // Report data-race. 
return Self::report_data_race( &self.global,range, "READ", false, pointer, len ); } } Ok(()) - }else{ + } else { Ok(()) } } @@ -682,6 +759,7 @@ impl VClockAlloc { let (index, clocks) = self.global.current_thread_state(); for (_,range) in self.alloc_ranges.get_mut().iter_mut(pointer.offset, len) { if let Err(DataRace) = range.write_race_detect(&*clocks, index) { + // Report data-race return Self::report_data_race( &self.global, range, action, false, pointer, len @@ -689,156 +767,208 @@ impl VClockAlloc { } } Ok(()) - }else{ + } else { Ok(()) } } /// Detect data-races for an unsychronized write operation, will not perform - /// data-race threads if `multi-threaded` is false, either due to no threads - /// being created or if it is temporarily disabled during a racy read or write - /// operation + /// data-race threads if `multi-threaded` is false, either due to no threads + /// being created or if it is temporarily disabled during a racy read or write + /// operation pub fn write<'tcx>(&mut self, pointer: Pointer, len: Size) -> InterpResult<'tcx> { self.unique_access(pointer, len, "Write") } + /// Detect data-races for an unsychronized deallocate operation, will not perform - /// data-race threads if `multi-threaded` is false, either due to no threads - /// being created or if it is temporarily disabled during a racy read or write - /// operation + /// data-race threads if `multi-threaded` is false, either due to no threads + /// being created or if it is temporarily disabled during a racy read or write + /// operation pub fn deallocate<'tcx>(&mut self, pointer: Pointer, len: Size) -> InterpResult<'tcx> { self.unique_access(pointer, len, "Deallocate") } } -/// The current set of vector clocks describing the state -/// of a thread, contains the happens-before clock and -/// additional metadata to model atomic fence operations -#[derive(Clone, Default, Debug)] -struct ThreadClockSet { +impl<'mir, 'tcx: 'mir> EvalContextPrivExt<'mir, 'tcx> for MiriEvalContext<'mir, 
'tcx> {} +trait EvalContextPrivExt<'mir, 'tcx: 'mir>: MiriEvalContextExt<'mir, 'tcx> { - /// The increasing clock representing timestamps - /// that happen-before this thread. - clock: VClock, + // Temporarily allow data-races to occur, this should only be + // used if either one of the appropiate `validate_atomic` functions + // will be called to treat a memory access as atomic or if the memory + // being accessed should be treated as internal state, that cannot be + // accessed by the interpreted program. + #[inline] + fn allow_data_races_ref(&self, op: impl FnOnce(&MiriEvalContext<'mir, 'tcx>) -> R) -> R { + let this = self.eval_context_ref(); + let old = if let Some(data_race) = &this.memory.extra.data_race { + data_race.multi_threaded.replace(false) + } else { + false + }; + let result = op(this); + if let Some(data_race) = &this.memory.extra.data_race { + data_race.multi_threaded.set(old); + } + result + } - /// The set of timestamps that will happen-before this - /// thread once it performs an acquire fence - fence_acquire: VClock, + /// Same as `allow_data_races_ref`, this temporarily disables any data-race detection and + /// so should only be used for atomic operations or internal state that the program cannot + /// access. + #[inline] + fn allow_data_races_mut(&mut self, op: impl FnOnce(&mut MiriEvalContext<'mir, 'tcx>) -> R) -> R { + let this = self.eval_context_mut(); + let old = if let Some(data_race) = &this.memory.extra.data_race { + data_race.multi_threaded.replace(false) + } else { + false + }; + let result = op(this); + if let Some(data_race) = &this.memory.extra.data_race { + data_race.multi_threaded.set(old); + } + result + } + + /// Generic atomic operation implementation, + /// this accesses memory via get_raw instead of + /// get_raw_mut, due to issues calling get_raw_mut + /// for atomic loads from read-only memory. + /// FIXME: is this valid, or should get_raw_mut be used for + /// atomic-stores/atomic-rmw? 
+ fn validate_atomic_op( + &self, place: MPlaceTy<'tcx, Tag>, + atomic: A, description: &str, + mut op: impl FnMut( + &mut MemoryCellClocks, &mut ThreadClockSet, VectorIdx, A + ) -> Result<(), DataRace> + ) -> InterpResult<'tcx> { + let this = self.eval_context_ref(); + if let Some(data_race) = &this.memory.extra.data_race { + if data_race.multi_threaded.get() { + + // Load and log the atomic operation. + let place_ptr = place.ptr.assert_ptr(); + let size = place.layout.size; + let alloc_meta = &this.memory.get_raw(place_ptr.alloc_id)?.extra.data_race.as_ref().unwrap(); + log::trace!( + "Atomic op({}) with ordering {:?} on memory({:?}, offset={}, size={})", + description, &atomic, place_ptr.alloc_id, place_ptr.offset.bytes(), size.bytes() + ); + + // Perform the atomic operation. + let data_race = &alloc_meta.global; + data_race.maybe_perform_sync_operation(|index, mut clocks| { + for (_,range) in alloc_meta.alloc_ranges.borrow_mut().iter_mut(place_ptr.offset, size) { + if let Err(DataRace) = op(range, &mut *clocks, index, atomic) { + mem::drop(clocks); + return VClockAlloc::report_data_race( + &alloc_meta.global, range, description, true, + place_ptr, size + ); + } + } + Ok(()) + })?; + + // Log changes to atomic memory. 
+ if log::log_enabled!(log::Level::Trace) { + for (_,range) in alloc_meta.alloc_ranges.borrow().iter(place_ptr.offset, size) { + log::trace!( + "Updated atomic memory({:?}, offset={}, size={}) to {:#?}", + place.ptr.assert_ptr().alloc_id, place_ptr.offset.bytes(), size.bytes(), + range.atomic_ops + ); + } + } + } + } + Ok(()) + } - /// The last timesamp of happens-before relations that - /// have been released by this thread by a fence - fence_release: VClock, } -impl ThreadClockSet { - /// Apply the effects of a release fence to this - /// set of thread vector clocks - #[inline] - fn apply_release_fence(&mut self) { - self.fence_release.clone_from(&self.clock); - } - - /// Apply the effects of a acquire fence to this - /// set of thread vector clocks - #[inline] - fn apply_acquire_fence(&mut self) { - self.clock.join(&self.fence_acquire); - } - - /// Increment the happens-before clock at a - /// known index - #[inline] - fn increment_clock(&mut self, index: VectorIdx) { - self.clock.increment_index(index); - } - - /// Join the happens-before clock with that of - /// another thread, used to model thread join - /// operations - fn join_with(&mut self, other: &ThreadClockSet) { - self.clock.join(&other.clock); - } -} - -/// Extra metadata associated with a thread +/// Extra metadata associated with a thread. #[derive(Debug, Clone, Default)] struct ThreadExtraState { /// The current vector index in use by the - /// thread currently, this is set to None - /// after the vector index has been re-used - /// and hence the value will never need to be - /// read during data-race reporting + /// thread currently, this is set to None + /// after the vector index has been re-used + /// and hence the value will never need to be + /// read during data-race reporting. vector_index: Option, /// The name of the thread, updated for better - /// diagnostics when reporting detected data - /// races + /// diagnostics when reporting detected data + /// races. 
thread_name: Option>, /// Thread termination vector clock, this - /// is set on thread termination and is used - /// for joining on threads since the vector_index - /// may be re-used when the join operation occurs + /// is set on thread termination and is used + /// for joining on threads since the vector_index + /// may be re-used when the join operation occurs. termination_vector_clock: Option, } /// Global data-race detection state, contains the currently -/// executing thread as well as the vector-clocks associated -/// with each of the threads. +/// executing thread as well as the vector-clocks associated +/// with each of the threads. #[derive(Debug, Clone)] pub struct GlobalState { /// Set to true once the first additional - /// thread has launched, due to the dependency - /// between before and after a thread launch + /// thread has launched, due to the dependency + /// between before and after a thread launch. /// Any data-races must be recorded after this - /// so concurrent execution can ignore recording - /// any data-races + /// so concurrent execution can ignore recording + /// any data-races. multi_threaded: Cell, /// Mapping of a vector index to a known set of thread - /// clocks, this is not directly mapping from a thread id - /// since it may refer to multiple threads + /// clocks, this is not directly mapping from a thread id + /// since it may refer to multiple threads. vector_clocks: RefCell>, /// Mapping of a given vector index to the current thread - /// that the execution is representing, this may change - /// if a vector index is re-assigned to a new thread + /// that the execution is representing, this may change + /// if a vector index is re-assigned to a new thread. vector_info: RefCell>, - /// The mapping of a given thread to assocaited thread metadata + /// The mapping of a given thread to associated thread metadata. thread_info: RefCell>, - /// The current vector index being executed + /// The current vector index being executed. 
current_index: Cell, /// Potential vector indices that could be re-used on thread creation - /// values are inserted here on after the thread has terminated and - /// been joined with, and hence may potentially become free - /// for use as the index for a new thread. + /// values are inserted here on after the thread has terminated and + /// been joined with, and hence may potentially become free + /// for use as the index for a new thread. /// Elements in this set may still require the vector index to - /// report data-races, and can only be re-used after all - /// active vector-clocks catch up with the threads timestamp. + /// report data-races, and can only be re-used after all + /// active vector-clocks catch up with the threads timestamp. reuse_candidates: RefCell>, /// Counts the number of threads that are currently active - /// if the number of active threads reduces to 1 and then - /// a join operation occures with the remaining main thread - /// then multi-threaded execution may be disabled + /// if the number of active threads reduces to 1 and then + /// a join operation occurs with the remaining main thread + /// then multi-threaded execution may be disabled. active_thread_count: Cell, /// This contains threads that have terminated, but not yet joined - /// and so cannot become re-use candidates until a join operation - /// occurs. + /// and so cannot become re-use candidates until a join operation + /// occurs. /// The associated vector index will be moved into re-use candidates - /// after the join operation occurs + /// after the join operation occurs. terminated_threads: RefCell>, } + impl GlobalState { /// Create a new global state, setup with just thread-id=0 - /// advanced to timestamp = 1 + /// advanced to timestamp = 1. 
pub fn new() -> Self { let global_state = GlobalState { multi_threaded: Cell::new(false), @@ -852,8 +982,8 @@ impl GlobalState { }; // Setup the main-thread since it is not explicitly created: - // uses vector index and thread-id 0, also the rust runtime gives - // the main-thread a name of "main". + // uses vector index and thread-id 0, also the rust runtime gives + // the main-thread a name of "main". let index = global_state.vector_clocks.borrow_mut().push(ThreadClockSet::default()); global_state.vector_info.borrow_mut().push(ThreadId::new(0)); global_state.thread_info.borrow_mut().push( @@ -868,7 +998,7 @@ impl GlobalState { } // Try to find vector index values that can potentially be re-used - // by a new thread instead of a new vector index being created + // by a new thread instead of a new vector index being created. fn find_vector_index_reuse_candidate(&self) -> Option { let mut reuse = self.reuse_candidates.borrow_mut(); let vector_clocks = self.vector_clocks.borrow(); @@ -877,24 +1007,26 @@ impl GlobalState { for &candidate in reuse.iter() { let target_timestamp = vector_clocks[candidate].clock[candidate]; if vector_clocks.iter_enumerated().all(|(clock_idx, clock)| { + // The thread happens before the clock, and hence cannot report - // a data-race with this the candidate index + // a data-race with this the candidate index. let no_data_race = clock.clock[candidate] >= target_timestamp; // The vector represents a thread that has terminated and hence cannot - // report a data-race with the candidate index + // report a data-race with the candidate index. let thread_id = vector_info[clock_idx]; let vector_terminated = reuse.contains(&clock_idx) || terminated_threads.contains_key(&thread_id); // The vector index cannot report a race with the candidate index - // and hence allows the candidate index to be re-used + // and hence allows the candidate index to be re-used. 
no_data_race || vector_terminated }) { + // All vector clocks for each vector index are equal to - // the target timestamp, and the thread is known to have - // terminated, therefore this vector clock index cannot - // report any more data-races + // the target timestamp, and the thread is known to have + // terminated, therefore this vector clock index cannot + // report any more data-races. assert!(reuse.remove(&candidate)); return Some(candidate) } @@ -903,17 +1035,17 @@ impl GlobalState { } // Hook for thread creation, enabled multi-threaded execution and marks - // the current thread timestamp as happening-before the current thread + // the current thread timestamp as happening-before the current thread. #[inline] pub fn thread_created(&self, thread: ThreadId) { let current_index = self.current_index(); - // Increment the number of active threads + // Increment the number of active threads. let active_threads = self.active_thread_count.get(); self.active_thread_count.set(active_threads + 1); // Enable multi-threaded execution, there are now two threads - // so data-races are now possible. + // so data-races are now possible. self.multi_threaded.set(true); // Load and setup the associated thread metadata @@ -921,101 +1053,105 @@ impl GlobalState { thread_info.ensure_contains_elem(thread, Default::default); // Assign a vector index for the thread, attempting to re-use an old - // vector index that can no longer report any data-races if possible + // vector index that can no longer report any data-races if possible. let created_index = if let Some( reuse_index ) = self.find_vector_index_reuse_candidate() { + // Now re-configure the re-use candidate, increment the clock - // for the new sync use of the vector + // for the new sync use of the vector. 
let mut vector_clocks = self.vector_clocks.borrow_mut(); vector_clocks[reuse_index].increment_clock(reuse_index); // Locate the old thread the vector was associated with and update - // it to represent the new thread instead + // it to represent the new thread instead. let mut vector_info = self.vector_info.borrow_mut(); let old_thread = vector_info[reuse_index]; vector_info[reuse_index] = thread; // Mark the thread the vector index was associated with as no longer - // representing a thread index + // representing a thread index. thread_info[old_thread].vector_index = None; reuse_index - }else{ + } else { + // No vector re-use candidates available, instead create - // a new vector index + // a new vector index. let mut vector_info = self.vector_info.borrow_mut(); vector_info.push(thread) }; - // Mark the chosen vector index as in use by the thread + // Mark the chosen vector index as in use by the thread. thread_info[thread].vector_index = Some(created_index); - // Create a thread clock set if applicable + // Create a thread clock set if applicable. let mut vector_clocks = self.vector_clocks.borrow_mut(); if created_index == vector_clocks.next_index() { vector_clocks.push(ThreadClockSet::default()); } - // Now load the two clocks and configure the initial state + // Now load the two clocks and configure the initial state. let (current, created) = vector_clocks.pick2_mut(current_index, created_index); - // Advance the current thread before the synchronized operation + // Advance the current thread before the synchronized operation. current.increment_clock(current_index); // Join the created with current, since the current threads - // previous actions happen-before the created thread + // previous actions happen-before the created thread. created.join_with(current); - // Advance both threads after the synchronized operation + // Advance both threads after the synchronized operation. 
current.increment_clock(current_index); created.increment_clock(created_index); } /// Hook on a thread join to update the implicit happens-before relation - /// between the joined thead and the current thread. + /// between the joined thread and the current thread. #[inline] pub fn thread_joined(&self, current_thread: ThreadId, join_thread: ThreadId) { let mut clocks_vec = self.vector_clocks.borrow_mut(); let thread_info = self.thread_info.borrow(); - // Load the vector clock of the current thread + // Load the vector clock of the current thread. let current_index = thread_info[current_thread].vector_index .expect("Performed thread join on thread with no assigned vector"); let current = &mut clocks_vec[current_index]; - // Load the associated vector clock for the terminated thread + // Load the associated vector clock for the terminated thread. let join_clock = thread_info[join_thread].termination_vector_clock .as_ref().expect("Joined with thread but thread has not terminated"); - // Pre increment clocks before atomic operation + // Pre increment clocks before atomic operation. current.increment_clock(current_index); // The join thread happens-before the current thread - // so update the current vector clock + // so update the current vector clock. current.clock.join(join_clock); - // Post increment clocks after atomic operation + // Post increment clocks after atomic operation. current.increment_clock(current_index); // Check the number of active threads, if the value is 1 - // then test for potentially disabling multi-threaded execution + // then test for potentially disabling multi-threaded execution. let active_threads = self.active_thread_count.get(); if active_threads == 1 { - // May potentially be able to disable multi-threaded execution + + // May potentially be able to disable multi-threaded execution. 
let current_clock = &clocks_vec[current_index]; if clocks_vec.iter_enumerated().all(|(idx, clocks)| { clocks.clock[idx] <= current_clock.clock[idx] }) { + // The all thread termations happen-before the current clock - // therefore no data-races can be reported until a new thread - // is created, so disable multi-threaded execution + // therefore no data-races can be reported until a new thread + // is created, so disable multi-threaded execution. self.multi_threaded.set(false); } } // If the thread is marked as terminated but not joined - // then move the thread to the re-use set + // then move the thread to the re-use set. let mut termination = self.terminated_threads.borrow_mut(); if let Some(index) = termination.remove(&join_thread) { let mut reuse = self.reuse_candidates.borrow_mut(); @@ -1024,47 +1160,47 @@ impl GlobalState { } /// On thread termination, the vector-clock may re-used - /// in the future once all remaining thread-clocks catch - /// up with the time index of the terminated thread. + /// in the future once all remaining thread-clocks catch + /// up with the time index of the terminated thread. /// This assiges thread termination with a unique index - /// which will be used to join the thread + /// which will be used to join the thread /// This should be called strictly before any calls to - /// `thread_joined` + /// `thread_joined`. #[inline] pub fn thread_terminated(&self) { let current_index = self.current_index(); - // Increment the clock to a unique termination timestamp + // Increment the clock to a unique termination timestamp. let mut vector_clocks = self.vector_clocks.borrow_mut(); let current_clocks = &mut vector_clocks[current_index]; current_clocks.increment_clock(current_index); - // Load the current thread id for the executing vector + // Load the current thread id for the executing vector. 
let vector_info = self.vector_info.borrow(); let current_thread = vector_info[current_index]; // Load the current thread metadata, and move to a terminated - // vector state. Setting up the vector clock all join operations - // will use. + // vector state. Setting up the vector clock all join operations + // will use. let mut thread_info = self.thread_info.borrow_mut(); let current = &mut thread_info[current_thread]; current.termination_vector_clock = Some(current_clocks.clock.clone()); // Add this thread as a candidate for re-use after a thread join - // occurs + // occurs. let mut termination = self.terminated_threads.borrow_mut(); termination.insert(current_thread, current_index); // Reduce the number of active threads, now that a thread has - // terminated + // terminated. let mut active_threads = self.active_thread_count.get(); active_threads -= 1; self.active_thread_count.set(active_threads); } /// Hook for updating the local tracker of the currently - /// enabled thread, should always be updated whenever - /// `active_thread` in thread.rs is updated + /// enabled thread, should always be updated whenever + /// `active_thread` in thread.rs is updated. #[inline] pub fn thread_set_active(&self, thread: ThreadId) { let thread_info = self.thread_info.borrow(); @@ -1074,9 +1210,9 @@ impl GlobalState { } /// Hook for updating the local tracker of the threads name - /// this should always mirror the local value in thread.rs - /// the thread name is used for improved diagnostics - /// during a data-race + /// this should always mirror the local value in thread.rs + /// the thread name is used for improved diagnostics + /// during a data-race. #[inline] pub fn thread_set_name(&self, thread: ThreadId, name: String) { let name = name.into_boxed_str(); @@ -1086,12 +1222,12 @@ impl GlobalState { /// Attempt to perform a synchronized operation, this - /// will perform no operation if multi-threading is - /// not currently enabled. 
+ /// will perform no operation if multi-threading is + /// not currently enabled. /// Otherwise it will increment the clock for the current - /// vector before and after the operation for data-race - /// detection between any happens-before edges the - /// operation may create + /// vector before and after the operation for data-race + /// detection between any happens-before edges the + /// operation may create. fn maybe_perform_sync_operation<'tcx>( &self, op: impl FnOnce(VectorIdx, RefMut<'_,ThreadClockSet>) -> InterpResult<'tcx>, ) -> InterpResult<'tcx> { @@ -1107,50 +1243,50 @@ impl GlobalState { /// Internal utility to identify a thread stored internally - /// returns the id and the name for better diagnostics + /// returns the id and the name for better diagnostics. fn print_thread_metadata(&self, vector: VectorIdx) -> String { let thread = self.vector_info.borrow()[vector]; let thread_name = &self.thread_info.borrow()[thread].thread_name; if let Some(name) = thread_name { let name: &str = name; format!("Thread(id = {:?}, name = {:?})", thread.to_u32(), &*name) - }else{ + } else { format!("Thread(id = {:?})", thread.to_u32()) } } /// Acquire a lock, express that the previous call of - /// `validate_lock_release` must happen before this - pub fn validate_lock_acquire(&self, lock: &DataRaceLockHandle, thread: ThreadId) { + /// `validate_lock_release` must happen before this. + pub fn validate_lock_acquire(&self, lock: &VClock, thread: ThreadId) { let (index, mut clocks) = self.load_thread_state_mut(thread); clocks.increment_clock(index); - clocks.clock.join(&lock.clock); + clocks.clock.join(&lock); clocks.increment_clock(index); } /// Release a lock handle, express that this happens-before - /// any subsequent calls to `validate_lock_acquire` - pub fn validate_lock_release(&self, lock: &mut DataRaceLockHandle, thread: ThreadId) { + /// any subsequent calls to `validate_lock_acquire`. 
+ pub fn validate_lock_release(&self, lock: &mut VClock, thread: ThreadId) { let (index, mut clocks) = self.load_thread_state_mut(thread); clocks.increment_clock(index); - lock.clock.clone_from(&clocks.clock); + lock.clone_from(&clocks.clock); clocks.increment_clock(index); } /// Release a lock handle, express that this happens-before - /// any subsequent calls to `validate_lock_acquire` as well - /// as any previous calls to this function after any - /// `validate_lock_release` calls - pub fn validate_lock_release_shared(&self, lock: &mut DataRaceLockHandle, thread: ThreadId) { + /// any subsequent calls to `validate_lock_acquire` as well + /// as any previous calls to this function after any + /// `validate_lock_release` calls. + pub fn validate_lock_release_shared(&self, lock: &mut VClock, thread: ThreadId) { let (index, mut clocks) = self.load_thread_state_mut(thread); clocks.increment_clock(index); - lock.clock.join(&clocks.clock); + lock.join(&clocks.clock); clocks.increment_clock(index); } /// Load the vector index used by the given thread as well as the set of vector clocks - /// used by the thread + /// used by the thread. #[inline] fn load_thread_state_mut(&self, thread: ThreadId) -> (VectorIdx, RefMut<'_, ThreadClockSet>) { let index = self.thread_info.borrow()[thread].vector_index @@ -1161,7 +1297,7 @@ impl GlobalState { } /// Load the current vector clock in use and the current set of thread clocks - /// in use for the vector + /// in use for the vector. #[inline] fn current_thread_state(&self) -> (VectorIdx, Ref<'_, ThreadClockSet>) { let index = self.current_index(); @@ -1171,7 +1307,7 @@ impl GlobalState { } /// Load the current vector clock in use and the current set of thread clocks - /// in use for the vector mutably for modification + /// in use for the vector mutably for modification. 
#[inline] fn current_thread_state_mut(&self) -> (VectorIdx, RefMut<'_, ThreadClockSet>) { let index = self.current_index(); @@ -1181,7 +1317,7 @@ impl GlobalState { } /// Return the current thread, should be the same - /// as the data-race active thread + /// as the data-race active thread. #[inline] fn current_index(&self) -> VectorIdx { self.current_index.get() diff --git a/src/eval.rs b/src/eval.rs index 54d06feec36d..0a62f14dd3a1 100644 --- a/src/eval.rs +++ b/src/eval.rs @@ -48,6 +48,8 @@ pub struct MiriConfig { pub tracked_alloc_id: Option, /// Whether to track raw pointers in stacked borrows. pub track_raw: bool, + /// Determine if data race detection should be enabled + pub data_race_detector: bool, } impl Default for MiriConfig { @@ -65,6 +67,7 @@ impl Default for MiriConfig { tracked_call_id: None, tracked_alloc_id: None, track_raw: false, + data_race_detector: true, } } } diff --git a/src/lib.rs b/src/lib.rs index c8c9e70ec3de..87effe9c6885 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -55,7 +55,7 @@ pub use crate::shims::tls::{EvalContextExt as _, TlsData}; pub use crate::shims::EvalContextExt as _; pub use crate::data_race::{ - AtomicReadOp, AtomicWriteOp, AtomicRWOp, AtomicFenceOp, DataRaceLockHandle, + AtomicReadOp, AtomicWriteOp, AtomicRwOp, AtomicFenceOp, EvalContextExt as DataRaceEvalContextExt }; pub use crate::diagnostics::{ @@ -81,7 +81,7 @@ pub use crate::sync::{ EvalContextExt as SyncEvalContextExt, CondvarId, MutexId, RwLockId }; pub use crate::vector_clock::{ - VClock, VSmallClockSet, VectorIdx, VTimestamp + VClock, VSmallClockMap, VectorIdx, VTimestamp }; /// Insert rustc arguments at the beginning of the argument list that Miri wants to be diff --git a/src/machine.rs b/src/machine.rs index 363513f636c9..9612d9e19110 100644 --- a/src/machine.rs +++ b/src/machine.rs @@ -109,15 +109,16 @@ impl fmt::Display for MiriMemoryKind { pub struct AllocExtra { /// Stacked Borrows state is only added if it is enabled. 
pub stacked_borrows: Option, - /// Data race detection via the use of a vector-clock. - pub data_race: data_race::AllocExtra, + /// Data race detection via the use of a vector-clock, + /// this is only added if it is enabled. + pub data_race: Option, } /// Extra global memory data #[derive(Clone, Debug)] pub struct MemoryExtra { pub stacked_borrows: Option, - pub data_race: data_race::MemoryExtra, + pub data_race: Option, pub intptrcast: intptrcast::MemoryExtra, /// Mapping extern static names to their canonical allocation. @@ -147,7 +148,11 @@ impl MemoryExtra { } else { None }; - let data_race = Rc::new(data_race::GlobalState::new()); + let data_race = if config.data_race_detector { + Some(Rc::new(data_race::GlobalState::new())) + }else{ + None + }; MemoryExtra { stacked_borrows, data_race, @@ -472,7 +477,11 @@ impl<'mir, 'tcx> Machine<'mir, 'tcx> for Evaluator<'mir, 'tcx> { // No stacks, no tag. (None, Tag::Untagged) }; - let race_alloc = data_race::AllocExtra::new_allocation(&memory_extra.data_race, alloc.size); + let race_alloc = if let Some(data_race) = &memory_extra.data_race { + Some(data_race::AllocExtra::new_allocation(&data_race, alloc.size)) + } else { + None + }; let mut stacked_borrows = memory_extra.stacked_borrows.as_ref().map(|sb| sb.borrow_mut()); let alloc: Allocation = alloc.with_tags_and_extra( |alloc| { @@ -590,7 +599,9 @@ impl AllocationExtra for AllocExtra { ptr: Pointer, size: Size, ) -> InterpResult<'tcx> { - alloc.extra.data_race.read(ptr, size)?; + if let Some(data_race) = &alloc.extra.data_race { + data_race.read(ptr, size)?; + } if let Some(stacked_borrows) = &alloc.extra.stacked_borrows { stacked_borrows.memory_read(ptr, size) } else { @@ -604,7 +615,9 @@ impl AllocationExtra for AllocExtra { ptr: Pointer, size: Size, ) -> InterpResult<'tcx> { - alloc.extra.data_race.write(ptr, size)?; + if let Some(data_race) = &mut alloc.extra.data_race { + data_race.write(ptr, size)?; + } if let Some(stacked_borrows) = &mut 
alloc.extra.stacked_borrows { stacked_borrows.memory_written(ptr, size) } else { @@ -618,7 +631,9 @@ impl AllocationExtra for AllocExtra { ptr: Pointer, size: Size, ) -> InterpResult<'tcx> { - alloc.extra.data_race.deallocate(ptr, size)?; + if let Some(data_race) = &mut alloc.extra.data_race { + data_race.deallocate(ptr, size)?; + } if let Some(stacked_borrows) = &mut alloc.extra.stacked_borrows { stacked_borrows.memory_deallocated(ptr, size) } else { diff --git a/src/shims/intrinsics.rs b/src/shims/intrinsics.rs index 50f97af8453e..8f7ae6bebb52 100644 --- a/src/shims/intrinsics.rs +++ b/src/shims/intrinsics.rs @@ -324,98 +324,98 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriEvalContextExt<'mir, 'tcx "atomic_singlethreadfence_acqrel" => this.compiler_fence(args, AtomicFenceOp::AcqRel)?, "atomic_singlethreadfence" => this.compiler_fence(args, AtomicFenceOp::SeqCst)?, - "atomic_xchg" => this.atomic_exchange(args, dest, AtomicRWOp::SeqCst)?, - "atomic_xchg_acq" => this.atomic_exchange(args, dest, AtomicRWOp::Acquire)?, - "atomic_xchg_rel" => this.atomic_exchange(args, dest, AtomicRWOp::Release)?, - "atomic_xchg_acqrel" => this.atomic_exchange(args, dest, AtomicRWOp::AcqRel)?, - "atomic_xchg_relaxed" => this.atomic_exchange(args, dest, AtomicRWOp::Relaxed)?, + "atomic_xchg" => this.atomic_exchange(args, dest, AtomicRwOp::SeqCst)?, + "atomic_xchg_acq" => this.atomic_exchange(args, dest, AtomicRwOp::Acquire)?, + "atomic_xchg_rel" => this.atomic_exchange(args, dest, AtomicRwOp::Release)?, + "atomic_xchg_acqrel" => this.atomic_exchange(args, dest, AtomicRwOp::AcqRel)?, + "atomic_xchg_relaxed" => this.atomic_exchange(args, dest, AtomicRwOp::Relaxed)?, "atomic_cxchg" => this.atomic_compare_exchange( - args, dest, AtomicRWOp::SeqCst, AtomicReadOp::SeqCst + args, dest, AtomicRwOp::SeqCst, AtomicReadOp::SeqCst )?, "atomic_cxchg_acq" => this.atomic_compare_exchange( - args, dest, AtomicRWOp::Acquire, AtomicReadOp::Acquire + args, dest, AtomicRwOp::Acquire, 
AtomicReadOp::Acquire )?, "atomic_cxchg_rel" => this.atomic_compare_exchange( - args, dest, AtomicRWOp::Release, AtomicReadOp::Relaxed + args, dest, AtomicRwOp::Release, AtomicReadOp::Relaxed )?, "atomic_cxchg_acqrel" => this.atomic_compare_exchange - (args, dest, AtomicRWOp::AcqRel, AtomicReadOp::Acquire + (args, dest, AtomicRwOp::AcqRel, AtomicReadOp::Acquire )?, "atomic_cxchg_relaxed" => this.atomic_compare_exchange( - args, dest, AtomicRWOp::Relaxed, AtomicReadOp::Relaxed + args, dest, AtomicRwOp::Relaxed, AtomicReadOp::Relaxed )?, "atomic_cxchg_acq_failrelaxed" => this.atomic_compare_exchange( - args, dest, AtomicRWOp::Acquire, AtomicReadOp::Relaxed + args, dest, AtomicRwOp::Acquire, AtomicReadOp::Relaxed )?, "atomic_cxchg_acqrel_failrelaxed" => this.atomic_compare_exchange( - args, dest, AtomicRWOp::AcqRel, AtomicReadOp::Relaxed + args, dest, AtomicRwOp::AcqRel, AtomicReadOp::Relaxed )?, "atomic_cxchg_failrelaxed" => this.atomic_compare_exchange( - args, dest, AtomicRWOp::SeqCst, AtomicReadOp::Relaxed + args, dest, AtomicRwOp::SeqCst, AtomicReadOp::Relaxed )?, "atomic_cxchg_failacq" => this.atomic_compare_exchange( - args, dest, AtomicRWOp::SeqCst, AtomicReadOp::Acquire + args, dest, AtomicRwOp::SeqCst, AtomicReadOp::Acquire )?, "atomic_cxchgweak" => this.atomic_compare_exchange_weak( - args, dest, AtomicRWOp::SeqCst, AtomicReadOp::SeqCst + args, dest, AtomicRwOp::SeqCst, AtomicReadOp::SeqCst )?, "atomic_cxchgweak_acq" => this.atomic_compare_exchange_weak( - args, dest, AtomicRWOp::Acquire, AtomicReadOp::Acquire + args, dest, AtomicRwOp::Acquire, AtomicReadOp::Acquire )?, "atomic_cxchgweak_rel" => this.atomic_compare_exchange_weak( - args, dest, AtomicRWOp::Release, AtomicReadOp::Relaxed + args, dest, AtomicRwOp::Release, AtomicReadOp::Relaxed )?, "atomic_cxchgweak_acqrel" => this.atomic_compare_exchange_weak( - args, dest, AtomicRWOp::AcqRel, AtomicReadOp::Acquire + args, dest, AtomicRwOp::AcqRel, AtomicReadOp::Acquire )?, "atomic_cxchgweak_relaxed" => 
this.atomic_compare_exchange_weak( - args, dest, AtomicRWOp::Relaxed, AtomicReadOp::Relaxed + args, dest, AtomicRwOp::Relaxed, AtomicReadOp::Relaxed )?, "atomic_cxchgweak_acq_failrelaxed" => this.atomic_compare_exchange_weak( - args, dest, AtomicRWOp::Acquire, AtomicReadOp::Relaxed + args, dest, AtomicRwOp::Acquire, AtomicReadOp::Relaxed )?, "atomic_cxchgweak_acqrel_failrelaxed" => this.atomic_compare_exchange_weak( - args, dest, AtomicRWOp::AcqRel, AtomicReadOp::Relaxed + args, dest, AtomicRwOp::AcqRel, AtomicReadOp::Relaxed )?, "atomic_cxchgweak_failrelaxed" => this.atomic_compare_exchange_weak( - args, dest, AtomicRWOp::SeqCst, AtomicReadOp::Relaxed + args, dest, AtomicRwOp::SeqCst, AtomicReadOp::Relaxed )?, "atomic_cxchgweak_failacq" => this.atomic_compare_exchange_weak( - args, dest, AtomicRWOp::SeqCst, AtomicReadOp::Acquire + args, dest, AtomicRwOp::SeqCst, AtomicReadOp::Acquire )?, - "atomic_or" => this.atomic_op(args, dest, BinOp::BitOr, false, AtomicRWOp::SeqCst)?, - "atomic_or_acq" => this.atomic_op(args, dest, BinOp::BitOr, false, AtomicRWOp::Acquire)?, - "atomic_or_rel" => this.atomic_op(args, dest, BinOp::BitOr, false, AtomicRWOp::Release)?, - "atomic_or_acqrel" => this.atomic_op(args, dest, BinOp::BitOr, false, AtomicRWOp::AcqRel)?, - "atomic_or_relaxed" => this.atomic_op(args, dest, BinOp::BitOr, false, AtomicRWOp::Relaxed)?, - "atomic_xor" => this.atomic_op(args, dest, BinOp::BitXor, false, AtomicRWOp::SeqCst)?, - "atomic_xor_acq" => this.atomic_op(args, dest, BinOp::BitXor, false, AtomicRWOp::Acquire)?, - "atomic_xor_rel" => this.atomic_op(args, dest, BinOp::BitXor, false, AtomicRWOp::Release)?, - "atomic_xor_acqrel" => this.atomic_op(args, dest, BinOp::BitXor, false, AtomicRWOp::AcqRel)?, - "atomic_xor_relaxed" => this.atomic_op(args, dest, BinOp::BitXor, false, AtomicRWOp::Relaxed)?, - "atomic_and" => this.atomic_op(args, dest, BinOp::BitAnd, false, AtomicRWOp::SeqCst)?, - "atomic_and_acq" => this.atomic_op(args, dest, BinOp::BitAnd, false, 
AtomicRWOp::Acquire)?, - "atomic_and_rel" => this.atomic_op(args, dest, BinOp::BitAnd, false, AtomicRWOp::Release)?, - "atomic_and_acqrel" => this.atomic_op(args, dest, BinOp::BitAnd, false, AtomicRWOp::AcqRel)?, - "atomic_and_relaxed" => this.atomic_op(args, dest, BinOp::BitAnd, false, AtomicRWOp::Relaxed)?, - "atomic_nand" => this.atomic_op(args, dest, BinOp::BitAnd, true, AtomicRWOp::SeqCst)?, - "atomic_nand_acq" => this.atomic_op(args, dest, BinOp::BitAnd, true, AtomicRWOp::Acquire)?, - "atomic_nand_rel" => this.atomic_op(args, dest, BinOp::BitAnd, true, AtomicRWOp::Release)?, - "atomic_nand_acqrel" => this.atomic_op(args, dest, BinOp::BitAnd, true, AtomicRWOp::AcqRel)?, - "atomic_nand_relaxed" => this.atomic_op(args, dest, BinOp::BitAnd, true, AtomicRWOp::Relaxed)?, - "atomic_xadd" => this.atomic_op(args, dest, BinOp::Add, false, AtomicRWOp::SeqCst)?, - "atomic_xadd_acq" => this.atomic_op(args, dest, BinOp::Add, false, AtomicRWOp::Acquire)?, - "atomic_xadd_rel" => this.atomic_op(args, dest, BinOp::Add, false, AtomicRWOp::Release)?, - "atomic_xadd_acqrel" => this.atomic_op(args, dest, BinOp::Add, false, AtomicRWOp::AcqRel)?, - "atomic_xadd_relaxed" => this.atomic_op(args, dest, BinOp::Add, false, AtomicRWOp::Relaxed)?, - "atomic_xsub" => this.atomic_op(args, dest, BinOp::Sub, false, AtomicRWOp::SeqCst)?, - "atomic_xsub_acq" => this.atomic_op(args, dest, BinOp::Sub, false, AtomicRWOp::Acquire)?, - "atomic_xsub_rel" => this.atomic_op(args, dest, BinOp::Sub, false, AtomicRWOp::Release)?, - "atomic_xsub_acqrel" => this.atomic_op(args, dest, BinOp::Sub, false, AtomicRWOp::AcqRel)?, - "atomic_xsub_relaxed" => this.atomic_op(args, dest, BinOp::Sub, false, AtomicRWOp::Relaxed)?, + "atomic_or" => this.atomic_op(args, dest, BinOp::BitOr, false, AtomicRwOp::SeqCst)?, + "atomic_or_acq" => this.atomic_op(args, dest, BinOp::BitOr, false, AtomicRwOp::Acquire)?, + "atomic_or_rel" => this.atomic_op(args, dest, BinOp::BitOr, false, AtomicRwOp::Release)?, + "atomic_or_acqrel" => 
this.atomic_op(args, dest, BinOp::BitOr, false, AtomicRwOp::AcqRel)?, + "atomic_or_relaxed" => this.atomic_op(args, dest, BinOp::BitOr, false, AtomicRwOp::Relaxed)?, + "atomic_xor" => this.atomic_op(args, dest, BinOp::BitXor, false, AtomicRwOp::SeqCst)?, + "atomic_xor_acq" => this.atomic_op(args, dest, BinOp::BitXor, false, AtomicRwOp::Acquire)?, + "atomic_xor_rel" => this.atomic_op(args, dest, BinOp::BitXor, false, AtomicRwOp::Release)?, + "atomic_xor_acqrel" => this.atomic_op(args, dest, BinOp::BitXor, false, AtomicRwOp::AcqRel)?, + "atomic_xor_relaxed" => this.atomic_op(args, dest, BinOp::BitXor, false, AtomicRwOp::Relaxed)?, + "atomic_and" => this.atomic_op(args, dest, BinOp::BitAnd, false, AtomicRwOp::SeqCst)?, + "atomic_and_acq" => this.atomic_op(args, dest, BinOp::BitAnd, false, AtomicRwOp::Acquire)?, + "atomic_and_rel" => this.atomic_op(args, dest, BinOp::BitAnd, false, AtomicRwOp::Release)?, + "atomic_and_acqrel" => this.atomic_op(args, dest, BinOp::BitAnd, false, AtomicRwOp::AcqRel)?, + "atomic_and_relaxed" => this.atomic_op(args, dest, BinOp::BitAnd, false, AtomicRwOp::Relaxed)?, + "atomic_nand" => this.atomic_op(args, dest, BinOp::BitAnd, true, AtomicRwOp::SeqCst)?, + "atomic_nand_acq" => this.atomic_op(args, dest, BinOp::BitAnd, true, AtomicRwOp::Acquire)?, + "atomic_nand_rel" => this.atomic_op(args, dest, BinOp::BitAnd, true, AtomicRwOp::Release)?, + "atomic_nand_acqrel" => this.atomic_op(args, dest, BinOp::BitAnd, true, AtomicRwOp::AcqRel)?, + "atomic_nand_relaxed" => this.atomic_op(args, dest, BinOp::BitAnd, true, AtomicRwOp::Relaxed)?, + "atomic_xadd" => this.atomic_op(args, dest, BinOp::Add, false, AtomicRwOp::SeqCst)?, + "atomic_xadd_acq" => this.atomic_op(args, dest, BinOp::Add, false, AtomicRwOp::Acquire)?, + "atomic_xadd_rel" => this.atomic_op(args, dest, BinOp::Add, false, AtomicRwOp::Release)?, + "atomic_xadd_acqrel" => this.atomic_op(args, dest, BinOp::Add, false, AtomicRwOp::AcqRel)?, + "atomic_xadd_relaxed" => this.atomic_op(args, dest, 
BinOp::Add, false, AtomicRwOp::Relaxed)?, + "atomic_xsub" => this.atomic_op(args, dest, BinOp::Sub, false, AtomicRwOp::SeqCst)?, + "atomic_xsub_acq" => this.atomic_op(args, dest, BinOp::Sub, false, AtomicRwOp::Acquire)?, + "atomic_xsub_rel" => this.atomic_op(args, dest, BinOp::Sub, false, AtomicRwOp::Release)?, + "atomic_xsub_acqrel" => this.atomic_op(args, dest, BinOp::Sub, false, AtomicRwOp::AcqRel)?, + "atomic_xsub_relaxed" => this.atomic_op(args, dest, BinOp::Sub, false, AtomicRwOp::Relaxed)?, // Query type information @@ -514,7 +514,7 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriEvalContextExt<'mir, 'tcx fn atomic_op( &mut self, args: &[OpTy<'tcx, Tag>], dest: PlaceTy<'tcx, Tag>, - op: mir::BinOp, neg: bool, atomic: AtomicRWOp + op: mir::BinOp, neg: bool, atomic: AtomicRwOp ) -> InterpResult<'tcx> { let this = self.eval_context_mut(); @@ -524,39 +524,26 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriEvalContextExt<'mir, 'tcx bug!("Atomic arithmetic operations only work on integer types"); } let rhs = this.read_immediate(rhs)?; - let old = this.allow_data_races_mut(|this| { - this.read_immediate(place. into()) - })?; // Check alignment requirements. Atomics must always be aligned to their size, // even if the type they wrap would be less aligned (e.g. AtomicU64 on 32bit must // be 8-aligned). let align = Align::from_bytes(place.layout.size.bytes()).unwrap(); this.memory.check_ptr_access(place.ptr, place.layout.size, align)?; + + let old = this.atomic_op_immediate(place, rhs, op, neg, atomic)?; this.write_immediate(*old, dest)?; // old value is returned - - // Atomics wrap around on overflow. - let val = this.binary_op(op, old, rhs)?; - let val = if neg { this.unary_op(mir::UnOp::Not, val)? 
} else { val }; - this.allow_data_races_mut(|this| { - this.write_immediate(*val, place.into()) - })?; - - this.validate_atomic_rmw(place, atomic)?; Ok(()) } fn atomic_exchange( - &mut self, args: &[OpTy<'tcx, Tag>], dest: PlaceTy<'tcx, Tag>, atomic: AtomicRWOp + &mut self, args: &[OpTy<'tcx, Tag>], dest: PlaceTy<'tcx, Tag>, atomic: AtomicRwOp ) -> InterpResult<'tcx> { let this = self.eval_context_mut(); let &[place, new] = check_arg_count(args)?; let place = this.deref_operand(place)?; let new = this.read_scalar(new)?; - let old = this.allow_data_races_mut(|this| { - this.read_scalar(place.into()) - })?; // Check alignment requirements. Atomics must always be aligned to their size, // even if the type they wrap would be less aligned (e.g. AtomicU64 on 32bit must @@ -564,18 +551,14 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriEvalContextExt<'mir, 'tcx let align = Align::from_bytes(place.layout.size.bytes()).unwrap(); this.memory.check_ptr_access(place.ptr, place.layout.size, align)?; + let old = this.atomic_exchange_scalar(place, new, atomic)?; this.write_scalar(old, dest)?; // old value is returned - this.allow_data_races_mut(|this| { - this.write_scalar(new, place.into()) - })?; - - this.validate_atomic_rmw(place, atomic)?; Ok(()) } fn atomic_compare_exchange( &mut self, args: &[OpTy<'tcx, Tag>], dest: PlaceTy<'tcx, Tag>, - success: AtomicRWOp, fail: AtomicReadOp + success: AtomicRwOp, fail: AtomicReadOp ) -> InterpResult<'tcx> { let this = self.eval_context_mut(); @@ -584,13 +567,6 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriEvalContextExt<'mir, 'tcx let expect_old = this.read_immediate(expect_old)?; // read as immediate for the sake of `binary_op()` let new = this.read_scalar(new)?; - // Failure ordering cannot be stronger than success ordering, therefore first attempt - // to read with the failure ordering and if successfull then try again with the success - // read ordering and write in the success case. 
- // Read as immediate for the sake of `binary_op()` - let old = this.allow_data_races_mut(|this| { - this.read_immediate(place.into()) - })?; // Check alignment requirements. Atomics must always be aligned to their size, // even if the type they wrap would be less aligned (e.g. AtomicU64 on 32bit must @@ -598,31 +574,19 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriEvalContextExt<'mir, 'tcx let align = Align::from_bytes(place.layout.size.bytes()).unwrap(); this.memory.check_ptr_access(place.ptr, place.layout.size, align)?; - // `binary_op` will bail if either of them is not a scalar. - let eq = this.overflowing_binary_op(mir::BinOp::Eq, old, expect_old)?.0; - let res = Immediate::ScalarPair(old.to_scalar_or_uninit(), eq.into()); + + let old = this.atomic_compare_exchange_scalar( + place, expect_old, new, success, fail + )?; // Return old value. - this.write_immediate(res, dest)?; - - // Update ptr depending on comparison. - // if successful, perform a full rw-atomic validation - // otherwise treat this as an atomic load with the fail ordering - if eq.to_bool()? { - this.allow_data_races_mut(|this| { - this.write_scalar(new, place.into()) - })?; - this.validate_atomic_rmw(place, success)?; - } else { - this.validate_atomic_load(place, fail)?; - } - + this.write_immediate(old, dest)?; Ok(()) } fn atomic_compare_exchange_weak( &mut self, args: &[OpTy<'tcx, Tag>], dest: PlaceTy<'tcx, Tag>, - success: AtomicRWOp, fail: AtomicReadOp + success: AtomicRwOp, fail: AtomicReadOp ) -> InterpResult<'tcx> { // FIXME: the weak part of this is currently not modelled, diff --git a/src/shims/posix/linux/sync.rs b/src/shims/posix/linux/sync.rs index 67cea5507737..78244ab7b879 100644 --- a/src/shims/posix/linux/sync.rs +++ b/src/shims/posix/linux/sync.rs @@ -78,7 +78,17 @@ pub fn futex<'tcx>( // Read an `i32` through the pointer, regardless of any wrapper types. // It's not uncommon for `addr` to be passed as another type than `*mut i32`, such as `*const AtomicI32`. 
// FIXME: this fails if `addr` is not a pointer type. - // FIXME: what form of atomic operation should the `futex` use to load the value? + // The atomic ordering for futex(https://man7.org/linux/man-pages/man2/futex.2.html): + // "The load of the value of the futex word is an + // atomic memory access (i.e., using atomic machine instructions + // of the respective architecture). This load, the comparison + // with the expected value, and starting to sleep are performed + // atomically and totally ordered with respect to other futex + // operations on the same futex word." + // SeqCst is total order over all operations, so uses acquire, + // either are equal under the current implementation. + // FIXME: is Acquire correct or should some additional ordering constraints be observed? + // FIXME: use RMW or similar? let futex_val = this.read_scalar_at_offset_atomic( addr.into(), 0, this.machine.layouts.i32, AtomicReadOp::Acquire )?.to_i32()?; diff --git a/src/shims/posix/sync.rs b/src/shims/posix/sync.rs index d741ef346e94..64308d06139f 100644 --- a/src/shims/posix/sync.rs +++ b/src/shims/posix/sync.rs @@ -64,7 +64,7 @@ fn mutex_get_kind<'mir, 'tcx: 'mir>( let offset = if ecx.pointer_size().bytes() == 8 { 16 } else { 12 }; ecx.read_scalar_at_offset_atomic( mutex_op, offset, ecx.machine.layouts.i32, - AtomicReadOp::SeqCst + AtomicReadOp::Acquire ) } @@ -76,7 +76,7 @@ fn mutex_set_kind<'mir, 'tcx: 'mir>( let offset = if ecx.pointer_size().bytes() == 8 { 16 } else { 12 }; ecx.write_scalar_at_offset_atomic( mutex_op, offset, kind, ecx.machine.layouts.i32, - AtomicWriteOp::SeqCst + AtomicWriteOp::Release ) } @@ -85,7 +85,7 @@ fn mutex_get_id<'mir, 'tcx: 'mir>( mutex_op: OpTy<'tcx, Tag>, ) -> InterpResult<'tcx, ScalarMaybeUninit> { ecx.read_scalar_at_offset_atomic( - mutex_op, 4, ecx.machine.layouts.u32, AtomicReadOp::SeqCst + mutex_op, 4, ecx.machine.layouts.u32, AtomicReadOp::Acquire ) } @@ -96,7 +96,7 @@ fn mutex_set_id<'mir, 'tcx: 'mir>( ) -> InterpResult<'tcx, ()> { 
ecx.write_scalar_at_offset_atomic( mutex_op, 4, id, ecx.machine.layouts.u32, - AtomicWriteOp::SeqCst + AtomicWriteOp::Release ) } @@ -129,7 +129,7 @@ fn rwlock_get_id<'mir, 'tcx: 'mir>( ) -> InterpResult<'tcx, ScalarMaybeUninit> { ecx.read_scalar_at_offset_atomic( rwlock_op, 4, ecx.machine.layouts.u32, - AtomicReadOp::SeqCst + AtomicReadOp::Acquire ) } @@ -140,7 +140,7 @@ fn rwlock_set_id<'mir, 'tcx: 'mir>( ) -> InterpResult<'tcx, ()> { ecx.write_scalar_at_offset_atomic( rwlock_op, 4, id, ecx.machine.layouts.u32, - AtomicWriteOp::SeqCst + AtomicWriteOp::Release ) } @@ -196,7 +196,7 @@ fn cond_get_id<'mir, 'tcx: 'mir>( ) -> InterpResult<'tcx, ScalarMaybeUninit> { ecx.read_scalar_at_offset_atomic( cond_op, 4, ecx.machine.layouts.u32, - AtomicReadOp::SeqCst + AtomicReadOp::Acquire ) } @@ -207,7 +207,7 @@ fn cond_set_id<'mir, 'tcx: 'mir>( ) -> InterpResult<'tcx, ()> { ecx.write_scalar_at_offset_atomic( cond_op, 4, id, ecx.machine.layouts.u32, - AtomicWriteOp::SeqCst + AtomicWriteOp::Release ) } diff --git a/src/shims/posix/thread.rs b/src/shims/posix/thread.rs index e823a7d88d6a..847d083bfa9f 100644 --- a/src/shims/posix/thread.rs +++ b/src/shims/posix/thread.rs @@ -15,14 +15,14 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriEvalContextExt<'mir, 'tcx let this = self.eval_context_mut(); this.tcx.sess.warn( - "thread support is experimental.", + "thread support is experimental, no weak memory effects are currently emulated.", ); // Create the new thread let new_thread_id = this.create_thread(); // Write the current thread-id, switch to the next thread later - // to treat this write operation as occuring on this thread index + // to treat this write operation as occuring on the current thread. 
let thread_info_place = this.deref_operand(thread)?; this.write_scalar( Scalar::from_uint(new_thread_id.to_u32(), thread_info_place.layout.size), @@ -30,15 +30,16 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriEvalContextExt<'mir, 'tcx )?; // Read the function argument that will be sent to the new thread - // again perform the read before the thread starts executing. + // before the thread starts executing since reading after the + // context switch will incorrectly report a data-race. let fn_ptr = this.read_scalar(start_routine)?.check_init()?; let func_arg = this.read_immediate(arg)?; - // Also switch to new thread so that we can push the first stackframe. - // after this all accesses will be treated as occuring in the new thread + // Finally switch to new thread so that we can push the first stackframe. + // After this all accesses will be treated as occuring in the new thread. let old_thread_id = this.set_active_thread(new_thread_id); - // Perform the function pointer load in the new thread frame + // Perform the function pointer load in the new thread frame. let instance = this.memory.get_fn(fn_ptr)?.as_instance()?; // Note: the returned value is currently ignored (see the FIXME in @@ -54,7 +55,7 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriEvalContextExt<'mir, 'tcx StackPopCleanup::None { cleanup: true }, )?; - // Restore the old active thread frame + // Restore the old active thread frame. this.set_active_thread(old_thread_id); Ok(0) diff --git a/src/sync.rs b/src/sync.rs index 3469afdcd276..828268c06ccf 100644 --- a/src/sync.rs +++ b/src/sync.rs @@ -62,7 +62,7 @@ struct Mutex { /// The queue of threads waiting for this mutex. queue: VecDeque, /// Data race handle - data_race: DataRaceLockHandle + data_race: VClock } declare_id!(RwLockId); @@ -80,9 +80,9 @@ struct RwLock { /// The queue of reader threads waiting for this lock. 
reader_queue: VecDeque, /// Data race handle for writers - data_race: DataRaceLockHandle, + data_race: VClock, /// Data race handle for readers - data_race_reader: DataRaceLockHandle, + data_race_reader: VClock, } declare_id!(CondvarId); @@ -100,14 +100,14 @@ struct CondvarWaiter { #[derive(Default, Debug)] struct Condvar { waiters: VecDeque, - data_race: DataRaceLockHandle, + data_race: VClock, } /// The futex state. #[derive(Default, Debug)] struct Futex { waiters: VecDeque, - data_race: DataRaceLockHandle, + data_race: VClock, } /// A thread waiting on a futex. @@ -213,7 +213,9 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriEvalContextExt<'mir, 'tcx mutex.owner = Some(thread); } mutex.lock_count = mutex.lock_count.checked_add(1).unwrap(); - this.memory.extra.data_race.validate_lock_acquire(&mutex.data_race, thread); + if let Some(data_race) = &this.memory.extra.data_race { + data_race.validate_lock_acquire(&mutex.data_race, thread); + } } /// Try unlocking by decreasing the lock count and returning the old lock @@ -241,7 +243,9 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriEvalContextExt<'mir, 'tcx mutex.owner = None; // The mutex is completely unlocked. Try transfering ownership // to another thread. 
- this.memory.extra.data_race.validate_lock_release(&mut mutex.data_race, current_owner); + if let Some(data_race) = &this.memory.extra.data_race { + data_race.validate_lock_release(&mut mutex.data_race, current_owner); + } this.mutex_dequeue_and_lock(id); } Some(old_lock_count) @@ -297,7 +301,9 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriEvalContextExt<'mir, 'tcx let rwlock = &mut this.machine.threads.sync.rwlocks[id]; let count = rwlock.readers.entry(reader).or_insert(0); *count = count.checked_add(1).expect("the reader counter overflowed"); - this.memory.extra.data_race.validate_lock_acquire(&rwlock.data_race, reader); + if let Some(data_race) = &this.memory.extra.data_race { + data_race.validate_lock_acquire(&rwlock.data_race, reader); + } } /// Try read-unlock the lock for `reader` and potentially give the lock to a new owner. @@ -319,7 +325,9 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriEvalContextExt<'mir, 'tcx } Entry::Vacant(_) => return false, // we did not even own this lock } - this.memory.extra.data_race.validate_lock_release_shared(&mut rwlock.data_race_reader, reader); + if let Some(data_race) = &this.memory.extra.data_race { + data_race.validate_lock_release_shared(&mut rwlock.data_race_reader, reader); + } // The thread was a reader. If the lock is not held any more, give it to a writer. 
if this.rwlock_is_locked(id).not() { @@ -328,7 +336,7 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriEvalContextExt<'mir, 'tcx // of the union of all reader data race handles, since the set of readers // happen-before the writers let rwlock = &mut this.machine.threads.sync.rwlocks[id]; - rwlock.data_race.set_values(&rwlock.data_race_reader); + rwlock.data_race.clone_from(&rwlock.data_race_reader); this.rwlock_dequeue_and_lock_writer(id); } true @@ -355,7 +363,9 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriEvalContextExt<'mir, 'tcx trace!("rwlock_writer_lock: {:?} now held by {:?}", id, writer); let rwlock = &mut this.machine.threads.sync.rwlocks[id]; rwlock.writer = Some(writer); - this.memory.extra.data_race.validate_lock_acquire(&rwlock.data_race, writer); + if let Some(data_race) = &this.memory.extra.data_race { + data_race.validate_lock_acquire(&rwlock.data_race, writer); + } } #[inline] @@ -373,8 +383,10 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriEvalContextExt<'mir, 'tcx // Release memory to both reader and writer vector clocks // since this writer happens-before both the union of readers once they are finished // and the next writer - this.memory.extra.data_race.validate_lock_release(&mut rwlock.data_race, current_writer); - this.memory.extra.data_race.validate_lock_release(&mut rwlock.data_race_reader, current_writer); + if let Some(data_race) = &this.memory.extra.data_race { + data_race.validate_lock_release(&mut rwlock.data_race, current_writer); + data_race.validate_lock_release(&mut rwlock.data_race_reader, current_writer); + } // The thread was a writer. // // We are prioritizing writers here against the readers. 
As a @@ -435,14 +447,18 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriEvalContextExt<'mir, 'tcx let this = self.eval_context_mut(); let current_thread = this.get_active_thread(); let condvar = &mut this.machine.threads.sync.condvars[id]; - let data_race = &mut this.memory.extra.data_race; + let data_race = &this.memory.extra.data_race; // Each condvar signal happens-before the end of the condvar wake - data_race.validate_lock_release(&mut condvar.data_race, current_thread); + if let Some(data_race) = data_race { + data_race.validate_lock_release(&mut condvar.data_race, current_thread); + } condvar.waiters .pop_front() .map(|waiter| { - data_race.validate_lock_acquire(&mut condvar.data_race, waiter.thread); + if let Some(data_race) = data_race { + data_race.validate_lock_acquire(&mut condvar.data_race, waiter.thread); + } (waiter.thread, waiter.mutex) }) } @@ -466,12 +482,16 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriEvalContextExt<'mir, 'tcx let this = self.eval_context_mut(); let current_thread = this.get_active_thread(); let futex = &mut this.machine.threads.sync.futexes.get_mut(&addr.erase_tag())?; - let data_race = &mut this.memory.extra.data_race; + let data_race = &this.memory.extra.data_race; // Each futex-wake happens-before the end of the futex wait - data_race.validate_lock_release(&mut futex.data_race, current_thread); + if let Some(data_race) = data_race { + data_race.validate_lock_release(&mut futex.data_race, current_thread); + } let res = futex.waiters.pop_front().map(|waiter| { - data_race.validate_lock_acquire(&futex.data_race, waiter.thread); + if let Some(data_race) = data_race { + data_race.validate_lock_acquire(&futex.data_race, waiter.thread); + } waiter.thread }); res diff --git a/src/thread.rs b/src/thread.rs index 40cfd04d7923..5d783430417b 100644 --- a/src/thread.rs +++ b/src/thread.rs @@ -3,6 +3,7 @@ use std::cell::RefCell; use std::collections::hash_map::Entry; use std::convert::TryFrom; +use std::rc::Rc; use 
std::num::TryFromIntError; use std::time::{Duration, Instant, SystemTime}; @@ -327,7 +328,7 @@ impl<'mir, 'tcx: 'mir> ThreadManager<'mir, 'tcx> { } /// Mark that the active thread tries to join the thread with `joined_thread_id`. - fn join_thread(&mut self, joined_thread_id: ThreadId, data_race: &data_race::GlobalState) -> InterpResult<'tcx> { + fn join_thread(&mut self, joined_thread_id: ThreadId, data_race: &Option>) -> InterpResult<'tcx> { if self.threads[joined_thread_id].join_status != ThreadJoinStatus::Joinable { throw_ub_format!("trying to join a detached or already joined thread"); } @@ -351,9 +352,11 @@ impl<'mir, 'tcx: 'mir> ThreadManager<'mir, 'tcx> { self.active_thread, joined_thread_id ); - }else{ + } else { // The thread has already terminated - mark join happens-before - data_race.thread_joined(self.active_thread, joined_thread_id); + if let Some(data_race) = data_race { + data_race.thread_joined(self.active_thread, joined_thread_id); + } } Ok(()) } @@ -428,7 +431,7 @@ impl<'mir, 'tcx: 'mir> ThreadManager<'mir, 'tcx> { /// Wakes up threads joining on the active one and deallocates thread-local statics. /// The `AllocId` that can now be freed is returned. - fn thread_terminated(&mut self, data_race: &data_race::GlobalState) -> Vec { + fn thread_terminated(&mut self, data_race: &Option>) -> Vec { let mut free_tls_statics = Vec::new(); { let mut thread_local_statics = self.thread_local_alloc_ids.borrow_mut(); @@ -444,12 +447,16 @@ impl<'mir, 'tcx: 'mir> ThreadManager<'mir, 'tcx> { }); } // Set the thread into a terminated state in the data-race detector - data_race.thread_terminated(); + if let Some(data_race) = data_race { + data_race.thread_terminated(); + } // Check if we need to unblock any threads. 
for (i, thread) in self.threads.iter_enumerated_mut() { if thread.state == ThreadState::BlockedOnJoin(self.active_thread) { // The thread has terminated, mark happens-before edge to joining thread - data_race.thread_joined(i, self.active_thread); + if let Some(data_race) = data_race { + data_race.thread_joined(i, self.active_thread); + } trace!("unblocking {:?} because {:?} terminated", i, self.active_thread); thread.state = ThreadState::Enabled; } @@ -463,7 +470,7 @@ impl<'mir, 'tcx: 'mir> ThreadManager<'mir, 'tcx> { /// used in stateless model checkers such as Loom: run the active thread as /// long as we can and switch only when we have to (the active thread was /// blocked, terminated, or has explicitly asked to be preempted). - fn schedule(&mut self, data_race: &data_race::GlobalState) -> InterpResult<'tcx, SchedulingAction> { + fn schedule(&mut self, data_race: &Option>) -> InterpResult<'tcx, SchedulingAction> { // Check whether the thread has **just** terminated (`check_terminated` // checks whether the thread has popped all its stack and if yes, sets // the thread state to terminated). 
@@ -508,7 +515,9 @@ impl<'mir, 'tcx: 'mir> ThreadManager<'mir, 'tcx> { if thread.state == ThreadState::Enabled { if !self.yield_active_thread || id != self.active_thread { self.active_thread = id; - data_race.thread_set_active(self.active_thread); + if let Some(data_race) = data_race { + data_race.thread_set_active(self.active_thread); + } break; } } @@ -563,7 +572,9 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriEvalContextExt<'mir, 'tcx fn create_thread(&mut self) -> ThreadId { let this = self.eval_context_mut(); let id = this.machine.threads.create_thread(); - this.memory.extra.data_race.thread_created(id); + if let Some(data_race) = &this.memory.extra.data_race { + data_race.thread_created(id); + } id } @@ -576,7 +587,7 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriEvalContextExt<'mir, 'tcx #[inline] fn join_thread(&mut self, joined_thread_id: ThreadId) -> InterpResult<'tcx> { let this = self.eval_context_mut(); - let data_race = &*this.memory.extra.data_race; + let data_race = &this.memory.extra.data_race; this.machine.threads.join_thread(joined_thread_id, data_race)?; Ok(()) } @@ -584,7 +595,9 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriEvalContextExt<'mir, 'tcx #[inline] fn set_active_thread(&mut self, thread_id: ThreadId) -> ThreadId { let this = self.eval_context_mut(); - this.memory.extra.data_race.thread_set_active(thread_id); + if let Some(data_race) = &this.memory.extra.data_race { + data_race.thread_set_active(thread_id); + } this.machine.threads.set_active_thread_id(thread_id) } @@ -639,10 +652,12 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriEvalContextExt<'mir, 'tcx #[inline] fn set_active_thread_name(&mut self, new_thread_name: Vec<u8>) { let this = self.eval_context_mut(); - if let Ok(string) = String::from_utf8(new_thread_name.clone()) { - this.memory.extra.data_race.thread_set_name( - this.machine.threads.active_thread, string - ); + if let Some(data_race) = &this.memory.extra.data_race { + if let
Ok(string) = String::from_utf8(new_thread_name.clone()) { + data_race.thread_set_name( + this.machine.threads.active_thread, string + ); + } } this.machine.threads.set_thread_name(new_thread_name); } @@ -713,7 +728,7 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriEvalContextExt<'mir, 'tcx #[inline] fn schedule(&mut self) -> InterpResult<'tcx, SchedulingAction> { let this = self.eval_context_mut(); - let data_race = &*this.memory.extra.data_race; + let data_race = &this.memory.extra.data_race; this.machine.threads.schedule(data_race) } @@ -724,7 +739,7 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriEvalContextExt<'mir, 'tcx #[inline] fn thread_terminated(&mut self) -> InterpResult<'tcx> { let this = self.eval_context_mut(); - let data_race = &*this.memory.extra.data_race; + let data_race = &this.memory.extra.data_race; for alloc_id in this.machine.threads.thread_terminated(data_race) { let ptr = this.memory.global_base_pointer(alloc_id.into())?; this.memory.deallocate(ptr, None, MiriMemoryKind::Tls.into())?; diff --git a/src/vector_clock.rs b/src/vector_clock.rs index 8d05eb1b992b..110b278852d5 100644 --- a/src/vector_clock.rs +++ b/src/vector_clock.rs @@ -1,121 +1,132 @@ use std::{ fmt::{self, Debug}, cmp::Ordering, ops::Index, - num::TryFromIntError, convert::TryFrom, mem + convert::TryFrom, mem }; use smallvec::SmallVec; use rustc_index::vec::Idx; use rustc_data_structures::fx::FxHashMap; /// A vector clock index, this is associated with a thread id -/// but in some cases one vector index may be shared with -/// multiple thread ids. +/// but in some cases one vector index may be shared with +/// multiple thread ids if it is safe to do so.
#[derive(Clone, Copy, Debug, PartialOrd, Ord, PartialEq, Eq, Hash)] pub struct VectorIdx(u32); -impl VectorIdx{ +impl VectorIdx { + + #[inline(always)] pub fn to_u32(self) -> u32 { self.0 } + pub const MAX_INDEX: VectorIdx = VectorIdx(u32::MAX); + } impl Idx for VectorIdx { + + #[inline] fn new(idx: usize) -> Self { VectorIdx(u32::try_from(idx).unwrap()) } + #[inline] fn index(self) -> usize { usize::try_from(self.0).unwrap() } -} -impl TryFrom<u64> for VectorIdx { - type Error = TryFromIntError; - fn try_from(id: u64) -> Result<Self, Self::Error> { - u32::try_from(id).map(|id_u32| Self(id_u32)) - } } impl From<u32> for VectorIdx { + + #[inline] fn from(id: u32) -> Self { Self(id) } + } - -/// A sparse set of vector clocks, where each vector index -/// is associated with a vector clock. -/// This treats all vector clocks that have not been assigned -/// as equal to the all zero vector clocks -/// Is optimized for the common case where only 1 element is stored -/// in the set and the rest can be ignored, falling-back to -/// using an internal hash-map once more than 1 element is assigned -/// at any one time +/// A sparse mapping of vector index values to vector clocks, this +/// is optimized for the common case with only one element stored +/// inside the map. +/// This is used to store the set of currently active release +/// sequences at a given memory location, since RMW operations +/// allow for multiple release sequences to be active at once +/// and to be collapsed back to one active release sequence +/// once a non RMW atomic store operation occurs. +/// An all zero vector is considered to be equal to no +/// element stored internally since it will never be +/// stored and has no meaning as a release sequence +/// vector clock. #[derive(Clone)] -pub struct VSmallClockSet(VSmallClockSetInner); +pub struct VSmallClockMap(VSmallClockMapInner); #[derive(Clone)] -enum VSmallClockSetInner { +enum VSmallClockMapInner { + /// Zero or 1 vector elements, common - /// case for the sparse set.
+ /// case for the sparse set. /// The all zero vector clock is treated - /// as equal to the empty element + /// as equal to the empty element. Small(VectorIdx, VClock), - /// Hash-map of vector clocks + /// Hash-map of vector clocks. Large(FxHashMap<VectorIdx, VClock>) } -impl VSmallClockSet { +impl VSmallClockMap { /// Remove all clock vectors from the map, setting them - /// to the zero vector + /// to the zero vector. pub fn clear(&mut self) { match &mut self.0 { - VSmallClockSetInner::Small(_, clock) => { + VSmallClockMapInner::Small(_, clock) => { clock.set_zero_vector() } - VSmallClockSetInner::Large(hash_map) => { + VSmallClockMapInner::Large(hash_map) => { hash_map.clear(); } } } /// Remove all clock vectors except for the clock vector - /// stored at the given index, which is retained + /// stored at the given index, which is retained. pub fn retain_index(&mut self, index: VectorIdx) { match &mut self.0 { - VSmallClockSetInner::Small(small_idx, clock) => { + VSmallClockMapInner::Small(small_idx, clock) => { if index != *small_idx { + // The zero-vector is considered to equal - // the empty element + // the empty element. clock.set_zero_vector() } }, - VSmallClockSetInner::Large(hash_map) => { - hash_map.retain(|idx,_| { - *idx == index - }); + VSmallClockMapInner::Large(hash_map) => { + let value = hash_map.remove(&index).unwrap_or_default(); + self.0 = VSmallClockMapInner::Small(index, value); } } } /// Insert the vector clock into the associated vector - /// index + /// index. pub fn insert(&mut self, index: VectorIdx, clock: &VClock) { match &mut self.0 { - VSmallClockSetInner::Small(small_idx, small_clock) => { + VSmallClockMapInner::Small(small_idx, small_clock) => { if small_clock.is_zero_vector() { + *small_idx = index; small_clock.clone_from(clock); - }else if !clock.is_zero_vector() { + } else if !clock.is_zero_vector() { + + // Convert to using the hash-map representation.
let mut hash_map = FxHashMap::default(); hash_map.insert(*small_idx, mem::take(small_clock)); hash_map.insert(index, clock.clone()); - self.0 = VSmallClockSetInner::Large(hash_map); + self.0 = VSmallClockMapInner::Large(hash_map); } }, - VSmallClockSetInner::Large(hash_map) => { + VSmallClockMapInner::Large(hash_map) => { if !clock.is_zero_vector() { hash_map.insert(index, clock.clone()); } @@ -127,41 +138,44 @@ impl VSmallClockSet { /// vector index. pub fn get(&self, index: VectorIdx) -> Option<&VClock> { match &self.0 { - VSmallClockSetInner::Small(small_idx, small_clock) => { + VSmallClockMapInner::Small(small_idx, small_clock) => { if *small_idx == index && !small_clock.is_zero_vector() { Some(small_clock) - }else{ + } else { None } }, - VSmallClockSetInner::Large(hash_map) => { + VSmallClockMapInner::Large(hash_map) => { hash_map.get(&index) } } } } -impl Default for VSmallClockSet { +impl Default for VSmallClockMap { + #[inline] fn default() -> Self { - VSmallClockSet( - VSmallClockSetInner::Small(VectorIdx::new(0), VClock::default()) + VSmallClockMap( + VSmallClockMapInner::Small(VectorIdx::new(0), VClock::default()) ) } + } -impl Debug for VSmallClockSet { +impl Debug for VSmallClockMap { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { // Print the contents of the small vector clock set as the map - // of vector index to vector clock that they represent + // of vector index to vector clock that they represent. 
let mut map = f.debug_map(); match &self.0 { - VSmallClockSetInner::Small(small_idx, small_clock) => { + VSmallClockMapInner::Small(small_idx, small_clock) => { if !small_clock.is_zero_vector() { map.entry(&small_idx, &small_clock); } }, - VSmallClockSetInner::Large(hash_map) => { + VSmallClockMapInner::Large(hash_map) => { for (idx, elem) in hash_map.iter() { map.entry(idx, elem); } @@ -169,30 +183,35 @@ impl Debug for VSmallClockSet { } map.finish() } + } -impl PartialEq for VSmallClockSet { + + +impl PartialEq for VSmallClockMap { + fn eq(&self, other: &Self) -> bool { - use VSmallClockSetInner::*; + use VSmallClockMapInner::*; match (&self.0, &other.0) { (Small(i1, c1), Small(i2, c2)) => { if c1.is_zero_vector() { // Either they are both zero or they are non-equal c2.is_zero_vector() - }else{ + } else { // At least one is non-zero, so the full comparison is correct i1 == i2 && c1 == c2 } } - (VSmallClockSetInner::Small(idx, clock), VSmallClockSetInner::Large(hash_map)) | - (VSmallClockSetInner::Large(hash_map), VSmallClockSetInner::Small(idx, clock)) => { + (Small(idx, clock), Large(hash_map)) | + (Large(hash_map), Small(idx, clock)) => { + if hash_map.len() == 0 { // Equal to the empty hash-map clock.is_zero_vector() - }else if hash_map.len() == 1 { + } else if hash_map.len() == 1 { // Equal to the hash-map with one element let (hash_idx, hash_clock) = hash_map.iter().next().unwrap(); hash_idx == idx && hash_clock == clock - }else{ + } else { false } } @@ -201,32 +220,38 @@ impl PartialEq for VSmallClockSet { } } } + } -impl Eq for VSmallClockSet {} + +impl Eq for VSmallClockMap {} /// The size of the vector-clock to store inline -/// clock vectors larger than this will be stored on the heap +/// clock vectors larger than this will be stored on the heap const SMALL_VECTOR: usize = 4; /// The type of the time-stamps recorded in the data-race detector -/// set to a type of unsigned integer +/// set to a type of unsigned integer pub type VTimestamp = u32; -/// A 
vector clock for detecting data-races -/// invariants: -/// - the last element in a VClock must not be 0 -/// -- this means that derive(PartialEq & Eq) is correct -/// -- as there is no implicit zero tail that might be equal -/// -- also simplifies the implementation of PartialOrd +/// A vector clock for detecting data-races, this is conceptually +/// a map from a vector index (and thus a thread id) to a timestamp. +/// The compare operations require the invariant that the last +/// element in the internal timestamp slice must not be a 0, hence +/// all zero vector clocks are always represented by the empty slice; +/// and allows for the implementation of compare operations to short +/// circuit the calculation and return the correct result faster, +/// also this means that there is only one unique valid length +/// for each set of vector clock values and hence the PartialEq +/// and Eq derivations are correct. #[derive(PartialEq, Eq, Default, Debug)] pub struct VClock(SmallVec<[VTimestamp; SMALL_VECTOR]>); impl VClock { /// Create a new vector-clock containing all zeros except - /// for a value at the given index + /// for a value at the given index pub fn new_with_index(index: VectorIdx, timestamp: VTimestamp) -> VClock { let len = index.index() + 1; let mut vec = smallvec::smallvec![0; len]; @@ -241,8 +266,8 @@ impl VClock { } /// Get a mutable slice to the internal vector with minimum `min_len` - /// elements, to preserve invariants this vector must modify - /// the `min_len`-1 nth element to a non-zero value + /// elements, to preserve invariants this vector must modify + /// the `min_len`-1 nth element to a non-zero value #[inline] fn get_mut_with_min_len(&mut self, min_len: usize) -> &mut [VTimestamp] { if self.0.len() < min_len { @@ -253,7 +278,7 @@ impl VClock { } /// Increment the vector clock at a known index - /// this will panic if the vector index overflows + /// this will panic if the vector index overflows #[inline] pub fn increment_index(&mut
self, idx: VectorIdx) { let idx = idx.index(); @@ -263,8 +288,8 @@ impl VClock { } // Join the two vector-clocks together, this - // sets each vector-element to the maximum value - // of that element in either of the two source elements. + // sets each vector-element to the maximum value + // of that element in either of the two source elements. pub fn join(&mut self, other: &Self) { let rhs_slice = other.as_slice(); let lhs_slice = self.get_mut_with_min_len(rhs_slice.len()); @@ -291,30 +316,43 @@ impl VClock { pub fn is_zero_vector(&self) -> bool { self.0.is_empty() } + } impl Clone for VClock { + fn clone(&self) -> Self { VClock(self.0.clone()) } + + // Optimized clone-from, can be removed + // and replaced with a derive once a similar + // optimization is inserted into SmallVec's + // clone implementation. fn clone_from(&mut self, source: &Self) { let source_slice = source.as_slice(); self.0.clear(); self.0.extend_from_slice(source_slice); } + } impl PartialOrd for VClock { + fn partial_cmp(&self, other: &VClock) -> Option<Ordering> { // Load the values as slices let lhs_slice = self.as_slice(); let rhs_slice = other.as_slice(); - // Iterate through the combined vector slice - // keeping track of the order that is currently possible to satisfy. - // If an ordering relation is detected to be impossible, then bail and - // directly return None + // Iterate through the combined vector slice continuously updating + // the value of `order` to the current comparison of the vector from + // index 0 to the currently checked index. + // An Equal ordering can be converted into Less or Greater ordering + // on finding an element that is less than or greater than the other + // but if one Greater and one Less element-wise comparison is found + // then no ordering is possible and so directly return an ordering + // of None.
let mut iter = lhs_slice.iter().zip(rhs_slice.iter()); let mut order = match iter.next() { Some((lhs, rhs)) => lhs.cmp(rhs), @@ -332,23 +370,23 @@ impl PartialOrd for VClock { } } - //Now test if either left or right have trailing elements + // Now test if either left or right have trailing elements, // by the invariant the trailing elements have at least 1 // non zero value, so no additional calculation is required - // to determine the result of the PartialOrder + // to determine the result of the PartialOrder. let l_len = lhs_slice.len(); let r_len = rhs_slice.len(); match l_len.cmp(&r_len) { - // Equal has no additional elements: return current order + // Equal means no additional elements: return current order Ordering::Equal => Some(order), // Right has at least 1 element > than the implicit 0, - // so the only valid values are Ordering::Less or None + // so the only valid values are Ordering::Less or None. Ordering::Less => match order { Ordering::Less | Ordering::Equal => Some(Ordering::Less), Ordering::Greater => None } // Left has at least 1 element > than the implicit 0, - // so the only valid values are Ordering::Greater or None + // so the only valid values are Ordering::Greater or None. Ordering::Greater => match order { Ordering::Greater | Ordering::Equal => Some(Ordering::Greater), Ordering::Less => None @@ -362,28 +400,28 @@ impl PartialOrd for VClock { let rhs_slice = other.as_slice(); // If l_len > r_len then at least one element - // in l_len is > than r_len, therefore the result - // is either Some(Greater) or None, so return false - // early. + // in l_len is > than r_len, therefore the result + // is either Some(Greater) or None, so return false + // early. 
let l_len = lhs_slice.len(); let r_len = rhs_slice.len(); if l_len <= r_len { // If any elements on the left are greater than the right - // then the result is None or Some(Greater), both of which - // return false, the earlier test asserts that no elements in the - // extended tail violate this assumption. Otherwise l <= r, finally - // the case where the values are potentially equal needs to be considered - // and false returned as well + // then the result is None or Some(Greater), both of which + // return false, the earlier test asserts that no elements in the + // extended tail violate this assumption. Otherwise l <= r, finally + // the case where the values are potentially equal needs to be considered + // and false returned as well let mut equal = l_len == r_len; for (&l, &r) in lhs_slice.iter().zip(rhs_slice.iter()) { if l > r { return false - }else if l < r { + } else if l < r { equal = false; } } !equal - }else{ + } else { false } } @@ -394,18 +432,18 @@ impl PartialOrd for VClock { let rhs_slice = other.as_slice(); // If l_len > r_len then at least one element - // in l_len is > than r_len, therefore the result - // is either Some(Greater) or None, so return false - // early. + // in l_len is > than r_len, therefore the result + // is either Some(Greater) or None, so return false + // early. let l_len = lhs_slice.len(); let r_len = rhs_slice.len(); if l_len <= r_len { // If any elements on the left are greater than the right - // then the result is None or Some(Greater), both of which - // return false, the earlier test asserts that no elements in the - // extended tail violate this assumption. Otherwise l <= r + // then the result is None or Some(Greater), both of which + // return false, the earlier test asserts that no elements in the + // extended tail violate this assumption. 
Otherwise l <= r !lhs_slice.iter().zip(rhs_slice.iter()).any(|(&l, &r)| l > r) - }else{ + } else { false } } @@ -416,28 +454,28 @@ impl PartialOrd for VClock { let rhs_slice = other.as_slice(); // If r_len > l_len then at least one element - // in r_len is > than l_len, therefore the result - // is either Some(Less) or None, so return false - // early. + // in r_len is > than l_len, therefore the result + // is either Some(Less) or None, so return false + // early. let l_len = lhs_slice.len(); let r_len = rhs_slice.len(); if l_len >= r_len { // If any elements on the left are less than the right - // then the result is None or Some(Less), both of which - // return false, the earlier test asserts that no elements in the - // extended tail violate this assumption. Otherwise l >=, finally - // the case where the values are potentially equal needs to be considered - // and false returned as well + // then the result is None or Some(Less), both of which + // return false, the earlier test asserts that no elements in the + // extended tail violate this assumption. Otherwise l >=, finally + // the case where the values are potentially equal needs to be considered + // and false returned as well let mut equal = l_len == r_len; for (&l, &r) in lhs_slice.iter().zip(rhs_slice.iter()) { if l < r { return false - }else if l > r { + } else if l > r { equal = false; } } !equal - }else{ + } else { false } } @@ -448,30 +486,33 @@ impl PartialOrd for VClock { let rhs_slice = other.as_slice(); // If r_len > l_len then at least one element - // in r_len is > than l_len, therefore the result - // is either Some(Less) or None, so return false - // early. + // in r_len is > than l_len, therefore the result + // is either Some(Less) or None, so return false + // early. 
let l_len = lhs_slice.len(); let r_len = rhs_slice.len(); if l_len >= r_len { // If any elements on the left are less than the right - // then the result is None or Some(Less), both of which - // return false, the earlier test asserts that no elements in the - // extended tail violate this assumption. Otherwise l >= r + // then the result is None or Some(Less), both of which + // return false, the earlier test asserts that no elements in the + // extended tail violate this assumption. Otherwise l >= r !lhs_slice.iter().zip(rhs_slice.iter()).any(|(&l, &r)| l < r) - }else{ + } else { false } } + } impl Index<VectorIdx> for VClock { + type Output = VTimestamp; #[inline] fn index(&self, index: VectorIdx) -> &VTimestamp { self.as_slice().get(index.to_u32() as usize).unwrap_or(&0) } + } @@ -480,7 +521,8 @@ impl Index<VectorIdx> for VClock { /// test suite #[cfg(test)] mod tests { - use super::{VClock, VTimestamp, VectorIdx, VSmallClockSet}; + + use super::{VClock, VTimestamp, VectorIdx, VSmallClockMap}; use std::cmp::Ordering; #[test] @@ -536,7 +578,7 @@ mod tests { let alt_compare = r.partial_cmp(&l); assert_eq!(alt_compare, o.map(Ordering::reverse), "Invalid alt comparison\n l: {:?}\n r: {:?}",l,r); - //Test operatorsm with faster implementations + //Test operators with faster implementations assert_eq!( matches!(compare,Some(Ordering::Less)), l < r, "Invalid (<):\n l: {:?}\n r: {:?}",l,r @@ -573,30 +615,31 @@ mod tests { #[test] pub fn test_vclock_set() { - let mut set = VSmallClockSet::default(); + let mut map = VSmallClockMap::default(); let v1 = from_slice(&[3,0,1]); let v2 = from_slice(&[4,2,3]); let v3 = from_slice(&[4,8,3]); - set.insert(VectorIdx(0), &v1); - assert_eq!(set.get(VectorIdx(0)), Some(&v1)); - set.insert(VectorIdx(5), &v2); - assert_eq!(set.get(VectorIdx(0)), Some(&v1)); - assert_eq!(set.get(VectorIdx(5)), Some(&v2)); - set.insert(VectorIdx(53), &v3); - assert_eq!(set.get(VectorIdx(0)), Some(&v1)); - assert_eq!(set.get(VectorIdx(5)), Some(&v2)); -
assert_eq!(set.get(VectorIdx(53)), Some(&v3)); - set.retain_index(VectorIdx(53)); - assert_eq!(set.get(VectorIdx(0)), None); - assert_eq!(set.get(VectorIdx(5)), None); - assert_eq!(set.get(VectorIdx(53)), Some(&v3)); - set.clear(); - assert_eq!(set.get(VectorIdx(0)), None); - assert_eq!(set.get(VectorIdx(5)), None); - assert_eq!(set.get(VectorIdx(53)), None); - set.insert(VectorIdx(53), &v3); - assert_eq!(set.get(VectorIdx(0)), None); - assert_eq!(set.get(VectorIdx(5)), None); - assert_eq!(set.get(VectorIdx(53)), Some(&v3)); + map.insert(VectorIdx(0), &v1); + assert_eq!(map.get(VectorIdx(0)), Some(&v1)); + map.insert(VectorIdx(5), &v2); + assert_eq!(map.get(VectorIdx(0)), Some(&v1)); + assert_eq!(map.get(VectorIdx(5)), Some(&v2)); + map.insert(VectorIdx(53), &v3); + assert_eq!(map.get(VectorIdx(0)), Some(&v1)); + assert_eq!(map.get(VectorIdx(5)), Some(&v2)); + assert_eq!(map.get(VectorIdx(53)), Some(&v3)); + map.retain_index(VectorIdx(53)); + assert_eq!(map.get(VectorIdx(0)), None); + assert_eq!(map.get(VectorIdx(5)), None); + assert_eq!(map.get(VectorIdx(53)), Some(&v3)); + map.clear(); + assert_eq!(map.get(VectorIdx(0)), None); + assert_eq!(map.get(VectorIdx(5)), None); + assert_eq!(map.get(VectorIdx(53)), None); + map.insert(VectorIdx(53), &v3); + assert_eq!(map.get(VectorIdx(0)), None); + assert_eq!(map.get(VectorIdx(5)), None); + assert_eq!(map.get(VectorIdx(53)), Some(&v3)); } + } diff --git a/tests/run-pass/concurrency/data_race.stderr b/tests/run-pass/concurrency/data_race.stderr index b01247aea4e0..7ba8087a9b4b 100644 --- a/tests/run-pass/concurrency/data_race.stderr +++ b/tests/run-pass/concurrency/data_race.stderr @@ -1,2 +1,2 @@ -warning: thread support is experimental. +warning: thread support is experimental, no weak memory effects are currently emulated. 
diff --git a/tests/run-pass/concurrency/linux-futex.stderr b/tests/run-pass/concurrency/linux-futex.stderr index b01247aea4e0..7ba8087a9b4b 100644 --- a/tests/run-pass/concurrency/linux-futex.stderr +++ b/tests/run-pass/concurrency/linux-futex.stderr @@ -1,2 +1,2 @@ -warning: thread support is experimental. +warning: thread support is experimental, no weak memory effects are currently emulated. diff --git a/tests/run-pass/concurrency/simple.stderr b/tests/run-pass/concurrency/simple.stderr index f1550dd25aa0..24444fdc17c1 100644 --- a/tests/run-pass/concurrency/simple.stderr +++ b/tests/run-pass/concurrency/simple.stderr @@ -1,4 +1,4 @@ -warning: thread support is experimental. +warning: thread support is experimental, no weak memory effects are currently emulated. thread '<unnamed>' panicked at 'Hello!', $DIR/simple.rs:54:9 note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace diff --git a/tests/run-pass/concurrency/sync.stderr b/tests/run-pass/concurrency/sync.stderr index b01247aea4e0..7ba8087a9b4b 100644 --- a/tests/run-pass/concurrency/sync.stderr +++ b/tests/run-pass/concurrency/sync.stderr @@ -1,2 +1,2 @@ -warning: thread support is experimental. +warning: thread support is experimental, no weak memory effects are currently emulated. diff --git a/tests/run-pass/concurrency/thread_locals.stderr b/tests/run-pass/concurrency/thread_locals.stderr index b01247aea4e0..7ba8087a9b4b 100644 --- a/tests/run-pass/concurrency/thread_locals.stderr +++ b/tests/run-pass/concurrency/thread_locals.stderr @@ -1,2 +1,2 @@ -warning: thread support is experimental. +warning: thread support is experimental, no weak memory effects are currently emulated. diff --git a/tests/run-pass/concurrency/tls_lib_drop.stderr b/tests/run-pass/concurrency/tls_lib_drop.stderr index b01247aea4e0..7ba8087a9b4b 100644 --- a/tests/run-pass/concurrency/tls_lib_drop.stderr +++ b/tests/run-pass/concurrency/tls_lib_drop.stderr @@ -1,2 +1,2 @@ -warning: thread support is experimental.
+warning: thread support is experimental, no weak memory effects are currently emulated. diff --git a/tests/run-pass/libc.stderr b/tests/run-pass/libc.stderr index b01247aea4e0..7ba8087a9b4b 100644 --- a/tests/run-pass/libc.stderr +++ b/tests/run-pass/libc.stderr @@ -1,2 +1,2 @@ -warning: thread support is experimental. +warning: thread support is experimental, no weak memory effects are currently emulated. diff --git a/tests/run-pass/panic/concurrent-panic.stderr b/tests/run-pass/panic/concurrent-panic.stderr index ca6031e57b40..885385a8dd93 100644 --- a/tests/run-pass/panic/concurrent-panic.stderr +++ b/tests/run-pass/panic/concurrent-panic.stderr @@ -1,4 +1,4 @@ -warning: thread support is experimental. +warning: thread support is experimental, no weak memory effects are currently emulated. Thread 1 starting, will block on mutex Thread 1 reported it has started