Merge pull request #4524 from Stypox/tracing-chrome-overhead-rdtsc

Account for time spent tracing, use RDTSC for faster time
This commit is contained in:
Ralf Jung 2025-08-23 11:18:29 +00:00 committed by GitHub
commit d70fea9149
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 283 additions and 78 deletions

View file

@ -1,2 +1,3 @@
pub mod setup;
mod tracing_chrome;
mod tracing_chrome_instant;

View file

@ -7,12 +7,15 @@
//! (`git log -- path/to/tracing_chrome.rs`), but in summary:
//! - the file attributes were changed and `extern crate` was added at the top
//! - if a tracing span has a field called "tracing_separate_thread", it will be given a separate
//! span ID even in [TraceStyle::Threaded] mode, to make it appear on a separate line when viewing
//! the trace in <https://ui.perfetto.dev>. This is the syntax to trigger this behavior:
//! span ID even in [TraceStyle::Threaded] mode, to make it appear on a separate line when viewing
//! the trace in <https://ui.perfetto.dev>. This is the syntax to trigger this behavior:
//! ```rust
//! tracing::info_span!("my_span", tracing_separate_thread = tracing::field::Empty, /* ... */)
//! ```
//! - use i64 instead of u64 for the "id" in [ChromeLayer::get_root_id] to be compatible with Perfetto
//! - use i64 instead of u64 for the "id" in [ChromeLayer::get_root_id] to be compatible with
//! Perfetto
//! - use [ChromeLayer::with_elapsed_micros_subtracting_tracing] to make time measurements faster on
//! Linux x86/x86_64 and to subtract time spent tracing from the timestamps in the trace file
//!
//! Depending on the tracing-chrome crate from crates.io is unfortunately not possible, since it
//! depends on `tracing_core` which conflicts with rustc_private's `tracing_core` (meaning it would
@ -50,9 +53,22 @@ use std::{
thread::JoinHandle,
};
use crate::log::tracing_chrome_instant::TracingChromeInstant;
/// Contains thread-local data for threads that send tracing spans or events.
struct ThreadData {
/// A unique ID for this thread, will populate "tid" field in the output trace file.
tid: usize,
/// A clone of [ChromeLayer::out] to avoid the expensive operation of accessing a mutex
/// every time. This is used to send [Message]s to the thread that saves trace data to file.
out: Sender<Message>,
/// The instant in time this thread was started. All events happening on this thread will be
/// saved to the trace file with a timestamp (the "ts" field) measured relative to this instant.
start: TracingChromeInstant,
}
thread_local! {
static OUT: RefCell<Option<Sender<Message>>> = const { RefCell::new(None) };
static TID: RefCell<Option<usize>> = const { RefCell::new(None) };
static THREAD_DATA: RefCell<Option<ThreadData>> = const { RefCell::new(None) };
}
type NameFn<S> = Box<dyn Fn(&EventOrSpan<'_, '_, S>) -> String + Send + Sync>;
@ -64,7 +80,6 @@ where
S: Subscriber + for<'span> LookupSpan<'span> + Send + Sync,
{
out: Arc<Mutex<Sender<Message>>>,
start: std::time::Instant,
max_tid: AtomicUsize,
include_args: bool,
include_locations: bool,
@ -323,7 +338,6 @@ where
{
fn new(mut builder: ChromeLayerBuilder<S>) -> (ChromeLayer<S>, FlushGuard) {
let (tx, rx) = mpsc::channel();
OUT.with(|val| val.replace(Some(tx.clone())));
let out_writer = builder
.out_writer
@ -443,7 +457,6 @@ where
};
let layer = ChromeLayer {
out: Arc::new(Mutex::new(tx)),
start: std::time::Instant::now(),
max_tid: AtomicUsize::new(0),
name_fn: builder.name_fn.take(),
cat_fn: builder.cat_fn.take(),
@ -456,22 +469,7 @@ where
(layer, guard)
}
fn get_tid(&self) -> (usize, bool) {
TID.with(|value| {
let tid = *value.borrow();
match tid {
Some(tid) => (tid, false),
None => {
let tid = self.max_tid.fetch_add(1, Ordering::SeqCst);
value.replace(Some(tid));
(tid, true)
}
}
})
}
fn get_callsite(&self, data: EventOrSpan<S>) -> Callsite {
let (tid, new_thread) = self.get_tid();
fn get_callsite(&self, data: EventOrSpan<S>, tid: usize) -> Callsite {
let name = self.name_fn.as_ref().map(|name_fn| name_fn(&data));
let target = self.cat_fn.as_ref().map(|cat_fn| cat_fn(&data));
let meta = match data {
@ -502,14 +500,6 @@ where
(None, None)
};
if new_thread {
let name = match std::thread::current().name() {
Some(name) => name.to_owned(),
None => tid.to_string(),
};
self.send_message(Message::NewThread(tid, name));
}
Callsite {
tid,
name,
@ -548,31 +538,55 @@ where
}
}
fn enter_span(&self, span: SpanRef<S>, ts: f64) {
let callsite = self.get_callsite(EventOrSpan::Span(&span));
fn enter_span(&self, span: SpanRef<S>, ts: f64, tid: usize, out: &Sender<Message>) {
let callsite = self.get_callsite(EventOrSpan::Span(&span), tid);
let root_id = self.get_root_id(span);
self.send_message(Message::Enter(ts, callsite, root_id));
let _ignored = out.send(Message::Enter(ts, callsite, root_id));
}
fn exit_span(&self, span: SpanRef<S>, ts: f64) {
let callsite = self.get_callsite(EventOrSpan::Span(&span));
fn exit_span(&self, span: SpanRef<S>, ts: f64, tid: usize, out: &Sender<Message>) {
let callsite = self.get_callsite(EventOrSpan::Span(&span), tid);
let root_id = self.get_root_id(span);
self.send_message(Message::Exit(ts, callsite, root_id));
let _ignored = out.send(Message::Exit(ts, callsite, root_id));
}
fn get_ts(&self) -> f64 {
self.start.elapsed().as_nanos() as f64 / 1000.0
}
/// Helper function that measures how much time is spent while executing `f` and accounts for it
/// in subsequent calls, with the aim to reduce biases in the data collected by `tracing_chrome`
/// by subtracting the time spent inside tracing functions from the timeline. This makes it so
/// that the time spent inside the `tracing_chrome` functions does not impact the timestamps
/// inside the trace file (i.e. `ts`), even if such functions are slow (e.g. because they need
/// to format arguments on the same thread those arguments are collected on, otherwise memory
/// safety would be broken).
///
/// `f` is called with the microseconds elapsed since the current thread was started (**not**
/// since the program start!), with the current thread ID (i.e. `tid`), and with a [Sender] that
/// can be used to send a [Message] to the thread that collects [Message]s and saves them to the
/// trace file.
#[inline(always)]
fn with_elapsed_micros_subtracting_tracing(&self, f: impl Fn(f64, usize, &Sender<Message>)) {
THREAD_DATA.with(|value| {
let mut thread_data = value.borrow_mut();
let (ThreadData { tid, out, start }, new_thread) = match thread_data.as_mut() {
Some(thread_data) => (thread_data, false),
None => {
let tid = self.max_tid.fetch_add(1, Ordering::SeqCst);
let out = self.out.lock().unwrap().clone();
let start = TracingChromeInstant::setup_for_thread_and_start(tid);
*thread_data = Some(ThreadData { tid, out, start });
(thread_data.as_mut().unwrap(), true)
}
};
fn send_message(&self, message: Message) {
OUT.with(move |val| {
if val.borrow().is_some() {
let _ignored = val.borrow().as_ref().unwrap().send(message);
} else {
let out = self.out.lock().unwrap().clone();
let _ignored = out.send(message);
val.replace(Some(out));
}
start.with_elapsed_micros_subtracting_tracing(|ts| {
if new_thread {
let name = match std::thread::current().name() {
Some(name) => name.to_owned(),
None => tid.to_string(),
};
let _ignored = out.send(Message::NewThread(*tid, name));
}
f(ts, *tid, out);
});
});
}
}
@ -586,52 +600,58 @@ where
return;
}
let ts = self.get_ts();
self.enter_span(ctx.span(id).expect("Span not found."), ts);
self.with_elapsed_micros_subtracting_tracing(|ts, tid, out| {
self.enter_span(ctx.span(id).expect("Span not found."), ts, tid, out);
});
}
fn on_record(&self, id: &span::Id, values: &span::Record<'_>, ctx: Context<'_, S>) {
if self.include_args {
let span = ctx.span(id).unwrap();
let mut exts = span.extensions_mut();
self.with_elapsed_micros_subtracting_tracing(|_, _, _| {
let span = ctx.span(id).unwrap();
let mut exts = span.extensions_mut();
let args = exts.get_mut::<ArgsWrapper>();
let args = exts.get_mut::<ArgsWrapper>();
if let Some(args) = args {
let args = Arc::make_mut(&mut args.args);
values.record(&mut JsonVisitor { object: args });
}
if let Some(args) = args {
let args = Arc::make_mut(&mut args.args);
values.record(&mut JsonVisitor { object: args });
}
});
}
}
fn on_event(&self, event: &Event<'_>, _ctx: Context<'_, S>) {
let ts = self.get_ts();
let callsite = self.get_callsite(EventOrSpan::Event(event));
self.send_message(Message::Event(ts, callsite));
self.with_elapsed_micros_subtracting_tracing(|ts, tid, out| {
let callsite = self.get_callsite(EventOrSpan::Event(event), tid);
let _ignored = out.send(Message::Event(ts, callsite));
});
}
fn on_exit(&self, id: &span::Id, ctx: Context<'_, S>) {
if let TraceStyle::Async = self.trace_style {
return;
}
let ts = self.get_ts();
self.exit_span(ctx.span(id).expect("Span not found."), ts);
self.with_elapsed_micros_subtracting_tracing(|ts, tid, out| {
self.exit_span(ctx.span(id).expect("Span not found."), ts, tid, out);
});
}
fn on_new_span(&self, attrs: &span::Attributes<'_>, id: &span::Id, ctx: Context<'_, S>) {
if self.include_args {
let mut args = Object::new();
attrs.record(&mut JsonVisitor { object: &mut args });
ctx.span(id).unwrap().extensions_mut().insert(ArgsWrapper {
args: Arc::new(args),
});
}
if let TraceStyle::Threaded = self.trace_style {
return;
}
self.with_elapsed_micros_subtracting_tracing(|ts, tid, out| {
if self.include_args {
let mut args = Object::new();
attrs.record(&mut JsonVisitor { object: &mut args });
ctx.span(id).unwrap().extensions_mut().insert(ArgsWrapper {
args: Arc::new(args),
});
}
if let TraceStyle::Threaded = self.trace_style {
return;
}
let ts = self.get_ts();
self.enter_span(ctx.span(id).expect("Span not found."), ts);
self.enter_span(ctx.span(id).expect("Span not found."), ts, tid, out);
});
}
fn on_close(&self, id: span::Id, ctx: Context<'_, S>) {
@ -639,8 +659,9 @@ where
return;
}
let ts = self.get_ts();
self.exit_span(ctx.span(&id).expect("Span not found."), ts);
self.with_elapsed_micros_subtracting_tracing(|ts, tid, out| {
self.exit_span(ctx.span(&id).expect("Span not found."), ts, tid, out);
});
}
}

View file

@ -0,0 +1,183 @@
//! Code in this class was in part inspired by
//! <https://github.com/tikv/minstant/blob/27c9ec5ec90b5b67113a748a4defee0d2519518c/src/tsc_now.rs>.
//! A useful resource is also
//! <https://www.pingcap.com/blog/how-we-trace-a-kv-database-with-less-than-5-percent-performance-impact/>,
//! although this file does not implement TSC synchronization but insteads pins threads to CPUs,
//! since the former is not reliable (i.e. it might lead to non-monotonic time measurements).
//! Another useful resource for future improvements might be measureme's time measurement utils:
//! <https://github.com/rust-lang/measureme/blob/master/measureme/src/counters.rs>.
//! Documentation about how the Linux kernel chooses a clock source can be found here:
//! <https://btorpey.github.io/blog/2014/02/18/clock-sources-in-linux/>.
#![cfg(feature = "tracing")]
/// This alternative `TracingChromeInstant` implementation was made entirely to suit the needs of
/// [crate::log::tracing_chrome], and shouldn't be used for anything else. It featues two functions:
/// - [TracingChromeInstant::setup_for_thread_and_start], which sets up the current thread to do
/// proper time tracking and returns a point in time to use as "t=0", and
/// - [TracingChromeInstant::with_elapsed_micros_subtracting_tracing], which allows
/// obtaining how much time elapsed since [TracingChromeInstant::setup_for_thread_and_start] was
/// called while accounting for (and subtracting) the time spent inside tracing-related functions.
///
/// This measures time using [std::time::Instant], except for x86/x86_64 Linux machines, where
/// [std::time::Instant] is too slow (~1.5us) and thus `rdtsc` is used instead (~5ns).
pub enum TracingChromeInstant {
WallTime {
/// The time at which this instant was created, shifted forward to account
/// for time spent in tracing functions as explained in
/// [TracingChromeInstant::with_elapsed_micros_subtracting_tracing]'s comments.
start_instant: std::time::Instant,
},
#[cfg(all(target_os = "linux", any(target_arch = "x86", target_arch = "x86_64")))]
Tsc {
/// The value in the TSC counter when this instant was created, shifted forward to account
/// for time spent in tracing functions as explained in
/// [TracingChromeInstant::with_elapsed_micros_subtracting_tracing]'s comments.
start_tsc: u64,
/// The period of the TSC counter in microseconds.
tsc_to_microseconds: f64,
},
}
impl TracingChromeInstant {
/// Can be thought of as the same as [std::time::Instant::now()], but also does some setup to
/// make TSC stable in case TSC is available. This is supposed to be called (at most) once per
/// thread since the thread setup takes a few milliseconds.
///
/// WARNING: If TSC is available, `incremental_thread_id` is used to pick to which CPU to pin
/// the current thread. Thread IDs should be assigned contiguously starting from 0. Be aware
/// that the current thread will be restricted to one CPU for the rest of the execution!
pub fn setup_for_thread_and_start(incremental_thread_id: usize) -> TracingChromeInstant {
#[cfg(all(target_os = "linux", any(target_arch = "x86", target_arch = "x86_64")))]
if *tsc::IS_TSC_AVAILABLE.get_or_init(tsc::is_tsc_available) {
// We need to lock this thread to a specific CPU, because CPUs' TSC timers might be out
// of sync.
tsc::set_cpu_affinity(incremental_thread_id);
// Can only use tsc_to_microseconds() and rdtsc() after having set the CPU affinity!
// We compute tsc_to_microseconds anew for every new thread just in case some CPU core
// has a different TSC frequency.
let tsc_to_microseconds = tsc::tsc_to_microseconds();
let start_tsc = tsc::rdtsc();
return TracingChromeInstant::Tsc { start_tsc, tsc_to_microseconds };
}
let _ = incremental_thread_id; // otherwise we get a warning when the TSC branch is disabled
TracingChromeInstant::WallTime { start_instant: std::time::Instant::now() }
}
/// Calls `f` with the time elapsed in microseconds since this [TracingChromeInstant] was built
/// by [TracingChromeInstant::setup_for_thread_and_start], while subtracting all time previously
/// spent executing other `f`s passed to this function. This behavior allows subtracting time
/// spent in functions that log tracing data (which `f` is supposed to be) from the tracing time
/// measurements.
///
/// Note: microseconds are used as the time unit since that's what Chrome trace files should
/// contain, see the definition of the "ts" field in
/// <https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/preview>.
#[inline(always)]
pub fn with_elapsed_micros_subtracting_tracing(&mut self, f: impl Fn(f64)) {
match self {
TracingChromeInstant::WallTime { start_instant } => {
// Obtain the current time (before executing `f`).
let instant_before_f = std::time::Instant::now();
// Using the current time (`instant_before_f`) and the `start_instant` stored in
// `self`, calculate the elapsed time (in microseconds) since this instant was
// instantiated, accounting for any time that was previously spent executing `f`.
// The "accounting" part is not computed in this line, but is rather done by
// shifting forward the `start_instant` down below.
let ts = (instant_before_f - *start_instant).as_nanos() as f64 / 1000.0;
// Run the function (supposedly a function internal to the tracing infrastructure).
f(ts);
// Measure how much time was spent executing `f` and shift `start_instant` forward
// by that amount. This "removes" that time from the trace.
*start_instant += std::time::Instant::now() - instant_before_f;
}
#[cfg(all(target_os = "linux", any(target_arch = "x86", target_arch = "x86_64")))]
TracingChromeInstant::Tsc { start_tsc, tsc_to_microseconds } => {
// the comments above also apply here, since it's the same logic
let tsc_before_f = tsc::rdtsc();
let ts = ((tsc_before_f - *start_tsc) as f64) * (*tsc_to_microseconds);
f(ts);
*start_tsc += tsc::rdtsc() - tsc_before_f;
}
}
}
}
#[cfg(all(target_os = "linux", any(target_arch = "x86", target_arch = "x86_64")))]
mod tsc {
pub static IS_TSC_AVAILABLE: std::sync::OnceLock<bool> = std::sync::OnceLock::new();
/// Reads the timestamp-counter register. Will give monotonic answers only when called from the
/// same thread, because the TSC of different CPUs might be out of sync.
#[inline(always)]
pub(super) fn rdtsc() -> u64 {
#[cfg(target_arch = "x86")]
use core::arch::x86::_rdtsc;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::_rdtsc;
unsafe { _rdtsc() }
}
/// Estimates the frequency of the TSC counter by waiting 10ms in a busy loop and
/// looking at how much the TSC increased in the meantime.
pub(super) fn tsc_to_microseconds() -> f64 {
const BUSY_WAIT: std::time::Duration = std::time::Duration::from_millis(10);
let tsc_start = rdtsc();
let instant_start = std::time::Instant::now();
while instant_start.elapsed() < BUSY_WAIT {
// `thread::sleep()` is not very precise at waking up the program at the right time,
// so use a busy loop instead.
core::hint::spin_loop();
}
let tsc_end = rdtsc();
(BUSY_WAIT.as_nanos() as f64) / 1000.0 / ((tsc_end - tsc_start) as f64)
}
/// Checks whether the TSC counter is available and runs at a constant rate independently
/// of CPU frequency even across different power states of the CPU (i.e. checks for the
/// `invariant_tsc` CPUID flag).
pub(super) fn is_tsc_available() -> bool {
#[cfg(target_arch = "x86")]
use core::arch::x86::__cpuid;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::__cpuid;
// implemented like https://docs.rs/raw-cpuid/latest/src/raw_cpuid/extended.rs.html#965-967
const LEAF: u32 = 0x80000007; // this is the leaf for "advanced power management info"
let cpuid = unsafe { __cpuid(LEAF) };
(cpuid.edx & (1 << 8)) != 0 // EDX bit 8 indicates invariant TSC
}
/// Forces the current thread to run on a single CPU, which ensures the TSC counter is monotonic
/// (since TSCs of different CPUs might be out-of-sync). `incremental_thread_id` is used to pick
/// to which CPU to pin the current thread, and should be an incremental number that starts from
/// 0.
pub(super) fn set_cpu_affinity(incremental_thread_id: usize) {
let cpu_id = match std::thread::available_parallelism() {
Ok(available_parallelism) => incremental_thread_id % available_parallelism,
_ => panic!("Could not determine CPU count to properly set CPU affinity"),
};
let mut set = unsafe { std::mem::zeroed::<libc::cpu_set_t>() };
unsafe { libc::CPU_SET(cpu_id, &mut set) };
// Set the current thread's core affinity.
if unsafe {
libc::sched_setaffinity(
0, // Defaults to current thread
size_of::<libc::cpu_set_t>(),
&set as *const _,
)
} != 0
{
panic!("Could not set CPU affinity")
}
}
}