mirror of
https://github.com/wolfpld/tracy.git
synced 2024-11-22 14:44:34 +00:00
Use patchable rdtsc sequence to avoid slowdowns under rr
We (Julia) ship both support for using tracy to trace Julia applications, as well as using `rr` (https://github.com/rr-debugger/rr) for record-replay debugging. After our most recent rebuild of tracy, users have been reporting significant performance slowdowns when recording a session with `rr` that happens to also load the tracy library (even if tracing is not enabled). Upon further examination, the recompile happened to trigger a protective heuristic that disabled rr's patching of tracy's use of `rdtsc` because an earlier part of the same function happened to look like a conditional branch into the patch region. See https://github.com/rr-debugger/rr/pull/3580 for details. To avoid this issue occurring again in future rebuilds of tracy, adjust tracy's `rdtsc` sequence to be `nopl; rdtsc`, which (as of the linked PR) is a sequence that is guaranteed to bypass this heuristic and not incur the additional overhead when run under rr. This functionality is kept behind a compile-time flag `TRACY_PATCHABLE_NOPSLEDS` in order to avoid polluting the instruction cache unnecessarily.
This commit is contained in:
parent
60a3a85069
commit
5417227e83
@ -78,6 +78,7 @@ set_option(TRACY_NO_VERIFY "Disable zone validation for C API" OFF)
|
||||
set_option(TRACY_NO_VSYNC_CAPTURE "Disable capture of hardware Vsync events" OFF)
|
||||
set_option(TRACY_NO_FRAME_IMAGE "Disable the frame image support and its thread" OFF)
|
||||
set_option(TRACY_NO_SYSTEM_TRACING "Disable systrace sampling" OFF)
|
||||
set_option(TRACY_PATCHABLE_NOPSLEDS "Enable nopsleds for efficient patching by system-level tools (e.g. rr)" OFF)
|
||||
set_option(TRACY_DELAYED_INIT "Enable delayed initialization of the library (init on first call)" OFF)
|
||||
set_option(TRACY_MANUAL_LIFETIME "Enable the manual lifetime management of the profile" OFF)
|
||||
set_option(TRACY_FIBERS "Enable fibers support" OFF)
|
||||
|
@ -64,6 +64,10 @@ if get_option('tracy_no_system_tracing')
|
||||
add_project_arguments('-DTRACY_NO_SYSTEM_TRACING', language : 'cpp')
|
||||
endif
|
||||
|
||||
if get_option('tracy_no_extra_nopsleds')
|
||||
add_project_arguments('-DTRACY_PATCHABLE_NOPSLEDS', language : 'cpp')
|
||||
endif
|
||||
|
||||
if get_option('tracy_delayed_init')
|
||||
add_project_arguments('-DTRACY_DELAYED_INIT', language : 'cpp')
|
||||
endif
|
||||
|
@ -209,7 +209,22 @@ public:
|
||||
if( HardwareSupportsInvariantTSC() )
|
||||
{
|
||||
uint64_t rax, rdx;
|
||||
#ifdef TRACY_PATCHABLE_NOPSLEDS
|
||||
// Some external tooling (such as rr) wants to patch our rdtsc and replace it by a
|
||||
// branch to control the external input seen by a program. This kind of patching is
|
||||
// not generally possible depending on the surrounding code and can lead to significant
|
||||
// slowdowns if the compiler generated unlucky code and rr and tracy are used together.
|
||||
// To avoid this, use the rr-safe `nopl 0(%rax, %rax, 1); rdtsc` instruction sequence,
|
||||
// which rr promises will be patchable independent of the surrounding code.
|
||||
asm volatile (
|
||||
// This is nopl 0(%rax, %rax, 1), but assemblers are inconsistent about whether
|
||||
// they emit that as a 4 or 5 byte sequence and we need to be guaranteed to use
|
||||
// the 5 byte one.
|
||||
".byte 0x0f, 0x1f, 0x44, 0x00, 0x00\n\t"
|
||||
"rdtsc" : "=a" (rax), "=d" (rdx) );
|
||||
#else
|
||||
asm volatile ( "rdtsc" : "=a" (rax), "=d" (rdx) );
|
||||
#endif
|
||||
return (int64_t)(( rdx << 32 ) + rax);
|
||||
}
|
||||
# else
|
||||
|
Loading…
Reference in New Issue
Block a user