mirror of
https://github.com/wolfpld/tracy.git
synced 2024-11-10 02:31:48 +00:00
Move disassembly from FAQ to manual.
This commit is contained in:
parent
c774534b47
commit
411e4d42ac
36
FAQ.md
36
FAQ.md
@ -39,39 +39,3 @@ Welp. But there's mobile support.
|
||||
### I do need console support.
|
||||
|
||||
The code is open. Write your own, then send a patch.
|
||||
|
||||
### I don't believe you can capture a zone in 15 ns. Show me the code!
|
||||
|
||||
The following is the annotated assembly code (generated from C++ sources) that's responsible for logging the start of the zone:
|
||||
|
||||
```
|
||||
call qword ptr [__imp_GetCurrentThreadId]
|
||||
mov r14d,eax
|
||||
mov qword ptr [rsp+0F0h],r14 // save thread id for later use
|
||||
mov r12d,10h
|
||||
mov rax,qword ptr gs:[58h] // TLS
|
||||
mov r15,qword ptr [rax] // queue address
|
||||
mov rdi,qword ptr [r12+r15] // data address
|
||||
mov rbp,qword ptr [rdi+20h] // buffer counter
|
||||
mov rbx,rbp
|
||||
and ebx,7Fh // 128 item buffer
|
||||
jne Application::InnerLoop+66h --+
|
||||
mov rdx,rbp |
|
||||
mov rcx,rdi |
|
||||
call enqueue_begin_alloc | // reclaim/alloc next buffer
|
||||
shl rbx,5 <---------------------+ // buffer items are 32 bytes
|
||||
add rbx,qword ptr [rdi+40h]
|
||||
mov byte ptr [rbx],4 // queue item type
|
||||
rdtscp
|
||||
mov dword ptr [rbx+19h],ecx // cpu id
|
||||
shl rdx,20h
|
||||
or rax,rdx // 64 bit timestamp
|
||||
mov qword ptr [rbx+1],rax
|
||||
mov qword ptr [rbx+9],r14 // thread id
|
||||
lea rax,[__tracy_source_location] // static struct address
|
||||
mov qword ptr [rbx+11h],rax
|
||||
lea rax,[rbp+1] // increment buffer counter
|
||||
mov qword ptr [rdi+20h],rax
|
||||
```
|
||||
|
||||
There's also a second code block, for the end of the zone. It's similar, but a bit smaller, as it can use some of the variables that were retrieved above.
|
||||
|
@ -229,7 +229,7 @@ In Tracy terminology, the profiled application is a \emph{client} and the profil
|
||||
|
||||
To check how much slowdown is introduced by using Tracy, let's profile an example application. For this purpose we will use etcpak\footnote{\url{https://bitbucket.org/wolfpld/etcpak}}. Let's use an $8192 \times 8192$ pixel test image as input data and instrument everything down to the $4 \times 4$ pixel block compression function (that's 4 million blocks to compress).
|
||||
|
||||
The resulting timing information can be seen in table~\ref{PerformanceImpact}. As can be seen, the cost of a single-zone capture (consisting of the zone begin and zone end events) is \textasciitilde 15 \si{\nano\second}.
|
||||
The resulting timing information is presented in table~\ref{PerformanceImpact}. As can be seen, the cost of a single-zone capture (consisting of the zone begin and zone end events) is \textasciitilde 15 \si{\nano\second}.
|
||||
|
||||
\begin{table}[h]
|
||||
\centering
|
||||
@ -244,6 +244,38 @@ ETC2 + mip-maps & \num{5592822} & 1.034 \si{\second} & 1.119 \si{\second} & +0.0
|
||||
|
||||
It should be noted that Tracy has a constant initialization cost, needed to perform timer calibration. This cost was subtracted from the profiling run times, as it is irrelevant to the single-zone capture time.
|
||||
|
||||
\subsubsection{Assembly analysis}
|
||||
|
||||
To see how such a small overhead (only 15 \si{\nano\second}) is achieved, let's take a look at the assembly. The following x64 code is responsible for logging the start of a zone. Do note that it is generated by compiling fully portable C++.
|
||||
|
||||
\begin{lstlisting}[language={[x86masm]Assembler}]
|
||||
mov byte ptr [rsp+0C0h],1 ; store zone activity information
|
||||
mov r15d,28h
|
||||
mov rax,qword ptr gs:[58h] ; TLS
|
||||
mov r14,qword ptr [rax] ; queue address
|
||||
mov rdi,qword ptr [r15+r14] ; data address
|
||||
mov rbp,qword ptr [rdi+28h] ; buffer counter
|
||||
mov rbx,rbp
|
||||
and ebx,7Fh ; 128 item buffer
|
||||
jne function+54h -----------+ ; check if current buffer is usable
|
||||
mov rdx,rbp |
|
||||
mov rcx,rdi |
|
||||
call enqueue_begin_alloc | ; reclaim/alloc next buffer
|
||||
shl rbx,5 <-----------------+ ; buffer items are 32 bytes
|
||||
add rbx,qword ptr [rdi+48h] ; calculate queue item address
|
||||
mov byte ptr [rbx],10h ; queue item type
|
||||
rdtsc ; retrieve time
|
||||
shl rdx,20h
|
||||
or rax,rdx ; construct 64 bit timestamp
|
||||
mov qword ptr [rbx+1],rax ; write timestamp
|
||||
lea rax,[__tracy_source_location] ; static struct address
|
||||
mov qword ptr [rbx+9],rax ; write source location data
|
||||
lea rax,[rbp+1] ; increment buffer counter
|
||||
mov qword ptr [rdi+28h],rax ; write buffer counter
|
||||
\end{lstlisting}
|
||||
|
||||
The second code block, responsible for ending a zone, is similar, but smaller, as it can reuse some variables retrieved in the above code.
|
||||
|
||||
\subsection{On the web}
|
||||
|
||||
Tracy can be found at the following web addresses:
|
||||
|
Loading…
Reference in New Issue
Block a user