Move disassembly from FAQ to manual.

This commit is contained in:
Bartosz Taudul 2019-10-20 21:23:16 +02:00
parent c774534b47
commit 411e4d42ac
2 changed files with 33 additions and 37 deletions

36
FAQ.md
View File

@ -39,39 +39,3 @@ Welp. But there's mobile support.
### I do need console support.
The code is open. Write your own, then send a patch.
### I don't believe you can capture a zone in 15 ns. Show me the code!
The following is the annotated assembly code (generated from C++ sources) that's responsible for logging the start of the zone:
```
call qword ptr [__imp_GetCurrentThreadId]
mov r14d,eax
mov qword ptr [rsp+0F0h],r14 // save thread id for later use
mov r12d,10h
mov rax,qword ptr gs:[58h] // TLS
mov r15,qword ptr [rax] // queue address
mov rdi,qword ptr [r12+r15] // data address
mov rbp,qword ptr [rdi+20h] // buffer counter
mov rbx,rbp
and ebx,7Fh // 128 item buffer
jne Application::InnerLoop+66h --+
mov rdx,rbp |
mov rcx,rdi |
call enqueue_begin_alloc | // reclaim/alloc next buffer
shl rbx,5 <---------------------+ // buffer items are 32 bytes
add rbx,qword ptr [rdi+40h]
mov byte ptr [rbx],4 // queue item type
rdtscp
mov dword ptr [rbx+19h],ecx // cpu id
shl rdx,20h
or rax,rdx // 64 bit timestamp
mov qword ptr [rbx+1],rax
mov qword ptr [rbx+9],r14 // thread id
lea rax,[__tracy_source_location] // static struct address
mov qword ptr [rbx+11h],rax
lea rax,[rbp+1] // increment buffer counter
mov qword ptr [rdi+20h],rax
```
There's also a second code block, for the end of the zone. It's similar, but a bit smaller, as it can use some of the variables that were retrieved above.

View File

@ -229,7 +229,7 @@ In Tracy terminology, the profiled application is a \emph{client} and the profil
To check how much slowdown is introduced by using Tracy, let's profile an example application. For this purpose we will use etcpak\footnote{\url{https://bitbucket.org/wolfpld/etcpak}}. Let's use an $8192 \times 8192$ pixel test image as input data and instrument everything down to the $4 \times 4$ pixel block compression function (that's 4 million blocks to compress).
The resulting timing information can be seen in table~\ref{PerformanceImpact}. As can be seen, the cost of a single-zone capture (consisting of the zone begin and zone end events) is \textasciitilde 15 \si{\nano\second}.
The resulting timing information is presented in table~\ref{PerformanceImpact}. As can be seen, the cost of a single-zone capture (consisting of the zone begin and zone end events) is \textasciitilde 15 \si{\nano\second}.
\begin{table}[h]
\centering
@ -244,6 +244,38 @@ ETC2 + mip-maps & \num{5592822} & 1.034 \si{\second} & 1.119 \si{\second} & +0.0
It should be noted that Tracy has a constant initialization cost, needed to perform timer calibration. This cost was subtracted from the profiling run times, as it is irrelevant to the single-zone capture time.
\subsubsection{Assembly analysis}
To see how such a small overhead (only 15 \si{\nano\second}) is achieved, let's take a look at the assembly. The following x64 code is responsible for logging the start of a zone. Note that it is generated by compiling fully portable C++.
\begin{lstlisting}[language={[x86masm]Assembler}]
mov byte ptr [rsp+0C0h],1 ; store zone activity information
mov r15d,28h
mov rax,qword ptr gs:[58h] ; TLS
mov r14,qword ptr [rax] ; queue address
mov rdi,qword ptr [r15+r14] ; data address
mov rbp,qword ptr [rdi+28h] ; buffer counter
mov rbx,rbp
and ebx,7Fh ; 128 item buffer
jne function+54h -----------+ ; check if current buffer is usable
mov rdx,rbp |
mov rcx,rdi |
call enqueue_begin_alloc | ; reclaim/alloc next buffer
shl rbx,5 <-----------------+ ; buffer items are 32 bytes
add rbx,qword ptr [rdi+48h] ; calculate queue item address
mov byte ptr [rbx],10h ; queue item type
rdtsc ; retrieve time
shl rdx,20h
or rax,rdx ; construct 64 bit timestamp
mov qword ptr [rbx+1],rax ; write timestamp
lea rax,[__tracy_source_location] ; static struct address
mov qword ptr [rbx+9],rax ; write source location data
lea rax,[rbp+1] ; increment buffer counter
mov qword ptr [rdi+28h],rax ; write buffer counter
\end{lstlisting}
The second code block, responsible for ending a zone, is similar, but smaller, as it can reuse some variables retrieved in the above code.
\subsection{On the web}
Tracy can be found at the following web addresses: