V8 logo CC BY-SA 3.0; NYSE trading floor by Kevin Hutchinson, CC BY 2.0; DualSense Edge by Evan-Amos, CC0; Neve 81 console CC BY-SA 4.0; Ford SYNC ECU FCC ID public submission. All via Wikimedia Commons.
Photo: Michael Himbeault, CC BY 2.0, via Wikimedia Commons.
How long does this take?
Seems simple.
Well, let's see...
/// hide
#include <chrono>
#include <print>
/// unhide
namespace sc = std::chrono;
int sum(const std::span<const int> v) {
int total = 0;
for (auto x : v) total += x;
return total;
}
int main() {
constexpr std::array data {1, 2, 3, 4, 5};
const auto start = sc::system_clock::now();
auto result = sum(data);
const auto end = sc::system_clock::now();
std::print("result={}, took {}\n",
result, end - start);
}
/// unhide
now()?
Spaceballs (1987) © Brooksfilms/MGM. Used under fair use for commentary/education. danoshinsky.com
std::chrono clocks
struct some_clock {
using rep = /* rep */;
using period = /* period per s */;
using duration = duration<rep, period>;
using time_point = time_point<some_clock>;
static constexpr bool is_steady = /* ... */;
static time_point now() noexcept {
// magic to find out the current time
}
};
std::chrono clockssystem_clock: wall timesteady_clock: monotonichigh_resolution_clock: steady_clock in a trenchcoat
/// hide
#include <chrono>
namespace sc = std::chrono;
/// unhide
auto t1 = sc::steady_clock::now(); // **steady** clock
auto t2 = sc::system_clock::now(); // **system** clock
auto diff = t2 - t1;
error: no match for 'operator-':
auto diff = t2 - t1;
~~ ^ ~~
| |
| time_point<std::chrono::_V2::steady_clock>
time_point<std::chrono::_V2::system_clock>
now()?
/// hide
#include <chrono>
namespace sc = std::chrono;
/// unhide
auto get_time() {
return sc::steady_clock::now();
}
Pop quiz: What system call?
now()?
std::chrono::_V2::steady_clock::now():
sub rsp, 0x18 ; timespec ts;
mov edi, 0x01 ; param0 = CLOCK_MONOTONIC
mov rsi, rsp ; param1 = &ts
call __clock_gettime ; clock_gettime(
; CLOCK_MONOTONIC, &ts)
imul rax, [rsp], 0x3b9aca00 ; r = ts.tv_sec * 1 billion
add rax, [rsp+0x8] ; r += ts.tv_nsec
add rsp, 0x18 ; restore stack
ret ; return r
Interestingly CLOCK_MONOTONIC
/// hide
#include <chrono>
/// unhide
int main() {
using clock = std::chrono::steady_clock;
for (int i = 0; i < 1'000'000; ++i)
clock::now();
}
$ g++ clock.cpp
$ strace ./a.out 2>&1 | grep -iE 'clock|time'
$
clock_gettime?glibc/sysdeps/unix/clock_gettime.c
int clock_gettime(clockid_t clk_id, timespec *tp) {
switch (clk_id) {
case CLOCK_MONOTONIC:
case CLOCK_REALTIME:
return INLINE_VSYSCALL(clock_gettime, clk_id, tp);
// ...
}
}
clock_gettime?
// Magically populated by the ELF loader...
int (*__vdso_clock_gettime)(int, timeval *);
// Macro magic sort of expands to:
inline int inline_vsyscall_clock_gettime(
clockid_t clk_id, timespec *tp) {
if (__vdso_clock_gettime) {
return __vdso_clock_gettime(clk_id, tp);
}
return syscall(clock_gettime, sc_err, 2, clk_id, tp);
}
vDSO?
$ gdb /bin/true
(gdb) starti
Starting program: /usr/bin/true
(gdb) disassemble __vdso_clock_gettime
0x00007ffff7fbd1e0 <+0>: jmp 0x7ffff7fbc930
(gdb) disassemble 0x7ffff7fbc930,+0x400
0x00007ffff7fbc930: push %rbp
0x00007ffff7fbc931: mov %rsp,%rbp
0x00007ffff7fbc934: push %r14
...
vDSO?
__vdso_clock_gettime:
push rbp
mov rbp, rsp
push r14
push rbx
and rsp, -16
sub rsp, 0x20
cmp edi, 0x17
ja _doSyscall
mov eax, 0x1
mov ecx, edi
lea r11, [rip-26966]
shl eax, cl
mov edx, eax
and edx, 0x883
je _slowPath
mov r9d, [r11]
mov r10d, r9d
and r10d, 0x1
jne _seqlockFail
mov eax, [r11+0x4]
cmp eax, 0x1
jne _notTsc
rdtscp
xchg ax, ax
shl rdx, 0x20
or rdx, rax
btr rdx, 0x3f
movsxd r8, edi
...mul / shift / ret
const auto *cfg = clock_data_for(clk_id);
uint64_t seq, ns, aux;
do {
seq = cfg->seq; // "volatile" read
if (seq & 1) continue;
const uint64_t delta =
__builtin_ia32_rdtscp(&aux) - cfg->cycle_last;
ns = (delta * cfg->mult + cfg->base) >> cfg->shift;
} while (cfg->seq != seq); // "volatile" read
return ns;
PSEUDOCODE! not remotely safe...
Based off code in do_hres() from kernel lib/vdso/gettimeofday.c.
mult, shift, cycle_last updated on tickMONOTONIC: mult steered by NTPMONOTONIC_RAW: mult fixed at boot
std::chrono::steady_clock::now()
→ __clock_gettime(CLOCK_MONOTONIC)
→ vDSO (no syscall!)
seq lock { rdtsc + calibration maths }
← struct timespec
← time_point{timespec to nanos}
Function calls & a magic instruction.
No syscall!
/// hide
#include <chrono>
#include <limits>
#include <numeric>
#include <print>
/// unhide
int main() {
using clock = std::chrono::steady_clock;
auto last = clock::now();
auto minDelta = std::numeric_limits<
clock::duration>::max();
for (int i = 0; i < 5'000; ++i) {
auto now = clock::now();
auto delta = now - last;
minDelta = std::min(minDelta, delta);
last = now;
}
std::print("Min = {}\n", minDelta);
}
Min = 0ns
/// hide
#include <chrono>
#include <limits>
#include <numeric>
#include <print>
/// unhide
int main() {
using clock = std::chrono::steady_clock;
auto last = clock::now();
auto minDelta = std::numeric_limits<
clock::duration>::max();
for (int i = 0; i < 5'000; ++i) {
auto now = clock::now();
auto delta = now - last;
minDelta = std::min(minDelta, delta);
last = now;
}
std::print("Min = {}\n", minDelta);
}
/// hide
#include <chrono>
#include <limits>
#include <numeric>
#include <print>
using namespace std::literals;
/// unhide
int main() {
using clock = std::chrono::steady_clock;
auto last = clock::now();
auto minDelta = 1'000'000'000ns;
for (int i = 0; i < 5'000; ++i) {
auto now = clock::now();
auto delta = now - last;
minDelta = std::min(minDelta, delta);
last = now;
}
std::print("Min = {}\n", minDelta);
}
20-30ns!
Photo: Elliott R. Plack, CC0, via Wikimedia Commons (Harbor East Geodetic Disk).
now()?
Spaceballs (1987) © Brooksfilms/MGM. Used under fair use for commentary/education. danoshinsky.com
/// hide
#include <span>
#include <cstdint>
#include <chrono>
namespace sc = std::chrono;
/// unhide
constexpr std::array data {1, 2, 3, 4, 5};
static int sum(std::span<const int> v) {
int total = 0;
for (auto x : v) total += x;
return total;
}
auto benchmark() {
auto start = sc::steady_clock::now();
int result = sum(data);
return sc::steady_clock::now() - start;
}
error: unused variable 'result' [-Werror=unused-variable]
| int result = sum(data);
| ^~~~~~
/// hide
#include <span>
#include <cstdint>
#include <chrono>
namespace sc = std::chrono;
/// unhide
constexpr std::array data {1, 2, 3, 4, 5};
static int sum(std::span<const int> v) {
int total = 0;
for (auto x : v) total += x;
return total;
}
auto benchmark() {
auto start = sc::steady_clock::now();
[[maybe_unused]] int result = sum(data);
return sc::steady_clock::now() - start;
}
volatile "fix"
/// hide
#include <span>
#include <cstdint>
#include <chrono>
namespace sc = std::chrono;
/// unhide
constexpr std::array data {1, 2, 3, 4, 5};
static int sum(std::span<const int> v) {
int total = 0;
for (auto x : v) total += x;
return total;
}
auto benchmark() {
auto start = sc::steady_clock::now();
[[maybe_unused]] volatile int result = sum(data);
return sc::steady_clock::now() - start;
}
/// hide
#include <span>
#include <cstdint>
#include <chrono>
namespace sc = std::chrono;
/// unhide
/*constexpr*/ std::array data {1, 2, 3, 4, 5};
static int sum(std::span<const int> v) {
int total = 0;
for (auto x : v) total += x;
return total;
}
auto benchmark() {
auto start = sc::steady_clock::now();
[[maybe_unused]] volatile int result = sum(data);
return sc::steady_clock::now() - start;
}
Accesses through volatile glvalues are evaluated strictly according to the rules of the abstract machine.
[intro.abstract] Section 8 - the "as-if" rule
The order of volatile operations cannot change relative to other volatile operations, but may change relative to non-volatile operations.
P1152R0 Deprecating volatile (JF Bastien)
/// hide
#include <span>
#include <cstdint>
#include <chrono>
namespace sc = std::chrono;
constexpr std::array data {1, 2, 3, 4, 5};
static int sum(std::span<const int> v) {
int total = 0;
for (auto x : v) total += x;
return total;
}
/// unhide
auto benchmark() {
auto start = sc::steady_clock::now();
[[maybe_unused]] volatile int result = sum(data);
return sc::steady_clock::now() - start;
}
/// hide
#include <span>
#include <cstdint>
#include <chrono>
namespace sc = std::chrono;
constexpr std::array data {1, 2, 3, 4, 5};
static int sum(std::span<const int> v) {
int total = 0;
for (auto x : v) total += x;
return total;
}
/// unhide
auto benchmark() {
// Is this a valid transform?
[[maybe_unused]] volatile int result = sum(data);
auto start = sc::steady_clock::now();
return sc::steady_clock::now() - start;
}
..I had embarrassingly neglected the possibility that the compiler would reorder the calculation out of the timing region...
Without decimating the as-if rule, there appears to be no way to normatively require such timings to be correct. Nevertheless, timing a block of code or an algorithm is not devoid of meaning.
P0342R0 Timing barriers, Mike Spertus
Bolshoi Theatre curtain: Bgelo777, CC BY-SA 4.0, via Wikimedia Commons.
asm <optionally volatile> (
"template string %0, %1 ..."
: outputs
: inputs
: clobbers
);
"type"(expression)
"r""m""r,m""i""g""a", "b", …=+&e.g. "=r"(dest), "+m"(buf)
/// hide
#include <cstdint>
void test() {
/// unhide
uint64_t source = 1234;
uint64_t dest;
asm /*volatile*/ (
"mov %1, %0" // AT&T syntax
: "=r" (dest) // outputs
: "r" (source) // inputs
: // no clobbers
);
/// hide
}
GCC's optimizers discardasmstatements if there is no need for the output variables. The optimizers may move code out of loops if the code always returns the same result. Using thevolatilequalifier disables these optimizations.
/// hide
#include <array>
/// unhide
template<typename T>
void DoNotOptimize(const T &value) {
asm volatile(
"" // No instructions at all!
: // no outputs
: "r,m" (value) // input
: // no clobbers
);
}
///hide
std::array data {1, 2, 3, 4, 5};
void benchmark_sum() {
int total = 0;
for (auto x : data) total += x;
DoNotOptimize(total);
}
/// hide
#include <array>
#include <chrono>
namespace sc = std::chrono;
/// unhide
template<typename T>
void DoNotOptimize(const T &value) {
asm volatile("" : : "r,m" (value));
}
/// hide
std::array data {1, 2, 3, 4, 5};
static void benchmark_sum() {
int total = 0;
for (auto x : data) total += x;
DoNotOptimize(total);
}
/// unhide
auto benchmark_sum_many() {
auto now = sc::steady_clock::now();
for (int i = 0; i < 16; ++i) {
benchmark_sum();
}
return sc::steady_clock::now() - now;
}
ClobberMemory()
/// hide
#include <array>
#include <chrono>
namespace sc = std::chrono;
template<typename T>
void DoNotOptimize(const T &value) {
asm volatile("" : : "r,m" (value));
}
std::array data {1, 2, 3, 4, 5};
static void benchmark_sum() {
int total = 0;
for (auto x : data) total += x;
DoNotOptimize(total);
}
/// unhide
inline void ClobberMemory() {
asm volatile("" : : : "memory");
}
auto benchmark_sum_many() {
auto now = sc::steady_clock::now();
for (int i = 0; i < 16; ++i) {
ClobberMemory();
benchmark_sum();
}
return sc::steady_clock::now() - now;
}
timing_fence() - rejected
now(), and now() is in another
TU, how does the compiler know there is a fence?"
keep() / touch()
std::hint::black_box. Zig: mem.doNotOptimizeAway.
now()?
Spaceballs (1987) © Brooksfilms/MGM. Used under fair use for commentary/education. danoshinsky.com
Pentium 4 Prescott die: Martijn Boer, Public Domain, via Wikimedia Commons.
When we say "now", when do we mean?
rdtsc, rdtscp(ARM has CNTVCT_EL0)
rdtsc - read counter into edx:eax
/// hide
#include <cstdint>
///unhide
uint64_t now() {
asm("rdtsc"
: // outputs
: // inputs
: // clobbers
);
}
rdtsc - read counter into edx:eax
/// hide
#include <cstdint>
///unhide
uint64_t now() {
asm("rdtsc"
: "=a"(...), "=d"(...) // outputs
: // inputs
: // clobbers
);
}
rdtsc - read counter into edx:eax
/// hide
#include <cstdint>
///unhide
uint64_t now() {
uint32_t lo, hi;
asm("rdtsc"
: "=a"(lo), "=d"(hi) // outputs
: // inputs
: // clobbers
);
}
rdtsc - read counter into edx:eax
/// hide
#include <cstdint>
///unhide
uint64_t now() {
uint32_t lo, hi;
asm("rdtsc"
: "=a"(lo), "=d"(hi)
: // inputs
: // clobbers
);
return static_cast<uint64_t>(lo)
| static_cast<uint64_t>(hi) << 32;
}
rdtsc - read counter into edx:eax
/// hide
#include <cstdint>
///unhide
uint64_t now() {
uint64_t lo, hi;
asm("rdtsc"
: "=a"(lo), "=d"(hi)
: // inputs
: // clobbers
);
return lo | (hi << 32);
}
/// hide
#include <cstdint>
inline uint64_t now() {
uint64_t lo, hi;
asm("rdtsc" : "=a"(lo), "=d"(hi));
return lo | (hi << 32);
}
/// unhide
void testFunction();
auto benchmark() {
const auto start = now();
testFunction();
const auto end = now();
return end - start;
}
/// hide
#include <cstdint>
///unhide
uint64_t now() {
uint64_t lo, hi;
asm volatile("rdtsc" : "=a"(lo), "=d"(hi));
return lo | (hi << 32);
}
uint64_t now() {
return __builtin_ia32_rdtsc();
}
RDTSC - immediateRDTSCP - after prior instrs completed"The RDTSCP instruction ... wait[s] until all previous instructions have executed ... [but] ... subsequent instructions may begin execution before the read operation is performed."
lfence ; ensure all loads complete
rdtsc ; read time
; code under test here
rdtscp ; rdtsc but partially serializing
lfence ; ensure nothing beyond migrates above
#include <cstdint>
auto test() {
__builtin_ia32_lfence();
const auto start = __builtin_ia32_rdtsc();
//...
std::uint32_t aux;
const auto end = __builtin_ia32_rdtscp(&aux);
__builtin_ia32_lfence();
return end - start;
}
extern int x, y, z, w;
auto someFunction() {
return x % y % z % w;
}
idivs?rdtsc |
~111 ns |
rdtscp |
~187 ns |
lfence;rdtsc …rdtscp;lfence
|
~159 ns |
Probably only matters for microbenchmarks
clock_gettime do?
...
and r10d, 0x1
jne _seqlockFail
mov eax, [r11+0x4]
cmp eax, 0x1
jne _notTsc
rdtscp ; rdtscp but no fences in sight!
xchg ax, ax
shl rdx, 0x20
or rdx, rax
btr rdx, 0x3f
movsxd r8, edi
...
| ns | cycles | instr | IPC | |
|---|---|---|---|---|
rdtsc |
14 | 27 | 10 | 0.36 |
rdtscp |
23 | 43 | 11 | 0.25 |
steady_clock::now() |
35 | 68 | 100 | 1.46 |
~25 cycles vDSO overhead.
~89 more instructions?
N runs
// base = overhead + N x cost
const auto base = measure(N);
// bench = overhead + 2N x cost
const auto bench = measure(N * 2);
// result = (overhead + 2N x cost) - overhead + N x cost
// = N x cost
const auto result = (bench - base) / N;
Cancel overhead.
steady_clock::now() mostlyrdtsc/rdtscp for µbenchmarksrdtsc for speed
Photo: Tom Marvel, CC BY 2.0, via Wikimedia Commons (Anechoic chamber blue-1).
From an earlier talk on CPU microarchitecture.
/// hide
#include <cstdint>
#include <vector>
/// unhide
int state_machine(const std::vector<int> &data,
int threshold) {
int state = 0, acc = 0;
for (auto v : data) {
if (state == 0) {
if (v > threshold) { acc += v * v; state = 1; }
else { acc += v; }
} else {
if (v & 1) { acc ^= v; state = 0; }
else { acc += v >> 2; }
}
}
return acc;
}
/// hide
#include <cstdint>
#include <vector>
int state_machine(const std::vector<int> &data,
int threshold) {
int state = 0, acc = 0;
for (auto v : data) {
if (state == 0) {
if (v > threshold) { acc += v * v; state = 1; }
else { acc += v; }
} else {
if (v & 1) { acc ^= v; state = 0; }
else { acc += v >> 2; }
}
}
return acc;
}
/// unhide
auto benchmark(size_t dataSize, size_t N) {
// seeded random data...
auto data = make_data(dataSize);
auto start = now();
for (size_t repeat = 0; repeat < N; ++repeat) {
DoNotOptimize(state_machine(data, 50));
}
auto end = now();
return end - start;
}
$ ./benchmark.tsc_fenced state_machine_256
# samples: 150 (+10 warmup, discarded)
name min
---- ---
state_machine_256 0.59 ns/elem
Random data, mt19937(42)..
Same values & cache footprint.
perf stat
$ perf stat -e cycles,instructions,branches,branch-misses \
taskset -c 2 ./benchmark.tsc_fenced state_machine_4096
531,281,741 cycles
1,347,442,053 instructions # 2.54 insn per cycle
419,833,335 branches
1,839,889 branch-misses # 0.44% of all branches
| dataSize | Instructions | Branches | Miss % | IPC |
|---|---|---|---|---|
| 1,024 | 1.35B | 420M | 0.02% | 3.22 |
| 4,096 | 1.35B | 420M | 0.44% | 2.54 |
| 8,192 | 1.35B | 419M | 8.67% | 0.96 |
| 32,768 | 1.35B | 421M | 18.10% | 0.59 |
…each its own rabbit hole.
Photo: Wikideas1, CC0, via Wikimedia Commons.
Photo: Acagastya, CC0, via Wikimedia Commons.
...have been the focus of this talk...
# pin and isolate
taskset -c 3 ./bench
# at boot: isolcpus=3 nohz_full=3 rcu_nocbs=3
# frequency
sudo cpupower frequency-set -g performance
echo 1 | sudo \
tee /sys/devices/system/cpu/intel_pstate/no_turbo
# IRQs off your core
echo fffffff7 | sudo tee /proc/irq/*/smp_affinity
Noise only makes things slower.
Pick one.
perf & CDF shape.
Benchmark says faster.
App says no difference.
struct ScopedTimer {
uint64_t &accumulator;
uint64_t start;
explicit ScopedTimer(uint64_t &acc)
: accumulator(acc), start(rdtsc()) {}
~ScopedTimer() { accumulator += rdtsc() - start; }
};
void hot_path() {
ScopedTimer t{stats.hot_path};
// ...
}
You need both.
perfperf statperf recordperf reportperf annotateperf topperf list
perf list | wc -l → 7158 events
(branch-misses, LLC-load-misses, resource_stalls.sb,
…)
Pick the right counter.
perf_event_open
Aspen Mountain: Hunter Bryant, CC0 (Unsplash), via Wikimedia Commons.
If any single optimisation makes a routine run two or more times faster, then you’ve broken the code.
- Matt Godbolt, April 2005
In 1993: border colours.
2026: rdtscp, perf, Catch2...
Measure your code!
Magician silhouette: Firkin, CC0, via Wikimedia Commons (after an 1880 illustration).
"Some people say optimisation is magic. People who optimise say no, it isn't magic.
But really it is like magic: once you show people how it's done, it goes from amazing to 'oh, just that?'"
- Pat Farley
mattgodbolt@hudson-trading.com
If any single optimisation makes a routine run two or more times faster, then you've broken the code.
“I proceed to dance around the room… then, just before I commit, I remember to actually check the results - and yes, the reason it's going three times faster is because it's not actually doing any work at all.”
state_machine_16 also matched
state_machine_16384
A benchmarking bug in our benchmark of benchmarking.
rdtsc pipeline trace (uiCA)
rdtsc |P QI D ER
|P QI D E R
|P QI D E R
|P QI D E R
|P QI D E R
|P QI D E R
|P QI D E R
|P QI D E R
|P QI D E R
|P QI D E R
|P QI D E R
|P QI D E R
|P QI D E R
|P QI D E R
|P QI D E R
|P QI D E R
|P QI D E R
|P QI D E R
shl rdx, 0x20 |P QI D E R
or rax, rdx |P QI D E R
mov rcx, rax |P QE R
add rcx, 0x2a |P QI D E R
imul rcx, rcx, 0x7 | P QI D E R
rdtscp pipeline trace (uiCA)
rdtscp |P QI D ER
|P QI D ER
|P QI D ER
|P QI D E R
|P QI D ER
|P QI D E R
|P QI D ER
|P QI D ER
|P QI D ER
|P QI D E R
|P QI D E R
|P QI D E R
|P QI D E R
|P QI D ER
|P QI D ER
|P QI D ER
|P QI D E R
|P QI D E R
|P QI D E R
|P QI D E R
shl rdx, 0x20 |P QI D E R
or rax, rdx |P QI D ER
mov rcx, rax |P QE R
add rcx, 0x2a | P QI D ER
imul rcx, rcx, 0x7 | P QI D ER
Chrome Omnibox: 25,000 allocations per keypress
std::string responsible for nearly half of all Chrome browser process
allocations.
Discovered by profiling. Not by guessing.
memory, filesystem, etc etc
do you want hot or cold? (same question as bpu)
Same code. Same algorithm. Same binary.
Three completely different performance profiles.
TODO: actual benchmark numbers - chart/table of ns/element at different matrix sizes
/// hide
#include <mdspan>
using mat = std::mdspan<double, std::dextents<int, 2>>;
using cmat = std::mdspan<const double, std::dextents<int, 2>>;
/// unhide
// ijk - cache-hostile (stride-N on B)
void multiply_ijk(cmat A, cmat B, mat C) {
int N = A.extent(0);
for (int i = 0; i < N; ++i)
for (int j = 0; j < N; ++j)
for (int k = 0; k < N; ++k)
C[i, j] += A[i, k] * B[k, j];
}
// ikj - cache-friendly (stride-1 on B and C)
void multiply_ikj(cmat A, cmat B, mat C) {
int N = A.extent(0);
for (int i = 0; i < N; ++i)
for (int k = 0; k < N; ++k)
for (int j = 0; j < N; ++j)
C[i, j] += A[i, k] * B[k, j];
}
5–10× difference. And the compiler vectorises only one of them.
TODO: benchmark numbers from two different machines (laptop vs Xeon)
The minimum of many runs represents "what happens when nothing goes wrong." The mean includes all the ways the OS interrupted you.