Introduce a basic schedule model for AMD Zen 3 CPUs, a.k.a. `znver3`.
This is fully built from scratch, from llvm-mca measurements
and documented reference materials.
Nothing was copied from `znver2`/`znver1`.
I believe this is in a reasonable state of completion for inclusion,
probably better than D52779 `bdver2` was :)
Namely:
* uops are pretty spot-on (at least what llvm-mca can measure)
{F16422596}
* latency is also pretty spot-on (at least what llvm-mca can measure)
{F16422601}
* throughput is within reason
{F16422607}
I haven't run many benchmarks with this,
however RawSpeed benchmarks say this is beneficial:
{F16603978}
{F16604029}
I'll call out the obvious problems there:
* I didn't really bother with X87 instructions
* I didn't really bother with obviously-microcoded/system instructions
* There is a large discrepancy in throughput for `mr` and `rm` instructions.
I'm not really sure if it's a modelling defect that needs to be fixed,
or if it's a defect of the measurements.
* Pipe distributions are probably bad :)
I can't do much here until AMD allows that to be fixed
by documenting the appropriate counters and updating libpfm
That being said, as @RKSimon notes:
>>! In D94395#2647381, @RKSimon wrote:
> I'll mention again that all the znver* models appear to be very inaccurate wrt SIMD/FPU instructions <...>
so how much worse could this possibly be?!
Things that aren't there:
* Various tunings: zero idioms, etc. Those are follow-ups.
Differential Revision: https://reviews.llvm.org/D94395
99 lines
5.8 KiB
LLVM
99 lines
5.8 KiB
LLVM
; Intel chips with slow unaligned memory accesses
|
|
|
|
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=pentium3 2>&1 | FileCheck %s --check-prefix=SLOW
|
|
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=pentium3m 2>&1 | FileCheck %s --check-prefix=SLOW
|
|
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=pentium-m 2>&1 | FileCheck %s --check-prefix=SLOW
|
|
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=pentium4 2>&1 | FileCheck %s --check-prefix=SLOW
|
|
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=pentium4m 2>&1 | FileCheck %s --check-prefix=SLOW
|
|
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=yonah 2>&1 | FileCheck %s --check-prefix=SLOW
|
|
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=prescott 2>&1 | FileCheck %s --check-prefix=SLOW
|
|
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=nocona 2>&1 | FileCheck %s --check-prefix=SLOW
|
|
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=core2 2>&1 | FileCheck %s --check-prefix=SLOW
|
|
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=penryn 2>&1 | FileCheck %s --check-prefix=SLOW
|
|
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=bonnell 2>&1 | FileCheck %s --check-prefix=SLOW
|
|
|
|
; Intel chips with fast unaligned memory accesses
|
|
|
|
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=silvermont 2>&1 | FileCheck %s --check-prefix=FAST
|
|
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=nehalem 2>&1 | FileCheck %s --check-prefix=FAST
|
|
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=westmere 2>&1 | FileCheck %s --check-prefix=FAST
|
|
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=sandybridge 2>&1 | FileCheck %s --check-prefix=FAST
|
|
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=ivybridge 2>&1 | FileCheck %s --check-prefix=FAST
|
|
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=haswell 2>&1 | FileCheck %s --check-prefix=FAST
|
|
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=broadwell 2>&1 | FileCheck %s --check-prefix=FAST
|
|
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=knl 2>&1 | FileCheck %s --check-prefix=FAST
|
|
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=skylake-avx512 2>&1 | FileCheck %s --check-prefix=FAST
|
|
|
|
; AMD chips with slow unaligned memory accesses
|
|
|
|
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=athlon-4 2>&1 | FileCheck %s --check-prefix=SLOW
|
|
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=athlon-xp 2>&1 | FileCheck %s --check-prefix=SLOW
|
|
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=k8 2>&1 | FileCheck %s --check-prefix=SLOW
|
|
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=opteron 2>&1 | FileCheck %s --check-prefix=SLOW
|
|
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=athlon64 2>&1 | FileCheck %s --check-prefix=SLOW
|
|
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=athlon-fx 2>&1 | FileCheck %s --check-prefix=SLOW
|
|
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=k8-sse3 2>&1 | FileCheck %s --check-prefix=SLOW
|
|
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=opteron-sse3 2>&1 | FileCheck %s --check-prefix=SLOW
|
|
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=athlon64-sse3 2>&1 | FileCheck %s --check-prefix=SLOW
|
|
|
|
; AMD chips with fast unaligned memory accesses
|
|
|
|
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=amdfam10 2>&1 | FileCheck %s --check-prefix=FAST
|
|
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=barcelona 2>&1 | FileCheck %s --check-prefix=FAST
|
|
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=btver1 2>&1 | FileCheck %s --check-prefix=FAST
|
|
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=btver2 2>&1 | FileCheck %s --check-prefix=FAST
|
|
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=bdver1 2>&1 | FileCheck %s --check-prefix=FAST
|
|
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=bdver2 2>&1 | FileCheck %s --check-prefix=FAST
|
|
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=bdver3 2>&1 | FileCheck %s --check-prefix=FAST
|
|
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=bdver4 2>&1 | FileCheck %s --check-prefix=FAST
|
|
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=znver1 2>&1 | FileCheck %s --check-prefix=FAST
|
|
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=znver2 2>&1 | FileCheck %s --check-prefix=FAST
|
|
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=znver3 2>&1 | FileCheck %s --check-prefix=FAST
|
|
|
|
; Other chips with slow unaligned memory accesses
|
|
|
|
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=c3-2 2>&1 | FileCheck %s --check-prefix=SLOW
|
|
|
|
; Verify that the slow/fast unaligned memory attribute is set correctly for each CPU model.
|
|
; Slow chips use 4-byte stores. Fast chips with SSE or later use something other than 4-byte stores.
|
|
; Chips that don't have SSE use 4-byte stores either way, so they're not tested.
|
|
|
|
; Also verify that SSE4.2 or SSE4a imply fast unaligned accesses.
|
|
|
|
; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=sse4.2 2>&1 | FileCheck %s --check-prefix=FAST
|
|
; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=sse4a 2>&1 | FileCheck %s --check-prefix=FAST
|
|
|
|
; Zero-fills 64 bytes at %a through the llvm.memset intrinsic; the generated
; assembly is checked under one of two FileCheck prefixes chosen by the RUN line.
;
; Both prefixes first reject the text "not a recognized processor", which would
; appear in the checked stream (the RUN lines fold stderr into stdout) if llc
; did not know the requested -mcpu name.
;
; Slow-unaligned prefix: expects 17 consecutive movl instructions right after
; the entry-block label (presumably one load of the pointer argument followed
; by sixteen 4-byte stores covering the 64 bytes -- confirm against llc output).
;
; Fast-unaligned prefix: expects a single movl that loads from the stack into
; %eax, and no further movl after it, i.e. the zeroing must not be done with
; 4-byte stores.
define void @store_zeros(i8* %a) {
|
|
; SLOW-NOT: not a recognized processor
|
|
; SLOW-LABEL: store_zeros:
|
|
; SLOW: # %bb.0:
|
|
; SLOW-NEXT: movl
|
|
; SLOW-NEXT: movl
|
|
; SLOW-NEXT: movl
|
|
; SLOW-NEXT: movl
|
|
; SLOW-NEXT: movl
|
|
; SLOW-NEXT: movl
|
|
; SLOW-NEXT: movl
|
|
; SLOW-NEXT: movl
|
|
; SLOW-NEXT: movl
|
|
; SLOW-NEXT: movl
|
|
; SLOW-NEXT: movl
|
|
; SLOW-NEXT: movl
|
|
; SLOW-NEXT: movl
|
|
; SLOW-NEXT: movl
|
|
; SLOW-NEXT: movl
|
|
; SLOW-NEXT: movl
|
|
; SLOW-NEXT: movl
|
|
;
|
|
; FAST-NOT: not a recognized processor
|
|
; FAST-LABEL: store_zeros:
|
|
; FAST: # %bb.0:
|
|
; FAST-NEXT: movl {{[0-9]+}}(%esp), %eax
|
|
; FAST-NOT: movl
|
|
call void @llvm.memset.p0i8.i64(i8* %a, i8 0, i64 64, i1 false)
|
|
ret void
|
|
}
|
|
|
|
; Declaration of the memset intrinsic called by @store_zeros. Going by that
; call site, the operands are: destination pointer, fill byte, byte count
; (i64), and a trailing i1 flag (presumably the volatile flag -- see LangRef).
declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1)
|
|
|